aboutsummaryrefslogtreecommitdiffstats
path: root/arch/x86
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild1
-rw-r--r--arch/x86/Kconfig66
-rw-r--r--arch/x86/Kconfig.cpu16
-rw-r--r--arch/x86/Kconfig.debug20
-rw-r--r--arch/x86/Makefile_32.cpu2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c9
-rw-r--r--arch/x86/crypto/fpu.c10
-rw-r--r--arch/x86/ia32/ia32entry.S2
-rw-r--r--arch/x86/include/asm/acpi.h4
-rw-r--r--arch/x86/include/asm/alternative-asm.h9
-rw-r--r--arch/x86/include/asm/alternative.h9
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h13
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h28
-rw-r--r--arch/x86/include/asm/amd_nb.h1
-rw-r--r--arch/x86/include/asm/apic.h38
-rw-r--r--arch/x86/include/asm/bios_ebda.h28
-rw-r--r--arch/x86/include/asm/cpufeature.h17
-rw-r--r--arch/x86/include/asm/desc.h152
-rw-r--r--arch/x86/include/asm/dma.h12
-rw-r--r--arch/x86/include/asm/efi.h1
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/i8253.h2
-rw-r--r--arch/x86/include/asm/idle.h2
-rw-r--r--arch/x86/include/asm/io.h24
-rw-r--r--arch/x86/include/asm/io_apic.h28
-rw-r--r--arch/x86/include/asm/jump_label.h27
-rw-r--r--arch/x86/include/asm/kgdb.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h193
-rw-r--r--arch/x86/include/asm/kvm_host.h55
-rw-r--r--arch/x86/include/asm/linkage.h5
-rw-r--r--arch/x86/include/asm/mce.h2
-rw-r--r--arch/x86/include/asm/mmu.h4
-rw-r--r--arch/x86/include/asm/mmzone_32.h20
-rw-r--r--arch/x86/include/asm/mmzone_64.h23
-rw-r--r--arch/x86/include/asm/module.h2
-rw-r--r--arch/x86/include/asm/msr-index.h1
-rw-r--r--arch/x86/include/asm/nops.h146
-rw-r--r--arch/x86/include/asm/numa.h32
-rw-r--r--arch/x86/include/asm/numa_32.h10
-rw-r--r--arch/x86/include/asm/numa_64.h36
-rw-r--r--arch/x86/include/asm/numaq.h7
-rw-r--r--arch/x86/include/asm/olpc_ofw.h9
-rw-r--r--arch/x86/include/asm/pci.h2
-rw-r--r--arch/x86/include/asm/percpu.h34
-rw-r--r--arch/x86/include/asm/probe_roms.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h1
-rw-r--r--arch/x86/include/asm/processor.h4
-rw-r--r--arch/x86/include/asm/ptrace.h18
-rw-r--r--arch/x86/include/asm/setup.h4
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h3
-rw-r--r--arch/x86/include/asm/suspend_32.h2
-rw-r--r--arch/x86/include/asm/suspend_64.h5
-rw-r--r--arch/x86/include/asm/system.h85
-rw-r--r--arch/x86/include/asm/topology.h8
-rw-r--r--arch/x86/include/asm/tsc.h4
-rw-r--r--arch/x86/include/asm/uaccess.h3
-rw-r--r--arch/x86/include/asm/uaccess_32.h1
-rw-r--r--arch/x86/include/asm/uaccess_64.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h4
-rw-r--r--arch/x86/include/asm/unistd_64.h4
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h590
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h71
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1012
-rw-r--r--arch/x86/include/asm/vdso.h14
-rw-r--r--arch/x86/include/asm/vgtod.h2
-rw-r--r--arch/x86/include/asm/vsyscall.h12
-rw-r--r--arch/x86/include/asm/vvar.h52
-rw-r--r--arch/x86/include/asm/x2apic.h62
-rw-r--r--arch/x86/include/asm/xen/hypercall.h7
-rw-r--r--arch/x86/include/asm/xen/page.h5
-rw-r--r--arch/x86/include/asm/xen/pci.h16
-rw-r--r--arch/x86/kernel/Makefile12
-rw-r--r--arch/x86/kernel/acpi/boot.c8
-rw-r--r--arch/x86/kernel/acpi/sleep.c5
-rw-r--r--arch/x86/kernel/alternative.c203
-rw-r--r--arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c)0
-rw-r--r--arch/x86/kernel/amd_iommu.c527
-rw-r--r--arch/x86/kernel/amd_iommu_init.c48
-rw-r--r--arch/x86/kernel/apb_timer.c10
-rw-r--r--arch/x86/kernel/aperture_64.c34
-rw-r--r--arch/x86/kernel/apic/Makefile17
-rw-r--r--arch/x86/kernel/apic/apic.c117
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c26
-rw-r--r--arch/x86/kernel/apic/apic_noop.c9
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c13
-rw-r--r--arch/x86/kernel/apic/es7000_32.c17
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c4
-rw-r--r--arch/x86/kernel/apic/io_apic.c309
-rw-r--r--arch/x86/kernel/apic/numaq_32.c40
-rw-r--r--arch/x86/kernel/apic/probe_32.c118
-rw-r--r--arch/x86/kernel/apic/probe_64.c61
-rw-r--r--arch/x86/kernel/apic/summit_32.c5
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c222
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c115
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c53
-rw-r--r--arch/x86/kernel/apm_32.c6
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c16
-rw-r--r--arch/x86/kernel/cpu/bugs.c1
-rw-r--r--arch/x86/kernel/cpu/common.c35
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig266
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c776
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c446
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c367
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c517
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1029
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c327
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c331
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c624
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c261
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c752
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1607
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h224
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c194
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c636
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c452
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c481
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c467
-rw-r--r--arch/x86/kernel/cpu/intel.c29
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c20
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c25
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c10
-rw-r--r--arch/x86/kernel/cpu/perf_event.c28
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c14
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c37
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c7
-rw-r--r--arch/x86/kernel/devicetree.c6
-rw-r--r--arch/x86/kernel/dumpstack.c17
-rw-r--r--arch/x86/kernel/ftrace.c16
-rw-r--r--arch/x86/kernel/head32.c1
-rw-r--r--arch/x86/kernel/hpet.c72
-rw-r--r--arch/x86/kernel/i8253.c86
-rw-r--r--arch/x86/kernel/irq.c5
-rw-r--r--arch/x86/kernel/jump_label.c5
-rw-r--r--arch/x86/kernel/kvmclock.c6
-rw-r--r--arch/x86/kernel/module.c1
-rw-r--r--arch/x86/kernel/mpparse.c8
-rw-r--r--arch/x86/kernel/pci-dma.c64
-rw-r--r--arch/x86/kernel/pci-iommu_table.c18
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)101
-rw-r--r--arch/x86/kernel/process.c47
-rw-r--r--arch/x86/kernel/ptrace.c4
-rw-r--r--arch/x86/kernel/reboot.c24
-rw-r--r--arch/x86/kernel/setup.c16
-rw-r--r--arch/x86/kernel/signal.c14
-rw-r--r--arch/x86/kernel/smp.c5
-rw-r--r--arch/x86/kernel/smpboot.c6
-rw-r--r--arch/x86/kernel/stacktrace.c13
-rw-r--r--arch/x86/kernel/syscall_table_32.S2
-rw-r--r--arch/x86/kernel/tboot.c1
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/tsc.c19
-rw-r--r--arch/x86/kernel/vmlinux.lds.S43
-rw-r--r--arch/x86/kernel/vread_tsc_64.c36
-rw-r--r--arch/x86/kernel/vsyscall_64.c48
-rw-r--r--arch/x86/kernel/x86_init.c2
-rw-r--r--arch/x86/kvm/emulate.c1754
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/irq.h2
-rw-r--r--arch/x86/kvm/mmu.c19
-rw-r--r--arch/x86/kvm/paging_tmpl.h83
-rw-r--r--arch/x86/kvm/svm.c585
-rw-r--r--arch/x86/kvm/vmx.c228
-rw-r--r--arch/x86/kvm/x86.c570
-rw-r--r--arch/x86/kvm/x86.h2
-rw-r--r--arch/x86/lguest/boot.c7
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/copy_user_64.S69
-rw-r--r--arch/x86/lib/memcpy_64.S47
-rw-r--r--arch/x86/lib/memmove_64.S29
-rw-r--r--arch/x86/lib/memset_64.S54
-rw-r--r--arch/x86/mm/Makefile4
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/amdtopology_64.c)21
-rw-r--r--arch/x86/mm/fault.c28
-rw-r--r--arch/x86/mm/hugetlbpage.c4
-rw-r--r--arch/x86/mm/init.c2
-rw-r--r--arch/x86/mm/init_32.c3
-rw-r--r--arch/x86/mm/init_64.c10
-rw-r--r--arch/x86/mm/ioremap.c14
-rw-r--r--arch/x86/mm/numa.c550
-rw-r--r--arch/x86/mm/numa_32.c398
-rw-r--r--arch/x86/mm/numa_64.c644
-rw-r--r--arch/x86/mm/numa_emulation.c16
-rw-r--r--arch/x86/mm/numa_internal.h8
-rw-r--r--arch/x86/mm/pf_in.c14
-rw-r--r--arch/x86/mm/srat.c (renamed from arch/x86/mm/srat_64.c)82
-rw-r--r--arch/x86/mm/srat_32.c288
-rw-r--r--arch/x86/net/Makefile4
-rw-r--r--arch/x86/net/bpf_jit.S140
-rw-r--r--arch/x86/net/bpf_jit_comp.c654
-rw-r--r--arch/x86/oprofile/backtrace.c13
-rw-r--r--arch/x86/pci/direct.c17
-rw-r--r--arch/x86/pci/irq.c4
-rw-r--r--arch/x86/pci/mmconfig-shared.c10
-rw-r--r--arch/x86/pci/xen.c96
-rw-r--r--arch/x86/platform/efi/efi.c123
-rw-r--r--arch/x86/platform/efi/efi_64.c37
-rw-r--r--arch/x86/platform/mrst/mrst.c4
-rw-r--r--arch/x86/platform/olpc/Makefile4
-rw-r--r--arch/x86/platform/olpc/olpc.c51
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c19
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1484
-rw-r--r--arch/x86/platform/uv/uv_time.c22
-rw-r--r--arch/x86/vdso/Makefile17
-rw-r--r--arch/x86/vdso/vclock_gettime.c74
-rw-r--r--arch/x86/vdso/vdso.lds.S9
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c27
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/xen/enlighten.c20
-rw-r--r--arch/x86/xen/irq.c2
-rw-r--r--arch/x86/xen/mmu.c320
-rw-r--r--arch/x86/xen/mmu.h37
-rw-r--r--arch/x86/xen/p2m.c43
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c2
-rw-r--r--arch/x86/xen/setup.c10
-rw-r--r--arch/x86/xen/smp.c13
-rw-r--r--arch/x86/xen/time.c14
-rw-r--r--arch/x86/xen/xen-ops.h2
232 files changed, 8849 insertions, 16661 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index 0e103236b754..0e9dec6cadd1 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -15,3 +15,4 @@ obj-y += vdso/
15obj-$(CONFIG_IA32_EMULATION) += ia32/ 15obj-$(CONFIG_IA32_EMULATION) += ia32/
16 16
17obj-y += platform/ 17obj-y += platform/
18obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index cc6c53a95bfd..da349723d411 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -8,6 +8,7 @@ config 64BIT
8 8
9config X86_32 9config X86_32
10 def_bool !64BIT 10 def_bool !64BIT
11 select CLKSRC_I8253
11 12
12config X86_64 13config X86_64
13 def_bool 64BIT 14 def_bool 64BIT
@@ -16,8 +17,6 @@ config X86_64
16config X86 17config X86
17 def_bool y 18 def_bool y
18 select HAVE_AOUT if X86_32 19 select HAVE_AOUT if X86_32
19 select HAVE_READQ
20 select HAVE_WRITEQ
21 select HAVE_UNSTABLE_SCHED_CLOCK 20 select HAVE_UNSTABLE_SCHED_CLOCK
22 select HAVE_IDE 21 select HAVE_IDE
23 select HAVE_OPROFILE 22 select HAVE_OPROFILE
@@ -65,13 +64,12 @@ config X86
65 select HAVE_GENERIC_HARDIRQS 64 select HAVE_GENERIC_HARDIRQS
66 select HAVE_SPARSE_IRQ 65 select HAVE_SPARSE_IRQ
67 select GENERIC_FIND_FIRST_BIT 66 select GENERIC_FIND_FIRST_BIT
68 select GENERIC_FIND_NEXT_BIT
69 select GENERIC_IRQ_PROBE 67 select GENERIC_IRQ_PROBE
70 select GENERIC_PENDING_IRQ if SMP 68 select GENERIC_PENDING_IRQ if SMP
71 select GENERIC_IRQ_SHOW 69 select GENERIC_IRQ_SHOW
72 select IRQ_FORCED_THREADING 70 select IRQ_FORCED_THREADING
73 select USE_GENERIC_SMP_HELPERS if SMP 71 select USE_GENERIC_SMP_HELPERS if SMP
74 select ARCH_NO_SYSDEV_OPS 72 select HAVE_BPF_JIT if (X86_64 && NET)
75 73
76config INSTRUCTION_DECODER 74config INSTRUCTION_DECODER
77 def_bool (KPROBES || PERF_EVENTS) 75 def_bool (KPROBES || PERF_EVENTS)
@@ -112,7 +110,14 @@ config MMU
112 def_bool y 110 def_bool y
113 111
114config ZONE_DMA 112config ZONE_DMA
115 def_bool y 113 bool "DMA memory allocation support" if EXPERT
114 default y
115 help
116 DMA memory allocation support allows devices with less than 32-bit
117 addressing to allocate within the first 16MB of address space.
118 Disable if no such devices will be used.
119
120 If unsure, say Y.
116 121
117config SBUS 122config SBUS
118 bool 123 bool
@@ -365,17 +370,6 @@ config X86_UV
365# Following is an alphabetically sorted list of 32 bit extended platforms 370# Following is an alphabetically sorted list of 32 bit extended platforms
366# Please maintain the alphabetic order if and when there are additions 371# Please maintain the alphabetic order if and when there are additions
367 372
368config X86_ELAN
369 bool "AMD Elan"
370 depends on X86_32
371 depends on X86_EXTENDED_PLATFORM
372 ---help---
373 Select this for an AMD Elan processor.
374
375 Do not use this option for K6/Athlon/Opteron processors!
376
377 If unsure, choose "PC-compatible" instead.
378
379config X86_INTEL_CE 373config X86_INTEL_CE
380 bool "CE4100 TV platform" 374 bool "CE4100 TV platform"
381 depends on PCI 375 depends on PCI
@@ -690,6 +684,7 @@ config AMD_IOMMU
690 bool "AMD IOMMU support" 684 bool "AMD IOMMU support"
691 select SWIOTLB 685 select SWIOTLB
692 select PCI_MSI 686 select PCI_MSI
687 select PCI_IOV
693 depends on X86_64 && PCI && ACPI 688 depends on X86_64 && PCI && ACPI
694 ---help--- 689 ---help---
695 With this option you can enable support for AMD IOMMU hardware in 690 With this option you can enable support for AMD IOMMU hardware in
@@ -919,6 +914,7 @@ config TOSHIBA
919 914
920config I8K 915config I8K
921 tristate "Dell laptop support" 916 tristate "Dell laptop support"
917 select HWMON
922 ---help--- 918 ---help---
923 This adds a driver to safely access the System Management Mode 919 This adds a driver to safely access the System Management Mode
924 of the CPU on the Dell Inspiron 8000. The System Management Mode 920 of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1174,7 +1170,7 @@ comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
1174config AMD_NUMA 1170config AMD_NUMA
1175 def_bool y 1171 def_bool y
1176 prompt "Old style AMD Opteron NUMA detection" 1172 prompt "Old style AMD Opteron NUMA detection"
1177 depends on X86_64 && NUMA && PCI 1173 depends on NUMA && PCI
1178 ---help--- 1174 ---help---
1179 Enable AMD NUMA node topology detection. You should say Y here if 1175 Enable AMD NUMA node topology detection. You should say Y here if
1180 you have a multi processor AMD system. This uses an old method to 1176 you have a multi processor AMD system. This uses an old method to
@@ -1201,7 +1197,7 @@ config NODES_SPAN_OTHER_NODES
1201 1197
1202config NUMA_EMU 1198config NUMA_EMU
1203 bool "NUMA emulation" 1199 bool "NUMA emulation"
1204 depends on X86_64 && NUMA 1200 depends on NUMA
1205 ---help--- 1201 ---help---
1206 Enable NUMA emulation. A flat machine will be split 1202 Enable NUMA emulation. A flat machine will be split
1207 into virtual nodes when booted with "numa=fake=N", where N is the 1203 into virtual nodes when booted with "numa=fake=N", where N is the
@@ -1223,6 +1219,10 @@ config HAVE_ARCH_BOOTMEM
1223 def_bool y 1219 def_bool y
1224 depends on X86_32 && NUMA 1220 depends on X86_32 && NUMA
1225 1221
1222config HAVE_ARCH_ALLOC_REMAP
1223 def_bool y
1224 depends on X86_32 && NUMA
1225
1226config ARCH_HAVE_MEMORY_PRESENT 1226config ARCH_HAVE_MEMORY_PRESENT
1227 def_bool y 1227 def_bool y
1228 depends on X86_32 && DISCONTIGMEM 1228 depends on X86_32 && DISCONTIGMEM
@@ -1231,13 +1231,9 @@ config NEED_NODE_MEMMAP_SIZE
1231 def_bool y 1231 def_bool y
1232 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM) 1232 depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
1233 1233
1234config HAVE_ARCH_ALLOC_REMAP
1235 def_bool y
1236 depends on X86_32 && NUMA
1237
1238config ARCH_FLATMEM_ENABLE 1234config ARCH_FLATMEM_ENABLE
1239 def_bool y 1235 def_bool y
1240 depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA 1236 depends on X86_32 && !NUMA
1241 1237
1242config ARCH_DISCONTIGMEM_ENABLE 1238config ARCH_DISCONTIGMEM_ENABLE
1243 def_bool y 1239 def_bool y
@@ -1247,20 +1243,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
1247 def_bool y 1243 def_bool y
1248 depends on NUMA && X86_32 1244 depends on NUMA && X86_32
1249 1245
1250config ARCH_PROC_KCORE_TEXT
1251 def_bool y
1252 depends on X86_64 && PROC_KCORE
1253
1254config ARCH_SPARSEMEM_DEFAULT
1255 def_bool y
1256 depends on X86_64
1257
1258config ARCH_SPARSEMEM_ENABLE 1246config ARCH_SPARSEMEM_ENABLE
1259 def_bool y 1247 def_bool y
1260 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD 1248 depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
1261 select SPARSEMEM_STATIC if X86_32 1249 select SPARSEMEM_STATIC if X86_32
1262 select SPARSEMEM_VMEMMAP_ENABLE if X86_64 1250 select SPARSEMEM_VMEMMAP_ENABLE if X86_64
1263 1251
1252config ARCH_SPARSEMEM_DEFAULT
1253 def_bool y
1254 depends on X86_64
1255
1264config ARCH_SELECT_MEMORY_MODEL 1256config ARCH_SELECT_MEMORY_MODEL
1265 def_bool y 1257 def_bool y
1266 depends on ARCH_SPARSEMEM_ENABLE 1258 depends on ARCH_SPARSEMEM_ENABLE
@@ -1269,6 +1261,10 @@ config ARCH_MEMORY_PROBE
1269 def_bool X86_64 1261 def_bool X86_64
1270 depends on MEMORY_HOTPLUG 1262 depends on MEMORY_HOTPLUG
1271 1263
1264config ARCH_PROC_KCORE_TEXT
1265 def_bool y
1266 depends on X86_64 && PROC_KCORE
1267
1272config ILLEGAL_POINTER_VALUE 1268config ILLEGAL_POINTER_VALUE
1273 hex 1269 hex
1274 default 0 if X86_32 1270 default 0 if X86_32
@@ -1703,10 +1699,6 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
1703 def_bool y 1699 def_bool y
1704 depends on MEMORY_HOTPLUG 1700 depends on MEMORY_HOTPLUG
1705 1701
1706config HAVE_ARCH_EARLY_PFN_TO_NID
1707 def_bool X86_64
1708 depends on NUMA
1709
1710config USE_PERCPU_NUMA_NODE_ID 1702config USE_PERCPU_NUMA_NODE_ID
1711 def_bool y 1703 def_bool y
1712 depends on NUMA 1704 depends on NUMA
@@ -1848,7 +1840,7 @@ config APM_ALLOW_INTS
1848 1840
1849endif # APM 1841endif # APM
1850 1842
1851source "arch/x86/kernel/cpu/cpufreq/Kconfig" 1843source "drivers/cpufreq/Kconfig"
1852 1844
1853source "drivers/cpuidle/Kconfig" 1845source "drivers/cpuidle/Kconfig"
1854 1846
@@ -2076,7 +2068,7 @@ config OLPC
2076 depends on !X86_PAE 2068 depends on !X86_PAE
2077 select GPIOLIB 2069 select GPIOLIB
2078 select OF 2070 select OF
2079 select OF_PROMTREE if PROC_DEVICETREE 2071 select OF_PROMTREE
2080 ---help--- 2072 ---help---
2081 Add support for detecting the unique features of the OLPC 2073 Add support for detecting the unique features of the OLPC
2082 XO hardware. 2074 XO hardware.
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index d161e939df62..6a7cfdf8ff69 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
1# Put here option for CPU selection and depending optimization 1# Put here option for CPU selection and depending optimization
2if !X86_ELAN
3
4choice 2choice
5 prompt "Processor family" 3 prompt "Processor family"
6 default M686 if X86_32 4 default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
203 stores for this CPU, which can increase performance of some 201 stores for this CPU, which can increase performance of some
204 operations. 202 operations.
205 203
204config MELAN
205 bool "AMD Elan"
206 depends on X86_32
207 ---help---
208 Select this for an AMD Elan processor.
209
210 Do not use this option for K6/Athlon/Opteron processors!
211
206config MGEODEGX1 212config MGEODEGX1
207 bool "GeodeGX1" 213 bool "GeodeGX1"
208 depends on X86_32 214 depends on X86_32
@@ -292,8 +298,6 @@ config X86_GENERIC
292 This is really intended for distributors who need more 298 This is really intended for distributors who need more
293 generic optimizations. 299 generic optimizations.
294 300
295endif
296
297# 301#
298# Define implied options from the CPU selection here 302# Define implied options from the CPU selection here
299config X86_INTERNODE_CACHE_SHIFT 303config X86_INTERNODE_CACHE_SHIFT
@@ -312,7 +316,7 @@ config X86_L1_CACHE_SHIFT
312 int 316 int
313 default "7" if MPENTIUM4 || MPSC 317 default "7" if MPENTIUM4 || MPSC
314 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU 318 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
315 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 319 default "4" if MELAN || M486 || M386 || MGEODEGX1
316 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 320 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
317 321
318config X86_XADD 322config X86_XADD
@@ -358,7 +362,7 @@ config X86_POPAD_OK
358 362
359config X86_ALIGNMENT_16 363config X86_ALIGNMENT_16
360 def_bool y 364 def_bool y
361 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1 365 depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
362 366
363config X86_INTEL_USERCOPY 367config X86_INTEL_USERCOPY
364 def_bool y 368 def_bool y
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 615e18810f48..c0f8a5c88910 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -66,26 +66,6 @@ config DEBUG_STACKOVERFLOW
66 This option will cause messages to be printed if free stack space 66 This option will cause messages to be printed if free stack space
67 drops below a certain limit. 67 drops below a certain limit.
68 68
69config DEBUG_STACK_USAGE
70 bool "Stack utilization instrumentation"
71 depends on DEBUG_KERNEL
72 ---help---
73 Enables the display of the minimum amount of free stack which each
74 task has ever had available in the sysrq-T and sysrq-P debug output.
75
76 This option will slow down process creation somewhat.
77
78config DEBUG_PER_CPU_MAPS
79 bool "Debug access to per_cpu maps"
80 depends on DEBUG_KERNEL
81 depends on SMP
82 ---help---
83 Say Y to verify that the per_cpu map being accessed has
84 been setup. Adds a fair amount of code to kernel memory
85 and decreases performance.
86
87 Say N if unsure.
88
89config X86_PTDUMP 69config X86_PTDUMP
90 bool "Export kernel pagetable layout to userspace via debugfs" 70 bool "Export kernel pagetable layout to userspace via debugfs"
91 depends on DEBUG_KERNEL 71 depends on DEBUG_KERNEL
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index f2ee1abb1df9..86cee7b749e1 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) 37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
38 38
39# AMD Elan support 39# AMD Elan support
40cflags-$(CONFIG_X86_ELAN) += -march=i486 40cflags-$(CONFIG_MELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd2..2bf18059fbea 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
10CONFIG_AUDIT=y 10CONFIG_AUDIT=y
11CONFIG_LOG_BUF_SHIFT=18 11CONFIG_LOG_BUF_SHIFT=18
12CONFIG_CGROUPS=y 12CONFIG_CGROUPS=y
13CONFIG_CGROUP_NS=y
14CONFIG_CGROUP_FREEZER=y 13CONFIG_CGROUP_FREEZER=y
15CONFIG_CPUSETS=y 14CONFIG_CPUSETS=y
16CONFIG_CGROUP_CPUACCT=y 15CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f0..22a0dc8e51dd 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
11CONFIG_AUDIT=y 11CONFIG_AUDIT=y
12CONFIG_LOG_BUF_SHIFT=18 12CONFIG_LOG_BUF_SHIFT=18
13CONFIG_CGROUPS=y 13CONFIG_CGROUPS=y
14CONFIG_CGROUP_NS=y
15CONFIG_CGROUP_FREEZER=y 14CONFIG_CGROUP_FREEZER=y
16CONFIG_CPUSETS=y 15CONFIG_CPUSETS=y
17CONFIG_CGROUP_CPUACCT=y 16CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf7..c04f1b7a9139 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
24twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 22twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 23salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
26 24
27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 25aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
28 26
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 27ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2577613fb32b..feee8ff1d05e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -94,6 +94,10 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
94 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
96 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97
98int crypto_fpu_init(void);
99void crypto_fpu_exit(void);
100
97#ifdef CONFIG_X86_64 101#ifdef CONFIG_X86_64
98asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 102asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
99 const u8 *in, unsigned int len, u8 *iv); 103 const u8 *in, unsigned int len, u8 *iv);
@@ -1257,6 +1261,8 @@ static int __init aesni_init(void)
1257 return -ENODEV; 1261 return -ENODEV;
1258 } 1262 }
1259 1263
1264 if ((err = crypto_fpu_init()))
1265 goto fpu_err;
1260 if ((err = crypto_register_alg(&aesni_alg))) 1266 if ((err = crypto_register_alg(&aesni_alg)))
1261 goto aes_err; 1267 goto aes_err;
1262 if ((err = crypto_register_alg(&__aesni_alg))) 1268 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -1334,6 +1340,7 @@ blk_ecb_err:
1334__aes_err: 1340__aes_err:
1335 crypto_unregister_alg(&aesni_alg); 1341 crypto_unregister_alg(&aesni_alg);
1336aes_err: 1342aes_err:
1343fpu_err:
1337 return err; 1344 return err;
1338} 1345}
1339 1346
@@ -1363,6 +1370,8 @@ static void __exit aesni_exit(void)
1363 crypto_unregister_alg(&blk_ecb_alg); 1370 crypto_unregister_alg(&blk_ecb_alg);
1364 crypto_unregister_alg(&__aesni_alg); 1371 crypto_unregister_alg(&__aesni_alg);
1365 crypto_unregister_alg(&aesni_alg); 1372 crypto_unregister_alg(&aesni_alg);
1373
1374 crypto_fpu_exit();
1366} 1375}
1367 1376
1368module_init(aesni_init); 1377module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c035..98d7a188f46b 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
150 .module = THIS_MODULE, 150 .module = THIS_MODULE,
151}; 151};
152 152
153static int __init crypto_fpu_module_init(void) 153int __init crypto_fpu_init(void)
154{ 154{
155 return crypto_register_template(&crypto_fpu_tmpl); 155 return crypto_register_template(&crypto_fpu_tmpl);
156} 156}
157 157
158static void __exit crypto_fpu_module_exit(void) 158void __exit crypto_fpu_exit(void)
159{ 159{
160 crypto_unregister_template(&crypto_fpu_tmpl); 160 crypto_unregister_template(&crypto_fpu_tmpl);
161} 161}
162
163module_init(crypto_fpu_module_init);
164module_exit(crypto_fpu_module_exit);
165
166MODULE_LICENSE("GPL");
167MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 849a9d23c71d..c1870dddd322 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -848,4 +848,6 @@ ia32_sys_call_table:
848 .quad compat_sys_open_by_handle_at 848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime 849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs 850 .quad sys_syncfs
851 .quad compat_sys_sendmmsg /* 345 */
852 .quad sys_setns
851ia32_syscall_end: 853ia32_syscall_end:
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 12e0e7dd869c..610001d385dd 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -139,7 +139,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
139 boot_cpu_data.x86_model <= 0x05 && 139 boot_cpu_data.x86_model <= 0x05 &&
140 boot_cpu_data.x86_mask < 0x0A) 140 boot_cpu_data.x86_mask < 0x0A)
141 return 1; 141 return 1;
142 else if (c1e_detected) 142 else if (amd_e400_c1e_detected)
143 return 1; 143 return 1;
144 else 144 else
145 return max_cstate; 145 return max_cstate;
@@ -183,8 +183,6 @@ static inline void disable_acpi(void) { }
183 183
184#define ARCH_HAS_POWER_INIT 1 184#define ARCH_HAS_POWER_INIT 1
185 185
186struct bootnode;
187
188#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
189extern int acpi_numa; 187extern int acpi_numa;
190extern int x86_acpi_numa_init(void); 188extern int x86_acpi_numa_init(void);
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
15 .endm 15 .endm
16#endif 16#endif
17 17
18.macro altinstruction_entry orig alt feature orig_len alt_len
19 .align 8
20 .quad \orig
21 .quad \alt
22 .word \feature
23 .byte \orig_len
24 .byte \alt_len
25.endm
26
18#endif /* __ASSEMBLY__ */ 27#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 13009d1af99a..bf535f947e8c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -4,7 +4,6 @@
4#include <linux/types.h> 4#include <linux/types.h>
5#include <linux/stddef.h> 5#include <linux/stddef.h>
6#include <linux/stringify.h> 6#include <linux/stringify.h>
7#include <linux/jump_label.h>
8#include <asm/asm.h> 7#include <asm/asm.h>
9 8
10/* 9/*
@@ -191,12 +190,4 @@ extern void *text_poke(void *addr, const void *opcode, size_t len);
191extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 190extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
192extern void text_poke_smp_batch(struct text_poke_param *params, int n); 191extern void text_poke_smp_batch(struct text_poke_param *params, int n);
193 192
194#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
195#define IDEAL_NOP_SIZE_5 5
196extern unsigned char ideal_nop5[IDEAL_NOP_SIZE_5];
197extern void arch_init_ideal_nop5(void);
198#else
199static inline void arch_init_ideal_nop5(void) {}
200#endif
201
202#endif /* _ASM_X86_ALTERNATIVE_H */ 193#endif /* _ASM_X86_ALTERNATIVE_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index 916bc8111a01..55d95eb789b3 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -19,13 +19,12 @@
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H 19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H 20#define _ASM_X86_AMD_IOMMU_PROTO_H
21 21
22struct amd_iommu; 22#include <asm/amd_iommu_types.h>
23 23
24extern int amd_iommu_init_dma_ops(void); 24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void); 25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 27extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid); 28extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); 29extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void); 30extern int amd_iommu_init_devices(void);
@@ -44,4 +43,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev)
44 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); 43 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
45} 44}
46 45
46static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
47{
48 if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
49 return false;
50
51 return !!(iommu->features & f);
52}
53
47#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ 54#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index e3509fc303bf..4c9982995414 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -68,12 +68,25 @@
68#define MMIO_CONTROL_OFFSET 0x0018 68#define MMIO_CONTROL_OFFSET 0x0018
69#define MMIO_EXCL_BASE_OFFSET 0x0020 69#define MMIO_EXCL_BASE_OFFSET 0x0020
70#define MMIO_EXCL_LIMIT_OFFSET 0x0028 70#define MMIO_EXCL_LIMIT_OFFSET 0x0028
71#define MMIO_EXT_FEATURES 0x0030
71#define MMIO_CMD_HEAD_OFFSET 0x2000 72#define MMIO_CMD_HEAD_OFFSET 0x2000
72#define MMIO_CMD_TAIL_OFFSET 0x2008 73#define MMIO_CMD_TAIL_OFFSET 0x2008
73#define MMIO_EVT_HEAD_OFFSET 0x2010 74#define MMIO_EVT_HEAD_OFFSET 0x2010
74#define MMIO_EVT_TAIL_OFFSET 0x2018 75#define MMIO_EVT_TAIL_OFFSET 0x2018
75#define MMIO_STATUS_OFFSET 0x2020 76#define MMIO_STATUS_OFFSET 0x2020
76 77
78
79/* Extended Feature Bits */
80#define FEATURE_PREFETCH (1ULL<<0)
81#define FEATURE_PPR (1ULL<<1)
82#define FEATURE_X2APIC (1ULL<<2)
83#define FEATURE_NX (1ULL<<3)
84#define FEATURE_GT (1ULL<<4)
85#define FEATURE_IA (1ULL<<6)
86#define FEATURE_GA (1ULL<<7)
87#define FEATURE_HE (1ULL<<8)
88#define FEATURE_PC (1ULL<<9)
89
77/* MMIO status bits */ 90/* MMIO status bits */
78#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 91#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
79 92
@@ -113,7 +126,9 @@
113/* command specific defines */ 126/* command specific defines */
114#define CMD_COMPL_WAIT 0x01 127#define CMD_COMPL_WAIT 0x01
115#define CMD_INV_DEV_ENTRY 0x02 128#define CMD_INV_DEV_ENTRY 0x02
116#define CMD_INV_IOMMU_PAGES 0x03 129#define CMD_INV_IOMMU_PAGES 0x03
130#define CMD_INV_IOTLB_PAGES 0x04
131#define CMD_INV_ALL 0x08
117 132
118#define CMD_COMPL_WAIT_STORE_MASK 0x01 133#define CMD_COMPL_WAIT_STORE_MASK 0x01
119#define CMD_COMPL_WAIT_INT_MASK 0x02 134#define CMD_COMPL_WAIT_INT_MASK 0x02
@@ -215,6 +230,8 @@
215#define IOMMU_PTE_IR (1ULL << 61) 230#define IOMMU_PTE_IR (1ULL << 61)
216#define IOMMU_PTE_IW (1ULL << 62) 231#define IOMMU_PTE_IW (1ULL << 62)
217 232
233#define DTE_FLAG_IOTLB 0x01
234
218#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 235#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
219#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 236#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
220#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 237#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -227,6 +244,7 @@
227/* IOMMU capabilities */ 244/* IOMMU capabilities */
228#define IOMMU_CAP_IOTLB 24 245#define IOMMU_CAP_IOTLB 24
229#define IOMMU_CAP_NPCACHE 26 246#define IOMMU_CAP_NPCACHE 26
247#define IOMMU_CAP_EFR 27
230 248
231#define MAX_DOMAIN_ID 65536 249#define MAX_DOMAIN_ID 65536
232 250
@@ -249,6 +267,8 @@ extern bool amd_iommu_dump;
249 267
250/* global flag if IOMMUs cache non-present entries */ 268/* global flag if IOMMUs cache non-present entries */
251extern bool amd_iommu_np_cache; 269extern bool amd_iommu_np_cache;
270/* Only true if all IOMMUs support device IOTLBs */
271extern bool amd_iommu_iotlb_sup;
252 272
253/* 273/*
254 * Make iterating over all IOMMUs easier 274 * Make iterating over all IOMMUs easier
@@ -371,6 +391,9 @@ struct amd_iommu {
371 /* flags read from acpi table */ 391 /* flags read from acpi table */
372 u8 acpi_flags; 392 u8 acpi_flags;
373 393
394 /* Extended features */
395 u64 features;
396
374 /* 397 /*
375 * Capability pointer. There could be more than one IOMMU per PCI 398 * Capability pointer. There could be more than one IOMMU per PCI
376 * device function if there are more than one AMD IOMMU capability 399 * device function if there are more than one AMD IOMMU capability
@@ -409,9 +432,6 @@ struct amd_iommu {
409 /* if one, we need to send a completion wait command */ 432 /* if one, we need to send a completion wait command */
410 bool need_sync; 433 bool need_sync;
411 434
412 /* becomes true if a command buffer reset is running */
413 bool reset_in_progress;
414
415 /* default dma_ops domain for that IOMMU */ 435 /* default dma_ops domain for that IOMMU */
416 struct dma_ops_domain *default_dom; 436 struct dma_ops_domain *default_dom;
417 437
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
index 331682231bb4..67f87f257611 100644
--- a/arch/x86/include/asm/amd_nb.h
+++ b/arch/x86/include/asm/amd_nb.h
@@ -11,7 +11,6 @@ struct amd_nb_bus_dev_range {
11 11
12extern const struct pci_device_id amd_nb_misc_ids[]; 12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[]; 13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14struct bootnode;
15 14
16extern bool early_is_amd_nb(u32 value); 15extern bool early_is_amd_nb(u32 value);
17extern int amd_cache_northbridges(void); 16extern int amd_cache_northbridges(void);
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 2b7d573be549..4a0b7c7e2cce 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -363,7 +363,12 @@ struct apic {
363 */ 363 */
364 int (*x86_32_early_logical_apicid)(int cpu); 364 int (*x86_32_early_logical_apicid)(int cpu);
365 365
366 /* determine CPU -> NUMA node mapping */ 366 /*
367 * Optional method called from setup_local_APIC() after logical
368 * apicid is guaranteed to be known to initialize apicid -> node
369 * mapping if NUMA initialization hasn't done so already. Don't
370 * add new users.
371 */
367 int (*x86_32_numa_cpu_node)(int cpu); 372 int (*x86_32_numa_cpu_node)(int cpu);
368#endif 373#endif
369}; 374};
@@ -376,6 +381,26 @@ struct apic {
376extern struct apic *apic; 381extern struct apic *apic;
377 382
378/* 383/*
384 * APIC drivers are probed based on how they are listed in the .apicdrivers
385 * section. So the order is important and enforced by the ordering
386 * of different apic driver files in the Makefile.
387 *
388 * For the files having two apic drivers, we use apic_drivers()
389 * to enforce the order with in them.
390 */
391#define apic_driver(sym) \
392 static struct apic *__apicdrivers_##sym __used \
393 __aligned(sizeof(struct apic *)) \
394 __section(.apicdrivers) = { &sym }
395
396#define apic_drivers(sym1, sym2) \
397 static struct apic *__apicdrivers_##sym1##sym2[2] __used \
398 __aligned(sizeof(struct apic *)) \
399 __section(.apicdrivers) = { &sym1, &sym2 }
400
401extern struct apic *__apicdrivers[], *__apicdrivers_end[];
402
403/*
379 * APIC functionality to boot other CPUs - only used on SMP: 404 * APIC functionality to boot other CPUs - only used on SMP:
380 */ 405 */
381#ifdef CONFIG_SMP 406#ifdef CONFIG_SMP
@@ -453,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
453#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 478#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
454 479
455#ifdef CONFIG_X86_64 480#ifdef CONFIG_X86_64
456extern struct apic apic_flat;
457extern struct apic apic_physflat;
458extern struct apic apic_x2apic_cluster;
459extern struct apic apic_x2apic_phys;
460extern int default_acpi_madt_oem_check(char *, char *); 481extern int default_acpi_madt_oem_check(char *, char *);
461 482
462extern void apic_send_IPI_self(int vector); 483extern void apic_send_IPI_self(int vector);
463 484
464extern struct apic apic_x2apic_uv_x;
465DECLARE_PER_CPU(int, x2apic_extra_bits); 485DECLARE_PER_CPU(int, x2apic_extra_bits);
466 486
467extern int default_cpu_present_to_apicid(int mps_cpu); 487extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -475,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
475 return; 495 return;
476} 496}
477 497
478extern void generic_bigsmp_probe(void); 498extern struct apic *generic_bigsmp_probe(void);
479 499
480 500
481#ifdef CONFIG_X86_LOCAL_APIC 501#ifdef CONFIG_X86_LOCAL_APIC
@@ -511,8 +531,6 @@ extern struct apic apic_noop;
511 531
512#ifdef CONFIG_X86_32 532#ifdef CONFIG_X86_32
513 533
514extern struct apic apic_default;
515
516static inline int noop_x86_32_early_logical_apicid(int cpu) 534static inline int noop_x86_32_early_logical_apicid(int cpu)
517{ 535{
518 return BAD_APICID; 536 return BAD_APICID;
@@ -537,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
537 return cpuid_apic >> index_msb; 555 return cpuid_apic >> index_msb;
538} 556}
539 557
540extern int default_x86_32_numa_cpu_node(int cpu);
541
542#endif 558#endif
543 559
544static inline unsigned int 560static inline unsigned int
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3f..aa6a3170ab5a 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
4#include <asm/io.h> 4#include <asm/io.h>
5 5
6/* 6/*
7 * there is a real-mode segmented pointer pointing to the 7 * Returns physical address of EBDA. Returns 0 if there is no EBDA.
8 * 4K EBDA area at 0x40E.
9 */ 8 */
10static inline unsigned int get_bios_ebda(void) 9static inline unsigned int get_bios_ebda(void)
11{ 10{
11 /*
12 * There is a real-mode segmented pointer pointing to the
13 * 4K EBDA area at 0x40E.
14 */
12 unsigned int address = *(unsigned short *)phys_to_virt(0x40E); 15 unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
13 address <<= 4; 16 address <<= 4;
14 return address; /* 0 means none */ 17 return address; /* 0 means none */
15} 18}
16 19
20/*
21 * Return the sanitized length of the EBDA in bytes, if it exists.
22 */
23static inline unsigned int get_bios_ebda_length(void)
24{
25 unsigned int address;
26 unsigned int length;
27
28 address = get_bios_ebda();
29 if (!address)
30 return 0;
31
32 /* EBDA length is byte 0 of the EBDA (stored in KiB) */
33 length = *(unsigned char *)phys_to_virt(address);
34 length <<= 10;
35
36 /* Trim the length if it extends beyond 640KiB */
37 length = min_t(unsigned int, (640 * 1024) - address, length);
38 return length;
39}
40
17void reserve_ebda_region(void); 41void reserve_ebda_region(void);
18 42
19#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 43#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 91f3e087cf21..71cc3800712c 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ 127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
128#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ 128#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ 129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
130 130
131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -195,6 +195,8 @@
195 195
196/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 196/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
197#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 197#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
198#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
199#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
198 200
199#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 201#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
200 202
@@ -207,8 +209,7 @@ extern const char * const x86_power_flags[32];
207#define test_cpu_cap(c, bit) \ 209#define test_cpu_cap(c, bit) \
208 test_bit(bit, (unsigned long *)((c)->x86_capability)) 210 test_bit(bit, (unsigned long *)((c)->x86_capability))
209 211
210#define cpu_has(c, bit) \ 212#define REQUIRED_MASK_BIT_SET(bit) \
211 (__builtin_constant_p(bit) && \
212 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ 213 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
213 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ 214 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
214 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ 215 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -218,10 +219,16 @@ extern const char * const x86_power_flags[32];
218 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 219 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
219 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ 220 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
220 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ 221 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
221 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ 222 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
222 ? 1 : \ 223
224#define cpu_has(c, bit) \
225 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
223 test_cpu_cap(c, bit)) 226 test_cpu_cap(c, bit))
224 227
228#define this_cpu_has(bit) \
229 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
230 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
231
225#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) 232#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
226 233
227#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) 234#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b3070..7b439d9aea2a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
4#include <asm/desc_defs.h> 4#include <asm/desc_defs.h>
5#include <asm/ldt.h> 5#include <asm/ldt.h>
6#include <asm/mmu.h> 6#include <asm/mmu.h>
7
7#include <linux/smp.h> 8#include <linux/smp.h>
8 9
9static inline void fill_ldt(struct desc_struct *desc, 10static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
10 const struct user_desc *info) 11{
11{ 12 desc->limit0 = info->limit & 0x0ffff;
12 desc->limit0 = info->limit & 0x0ffff; 13
13 desc->base0 = info->base_addr & 0x0000ffff; 14 desc->base0 = (info->base_addr & 0x0000ffff);
14 15 desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
15 desc->base1 = (info->base_addr & 0x00ff0000) >> 16; 16
16 desc->type = (info->read_exec_only ^ 1) << 1; 17 desc->type = (info->read_exec_only ^ 1) << 1;
17 desc->type |= info->contents << 2; 18 desc->type |= info->contents << 2;
18 desc->s = 1; 19
19 desc->dpl = 0x3; 20 desc->s = 1;
20 desc->p = info->seg_not_present ^ 1; 21 desc->dpl = 0x3;
21 desc->limit = (info->limit & 0xf0000) >> 16; 22 desc->p = info->seg_not_present ^ 1;
22 desc->avl = info->useable; 23 desc->limit = (info->limit & 0xf0000) >> 16;
23 desc->d = info->seg_32bit; 24 desc->avl = info->useable;
24 desc->g = info->limit_in_pages; 25 desc->d = info->seg_32bit;
25 desc->base2 = (info->base_addr & 0xff000000) >> 24; 26 desc->g = info->limit_in_pages;
27
28 desc->base2 = (info->base_addr & 0xff000000) >> 24;
26 /* 29 /*
27 * Don't allow setting of the lm bit. It is useless anyway 30 * Don't allow setting of the lm bit. It is useless anyway
28 * because 64bit system calls require __USER_CS: 31 * because 64bit system calls require __USER_CS:
29 */ 32 */
30 desc->l = 0; 33 desc->l = 0;
31} 34}
32 35
33extern struct desc_ptr idt_descr; 36extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
36struct gdt_page { 39struct gdt_page {
37 struct desc_struct gdt[GDT_ENTRIES]; 40 struct desc_struct gdt[GDT_ENTRIES];
38} __attribute__((aligned(PAGE_SIZE))); 41} __attribute__((aligned(PAGE_SIZE)));
42
39DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); 43DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
40 44
41static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) 45static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
48static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, 52static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
49 unsigned dpl, unsigned ist, unsigned seg) 53 unsigned dpl, unsigned ist, unsigned seg)
50{ 54{
51 gate->offset_low = PTR_LOW(func); 55 gate->offset_low = PTR_LOW(func);
52 gate->segment = __KERNEL_CS; 56 gate->segment = __KERNEL_CS;
53 gate->ist = ist; 57 gate->ist = ist;
54 gate->p = 1; 58 gate->p = 1;
55 gate->dpl = dpl; 59 gate->dpl = dpl;
56 gate->zero0 = 0; 60 gate->zero0 = 0;
57 gate->zero1 = 0; 61 gate->zero1 = 0;
58 gate->type = type; 62 gate->type = type;
59 gate->offset_middle = PTR_MIDDLE(func); 63 gate->offset_middle = PTR_MIDDLE(func);
60 gate->offset_high = PTR_HIGH(func); 64 gate->offset_high = PTR_HIGH(func);
61} 65}
62 66
63#else 67#else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
66 unsigned short seg) 70 unsigned short seg)
67{ 71{
68 gate->a = (seg << 16) | (base & 0xffff); 72 gate->a = (seg << 16) | (base & 0xffff);
69 gate->b = (base & 0xffff0000) | 73 gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
70 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
71} 74}
72 75
73#endif 76#endif
@@ -75,31 +78,29 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
75static inline int desc_empty(const void *ptr) 78static inline int desc_empty(const void *ptr)
76{ 79{
77 const u32 *desc = ptr; 80 const u32 *desc = ptr;
81
78 return !(desc[0] | desc[1]); 82 return !(desc[0] | desc[1]);
79} 83}
80 84
81#ifdef CONFIG_PARAVIRT 85#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h> 86#include <asm/paravirt.h>
83#else 87#else
84#define load_TR_desc() native_load_tr_desc() 88#define load_TR_desc() native_load_tr_desc()
85#define load_gdt(dtr) native_load_gdt(dtr) 89#define load_gdt(dtr) native_load_gdt(dtr)
86#define load_idt(dtr) native_load_idt(dtr) 90#define load_idt(dtr) native_load_idt(dtr)
87#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) 91#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
88#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) 92#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
89 93
90#define store_gdt(dtr) native_store_gdt(dtr) 94#define store_gdt(dtr) native_store_gdt(dtr)
91#define store_idt(dtr) native_store_idt(dtr) 95#define store_idt(dtr) native_store_idt(dtr)
92#define store_tr(tr) (tr = native_store_tr()) 96#define store_tr(tr) (tr = native_store_tr())
93 97
94#define load_TLS(t, cpu) native_load_tls(t, cpu) 98#define load_TLS(t, cpu) native_load_tls(t, cpu)
95#define set_ldt native_set_ldt 99#define set_ldt native_set_ldt
96 100
97#define write_ldt_entry(dt, entry, desc) \ 101#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
98 native_write_ldt_entry(dt, entry, desc) 102#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
99#define write_gdt_entry(dt, entry, desc, type) \ 103#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
100 native_write_gdt_entry(dt, entry, desc, type)
101#define write_idt_entry(dt, entry, g) \
102 native_write_idt_entry(dt, entry, g)
103 104
104static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) 105static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
105{ 106{
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112 113
113#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) 114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
114 115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
116 const gate_desc *gate)
117{ 117{
118 memcpy(&idt[entry], gate, sizeof(*gate)); 118 memcpy(&idt[entry], gate, sizeof(*gate));
119} 119}
120 120
121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, 121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
122 const void *desc)
123{ 122{
124 memcpy(&ldt[entry], desc, 8); 123 memcpy(&ldt[entry], desc, 8);
125} 124}
126 125
127static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, 126static inline void
128 const void *desc, int type) 127native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
129{ 128{
130 unsigned int size; 129 unsigned int size;
130
131 switch (type) { 131 switch (type) {
132 case DESC_TSS: 132 case DESC_TSS: size = sizeof(tss_desc); break;
133 size = sizeof(tss_desc); 133 case DESC_LDT: size = sizeof(ldt_desc); break;
134 break; 134 default: size = sizeof(*gdt); break;
135 case DESC_LDT:
136 size = sizeof(ldt_desc);
137 break;
138 default:
139 size = sizeof(struct desc_struct);
140 break;
141 } 135 }
136
142 memcpy(&gdt[entry], desc, size); 137 memcpy(&gdt[entry], desc, size);
143} 138}
144 139
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
154} 149}
155 150
156 151
157static inline void set_tssldt_descriptor(void *d, unsigned long addr, 152static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
158 unsigned type, unsigned size)
159{ 153{
160#ifdef CONFIG_X86_64 154#ifdef CONFIG_X86_64
161 struct ldttss_desc64 *desc = d; 155 struct ldttss_desc64 *desc = d;
156
162 memset(desc, 0, sizeof(*desc)); 157 memset(desc, 0, sizeof(*desc));
163 desc->limit0 = size & 0xFFFF; 158
164 desc->base0 = PTR_LOW(addr); 159 desc->limit0 = size & 0xFFFF;
165 desc->base1 = PTR_MIDDLE(addr) & 0xFF; 160 desc->base0 = PTR_LOW(addr);
166 desc->type = type; 161 desc->base1 = PTR_MIDDLE(addr) & 0xFF;
167 desc->p = 1; 162 desc->type = type;
168 desc->limit1 = (size >> 16) & 0xF; 163 desc->p = 1;
169 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; 164 desc->limit1 = (size >> 16) & 0xF;
170 desc->base3 = PTR_HIGH(addr); 165 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
166 desc->base3 = PTR_HIGH(addr);
171#else 167#else
172 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); 168 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
173#endif 169#endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
237static inline unsigned long native_store_tr(void) 233static inline unsigned long native_store_tr(void)
238{ 234{
239 unsigned long tr; 235 unsigned long tr;
236
240 asm volatile("str %0":"=r" (tr)); 237 asm volatile("str %0":"=r" (tr));
238
241 return tr; 239 return tr;
242} 240}
243 241
244static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) 242static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
245{ 243{
246 unsigned int i;
247 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 244 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
245 unsigned int i;
248 246
249 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) 247 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
250 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; 248 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
313 unsigned dpl, unsigned ist, unsigned seg) 311 unsigned dpl, unsigned ist, unsigned seg)
314{ 312{
315 gate_desc s; 313 gate_desc s;
314
316 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); 315 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
317 /* 316 /*
318 * does not need to be atomic because it is only done once at 317 * does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
343 set_bit(vector, used_vectors); 342 set_bit(vector, used_vectors);
344 if (first_system_vector > vector) 343 if (first_system_vector > vector)
345 first_system_vector = vector; 344 first_system_vector = vector;
346 } else 345 } else {
347 BUG(); 346 BUG();
347 }
348} 348}
349 349
350static inline void alloc_intr_gate(unsigned int n, void *addr) 350static inline void alloc_intr_gate(unsigned int n, void *addr)
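
The compacted native_write_gdt_entry() above still sizes the copy by descriptor type: on 64-bit, TSS and LDT descriptors occupy 16 bytes, everything else is an ordinary 8-byte desc_struct. A stand-alone sketch of that size selection (the type values and struct layouts below are illustrative stand-ins, not the kernel's definitions):

#include <stdio.h>

enum { DESC_TSS = 0x9, DESC_LDT = 0x2 };            /* assumed values, for the sketch only   */

struct desc_struct   { unsigned char bytes[8];  };  /* ordinary 8-byte GDT entry             */
struct ldttss_desc64 { unsigned char bytes[16]; };  /* 16-byte TSS/LDT descriptor on x86-64  */

static size_t gdt_copy_size(int type)
{
        switch (type) {
        case DESC_TSS: return sizeof(struct ldttss_desc64);
        case DESC_LDT: return sizeof(struct ldttss_desc64);
        default:       return sizeof(struct desc_struct);
        }
}

int main(void)
{
        printf("TSS  entry copies %zu bytes\n", gdt_copy_size(DESC_TSS)); /* 16 */
        printf("data entry copies %zu bytes\n", gdt_copy_size(0));        /* 8  */
        return 0;
}
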
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index 057099e5faba..0bdb0c54d9a1 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -69,22 +69,18 @@
69 69
70#define MAX_DMA_CHANNELS 8 70#define MAX_DMA_CHANNELS 8
71 71
72#ifdef CONFIG_X86_32
73
74/* The maximum address that we can perform a DMA transfer to on this platform */
75#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
76
77#else
78
79/* 16MB ISA DMA zone */ 72/* 16MB ISA DMA zone */
80#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) 73#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
81 74
82/* 4GB broken PCI/AGP hardware bus master zone */ 75/* 4GB broken PCI/AGP hardware bus master zone */
83#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) 76#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
84 77
78#ifdef CONFIG_X86_32
79/* The maximum address that we can perform a DMA transfer to on this platform */
80#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
81#else
85/* Compat define for old dma zone */ 82/* Compat define for old dma zone */
86#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) 83#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
87
88#endif 84#endif
89 85
90/* 8237 DMA controllers */ 86/* 8237 DMA controllers */
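
Both zone limits above are plain page-frame arithmetic; with the usual 4 KiB pages (PAGE_SHIFT = 12, an assumption of this sketch, built as 64-bit so 4 GiB fits in unsigned long) the 16 MiB ISA zone ends at pfn 4096 and the 32-bit bus-master zone at pfn 1048576:

#include <stdio.h>

#define PAGE_SHIFT    12                                        /* assumed: 4 KiB pages      */
#define MAX_DMA_PFN   ((16 * 1024 * 1024) >> PAGE_SHIFT)        /* copied from the header    */
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

int main(void)
{
        printf("MAX_DMA_PFN   = %lu\n", (unsigned long)MAX_DMA_PFN);   /* 4096    */
        printf("MAX_DMA32_PFN = %lu\n", MAX_DMA32_PFN);                /* 1048576 */
        return 0;
}
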
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8e4a16508d4e..7093e4a6a0bc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,6 +90,7 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
90#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
91 91
92extern int add_efi_memmap; 92extern int add_efi_memmap;
93extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
93extern void efi_memblock_x86_reserve_range(void); 94extern void efi_memblock_x86_reserve_range(void);
94extern void efi_call_phys_prelog(void); 95extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void); 96extern void efi_call_phys_epilog(void);
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be0..268c783ab1c0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
38static inline unsigned long ftrace_call_adjust(unsigned long addr) 38static inline unsigned long ftrace_call_adjust(unsigned long addr)
39{ 39{
40 /* 40 /*
41 * call mcount is "e8 <4 byte offset>" 41 * addr is the address of the mcount call instruction.
42 * The addr points to the 4 byte offset and the caller of this 42 * recordmcount does the necessary offset calculation.
43 * function wants the pointer to e8. Simply subtract one.
44 */ 43 */
45 return addr - 1; 44 return addr;
46} 45}
47 46
48#ifdef CONFIG_DYNAMIC_FTRACE 47#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index fc1f579fb965..65aaa91d5850 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,6 +6,8 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9#define PIT_LATCH LATCH
10
9extern raw_spinlock_t i8253_lock; 11extern raw_spinlock_t i8253_lock;
10 12
11extern struct clock_event_device *global_clock_event; 13extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e270..f49253d75710 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -16,6 +16,6 @@ static inline void enter_idle(void) { }
16static inline void exit_idle(void) { } 16static inline void exit_idle(void) { }
17#endif /* CONFIG_X86_64 */ 17#endif /* CONFIG_X86_64 */
18 18
19void c1e_remove_cpu(int cpu); 19void amd_e400_remove_cpu(int cpu);
20 20
21#endif /* _ASM_X86_IDLE_H */ 21#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 072273082528..d02804d650c4 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,7 +38,6 @@
38 38
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/compiler.h> 40#include <linux/compiler.h>
41#include <asm-generic/int-ll64.h>
42#include <asm/page.h> 41#include <asm/page.h>
43 42
44#include <xen/xen.h> 43#include <xen/xen.h>
@@ -87,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
87build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 86build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
88build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 87build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
89 88
90#else
91
92static inline __u64 readq(const volatile void __iomem *addr)
93{
94 const volatile u32 __iomem *p = addr;
95 u32 low, high;
96
97 low = readl(p);
98 high = readl(p + 1);
99
100 return low + ((u64)high << 32);
101}
102
103static inline void writeq(__u64 val, volatile void __iomem *addr)
104{
105 writel(val, addr);
106 writel(val >> 32, addr+4);
107}
108
109#endif
110
111#define readq_relaxed(a) readq(a) 89#define readq_relaxed(a) readq(a)
112 90
113#define __raw_readq(a) readq(a) 91#define __raw_readq(a) readq(a)
@@ -117,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
117#define readq readq 95#define readq readq
118#define writeq writeq 96#define writeq writeq
119 97
98#endif
99
120/** 100/**
121 * virt_to_phys - map virtual addresses to physical 101 * virt_to_phys - map virtual addresses to physical
122 * @address: address to remap 102 * @address: address to remap
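
With the split 32-bit fallback gone, this header only provides readq/writeq where the #defines above are emitted, so portable driver code can key off the define itself. A hedged, kernel-context sketch of that pattern (the helper name is illustrative):

#include <linux/io.h>

/* Sketch only: fall back to two 32-bit MMIO reads where readq is not provided. */
static u64 sketch_read64(void __iomem *reg)
{
#ifdef readq
        return readq(reg);                 /* single 64-bit access on x86-64        */
#else
        u32 low  = readl(reg);
        u32 high = readl(reg + 4);         /* non-atomic: two separate 32-bit reads */

        return low | ((u64)high << 32);
#endif
}
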
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index a97a240f67f3..690d1cc9a877 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -105,12 +105,12 @@ struct IR_IO_APIC_route_entry {
105 * # of IO-APICs and # of IRQ routing registers 105 * # of IO-APICs and # of IRQ routing registers
106 */ 106 */
107extern int nr_ioapics; 107extern int nr_ioapics;
108extern int nr_ioapic_registers[MAX_IO_APICS];
109 108
110#define MP_MAX_IOAPIC_PIN 127 109extern int mpc_ioapic_id(int ioapic);
110extern unsigned int mpc_ioapic_addr(int ioapic);
111extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
111 112
112/* I/O APIC entries */ 113#define MP_MAX_IOAPIC_PIN 127
113extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
114 114
115/* # of MP IRQ source entries */ 115/* # of MP IRQ source entries */
116extern int mp_irq_entries; 116extern int mp_irq_entries;
@@ -152,11 +152,9 @@ extern void ioapic_insert_resources(void);
152 152
153int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr); 153int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
154 154
155extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 155extern int save_ioapic_entries(void);
156extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries); 156extern void mask_ioapic_entries(void);
157extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries); 157extern int restore_ioapic_entries(void);
158extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
159extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
160 158
161extern int get_nr_irqs_gsi(void); 159extern int get_nr_irqs_gsi(void);
162 160
@@ -192,19 +190,13 @@ struct io_apic_irq_attr;
192static inline int io_apic_set_pci_routing(struct device *dev, int irq, 190static inline int io_apic_set_pci_routing(struct device *dev, int irq,
193 struct io_apic_irq_attr *irq_attr) { return 0; } 191 struct io_apic_irq_attr *irq_attr) { return 0; }
194 192
195static inline struct IO_APIC_route_entry **alloc_ioapic_entries(void) 193static inline int save_ioapic_entries(void)
196{
197 return NULL;
198}
199
200static inline void free_ioapic_entries(struct IO_APIC_route_entry **ent) { }
201static inline int save_IO_APIC_setup(struct IO_APIC_route_entry **ent)
202{ 194{
203 return -ENOMEM; 195 return -ENOMEM;
204} 196}
205 197
206static inline void mask_IO_APIC_setup(struct IO_APIC_route_entry **ent) { } 198static inline void mask_ioapic_entries(void) { }
207static inline int restore_IO_APIC_setup(struct IO_APIC_route_entry **ent) 199static inline int restore_ioapic_entries(void)
208{ 200{
209 return -ENOMEM; 201 return -ENOMEM;
210} 202}
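
The replacement interface drops the caller-allocated IO_APIC_route_entry arrays entirely; suspend-style users now just call the three helpers in order. A minimal sketch of that usage (the function names and error handling here are illustrative, not taken from this patch):

#include <asm/io_apic.h>

/* Sketch only: using the reworked save/mask/restore helpers around a suspend. */
static int ioapic_quiesce_sketch(void)
{
        int ret = save_ioapic_entries();   /* snapshot the current routing internally   */

        if (ret)
                return ret;                /* -ENOMEM when IO-APIC support is absent    */

        mask_ioapic_entries();             /* mask every pin while we reconfigure       */
        return 0;
}

static void ioapic_unquiesce_sketch(void)
{
        restore_ioapic_entries();          /* write the snapshot back                   */
}
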
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
index 574dbc22893a..a32b18ce6ead 100644
--- a/arch/x86/include/asm/jump_label.h
+++ b/arch/x86/include/asm/jump_label.h
@@ -5,20 +5,25 @@
5 5
6#include <linux/types.h> 6#include <linux/types.h>
7#include <asm/nops.h> 7#include <asm/nops.h>
8#include <asm/asm.h>
8 9
9#define JUMP_LABEL_NOP_SIZE 5 10#define JUMP_LABEL_NOP_SIZE 5
10 11
11# define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t" 12#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
12 13
13# define JUMP_LABEL(key, label) \ 14static __always_inline bool arch_static_branch(struct jump_label_key *key)
14 do { \ 15{
15 asm goto("1:" \ 16 asm goto("1:"
16 JUMP_LABEL_INITIAL_NOP \ 17 JUMP_LABEL_INITIAL_NOP
17 ".pushsection __jump_table, \"aw\" \n\t"\ 18 ".pushsection __jump_table, \"aw\" \n\t"
18 _ASM_PTR "1b, %l[" #label "], %c0 \n\t" \ 19 _ASM_ALIGN "\n\t"
19 ".popsection \n\t" \ 20 _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
20 : : "i" (key) : : label); \ 21 ".popsection \n\t"
21 } while (0) 22 : : "i" (key) : : l_yes);
23 return false;
24l_yes:
25 return true;
26}
22 27
23#endif /* __KERNEL__ */ 28#endif /* __KERNEL__ */
24 29
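
arch_static_branch() now wraps the asm goto site in a boolean-returning inline, so generic code simply branches on its result; the 5-byte nop at "1:" is patched to a jump once the key is enabled. A hedged usage sketch (the key and the slow-path function are illustrative):

#include <linux/jump_label.h>

static struct jump_label_key tracing_key_sketch;     /* illustrative key          */

static void do_tracing_sketch(void) { }              /* illustrative slow path    */

static void hot_path_sketch(void)
{
        if (arch_static_branch(&tracing_key_sketch)) {
                /* out-of-line path, reached only after the nop is patched to a jmp */
                do_tracing_sketch();
                return;
        }
        /* fast path: falls straight through the 5-byte nop */
}
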
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 396f5b5fc4d7..77e95f54570a 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void)
77} 77}
78#define BREAK_INSTR_SIZE 1 78#define BREAK_INSTR_SIZE 1
79#define CACHE_FLUSH_IS_SAFE 1 79#define CACHE_FLUSH_IS_SAFE 1
80#define GDB_ADJUSTS_BREAK_OFFSET
80 81
81extern int kgdb_ll_trap(int cmd, const char *str, 82extern int kgdb_ll_trap(int cmd, const char *str,
82 struct pt_regs *regs, long err, int trap, int sig); 83 struct pt_regs *regs, long err, int trap, int sig);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 0f5213564326..0049211959c0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,8 @@
14#include <asm/desc_defs.h> 14#include <asm/desc_defs.h>
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17enum x86_intercept;
18enum x86_intercept_stage;
17 19
18struct x86_exception { 20struct x86_exception {
19 u8 vector; 21 u8 vector;
@@ -24,6 +26,24 @@ struct x86_exception {
24}; 26};
25 27
26/* 28/*
29 * This struct is used to carry enough information from the instruction
30 * decoder to main KVM so that a decision can be made whether the
31 * instruction needs to be intercepted or not.
32 */
33struct x86_instruction_info {
34 u8 intercept; /* which intercept */
35 u8 rep_prefix; /* rep prefix? */
36 u8 modrm_mod; /* mod part of modrm */
37 u8 modrm_reg; /* index of register used */
38 u8 modrm_rm; /* rm part of modrm */
39 u64 src_val; /* value of source operand */
40 u8 src_bytes; /* size of source operand */
41 u8 dst_bytes; /* size of destination operand */
42 u8 ad_bytes; /* size of src/dst address */
43 u64 next_rip; /* rip following the instruction */
44};
45
46/*
27 * x86_emulate_ops: 47 * x86_emulate_ops:
28 * 48 *
29 * These operations represent the instruction emulator's interface to memory. 49 * These operations represent the instruction emulator's interface to memory.
@@ -62,6 +82,7 @@ struct x86_exception {
62#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ 82#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
63#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ 83#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
64#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ 84#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
85#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
65 86
66struct x86_emulate_ops { 87struct x86_emulate_ops {
67 /* 88 /*
@@ -71,8 +92,9 @@ struct x86_emulate_ops {
71 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 92 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
72 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
73 */ 94 */
74 int (*read_std)(unsigned long addr, void *val, 95 int (*read_std)(struct x86_emulate_ctxt *ctxt,
75 unsigned int bytes, struct kvm_vcpu *vcpu, 96 unsigned long addr, void *val,
97 unsigned int bytes,
76 struct x86_exception *fault); 98 struct x86_exception *fault);
77 99
78 /* 100 /*
@@ -82,8 +104,8 @@ struct x86_emulate_ops {
82 * @val: [OUT] Value write to memory, zero-extended to 'u_long'. 104 * @val: [OUT] Value write to memory, zero-extended to 'u_long'.
83 * @bytes: [IN ] Number of bytes to write to memory. 105 * @bytes: [IN ] Number of bytes to write to memory.
84 */ 106 */
85 int (*write_std)(unsigned long addr, void *val, 107 int (*write_std)(struct x86_emulate_ctxt *ctxt,
86 unsigned int bytes, struct kvm_vcpu *vcpu, 108 unsigned long addr, void *val, unsigned int bytes,
87 struct x86_exception *fault); 109 struct x86_exception *fault);
88 /* 110 /*
89 * fetch: Read bytes of standard (non-emulated/special) memory. 111 * fetch: Read bytes of standard (non-emulated/special) memory.
@@ -92,8 +114,8 @@ struct x86_emulate_ops {
92 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 114 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
93 * @bytes: [IN ] Number of bytes to read from memory. 115 * @bytes: [IN ] Number of bytes to read from memory.
94 */ 116 */
95 int (*fetch)(unsigned long addr, void *val, 117 int (*fetch)(struct x86_emulate_ctxt *ctxt,
96 unsigned int bytes, struct kvm_vcpu *vcpu, 118 unsigned long addr, void *val, unsigned int bytes,
97 struct x86_exception *fault); 119 struct x86_exception *fault);
98 120
99 /* 121 /*
@@ -102,11 +124,9 @@ struct x86_emulate_ops {
102 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 124 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
103 * @bytes: [IN ] Number of bytes to read from memory. 125 * @bytes: [IN ] Number of bytes to read from memory.
104 */ 126 */
105 int (*read_emulated)(unsigned long addr, 127 int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
106 void *val, 128 unsigned long addr, void *val, unsigned int bytes,
107 unsigned int bytes, 129 struct x86_exception *fault);
108 struct x86_exception *fault,
109 struct kvm_vcpu *vcpu);
110 130
111 /* 131 /*
112 * write_emulated: Write bytes to emulated/special memory area. 132 * write_emulated: Write bytes to emulated/special memory area.
@@ -115,11 +135,10 @@ struct x86_emulate_ops {
115 * required). 135 * required).
116 * @bytes: [IN ] Number of bytes to write to memory. 136 * @bytes: [IN ] Number of bytes to write to memory.
117 */ 137 */
118 int (*write_emulated)(unsigned long addr, 138 int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
119 const void *val, 139 unsigned long addr, const void *val,
120 unsigned int bytes, 140 unsigned int bytes,
121 struct x86_exception *fault, 141 struct x86_exception *fault);
122 struct kvm_vcpu *vcpu);
123 142
124 /* 143 /*
125 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an 144 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -129,40 +148,54 @@ struct x86_emulate_ops {
129 * @new: [IN ] Value to write to @addr. 148 * @new: [IN ] Value to write to @addr.
130 * @bytes: [IN ] Number of bytes to access using CMPXCHG. 149 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
131 */ 150 */
132 int (*cmpxchg_emulated)(unsigned long addr, 151 int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
152 unsigned long addr,
133 const void *old, 153 const void *old,
134 const void *new, 154 const void *new,
135 unsigned int bytes, 155 unsigned int bytes,
136 struct x86_exception *fault, 156 struct x86_exception *fault);
137 struct kvm_vcpu *vcpu); 157 void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
138 158
139 int (*pio_in_emulated)(int size, unsigned short port, void *val, 159 int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
140 unsigned int count, struct kvm_vcpu *vcpu); 160 int size, unsigned short port, void *val,
141 161 unsigned int count);
142 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 162
143 unsigned int count, struct kvm_vcpu *vcpu); 163 int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
144 164 int size, unsigned short port, const void *val,
145 bool (*get_cached_descriptor)(struct desc_struct *desc, u32 *base3, 165 unsigned int count);
146 int seg, struct kvm_vcpu *vcpu); 166
147 void (*set_cached_descriptor)(struct desc_struct *desc, u32 base3, 167 bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
148 int seg, struct kvm_vcpu *vcpu); 168 struct desc_struct *desc, u32 *base3, int seg);
149 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 169 void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
150 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 170 struct desc_struct *desc, u32 base3, int seg);
151 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 171 unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
152 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 172 int seg);
153 void (*get_idt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 173 void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
154 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 174 void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
155 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 175 void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
156 int (*cpl)(struct kvm_vcpu *vcpu); 176 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
157 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); 177 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
158 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); 178 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
159 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 179 int (*cpl)(struct x86_emulate_ctxt *ctxt);
160 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 180 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
184 void (*halt)(struct x86_emulate_ctxt *ctxt);
185 void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
186 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
187 void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
188 void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
189 int (*intercept)(struct x86_emulate_ctxt *ctxt,
190 struct x86_instruction_info *info,
191 enum x86_intercept_stage stage);
161}; 192};
162 193
194typedef u32 __attribute__((vector_size(16))) sse128_t;
195
163/* Type, address-of, and value of an instruction's operand. */ 196/* Type, address-of, and value of an instruction's operand. */
164struct operand { 197struct operand {
165 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 198 enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
166 unsigned int bytes; 199 unsigned int bytes;
167 union { 200 union {
168 unsigned long orig_val; 201 unsigned long orig_val;
@@ -174,11 +207,13 @@ struct operand {
174 ulong ea; 207 ulong ea;
175 unsigned seg; 208 unsigned seg;
176 } mem; 209 } mem;
210 unsigned xmm;
177 } addr; 211 } addr;
178 union { 212 union {
179 unsigned long val; 213 unsigned long val;
180 u64 val64; 214 u64 val64;
181 char valptr[sizeof(unsigned long) + 2]; 215 char valptr[sizeof(unsigned long) + 2];
216 sse128_t vec_val;
182 }; 217 };
183}; 218};
184 219
@@ -197,6 +232,7 @@ struct read_cache {
197struct decode_cache { 232struct decode_cache {
198 u8 twobyte; 233 u8 twobyte;
199 u8 b; 234 u8 b;
235 u8 intercept;
200 u8 lock_prefix; 236 u8 lock_prefix;
201 u8 rep_prefix; 237 u8 rep_prefix;
202 u8 op_bytes; 238 u8 op_bytes;
@@ -209,6 +245,7 @@ struct decode_cache {
209 u8 seg_override; 245 u8 seg_override;
210 unsigned int d; 246 unsigned int d;
211 int (*execute)(struct x86_emulate_ctxt *ctxt); 247 int (*execute)(struct x86_emulate_ctxt *ctxt);
248 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
212 unsigned long regs[NR_VCPU_REGS]; 249 unsigned long regs[NR_VCPU_REGS];
213 unsigned long eip; 250 unsigned long eip;
214 /* modrm */ 251 /* modrm */
@@ -227,17 +264,15 @@ struct x86_emulate_ctxt {
227 struct x86_emulate_ops *ops; 264 struct x86_emulate_ops *ops;
228 265
229 /* Register state before/after emulation. */ 266 /* Register state before/after emulation. */
230 struct kvm_vcpu *vcpu;
231
232 unsigned long eflags; 267 unsigned long eflags;
233 unsigned long eip; /* eip before instruction emulation */ 268 unsigned long eip; /* eip before instruction emulation */
234 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
235 int mode; 270 int mode;
236 u32 cs_base;
237 271
238 /* interruptibility state, as a result of execution of STI or MOV SS */ 272 /* interruptibility state, as a result of execution of STI or MOV SS */
239 int interruptibility; 273 int interruptibility;
240 274
275 bool guest_mode; /* guest running a nested guest */
241 bool perm_ok; /* do not check permissions if true */ 276 bool perm_ok; /* do not check permissions if true */
242 bool only_vendor_specific_insn; 277 bool only_vendor_specific_insn;
243 278
@@ -249,8 +284,8 @@ struct x86_emulate_ctxt {
249}; 284};
250 285
251/* Repeat String Operation Prefix */ 286/* Repeat String Operation Prefix */
252#define REPE_PREFIX 1 287#define REPE_PREFIX 0xf3
253#define REPNE_PREFIX 2 288#define REPNE_PREFIX 0xf2
254 289
255/* Execution mode, passed to the emulator. */ 290/* Execution mode, passed to the emulator. */
256#define X86EMUL_MODE_REAL 0 /* Real mode. */ 291#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -259,6 +294,69 @@ struct x86_emulate_ctxt {
259#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 294#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
260#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 295#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
261 296
297/* any protected mode */
298#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
299 X86EMUL_MODE_PROT64)
300
301enum x86_intercept_stage {
302 X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
303 X86_ICPT_PRE_EXCEPT,
304 X86_ICPT_POST_EXCEPT,
305 X86_ICPT_POST_MEMACCESS,
306};
307
308enum x86_intercept {
309 x86_intercept_none,
310 x86_intercept_cr_read,
311 x86_intercept_cr_write,
312 x86_intercept_clts,
313 x86_intercept_lmsw,
314 x86_intercept_smsw,
315 x86_intercept_dr_read,
316 x86_intercept_dr_write,
317 x86_intercept_lidt,
318 x86_intercept_sidt,
319 x86_intercept_lgdt,
320 x86_intercept_sgdt,
321 x86_intercept_lldt,
322 x86_intercept_sldt,
323 x86_intercept_ltr,
324 x86_intercept_str,
325 x86_intercept_rdtsc,
326 x86_intercept_rdpmc,
327 x86_intercept_pushf,
328 x86_intercept_popf,
329 x86_intercept_cpuid,
330 x86_intercept_rsm,
331 x86_intercept_iret,
332 x86_intercept_intn,
333 x86_intercept_invd,
334 x86_intercept_pause,
335 x86_intercept_hlt,
336 x86_intercept_invlpg,
337 x86_intercept_invlpga,
338 x86_intercept_vmrun,
339 x86_intercept_vmload,
340 x86_intercept_vmsave,
341 x86_intercept_vmmcall,
342 x86_intercept_stgi,
343 x86_intercept_clgi,
344 x86_intercept_skinit,
345 x86_intercept_rdtscp,
346 x86_intercept_icebp,
347 x86_intercept_wbinvd,
348 x86_intercept_monitor,
349 x86_intercept_mwait,
350 x86_intercept_rdmsr,
351 x86_intercept_wrmsr,
352 x86_intercept_in,
353 x86_intercept_ins,
354 x86_intercept_out,
355 x86_intercept_outs,
356
357 nr_x86_intercepts
358};
359
262/* Host execution mode. */ 360/* Host execution mode. */
263#if defined(CONFIG_X86_32) 361#if defined(CONFIG_X86_32)
264#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 362#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -270,6 +368,7 @@ int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
270#define EMULATION_FAILED -1 368#define EMULATION_FAILED -1
271#define EMULATION_OK 0 369#define EMULATION_OK 0
272#define EMULATION_RESTART 1 370#define EMULATION_RESTART 1
371#define EMULATION_INTERCEPTED 2
273int x86_emulate_insn(struct x86_emulate_ctxt *ctxt); 372int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
274int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 373int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
275 u16 tss_selector, int reason, 374 u16 tss_selector, int reason,
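
Every callback now receives the x86_emulate_ctxt instead of a kvm_vcpu, and the new ->intercept() hook lets a nested hypervisor claim an instruction before it runs. A hedged sketch of how the emulator core could consult it (the wrapper and the way the info struct is filled are illustrative):

#include <asm/kvm_emulate.h>

/* Sketch only: ask the ->intercept() hook whether to hand the instruction to L1. */
static int check_intercept_sketch(struct x86_emulate_ctxt *ctxt, u8 intercept)
{
        struct x86_instruction_info info = {
                .intercept = intercept,        /* which intercept the decoder recorded */
        };

        if (ctxt->ops->intercept(ctxt, &info, X86_ICPT_PRE_EXCEPT) == X86EMUL_INTERCEPTED)
                return EMULATION_INTERCEPTED;  /* stop: the guest hypervisor takes over */

        return EMULATION_OK;
}
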
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c8af0991fdf0..d2ac8e2ee897 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
30#define KVM_MEMORY_SLOTS 32 30#define KVM_MEMORY_SLOTS 32
31/* memory slots that does not exposed to userspace */ 31/* memory slots that does not exposed to userspace */
32#define KVM_PRIVATE_MEM_SLOTS 4 32#define KVM_PRIVATE_MEM_SLOTS 4
33#define KVM_MMIO_SIZE 16
33 34
34#define KVM_PIO_PAGE_OFFSET 1 35#define KVM_PIO_PAGE_OFFSET 1
35#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 36#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
36 37
38#define CR0_RESERVED_BITS \
39 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
40 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
41 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
42
37#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 43#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
38#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 44#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
39#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 45#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
40 0xFFFFFF0000000000ULL) 46 0xFFFFFF0000000000ULL)
47#define CR4_RESERVED_BITS \
48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
51 | X86_CR4_OSXSAVE \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
55
56
41 57
42#define INVALID_PAGE (~(hpa_t)0) 58#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE) 59#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -118,6 +134,9 @@ enum kvm_reg {
118enum kvm_reg_ex { 134enum kvm_reg_ex {
119 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 135 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
120 VCPU_EXREG_CR3, 136 VCPU_EXREG_CR3,
137 VCPU_EXREG_RFLAGS,
138 VCPU_EXREG_CPL,
139 VCPU_EXREG_SEGMENTS,
121}; 140};
122 141
123enum { 142enum {
@@ -256,7 +275,7 @@ struct kvm_mmu {
256 struct kvm_mmu_page *sp); 275 struct kvm_mmu_page *sp);
257 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 276 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
258 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 277 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
259 u64 *spte, const void *pte, unsigned long mmu_seq); 278 u64 *spte, const void *pte);
260 hpa_t root_hpa; 279 hpa_t root_hpa;
261 int root_level; 280 int root_level;
262 int shadow_root_level; 281 int shadow_root_level;
@@ -340,7 +359,6 @@ struct kvm_vcpu_arch {
340 struct fpu guest_fpu; 359 struct fpu guest_fpu;
341 u64 xcr0; 360 u64 xcr0;
342 361
343 gva_t mmio_fault_cr2;
344 struct kvm_pio_request pio; 362 struct kvm_pio_request pio;
345 void *pio_data; 363 void *pio_data;
346 364
@@ -367,18 +385,22 @@ struct kvm_vcpu_arch {
367 /* emulate context */ 385 /* emulate context */
368 386
369 struct x86_emulate_ctxt emulate_ctxt; 387 struct x86_emulate_ctxt emulate_ctxt;
388 bool emulate_regs_need_sync_to_vcpu;
389 bool emulate_regs_need_sync_from_vcpu;
370 390
371 gpa_t time; 391 gpa_t time;
372 struct pvclock_vcpu_time_info hv_clock; 392 struct pvclock_vcpu_time_info hv_clock;
373 unsigned int hw_tsc_khz; 393 unsigned int hw_tsc_khz;
374 unsigned int time_offset; 394 unsigned int time_offset;
375 struct page *time_page; 395 struct page *time_page;
376 u64 last_host_tsc;
377 u64 last_guest_tsc; 396 u64 last_guest_tsc;
378 u64 last_kernel_ns; 397 u64 last_kernel_ns;
379 u64 last_tsc_nsec; 398 u64 last_tsc_nsec;
380 u64 last_tsc_write; 399 u64 last_tsc_write;
400 u32 virtual_tsc_khz;
381 bool tsc_catchup; 401 bool tsc_catchup;
402 u32 tsc_catchup_mult;
403 s8 tsc_catchup_shift;
382 404
383 bool nmi_pending; 405 bool nmi_pending;
384 bool nmi_injected; 406 bool nmi_injected;
@@ -448,9 +470,6 @@ struct kvm_arch {
448 u64 last_tsc_nsec; 470 u64 last_tsc_nsec;
449 u64 last_tsc_offset; 471 u64 last_tsc_offset;
450 u64 last_tsc_write; 472 u64 last_tsc_write;
451 u32 virtual_tsc_khz;
452 u32 virtual_tsc_mult;
453 s8 virtual_tsc_shift;
454 473
455 struct kvm_xen_hvm_config xen_hvm_config; 474 struct kvm_xen_hvm_config xen_hvm_config;
456 475
@@ -502,6 +521,8 @@ struct kvm_vcpu_stat {
502 u32 nmi_injections; 521 u32 nmi_injections;
503}; 522};
504 523
524struct x86_instruction_info;
525
505struct kvm_x86_ops { 526struct kvm_x86_ops {
506 int (*cpu_has_kvm_support)(void); /* __init */ 527 int (*cpu_has_kvm_support)(void); /* __init */
507 int (*disabled_by_bios)(void); /* __init */ 528 int (*disabled_by_bios)(void); /* __init */
@@ -586,9 +607,17 @@ struct kvm_x86_ops {
586 607
587 bool (*has_wbinvd_exit)(void); 608 bool (*has_wbinvd_exit)(void);
588 609
610 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
589 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset); 611 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
590 612
613 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
614
591 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2); 615 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
616
617 int (*check_intercept)(struct kvm_vcpu *vcpu,
618 struct x86_instruction_info *info,
619 enum x86_intercept_stage stage);
620
592 const struct trace_print_flags *exit_reasons_str; 621 const struct trace_print_flags *exit_reasons_str;
593}; 622};
594 623
@@ -627,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
627 656
628extern bool tdp_enabled; 657extern bool tdp_enabled;
629 658
659/* control of guest tsc rate supported? */
660extern bool kvm_has_tsc_control;
661/* minimum supported tsc_khz for guests */
662extern u32 kvm_min_guest_tsc_khz;
663/* maximum supported tsc_khz for guests */
664extern u32 kvm_max_guest_tsc_khz;
665
630enum emulation_result { 666enum emulation_result {
631 EMULATE_DONE, /* no further processing */ 667 EMULATE_DONE, /* no further processing */
632 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 668 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
@@ -645,9 +681,6 @@ static inline int emulate_instruction(struct kvm_vcpu *vcpu,
645 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0); 681 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
646} 682}
647 683
648void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
649void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
650
651void kvm_enable_efer_bits(u64); 684void kvm_enable_efer_bits(u64);
652int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 685int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
653int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 686int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
@@ -657,8 +690,6 @@ struct x86_emulate_ctxt;
657int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 690int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
658void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
659int kvm_emulate_halt(struct kvm_vcpu *vcpu); 692int kvm_emulate_halt(struct kvm_vcpu *vcpu);
660int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
661int emulate_clts(struct kvm_vcpu *vcpu);
662int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 693int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
663 694
664void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 695void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -721,8 +752,6 @@ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
721 752
722int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 753int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
723 754
724int kvm_fix_hypercall(struct kvm_vcpu *vcpu);
725
726int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code, 755int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
727 void *insn, int insn_len); 756 void *insn, int insn_len);
728void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 757void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
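
The CR0/CR4/CR8 reserved-bit masks now live here alongside the rest of the arch state; the usual consumer pattern is to refuse any guest control-register write that sets a reserved bit. A minimal sketch (the helper name is illustrative; the caller would inject #GP on failure):

#include <asm/kvm_host.h>

/* Sketch only: validate a guest-supplied CR0 value against the reserved-bit mask. */
static inline bool cr0_value_valid_sketch(unsigned long cr0)
{
        return (cr0 & CR0_RESERVED_BITS) == 0;
}
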
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb6..48142971b25d 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,11 +8,6 @@
8 8
9#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
11/*
12 * For 32-bit UML - mark functions implemented in assembly that use
13 * regparm input parameters:
14 */
15#define asmregparm __attribute__((regparm(3)))
16 11
17/* 12/*
18 * Make sure the compiler doesn't do anything stupid with the 13 * Make sure the compiler doesn't do anything stupid with the
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index eb16e94ae04f..021979a6e23f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
142static inline void enable_p5_mce(void) {} 142static inline void enable_p5_mce(void) {}
143#endif 143#endif
144 144
145extern void (*x86_mce_decode_callback)(struct mce *m);
146
147void mce_setup(struct mce *m); 145void mce_setup(struct mce *m);
148void mce_log(struct mce *m); 146void mce_log(struct mce *m);
149DECLARE_PER_CPU(struct sys_device, mce_dev); 147DECLARE_PER_CPU(struct sys_device, mce_dev);
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index aeff3e89b222..5f55e6962769 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -11,14 +11,14 @@
11typedef struct { 11typedef struct {
12 void *ldt; 12 void *ldt;
13 int size; 13 int size;
14 struct mutex lock;
15 void *vdso;
16 14
17#ifdef CONFIG_X86_64 15#ifdef CONFIG_X86_64
18 /* True if mm supports a task running in 32 bit compatibility mode. */ 16 /* True if mm supports a task running in 32 bit compatibility mode. */
19 unsigned short ia32_compat; 17 unsigned short ia32_compat;
20#endif 18#endif
21 19
20 struct mutex lock;
21 void *vdso;
22} mm_context_t; 22} mm_context_t;
23 23
24#ifdef CONFIG_SMP 24#ifdef CONFIG_SMP
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806c..5e83a416eca8 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid]) 13#define NODE_DATA(nid) (node_data[nid])
14 14
15#include <asm/numaq.h> 15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows any one NUMA architecture to be compiled
22 * for, and still fall back to the flat function if it
23 * fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34 16
35extern void resume_map_numa_kva(pgd_t *pgd); 17extern void resume_map_numa_kva(pgd_t *pgd);
36 18
37#else /* !CONFIG_NUMA */ 19#else /* !CONFIG_NUMA */
38 20
39#define get_memcfg_numa get_memcfg_numa_flat
40
41static inline void resume_map_numa_kva(pgd_t *pgd) {} 21static inline void resume_map_numa_kva(pgd_t *pgd) {}
42 22
43#endif /* CONFIG_NUMA */ 23#endif /* CONFIG_NUMA */
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a6..b3f88d7867c7 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,36 +4,13 @@
4#ifndef _ASM_X86_MMZONE_64_H 4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H 5#define _ASM_X86_MMZONE_64_H
6 6
7
8#ifdef CONFIG_NUMA 7#ifdef CONFIG_NUMA
9 8
10#include <linux/mmdebug.h> 9#include <linux/mmdebug.h>
11
12#include <asm/smp.h> 10#include <asm/smp.h>
13 11
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[]; 12extern struct pglist_data *node_data[];
27 13
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid]) 14#define NODE_DATA(nid) (node_data[nid])
38 15
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 16#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 67763c5d8b4e..9eae7752ae9b 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
35#define MODULE_PROC_FAMILY "K7 " 35#define MODULE_PROC_FAMILY "K7 "
36#elif defined CONFIG_MK8 36#elif defined CONFIG_MK8
37#define MODULE_PROC_FAMILY "K8 " 37#define MODULE_PROC_FAMILY "K8 "
38#elif defined CONFIG_X86_ELAN 38#elif defined CONFIG_MELAN
39#define MODULE_PROC_FAMILY "ELAN " 39#define MODULE_PROC_FAMILY "ELAN "
40#elif defined CONFIG_MCRUSOE 40#elif defined CONFIG_MCRUSOE
41#define MODULE_PROC_FAMILY "CRUSOE " 41#define MODULE_PROC_FAMILY "CRUSOE "
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 3cce71413d0b..485b4f1f079b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -118,6 +118,7 @@
118 complete list. */ 118 complete list. */
119 119
120#define MSR_AMD64_PATCH_LEVEL 0x0000008b 120#define MSR_AMD64_PATCH_LEVEL 0x0000008b
121#define MSR_AMD64_TSC_RATIO 0xc0000104
121#define MSR_AMD64_NB_CFG 0xc001001f 122#define MSR_AMD64_NB_CFG 0xc001001f
122#define MSR_AMD64_PATCH_LOADER 0xc0010020 123#define MSR_AMD64_PATCH_LOADER 0xc0010020
123#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 124#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index af788496020b..405b4032a60b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,7 +1,13 @@
1#ifndef _ASM_X86_NOPS_H 1#ifndef _ASM_X86_NOPS_H
2#define _ASM_X86_NOPS_H 2#define _ASM_X86_NOPS_H
3 3
4/* Define nops for use with alternative() */ 4/*
5 * Define nops for use with alternative() and for tracing.
6 *
7 * *_NOP5_ATOMIC must be a single instruction.
8 */
9
10#define NOP_DS_PREFIX 0x3e
5 11
6/* generic versions from gas 12/* generic versions from gas
7 1: nop 13 1: nop
@@ -13,14 +19,15 @@
13 6: leal 0x00000000(%esi),%esi 19 6: leal 0x00000000(%esi),%esi
14 7: leal 0x00000000(,%esi,1),%esi 20 7: leal 0x00000000(,%esi,1),%esi
15*/ 21*/
16#define GENERIC_NOP1 ".byte 0x90\n" 22#define GENERIC_NOP1 0x90
17#define GENERIC_NOP2 ".byte 0x89,0xf6\n" 23#define GENERIC_NOP2 0x89,0xf6
18#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" 24#define GENERIC_NOP3 0x8d,0x76,0x00
19#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" 25#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
20#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 26#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
21#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" 27#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
22#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" 28#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
23#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 29#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
30#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
24 31
25/* Opteron 64bit nops 32/* Opteron 64bit nops
26 1: nop 33 1: nop
@@ -29,13 +36,14 @@
29 4: osp osp osp nop 36 4: osp osp osp nop
30*/ 37*/
31#define K8_NOP1 GENERIC_NOP1 38#define K8_NOP1 GENERIC_NOP1
32#define K8_NOP2 ".byte 0x66,0x90\n" 39#define K8_NOP2 0x66,K8_NOP1
33#define K8_NOP3 ".byte 0x66,0x66,0x90\n" 40#define K8_NOP3 0x66,K8_NOP2
34#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" 41#define K8_NOP4 0x66,K8_NOP3
35#define K8_NOP5 K8_NOP3 K8_NOP2 42#define K8_NOP5 K8_NOP3,K8_NOP2
36#define K8_NOP6 K8_NOP3 K8_NOP3 43#define K8_NOP6 K8_NOP3,K8_NOP3
37#define K8_NOP7 K8_NOP4 K8_NOP3 44#define K8_NOP7 K8_NOP4,K8_NOP3
38#define K8_NOP8 K8_NOP4 K8_NOP4 45#define K8_NOP8 K8_NOP4,K8_NOP4
46#define K8_NOP5_ATOMIC 0x66,K8_NOP4
39 47
40/* K7 nops 48/* K7 nops
41 uses eax dependencies (arbitrary choice) 49 uses eax dependencies (arbitrary choice)
@@ -47,13 +55,14 @@
47 7: leal 0x00000000(,%eax,1),%eax 55 7: leal 0x00000000(,%eax,1),%eax
48*/ 56*/
49#define K7_NOP1 GENERIC_NOP1 57#define K7_NOP1 GENERIC_NOP1
50#define K7_NOP2 ".byte 0x8b,0xc0\n" 58#define K7_NOP2 0x8b,0xc0
51#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" 59#define K7_NOP3 0x8d,0x04,0x20
52#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" 60#define K7_NOP4 0x8d,0x44,0x20,0x00
53#define K7_NOP5 K7_NOP4 ASM_NOP1 61#define K7_NOP5 K7_NOP4,K7_NOP1
54#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" 62#define K7_NOP6 0x8d,0x80,0,0,0,0
55#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" 63#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
56#define K7_NOP8 K7_NOP7 ASM_NOP1 64#define K7_NOP8 K7_NOP7,K7_NOP1
65#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
57 66
58/* P6 nops 67/* P6 nops
59 uses eax dependencies (Intel-recommended choice) 68 uses eax dependencies (Intel-recommended choice)
@@ -69,52 +78,65 @@
69 There is kernel code that depends on this. 78 There is kernel code that depends on this.
70*/ 79*/
71#define P6_NOP1 GENERIC_NOP1 80#define P6_NOP1 GENERIC_NOP1
72#define P6_NOP2 ".byte 0x66,0x90\n" 81#define P6_NOP2 0x66,0x90
73#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" 82#define P6_NOP3 0x0f,0x1f,0x00
74#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" 83#define P6_NOP4 0x0f,0x1f,0x40,0
75#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" 84#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
76#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" 85#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
77#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" 86#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
78#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" 87#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
88#define P6_NOP5_ATOMIC P6_NOP5
89
90#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
79 91
80#if defined(CONFIG_MK7) 92#if defined(CONFIG_MK7)
81#define ASM_NOP1 K7_NOP1 93#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
82#define ASM_NOP2 K7_NOP2 94#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
83#define ASM_NOP3 K7_NOP3 95#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
84#define ASM_NOP4 K7_NOP4 96#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
85#define ASM_NOP5 K7_NOP5 97#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
86#define ASM_NOP6 K7_NOP6 98#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
87#define ASM_NOP7 K7_NOP7 99#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
88#define ASM_NOP8 K7_NOP8 100#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
101#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
89#elif defined(CONFIG_X86_P6_NOP) 102#elif defined(CONFIG_X86_P6_NOP)
90#define ASM_NOP1 P6_NOP1 103#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
91#define ASM_NOP2 P6_NOP2 104#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
92#define ASM_NOP3 P6_NOP3 105#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
93#define ASM_NOP4 P6_NOP4 106#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
94#define ASM_NOP5 P6_NOP5 107#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
95#define ASM_NOP6 P6_NOP6 108#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
96#define ASM_NOP7 P6_NOP7 109#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
97#define ASM_NOP8 P6_NOP8 110#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
111#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
98#elif defined(CONFIG_X86_64) 112#elif defined(CONFIG_X86_64)
99#define ASM_NOP1 K8_NOP1 113#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
100#define ASM_NOP2 K8_NOP2 114#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
101#define ASM_NOP3 K8_NOP3 115#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
102#define ASM_NOP4 K8_NOP4 116#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
103#define ASM_NOP5 K8_NOP5 117#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
104#define ASM_NOP6 K8_NOP6 118#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
105#define ASM_NOP7 K8_NOP7 119#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
106#define ASM_NOP8 K8_NOP8 120#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
121#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
107#else 122#else
108#define ASM_NOP1 GENERIC_NOP1 123#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
109#define ASM_NOP2 GENERIC_NOP2 124#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
110#define ASM_NOP3 GENERIC_NOP3 125#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
111#define ASM_NOP4 GENERIC_NOP4 126#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
112#define ASM_NOP5 GENERIC_NOP5 127#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
113#define ASM_NOP6 GENERIC_NOP6 128#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
114#define ASM_NOP7 GENERIC_NOP7 129#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
115#define ASM_NOP8 GENERIC_NOP8 130#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
131#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
116#endif 132#endif
117 133
118#define ASM_NOP_MAX 8 134#define ASM_NOP_MAX 8
135#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
136
137#ifndef __ASSEMBLY__
138extern const unsigned char * const *ideal_nops;
139extern void arch_init_ideal_nops(void);
140#endif
119 141
120#endif /* _ASM_X86_NOPS_H */ 142#endif /* _ASM_X86_NOPS_H */
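
Storing the NOPs as bare byte lists lets one definition serve two consumers: _ASM_MK_NOP() stringifies a list back into a ".byte" directive for inline assembly, while the same bytes can populate the arrays behind the new ideal_nops table. A stand-alone re-creation of the stringify step (the local __stringify macros mimic the kernel's):

#include <stdio.h>

#define __stringify_1(x...)     #x
#define __stringify(x...)       __stringify_1(x)
#define _ASM_MK_NOP(x)          ".byte " __stringify(x) "\n"

#define P6_NOP5                 0x0f,0x1f,0x44,0x00,0   /* byte list copied from the header */

int main(void)
{
        fputs(_ASM_MK_NOP(P6_NOP5), stdout);    /* prints: .byte 0x0f,0x1f,0x44,0x00,0 */
        return 0;
}
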
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index a50fc9f493b3..bfacd2ccf651 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,12 +1,24 @@
1#ifndef _ASM_X86_NUMA_H 1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H 2#define _ASM_X86_NUMA_H
3 3
4#include <linux/nodemask.h>
5
4#include <asm/topology.h> 6#include <asm/topology.h>
5#include <asm/apicdef.h> 7#include <asm/apicdef.h>
6 8
7#ifdef CONFIG_NUMA 9#ifdef CONFIG_NUMA
8 10
9#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 11#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
12#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
13
14/*
15 * Too small node sizes may confuse the VM badly. Usually they
16 * result from BIOS bugs. So dont recognize nodes as standalone
17 * NUMA entities that have less than this amount of RAM listed:
18 */
19#define NODE_MIN_SIZE (4*1024*1024)
20
21extern int numa_off;
10 22
11/* 23/*
12 * __apicid_to_node[] stores the raw mapping between physical apicid and 24 * __apicid_to_node[] stores the raw mapping between physical apicid and
@@ -17,15 +29,27 @@
17 * numa_cpu_node(). 29 * numa_cpu_node().
18 */ 30 */
19extern s16 __apicid_to_node[MAX_LOCAL_APIC]; 31extern s16 __apicid_to_node[MAX_LOCAL_APIC];
32extern nodemask_t numa_nodes_parsed __initdata;
33
34extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
35extern void __init numa_set_distance(int from, int to, int distance);
20 36
21static inline void set_apicid_to_node(int apicid, s16 node) 37static inline void set_apicid_to_node(int apicid, s16 node)
22{ 38{
23 __apicid_to_node[apicid] = node; 39 __apicid_to_node[apicid] = node;
24} 40}
41
42extern int __cpuinit numa_cpu_node(int cpu);
43
25#else /* CONFIG_NUMA */ 44#else /* CONFIG_NUMA */
26static inline void set_apicid_to_node(int apicid, s16 node) 45static inline void set_apicid_to_node(int apicid, s16 node)
27{ 46{
28} 47}
48
49static inline int numa_cpu_node(int cpu)
50{
51 return NUMA_NO_NODE;
52}
29#endif /* CONFIG_NUMA */ 53#endif /* CONFIG_NUMA */
30 54
31#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
@@ -37,14 +61,12 @@ static inline void set_apicid_to_node(int apicid, s16 node)
37#ifdef CONFIG_NUMA 61#ifdef CONFIG_NUMA
38extern void __cpuinit numa_set_node(int cpu, int node); 62extern void __cpuinit numa_set_node(int cpu, int node);
39extern void __cpuinit numa_clear_node(int cpu); 63extern void __cpuinit numa_clear_node(int cpu);
40extern void __init numa_init_array(void);
41extern void __init init_cpu_to_node(void); 64extern void __init init_cpu_to_node(void);
42extern void __cpuinit numa_add_cpu(int cpu); 65extern void __cpuinit numa_add_cpu(int cpu);
43extern void __cpuinit numa_remove_cpu(int cpu); 66extern void __cpuinit numa_remove_cpu(int cpu);
44#else /* CONFIG_NUMA */ 67#else /* CONFIG_NUMA */
45static inline void numa_set_node(int cpu, int node) { } 68static inline void numa_set_node(int cpu, int node) { }
46static inline void numa_clear_node(int cpu) { } 69static inline void numa_clear_node(int cpu) { }
47static inline void numa_init_array(void) { }
48static inline void init_cpu_to_node(void) { } 70static inline void init_cpu_to_node(void) { }
49static inline void numa_add_cpu(int cpu) { } 71static inline void numa_add_cpu(int cpu) { }
50static inline void numa_remove_cpu(int cpu) { } 72static inline void numa_remove_cpu(int cpu) { }
@@ -54,4 +76,10 @@ static inline void numa_remove_cpu(int cpu) { }
54void debug_cpumask_set_cpu(int cpu, int node, bool enable); 76void debug_cpumask_set_cpu(int cpu, int node, bool enable);
55#endif 77#endif
56 78
79#ifdef CONFIG_NUMA_EMU
80#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
81#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
82void numa_emu_cmdline(char *);
83#endif /* CONFIG_NUMA_EMU */
84
57#endif /* _ASM_X86_NUMA_H */ 85#endif /* _ASM_X86_NUMA_H */
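
With numa_nodes_parsed, numa_add_memblk() and numa_set_distance() declared in the common header, the 32-bit and 64-bit NUMA setup paths share one registration interface. A hedged sketch of how a firmware-table parser could feed it (the node id, memory range and distance below are illustrative):

#include <asm/numa.h>

/* Sketch only: registering one node the way an SRAT-style parser would. */
static int __init sketch_numa_parse(void)
{
        node_set(0, numa_nodes_parsed);                 /* firmware described node 0       */

        if (numa_add_memblk(0, 0, 0x40000000ULL))       /* first 1 GiB belongs to node 0   */
                return -EINVAL;

        numa_set_distance(0, 0, 10);                    /* local distance, per ACPI SLIT   */
        return 0;
}
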
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index c6beed1ef103..e7d6b8254742 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,16 +1,6 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int numa_off;
5
6extern int pxm_to_nid(int pxm);
7
8#ifdef CONFIG_NUMA
9extern int __cpuinit numa_cpu_node(int cpu);
10#else /* CONFIG_NUMA */
11static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
12#endif /* CONFIG_NUMA */
13
14#ifdef CONFIG_HIGHMEM 4#ifdef CONFIG_HIGHMEM
15extern void set_highmem_pages_init(void); 5extern void set_highmem_pages_init(void);
16#else 6#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 344eb1790b46..0c05f7ae46e8 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,42 +1,6 @@
1#ifndef _ASM_X86_NUMA_64_H 1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h>
5
6struct bootnode {
7 u64 start;
8 u64 end;
9};
10
11#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
12
13extern int numa_off;
14
15extern unsigned long numa_free_all_bootmem(void); 4extern unsigned long numa_free_all_bootmem(void);
16extern void setup_node_bootmem(int nodeid, unsigned long start,
17 unsigned long end);
18
19#ifdef CONFIG_NUMA
20/*
21 * Too small node sizes may confuse the VM badly. Usually they
22 * result from BIOS bugs. So dont recognize nodes as standalone
23 * NUMA entities that have less than this amount of RAM listed:
24 */
25#define NODE_MIN_SIZE (4*1024*1024)
26
27extern nodemask_t numa_nodes_parsed __initdata;
28
29extern int __cpuinit numa_cpu_node(int cpu);
30extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
31extern void __init numa_set_distance(int from, int to, int distance);
32
33#ifdef CONFIG_NUMA_EMU
34#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
35#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
36void numa_emu_cmdline(char *);
37#endif /* CONFIG_NUMA_EMU */
38#else
39static inline int numa_cpu_node(int cpu) { return NUMA_NO_NODE; }
40#endif
41 5
42#endif /* _ASM_X86_NUMA_64_H */ 6#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec8..c3b3c322fd87 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
29#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int numaq_numa_init(void);
33extern int pci_numaq_init(void); 33extern int pci_numaq_init(void);
34 34
35extern void *xquad_portio; 35extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
166 166
167void numaq_tsc_disable(void); 167void numaq_tsc_disable(void);
168 168
169#else
170static inline int get_memcfg_numaq(void)
171{
172 return 0;
173}
174#endif /* CONFIG_X86_NUMAQ */ 169#endif /* CONFIG_X86_NUMAQ */
175#endif /* _ASM_X86_NUMAQ_H */ 170#endif /* _ASM_X86_NUMAQ_H */
176 171
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index c5d3a5abbb9f..24487712e0b1 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -26,15 +26,12 @@ extern void setup_olpc_ofw_pgd(void);
26/* check if OFW was detected during boot */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void); 27extern bool olpc_ofw_present(void);
28 28
29extern void olpc_dt_build_devicetree(void);
30
29#else /* !CONFIG_OLPC */ 31#else /* !CONFIG_OLPC */
30static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
31static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
32#endif /* !CONFIG_OLPC */
33
34#ifdef CONFIG_OF_PROMTREE
35extern void olpc_dt_build_devicetree(void);
36#else
37static inline void olpc_dt_build_devicetree(void) { } 34static inline void olpc_dt_build_devicetree(void) { }
38#endif 35#endif /* !CONFIG_OLPC */
39 36
40#endif /* _ASM_X86_OLPC_OFW_H */ 37#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index 676129229630..d498943b906c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -135,8 +135,6 @@ void default_teardown_msi_irqs(struct pci_dev *dev);
135#include "pci_64.h" 135#include "pci_64.h"
136#endif 136#endif
137 137
138void dma32_reserve_bootmem(void);
139
140/* implement the pci_ DMA API in terms of the generic device dma_ one */ 138/* implement the pci_ DMA API in terms of the generic device dma_ one */
141#include <asm-generic/pci-dma-compat.h> 139#include <asm-generic/pci-dma-compat.h>
142 140
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index d475b4398d8b..a0a9779084d1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -509,6 +509,11 @@ do { \
509 * it in software. The address used in the cmpxchg16 instruction must be 509 * it in software. The address used in the cmpxchg16 instruction must be
510 * aligned to a 16 byte boundary. 510 * aligned to a 16 byte boundary.
511 */ 511 */
512#ifdef CONFIG_SMP
513#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
514#else
515#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
516#endif
512#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \ 517#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
513({ \ 518({ \
514 char __ret; \ 519 char __ret; \
@@ -517,7 +522,7 @@ do { \
517 typeof(o2) __o2 = o2; \ 522 typeof(o2) __o2 = o2; \
518 typeof(o2) __n2 = n2; \ 523 typeof(o2) __n2 = n2; \
519 typeof(o2) __dummy; \ 524 typeof(o2) __dummy; \
520 alternative_io("call this_cpu_cmpxchg16b_emu\n\t" P6_NOP4, \ 525 alternative_io(CMPXCHG16B_EMU_CALL, \
521 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \ 526 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
522 X86_FEATURE_CX16, \ 527 X86_FEATURE_CX16, \
523 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \ 528 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
@@ -542,6 +547,33 @@ do { \
542 old__; \ 547 old__; \
543}) 548})
544 549
550static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
551 const unsigned long __percpu *addr)
552{
553 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
554
555 return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
556}
557
558static inline int x86_this_cpu_variable_test_bit(int nr,
559 const unsigned long __percpu *addr)
560{
561 int oldbit;
562
563 asm volatile("bt "__percpu_arg(2)",%1\n\t"
564 "sbb %0,%0"
565 : "=r" (oldbit)
566 : "m" (*(unsigned long *)addr), "Ir" (nr));
567
568 return oldbit;
569}
570
571#define x86_this_cpu_test_bit(nr, addr) \
572 (__builtin_constant_p((nr)) \
573 ? x86_this_cpu_constant_test_bit((nr), (addr)) \
574 : x86_this_cpu_variable_test_bit((nr), (addr)))
575
576
545#include <asm-generic/percpu.h> 577#include <asm-generic/percpu.h>
546 578
547/* We can use this directly for local CPU (faster). */ 579/* We can use this directly for local CPU (faster). */
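Two details in the percpu.h hunks above are worth spelling out. First, the SMP/UP split in CMPXCHG16B_EMU_CALL exists, presumably, because the SMP per-cpu form of cmpxchg16b carries a %gs segment prefix and is one byte longer, so the call in the original instruction stream needs one extra padding NOP for the alternative to patch over it exactly. Second, x86_this_cpu_test_bit() uses __builtin_constant_p() to pick a plain C read when the bit number is a compile-time constant and a bt-based asm variant otherwise. A minimal, self-contained sketch of that dispatch pattern (ordinary memory instead of per-cpu data, all names invented here):

#include <limits.h>

/* sketch: compile-time dispatch between a constant-foldable C path and a
 * run-time path, mirroring the x86_this_cpu_test_bit() macro above */
static inline int const_test_bit_sketch(unsigned int nr, const unsigned long *addr)
{
	/* with a constant nr the compiler folds the word index and mask */
	return (addr[nr / (sizeof(long) * CHAR_BIT)] >>
		(nr % (sizeof(long) * CHAR_BIT))) & 1;
}

static inline int var_test_bit_sketch(unsigned int nr, const unsigned long *addr)
{
	/* stands in for the "bt ...; sbb" asm variant; same result at run time */
	return (addr[nr / (sizeof(long) * CHAR_BIT)] >>
		(nr % (sizeof(long) * CHAR_BIT))) & 1;
}

#define test_bit_sketch(nr, addr)				\
	(__builtin_constant_p(nr)				\
	 ? const_test_bit_sketch((nr), (addr))			\
	 : var_test_bit_sketch((nr), (addr)))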
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000000..4950a0b1d09c
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
1#ifndef _PROBE_ROMS_H_
2#define _PROBE_ROMS_H_
3struct pci_dev;
4
5extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
6extern void pci_unmap_biosrom(void __iomem *rom);
7extern size_t pci_biosrom_size(struct pci_dev *pdev);
8#endif
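The new header only declares the interface. A caller is expected to map the device's option ROM, copy what it needs, and unmap again; the sketch below shows that shape (the function name, buffer handling and error value are invented for illustration):

#include <linux/pci.h>
#include <linux/errno.h>
#include <linux/io.h>
#include <linux/kernel.h>
#include <asm/probe_roms.h>

/* sketch: copy up to buf_len bytes of pdev's BIOS/option ROM into buf */
static ssize_t copy_biosrom_sketch(struct pci_dev *pdev, void *buf, size_t buf_len)
{
	void __iomem *rom = pci_map_biosrom(pdev);
	size_t len;

	if (!rom)
		return -ENODEV;

	len = min(buf_len, pci_biosrom_size(pdev));
	memcpy_fromio(buf, rom, len);
	pci_unmap_biosrom(rom);
	return len;
}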
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index a898a2b6e10c..59ab4dffa377 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -60,6 +60,7 @@
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
63 64
64/* 65/*
65 * x86-64 Task Priority Register, CR8 66 * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 4c25ab48257b..219371546afd 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -754,10 +754,10 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
754extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 754extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
755 755
756extern void select_idle_routine(const struct cpuinfo_x86 *c); 756extern void select_idle_routine(const struct cpuinfo_x86 *c);
757extern void init_c1e_mask(void); 757extern void init_amd_e400_c1e_mask(void);
758 758
759extern unsigned long boot_option_idle_override; 759extern unsigned long boot_option_idle_override;
760extern bool c1e_detected; 760extern bool amd_e400_c1e_detected;
761 761
762enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT, 762enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
763 IDLE_POLL, IDLE_FORCE_MWAIT}; 763 IDLE_POLL, IDLE_FORCE_MWAIT};
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 1babf8adecdf..94e7618fcac8 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,6 +136,7 @@ struct cpuinfo_x86;
136struct task_struct; 136struct task_struct;
137 137
138extern unsigned long profile_pc(struct pt_regs *regs); 138extern unsigned long profile_pc(struct pt_regs *regs);
139#define profile_pc profile_pc
139 140
140extern unsigned long 141extern unsigned long
141convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); 142convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
202#endif 203#endif
203} 204}
204 205
205static inline unsigned long instruction_pointer(struct pt_regs *regs) 206#define GET_IP(regs) ((regs)->ip)
206{ 207#define GET_FP(regs) ((regs)->bp)
207 return regs->ip; 208#define GET_USP(regs) ((regs)->sp)
208}
209
210static inline unsigned long frame_pointer(struct pt_regs *regs)
211{
212 return regs->bp;
213}
214 209
215static inline unsigned long user_stack_pointer(struct pt_regs *regs) 210#include <asm-generic/ptrace.h>
216{
217 return regs->sp;
218}
219 211
220/* Query offset/name of register from its name/offset */ 212/* Query offset/name of register from its name/offset */
221extern int regs_query_register_offset(const char *name); 213extern int regs_query_register_offset(const char *name);
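The open-coded accessors removed here are not lost: <asm-generic/ptrace.h> rebuilds them from the GET_IP/GET_FP/GET_USP macros, so the arch header only has to name the registers. The "#define profile_pc profile_pc" line is the usual way of telling the generic header that an arch-specific profile_pc() already exists, so its instruction_pointer()-based fallback is not installed. Roughly (a paraphrase of the generic header, not a verbatim copy):

/* sketch of what <asm-generic/ptrace.h> derives from the three macros */
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
	return GET_IP(regs);
}

static inline unsigned long frame_pointer(struct pt_regs *regs)
{
	return GET_FP(regs);
}

static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
	return GET_USP(regs);
}

#ifndef profile_pc			/* only if the arch did not provide one */
#define profile_pc(regs) instruction_pointer(regs)
#endif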
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index db8aa19a08a2..9756551ec760 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -88,7 +88,7 @@ void *extend_brk(size_t size, size_t align);
88 * executable.) 88 * executable.)
89 */ 89 */
90#define RESERVE_BRK(name,sz) \ 90#define RESERVE_BRK(name,sz) \
91 static void __section(.discard.text) __used \ 91 static void __section(.discard.text) __used notrace \
92 __brk_reservation_fn_##name##__(void) { \ 92 __brk_reservation_fn_##name##__(void) { \
93 asm volatile ( \ 93 asm volatile ( \
94 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 94 ".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -104,10 +104,10 @@ void *extend_brk(size_t size, size_t align);
104 type *name; \ 104 type *name; \
105 RESERVE_BRK(name, sizeof(type) * entries) 105 RESERVE_BRK(name, sizeof(type) * entries)
106 106
107extern void probe_roms(void);
107#ifdef __i386__ 108#ifdef __i386__
108 109
109void __init i386_start_kernel(void); 110void __init i386_start_kernel(void);
110extern void probe_roms(void);
111 111
112#else 112#else
113void __init x86_64_start_kernel(char *real_mode); 113void __init x86_64_start_kernel(char *real_mode);
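RESERVE_BRK() works by emitting a never-called dummy function whose asm body grows the .brk_reservation section by sz bytes; the added notrace keeps ftrace from trying to instrument that dummy. Use is a single file-scope line, with the reserved memory later handed out by extend_brk(); the names and sizes below are illustrative, not taken from the tree:

#include <linux/init.h>
#include <linux/string.h>
#include <asm/setup.h>

/* sketch: reserve 64 KiB of early brk memory at file scope ... */
RESERVE_BRK(early_pool_sketch, 64 * 1024);

static void __init use_early_pool_sketch(void)
{
	/* ... and carve an aligned 4 KiB chunk out of the brk area later */
	void *p = extend_brk(4096, 4096);

	memset(p, 0, 4096);
}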
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a7..000000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index d7e89c83645d..70bbe39043a9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -37,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo,
37/* Generic stack tracer with callbacks */ 37/* Generic stack tracer with callbacks */
38 38
39struct stacktrace_ops { 39struct stacktrace_ops {
40 void (*warning)(void *data, char *msg);
41 /* msg must contain %s for the symbol */
42 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
43 void (*address)(void *data, unsigned long address, int reliable); 40 void (*address)(void *data, unsigned long address, int reliable);
44 /* On negative return stop dumping */ 41 /* On negative return stop dumping */
45 int (*stack)(void *data, char *name); 42 int (*stack)(void *data, char *name);
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fd921c3a6841..487055c8c1aa 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,8 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void) { return 0; }
13
14/* image of the saved processor state */ 12/* image of the saved processor state */
15struct saved_context { 13struct saved_context {
16 u16 es, fs, gs, ss; 14 u16 es, fs, gs, ss;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 8d942afae681..09b0bf104156 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,11 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void)
13{
14 return 0;
15}
16
17/* 12/*
18 * Image of the saved processor state, used by the low level ACPI suspend to 13 * Image of the saved processor state, used by the low level ACPI suspend to
19 * RAM code and by the low level hibernation code. 14 * RAM code and by the low level hibernation code.
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 12569e691ce3..c2ff2a1d845e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -303,24 +303,81 @@ static inline void native_wbinvd(void)
303#ifdef CONFIG_PARAVIRT 303#ifdef CONFIG_PARAVIRT
304#include <asm/paravirt.h> 304#include <asm/paravirt.h>
305#else 305#else
306#define read_cr0() (native_read_cr0()) 306
307#define write_cr0(x) (native_write_cr0(x)) 307static inline unsigned long read_cr0(void)
308#define read_cr2() (native_read_cr2()) 308{
309#define write_cr2(x) (native_write_cr2(x)) 309 return native_read_cr0();
310#define read_cr3() (native_read_cr3()) 310}
311#define write_cr3(x) (native_write_cr3(x)) 311
312#define read_cr4() (native_read_cr4()) 312static inline void write_cr0(unsigned long x)
313#define read_cr4_safe() (native_read_cr4_safe()) 313{
314#define write_cr4(x) (native_write_cr4(x)) 314 native_write_cr0(x);
315#define wbinvd() (native_wbinvd()) 315}
316
317static inline unsigned long read_cr2(void)
318{
319 return native_read_cr2();
320}
321
322static inline void write_cr2(unsigned long x)
323{
324 native_write_cr2(x);
325}
326
327static inline unsigned long read_cr3(void)
328{
329 return native_read_cr3();
330}
331
332static inline void write_cr3(unsigned long x)
333{
334 native_write_cr3(x);
335}
336
337static inline unsigned long read_cr4(void)
338{
339 return native_read_cr4();
340}
341
342static inline unsigned long read_cr4_safe(void)
343{
344 return native_read_cr4_safe();
345}
346
347static inline void write_cr4(unsigned long x)
348{
349 native_write_cr4(x);
350}
351
352static inline void wbinvd(void)
353{
354 native_wbinvd();
355}
356
316#ifdef CONFIG_X86_64 357#ifdef CONFIG_X86_64
317#define read_cr8() (native_read_cr8()) 358
318#define write_cr8(x) (native_write_cr8(x)) 359static inline unsigned long read_cr8(void)
319#define load_gs_index native_load_gs_index 360{
361 return native_read_cr8();
362}
363
364static inline void write_cr8(unsigned long x)
365{
366 native_write_cr8(x);
367}
368
369static inline void load_gs_index(unsigned selector)
370{
371 native_load_gs_index(selector);
372}
373
320#endif 374#endif
321 375
322/* Clear the 'TS' bit */ 376/* Clear the 'TS' bit */
323#define clts() (native_clts()) 377static inline void clts(void)
378{
379 native_clts();
380}
324 381
325#endif/* CONFIG_PARAVIRT */ 382#endif/* CONFIG_PARAVIRT */
326 383
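Turning these wrappers into static inline functions gives real type checking and single evaluation of arguments while keeping the names interchangeable with the paravirt versions that CONFIG_PARAVIRT pulls in from asm/paravirt.h. Combined with the X86_CR4_SMEP bit added above, a control-register update then reads naturally; a sketch, assuming an X86_FEATURE_SMEP cpufeature flag is available (the function itself is invented here):

#include <asm/processor-flags.h>
#include <asm/cpufeature.h>
#include <asm/system.h>

/* sketch: enable supervisor-mode execution protection when the CPU has it */
static void enable_smep_sketch(void)
{
	if (boot_cpu_has(X86_FEATURE_SMEP))
		write_cr4(read_cr4() | X86_CR4_SMEP);
}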
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 910a7084f7f2..c00692476e9f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -93,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
93#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
94 94
95#ifdef CONFIG_X86_32 95#ifdef CONFIG_X86_32
96extern unsigned long node_start_pfn[];
97extern unsigned long node_end_pfn[];
98extern unsigned long node_remap_size[];
99#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
100
101# define SD_CACHE_NICE_TRIES 1 96# define SD_CACHE_NICE_TRIES 1
102# define SD_IDLE_IDX 1 97# define SD_IDLE_IDX 1
103
104#else 98#else
105
106# define SD_CACHE_NICE_TRIES 2 99# define SD_CACHE_NICE_TRIES 2
107# define SD_IDLE_IDX 2 100# define SD_IDLE_IDX 2
108
109#endif 101#endif
110 102
111/* sched_domains SD_NODE_INIT for NUMA machines */ 103/* sched_domains SD_NODE_INIT for NUMA machines */
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 83e2efd181e2..9db5583b6d38 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54#ifdef CONFIG_X86_64
55extern cycles_t vread_tsc(void);
56#endif
57
54/* 58/*
55 * Boot-time check whether the TSCs are synchronized across 59 * Boot-time check whether the TSCs are synchronized across
56 * all CPUs/cores: 60 * all CPUs/cores:
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762a..99ddd148a760 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,7 +6,6 @@
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
@@ -42,7 +41,7 @@
42 * Returns 0 if the range is valid, nonzero otherwise. 41 * Returns 0 if the range is valid, nonzero otherwise.
43 * 42 *
44 * This is equivalent to the following test: 43 * This is equivalent to the following test:
45 * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) 44 * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
46 * 45 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... 46 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */ 47 */
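The corrected comment describes the overflow-safe range check behind access_ok(): the sum addr + size is conceptually computed one bit wider than the machine word, so a carry out of the add, not a silently wrapped result, marks the range as bad. The same idea in plain C (the kernel's actual version uses an add/sbb asm sequence; the names here are invented):

/* sketch: non-zero if [addr, addr + size) does not fit below limit */
static inline int range_not_ok_sketch(unsigned long addr, unsigned long size,
				      unsigned long limit)
{
	unsigned long end = addr + size;

	if (end < addr)		/* the 33rd/65th bit carried out: wrapped */
		return 1;
	return end > limit;	/* '>' rather than '>=', as the comment now says */
}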
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 088d09fb1615..566e803cc602 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 316708d5af92..1c66d30971ad 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 9#include <linux/lockdep.h>
11#include <asm/alternative.h> 10#include <asm/alternative.h>
12#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index a755ef5e5977..593485b38ab3 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -350,10 +350,12 @@
350#define __NR_open_by_handle_at 342 350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343 351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344 352#define __NR_syncfs 344
353#define __NR_sendmmsg 345
354#define __NR_setns 346
353 355
354#ifdef __KERNEL__ 356#ifdef __KERNEL__
355 357
356#define NR_syscalls 345 358#define NR_syscalls 347
357 359
358#define __ARCH_WANT_IPC_PARSE_VERSION 360#define __ARCH_WANT_IPC_PARSE_VERSION
359#define __ARCH_WANT_OLD_READDIR 361#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 160fa76bd578..705bf139288c 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -677,6 +677,10 @@ __SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime) 677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306 678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs) 679__SYSCALL(__NR_syncfs, sys_syncfs)
680#define __NR_sendmmsg 307
681__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
682#define __NR_setns 308
683__SYSCALL(__NR_setns, sys_setns)
680 684
681#ifndef __NO_STUBS 685#ifndef __NO_STUBS
682#define __ARCH_WANT_OLD_READDIR 686#define __ARCH_WANT_OLD_READDIR
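Until the C library grows wrappers, the new system calls are reachable through syscall(2) with the numbers assigned above (345/346 on i386, 307/308 on x86-64). A hedged userspace sketch for sendmmsg; whether struct mmsghdr is already declared depends on the libc version, and the helper name is invented:

#define _GNU_SOURCE
#include <sys/socket.h>
#include <sys/syscall.h>
#include <unistd.h>

#ifndef __NR_sendmmsg
#define __NR_sendmmsg 307		/* x86-64 number from the hunk above */
#endif

/* sketch: send vlen datagrams on fd with a single kernel entry */
static int sendmmsg_raw_sketch(int fd, struct mmsghdr *vec, unsigned int vlen,
			       unsigned int flags)
{
	return syscall(__NR_sendmmsg, fd, vec, vlen, flags);
}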
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 130f1eeee5fe..a291c40efd43 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV Broadcast Assist Unit definitions 6 * SGI UV Broadcast Assist Unit definitions
7 * 7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_BAU_H 11#ifndef _ASM_X86_UV_UV_BAU_H
@@ -35,17 +35,20 @@
35 35
36#define MAX_CPUS_PER_UVHUB 64 36#define MAX_CPUS_PER_UVHUB 64
37#define MAX_CPUS_PER_SOCKET 32 37#define MAX_CPUS_PER_SOCKET 32
38#define UV_ADP_SIZE 64 /* hardware-provided max. */ 38#define ADP_SZ 64 /* hardware-provided max. */
39#define UV_CPUS_PER_ACT_STATUS 32 /* hardware-provided max. */ 39#define UV_CPUS_PER_AS 32 /* hardware-provided max. */
40#define UV_ITEMS_PER_DESCRIPTOR 8 40#define ITEMS_PER_DESC 8
41/* the 'throttle' to prevent the hardware stay-busy bug */ 41/* the 'throttle' to prevent the hardware stay-busy bug */
42#define MAX_BAU_CONCURRENT 3 42#define MAX_BAU_CONCURRENT 3
43#define UV_ACT_STATUS_MASK 0x3 43#define UV_ACT_STATUS_MASK 0x3
44#define UV_ACT_STATUS_SIZE 2 44#define UV_ACT_STATUS_SIZE 2
45#define UV_DISTRIBUTION_SIZE 256 45#define UV_DISTRIBUTION_SIZE 256
46#define UV_SW_ACK_NPENDING 8 46#define UV_SW_ACK_NPENDING 8
47#define UV_NET_ENDPOINT_INTD 0x38 47#define UV1_NET_ENDPOINT_INTD 0x38
48#define UV_DESC_BASE_PNODE_SHIFT 49 48#define UV2_NET_ENDPOINT_INTD 0x28
49#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
50 UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
51#define UV_DESC_PSHIFT 49
49#define UV_PAYLOADQ_PNODE_SHIFT 49 52#define UV_PAYLOADQ_PNODE_SHIFT 49
50#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" 53#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
51#define UV_BAU_BASENAME "sgi_uv/bau_tunables" 54#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
@@ -53,29 +56,64 @@
53#define UV_BAU_TUNABLES_FILE "bau_tunables" 56#define UV_BAU_TUNABLES_FILE "bau_tunables"
54#define WHITESPACE " \t\n" 57#define WHITESPACE " \t\n"
55#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 58#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
56#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 59#define cpubit_isset(cpu, bau_local_cpumask) \
57#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 60 test_bit((cpu), (bau_local_cpumask).bits)
58#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL 61
59/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ 62/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
60#define BAU_MISC_CONTROL_MULT_MASK 3 63/*
64 * UV2: Bit 19 selects between
65 * (0): 10 microsecond timebase and
66 * (1): 80 microseconds
67 * we're using 655us, similar to UV1: 65 units of 10us
68 */
69#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
70#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL)
71
72#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
73 UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
74 UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
61 75
62#define UVH_AGING_PRESCALE_SEL 0x000000b000UL 76#define BAU_MISC_CONTROL_MULT_MASK 3
77
78#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
63/* [30:28] URGENCY_7 an index into a table of times */ 79/* [30:28] URGENCY_7 an index into a table of times */
64#define BAU_URGENCY_7_SHIFT 28 80#define BAU_URGENCY_7_SHIFT 28
65#define BAU_URGENCY_7_MASK 7 81#define BAU_URGENCY_7_MASK 7
66 82
67#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL 83#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
68/* [45:40] BAU - BAU transaction timeout select - a multiplier */ 84/* [45:40] BAU - BAU transaction timeout select - a multiplier */
69#define BAU_TRANS_SHIFT 40 85#define BAU_TRANS_SHIFT 40
70#define BAU_TRANS_MASK 0x3f 86#define BAU_TRANS_MASK 0x3f
87
88/*
89 * shorten some awkward names
90 */
91#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
92#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
93#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
94#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
95#define write_gmmr uv_write_global_mmr64
96#define write_lmmr uv_write_local_mmr
97#define read_lmmr uv_read_local_mmr
98#define read_gmmr uv_read_global_mmr64
71 99
72/* 100/*
73 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 101 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
74 */ 102 */
75#define DESC_STATUS_IDLE 0 103#define DS_IDLE 0
76#define DESC_STATUS_ACTIVE 1 104#define DS_ACTIVE 1
77#define DESC_STATUS_DESTINATION_TIMEOUT 2 105#define DS_DESTINATION_TIMEOUT 2
78#define DESC_STATUS_SOURCE_TIMEOUT 3 106#define DS_SOURCE_TIMEOUT 3
107/*
108 * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
109 * values 1 and 5 will not occur
110 */
111#define UV2H_DESC_IDLE 0
112#define UV2H_DESC_DEST_TIMEOUT 2
113#define UV2H_DESC_DEST_STRONG_NACK 3
114#define UV2H_DESC_BUSY 4
115#define UV2H_DESC_SOURCE_TIMEOUT 6
116#define UV2H_DESC_DEST_PUT_ERR 7
79 117
80/* 118/*
81 * delay for 'plugged' timeout retries, in microseconds 119 * delay for 'plugged' timeout retries, in microseconds
@@ -86,15 +124,24 @@
86 * thresholds at which to use IPI to free resources 124 * thresholds at which to use IPI to free resources
87 */ 125 */
88/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ 126/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
89#define PLUGSB4RESET 100 127#define PLUGSB4RESET 100
90/* after this many consecutive timeouts, use IPI to release resources */ 128/* after this many consecutive timeouts, use IPI to release resources */
91#define TIMEOUTSB4RESET 1 129#define TIMEOUTSB4RESET 1
92/* at this number of uses of IPI to release resources, give up the request */ 130/* at this number of uses of IPI to release resources, give up the request */
93#define IPI_RESET_LIMIT 1 131#define IPI_RESET_LIMIT 1
94/* after this # consecutive successes, bump up the throttle if it was lowered */ 132/* after this # consecutive successes, bump up the throttle if it was lowered */
95#define COMPLETE_THRESHOLD 5 133#define COMPLETE_THRESHOLD 5
134
135#define UV_LB_SUBNODEID 0x10
96 136
97#define UV_LB_SUBNODEID 0x10 137/* these two are the same for UV1 and UV2: */
138#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
139#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
140/* 4 bits of software ack period */
141#define UV2_ACK_MASK 0x7UL
142#define UV2_ACK_UNITS_SHFT 3
143#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
144#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
98 145
99/* 146/*
100 * number of entries in the destination side payload queue 147 * number of entries in the destination side payload queue
@@ -115,9 +162,16 @@
115/* 162/*
116 * tuning the action when the numalink network is extremely delayed 163 * tuning the action when the numalink network is extremely delayed
117 */ 164 */
118#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ 165#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in
119#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ 166 microseconds */
120#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ 167#define CONGESTED_REPS 10 /* long delays averaged over
168 this many broadcasts */
169#define CONGESTED_PERIOD 30 /* time for the bau to be
170 disabled, in seconds */
171/* see msg_type: */
172#define MSG_NOOP 0
173#define MSG_REGULAR 1
174#define MSG_RETRY 2
121 175
122/* 176/*
123 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) 177 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -129,8 +183,8 @@
129 * 'base_dest_nasid' field of the header corresponds to the 183 * 'base_dest_nasid' field of the header corresponds to the
130 * destination nodeID associated with that specified bit. 184 * destination nodeID associated with that specified bit.
131 */ 185 */
132struct bau_target_uvhubmask { 186struct bau_targ_hubmask {
133 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; 187 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
134}; 188};
135 189
136/* 190/*
@@ -139,7 +193,7 @@ struct bau_target_uvhubmask {
139 * enough bits for max. cpu's per uvhub) 193 * enough bits for max. cpu's per uvhub)
140 */ 194 */
141struct bau_local_cpumask { 195struct bau_local_cpumask {
142 unsigned long bits; 196 unsigned long bits;
143}; 197};
144 198
145/* 199/*
@@ -160,14 +214,14 @@ struct bau_local_cpumask {
160 * The payload is software-defined for INTD transactions 214 * The payload is software-defined for INTD transactions
161 */ 215 */
162struct bau_msg_payload { 216struct bau_msg_payload {
163 unsigned long address; /* signifies a page or all TLB's 217 unsigned long address; /* signifies a page or all
164 of the cpu */ 218 TLB's of the cpu */
165 /* 64 bits */ 219 /* 64 bits */
166 unsigned short sending_cpu; /* filled in by sender */ 220 unsigned short sending_cpu; /* filled in by sender */
167 /* 16 bits */ 221 /* 16 bits */
168 unsigned short acknowledge_count;/* filled in by destination */ 222 unsigned short acknowledge_count; /* filled in by destination */
169 /* 16 bits */ 223 /* 16 bits */
170 unsigned int reserved1:32; /* not usable */ 224 unsigned int reserved1:32; /* not usable */
171}; 225};
172 226
173 227
@@ -176,93 +230,96 @@ struct bau_msg_payload {
176 * see table 4.2.3.0.1 in broadcast_assist spec. 230 * see table 4.2.3.0.1 in broadcast_assist spec.
177 */ 231 */
178struct bau_msg_header { 232struct bau_msg_header {
179 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 233 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
180 /* bits 5:0 */ 234 /* bits 5:0 */
181 unsigned int base_dest_nasid:15; /* nasid of the */ 235 unsigned int base_dest_nasid:15; /* nasid of the first bit */
182 /* bits 20:6 */ /* first bit in uvhub map */ 236 /* bits 20:6 */ /* in uvhub map */
183 unsigned int command:8; /* message type */ 237 unsigned int command:8; /* message type */
184 /* bits 28:21 */ 238 /* bits 28:21 */
185 /* 0x38: SN3net EndPoint Message */ 239 /* 0x38: SN3net EndPoint Message */
186 unsigned int rsvd_1:3; /* must be zero */ 240 unsigned int rsvd_1:3; /* must be zero */
187 /* bits 31:29 */ 241 /* bits 31:29 */
188 /* int will align on 32 bits */ 242 /* int will align on 32 bits */
189 unsigned int rsvd_2:9; /* must be zero */ 243 unsigned int rsvd_2:9; /* must be zero */
190 /* bits 40:32 */ 244 /* bits 40:32 */
191 /* Suppl_A is 56-41 */ 245 /* Suppl_A is 56-41 */
192 unsigned int sequence:16;/* message sequence number */ 246 unsigned int sequence:16; /* message sequence number */
193 /* bits 56:41 */ /* becomes bytes 16-17 of msg */ 247 /* bits 56:41 */ /* becomes bytes 16-17 of msg */
194 /* Address field (96:57) is never used as an 248 /* Address field (96:57) is
195 address (these are address bits 42:3) */ 249 never used as an address
196 250 (these are address bits
197 unsigned int rsvd_3:1; /* must be zero */ 251 42:3) */
252
253 unsigned int rsvd_3:1; /* must be zero */
198 /* bit 57 */ 254 /* bit 57 */
199 /* address bits 27:4 are payload */ 255 /* address bits 27:4 are payload */
200 /* these next 24 (58-81) bits become bytes 12-14 of msg */ 256 /* these next 24 (58-81) bits become bytes 12-14 of msg */
201
202 /* bits 65:58 land in byte 12 */ 257 /* bits 65:58 land in byte 12 */
203 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ 258 unsigned int replied_to:1; /* sent as 0 by the source to
259 byte 12 */
204 /* bit 58 */ 260 /* bit 58 */
205 unsigned int msg_type:3; /* software type of the message*/ 261 unsigned int msg_type:3; /* software type of the
262 message */
206 /* bits 61:59 */ 263 /* bits 61:59 */
207 unsigned int canceled:1; /* message canceled, resource to be freed*/ 264 unsigned int canceled:1; /* message canceled, resource
265 is to be freed*/
208 /* bit 62 */ 266 /* bit 62 */
209 unsigned int payload_1a:1;/* not currently used */ 267 unsigned int payload_1a:1; /* not currently used */
210 /* bit 63 */ 268 /* bit 63 */
211 unsigned int payload_1b:2;/* not currently used */ 269 unsigned int payload_1b:2; /* not currently used */
212 /* bits 65:64 */ 270 /* bits 65:64 */
213 271
214 /* bits 73:66 land in byte 13 */ 272 /* bits 73:66 land in byte 13 */
215 unsigned int payload_1ca:6;/* not currently used */ 273 unsigned int payload_1ca:6; /* not currently used */
216 /* bits 71:66 */ 274 /* bits 71:66 */
217 unsigned int payload_1c:2;/* not currently used */ 275 unsigned int payload_1c:2; /* not currently used */
218 /* bits 73:72 */ 276 /* bits 73:72 */
219 277
220 /* bits 81:74 land in byte 14 */ 278 /* bits 81:74 land in byte 14 */
221 unsigned int payload_1d:6;/* not currently used */ 279 unsigned int payload_1d:6; /* not currently used */
222 /* bits 79:74 */ 280 /* bits 79:74 */
223 unsigned int payload_1e:2;/* not currently used */ 281 unsigned int payload_1e:2; /* not currently used */
224 /* bits 81:80 */ 282 /* bits 81:80 */
225 283
226 unsigned int rsvd_4:7; /* must be zero */ 284 unsigned int rsvd_4:7; /* must be zero */
227 /* bits 88:82 */ 285 /* bits 88:82 */
228 unsigned int sw_ack_flag:1;/* software acknowledge flag */ 286 unsigned int swack_flag:1; /* software acknowledge flag */
229 /* bit 89 */ 287 /* bit 89 */
230 /* INTD transactions at destination are to 288 /* INTD transactions at
231 wait for software acknowledge */ 289 destination are to wait for
232 unsigned int rsvd_5:6; /* must be zero */ 290 software acknowledge */
291 unsigned int rsvd_5:6; /* must be zero */
233 /* bits 95:90 */ 292 /* bits 95:90 */
234 unsigned int rsvd_6:5; /* must be zero */ 293 unsigned int rsvd_6:5; /* must be zero */
235 /* bits 100:96 */ 294 /* bits 100:96 */
236 unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */ 295 unsigned int int_both:1; /* if 1, interrupt both sockets
296 on the uvhub */
237 /* bit 101*/ 297 /* bit 101*/
238 unsigned int fairness:3;/* usually zero */ 298 unsigned int fairness:3; /* usually zero */
239 /* bits 104:102 */ 299 /* bits 104:102 */
240 unsigned int multilevel:1; /* multi-level multicast format */ 300 unsigned int multilevel:1; /* multi-level multicast
301 format */
241 /* bit 105 */ 302 /* bit 105 */
242 /* 0 for TLB: endpoint multi-unicast messages */ 303 /* 0 for TLB: endpoint multi-unicast messages */
243 unsigned int chaining:1;/* next descriptor is part of this activation*/ 304 unsigned int chaining:1; /* next descriptor is part of
305 this activation*/
244 /* bit 106 */ 306 /* bit 106 */
245 unsigned int rsvd_7:21; /* must be zero */ 307 unsigned int rsvd_7:21; /* must be zero */
246 /* bits 127:107 */ 308 /* bits 127:107 */
247}; 309};
248 310
249/* see msg_type: */
250#define MSG_NOOP 0
251#define MSG_REGULAR 1
252#define MSG_RETRY 2
253
254/* 311/*
255 * The activation descriptor: 312 * The activation descriptor:
256 * The format of the message to send, plus all accompanying control 313 * The format of the message to send, plus all accompanying control
257 * Should be 64 bytes 314 * Should be 64 bytes
258 */ 315 */
259struct bau_desc { 316struct bau_desc {
260 struct bau_target_uvhubmask distribution; 317 struct bau_targ_hubmask distribution;
261 /* 318 /*
262 * message template, consisting of header and payload: 319 * message template, consisting of header and payload:
263 */ 320 */
264 struct bau_msg_header header; 321 struct bau_msg_header header;
265 struct bau_msg_payload payload; 322 struct bau_msg_payload payload;
266}; 323};
267/* 324/*
268 * -payload-- ---------header------ 325 * -payload-- ---------header------
@@ -281,59 +338,51 @@ struct bau_desc {
281 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 338 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
282 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) 339 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
283 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from 340 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
284 * sw_ack_vector and payload_2) 341 * swack_vec and payload_2)
285 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software 342 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
286 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload 343 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
287 * operation." 344 * operation."
288 */ 345 */
289struct bau_payload_queue_entry { 346struct bau_pq_entry {
290 unsigned long address; /* signifies a page or all TLB's 347 unsigned long address; /* signifies a page or all TLB's
291 of the cpu */ 348 of the cpu */
292 /* 64 bits, bytes 0-7 */ 349 /* 64 bits, bytes 0-7 */
293 350 unsigned short sending_cpu; /* cpu that sent the message */
294 unsigned short sending_cpu; /* cpu that sent the message */
295 /* 16 bits, bytes 8-9 */ 351 /* 16 bits, bytes 8-9 */
296 352 unsigned short acknowledge_count; /* filled in by destination */
297 unsigned short acknowledge_count; /* filled in by destination */
298 /* 16 bits, bytes 10-11 */ 353 /* 16 bits, bytes 10-11 */
299
300 /* these next 3 bytes come from bits 58-81 of the message header */ 354 /* these next 3 bytes come from bits 58-81 of the message header */
301 unsigned short replied_to:1; /* sent as 0 by the source */ 355 unsigned short replied_to:1; /* sent as 0 by the source */
302 unsigned short msg_type:3; /* software message type */ 356 unsigned short msg_type:3; /* software message type */
303 unsigned short canceled:1; /* sent as 0 by the source */ 357 unsigned short canceled:1; /* sent as 0 by the source */
304 unsigned short unused1:3; /* not currently using */ 358 unsigned short unused1:3; /* not currently using */
305 /* byte 12 */ 359 /* byte 12 */
306 360 unsigned char unused2a; /* not currently using */
307 unsigned char unused2a; /* not currently using */
308 /* byte 13 */ 361 /* byte 13 */
309 unsigned char unused2; /* not currently using */ 362 unsigned char unused2; /* not currently using */
310 /* byte 14 */ 363 /* byte 14 */
311 364 unsigned char swack_vec; /* filled in by the hardware */
312 unsigned char sw_ack_vector; /* filled in by the hardware */
313 /* byte 15 (bits 127:120) */ 365 /* byte 15 (bits 127:120) */
314 366 unsigned short sequence; /* message sequence number */
315 unsigned short sequence; /* message sequence number */
316 /* bytes 16-17 */ 367 /* bytes 16-17 */
317 unsigned char unused4[2]; /* not currently using bytes 18-19 */ 368 unsigned char unused4[2]; /* not currently using bytes 18-19 */
318 /* bytes 18-19 */ 369 /* bytes 18-19 */
319 370 int number_of_cpus; /* filled in at destination */
320 int number_of_cpus; /* filled in at destination */
321 /* 32 bits, bytes 20-23 (aligned) */ 371 /* 32 bits, bytes 20-23 (aligned) */
322 372 unsigned char unused5[8]; /* not using */
323 unsigned char unused5[8]; /* not using */
324 /* bytes 24-31 */ 373 /* bytes 24-31 */
325}; 374};
326 375
327struct msg_desc { 376struct msg_desc {
328 struct bau_payload_queue_entry *msg; 377 struct bau_pq_entry *msg;
329 int msg_slot; 378 int msg_slot;
330 int sw_ack_slot; 379 int swack_slot;
331 struct bau_payload_queue_entry *va_queue_first; 380 struct bau_pq_entry *queue_first;
332 struct bau_payload_queue_entry *va_queue_last; 381 struct bau_pq_entry *queue_last;
333}; 382};
334 383
335struct reset_args { 384struct reset_args {
336 int sender; 385 int sender;
337}; 386};
338 387
339/* 388/*
@@ -341,112 +390,226 @@ struct reset_args {
341 */ 390 */
342struct ptc_stats { 391struct ptc_stats {
343 /* sender statistics */ 392 /* sender statistics */
344 unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ 393 unsigned long s_giveup; /* number of fall backs to
345 unsigned long s_requestor; /* number of shootdown requests */ 394 IPI-style flushes */
346 unsigned long s_stimeout; /* source side timeouts */ 395 unsigned long s_requestor; /* number of shootdown
347 unsigned long s_dtimeout; /* destination side timeouts */ 396 requests */
348 unsigned long s_time; /* time spent in sending side */ 397 unsigned long s_stimeout; /* source side timeouts */
349 unsigned long s_retriesok; /* successful retries */ 398 unsigned long s_dtimeout; /* destination side timeouts */
350 unsigned long s_ntargcpu; /* total number of cpu's targeted */ 399 unsigned long s_time; /* time spent in sending side */
351 unsigned long s_ntargself; /* times the sending cpu was targeted */ 400 unsigned long s_retriesok; /* successful retries */
352 unsigned long s_ntarglocals; /* targets of cpus on the local blade */ 401 unsigned long s_ntargcpu; /* total number of cpu's
353 unsigned long s_ntargremotes; /* targets of cpus on remote blades */ 402 targeted */
354 unsigned long s_ntarglocaluvhub; /* targets of the local hub */ 403 unsigned long s_ntargself; /* times the sending cpu was
355 unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ 404 targeted */
356 unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ 405 unsigned long s_ntarglocals; /* targets of cpus on the local
357 unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ 406 blade */
358 unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ 407 unsigned long s_ntargremotes; /* targets of cpus on remote
359 unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ 408 blades */
360 unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ 409 unsigned long s_ntarglocaluvhub; /* targets of the local hub */
361 unsigned long s_ntarguvhub1; /* number of times target hubs == 1 */ 410 unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */
362 unsigned long s_resets_plug; /* ipi-style resets from plug state */ 411 unsigned long s_ntarguvhub; /* total number of uvhubs
363 unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ 412 targeted */
364 unsigned long s_busy; /* status stayed busy past s/w timer */ 413 unsigned long s_ntarguvhub16; /* number of times target
365 unsigned long s_throttles; /* waits in throttle */ 414 hubs >= 16*/
366 unsigned long s_retry_messages; /* retry broadcasts */ 415 unsigned long s_ntarguvhub8; /* number of times target
367 unsigned long s_bau_reenabled; /* for bau enable/disable */ 416 hubs >= 8 */
368 unsigned long s_bau_disabled; /* for bau enable/disable */ 417 unsigned long s_ntarguvhub4; /* number of times target
418 hubs >= 4 */
419 unsigned long s_ntarguvhub2; /* number of times target
420 hubs >= 2 */
421 unsigned long s_ntarguvhub1; /* number of times target
422 hubs == 1 */
423 unsigned long s_resets_plug; /* ipi-style resets from plug
424 state */
425 unsigned long s_resets_timeout; /* ipi-style resets from
426 timeouts */
427 unsigned long s_busy; /* status stayed busy past
428 s/w timer */
429 unsigned long s_throttles; /* waits in throttle */
430 unsigned long s_retry_messages; /* retry broadcasts */
431 unsigned long s_bau_reenabled; /* for bau enable/disable */
432 unsigned long s_bau_disabled; /* for bau enable/disable */
369 /* destination statistics */ 433 /* destination statistics */
370 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 434 unsigned long d_alltlb; /* times all tlb's on this
371 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ 435 cpu were flushed */
372 unsigned long d_multmsg; /* interrupts with multiple messages */ 436 unsigned long d_onetlb; /* times just one tlb on this
373 unsigned long d_nomsg; /* interrupts with no message */ 437 cpu was flushed */
374 unsigned long d_time; /* time spent on destination side */ 438 unsigned long d_multmsg; /* interrupts with multiple
375 unsigned long d_requestee; /* number of messages processed */ 439 messages */
376 unsigned long d_retries; /* number of retry messages processed */ 440 unsigned long d_nomsg; /* interrupts with no message */
377 unsigned long d_canceled; /* number of messages canceled by retries */ 441 unsigned long d_time; /* time spent on destination
378 unsigned long d_nocanceled; /* retries that found nothing to cancel */ 442 side */
379 unsigned long d_resets; /* number of ipi-style requests processed */ 443 unsigned long d_requestee; /* number of messages
380 unsigned long d_rcanceled; /* number of messages canceled by resets */ 444 processed */
445 unsigned long d_retries; /* number of retry messages
446 processed */
447 unsigned long d_canceled; /* number of messages canceled
448 by retries */
449 unsigned long d_nocanceled; /* retries that found nothing
450 to cancel */
451 unsigned long d_resets; /* number of ipi-style requests
452 processed */
453 unsigned long d_rcanceled; /* number of messages canceled
454 by resets */
455};
456
457struct tunables {
458 int *tunp;
459 int deflt;
381}; 460};
382 461
383struct hub_and_pnode { 462struct hub_and_pnode {
384 short uvhub; 463 short uvhub;
385 short pnode; 464 short pnode;
386}; 465};
466
467struct socket_desc {
468 short num_cpus;
469 short cpu_number[MAX_CPUS_PER_SOCKET];
470};
471
472struct uvhub_desc {
473 unsigned short socket_mask;
474 short num_cpus;
475 short uvhub;
476 short pnode;
477 struct socket_desc socket[2];
478};
479
387/* 480/*
388 * one per-cpu; to locate the software tables 481 * one per-cpu; to locate the software tables
389 */ 482 */
390struct bau_control { 483struct bau_control {
391 struct bau_desc *descriptor_base; 484 struct bau_desc *descriptor_base;
392 struct bau_payload_queue_entry *va_queue_first; 485 struct bau_pq_entry *queue_first;
393 struct bau_payload_queue_entry *va_queue_last; 486 struct bau_pq_entry *queue_last;
394 struct bau_payload_queue_entry *bau_msg_head; 487 struct bau_pq_entry *bau_msg_head;
395 struct bau_control *uvhub_master; 488 struct bau_control *uvhub_master;
396 struct bau_control *socket_master; 489 struct bau_control *socket_master;
397 struct ptc_stats *statp; 490 struct ptc_stats *statp;
398 unsigned long timeout_interval; 491 unsigned long timeout_interval;
399 unsigned long set_bau_on_time; 492 unsigned long set_bau_on_time;
400 atomic_t active_descriptor_count; 493 atomic_t active_descriptor_count;
401 int plugged_tries; 494 int plugged_tries;
402 int timeout_tries; 495 int timeout_tries;
403 int ipi_attempts; 496 int ipi_attempts;
404 int conseccompletes; 497 int conseccompletes;
405 int baudisabled; 498 int baudisabled;
406 int set_bau_off; 499 int set_bau_off;
407 short cpu; 500 short cpu;
408 short osnode; 501 short osnode;
409 short uvhub_cpu; 502 short uvhub_cpu;
410 short uvhub; 503 short uvhub;
411 short cpus_in_socket; 504 short cpus_in_socket;
412 short cpus_in_uvhub; 505 short cpus_in_uvhub;
413 short partition_base_pnode; 506 short partition_base_pnode;
414 unsigned short message_number; 507 unsigned short message_number;
415 unsigned short uvhub_quiesce; 508 unsigned short uvhub_quiesce;
416 short socket_acknowledge_count[DEST_Q_SIZE]; 509 short socket_acknowledge_count[DEST_Q_SIZE];
417 cycles_t send_message; 510 cycles_t send_message;
418 spinlock_t uvhub_lock; 511 spinlock_t uvhub_lock;
419 spinlock_t queue_lock; 512 spinlock_t queue_lock;
420 /* tunables */ 513 /* tunables */
421 int max_bau_concurrent; 514 int max_concurr;
422 int max_bau_concurrent_constant; 515 int max_concurr_const;
423 int plugged_delay; 516 int plugged_delay;
424 int plugsb4reset; 517 int plugsb4reset;
425 int timeoutsb4reset; 518 int timeoutsb4reset;
426 int ipi_reset_limit; 519 int ipi_reset_limit;
427 int complete_threshold; 520 int complete_threshold;
428 int congested_response_us; 521 int cong_response_us;
429 int congested_reps; 522 int cong_reps;
430 int congested_period; 523 int cong_period;
431 cycles_t period_time; 524 cycles_t period_time;
432 long period_requests; 525 long period_requests;
433 struct hub_and_pnode *target_hub_and_pnode; 526 struct hub_and_pnode *thp;
434}; 527};
435 528
436static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) 529static unsigned long read_mmr_uv2_status(void)
530{
531 return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
532}
533
534static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
535{
536 write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
537}
538
539static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
540{
541 write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
542}
543
544static void write_mmr_activation(unsigned long index)
545{
546 write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
547}
548
549static void write_gmmr_activation(int pnode, unsigned long mmr_image)
550{
551 write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
552}
553
554static void write_mmr_payload_first(int pnode, unsigned long mmr_image)
555{
556 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
557}
558
559static void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
560{
561 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
562}
563
564static void write_mmr_payload_last(int pnode, unsigned long mmr_image)
565{
566 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
567}
568
569static void write_mmr_misc_control(int pnode, unsigned long mmr_image)
570{
571 write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
572}
573
574static unsigned long read_mmr_misc_control(int pnode)
575{
576 return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
577}
578
579static void write_mmr_sw_ack(unsigned long mr)
580{
581 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
582}
583
584static unsigned long read_mmr_sw_ack(void)
585{
586 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
587}
588
589static unsigned long read_gmmr_sw_ack(int pnode)
590{
591 return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
592}
593
594static void write_mmr_data_config(int pnode, unsigned long mr)
595{
596 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
597}
598
599static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp)
437{ 600{
438 return constant_test_bit(uvhub, &dstp->bits[0]); 601 return constant_test_bit(uvhub, &dstp->bits[0]);
439} 602}
440static inline void bau_uvhub_set(int pnode, struct bau_target_uvhubmask *dstp) 603static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp)
441{ 604{
442 __set_bit(pnode, &dstp->bits[0]); 605 __set_bit(pnode, &dstp->bits[0]);
443} 606}
444static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, 607static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp,
445 int nbits) 608 int nbits)
446{ 609{
447 bitmap_zero(&dstp->bits[0], nbits); 610 bitmap_zero(&dstp->bits[0], nbits);
448} 611}
449static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp) 612static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp)
450{ 613{
451 return bitmap_weight((unsigned long *)&dstp->bits[0], 614 return bitmap_weight((unsigned long *)&dstp->bits[0],
452 UV_DISTRIBUTION_SIZE); 615 UV_DISTRIBUTION_SIZE);
@@ -457,9 +620,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
457 bitmap_zero(&dstp->bits, nbits); 620 bitmap_zero(&dstp->bits, nbits);
458} 621}
459 622
460#define cpubit_isset(cpu, bau_local_cpumask) \
461 test_bit((cpu), (bau_local_cpumask).bits)
462
463extern void uv_bau_message_intr1(void); 623extern void uv_bau_message_intr1(void);
464extern void uv_bau_timeout_intr1(void); 624extern void uv_bau_timeout_intr1(void);
465 625
@@ -467,7 +627,7 @@ struct atomic_short {
467 short counter; 627 short counter;
468}; 628};
469 629
470/** 630/*
471 * atomic_read_short - read a short atomic variable 631 * atomic_read_short - read a short atomic variable
472 * @v: pointer of type atomic_short 632 * @v: pointer of type atomic_short
473 * 633 *
@@ -478,14 +638,14 @@ static inline int atomic_read_short(const struct atomic_short *v)
478 return v->counter; 638 return v->counter;
479} 639}
480 640
481/** 641/*
482 * atomic_add_short_return - add and return a short int 642 * atom_asr - add and return a short int
483 * @i: short value to add 643 * @i: short value to add
484 * @v: pointer of type atomic_short 644 * @v: pointer of type atomic_short
485 * 645 *
486 * Atomically adds @i to @v and returns @i + @v 646 * Atomically adds @i to @v and returns @i + @v
487 */ 647 */
488static inline int atomic_add_short_return(short i, struct atomic_short *v) 648static inline int atom_asr(short i, struct atomic_short *v)
489{ 649{
490 short __i = i; 650 short __i = i;
491 asm volatile(LOCK_PREFIX "xaddw %0, %1" 651 asm volatile(LOCK_PREFIX "xaddw %0, %1"
@@ -494,4 +654,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v)
494 return i + __i; 654 return i + __i;
495} 655}
496 656
657/*
658 * conditionally add 1 to *v, unless *v is >= u
659 * return 0 if we cannot add 1 to *v because it is >= u
660 * return 1 if we can add 1 to *v because it is < u
661 * the add is atomic
662 *
663 * This is close to atomic_add_unless(), but this allows the 'u' value
664 * to be lowered below the current 'v'. atomic_add_unless can only stop
665 * on equal.
666 */
667static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
668{
669 spin_lock(lock);
670 if (atomic_read(v) >= u) {
671 spin_unlock(lock);
672 return 0;
673 }
674 atomic_inc(v);
675 spin_unlock(lock);
676 return 1;
677}
678
497#endif /* _ASM_X86_UV_UV_BAU_H */ 679#endif /* _ASM_X86_UV_UV_BAU_H */
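atomic_inc_unless_ge() is the throttling primitive behind the max_concurr tunable: a sender may only bump the count of in-flight descriptors while it is below a limit that can be lowered at run time, which plain atomic_add_unless() (which only stops on equality) cannot express. A usage sketch; everything except the helper itself is invented here:

/* sketch: keep at most max_concurr BAU activations in flight */
static void throttled_activation_sketch(spinlock_t *lock, atomic_t *active,
					int max_concurr)
{
	while (!atomic_inc_unless_ge(lock, active, max_concurr))
		cpu_relax();		/* spin until a slot frees up */

	/* ... write the activation MMR and wait for completion ... */

	atomic_dec(active);		/* give the slot back */
}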
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index 4298002d0c83..f26544a15214 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -77,8 +77,9 @@
77 * 77 *
78 * 1111110000000000 78 * 1111110000000000
79 * 5432109876543210 79 * 5432109876543210
80 * pppppppppplc0cch Nehalem-EX 80 * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
81 * ppppppppplcc0cch Westmere-EX 81 * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
82 * pppppppppppcccch SandyBridge (15 bits in hdw reg)
82 * sssssssssss 83 * sssssssssss
83 * 84 *
84 * p = pnode bits 85 * p = pnode bits
@@ -87,7 +88,7 @@
87 * h = hyperthread 88 * h = hyperthread
88 * s = bits that are in the SOCKET_ID CSR 89 * s = bits that are in the SOCKET_ID CSR
89 * 90 *
90 * Note: Processor only supports 12 bits in the APICID register. The ACPI 91 * Note: Processor may support fewer bits in the APICID register. The ACPI
91 * tables hold all 16 bits. Software needs to be aware of this. 92 * tables hold all 16 bits. Software needs to be aware of this.
92 * 93 *
93 * Unless otherwise specified, all references to APICID refer to 94 * Unless otherwise specified, all references to APICID refer to
@@ -138,6 +139,8 @@ struct uv_hub_info_s {
138 unsigned long global_mmr_base; 139 unsigned long global_mmr_base;
139 unsigned long gpa_mask; 140 unsigned long gpa_mask;
140 unsigned int gnode_extra; 141 unsigned int gnode_extra;
142 unsigned char hub_revision;
143 unsigned char apic_pnode_shift;
141 unsigned long gnode_upper; 144 unsigned long gnode_upper;
142 unsigned long lowmem_remap_top; 145 unsigned long lowmem_remap_top;
143 unsigned long lowmem_remap_base; 146 unsigned long lowmem_remap_base;
@@ -149,13 +152,31 @@ struct uv_hub_info_s {
149 unsigned char m_val; 152 unsigned char m_val;
150 unsigned char n_val; 153 unsigned char n_val;
151 struct uv_scir_s scir; 154 struct uv_scir_s scir;
152 unsigned char apic_pnode_shift;
153}; 155};
154 156
155DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 157DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
156#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 158#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
157#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 159#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
158 160
161/*
162 * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2
163 * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE.
164 * This is a software convention - NOT the hardware revision numbers in
165 * the hub chip.
166 */
167#define UV1_HUB_REVISION_BASE 1
168#define UV2_HUB_REVISION_BASE 3
169
170static inline int is_uv1_hub(void)
171{
172 return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
173}
174
175static inline int is_uv2_hub(void)
176{
177 return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
178}
179
159union uvh_apicid { 180union uvh_apicid {
160 unsigned long v; 181 unsigned long v;
161 struct uvh_apicid_s { 182 struct uvh_apicid_s {
@@ -180,11 +201,25 @@ union uvh_apicid {
180#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) 201#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
181#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) 202#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
182 203
183#define UV_LOCAL_MMR_BASE 0xf4000000UL 204#define UV1_LOCAL_MMR_BASE 0xf4000000UL
184#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 205#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL
206#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
207#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
208
209#define UV2_LOCAL_MMR_BASE 0xfa000000UL
210#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
211#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
212#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
213
214#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \
215 : UV2_LOCAL_MMR_BASE)
216#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \
217 : UV2_GLOBAL_MMR32_BASE)
218#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
219 UV2_LOCAL_MMR_SIZE)
220#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
221 UV2_GLOBAL_MMR32_SIZE)
185#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) 222#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
186#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
187#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
188 223
189#define UV_GLOBAL_GRU_MMR_BASE 0x4000000 224#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
190 225
@@ -301,6 +336,17 @@ static inline int uv_apicid_to_pnode(int apicid)
301} 336}
302 337
303/* 338/*
339 * Convert an apicid to the socket number on the blade
340 */
341static inline int uv_apicid_to_socket(int apicid)
342{
343 if (is_uv1_hub())
344 return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
345 else
346 return 0;
347}
348
349/*
304 * Access global MMRs using the low memory MMR32 space. This region supports 350 * Access global MMRs using the low memory MMR32 space. This region supports
305 * faster MMR access but not all MMRs are accessible in this space. 351 * faster MMR access but not all MMRs are accessible in this space.
306 */ 352 */
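A hedged illustration of the faster 32-bit window described above; the helper name and the choice of register are assumptions, while uv_read_global_mmr32() is the existing accessor in uv_hub.h and the _32 offsets come from uv_mmrs.h:

	/* Illustrative only: read a BAU status MMR of a pnode via the MMR32 window. */
	static unsigned long uv_read_bau_status0(int pnode)
	{
		return uv_read_global_mmr32(pnode, UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32);
	}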
@@ -519,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
519 565
520/* 566/*
521 * Get the minimum revision number of the hub chips within the partition. 567 * Get the minimum revision number of the hub chips within the partition.
522 * 1 - initial rev 1.0 silicon 568 * 1 - UV1 rev 1.0 initial silicon
523 * 2 - rev 2.0 production silicon 569 * 2 - UV1 rev 2.0 production silicon
570 * 3 - UV2 rev 1.0 initial silicon
524 */ 571 */
525static inline int uv_get_min_hub_revision_id(void) 572static inline int uv_get_min_hub_revision_id(void)
526{ 573{
527 extern int uv_min_hub_revision_id; 574 return uv_hub_info->hub_revision;
528
529 return uv_min_hub_revision_id;
530} 575}
531 576
532#endif /* CONFIG_X86_64 */ 577#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index f5bb64a823d7..4be52c863448 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -11,13 +11,64 @@
11#ifndef _ASM_X86_UV_UV_MMRS_H 11#ifndef _ASM_X86_UV_UV_MMRS_H
12#define _ASM_X86_UV_UV_MMRS_H 12#define _ASM_X86_UV_UV_MMRS_H
13 13
14/*
15 * This file contains MMR definitions for both UV1 & UV2 hubs.
16 *
17 * In general, MMR addresses and structures are identical on both hubs.
18 * These MMRs are identified as:
19 * #define UVH_xxx <address>
20 * union uvh_xxx {
21 * unsigned long v;
22 * struct uvh_int_cmpd_s {
23 * } s;
24 * };
25 *
 26 * If the MMR exists on both hub types but has different addresses or
27 * contents, the MMR definition is similar to:
28 * #define UV1H_xxx <uv1 address>
 29 * #define UV2H_xxx <uv2 address>
30 * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
31 * union uvh_xxx {
32 * unsigned long v;
 33 * struct uvh_int_cmpd_s { (Common fields only)
34 * } s;
35 * struct uv1h_int_cmpd_s { (Full UV1 definition)
36 * } s1;
37 * struct uv2h_int_cmpd_s { (Full UV2 definition)
38 * } s2;
39 * };
40 *
 41 * Only essential differences are enumerated. For example, if the address is
 42 * the same for both UV1 & UV2, only a single #define is generated. Likewise,
 43 * if the contents are the same for both hubs, only the "s" structure is
44 * generated.
45 *
46 * If the MMR exists on ONLY 1 type of hub, no generic definition is
47 * generated:
48 * #define UVnH_xxx <uvn address>
49 * union uvnh_xxx {
50 * unsigned long v;
51 * struct uvh_int_cmpd_s {
52 * } sn;
53 * };
54 */
55
14#define UV_MMR_ENABLE (1UL << 63) 56#define UV_MMR_ENABLE (1UL << 63)
15 57
58#define UV1_HUB_PART_NUMBER 0x88a5
59#define UV2_HUB_PART_NUMBER 0x8eb8
60
61/* Compat: if this #define is present, UV headers support UV2 */
62#define UV2_HUB_IS_SUPPORTED 1
63
64/* KABI compat: if this #define is present, KABI hacks are present */
65#define UV2_HUB_KABI_HACKS 1
66
16/* ========================================================================= */ 67/* ========================================================================= */
17/* UVH_BAU_DATA_BROADCAST */ 68/* UVH_BAU_DATA_BROADCAST */
18/* ========================================================================= */ 69/* ========================================================================= */
19#define UVH_BAU_DATA_BROADCAST 0x61688UL 70#define UVH_BAU_DATA_BROADCAST 0x61688UL
20#define UVH_BAU_DATA_BROADCAST_32 0x0440 71#define UVH_BAU_DATA_BROADCAST_32 0x440
21 72
22#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 73#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
23#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL 74#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
@@ -34,7 +85,7 @@ union uvh_bau_data_broadcast_u {
34/* UVH_BAU_DATA_CONFIG */ 85/* UVH_BAU_DATA_CONFIG */
35/* ========================================================================= */ 86/* ========================================================================= */
36#define UVH_BAU_DATA_CONFIG 0x61680UL 87#define UVH_BAU_DATA_CONFIG 0x61680UL
37#define UVH_BAU_DATA_CONFIG_32 0x0438 88#define UVH_BAU_DATA_CONFIG_32 0x438
38 89
39#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 90#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
40#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL 91#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
@@ -73,125 +124,245 @@ union uvh_bau_data_config_u {
73/* UVH_EVENT_OCCURRED0 */ 124/* UVH_EVENT_OCCURRED0 */
74/* ========================================================================= */ 125/* ========================================================================= */
75#define UVH_EVENT_OCCURRED0 0x70000UL 126#define UVH_EVENT_OCCURRED0 0x70000UL
76#define UVH_EVENT_OCCURRED0_32 0x005e8 127#define UVH_EVENT_OCCURRED0_32 0x5e8
77 128
78#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 129#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
79#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL 130#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
80#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 131#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
81#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL 132#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
82#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 133#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
83#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL 134#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
84#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 135#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
85#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL 136#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
86#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 137#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
87#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL 138#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
88#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 139#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
89#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL 140#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
90#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 141#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
91#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL 142#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
92#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 143#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
93#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL 144#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
94#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 145#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
95#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL 146#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
96#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 147#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
97#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL 148#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
98#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 149#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
99#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL 150#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
100#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 151#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
101#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL 152#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
102#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 153#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
103#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL 154#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
104#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 155#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
105#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL 156#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
106#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 157#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
107#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL 158#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
108#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 159#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
109#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL 160#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
110#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 161#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
111#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL 162#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
112#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 163#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
113#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL 164#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
114#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 165#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
115#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL 166#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
116#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 167#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
117#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL 168#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
118#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 169#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
119#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL 170#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
120#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 171#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
121#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL 172#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
122#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 173#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
123#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL 174#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
124#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 175#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
125#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL 176#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
126#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 177#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
127#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL 178#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
128#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 179#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
129#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL 180#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
130#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 181#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
131#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL 182#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
132#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 183#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
133#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL 184#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
134#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 185#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
135#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL 186#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
136#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 187#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
137#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL 188#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
138#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 189#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
139#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL 190#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
140#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 191#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
141#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL 192#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
142#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 193#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
143#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL 194#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
144#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 195#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
145#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL 196#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
146#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 197#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
147#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL 198#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
148#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 199#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
149#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL 200#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
150#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 201#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
151#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL 202#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
152#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 203#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
153#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL 204#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
154#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 205#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
155#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL 206#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
156#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 207#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
157#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL 208#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
158#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 209#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
159#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL 210#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
160#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 211#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
161#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL 212#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
162#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 213#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
163#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL 214#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
164#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 215#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
165#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL 216#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
166#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 217#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
167#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL 218#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
168#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 219#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
169#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL 220#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
170#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 221#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
171#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL 222#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
172#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 223#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
173#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL 224#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
174#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 225#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
175#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL 226#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
176#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 227#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
177#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL 228#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
178#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 229#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
179#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL 230#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
180#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 231#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
181#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL 232#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
182#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 233#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
183#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL 234#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
184#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 235#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
185#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL 236#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
186#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 237#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
187#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL 238#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
188#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 239#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
189#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL 240#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
190#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 241#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
191#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL 242#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
243
244#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
245#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
246#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
247#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
248#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
249#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
250#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
251#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
252#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
253#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
254#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
255#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
256#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
257#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
258#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
259#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
260#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
261#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
262#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
263#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
264#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
265#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
266#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
267#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
268#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
269#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
270#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
271#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
272#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
273#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
274#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
275#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
276#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
277#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
278#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
279#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
280#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
281#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
282#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
283#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
284#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
285#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
286#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
287#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
288#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
289#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
290#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
291#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
292#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
293#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
294#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
295#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
296#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
297#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
298#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
299#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
300#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
301#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
302#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
303#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
304#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
305#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
306#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
307#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
308#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
309#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
310#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
311#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
312#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
313#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
314#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
315#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
316#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
317#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
318#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
319#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
320#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
321#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
322#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
323#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
324#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
325#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
326#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
327#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
328#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
329#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
330#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
331#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
332#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
333#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
334#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
335#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
336#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
337#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
338#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
339#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
340#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
341#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
342#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
343#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
344#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
345#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
346#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
347#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
348#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
349#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
350#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
351#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
352#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
353#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
354#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
355#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
356#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
357#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
358#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
359#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
360#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
361#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
362
192union uvh_event_occurred0_u { 363union uvh_event_occurred0_u {
193 unsigned long v; 364 unsigned long v;
194 struct uvh_event_occurred0_s { 365 struct uv1h_event_occurred0_s {
195 unsigned long lb_hcerr : 1; /* RW, W1C */ 366 unsigned long lb_hcerr : 1; /* RW, W1C */
196 unsigned long gr0_hcerr : 1; /* RW, W1C */ 367 unsigned long gr0_hcerr : 1; /* RW, W1C */
197 unsigned long gr1_hcerr : 1; /* RW, W1C */ 368 unsigned long gr1_hcerr : 1; /* RW, W1C */
@@ -250,14 +421,76 @@ union uvh_event_occurred0_u {
250 unsigned long bau_data : 1; /* RW, W1C */ 421 unsigned long bau_data : 1; /* RW, W1C */
251 unsigned long power_management_req : 1; /* RW, W1C */ 422 unsigned long power_management_req : 1; /* RW, W1C */
252 unsigned long rsvd_57_63 : 7; /* */ 423 unsigned long rsvd_57_63 : 7; /* */
253 } s; 424 } s1;
425 struct uv2h_event_occurred0_s {
426 unsigned long lb_hcerr : 1; /* RW */
427 unsigned long qp_hcerr : 1; /* RW */
428 unsigned long rh_hcerr : 1; /* RW */
429 unsigned long lh0_hcerr : 1; /* RW */
430 unsigned long lh1_hcerr : 1; /* RW */
431 unsigned long gr0_hcerr : 1; /* RW */
432 unsigned long gr1_hcerr : 1; /* RW */
433 unsigned long ni0_hcerr : 1; /* RW */
434 unsigned long ni1_hcerr : 1; /* RW */
435 unsigned long lb_aoerr0 : 1; /* RW */
436 unsigned long qp_aoerr0 : 1; /* RW */
437 unsigned long rh_aoerr0 : 1; /* RW */
438 unsigned long lh0_aoerr0 : 1; /* RW */
439 unsigned long lh1_aoerr0 : 1; /* RW */
440 unsigned long gr0_aoerr0 : 1; /* RW */
441 unsigned long gr1_aoerr0 : 1; /* RW */
442 unsigned long xb_aoerr0 : 1; /* RW */
443 unsigned long rt_aoerr0 : 1; /* RW */
444 unsigned long ni0_aoerr0 : 1; /* RW */
445 unsigned long ni1_aoerr0 : 1; /* RW */
446 unsigned long lb_aoerr1 : 1; /* RW */
447 unsigned long qp_aoerr1 : 1; /* RW */
448 unsigned long rh_aoerr1 : 1; /* RW */
449 unsigned long lh0_aoerr1 : 1; /* RW */
450 unsigned long lh1_aoerr1 : 1; /* RW */
451 unsigned long gr0_aoerr1 : 1; /* RW */
452 unsigned long gr1_aoerr1 : 1; /* RW */
453 unsigned long xb_aoerr1 : 1; /* RW */
454 unsigned long rt_aoerr1 : 1; /* RW */
455 unsigned long ni0_aoerr1 : 1; /* RW */
456 unsigned long ni1_aoerr1 : 1; /* RW */
457 unsigned long system_shutdown_int : 1; /* RW */
458 unsigned long lb_irq_int_0 : 1; /* RW */
459 unsigned long lb_irq_int_1 : 1; /* RW */
460 unsigned long lb_irq_int_2 : 1; /* RW */
461 unsigned long lb_irq_int_3 : 1; /* RW */
462 unsigned long lb_irq_int_4 : 1; /* RW */
463 unsigned long lb_irq_int_5 : 1; /* RW */
464 unsigned long lb_irq_int_6 : 1; /* RW */
465 unsigned long lb_irq_int_7 : 1; /* RW */
466 unsigned long lb_irq_int_8 : 1; /* RW */
467 unsigned long lb_irq_int_9 : 1; /* RW */
468 unsigned long lb_irq_int_10 : 1; /* RW */
469 unsigned long lb_irq_int_11 : 1; /* RW */
470 unsigned long lb_irq_int_12 : 1; /* RW */
471 unsigned long lb_irq_int_13 : 1; /* RW */
472 unsigned long lb_irq_int_14 : 1; /* RW */
473 unsigned long lb_irq_int_15 : 1; /* RW */
474 unsigned long l1_nmi_int : 1; /* RW */
475 unsigned long stop_clock : 1; /* RW */
476 unsigned long asic_to_l1 : 1; /* RW */
477 unsigned long l1_to_asic : 1; /* RW */
478 unsigned long la_seq_trigger : 1; /* RW */
479 unsigned long ipi_int : 1; /* RW */
480 unsigned long extio_int0 : 1; /* RW */
481 unsigned long extio_int1 : 1; /* RW */
482 unsigned long extio_int2 : 1; /* RW */
483 unsigned long extio_int3 : 1; /* RW */
484 unsigned long profile_int : 1; /* RW */
485 unsigned long rsvd_59_63 : 5; /* */
486 } s2;
254}; 487};
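A hedged sketch of the s1/s2 convention documented at the top of this file, using the union above; the helper name is an assumption for illustration, while uv_read_local_mmr() is the existing accessor from uv_hub.h:

	/* Illustrative only: read a field whose bit position differs between hubs. */
	static int uv_system_shutdown_int_pending(void)
	{
		union uvh_event_occurred0_u e;

		e.v = uv_read_local_mmr(UVH_EVENT_OCCURRED0);
		return is_uv1_hub() ? e.s1.system_shutdown_int : e.s2.system_shutdown_int;
	}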
255 488
256/* ========================================================================= */ 489/* ========================================================================= */
257/* UVH_EVENT_OCCURRED0_ALIAS */ 490/* UVH_EVENT_OCCURRED0_ALIAS */
258/* ========================================================================= */ 491/* ========================================================================= */
259#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL 492#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
260#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 493#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
261 494
262/* ========================================================================= */ 495/* ========================================================================= */
263/* UVH_GR0_TLB_INT0_CONFIG */ 496/* UVH_GR0_TLB_INT0_CONFIG */
@@ -432,8 +665,16 @@ union uvh_int_cmpb_u {
432/* ========================================================================= */ 665/* ========================================================================= */
433#define UVH_INT_CMPC 0x22100UL 666#define UVH_INT_CMPC 0x22100UL
434 667
435#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 668#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
436#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL 669#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
670#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \
671 UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \
672 UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
673#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
674#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
675#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \
676 UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \
677 UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)
437 678
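For MMRs like this one, where UV1 and UV2 happen to share the same field layout, the generic UVH_ macros above still select the hub-specific constants through is_uv1_hub(); a minimal sketch (helper name assumed for illustration):

	/* Illustrative only: extract the real-time comparator value on either hub. */
	static unsigned long uv_read_rtc_cmpc(void)
	{
		unsigned long v = uv_read_local_mmr(UVH_INT_CMPC);

		return (v & UVH_INT_CMPC_REAL_TIME_CMPC_MASK) >>
				UVH_INT_CMPC_REAL_TIME_CMPC_SHFT;
	}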
438union uvh_int_cmpc_u { 679union uvh_int_cmpc_u {
439 unsigned long v; 680 unsigned long v;
@@ -448,8 +689,16 @@ union uvh_int_cmpc_u {
448/* ========================================================================= */ 689/* ========================================================================= */
449#define UVH_INT_CMPD 0x22180UL 690#define UVH_INT_CMPD 0x22180UL
450 691
451#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 692#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
452#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL 693#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
694#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT (is_uv1_hub() ? \
695 UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT : \
696 UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT)
697#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
698#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
699#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK (is_uv1_hub() ? \
700 UV1H_INT_CMPD_REAL_TIME_CMPD_MASK : \
701 UV2H_INT_CMPD_REAL_TIME_CMPD_MASK)
453 702
454union uvh_int_cmpd_u { 703union uvh_int_cmpd_u {
455 unsigned long v; 704 unsigned long v;
@@ -463,7 +712,7 @@ union uvh_int_cmpd_u {
463/* UVH_IPI_INT */ 712/* UVH_IPI_INT */
464/* ========================================================================= */ 713/* ========================================================================= */
465#define UVH_IPI_INT 0x60500UL 714#define UVH_IPI_INT 0x60500UL
466#define UVH_IPI_INT_32 0x0348 715#define UVH_IPI_INT_32 0x348
467 716
468#define UVH_IPI_INT_VECTOR_SHFT 0 717#define UVH_IPI_INT_VECTOR_SHFT 0
469#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL 718#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
@@ -493,7 +742,7 @@ union uvh_ipi_int_u {
493/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ 742/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
494/* ========================================================================= */ 743/* ========================================================================= */
495#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL 744#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
496#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 745#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
497 746
498#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 747#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
499#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL 748#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -515,7 +764,7 @@ union uvh_lb_bau_intd_payload_queue_first_u {
515/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ 764/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
516/* ========================================================================= */ 765/* ========================================================================= */
517#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL 766#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
518#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 767#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
519 768
520#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 769#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
521#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL 770#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -533,7 +782,7 @@ union uvh_lb_bau_intd_payload_queue_last_u {
533/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ 782/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
534/* ========================================================================= */ 783/* ========================================================================= */
535#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL 784#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
536#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 785#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
537 786
538#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 787#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
539#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL 788#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
@@ -551,7 +800,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
551/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ 800/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
552/* ========================================================================= */ 801/* ========================================================================= */
553#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL 802#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
554#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 803#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
555 804
556#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 805#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
557#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL 806#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
@@ -585,6 +834,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
585#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL 834#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
586#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 835#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
587#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL 836#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
837
588union uvh_lb_bau_intd_software_acknowledge_u { 838union uvh_lb_bau_intd_software_acknowledge_u {
589 unsigned long v; 839 unsigned long v;
590 struct uvh_lb_bau_intd_software_acknowledge_s { 840 struct uvh_lb_bau_intd_software_acknowledge_s {
@@ -612,13 +862,13 @@ union uvh_lb_bau_intd_software_acknowledge_u {
612/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ 862/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
613/* ========================================================================= */ 863/* ========================================================================= */
614#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL 864#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
615#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 865#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
616 866
617/* ========================================================================= */ 867/* ========================================================================= */
618/* UVH_LB_BAU_MISC_CONTROL */ 868/* UVH_LB_BAU_MISC_CONTROL */
619/* ========================================================================= */ 869/* ========================================================================= */
620#define UVH_LB_BAU_MISC_CONTROL 0x320170UL 870#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
621#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10 871#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
622 872
623#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 873#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
624#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL 874#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
@@ -628,8 +878,8 @@ union uvh_lb_bau_intd_software_acknowledge_u {
628#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL 878#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
629#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 879#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
630#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL 880#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
631#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11 881#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
632#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL 882#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
633#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 883#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
634#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL 884#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
635#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 885#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
@@ -650,8 +900,86 @@ union uvh_lb_bau_intd_software_acknowledge_u {
650#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL 900#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
651#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 901#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
652#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL 902#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
653#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 903
654#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL 904#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
905#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
906#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
907#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
908#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
909#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
910#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
911#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
912#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
913#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
914#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
915#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
916#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
917#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
918#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
919#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
920#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
921#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
922#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
923#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
924#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
925#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
926#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
927#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
928#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
929#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
930#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
931#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
932#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
933#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
934#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
935#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
936
937#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
938#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
939#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
940#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
941#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
942#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
943#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
944#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
945#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
946#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
947#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
948#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
949#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
950#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
951#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
952#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
953#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
954#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
955#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
956#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
957#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
958#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
959#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
960#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
961#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
962#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
963#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
964#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
965#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
966#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
967#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
968#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
969#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
970#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
971#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
972#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
973#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
974#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
975#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
976#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
977#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
978#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
979#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
980#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
981#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
982#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
655 983
656union uvh_lb_bau_misc_control_u { 984union uvh_lb_bau_misc_control_u {
657 unsigned long v; 985 unsigned long v;
@@ -660,7 +988,25 @@ union uvh_lb_bau_misc_control_u {
660 unsigned long apic_mode : 1; /* RW */ 988 unsigned long apic_mode : 1; /* RW */
661 unsigned long force_broadcast : 1; /* RW */ 989 unsigned long force_broadcast : 1; /* RW */
662 unsigned long force_lock_nop : 1; /* RW */ 990 unsigned long force_lock_nop : 1; /* RW */
663 unsigned long csi_agent_presence_vector : 3; /* RW */ 991 unsigned long qpi_agent_presence_vector : 3; /* RW */
992 unsigned long descriptor_fetch_mode : 1; /* RW */
993 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
994 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
995 unsigned long enable_dual_mapping_mode : 1; /* RW */
996 unsigned long vga_io_port_decode_enable : 1; /* RW */
997 unsigned long vga_io_port_16_bit_decode : 1; /* RW */
998 unsigned long suppress_dest_registration : 1; /* RW */
999 unsigned long programmed_initial_priority : 3; /* RW */
1000 unsigned long use_incoming_priority : 1; /* RW */
1001 unsigned long enable_programmed_initial_priority : 1; /* RW */
1002 unsigned long rsvd_29_63 : 35;
1003 } s;
1004 struct uv1h_lb_bau_misc_control_s {
1005 unsigned long rejection_delay : 8; /* RW */
1006 unsigned long apic_mode : 1; /* RW */
1007 unsigned long force_broadcast : 1; /* RW */
1008 unsigned long force_lock_nop : 1; /* RW */
1009 unsigned long qpi_agent_presence_vector : 3; /* RW */
664 unsigned long descriptor_fetch_mode : 1; /* RW */ 1010 unsigned long descriptor_fetch_mode : 1; /* RW */
665 unsigned long enable_intd_soft_ack_mode : 1; /* RW */ 1011 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
666 unsigned long intd_soft_ack_timeout_period : 4; /* RW */ 1012 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
@@ -673,14 +1019,40 @@ union uvh_lb_bau_misc_control_u {
673 unsigned long enable_programmed_initial_priority : 1; /* RW */ 1019 unsigned long enable_programmed_initial_priority : 1; /* RW */
674 unsigned long rsvd_29_47 : 19; /* */ 1020 unsigned long rsvd_29_47 : 19; /* */
675 unsigned long fun : 16; /* RW */ 1021 unsigned long fun : 16; /* RW */
676 } s; 1022 } s1;
1023 struct uv2h_lb_bau_misc_control_s {
1024 unsigned long rejection_delay : 8; /* RW */
1025 unsigned long apic_mode : 1; /* RW */
1026 unsigned long force_broadcast : 1; /* RW */
1027 unsigned long force_lock_nop : 1; /* RW */
1028 unsigned long qpi_agent_presence_vector : 3; /* RW */
1029 unsigned long descriptor_fetch_mode : 1; /* RW */
1030 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
1031 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
1032 unsigned long enable_dual_mapping_mode : 1; /* RW */
1033 unsigned long vga_io_port_decode_enable : 1; /* RW */
1034 unsigned long vga_io_port_16_bit_decode : 1; /* RW */
1035 unsigned long suppress_dest_registration : 1; /* RW */
1036 unsigned long programmed_initial_priority : 3; /* RW */
1037 unsigned long use_incoming_priority : 1; /* RW */
1038 unsigned long enable_programmed_initial_priority : 1; /* RW */
1039 unsigned long enable_automatic_apic_mode_selection : 1; /* RW */
1040 unsigned long apic_mode_status : 1; /* RO */
1041 unsigned long suppress_interrupts_to_self : 1; /* RW */
1042 unsigned long enable_lock_based_system_flush : 1; /* RW */
1043 unsigned long enable_extended_sb_status : 1; /* RW */
1044 unsigned long suppress_int_prio_udt_to_self : 1; /* RW */
1045 unsigned long use_legacy_descriptor_formats : 1; /* RW */
1046 unsigned long rsvd_36_47 : 12; /* */
1047 unsigned long fun : 16; /* RW */
1048 } s2;
677}; 1049};
678 1050
679/* ========================================================================= */ 1051/* ========================================================================= */
680/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ 1052/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
681/* ========================================================================= */ 1053/* ========================================================================= */
682#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL 1054#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
683#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 1055#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
684 1056
685#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 1057#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
686#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL 1058#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
@@ -703,7 +1075,7 @@ union uvh_lb_bau_sb_activation_control_u {
703/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ 1075/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
704/* ========================================================================= */ 1076/* ========================================================================= */
705#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL 1077#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
706#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 1078#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
707 1079
708#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 1080#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
709#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL 1081#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
@@ -719,7 +1091,7 @@ union uvh_lb_bau_sb_activation_status_0_u {
719/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ 1091/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
720/* ========================================================================= */ 1092/* ========================================================================= */
721#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL 1093#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
722#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 1094#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
723 1095
724#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 1096#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
725#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL 1097#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
@@ -735,7 +1107,7 @@ union uvh_lb_bau_sb_activation_status_1_u {
735/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ 1107/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
736/* ========================================================================= */ 1108/* ========================================================================= */
737#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL 1109#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
738#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 1110#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
739 1111
740#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 1112#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
741#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL 1113#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
@@ -754,23 +1126,6 @@ union uvh_lb_bau_sb_descriptor_base_u {
754}; 1126};
755 1127
756/* ========================================================================= */ 1128/* ========================================================================= */
757/* UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK */
758/* ========================================================================= */
759#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
760#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x009f0
761
762#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
763#define UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
764
765union uvh_lb_target_physical_apic_id_mask_u {
766 unsigned long v;
767 struct uvh_lb_target_physical_apic_id_mask_s {
768 unsigned long bit_enables : 32; /* RW */
769 unsigned long rsvd_32_63 : 32; /* */
770 } s;
771};
772
773/* ========================================================================= */
774/* UVH_NODE_ID */ 1129/* UVH_NODE_ID */
775/* ========================================================================= */ 1130/* ========================================================================= */
776#define UVH_NODE_ID 0x0UL 1131#define UVH_NODE_ID 0x0UL
@@ -785,10 +1140,36 @@ union uvh_lb_target_physical_apic_id_mask_u {
785#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL 1140#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
786#define UVH_NODE_ID_NODE_ID_SHFT 32 1141#define UVH_NODE_ID_NODE_ID_SHFT 32
787#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL 1142#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
788#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48 1143
789#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL 1144#define UV1H_NODE_ID_FORCE1_SHFT 0
790#define UVH_NODE_ID_NI_PORT_SHFT 56 1145#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
791#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL 1146#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
1147#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1148#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
1149#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1150#define UV1H_NODE_ID_REVISION_SHFT 28
1151#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1152#define UV1H_NODE_ID_NODE_ID_SHFT 32
1153#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1154#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
1155#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
1156#define UV1H_NODE_ID_NI_PORT_SHFT 56
1157#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
1158
1159#define UV2H_NODE_ID_FORCE1_SHFT 0
1160#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
1161#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
1162#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1163#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
1164#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1165#define UV2H_NODE_ID_REVISION_SHFT 28
1166#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1167#define UV2H_NODE_ID_NODE_ID_SHFT 32
1168#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1169#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
1170#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
1171#define UV2H_NODE_ID_NI_PORT_SHFT 57
1172#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
792 1173
793union uvh_node_id_u { 1174union uvh_node_id_u {
794 unsigned long v; 1175 unsigned long v;
@@ -798,12 +1179,31 @@ union uvh_node_id_u {
798 unsigned long part_number : 16; /* RO */ 1179 unsigned long part_number : 16; /* RO */
799 unsigned long revision : 4; /* RO */ 1180 unsigned long revision : 4; /* RO */
800 unsigned long node_id : 15; /* RW */ 1181 unsigned long node_id : 15; /* RW */
1182 unsigned long rsvd_47_63 : 17;
1183 } s;
1184 struct uv1h_node_id_s {
1185 unsigned long force1 : 1; /* RO */
1186 unsigned long manufacturer : 11; /* RO */
1187 unsigned long part_number : 16; /* RO */
1188 unsigned long revision : 4; /* RO */
1189 unsigned long node_id : 15; /* RW */
801 unsigned long rsvd_47 : 1; /* */ 1190 unsigned long rsvd_47 : 1; /* */
802 unsigned long nodes_per_bit : 7; /* RW */ 1191 unsigned long nodes_per_bit : 7; /* RW */
803 unsigned long rsvd_55 : 1; /* */ 1192 unsigned long rsvd_55 : 1; /* */
804 unsigned long ni_port : 4; /* RO */ 1193 unsigned long ni_port : 4; /* RO */
805 unsigned long rsvd_60_63 : 4; /* */ 1194 unsigned long rsvd_60_63 : 4; /* */
806 } s; 1195 } s1;
1196 struct uv2h_node_id_s {
1197 unsigned long force1 : 1; /* RO */
1198 unsigned long manufacturer : 11; /* RO */
1199 unsigned long part_number : 16; /* RO */
1200 unsigned long revision : 4; /* RO */
1201 unsigned long node_id : 15; /* RW */
1202 unsigned long rsvd_47_49 : 3; /* */
1203 unsigned long nodes_per_bit : 7; /* RO */
1204 unsigned long ni_port : 5; /* RO */
1205 unsigned long rsvd_62_63 : 2; /* */
1206 } s2;
807}; 1207};
808 1208
809/* ========================================================================= */ 1209/* ========================================================================= */
@@ -954,18 +1354,38 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
954#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL 1354#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
955#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6 1355#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
956#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL 1356#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
957#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12 1357
958#define UVH_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL 1358#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1359#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1360#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1361#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1362#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
1363#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
1364
1365#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1366#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1367#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1368#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
959 1369
960union uvh_rh_gam_config_mmr_u { 1370union uvh_rh_gam_config_mmr_u {
961 unsigned long v; 1371 unsigned long v;
962 struct uvh_rh_gam_config_mmr_s { 1372 struct uvh_rh_gam_config_mmr_s {
963 unsigned long m_skt : 6; /* RW */ 1373 unsigned long m_skt : 6; /* RW */
964 unsigned long n_skt : 4; /* RW */ 1374 unsigned long n_skt : 4; /* RW */
1375 unsigned long rsvd_10_63 : 54;
1376 } s;
1377 struct uv1h_rh_gam_config_mmr_s {
1378 unsigned long m_skt : 6; /* RW */
1379 unsigned long n_skt : 4; /* RW */
965 unsigned long rsvd_10_11: 2; /* */ 1380 unsigned long rsvd_10_11: 2; /* */
966 unsigned long mmiol_cfg : 1; /* RW */ 1381 unsigned long mmiol_cfg : 1; /* RW */
967 unsigned long rsvd_13_63: 51; /* */ 1382 unsigned long rsvd_13_63: 51; /* */
968 } s; 1383 } s1;
1384 struct uv2h_rh_gam_config_mmr_s {
1385 unsigned long m_skt : 6; /* RW */
1386 unsigned long n_skt : 4; /* RW */
1387 unsigned long rsvd_10_63: 54; /* */
1388 } s2;
969}; 1389};
970 1390
971/* ========================================================================= */ 1391/* ========================================================================= */
@@ -975,25 +1395,49 @@ union uvh_rh_gam_config_mmr_u {
975 1395
976#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 1396#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
977#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL 1397#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
978#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 1398
979#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL 1399#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
980#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 1400#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
981#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL 1401#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
982#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1402#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
983#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1403#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1404#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1405#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1406#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1407
1408#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
1409#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
1410#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1411#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1412#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1413#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
984 1414
985union uvh_rh_gam_gru_overlay_config_mmr_u { 1415union uvh_rh_gam_gru_overlay_config_mmr_u {
986 unsigned long v; 1416 unsigned long v;
987 struct uvh_rh_gam_gru_overlay_config_mmr_s { 1417 struct uvh_rh_gam_gru_overlay_config_mmr_s {
988 unsigned long rsvd_0_27: 28; /* */ 1418 unsigned long rsvd_0_27: 28; /* */
989 unsigned long base : 18; /* RW */ 1419 unsigned long base : 18; /* RW */
1420 unsigned long rsvd_46_62 : 17;
1421 unsigned long enable : 1; /* RW */
1422 } s;
1423 struct uv1h_rh_gam_gru_overlay_config_mmr_s {
1424 unsigned long rsvd_0_27: 28; /* */
1425 unsigned long base : 18; /* RW */
990 unsigned long rsvd_46_47: 2; /* */ 1426 unsigned long rsvd_46_47: 2; /* */
991 unsigned long gr4 : 1; /* RW */ 1427 unsigned long gr4 : 1; /* RW */
992 unsigned long rsvd_49_51: 3; /* */ 1428 unsigned long rsvd_49_51: 3; /* */
993 unsigned long n_gru : 4; /* RW */ 1429 unsigned long n_gru : 4; /* RW */
994 unsigned long rsvd_56_62: 7; /* */ 1430 unsigned long rsvd_56_62: 7; /* */
995 unsigned long enable : 1; /* RW */ 1431 unsigned long enable : 1; /* RW */
996 } s; 1432 } s1;
1433 struct uv2h_rh_gam_gru_overlay_config_mmr_s {
1434 unsigned long rsvd_0_27: 28; /* */
1435 unsigned long base : 18; /* RW */
1436 unsigned long rsvd_46_51: 6; /* */
1437 unsigned long n_gru : 4; /* RW */
1438 unsigned long rsvd_56_62: 7; /* */
1439 unsigned long enable : 1; /* RW */
1440 } s2;
997}; 1441};
998 1442
999/* ========================================================================= */ 1443/* ========================================================================= */
@@ -1001,25 +1445,42 @@ union uvh_rh_gam_gru_overlay_config_mmr_u {
1001/* ========================================================================= */ 1445/* ========================================================================= */
1002#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL 1446#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
1003 1447
1004#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 1448#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
1005#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL 1449#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
1006#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 1450#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1007#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL 1451#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1008#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 1452#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1009#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL 1453#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1010#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1454#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1011#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1455#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1456
1457#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
1458#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
1459#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1460#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1461#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1462#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1463#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1464#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1012 1465
1013union uvh_rh_gam_mmioh_overlay_config_mmr_u { 1466union uvh_rh_gam_mmioh_overlay_config_mmr_u {
1014 unsigned long v; 1467 unsigned long v;
1015 struct uvh_rh_gam_mmioh_overlay_config_mmr_s { 1468 struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
1016 unsigned long rsvd_0_29: 30; /* */ 1469 unsigned long rsvd_0_29: 30; /* */
1017 unsigned long base : 16; /* RW */ 1470 unsigned long base : 16; /* RW */
1018 unsigned long m_io : 6; /* RW */ 1471 unsigned long m_io : 6; /* RW */
1019 unsigned long n_io : 4; /* RW */ 1472 unsigned long n_io : 4; /* RW */
1020 unsigned long rsvd_56_62: 7; /* */ 1473 unsigned long rsvd_56_62: 7; /* */
1021 unsigned long enable : 1; /* RW */ 1474 unsigned long enable : 1; /* RW */
1022 } s; 1475 } s1;
1476 struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
1477 unsigned long rsvd_0_26: 27; /* */
1478 unsigned long base : 19; /* RW */
1479 unsigned long m_io : 6; /* RW */
1480 unsigned long n_io : 4; /* RW */
1481 unsigned long rsvd_56_62: 7; /* */
1482 unsigned long enable : 1; /* RW */
1483 } s2;
1023}; 1484};
1024 1485
1025/* ========================================================================= */ 1486/* ========================================================================= */
@@ -1029,20 +1490,40 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u {
1029 1490
1030#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 1491#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1031#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL 1492#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1032#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 1493
1033#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL 1494#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1034#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1495#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1035#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1496#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
1497#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
1498#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1499#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1500
1501#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1502#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1503#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1504#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1036 1505
1037union uvh_rh_gam_mmr_overlay_config_mmr_u { 1506union uvh_rh_gam_mmr_overlay_config_mmr_u {
1038 unsigned long v; 1507 unsigned long v;
1039 struct uvh_rh_gam_mmr_overlay_config_mmr_s { 1508 struct uvh_rh_gam_mmr_overlay_config_mmr_s {
1040 unsigned long rsvd_0_25: 26; /* */ 1509 unsigned long rsvd_0_25: 26; /* */
1041 unsigned long base : 20; /* RW */ 1510 unsigned long base : 20; /* RW */
1511 unsigned long rsvd_46_62 : 17;
1512 unsigned long enable : 1; /* RW */
1513 } s;
1514 struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
1515 unsigned long rsvd_0_25: 26; /* */
1516 unsigned long base : 20; /* RW */
1042 unsigned long dual_hub : 1; /* RW */ 1517 unsigned long dual_hub : 1; /* RW */
1043 unsigned long rsvd_47_62: 16; /* */ 1518 unsigned long rsvd_47_62: 16; /* */
1044 unsigned long enable : 1; /* RW */ 1519 unsigned long enable : 1; /* RW */
1045 } s; 1520 } s1;
1521 struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
1522 unsigned long rsvd_0_25: 26; /* */
1523 unsigned long base : 20; /* RW */
1524 unsigned long rsvd_46_62: 17; /* */
1525 unsigned long enable : 1; /* RW */
1526 } s2;
1046}; 1527};
1047 1528
1048/* ========================================================================= */ 1529/* ========================================================================= */
@@ -1103,10 +1584,11 @@ union uvh_rtc1_int_config_u {
1103/* UVH_SCRATCH5 */ 1584/* UVH_SCRATCH5 */
1104/* ========================================================================= */ 1585/* ========================================================================= */
1105#define UVH_SCRATCH5 0x2d0200UL 1586#define UVH_SCRATCH5 0x2d0200UL
1106#define UVH_SCRATCH5_32 0x00778 1587#define UVH_SCRATCH5_32 0x778
1107 1588
1108#define UVH_SCRATCH5_SCRATCH5_SHFT 0 1589#define UVH_SCRATCH5_SCRATCH5_SHFT 0
1109#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL 1590#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
1591
1110union uvh_scratch5_u { 1592union uvh_scratch5_u {
1111 unsigned long v; 1593 unsigned long v;
1112 struct uvh_scratch5_s { 1594 struct uvh_scratch5_s {
@@ -1114,4 +1596,154 @@ union uvh_scratch5_u {
1114 } s; 1596 } s;
1115}; 1597};
1116 1598
1599/* ========================================================================= */
1600/* UV2H_EVENT_OCCURRED2 */
1601/* ========================================================================= */
1602#define UV2H_EVENT_OCCURRED2 0x70100UL
1603#define UV2H_EVENT_OCCURRED2_32 0xb68
1604
1605#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
1606#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
1607#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
1608#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
1609#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
1610#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
1611#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
1612#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
1613#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
1614#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
1615#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
1616#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
1617#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
1618#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
1619#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
1620#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
1621#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
1622#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
1623#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
1624#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
1625#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
1626#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
1627#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
1628#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
1629#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
1630#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
1631#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
1632#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
1633#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
1634#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
1635#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
1636#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
1637#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
1638#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
1639#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
1640#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
1641#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
1642#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
1643#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
1644#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
1645#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
1646#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
1647#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
1648#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
1649#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
1650#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
1651#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
1652#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
1653#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
1654#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
1655#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
1656#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
1657#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
1658#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
1659#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
1660#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
1661#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
1662#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
1663#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
1664#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
1665#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
1666#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
1667#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
1668#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
1669
1670union uv2h_event_occurred2_u {
1671 unsigned long v;
1672 struct uv2h_event_occurred2_s {
1673 unsigned long rtc_0 : 1; /* RW */
1674 unsigned long rtc_1 : 1; /* RW */
1675 unsigned long rtc_2 : 1; /* RW */
1676 unsigned long rtc_3 : 1; /* RW */
1677 unsigned long rtc_4 : 1; /* RW */
1678 unsigned long rtc_5 : 1; /* RW */
1679 unsigned long rtc_6 : 1; /* RW */
1680 unsigned long rtc_7 : 1; /* RW */
1681 unsigned long rtc_8 : 1; /* RW */
1682 unsigned long rtc_9 : 1; /* RW */
1683 unsigned long rtc_10 : 1; /* RW */
1684 unsigned long rtc_11 : 1; /* RW */
1685 unsigned long rtc_12 : 1; /* RW */
1686 unsigned long rtc_13 : 1; /* RW */
1687 unsigned long rtc_14 : 1; /* RW */
1688 unsigned long rtc_15 : 1; /* RW */
1689 unsigned long rtc_16 : 1; /* RW */
1690 unsigned long rtc_17 : 1; /* RW */
1691 unsigned long rtc_18 : 1; /* RW */
1692 unsigned long rtc_19 : 1; /* RW */
1693 unsigned long rtc_20 : 1; /* RW */
1694 unsigned long rtc_21 : 1; /* RW */
1695 unsigned long rtc_22 : 1; /* RW */
1696 unsigned long rtc_23 : 1; /* RW */
1697 unsigned long rtc_24 : 1; /* RW */
1698 unsigned long rtc_25 : 1; /* RW */
1699 unsigned long rtc_26 : 1; /* RW */
1700 unsigned long rtc_27 : 1; /* RW */
1701 unsigned long rtc_28 : 1; /* RW */
1702 unsigned long rtc_29 : 1; /* RW */
1703 unsigned long rtc_30 : 1; /* RW */
1704 unsigned long rtc_31 : 1; /* RW */
1705 unsigned long rsvd_32_63: 32; /* */
1706 } s1;
1707};
1708
1709/* ========================================================================= */
1710/* UV2H_EVENT_OCCURRED2_ALIAS */
1711/* ========================================================================= */
1712#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
1713#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
1714
1715/* ========================================================================= */
1716/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
1717/* ========================================================================= */
1718#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
1719#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
1720
1721#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
1722#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
1723
1724union uv2h_lb_bau_sb_activation_status_2_u {
1725 unsigned long v;
1726 struct uv2h_lb_bau_sb_activation_status_2_s {
1727 unsigned long aux_error : 64; /* RW */
1728 } s1;
1729};
1730
1731/* ========================================================================= */
1732/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
1733/* ========================================================================= */
1734#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
1735#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
1736
1737#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
1738#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
1739
1740union uv1h_lb_target_physical_apic_id_mask_u {
1741 unsigned long v;
1742 struct uv1h_lb_target_physical_apic_id_mask_s {
1743 unsigned long bit_enables : 32; /* RW */
1744 unsigned long rsvd_32_63 : 32; /* */
1745 } s1;
1746};
1747
1748
1117#endif /* __ASM_UV_MMRS_X86_H__ */ 1749#endif /* __ASM_UV_MMRS_X86_H__ */
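
The register layouts above all follow one pattern: a raw 64-bit value plus per-revision bitfield views (.s for fields common to both hubs, .s1 for UV1, .s2 for UV2). A minimal sketch of how a caller might decode one of them, assuming the usual uv_read_local_mmr() accessor from <asm/uv/uv_hub.h>:

static inline int example_uv_node_id(void)
{
	union uvh_node_id_u node_id;

	/* Read the raw MMR, then pick it apart through the bitfield view. */
	node_id.v = uv_read_local_mmr(UVH_NODE_ID);

	/*
	 * Fields shared by UV1 and UV2 are read through the generic .s view;
	 * hub-specific fields (e.g. nodes_per_bit) go through .s1 or .s2.
	 */
	return node_id.s.node_id;
}
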
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73de..bb0522850b74 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
1#ifndef _ASM_X86_VDSO_H 1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H 2#define _ASM_X86_VDSO_H
3 3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) \
12({ \
13 extern const char VDSO64_##name[]; \
14 (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
15})
16#endif
17
18#if defined CONFIG_X86_32 || defined CONFIG_COMPAT 4#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
19extern const char VDSO32_PRELINK[]; 5extern const char VDSO32_PRELINK[];
20 6
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826f..646b4c1ca695 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -23,8 +23,6 @@ struct vsyscall_gtod_data {
23 struct timespec wall_to_monotonic; 23 struct timespec wall_to_monotonic;
24 struct timespec wall_time_coarse; 24 struct timespec wall_time_coarse;
25}; 25};
26extern struct vsyscall_gtod_data __vsyscall_gtod_data
27__section_vsyscall_gtod_data;
28extern struct vsyscall_gtod_data vsyscall_gtod_data; 26extern struct vsyscall_gtod_data vsyscall_gtod_data;
29 27
30#endif /* _ASM_X86_VGTOD_H */ 28#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fbd..d55597351f6a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,19 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
21
22/* Definitions for CONFIG_GENERIC_TIME definitions */ 19/* Definitions for CONFIG_GENERIC_TIME definitions */
23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn \ 20#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace 21 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
29 22
30#define VGETCPU_RDTSCP 1 23#define VGETCPU_RDTSCP 1
31#define VGETCPU_LSL 2 24#define VGETCPU_LSL 2
32 25
33extern int __vgetcpu_mode;
34extern volatile unsigned long __jiffies;
35
36/* kernel space (writeable) */ 26/* kernel space (writeable) */
37extern int vgetcpu_mode; 27extern int vgetcpu_mode;
38extern struct timezone sys_tz; 28extern struct timezone sys_tz;
39 29
30#include <asm/vvar.h>
31
40extern void map_vsyscall(void); 32extern void map_vsyscall(void);
41 33
42#endif /* __KERNEL__ */ 34#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 000000000000..341b3559452b
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,52 @@
1/*
2 * vvar.h: Shared vDSO/kernel variable declarations
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * A handful of variables are accessible (read-only) from userspace
7 * code in the vsyscall page and the vdso. They are declared here.
8 * Some other file must define them with DEFINE_VVAR.
9 *
10 * In normal kernel code, they are used like any other variable.
11 * In user code, they are accessed through the VVAR macro.
12 *
13 * Each of these variables lives in the vsyscall page, and each
14 * one needs a unique offset within the little piece of the page
15 * reserved for vvars. Specify that offset in DECLARE_VVAR.
16 * (There are 896 bytes available. If you mess up, the linker will
17 * catch it.)
18 */
19
20/* Offset of vars within vsyscall page */
21#define VSYSCALL_VARS_OFFSET (3072 + 128)
22
23#if defined(__VVAR_KERNEL_LDS)
24
25/* The kernel linker script defines its own magic to put vvars in the
26 * right place.
27 */
28#define DECLARE_VVAR(offset, type, name) \
29 EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
30
31#else
32
33#define DECLARE_VVAR(offset, type, name) \
34 static type const * const vvaraddr_ ## name = \
35 (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
36
37#define DEFINE_VVAR(type, name) \
38 type __vvar_ ## name \
39 __attribute__((section(".vsyscall_var_" #name), aligned(16)))
40
41#define VVAR(name) (*vvaraddr_ ## name)
42
43#endif
44
45/* DECLARE_VVAR(offset, type, name) */
46
47DECLARE_VVAR(0, volatile unsigned long, jiffies)
48DECLARE_VVAR(8, int, vgetcpu_mode)
49DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
50
51#undef DECLARE_VVAR
52#undef VSYSCALL_VARS_OFFSET
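
A minimal sketch of how the two halves of the vvar mechanism fit together, using the vgetcpu_mode declaration above (the real definitions and readers are wired up elsewhere in this series):

/* Kernel side: instantiate the variable in its .vsyscall_var_* section. */
DEFINE_VVAR(int, vgetcpu_mode);

/* vsyscall/vDSO side: read it back through the fixed-address alias. */
static inline int example_read_vgetcpu_mode(void)
{
	return VVAR(vgetcpu_mode);
}
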
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
new file mode 100644
index 000000000000..6bf5b8e478c0
--- /dev/null
+++ b/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,62 @@
1/*
2 * Common bits for X2APIC cluster/physical modes.
3 */
4
5#ifndef _ASM_X86_X2APIC_H
6#define _ASM_X86_X2APIC_H
7
8#include <asm/apic.h>
9#include <asm/ipi.h>
10#include <linux/cpumask.h>
11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X is used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_registered(void)
22{
23 return 1;
24}
25
26/*
27 * For now each logical cpu is in its own vector allocation domain.
28 */
29static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
30{
31 cpumask_clear(retmask);
32 cpumask_set_cpu(cpu, retmask);
33}
34
35static void
36__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
37{
38 unsigned long cfg = __prepare_ICR(0, vector, dest);
39 native_x2apic_icr_write(cfg, apicid);
40}
41
42static unsigned int x2apic_get_apic_id(unsigned long id)
43{
44 return id;
45}
46
47static unsigned long x2apic_set_apic_id(unsigned int id)
48{
49 return id;
50}
51
52static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
53{
54 return initial_apicid >> index_msb;
55}
56
57static void x2apic_send_IPI_self(int vector)
58{
59 apic_write(APIC_SELF_IPI, vector);
60}
61
62#endif /* _ASM_X86_X2APIC_H */
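
These helpers are meant to be shared by the x2apic cluster and physical mode drivers, which include this header and wire the functions into their struct apic. A hedged, deliberately incomplete sketch (the field names are assumed from the struct apic of this era):

static struct apic example_apic_x2apic = {
	.name				= "example x2apic",
	.apic_id_registered		= x2apic_apic_id_registered,
	.target_cpus			= x2apic_target_cpus,
	.vector_allocation_domain	= x2apic_vector_allocation_domain,
	.get_apic_id			= x2apic_get_apic_id,
	.set_apic_id			= x2apic_set_apic_id,
	.phys_pkg_id			= x2apic_phys_pkg_id,
	.send_IPI_self			= x2apic_send_IPI_self,
	/* ... remaining IPI and probe callbacks omitted ... */
};
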
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 8508bfe52296..d240ea950519 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -447,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
447 return _hypercall2(unsigned long, hvm_op, op, arg); 447 return _hypercall2(unsigned long, hvm_op, op, arg);
448} 448}
449 449
450static inline int
451HYPERVISOR_tmem_op(
452 struct tmem_op *op)
453{
454 return _hypercall1(int, tmem_op, op);
455}
456
450static inline void 457static inline void
451MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 458MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
452{ 459{
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index c61934fbf22a..64a619d47d34 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -47,8 +47,9 @@ extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s, 47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e); 48 unsigned long pfn_e);
49 49
50extern int m2p_add_override(unsigned long mfn, struct page *page); 50extern int m2p_add_override(unsigned long mfn, struct page *page,
51extern int m2p_remove_override(struct page *page); 51 bool clear_pte);
52extern int m2p_remove_override(struct page *page, bool clear_pte);
52extern struct page *m2p_find_override(unsigned long mfn); 53extern struct page *m2p_find_override(unsigned long mfn);
53extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn); 54extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
54 55
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
index aa8620989162..4fbda9a3f339 100644
--- a/arch/x86/include/asm/xen/pci.h
+++ b/arch/x86/include/asm/xen/pci.h
@@ -15,10 +15,26 @@ static inline int pci_xen_hvm_init(void)
15#endif 15#endif
16#if defined(CONFIG_XEN_DOM0) 16#if defined(CONFIG_XEN_DOM0)
17void __init xen_setup_pirqs(void); 17void __init xen_setup_pirqs(void);
18int xen_find_device_domain_owner(struct pci_dev *dev);
19int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
20int xen_unregister_device_domain_owner(struct pci_dev *dev);
18#else 21#else
19static inline void __init xen_setup_pirqs(void) 22static inline void __init xen_setup_pirqs(void)
20{ 23{
21} 24}
25static inline int xen_find_device_domain_owner(struct pci_dev *dev)
26{
27 return -1;
28}
29static inline int xen_register_device_domain_owner(struct pci_dev *dev,
30 uint16_t domain)
31{
32 return -1;
33}
34static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
35{
36 return -1;
37}
22#endif 38#endif
23 39
24#if defined(CONFIG_PCI_MSI) 40#if defined(CONFIG_PCI_MSI)
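
A hedged usage sketch for the new device-ownership helpers above (the callers and the domain id are hypothetical; only the three declared functions are assumed):

static int example_hand_device_to_guest(struct pci_dev *dev, uint16_t domid)
{
	int err;

	err = xen_register_device_domain_owner(dev, domid);
	if (err)
		return err;

	/* ... set up passthrough for the guest domain here ... */
	return 0;
}

static void example_reclaim_device(struct pci_dev *dev)
{
	/* A negative value is assumed to mean "no owner recorded". */
	if (xen_find_device_domain_owner(dev) >= 0)
		xen_unregister_device_domain_owner(dev);
}
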
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 7338ef2218bc..90b06d4daee2 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,22 +24,26 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_tsc.o := $(nostackp) 27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 28CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 29GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 30GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 31GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
32GCOV_PROFILE_paravirt.o := n 33GCOV_PROFILE_paravirt.o := n
33 34
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 38obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 40obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o 41obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_IRQ_WORK) += irq_work.o 42obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 43obj-y += probe_roms.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
43obj-y += bootflag.o e820.o 47obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o topology.o kdebugfs.o 48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
@@ -117,7 +121,7 @@ obj-$(CONFIG_OF) += devicetree.o
117ifeq ($(CONFIG_X86_64),y) 121ifeq ($(CONFIG_X86_64),y)
118 obj-$(CONFIG_AUDIT) += audit_64.o 122 obj-$(CONFIG_AUDIT) += audit_64.o
119 123
120 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 124 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
121 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 125 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
122 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 126 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
123 127
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 9a966c579af5..4558f0d0822d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -970,7 +970,7 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
970 mp_irq.irqflag = (trigger << 2) | polarity; 970 mp_irq.irqflag = (trigger << 2) | polarity;
971 mp_irq.srcbus = MP_ISA_BUS; 971 mp_irq.srcbus = MP_ISA_BUS;
972 mp_irq.srcbusirq = bus_irq; /* IRQ */ 972 mp_irq.srcbusirq = bus_irq; /* IRQ */
973 mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */ 973 mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
974 mp_irq.dstirq = pin; /* INTIN# */ 974 mp_irq.dstirq = pin; /* INTIN# */
975 975
976 mp_save_irq(&mp_irq); 976 mp_save_irq(&mp_irq);
@@ -1021,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
1021 if (ioapic < 0) 1021 if (ioapic < 0)
1022 continue; 1022 continue;
1023 pin = mp_find_ioapic_pin(ioapic, gsi); 1023 pin = mp_find_ioapic_pin(ioapic, gsi);
1024 dstapic = mp_ioapics[ioapic].apicid; 1024 dstapic = mpc_ioapic_id(ioapic);
1025 1025
1026 for (idx = 0; idx < mp_irq_entries; idx++) { 1026 for (idx = 0; idx < mp_irq_entries; idx++) {
1027 struct mpc_intsrc *irq = mp_irqs + idx; 1027 struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1082,7 +1082,7 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
1082 mp_irq.srcbus = number; 1082 mp_irq.srcbus = number;
1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3); 1083 mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
1084 ioapic = mp_find_ioapic(gsi); 1084 ioapic = mp_find_ioapic(gsi);
1085 mp_irq.dstapic = mp_ioapics[ioapic].apicid; 1085 mp_irq.dstapic = mpc_ioapic_id(ioapic);
1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi); 1086 mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
1087 1087
1088 mp_save_irq(&mp_irq); 1088 mp_save_irq(&mp_irq);
@@ -1113,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
1113 1113
1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) { 1114 if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
1115 printk(KERN_ERR "Invalid reference to IOAPIC pin " 1115 printk(KERN_ERR "Invalid reference to IOAPIC pin "
1116 "%d-%d\n", mp_ioapics[ioapic].apicid, 1116 "%d-%d\n", mpc_ioapic_id(ioapic),
1117 ioapic_pin); 1117 ioapic_pin);
1118 return gsi; 1118 return gsi;
1119 } 1119 }
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ff93bc1b09c3..18a857ba7a25 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -112,11 +112,6 @@ static int __init acpi_sleep_setup(char *str)
112#ifdef CONFIG_HIBERNATION 112#ifdef CONFIG_HIBERNATION
113 if (strncmp(str, "s4_nohwsig", 10) == 0) 113 if (strncmp(str, "s4_nohwsig", 10) == 0)
114 acpi_no_s4_hw_signature(); 114 acpi_no_s4_hw_signature();
115 if (strncmp(str, "s4_nonvs", 8) == 0) {
116 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
117 "please use acpi_sleep=nonvs instead");
118 acpi_nvs_nosave();
119 }
120#endif 115#endif
121 if (strncmp(str, "nonvs", 5) == 0) 116 if (strncmp(str, "nonvs", 5) == 0)
122 acpi_nvs_nosave(); 117 acpi_nvs_nosave();
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 4a234677e213..a81f2d52f869 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
67#define DPRINTK(fmt, args...) if (debug_alternative) \ 67#define DPRINTK(fmt, args...) if (debug_alternative) \
68 printk(KERN_DEBUG fmt, args) 68 printk(KERN_DEBUG fmt, args)
69 69
70/*
71 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
72 * that correspond to that nop. Getting from one nop to the next, we
73 * add to the array the offset that is equal to the sum of all sizes of
74 * nops preceding the one we are after.
75 *
76 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77 * nice symmetry of sizes of the previous nops.
78 */
70#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) 79#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
71/* Use inline assembly to define this because the nops are defined 80static const unsigned char intelnops[] =
72 as inline assembly strings in the include files and we cannot 81{
73 get them easily into strings. */ 82 GENERIC_NOP1,
74asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " 83 GENERIC_NOP2,
75 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 84 GENERIC_NOP3,
76 GENERIC_NOP7 GENERIC_NOP8 85 GENERIC_NOP4,
77 "\t.previous"); 86 GENERIC_NOP5,
78extern const unsigned char intelnops[]; 87 GENERIC_NOP6,
79static const unsigned char *const __initconst_or_module 88 GENERIC_NOP7,
80intel_nops[ASM_NOP_MAX+1] = { 89 GENERIC_NOP8,
90 GENERIC_NOP5_ATOMIC
91};
92static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
93{
81 NULL, 94 NULL,
82 intelnops, 95 intelnops,
83 intelnops + 1, 96 intelnops + 1,
@@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = {
87 intelnops + 1 + 2 + 3 + 4 + 5, 100 intelnops + 1 + 2 + 3 + 4 + 5,
88 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 101 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
89 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
103 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
90}; 104};
91#endif 105#endif
92 106
93#ifdef K8_NOP1 107#ifdef K8_NOP1
94asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " 108static const unsigned char k8nops[] =
95 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 109{
96 K8_NOP7 K8_NOP8 110 K8_NOP1,
97 "\t.previous"); 111 K8_NOP2,
98extern const unsigned char k8nops[]; 112 K8_NOP3,
99static const unsigned char *const __initconst_or_module 113 K8_NOP4,
100k8_nops[ASM_NOP_MAX+1] = { 114 K8_NOP5,
115 K8_NOP6,
116 K8_NOP7,
117 K8_NOP8,
118 K8_NOP5_ATOMIC
119};
120static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
121{
101 NULL, 122 NULL,
102 k8nops, 123 k8nops,
103 k8nops + 1, 124 k8nops + 1,
@@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = {
107 k8nops + 1 + 2 + 3 + 4 + 5, 128 k8nops + 1 + 2 + 3 + 4 + 5,
108 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 129 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
109 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
131 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
110}; 132};
111#endif 133#endif
112 134
113#if defined(K7_NOP1) && !defined(CONFIG_X86_64) 135#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
114asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " 136static const unsigned char k7nops[] =
115 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 137{
116 K7_NOP7 K7_NOP8 138 K7_NOP1,
117 "\t.previous"); 139 K7_NOP2,
118extern const unsigned char k7nops[]; 140 K7_NOP3,
119static const unsigned char *const __initconst_or_module 141 K7_NOP4,
120k7_nops[ASM_NOP_MAX+1] = { 142 K7_NOP5,
143 K7_NOP6,
144 K7_NOP7,
145 K7_NOP8,
146 K7_NOP5_ATOMIC
147};
148static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
149{
121 NULL, 150 NULL,
122 k7nops, 151 k7nops,
123 k7nops + 1, 152 k7nops + 1,
@@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = {
127 k7nops + 1 + 2 + 3 + 4 + 5, 156 k7nops + 1 + 2 + 3 + 4 + 5,
128 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 157 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
159 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
130}; 160};
131#endif 161#endif
132 162
133#ifdef P6_NOP1 163#ifdef P6_NOP1
134asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " 164static const unsigned char __initconst_or_module p6nops[] =
135 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 165{
136 P6_NOP7 P6_NOP8 166 P6_NOP1,
137 "\t.previous"); 167 P6_NOP2,
138extern const unsigned char p6nops[]; 168 P6_NOP3,
139static const unsigned char *const __initconst_or_module 169 P6_NOP4,
140p6_nops[ASM_NOP_MAX+1] = { 170 P6_NOP5,
171 P6_NOP6,
172 P6_NOP7,
173 P6_NOP8,
174 P6_NOP5_ATOMIC
175};
176static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
177{
141 NULL, 178 NULL,
142 p6nops, 179 p6nops,
143 p6nops + 1, 180 p6nops + 1,
@@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = {
147 p6nops + 1 + 2 + 3 + 4 + 5, 184 p6nops + 1 + 2 + 3 + 4 + 5,
148 p6nops + 1 + 2 + 3 + 4 + 5 + 6, 185 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
149 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
187 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
150}; 188};
151#endif 189#endif
152 190
191/* Initialize these to a safe default */
153#ifdef CONFIG_X86_64 192#ifdef CONFIG_X86_64
193const unsigned char * const *ideal_nops = p6_nops;
194#else
195const unsigned char * const *ideal_nops = intel_nops;
196#endif
154 197
155extern char __vsyscall_0; 198void __init arch_init_ideal_nops(void)
156static const unsigned char *const *__init_or_module find_nop_table(void)
157{ 199{
158 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 200 switch (boot_cpu_data.x86_vendor) {
159 boot_cpu_has(X86_FEATURE_NOPL)) 201 case X86_VENDOR_INTEL:
160 return p6_nops; 202 /*
161 else 203 * Due to a decoder implementation quirk, some
162 return k8_nops; 204 * specific Intel CPUs actually perform better with
163} 205 * the "k8_nops" than with the SDM-recommended NOPs.
164 206 */
165#else /* CONFIG_X86_64 */ 207 if (boot_cpu_data.x86 == 6 &&
208 boot_cpu_data.x86_model >= 0x0f &&
209 boot_cpu_data.x86_model != 0x1c &&
210 boot_cpu_data.x86_model != 0x26 &&
211 boot_cpu_data.x86_model != 0x27 &&
212 boot_cpu_data.x86_model < 0x30) {
213 ideal_nops = k8_nops;
214 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 ideal_nops = p6_nops;
216 } else {
217#ifdef CONFIG_X86_64
218 ideal_nops = k8_nops;
219#else
220 ideal_nops = intel_nops;
221#endif
222 }
166 223
167static const unsigned char *const *__init_or_module find_nop_table(void) 224 default:
168{ 225#ifdef CONFIG_X86_64
169 if (boot_cpu_has(X86_FEATURE_K8)) 226 ideal_nops = k8_nops;
170 return k8_nops; 227#else
171 else if (boot_cpu_has(X86_FEATURE_K7)) 228 if (boot_cpu_has(X86_FEATURE_K8))
172 return k7_nops; 229 ideal_nops = k8_nops;
173 else if (boot_cpu_has(X86_FEATURE_NOPL)) 230 else if (boot_cpu_has(X86_FEATURE_K7))
174 return p6_nops; 231 ideal_nops = k7_nops;
175 else 232 else
176 return intel_nops; 233 ideal_nops = intel_nops;
234#endif
235 }
177} 236}
178 237
179#endif /* CONFIG_X86_64 */
180
181/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 238/* Use this to add nops to a buffer, then text_poke the whole buffer. */
182static void __init_or_module add_nops(void *insns, unsigned int len) 239static void __init_or_module add_nops(void *insns, unsigned int len)
183{ 240{
184 const unsigned char *const *noptable = find_nop_table();
185
186 while (len > 0) { 241 while (len > 0) {
187 unsigned int noplen = len; 242 unsigned int noplen = len;
188 if (noplen > ASM_NOP_MAX) 243 if (noplen > ASM_NOP_MAX)
189 noplen = ASM_NOP_MAX; 244 noplen = ASM_NOP_MAX;
190 memcpy(insns, noptable[noplen], noplen); 245 memcpy(insns, ideal_nops[noplen], noplen);
191 insns += noplen; 246 insns += noplen;
192 len -= noplen; 247 len -= noplen;
193 } 248 }
@@ -195,6 +250,7 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 250
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 251extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 252extern s32 __smp_locks[], __smp_locks_end[];
253extern char __vsyscall_0;
198void *text_poke_early(void *addr, const void *opcode, size_t len); 254void *text_poke_early(void *addr, const void *opcode, size_t len);
199 255
200/* Replace instructions with better alternatives for this CPU type. 256/* Replace instructions with better alternatives for this CPU type.
@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
210 u8 insnbuf[MAX_PATCH_LEN]; 266 u8 insnbuf[MAX_PATCH_LEN];
211 267
212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
269 /*
270 * The scan order should be from start to end. An alternative scanned
271 * later can overwrite code patched by an alternative scanned earlier.
272 * Some kernel functions (e.g. memcpy, memset, etc.) rely on this order
273 * when patching code.
274 *
275 * So be careful if you want to change the scan order to any other
276 * order.
277 */
213 for (a = start; a < end; a++) { 278 for (a = start; a < end; a++) {
214 u8 *instr = a->instr; 279 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
@@ -678,29 +743,3 @@ void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
678 wrote_text = 0; 743 wrote_text = 0;
679 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 744 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
680} 745}
681
682#if defined(CONFIG_DYNAMIC_FTRACE) || defined(HAVE_JUMP_LABEL)
683
684#ifdef CONFIG_X86_64
685unsigned char ideal_nop5[5] = { 0x66, 0x66, 0x66, 0x66, 0x90 };
686#else
687unsigned char ideal_nop5[5] = { 0x3e, 0x8d, 0x74, 0x26, 0x00 };
688#endif
689
690void __init arch_init_ideal_nop5(void)
691{
692 /*
693 * There is no good nop for all x86 archs. This selection
694 * algorithm should be unified with the one in find_nop_table(),
695 * but this should be good enough for now.
696 *
697 * For cases other than the ones below, use the safe (as in
698 * always functional) defaults above.
699 */
700#ifdef CONFIG_X86_64
701 /* Don't use these on 32 bits due to broken virtualizers */
702 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
703 memcpy(ideal_nop5, p6_nops[5], 5);
704#endif
705}
706#endif
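
The nop tables above are indexed by length: ideal_nops[n] points past the preceding 1..(n-1)-byte nops in the flat byte array, so the next n bytes are exactly the n-byte nop chosen by arch_init_ideal_nops(). A minimal sketch of a patch-site consumer (the function name is hypothetical):

static void example_nop_out_5_bytes(u8 *site)
{
	/*
	 * Copy the regular 5-byte nop from the currently selected table.
	 * The 5-byte atomic variant sits one slot past ASM_NOP_MAX in
	 * the same tables (see the table layout above).
	 */
	memcpy(site, ideal_nops[5], 5);
}
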
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index b117efd24f71..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 57ca77787220..cd8cbeb5fa34 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/pci-ats.h>
21#include <linux/bitmap.h> 22#include <linux/bitmap.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -25,6 +26,7 @@
25#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
26#include <linux/iommu-helper.h> 27#include <linux/iommu-helper.h>
27#include <linux/iommu.h> 28#include <linux/iommu.h>
29#include <linux/delay.h>
28#include <asm/proto.h> 30#include <asm/proto.h>
29#include <asm/iommu.h> 31#include <asm/iommu.h>
30#include <asm/gart.h> 32#include <asm/gart.h>
@@ -34,7 +36,7 @@
34 36
35#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 37#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
36 38
37#define EXIT_LOOP_COUNT 10000000 39#define LOOP_TIMEOUT 100000
38 40
39static DEFINE_RWLOCK(amd_iommu_devtable_lock); 41static DEFINE_RWLOCK(amd_iommu_devtable_lock);
40 42
@@ -57,7 +59,6 @@ struct iommu_cmd {
57 u32 data[4]; 59 u32 data[4];
58}; 60};
59 61
60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
61static void update_domain(struct protection_domain *domain); 62static void update_domain(struct protection_domain *domain);
62 63
63/**************************************************************************** 64/****************************************************************************
@@ -322,8 +323,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
322 break; 323 break;
323 case EVENT_TYPE_ILL_CMD: 324 case EVENT_TYPE_ILL_CMD:
324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 325 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 dump_command(address); 326 dump_command(address);
328 break; 327 break;
329 case EVENT_TYPE_CMD_HARD_ERR: 328 case EVENT_TYPE_CMD_HARD_ERR:
@@ -367,7 +366,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
367 spin_unlock_irqrestore(&iommu->lock, flags); 366 spin_unlock_irqrestore(&iommu->lock, flags);
368} 367}
369 368
370irqreturn_t amd_iommu_int_handler(int irq, void *data) 369irqreturn_t amd_iommu_int_thread(int irq, void *data)
371{ 370{
372 struct amd_iommu *iommu; 371 struct amd_iommu *iommu;
373 372
@@ -377,192 +376,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
377 return IRQ_HANDLED; 376 return IRQ_HANDLED;
378} 377}
379 378
379irqreturn_t amd_iommu_int_handler(int irq, void *data)
380{
381 return IRQ_WAKE_THREAD;
382}
383
380/**************************************************************************** 384/****************************************************************************
381 * 385 *
382 * IOMMU command queuing functions 386 * IOMMU command queuing functions
383 * 387 *
384 ****************************************************************************/ 388 ****************************************************************************/
385 389
386/* 390static int wait_on_sem(volatile u64 *sem)
387 * Writes the command to the IOMMUs command buffer and informs the 391{
388 * hardware about the new command. Must be called with iommu->lock held. 392 int i = 0;
389 */ 393
390static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 394 while (*sem == 0 && i < LOOP_TIMEOUT) {
395 udelay(1);
396 i += 1;
397 }
398
399 if (i == LOOP_TIMEOUT) {
400 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
401 return -EIO;
402 }
403
404 return 0;
405}
406
407static void copy_cmd_to_buffer(struct amd_iommu *iommu,
408 struct iommu_cmd *cmd,
409 u32 tail)
391{ 410{
392 u32 tail, head;
393 u8 *target; 411 u8 *target;
394 412
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
397 target = iommu->cmd_buf + tail; 413 target = iommu->cmd_buf + tail;
398 memcpy_toio(target, cmd, sizeof(*cmd)); 414 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
399 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 415
400 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 416 /* Copy command to buffer */
401 if (tail == head) 417 memcpy(target, cmd, sizeof(*cmd));
402 return -ENOMEM; 418
419 /* Tell the IOMMU about it */
403 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 420 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
421}
404 422
405 return 0; 423static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
424{
425 WARN_ON(address & 0x7ULL);
426
427 memset(cmd, 0, sizeof(*cmd));
428 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
429 cmd->data[1] = upper_32_bits(__pa(address));
430 cmd->data[2] = 1;
431 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
432}
433
434static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
435{
436 memset(cmd, 0, sizeof(*cmd));
437 cmd->data[0] = devid;
438 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
439}
440
441static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
442 size_t size, u16 domid, int pde)
443{
444 u64 pages;
445 int s;
446
447 pages = iommu_num_pages(address, size, PAGE_SIZE);
448 s = 0;
449
450 if (pages > 1) {
451 /*
452 * If we have to flush more than one page, flush all
453 * TLB entries for this domain
454 */
455 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
456 s = 1;
457 }
458
459 address &= PAGE_MASK;
460
461 memset(cmd, 0, sizeof(*cmd));
462 cmd->data[1] |= domid;
463 cmd->data[2] = lower_32_bits(address);
464 cmd->data[3] = upper_32_bits(address);
465 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
466 if (s) /* size bit - we flush more than one 4kb page */
467 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
468 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
469 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
470}
471
472static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
473 u64 address, size_t size)
474{
475 u64 pages;
476 int s;
477
478 pages = iommu_num_pages(address, size, PAGE_SIZE);
479 s = 0;
480
481 if (pages > 1) {
482 /*
483 * If we have to flush more than one page, flush all
484 * TLB entries for this domain
485 */
486 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
487 s = 1;
488 }
489
490 address &= PAGE_MASK;
491
492 memset(cmd, 0, sizeof(*cmd));
493 cmd->data[0] = devid;
494 cmd->data[0] |= (qdep & 0xff) << 24;
495 cmd->data[1] = devid;
496 cmd->data[2] = lower_32_bits(address);
497 cmd->data[3] = upper_32_bits(address);
498 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
499 if (s)
500 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
501}
502
503static void build_inv_all(struct iommu_cmd *cmd)
504{
505 memset(cmd, 0, sizeof(*cmd));
506 CMD_SET_TYPE(cmd, CMD_INV_ALL);
406} 507}
407 508
408/* 509/*
409 * General queuing function for commands. Takes iommu->lock and calls 510 * Writes the command to the IOMMUs command buffer and informs the
410 * __iommu_queue_command(). 511 * hardware about the new command.
411 */ 512 */
412static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 513static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
413{ 514{
515 u32 left, tail, head, next_tail;
414 unsigned long flags; 516 unsigned long flags;
415 int ret;
416 517
518 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
519
520again:
417 spin_lock_irqsave(&iommu->lock, flags); 521 spin_lock_irqsave(&iommu->lock, flags);
418 ret = __iommu_queue_command(iommu, cmd);
419 if (!ret)
420 iommu->need_sync = true;
421 spin_unlock_irqrestore(&iommu->lock, flags);
422 522
423 return ret; 523 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
424} 524 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
525 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
526 left = (head - next_tail) % iommu->cmd_buf_size;
425 527
426/* 528 if (left <= 2) {
427 * This function waits until an IOMMU has completed a completion 529 struct iommu_cmd sync_cmd;
428 * wait command 530 volatile u64 sem = 0;
429 */ 531 int ret;
430static void __iommu_wait_for_completion(struct amd_iommu *iommu) 532
431{ 533 build_completion_wait(&sync_cmd, (u64)&sem);
432 int ready = 0; 534 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
433 unsigned status = 0;
434 unsigned long i = 0;
435 535
436 INC_STATS_COUNTER(compl_wait); 536 spin_unlock_irqrestore(&iommu->lock, flags);
537
538 if ((ret = wait_on_sem(&sem)) != 0)
539 return ret;
437 540
438 while (!ready && (i < EXIT_LOOP_COUNT)) { 541 goto again;
439 ++i;
440 /* wait for the bit to become one */
441 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET);
442 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK;
443 } 542 }
444 543
445 /* set bit back to zero */ 544 copy_cmd_to_buffer(iommu, cmd, tail);
446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 545
447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 546 /* We need to sync now to make sure all commands are processed */
547 iommu->need_sync = true;
548
549 spin_unlock_irqrestore(&iommu->lock, flags);
448 550
449 if (unlikely(i == EXIT_LOOP_COUNT)) 551 return 0;
450 iommu->reset_in_progress = true;
451} 552}
452 553
453/* 554/*
454 * This function queues a completion wait command into the command 555 * This function queues a completion wait command into the command
455 * buffer of an IOMMU 556 * buffer of an IOMMU
456 */ 557 */
457static int __iommu_completion_wait(struct amd_iommu *iommu) 558static int iommu_completion_wait(struct amd_iommu *iommu)
458{ 559{
459 struct iommu_cmd cmd; 560 struct iommu_cmd cmd;
561 volatile u64 sem = 0;
562 int ret;
563
564 if (!iommu->need_sync)
565 return 0;
460 566
461 memset(&cmd, 0, sizeof(cmd)); 567 build_completion_wait(&cmd, (u64)&sem);
462 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK;
463 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
464 568
465 return __iommu_queue_command(iommu, &cmd); 569 ret = iommu_queue_command(iommu, &cmd);
570 if (ret)
571 return ret;
572
573 return wait_on_sem(&sem);
466} 574}
467 575
468/* 576static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
469 * This function is called whenever we need to ensure that the IOMMU has
470 * completed execution of all commands we sent. It sends a
471 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
472 * us about that by writing a value to a physical address we pass with
473 * the command.
474 */
475static int iommu_completion_wait(struct amd_iommu *iommu)
476{ 577{
477 int ret = 0; 578 struct iommu_cmd cmd;
478 unsigned long flags;
479
480 spin_lock_irqsave(&iommu->lock, flags);
481 579
482 if (!iommu->need_sync) 580 build_inv_dte(&cmd, devid);
483 goto out;
484 581
485 ret = __iommu_completion_wait(iommu); 582 return iommu_queue_command(iommu, &cmd);
583}
486 584
487 iommu->need_sync = false; 585static void iommu_flush_dte_all(struct amd_iommu *iommu)
586{
587 u32 devid;
488 588
489 if (ret) 589 for (devid = 0; devid <= 0xffff; ++devid)
490 goto out; 590 iommu_flush_dte(iommu, devid);
491 591
492 __iommu_wait_for_completion(iommu); 592 iommu_completion_wait(iommu);
593}
493 594
494out: 595/*
495 spin_unlock_irqrestore(&iommu->lock, flags); 596 * This function uses heavy locking and may disable irqs for some time. But
597 * this is no issue because it is only called during resume.
598 */
599static void iommu_flush_tlb_all(struct amd_iommu *iommu)
600{
601 u32 dom_id;
496 602
497 if (iommu->reset_in_progress) 603 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
498 reset_iommu_command_buffer(iommu); 604 struct iommu_cmd cmd;
605 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
606 dom_id, 1);
607 iommu_queue_command(iommu, &cmd);
608 }
499 609
500 return 0; 610 iommu_completion_wait(iommu);
501} 611}
502 612
503static void iommu_flush_complete(struct protection_domain *domain) 613static void iommu_flush_all(struct amd_iommu *iommu)
504{ 614{
505 int i; 615 struct iommu_cmd cmd;
506 616
507 for (i = 0; i < amd_iommus_present; ++i) { 617 build_inv_all(&cmd);
508 if (!domain->dev_iommu[i])
509 continue;
510 618
511 /* 619 iommu_queue_command(iommu, &cmd);
512 * Devices of this domain are behind this IOMMU 620 iommu_completion_wait(iommu);
513 * We need to wait for completion of all commands. 621}
514 */ 622
515 iommu_completion_wait(amd_iommus[i]); 623void iommu_flush_all_caches(struct amd_iommu *iommu)
624{
625 if (iommu_feature(iommu, FEATURE_IA)) {
626 iommu_flush_all(iommu);
627 } else {
628 iommu_flush_dte_all(iommu);
629 iommu_flush_tlb_all(iommu);
516 } 630 }
517} 631}
518 632
519/* 633/*
520 * Command send function for invalidating a device table entry 634 * Command send function for flushing on-device TLB
521 */ 635 */
522static int iommu_flush_device(struct device *dev) 636static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
523{ 637{
638 struct pci_dev *pdev = to_pci_dev(dev);
524 struct amd_iommu *iommu; 639 struct amd_iommu *iommu;
525 struct iommu_cmd cmd; 640 struct iommu_cmd cmd;
526 u16 devid; 641 u16 devid;
642 int qdep;
527 643
644 qdep = pci_ats_queue_depth(pdev);
528 devid = get_device_id(dev); 645 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid]; 646 iommu = amd_iommu_rlookup_table[devid];
530 647
531 /* Build command */ 648 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
532 memset(&cmd, 0, sizeof(cmd));
533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
534 cmd.data[0] = devid;
535 649
536 return iommu_queue_command(iommu, &cmd); 650 return iommu_queue_command(iommu, &cmd);
537} 651}
538 652
539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
540 u16 domid, int pde, int s)
541{
542 memset(cmd, 0, sizeof(*cmd));
543 address &= PAGE_MASK;
544 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
545 cmd->data[1] |= domid;
546 cmd->data[2] = lower_32_bits(address);
547 cmd->data[3] = upper_32_bits(address);
548 if (s) /* size bit - we flush more than one 4kb page */
549 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 550 if (pde) /* PDE bit - we want to flush everything, not only the PTEs */
551 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
552}
553
554/* 653/*
 555 * Generic command send function for invalidating TLB entries 654 * Command send function for invalidating a device table entry
556 */ 655 */
557static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 656static int device_flush_dte(struct device *dev)
558 u64 address, u16 domid, int pde, int s)
559{ 657{
560 struct iommu_cmd cmd; 658 struct amd_iommu *iommu;
659 struct pci_dev *pdev;
660 u16 devid;
561 int ret; 661 int ret;
562 662
563 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); 663 pdev = to_pci_dev(dev);
664 devid = get_device_id(dev);
665 iommu = amd_iommu_rlookup_table[devid];
564 666
565 ret = iommu_queue_command(iommu, &cmd); 667 ret = iommu_flush_dte(iommu, devid);
668 if (ret)
669 return ret;
670
671 if (pci_ats_enabled(pdev))
672 ret = device_flush_iotlb(dev, 0, ~0UL);
566 673
567 return ret; 674 return ret;
568} 675}
@@ -572,23 +679,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
572 * It invalidates a single PTE if the range to flush is within a single 679 * It invalidates a single PTE if the range to flush is within a single
573 * page. Otherwise it flushes the whole TLB of the IOMMU. 680 * page. Otherwise it flushes the whole TLB of the IOMMU.
574 */ 681 */
575static void __iommu_flush_pages(struct protection_domain *domain, 682static void __domain_flush_pages(struct protection_domain *domain,
576 u64 address, size_t size, int pde) 683 u64 address, size_t size, int pde)
577{ 684{
578 int s = 0, i; 685 struct iommu_dev_data *dev_data;
579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); 686 struct iommu_cmd cmd;
580 687 int ret = 0, i;
581 address &= PAGE_MASK;
582
583 if (pages > 1) {
584 /*
585 * If we have to flush more than one page, flush all
586 * TLB entries for this domain
587 */
588 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
589 s = 1;
590 }
591 688
689 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
592 690
593 for (i = 0; i < amd_iommus_present; ++i) { 691 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i]) 692 if (!domain->dev_iommu[i])
@@ -598,101 +696,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
598 * Devices of this domain are behind this IOMMU 696 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush 697 * We need a TLB flush
600 */ 698 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address, 699 ret |= iommu_queue_command(amd_iommus[i], &cmd);
602 domain->id, pde, s); 700 }
701
702 list_for_each_entry(dev_data, &domain->dev_list, list) {
703 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
704
705 if (!pci_ats_enabled(pdev))
706 continue;
707
708 ret |= device_flush_iotlb(dev_data->dev, address, size);
603 } 709 }
604 710
605 return; 711 WARN_ON(ret);
606} 712}
607 713
608static void iommu_flush_pages(struct protection_domain *domain, 714static void domain_flush_pages(struct protection_domain *domain,
609 u64 address, size_t size) 715 u64 address, size_t size)
610{ 716{
611 __iommu_flush_pages(domain, address, size, 0); 717 __domain_flush_pages(domain, address, size, 0);
612} 718}
613 719
614/* Flush the whole IO/TLB for a given protection domain */ 720/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain) 721static void domain_flush_tlb(struct protection_domain *domain)
616{ 722{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); 723 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
618} 724}
619 725
620/* Flush the whole IO/TLB for a given protection domain - including PDE */ 726/* Flush the whole IO/TLB for a given protection domain - including PDE */
621static void iommu_flush_tlb_pde(struct protection_domain *domain) 727static void domain_flush_tlb_pde(struct protection_domain *domain)
622{ 728{
623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1); 729 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
624}
625
626
627/*
628 * This function flushes the DTEs for all devices in domain
629 */
630static void iommu_flush_domain_devices(struct protection_domain *domain)
631{
632 struct iommu_dev_data *dev_data;
633 unsigned long flags;
634
635 spin_lock_irqsave(&domain->lock, flags);
636
637 list_for_each_entry(dev_data, &domain->dev_list, list)
638 iommu_flush_device(dev_data->dev);
639
640 spin_unlock_irqrestore(&domain->lock, flags);
641} 730}
642 731
643static void iommu_flush_all_domain_devices(void) 732static void domain_flush_complete(struct protection_domain *domain)
644{ 733{
645 struct protection_domain *domain; 734 int i;
646 unsigned long flags;
647 735
648 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 736 for (i = 0; i < amd_iommus_present; ++i) {
737 if (!domain->dev_iommu[i])
738 continue;
649 739
650 list_for_each_entry(domain, &amd_iommu_pd_list, list) { 740 /*
651 iommu_flush_domain_devices(domain); 741 * Devices of this domain are behind this IOMMU
652 iommu_flush_complete(domain); 742 * We need to wait for completion of all commands.
743 */
744 iommu_completion_wait(amd_iommus[i]);
653 } 745 }
654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656} 746}
657 747
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
661}
662 748
663/* 749/*
664 * This function uses heavy locking and may disable irqs for some time. But 750 * This function flushes the DTEs for all devices in domain
665 * this is no issue because it is only called during resume.
666 */ 751 */
667void amd_iommu_flush_all_domains(void) 752static void domain_flush_devices(struct protection_domain *domain)
668{ 753{
669 struct protection_domain *domain; 754 struct iommu_dev_data *dev_data;
670 unsigned long flags; 755 unsigned long flags;
671 756
672 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 757 spin_lock_irqsave(&domain->lock, flags);
673
674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
682}
683
684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
685{
686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
687
688 if (iommu->reset_in_progress)
689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690 758
691 amd_iommu_reset_cmd_buffer(iommu); 759 list_for_each_entry(dev_data, &domain->dev_list, list)
692 amd_iommu_flush_all_devices(); 760 device_flush_dte(dev_data->dev);
693 amd_iommu_flush_all_domains();
694 761
695 iommu->reset_in_progress = false; 762 spin_unlock_irqrestore(&domain->lock, flags);
696} 763}
697 764
698/**************************************************************************** 765/****************************************************************************
@@ -1410,17 +1477,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
1410 return domain->flags & PD_DMA_OPS_MASK; 1477 return domain->flags & PD_DMA_OPS_MASK;
1411} 1478}
1412 1479
1413static void set_dte_entry(u16 devid, struct protection_domain *domain) 1480static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1414{ 1481{
1415 u64 pte_root = virt_to_phys(domain->pt_root); 1482 u64 pte_root = virt_to_phys(domain->pt_root);
1483 u32 flags = 0;
1416 1484
1417 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1485 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1418 << DEV_ENTRY_MODE_SHIFT; 1486 << DEV_ENTRY_MODE_SHIFT;
1419 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 1487 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1420 1488
1421 amd_iommu_dev_table[devid].data[2] = domain->id; 1489 if (ats)
1422 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1490 flags |= DTE_FLAG_IOTLB;
1423 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1491
1492 amd_iommu_dev_table[devid].data[3] |= flags;
1493 amd_iommu_dev_table[devid].data[2] = domain->id;
1494 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1495 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1424} 1496}
1425 1497
1426static void clear_dte_entry(u16 devid) 1498static void clear_dte_entry(u16 devid)
@@ -1437,23 +1509,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
1437{ 1509{
1438 struct iommu_dev_data *dev_data; 1510 struct iommu_dev_data *dev_data;
1439 struct amd_iommu *iommu; 1511 struct amd_iommu *iommu;
1512 struct pci_dev *pdev;
1513 bool ats = false;
1440 u16 devid; 1514 u16 devid;
1441 1515
1442 devid = get_device_id(dev); 1516 devid = get_device_id(dev);
1443 iommu = amd_iommu_rlookup_table[devid]; 1517 iommu = amd_iommu_rlookup_table[devid];
1444 dev_data = get_dev_data(dev); 1518 dev_data = get_dev_data(dev);
1519 pdev = to_pci_dev(dev);
1520
1521 if (amd_iommu_iotlb_sup)
1522 ats = pci_ats_enabled(pdev);
1445 1523
1446 /* Update data structures */ 1524 /* Update data structures */
1447 dev_data->domain = domain; 1525 dev_data->domain = domain;
1448 list_add(&dev_data->list, &domain->dev_list); 1526 list_add(&dev_data->list, &domain->dev_list);
1449 set_dte_entry(devid, domain); 1527 set_dte_entry(devid, domain, ats);
1450 1528
1451 /* Do reference counting */ 1529 /* Do reference counting */
1452 domain->dev_iommu[iommu->index] += 1; 1530 domain->dev_iommu[iommu->index] += 1;
1453 domain->dev_cnt += 1; 1531 domain->dev_cnt += 1;
1454 1532
1455 /* Flush the DTE entry */ 1533 /* Flush the DTE entry */
1456 iommu_flush_device(dev); 1534 device_flush_dte(dev);
1457} 1535}
1458 1536
1459static void do_detach(struct device *dev) 1537static void do_detach(struct device *dev)
@@ -1476,7 +1554,7 @@ static void do_detach(struct device *dev)
1476 clear_dte_entry(devid); 1554 clear_dte_entry(devid);
1477 1555
1478 /* Flush the DTE entry */ 1556 /* Flush the DTE entry */
1479 iommu_flush_device(dev); 1557 device_flush_dte(dev);
1480} 1558}
1481 1559
1482/* 1560/*
@@ -1539,9 +1617,13 @@ out_unlock:
1539static int attach_device(struct device *dev, 1617static int attach_device(struct device *dev,
1540 struct protection_domain *domain) 1618 struct protection_domain *domain)
1541{ 1619{
1620 struct pci_dev *pdev = to_pci_dev(dev);
1542 unsigned long flags; 1621 unsigned long flags;
1543 int ret; 1622 int ret;
1544 1623
1624 if (amd_iommu_iotlb_sup)
1625 pci_enable_ats(pdev, PAGE_SHIFT);
1626
1545 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1627 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1546 ret = __attach_device(dev, domain); 1628 ret = __attach_device(dev, domain);
1547 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1629 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1633,7 @@ static int attach_device(struct device *dev,
1551 * left the caches in the IOMMU dirty. So we have to flush 1633 * left the caches in the IOMMU dirty. So we have to flush
1552 * here to evict all dirty stuff. 1634 * here to evict all dirty stuff.
1553 */ 1635 */
1554 iommu_flush_tlb_pde(domain); 1636 domain_flush_tlb_pde(domain);
1555 1637
1556 return ret; 1638 return ret;
1557} 1639}
@@ -1598,12 +1680,16 @@ static void __detach_device(struct device *dev)
1598 */ 1680 */
1599static void detach_device(struct device *dev) 1681static void detach_device(struct device *dev)
1600{ 1682{
1683 struct pci_dev *pdev = to_pci_dev(dev);
1601 unsigned long flags; 1684 unsigned long flags;
1602 1685
1603 /* lock device table */ 1686 /* lock device table */
1604 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1687 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1605 __detach_device(dev); 1688 __detach_device(dev);
1606 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1689 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1690
1691 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1692 pci_disable_ats(pdev);
1607} 1693}
1608 1694
1609/* 1695/*
@@ -1615,10 +1701,9 @@ static struct protection_domain *domain_for_device(struct device *dev)
1615 struct protection_domain *dom; 1701 struct protection_domain *dom;
1616 struct iommu_dev_data *dev_data, *alias_data; 1702 struct iommu_dev_data *dev_data, *alias_data;
1617 unsigned long flags; 1703 unsigned long flags;
1618 u16 devid, alias; 1704 u16 devid;
1619 1705
1620 devid = get_device_id(dev); 1706 devid = get_device_id(dev);
1621 alias = amd_iommu_alias_table[devid];
1622 dev_data = get_dev_data(dev); 1707 dev_data = get_dev_data(dev);
1623 alias_data = get_dev_data(dev_data->alias); 1708 alias_data = get_dev_data(dev_data->alias);
1624 if (!alias_data) 1709 if (!alias_data)
@@ -1692,7 +1777,7 @@ static int device_change_notifier(struct notifier_block *nb,
1692 goto out; 1777 goto out;
1693 } 1778 }
1694 1779
1695 iommu_flush_device(dev); 1780 device_flush_dte(dev);
1696 iommu_completion_wait(iommu); 1781 iommu_completion_wait(iommu);
1697 1782
1698out: 1783out:
@@ -1753,8 +1838,9 @@ static void update_device_table(struct protection_domain *domain)
1753 struct iommu_dev_data *dev_data; 1838 struct iommu_dev_data *dev_data;
1754 1839
1755 list_for_each_entry(dev_data, &domain->dev_list, list) { 1840 list_for_each_entry(dev_data, &domain->dev_list, list) {
1841 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1756 u16 devid = get_device_id(dev_data->dev); 1842 u16 devid = get_device_id(dev_data->dev);
1757 set_dte_entry(devid, domain); 1843 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1758 } 1844 }
1759} 1845}
1760 1846
@@ -1764,8 +1850,9 @@ static void update_domain(struct protection_domain *domain)
1764 return; 1850 return;
1765 1851
1766 update_device_table(domain); 1852 update_device_table(domain);
1767 iommu_flush_domain_devices(domain); 1853
1768 iommu_flush_tlb_pde(domain); 1854 domain_flush_devices(domain);
1855 domain_flush_tlb_pde(domain);
1769 1856
1770 domain->updated = false; 1857 domain->updated = false;
1771} 1858}
@@ -1924,10 +2011,10 @@ retry:
1924 ADD_STATS_COUNTER(alloced_io_mem, size); 2011 ADD_STATS_COUNTER(alloced_io_mem, size);
1925 2012
1926 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2013 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1927 iommu_flush_tlb(&dma_dom->domain); 2014 domain_flush_tlb(&dma_dom->domain);
1928 dma_dom->need_flush = false; 2015 dma_dom->need_flush = false;
1929 } else if (unlikely(amd_iommu_np_cache)) 2016 } else if (unlikely(amd_iommu_np_cache))
1930 iommu_flush_pages(&dma_dom->domain, address, size); 2017 domain_flush_pages(&dma_dom->domain, address, size);
1931 2018
1932out: 2019out:
1933 return address; 2020 return address;
@@ -1976,7 +2063,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
1976 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2063 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1977 2064
1978 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2065 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1979 iommu_flush_pages(&dma_dom->domain, flush_addr, size); 2066 domain_flush_pages(&dma_dom->domain, flush_addr, size);
1980 dma_dom->need_flush = false; 2067 dma_dom->need_flush = false;
1981 } 2068 }
1982} 2069}
@@ -2012,7 +2099,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
2012 if (addr == DMA_ERROR_CODE) 2099 if (addr == DMA_ERROR_CODE)
2013 goto out; 2100 goto out;
2014 2101
2015 iommu_flush_complete(domain); 2102 domain_flush_complete(domain);
2016 2103
2017out: 2104out:
2018 spin_unlock_irqrestore(&domain->lock, flags); 2105 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2126,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2039 2126
2040 __unmap_single(domain->priv, dma_addr, size, dir); 2127 __unmap_single(domain->priv, dma_addr, size, dir);
2041 2128
2042 iommu_flush_complete(domain); 2129 domain_flush_complete(domain);
2043 2130
2044 spin_unlock_irqrestore(&domain->lock, flags); 2131 spin_unlock_irqrestore(&domain->lock, flags);
2045} 2132}
@@ -2104,7 +2191,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
2104 goto unmap; 2191 goto unmap;
2105 } 2192 }
2106 2193
2107 iommu_flush_complete(domain); 2194 domain_flush_complete(domain);
2108 2195
2109out: 2196out:
2110 spin_unlock_irqrestore(&domain->lock, flags); 2197 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2237,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2150 s->dma_address = s->dma_length = 0; 2237 s->dma_address = s->dma_length = 0;
2151 } 2238 }
2152 2239
2153 iommu_flush_complete(domain); 2240 domain_flush_complete(domain);
2154 2241
2155 spin_unlock_irqrestore(&domain->lock, flags); 2242 spin_unlock_irqrestore(&domain->lock, flags);
2156} 2243}
@@ -2200,7 +2287,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
2200 goto out_free; 2287 goto out_free;
2201 } 2288 }
2202 2289
2203 iommu_flush_complete(domain); 2290 domain_flush_complete(domain);
2204 2291
2205 spin_unlock_irqrestore(&domain->lock, flags); 2292 spin_unlock_irqrestore(&domain->lock, flags);
2206 2293
@@ -2232,7 +2319,7 @@ static void free_coherent(struct device *dev, size_t size,
2232 2319
2233 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2320 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2234 2321
2235 iommu_flush_complete(domain); 2322 domain_flush_complete(domain);
2236 2323
2237 spin_unlock_irqrestore(&domain->lock, flags); 2324 spin_unlock_irqrestore(&domain->lock, flags);
2238 2325
@@ -2476,7 +2563,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
2476 if (!iommu) 2563 if (!iommu)
2477 return; 2564 return;
2478 2565
2479 iommu_flush_device(dev); 2566 device_flush_dte(dev);
2480 iommu_completion_wait(iommu); 2567 iommu_completion_wait(iommu);
2481} 2568}
2482 2569
@@ -2542,7 +2629,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2542 unmap_size = iommu_unmap_page(domain, iova, page_size); 2629 unmap_size = iommu_unmap_page(domain, iova, page_size);
2543 mutex_unlock(&domain->api_lock); 2630 mutex_unlock(&domain->api_lock);
2544 2631
2545 iommu_flush_tlb_pde(domain); 2632 domain_flush_tlb_pde(domain);
2546 2633
2547 return get_order(unmap_size); 2634 return get_order(unmap_size);
2548} 2635}
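(Reading aid, not part of the patch: the rewritten iommu_queue_command() above treats the command buffer as a byte ring and derives the free space from the head and tail offsets read from MMIO. A minimal stand-alone C sketch of that arithmetic follows; the buffer and entry sizes are made-up constants instead of the driver's MMIO-provided values.)

#include <stdio.h>
#include <stdint.h>

#define CMD_BUF_SIZE	512u	/* hypothetical ring size in bytes (power of two) */
#define CMD_ENTRY_SIZE	16u	/* one command entry, like struct iommu_cmd */

/* Bytes still free between next_tail and head, modulo the ring size. */
static uint32_t cmd_buf_space_left(uint32_t head, uint32_t tail)
{
	uint32_t next_tail = (tail + CMD_ENTRY_SIZE) % CMD_BUF_SIZE;

	return (head - next_tail) % CMD_BUF_SIZE;
}

int main(void)
{
	printf("%u\n", cmd_buf_space_left(0, 16));	/* prints 480: plenty of room */
	printf("%u\n", cmd_buf_space_left(48, 32));	/* prints 0: ring full, drain first */
	return 0;
}

Because the ring size is a power of two, the unsigned wrap-around in the subtraction is harmless, which is why the "left <= 2" test in the hunk above is enough to decide whether a completion-wait must be queued and waited for before the new command is copied into the buffer.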
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 246d727b65b7..9179c21120a8 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -137,6 +137,7 @@ int amd_iommus_present;
137 137
138/* IOMMUs have a non-present cache? */ 138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly; 139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
140 141
141/* 142/*
142 * The ACPI table parsing functions set this variable on an error 143 * The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
180static u32 alias_table_size; /* size of the alias table */ 181static u32 alias_table_size; /* size of the alias table */
181static u32 rlookup_table_size; /* size of the rlookup table */ 182static u32 rlookup_table_size; /* size of the rlookup table */
182 183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
183static inline void update_last_devid(u16 devid) 190static inline void update_last_devid(u16 devid)
184{ 191{
185 if (devid > amd_iommu_last_bdf) 192 if (devid > amd_iommu_last_bdf)
@@ -293,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
293/* Function to enable the hardware */ 300/* Function to enable the hardware */
294static void iommu_enable(struct amd_iommu *iommu) 301static void iommu_enable(struct amd_iommu *iommu)
295{ 302{
296 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", 303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
297 dev_name(&iommu->dev->dev), iommu->cap_ptr); 310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
298 311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
299 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
300} 321}
301 322
@@ -651,7 +672,7 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
651static void __init init_iommu_from_pci(struct amd_iommu *iommu) 672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
652{ 673{
653 int cap_ptr = iommu->cap_ptr; 674 int cap_ptr = iommu->cap_ptr;
654 u32 range, misc; 675 u32 range, misc, low, high;
655 int i, j; 676 int i, j;
656 677
657 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
@@ -667,6 +688,15 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
667 MMIO_GET_LD(range)); 688 MMIO_GET_LD(range));
668 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
669 690
691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
692 amd_iommu_iotlb_sup = false;
693
694 /* read extended feature bits */
695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
670 if (!is_rd890_iommu(iommu->dev)) 700 if (!is_rd890_iommu(iommu->dev))
671 return; 701 return;
672 702
@@ -1004,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
1004 if (pci_enable_msi(iommu->dev)) 1034 if (pci_enable_msi(iommu->dev))
1005 return 1; 1035 return 1;
1006 1036
1007 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 1037 r = request_threaded_irq(iommu->dev->irq,
1008 IRQF_SAMPLE_RANDOM, 1038 amd_iommu_int_handler,
1009 "AMD-Vi", 1039 amd_iommu_int_thread,
1010 NULL); 1040 0, "AMD-Vi",
1041 iommu->dev);
1011 1042
1012 if (r) { 1043 if (r) {
1013 pci_disable_msi(iommu->dev); 1044 pci_disable_msi(iommu->dev);
@@ -1244,6 +1275,7 @@ static void enable_iommus(void)
1244 iommu_set_exclusion_range(iommu); 1275 iommu_set_exclusion_range(iommu);
1245 iommu_init_msi(iommu); 1276 iommu_init_msi(iommu);
1246 iommu_enable(iommu); 1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1247 } 1279 }
1248} 1280}
1249 1281
@@ -1274,8 +1306,8 @@ static void amd_iommu_resume(void)
1274 * we have to flush after the IOMMUs are enabled because a 1306 * we have to flush after the IOMMUs are enabled because a
1275 * disabled IOMMU will never execute the commands we send 1307 * disabled IOMMU will never execute the commands we send
1276 */ 1308 */
1277 amd_iommu_flush_all_devices(); 1309 for_each_iommu(iommu)
1278 amd_iommu_flush_all_domains(); 1310 iommu_flush_all_caches(iommu);
1279} 1311}
1280 1312
1281static int amd_iommu_suspend(void) 1313static int amd_iommu_suspend(void)
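(Reading aid, not part of the patch: init_iommu_from_pci() above now builds a 64-bit extended-feature word from two 32-bit MMIO reads, and iommu_enable() / iommu_flush_all_caches() test single bits of it. A stand-alone sketch with invented register values; the IA bit position follows the feat_str[] ordering shown in the hunk.)

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define FEATURE_IA	(1ULL << 6)	/* "IA" is entry 6 of feat_str[] above */

/* Mirror of the iommu_feature() test: is a given feature bit set? */
static bool feature_supported(uint64_t features, uint64_t mask)
{
	return (features & mask) != 0;
}

int main(void)
{
	uint32_t low  = 0x00000040;	/* pretend readl(... + MMIO_EXT_FEATURES)     */
	uint32_t high = 0x00000000;	/* pretend readl(... + MMIO_EXT_FEATURES + 4) */
	uint64_t features = ((uint64_t)high << 32) | low;	/* same combine as the patch */

	printf("INVALIDATE_ALL supported: %d\n",
	       feature_supported(features, FEATURE_IA));	/* prints 1 */
	return 0;
}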
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index cd1ffed4ee22..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
177 .rating = APBT_CLOCKSOURCE_RATING, 177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource, 178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK, 179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource, 181 .resume = apbt_restart_clocksource,
183}; 182};
@@ -543,14 +542,7 @@ static int apbt_clocksource_register(void)
543 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 542 if (t1 == apbt_read_clocksource(&clocksource_apbt))
544 panic("APBT counter not counting. APBT disabled\n"); 543 panic("APBT counter not counting. APBT disabled\n");
545 544
546 /* 545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
547 * initialize and register APBT clocksource
548 * convert that to ns/clock cycle
549 * mult = (ns/c) * 2^APBT_SHIFT
550 */
551 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
552 (unsigned long) apbt_freq, APBT_SHIFT);
553 clocksource_register(&clocksource_apbt);
554 546
555 return 0; 547 return 0;
556} 548}
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 73fb469908c6..3d2661ca6542 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -30,6 +30,22 @@
30#include <asm/amd_nb.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33/*
34 * Using 512M as goal, in case kexec will load kernel_big
 35 * that will do the on-position decompress, and could overlap
 36 * with the gart aperture that is used.
37 * Sequence:
38 * kernel_small
39 * ==> kexec (with kdump trigger path or gart still enabled)
40 * ==> kernel_small (gart area become e820_reserved)
41 * ==> kexec (with kdump trigger path or gart still enabled)
 42 * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
 43 * So don't use the area below 512M for the gart iommu; leave that space
 44 * for kernel code, to be safe.
45 */
46#define GART_MIN_ADDR (512ULL << 20)
47#define GART_MAX_ADDR (1ULL << 32)
48
33int gart_iommu_aperture; 49int gart_iommu_aperture;
34int gart_iommu_aperture_disabled __initdata; 50int gart_iommu_aperture_disabled __initdata;
35int gart_iommu_aperture_allowed __initdata; 51int gart_iommu_aperture_allowed __initdata;
@@ -70,21 +86,9 @@ static u32 __init allocate_aperture(void)
70 * memory. Unfortunately we cannot move it up because that would 86 * memory. Unfortunately we cannot move it up because that would
71 * make the IOMMU useless. 87 * make the IOMMU useless.
72 */ 88 */
73 /* 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
74 * using 512M as goal, in case kexec will load kernel_big 90 aper_size, aper_size);
75 * that will do the on position decompress, and could overlap with 91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
76 * that position with gart that is used.
77 * sequende:
78 * kernel_small
79 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
80 * ==> kernel_small(gart area become e820_reserved)
81 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart)
82 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
83 * so don't use 512M below as gart iommu, leave the space for kernel
84 * code for safe
85 */
86 addr = memblock_find_in_range(0, 1ULL<<32, aper_size, 512ULL<<20);
87 if (addr == MEMBLOCK_ERROR || addr + aper_size > 0xffffffff) {
88 printk(KERN_ERR 92 printk(KERN_ERR
89 "Cannot allocate aperture memory hole (%lx,%uK)\n", 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 addr, aper_size>>10); 94 addr, aper_size>>10);
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 3966b564ea47..767fd04f2843 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,20 +2,25 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
6obj-y += hw_nmi.o 6obj-y += hw_nmi.o
7 7
8obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
9obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
10 10
11ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
12obj-y += apic_flat_64.o 12# APIC probe will depend on the listing order here
13obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
16obj-y += apic_flat_64.o
16endif 17endif
17 18
18obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 19# APIC probe will depend on the listing order here
19obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 20obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
20obj-$(CONFIG_X86_ES7000) += es7000_32.o
21obj-$(CONFIG_X86_SUMMIT) += summit_32.o 21obj-$(CONFIG_X86_SUMMIT) += summit_32.o
22obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24
25# For 32bit, probe_32 needs to be listed last
26obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index a0bf78a0918c..b9338b8cf420 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -506,7 +506,7 @@ static void __cpuinit setup_APIC_timer(void)
506{ 506{
507 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 507 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
508 508
509 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_ARAT)) { 509 if (this_cpu_has(X86_FEATURE_ARAT)) {
510 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 510 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
 511 /* Make LAPIC timer preferable over percpu HPET */ 511 /* Make LAPIC timer preferable over percpu HPET */
512 lapic_clockevent.rating = 150; 512 lapic_clockevent.rating = 150;
@@ -1238,6 +1238,17 @@ void __cpuinit setup_local_APIC(void)
1238 /* always use the value from LDR */ 1238 /* always use the value from LDR */
1239 early_per_cpu(x86_cpu_to_logical_apicid, cpu) = 1239 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1240 logical_smp_processor_id(); 1240 logical_smp_processor_id();
1241
1242 /*
1243 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1244 * node mapping during NUMA init. Now that logical apicid is
1245 * guaranteed to be known, give it another chance. This is already
1246 * a bit too late - percpu allocation has already happened without
1247 * proper NUMA affinity.
1248 */
1249 if (apic->x86_32_numa_cpu_node)
1250 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1251 apic->x86_32_numa_cpu_node(cpu));
1241#endif 1252#endif
1242 1253
1243 /* 1254 /*
@@ -1451,7 +1462,6 @@ int __init enable_IR(void)
1451void __init enable_IR_x2apic(void) 1462void __init enable_IR_x2apic(void)
1452{ 1463{
1453 unsigned long flags; 1464 unsigned long flags;
1454 struct IO_APIC_route_entry **ioapic_entries;
1455 int ret, x2apic_enabled = 0; 1465 int ret, x2apic_enabled = 0;
1456 int dmar_table_init_ret; 1466 int dmar_table_init_ret;
1457 1467
@@ -1459,13 +1469,7 @@ void __init enable_IR_x2apic(void)
1459 if (dmar_table_init_ret && !x2apic_supported()) 1469 if (dmar_table_init_ret && !x2apic_supported())
1460 return; 1470 return;
1461 1471
1462 ioapic_entries = alloc_ioapic_entries(); 1472 ret = save_ioapic_entries();
1463 if (!ioapic_entries) {
1464 pr_err("Allocate ioapic_entries failed\n");
1465 goto out;
1466 }
1467
1468 ret = save_IO_APIC_setup(ioapic_entries);
1469 if (ret) { 1473 if (ret) {
1470 pr_info("Saving IO-APIC state failed: %d\n", ret); 1474 pr_info("Saving IO-APIC state failed: %d\n", ret);
1471 goto out; 1475 goto out;
@@ -1473,7 +1477,7 @@ void __init enable_IR_x2apic(void)
1473 1477
1474 local_irq_save(flags); 1478 local_irq_save(flags);
1475 legacy_pic->mask_all(); 1479 legacy_pic->mask_all();
1476 mask_IO_APIC_setup(ioapic_entries); 1480 mask_ioapic_entries();
1477 1481
1478 if (dmar_table_init_ret) 1482 if (dmar_table_init_ret)
1479 ret = 0; 1483 ret = 0;
@@ -1504,14 +1508,11 @@ void __init enable_IR_x2apic(void)
1504 1508
1505nox2apic: 1509nox2apic:
1506 if (!ret) /* IR enabling failed */ 1510 if (!ret) /* IR enabling failed */
1507 restore_IO_APIC_setup(ioapic_entries); 1511 restore_ioapic_entries();
1508 legacy_pic->restore_mask(); 1512 legacy_pic->restore_mask();
1509 local_irq_restore(flags); 1513 local_irq_restore(flags);
1510 1514
1511out: 1515out:
1512 if (ioapic_entries)
1513 free_ioapic_entries(ioapic_entries);
1514
1515 if (x2apic_enabled) 1516 if (x2apic_enabled)
1516 return; 1517 return;
1517 1518
@@ -1813,30 +1814,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1813 */ 1814 */
1814void smp_error_interrupt(struct pt_regs *regs) 1815void smp_error_interrupt(struct pt_regs *regs)
1815{ 1816{
1816 u32 v, v1; 1817 u32 v0, v1;
1818 u32 i = 0;
1819 static const char * const error_interrupt_reason[] = {
1820 "Send CS error", /* APIC Error Bit 0 */
1821 "Receive CS error", /* APIC Error Bit 1 */
1822 "Send accept error", /* APIC Error Bit 2 */
1823 "Receive accept error", /* APIC Error Bit 3 */
1824 "Redirectable IPI", /* APIC Error Bit 4 */
1825 "Send illegal vector", /* APIC Error Bit 5 */
1826 "Received illegal vector", /* APIC Error Bit 6 */
1827 "Illegal register address", /* APIC Error Bit 7 */
1828 };
1817 1829
1818 exit_idle(); 1830 exit_idle();
1819 irq_enter(); 1831 irq_enter();
1820 /* First tickle the hardware, only then report what went on. -- REW */ 1832 /* First tickle the hardware, only then report what went on. -- REW */
1821 v = apic_read(APIC_ESR); 1833 v0 = apic_read(APIC_ESR);
1822 apic_write(APIC_ESR, 0); 1834 apic_write(APIC_ESR, 0);
1823 v1 = apic_read(APIC_ESR); 1835 v1 = apic_read(APIC_ESR);
1824 ack_APIC_irq(); 1836 ack_APIC_irq();
1825 atomic_inc(&irq_err_count); 1837 atomic_inc(&irq_err_count);
1826 1838
1827 /* 1839 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
1828 * Here is what the APIC error bits mean: 1840 smp_processor_id(), v0 , v1);
1829 * 0: Send CS error 1841
1830 * 1: Receive CS error 1842 v1 = v1 & 0xff;
1831 * 2: Send accept error 1843 while (v1) {
1832 * 3: Receive accept error 1844 if (v1 & 0x1)
1833 * 4: Reserved 1845 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1834 * 5: Send illegal vector 1846 i++;
1835 * 6: Received illegal vector 1847 v1 >>= 1;
1836 * 7: Illegal register address 1848 };
1837 */ 1849
1838 pr_debug("APIC error on CPU%d: %02x(%02x)\n", 1850 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1839 smp_processor_id(), v , v1); 1851
1840 irq_exit(); 1852 irq_exit();
1841} 1853}
1842 1854
@@ -2004,21 +2016,6 @@ void default_init_apic_ldr(void)
2004 apic_write(APIC_LDR, val); 2016 apic_write(APIC_LDR, val);
2005} 2017}
2006 2018
2007#ifdef CONFIG_X86_32
2008int default_x86_32_numa_cpu_node(int cpu)
2009{
2010#ifdef CONFIG_NUMA
2011 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
2012
2013 if (apicid != BAD_APICID)
2014 return __apicid_to_node[apicid];
2015 return NUMA_NO_NODE;
2016#else
2017 return 0;
2018#endif
2019}
2020#endif
2021
2022/* 2019/*
2023 * Power management 2020 * Power management
2024 */ 2021 */
@@ -2089,28 +2086,20 @@ static void lapic_resume(void)
2089{ 2086{
2090 unsigned int l, h; 2087 unsigned int l, h;
2091 unsigned long flags; 2088 unsigned long flags;
2092 int maxlvt, ret; 2089 int maxlvt;
2093 struct IO_APIC_route_entry **ioapic_entries = NULL;
2094 2090
2095 if (!apic_pm_state.active) 2091 if (!apic_pm_state.active)
2096 return; 2092 return;
2097 2093
2098 local_irq_save(flags); 2094 local_irq_save(flags);
2099 if (intr_remapping_enabled) { 2095 if (intr_remapping_enabled) {
2100 ioapic_entries = alloc_ioapic_entries(); 2096 /*
2101 if (!ioapic_entries) { 2097 * IO-APIC and PIC have their own resume routines.
2102 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2098 * We just mask them here to make sure the interrupt
2103 goto restore; 2099 * subsystem is completely quiet while we enable x2apic
2104 } 2100 * and interrupt-remapping.
2105 2101 */
2106 ret = save_IO_APIC_setup(ioapic_entries); 2102 mask_ioapic_entries();
2107 if (ret) {
2108 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2109 free_ioapic_entries(ioapic_entries);
2110 goto restore;
2111 }
2112
2113 mask_IO_APIC_setup(ioapic_entries);
2114 legacy_pic->mask_all(); 2103 legacy_pic->mask_all();
2115 } 2104 }
2116 2105
@@ -2153,13 +2142,9 @@ static void lapic_resume(void)
2153 apic_write(APIC_ESR, 0); 2142 apic_write(APIC_ESR, 0);
2154 apic_read(APIC_ESR); 2143 apic_read(APIC_ESR);
2155 2144
2156 if (intr_remapping_enabled) { 2145 if (intr_remapping_enabled)
2157 reenable_intr_remapping(x2apic_mode); 2146 reenable_intr_remapping(x2apic_mode);
2158 legacy_pic->restore_mask(); 2147
2159 restore_IO_APIC_setup(ioapic_entries);
2160 free_ioapic_entries(ioapic_entries);
2161 }
2162restore:
2163 local_irq_restore(flags); 2148 local_irq_restore(flags);
2164} 2149}
2165 2150
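(Reading aid, not part of the patch: the reworked smp_error_interrupt() above decodes the ESR value bit by bit against error_interrupt_reason[]. A stand-alone sketch with an example ESR value.)

#include <stdio.h>
#include <stdint.h>

static const char * const error_interrupt_reason[] = {
	"Send CS error",		/* APIC Error Bit 0 */
	"Receive CS error",		/* APIC Error Bit 1 */
	"Send accept error",		/* APIC Error Bit 2 */
	"Receive accept error",		/* APIC Error Bit 3 */
	"Redirectable IPI",		/* APIC Error Bit 4 */
	"Send illegal vector",		/* APIC Error Bit 5 */
	"Received illegal vector",	/* APIC Error Bit 6 */
	"Illegal register address",	/* APIC Error Bit 7 */
};

int main(void)
{
	uint32_t esr = 0x44;		/* example value: bits 2 and 6 set */
	uint32_t v = esr & 0xff;
	unsigned int i = 0;

	printf("APIC error %02x:", (unsigned)esr);
	while (v) {
		if (v & 0x1)
			printf(" : %s", error_interrupt_reason[i]);
		i++;
		v >>= 1;
	}
	printf("\n");	/* -> APIC error 44: : Send accept error : Received illegal vector */
	return 0;
}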
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 5652d31fe108..f7a41e4cae47 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/module.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
@@ -24,6 +25,12 @@
24#include <acpi/acpi_bus.h> 25#include <acpi/acpi_bus.h>
25#endif 26#endif
26 27
28static struct apic apic_physflat;
29static struct apic apic_flat;
30
31struct apic __read_mostly *apic = &apic_flat;
32EXPORT_SYMBOL_GPL(apic);
33
27static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 34static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
28{ 35{
29 return 1; 36 return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
164 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
165} 172}
166 173
167struct apic apic_flat = { 174static struct apic apic_flat = {
168 .name = "flat", 175 .name = "flat",
169 .probe = NULL, 176 .probe = NULL,
170 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -312,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
312 return per_cpu(x86_cpu_to_apicid, cpu); 319 return per_cpu(x86_cpu_to_apicid, cpu);
313} 320}
314 321
315struct apic apic_physflat = { 322static int physflat_probe(void)
323{
324 if (apic == &apic_physflat || num_possible_cpus() > 8)
325 return 1;
326
327 return 0;
328}
329
330static struct apic apic_physflat = {
316 331
317 .name = "physical flat", 332 .name = "physical flat",
318 .probe = NULL, 333 .probe = physflat_probe,
319 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
320 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
321 336
@@ -369,3 +384,8 @@ struct apic apic_physflat = {
369 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
370 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 385 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
371}; 386};
387
388/*
389 * We need to check for physflat first, so this order is important.
390 */
391apic_drivers(apic_physflat, apic_flat);
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index f1baa2dc087a..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -119,14 +119,6 @@ static void noop_apic_write(u32 reg, u32 v)
119 WARN_ON_ONCE(cpu_has_apic && !disable_apic); 119 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
120} 120}
121 121
122#ifdef CONFIG_X86_32
123static int noop_x86_32_numa_cpu_node(int cpu)
124{
125 /* we're always on node 0 */
126 return 0;
127}
128#endif
129
130struct apic apic_noop = { 122struct apic apic_noop = {
131 .name = "noop", 123 .name = "noop",
132 .probe = noop_probe, 124 .probe = noop_probe,
@@ -195,6 +187,5 @@ struct apic apic_noop = {
195 187
196#ifdef CONFIG_X86_32 188#ifdef CONFIG_X86_32
197 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, 189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
198 .x86_32_numa_cpu_node = noop_x86_32_numa_cpu_node,
199#endif 190#endif
200}; 191};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 541a2e431659..efd737e827f4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -193,7 +193,7 @@ static int probe_bigsmp(void)
193 return dmi_bigsmp; 193 return dmi_bigsmp;
194} 194}
195 195
196struct apic apic_bigsmp = { 196static struct apic apic_bigsmp = {
197 197
198 .name = "bigsmp", 198 .name = "bigsmp",
199 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
@@ -253,5 +253,14 @@ struct apic apic_bigsmp = {
253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254 254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid, 255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
256 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
257}; 256};
257
258struct apic * __init generic_bigsmp_probe(void)
259{
260 if (probe_bigsmp())
261 return &apic_bigsmp;
262
263 return NULL;
264}
265
266apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 3e9de4854c5b..9536b3fe43f8 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -510,11 +510,6 @@ static void es7000_setup_apic_routing(void)
510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
511} 511}
512 512
513static int es7000_numa_cpu_node(int cpu)
514{
515 return 0;
516}
517
518static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
519{ 514{
520 if (!mps_cpu) 515 if (!mps_cpu)
@@ -625,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
625} 620}
626 621
 627/* We've been warned by a false positive warning. Use __refdata to keep calm. */ 622/* We've been warned by a false positive warning. Use __refdata to keep calm. */
628struct apic __refdata apic_es7000_cluster = { 623static struct apic __refdata apic_es7000_cluster = {
629 624
630 .name = "es7000", 625 .name = "es7000",
631 .probe = probe_es7000, 626 .probe = probe_es7000,
@@ -688,10 +683,9 @@ struct apic __refdata apic_es7000_cluster = {
688 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
689 684
690 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
691 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
692}; 686};
693 687
694struct apic __refdata apic_es7000 = { 688static struct apic __refdata apic_es7000 = {
695 689
696 .name = "es7000", 690 .name = "es7000",
697 .probe = probe_es7000, 691 .probe = probe_es7000,
@@ -752,5 +746,10 @@ struct apic __refdata apic_es7000 = {
752 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
753 747
754 .x86_32_early_logical_apicid = es7000_early_logical_apicid, 748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
755 .x86_32_numa_cpu_node = es7000_numa_cpu_node,
756}; 749};
750
751/*
752 * Need to check for es7000 followed by es7000_cluster, so this order
753 * in apic_drivers is important.
754 */
755apic_drivers(apic_es7000, apic_es7000_cluster);
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index 5260fe91bcb6..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -19,9 +19,9 @@
19#include <linux/delay.h> 19#include <linux/delay.h>
20 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR 21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(void) 22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{ 23{
24 return (u64)(cpu_khz) * 1000 * 60; 24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25} 25}
26#endif 26#endif
27 27
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 45fd33d1fd3a..e5293394b548 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -76,17 +76,40 @@ int sis_apic_bug = -1;
76static DEFINE_RAW_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
77static DEFINE_RAW_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
78 78
79/* 79static struct ioapic {
80 * # of IRQ routing registers 80 /*
81 */ 81 * # of IRQ routing registers
82int nr_ioapic_registers[MAX_IO_APICS]; 82 */
83 int nr_registers;
84 /*
85 * Saved state during suspend/resume, or while enabling intr-remap.
86 */
87 struct IO_APIC_route_entry *saved_registers;
88 /* I/O APIC config */
89 struct mpc_ioapic mp_config;
90 /* IO APIC gsi routing info */
91 struct mp_ioapic_gsi gsi_config;
92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
93} ioapics[MAX_IO_APICS];
83 94
84/* I/O APIC entries */ 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
85struct mpc_ioapic mp_ioapics[MAX_IO_APICS]; 96
86int nr_ioapics; 97int mpc_ioapic_id(int id)
98{
99 return ioapics[id].mp_config.apicid;
100}
87 101
88/* IO APIC gsi routing info */ 102unsigned int mpc_ioapic_addr(int id)
89struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 103{
104 return ioapics[id].mp_config.apicaddr;
105}
106
107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
108{
109 return &ioapics[id].gsi_config;
110}
111
112int nr_ioapics;
90 113
91/* The one past the highest gsi number used */ 114/* The one past the highest gsi number used */
92u32 gsi_top; 115u32 gsi_top;
@@ -179,6 +202,14 @@ int __init arch_early_irq_init(void)
179 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
180 } 203 }
181 204
205 for (i = 0; i < nr_ioapics; i++) {
206 ioapics[i].saved_registers =
207 kzalloc(sizeof(struct IO_APIC_route_entry) *
208 ioapics[i].nr_registers, GFP_KERNEL);
209 if (!ioapics[i].saved_registers)
210 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
211 }
212
182 cfg = irq_cfgx; 213 cfg = irq_cfgx;
183 count = ARRAY_SIZE(irq_cfgx); 214 count = ARRAY_SIZE(irq_cfgx);
184 node = cpu_to_node(0); 215 node = cpu_to_node(0);
@@ -297,7 +328,7 @@ struct io_apic {
297static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 328static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
298{ 329{
299 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 330 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
300 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 331 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
301} 332}
302 333
303static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 334static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -573,7 +604,7 @@ static void clear_IO_APIC (void)
573 int apic, pin; 604 int apic, pin;
574 605
575 for (apic = 0; apic < nr_ioapics; apic++) 606 for (apic = 0; apic < nr_ioapics; apic++)
576 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 607 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
577 clear_IO_APIC_pin(apic, pin); 608 clear_IO_APIC_pin(apic, pin);
578} 609}
579 610
@@ -615,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
615__setup("pirq=", ioapic_pirq_setup); 646__setup("pirq=", ioapic_pirq_setup);
616#endif /* CONFIG_X86_32 */ 647#endif /* CONFIG_X86_32 */
617 648
618struct IO_APIC_route_entry **alloc_ioapic_entries(void)
619{
620 int apic;
621 struct IO_APIC_route_entry **ioapic_entries;
622
623 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
624 GFP_KERNEL);
625 if (!ioapic_entries)
626 return 0;
627
628 for (apic = 0; apic < nr_ioapics; apic++) {
629 ioapic_entries[apic] =
630 kzalloc(sizeof(struct IO_APIC_route_entry) *
631 nr_ioapic_registers[apic], GFP_KERNEL);
632 if (!ioapic_entries[apic])
633 goto nomem;
634 }
635
636 return ioapic_entries;
637
638nomem:
639 while (--apic >= 0)
640 kfree(ioapic_entries[apic]);
641 kfree(ioapic_entries);
642
643 return 0;
644}
645
646/* 649/*
647 * Saves all the IO-APIC RTE's 650 * Saves all the IO-APIC RTE's
648 */ 651 */
649int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 652int save_ioapic_entries(void)
650{ 653{
651 int apic, pin; 654 int apic, pin;
652 655 int err = 0;
653 if (!ioapic_entries)
654 return -ENOMEM;
655 656
656 for (apic = 0; apic < nr_ioapics; apic++) { 657 for (apic = 0; apic < nr_ioapics; apic++) {
657 if (!ioapic_entries[apic]) 658 if (!ioapics[apic].saved_registers) {
658 return -ENOMEM; 659 err = -ENOMEM;
660 continue;
661 }
659 662
660 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 663 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
661 ioapic_entries[apic][pin] = 664 ioapics[apic].saved_registers[pin] =
662 ioapic_read_entry(apic, pin); 665 ioapic_read_entry(apic, pin);
663 } 666 }
664 667
665 return 0; 668 return err;
666} 669}
667 670
668/* 671/*
669 * Mask all IO APIC entries. 672 * Mask all IO APIC entries.
670 */ 673 */
671void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 674void mask_ioapic_entries(void)
672{ 675{
673 int apic, pin; 676 int apic, pin;
674 677
675 if (!ioapic_entries)
676 return;
677
678 for (apic = 0; apic < nr_ioapics; apic++) { 678 for (apic = 0; apic < nr_ioapics; apic++) {
679 if (!ioapic_entries[apic]) 679 if (!ioapics[apic].saved_registers)
680 break; 680 continue;
681 681
682 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 682 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
683 struct IO_APIC_route_entry entry; 683 struct IO_APIC_route_entry entry;
684 684
685 entry = ioapic_entries[apic][pin]; 685 entry = ioapics[apic].saved_registers[pin];
686 if (!entry.mask) { 686 if (!entry.mask) {
687 entry.mask = 1; 687 entry.mask = 1;
688 ioapic_write_entry(apic, pin, entry); 688 ioapic_write_entry(apic, pin, entry);
@@ -692,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
692} 692}
693 693
694/* 694/*
 695 * Restore IO APIC entries which were saved in ioapic_entries. 695 * Restore IO APIC entries which were saved in the ioapic structure.
696 */ 696 */
697int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 697int restore_ioapic_entries(void)
698{ 698{
699 int apic, pin; 699 int apic, pin;
700 700
701 if (!ioapic_entries)
702 return -ENOMEM;
703
704 for (apic = 0; apic < nr_ioapics; apic++) { 701 for (apic = 0; apic < nr_ioapics; apic++) {
705 if (!ioapic_entries[apic]) 702 if (!ioapics[apic].saved_registers)
706 return -ENOMEM; 703 continue;
707 704
708 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 705 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
709 ioapic_write_entry(apic, pin, 706 ioapic_write_entry(apic, pin,
710 ioapic_entries[apic][pin]); 707 ioapics[apic].saved_registers[pin]);
711 } 708 }
712 return 0; 709 return 0;
713} 710}
714 711
715void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
716{
717 int apic;
718
719 for (apic = 0; apic < nr_ioapics; apic++)
720 kfree(ioapic_entries[apic]);
721
722 kfree(ioapic_entries);
723}
724
725/* 712/*
726 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
727 */ 714 */
@@ -731,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
731 718
732 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
733 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
734 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
735 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
736 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
737 return i; 724 return i;
@@ -773,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
773 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
774 int apic; 761 int apic;
775 for(apic = 0; apic < nr_ioapics; apic++) { 762 for(apic = 0; apic < nr_ioapics; apic++) {
776 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
777 return apic; 764 return apic;
778 } 765 }
779 } 766 }
@@ -942,6 +929,7 @@ static int pin_2_irq(int idx, int apic, int pin)
942{ 929{
943 int irq; 930 int irq;
944 int bus = mp_irqs[idx].srcbus; 931 int bus = mp_irqs[idx].srcbus;
932 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
945 933
946 /* 934 /*
947 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
@@ -952,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
952 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
953 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
954 } else { 942 } else {
955 u32 gsi = mp_gsi_routing[apic].gsi_base + pin; 943 u32 gsi = gsi_cfg->gsi_base + pin;
956 944
957 if (gsi >= NR_IRQS_LEGACY) 945 if (gsi >= NR_IRQS_LEGACY)
958 irq = gsi; 946 irq = gsi;
@@ -1003,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1003 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1004 992
1005 for (apic = 0; apic < nr_ioapics; apic++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1006 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1007 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1008 break; 996 break;
1009 997
@@ -1222,7 +1210,7 @@ static inline int IO_APIC_irq_trigger(int irq)
1222 int apic, idx, pin; 1210 int apic, idx, pin;
1223 1211
1224 for (apic = 0; apic < nr_ioapics; apic++) { 1212 for (apic = 0; apic < nr_ioapics; apic++) {
1225 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1213 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1226 idx = find_irq_entry(apic, pin, mp_INT); 1214 idx = find_irq_entry(apic, pin, mp_INT);
1227 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) 1215 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1228 return irq_trigger(idx); 1216 return irq_trigger(idx);
@@ -1350,14 +1338,14 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1350 apic_printk(APIC_VERBOSE,KERN_DEBUG 1338 apic_printk(APIC_VERBOSE,KERN_DEBUG
1351 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1352 "IRQ %d Mode:%i Active:%i)\n", 1340 "IRQ %d Mode:%i Active:%i)\n",
1353 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, 1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1354 irq, trigger, polarity); 1342 irq, trigger, polarity);
1355 1343
1356 1344
1357 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1358 dest, trigger, polarity, cfg->vector, pin)) { 1346 dest, trigger, polarity, cfg->vector, pin)) {
1359 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1347 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1360 mp_ioapics[apic_id].apicid, pin); 1348 mpc_ioapic_id(apic_id), pin);
1361 __clear_irq_vector(irq, cfg); 1349 __clear_irq_vector(irq, cfg);
1362 return; 1350 return;
1363 } 1351 }
@@ -1369,17 +1357,13 @@ static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1369 ioapic_write_entry(apic_id, pin, entry); 1357 ioapic_write_entry(apic_id, pin, entry);
1370} 1358}
1371 1359
1372static struct {
1373 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1374} mp_ioapic_routing[MAX_IO_APICS];
1375
1376static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin) 1360static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1377{ 1361{
1378 if (idx != -1) 1362 if (idx != -1)
1379 return false; 1363 return false;
1380 1364
1381 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n", 1365 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1382 mp_ioapics[apic_id].apicid, pin); 1366 mpc_ioapic_id(apic_id), pin);
1383 return true; 1367 return true;
1384} 1368}
1385 1369
@@ -1389,7 +1373,7 @@ static void __init __io_apic_setup_irqs(unsigned int apic_id)
1389 struct io_apic_irq_attr attr; 1373 struct io_apic_irq_attr attr;
1390 unsigned int pin, irq; 1374 unsigned int pin, irq;
1391 1375
1392 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1376 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1393 idx = find_irq_entry(apic_id, pin, mp_INT); 1377 idx = find_irq_entry(apic_id, pin, mp_INT);
1394 if (io_apic_pin_not_connected(idx, apic_id, pin)) 1378 if (io_apic_pin_not_connected(idx, apic_id, pin))
1395 continue; 1379 continue;
@@ -1511,7 +1495,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1511 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1495 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1512 for (i = 0; i < nr_ioapics; i++) 1496 for (i = 0; i < nr_ioapics; i++)
1513 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1497 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1514 mp_ioapics[i].apicid, nr_ioapic_registers[i]); 1498 mpc_ioapic_id(i), ioapics[i].nr_registers);
1515 1499
1516 /* 1500 /*
1517 * We are a bit conservative about what we expect. We have to 1501 * We are a bit conservative about what we expect. We have to
@@ -1531,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1531 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1515 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1532 1516
1533 printk("\n"); 1517 printk("\n");
1534 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1518 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1535 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1519 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1536 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1520 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1537 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1521 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1825,7 +1809,7 @@ void __init enable_IO_APIC(void)
1825 for(apic = 0; apic < nr_ioapics; apic++) { 1809 for(apic = 0; apic < nr_ioapics; apic++) {
1826 int pin; 1810 int pin;
1827 /* See if any of the pins is in ExtINT mode */ 1811 /* See if any of the pins is in ExtINT mode */
1828 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1812 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1829 struct IO_APIC_route_entry entry; 1813 struct IO_APIC_route_entry entry;
1830 entry = ioapic_read_entry(apic, pin); 1814 entry = ioapic_read_entry(apic, pin);
1831 1815
@@ -1949,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1949 reg_00.raw = io_apic_read(apic_id, 0); 1933 reg_00.raw = io_apic_read(apic_id, 0);
1950 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1934 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1951 1935
1952 old_id = mp_ioapics[apic_id].apicid; 1936 old_id = mpc_ioapic_id(apic_id);
1953 1937
1954 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { 1938 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
1955 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1939 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
1956 apic_id, mp_ioapics[apic_id].apicid); 1940 apic_id, mpc_ioapic_id(apic_id));
1957 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1941 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1958 reg_00.bits.ID); 1942 reg_00.bits.ID);
1959 mp_ioapics[apic_id].apicid = reg_00.bits.ID; 1943 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
1960 } 1944 }
1961 1945
1962 /* 1946 /*
@@ -1965,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1965 * 'stuck on smp_invalidate_needed IPI wait' messages. 1949 * 'stuck on smp_invalidate_needed IPI wait' messages.
1966 */ 1950 */
1967 if (apic->check_apicid_used(&phys_id_present_map, 1951 if (apic->check_apicid_used(&phys_id_present_map,
1968 mp_ioapics[apic_id].apicid)) { 1952 mpc_ioapic_id(apic_id))) {
1969 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1953 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
1970 apic_id, mp_ioapics[apic_id].apicid); 1954 apic_id, mpc_ioapic_id(apic_id));
1971 for (i = 0; i < get_physical_broadcast(); i++) 1955 for (i = 0; i < get_physical_broadcast(); i++)
1972 if (!physid_isset(i, phys_id_present_map)) 1956 if (!physid_isset(i, phys_id_present_map))
1973 break; 1957 break;
@@ -1976,13 +1960,14 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1976 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1960 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
1977 i); 1961 i);
1978 physid_set(i, phys_id_present_map); 1962 physid_set(i, phys_id_present_map);
1979 mp_ioapics[apic_id].apicid = i; 1963 ioapics[apic_id].mp_config.apicid = i;
1980 } else { 1964 } else {
1981 physid_mask_t tmp; 1965 physid_mask_t tmp;
1982 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); 1966 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
1967 &tmp);
1983 apic_printk(APIC_VERBOSE, "Setting %d in the " 1968 apic_printk(APIC_VERBOSE, "Setting %d in the "
1984 "phys_id_present_map\n", 1969 "phys_id_present_map\n",
1985 mp_ioapics[apic_id].apicid); 1970 mpc_ioapic_id(apic_id));
1986 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1971 physids_or(phys_id_present_map, phys_id_present_map, tmp);
1987 } 1972 }
1988 1973
@@ -1990,24 +1975,24 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
1990 * We need to adjust the IRQ routing table 1975 * We need to adjust the IRQ routing table
1991 * if the ID changed. 1976 * if the ID changed.
1992 */ 1977 */
1993 if (old_id != mp_ioapics[apic_id].apicid) 1978 if (old_id != mpc_ioapic_id(apic_id))
1994 for (i = 0; i < mp_irq_entries; i++) 1979 for (i = 0; i < mp_irq_entries; i++)
1995 if (mp_irqs[i].dstapic == old_id) 1980 if (mp_irqs[i].dstapic == old_id)
1996 mp_irqs[i].dstapic 1981 mp_irqs[i].dstapic
1997 = mp_ioapics[apic_id].apicid; 1982 = mpc_ioapic_id(apic_id);
1998 1983
1999 /* 1984 /*
2000 * Update the ID register according to the right value 1985 * Update the ID register according to the right value
2001 * from the MPC table if they are different. 1986 * from the MPC table if they are different.
2002 */ 1987 */
2003 if (mp_ioapics[apic_id].apicid == reg_00.bits.ID) 1988 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
2004 continue; 1989 continue;
2005 1990
2006 apic_printk(APIC_VERBOSE, KERN_INFO 1991 apic_printk(APIC_VERBOSE, KERN_INFO
2007 "...changing IO-APIC physical APIC ID to %d ...", 1992 "...changing IO-APIC physical APIC ID to %d ...",
2008 mp_ioapics[apic_id].apicid); 1993 mpc_ioapic_id(apic_id));
2009 1994
2010 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 1995 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2011 raw_spin_lock_irqsave(&ioapic_lock, flags); 1996 raw_spin_lock_irqsave(&ioapic_lock, flags);
2012 io_apic_write(apic_id, 0, reg_00.raw); 1997 io_apic_write(apic_id, 0, reg_00.raw);
2013 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1998 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2018,7 +2003,7 @@ void __init setup_ioapic_ids_from_mpc_nocheck(void)
2018 raw_spin_lock_irqsave(&ioapic_lock, flags); 2003 raw_spin_lock_irqsave(&ioapic_lock, flags);
2019 reg_00.raw = io_apic_read(apic_id, 0); 2004 reg_00.raw = io_apic_read(apic_id, 0);
2020 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2005 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2021 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2006 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2022 printk("could not set ID!\n"); 2007 printk("could not set ID!\n");
2023 else 2008 else
2024 apic_printk(APIC_VERBOSE, " ok.\n"); 2009 apic_printk(APIC_VERBOSE, " ok.\n");
@@ -2404,7 +2389,7 @@ static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2404 2389
2405 raw_spin_lock_irqsave(&ioapic_lock, flags); 2390 raw_spin_lock_irqsave(&ioapic_lock, flags);
2406 for_each_irq_pin(entry, cfg->irq_2_pin) { 2391 for_each_irq_pin(entry, cfg->irq_2_pin) {
2407 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2392 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2408 /* 2393 /*
2409 * Intr-remapping uses pin number as the virtual vector 2394 * Intr-remapping uses pin number as the virtual vector
2410 * in the RTE. Actual vector is programmed in 2395 * in the RTE. Actual vector is programmed in
@@ -2918,49 +2903,19 @@ static int __init io_apic_bug_finalize(void)
2918 2903
2919late_initcall(io_apic_bug_finalize); 2904late_initcall(io_apic_bug_finalize);
2920 2905
2921static struct IO_APIC_route_entry *ioapic_saved_data[MAX_IO_APICS]; 2906static void resume_ioapic_id(int ioapic_id)
2922
2923static void suspend_ioapic(int ioapic_id)
2924{ 2907{
2925 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2926 int i;
2927
2928 if (!saved_data)
2929 return;
2930
2931 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2932 saved_data[i] = ioapic_read_entry(ioapic_id, i);
2933}
2934
2935static int ioapic_suspend(void)
2936{
2937 int ioapic_id;
2938
2939 for (ioapic_id = 0; ioapic_id < nr_ioapics; ioapic_id++)
2940 suspend_ioapic(ioapic_id);
2941
2942 return 0;
2943}
2944
2945static void resume_ioapic(int ioapic_id)
2946{
2947 struct IO_APIC_route_entry *saved_data = ioapic_saved_data[ioapic_id];
2948 unsigned long flags; 2908 unsigned long flags;
2949 union IO_APIC_reg_00 reg_00; 2909 union IO_APIC_reg_00 reg_00;
2950 int i;
2951 2910
2952 if (!saved_data)
2953 return;
2954 2911
2955 raw_spin_lock_irqsave(&ioapic_lock, flags); 2912 raw_spin_lock_irqsave(&ioapic_lock, flags);
2956 reg_00.raw = io_apic_read(ioapic_id, 0); 2913 reg_00.raw = io_apic_read(ioapic_id, 0);
2957 if (reg_00.bits.ID != mp_ioapics[ioapic_id].apicid) { 2914 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
2958 reg_00.bits.ID = mp_ioapics[ioapic_id].apicid; 2915 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
2959 io_apic_write(ioapic_id, 0, reg_00.raw); 2916 io_apic_write(ioapic_id, 0, reg_00.raw);
2960 } 2917 }
2961 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2918 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2962 for (i = 0; i < nr_ioapic_registers[ioapic_id]; i++)
2963 ioapic_write_entry(ioapic_id, i, saved_data[i]);
2964} 2919}
2965 2920
2966static void ioapic_resume(void) 2921static void ioapic_resume(void)
@@ -2968,28 +2923,18 @@ static void ioapic_resume(void)
2968 int ioapic_id; 2923 int ioapic_id;
2969 2924
2970 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--) 2925 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2971 resume_ioapic(ioapic_id); 2926 resume_ioapic_id(ioapic_id);
2927
2928 restore_ioapic_entries();
2972} 2929}
2973 2930
2974static struct syscore_ops ioapic_syscore_ops = { 2931static struct syscore_ops ioapic_syscore_ops = {
2975 .suspend = ioapic_suspend, 2932 .suspend = save_ioapic_entries,
2976 .resume = ioapic_resume, 2933 .resume = ioapic_resume,
2977}; 2934};
2978 2935
2979static int __init ioapic_init_ops(void) 2936static int __init ioapic_init_ops(void)
2980{ 2937{
2981 int i;
2982
2983 for (i = 0; i < nr_ioapics; i++) {
2984 unsigned int size;
2985
2986 size = nr_ioapic_registers[i]
2987 * sizeof(struct IO_APIC_route_entry);
2988 ioapic_saved_data[i] = kzalloc(size, GFP_KERNEL);
2989 if (!ioapic_saved_data[i])
2990 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
2991 }
2992
2993 register_syscore_ops(&ioapic_syscore_ops); 2938 register_syscore_ops(&ioapic_syscore_ops);
2994 2939
2995 return 0; 2940 return 0;
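Because the per-IOAPIC saved_registers buffers are now allocated up front, the suspend hook can be save_ioapic_entries() itself and the per-boot kzalloc loop above can go away. A simplified shape of the callback table, with a stand-in struct name rather than the kernel's struct syscore_ops:

	struct core_pm_ops {
		int  (*suspend)(void);	/* called with interrupts disabled, late in suspend */
		void (*resume)(void);	/* called early in resume, before devices */
	};

	extern int  save_ioapic_entries(void);	/* snapshots every RTE into saved_registers */
	extern void ioapic_resume(void);	/* fixes APIC IDs, then restore_ioapic_entries() */

	static struct core_pm_ops ioapic_pm_ops = {
		.suspend = save_ioapic_entries,
		.resume  = ioapic_resume,
	};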
@@ -3592,14 +3537,14 @@ int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3592 int ret; 3537 int ret;
3593 3538
3594 /* Avoid redundant programming */ 3539 /* Avoid redundant programming */
3595 if (test_bit(pin, mp_ioapic_routing[id].pin_programmed)) { 3540 if (test_bit(pin, ioapics[id].pin_programmed)) {
3596 pr_debug("Pin %d-%d already programmed\n", 3541 pr_debug("Pin %d-%d already programmed\n",
3597 mp_ioapics[id].apicid, pin); 3542 mpc_ioapic_id(id), pin);
3598 return 0; 3543 return 0;
3599 } 3544 }
3600 ret = io_apic_setup_irq_pin(irq, node, attr); 3545 ret = io_apic_setup_irq_pin(irq, node, attr);
3601 if (!ret) 3546 if (!ret)
3602 set_bit(pin, mp_ioapic_routing[id].pin_programmed); 3547 set_bit(pin, ioapics[id].pin_programmed);
3603 return ret; 3548 return ret;
3604} 3549}
3605 3550
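The pin_programmed bitmap that used to live in the ad-hoc mp_ioapic_routing[] array moves into ioapics[] here; the guard itself is the usual test_bit/set_bit "program each pin only once" idiom. A self-contained model of that guard:

	#include <limits.h>

	#define MAX_PINS 128
	#define LONG_BITS (sizeof(unsigned long) * CHAR_BIT)

	struct pin_state {
		unsigned long programmed[(MAX_PINS + LONG_BITS - 1) / LONG_BITS];
	};

	static int pin_test_and_mark(struct pin_state *s, unsigned int pin)
	{
		unsigned long *word = &s->programmed[pin / LONG_BITS];
		unsigned long bit = 1UL << (pin % LONG_BITS);

		if (*word & bit)
			return 1;	/* already programmed, caller skips the setup */
		*word |= bit;		/* mark it so a second call becomes a no-op */
		return 0;
	}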
@@ -3764,8 +3709,7 @@ static u8 __init io_apic_unique_id(u8 id)
3764 3709
3765 bitmap_zero(used, 256); 3710 bitmap_zero(used, 256);
3766 for (i = 0; i < nr_ioapics; i++) { 3711 for (i = 0; i < nr_ioapics; i++) {
3767 struct mpc_ioapic *ia = &mp_ioapics[i]; 3712 __set_bit(mpc_ioapic_id(i), used);
3768 __set_bit(ia->apicid, used);
3769 } 3713 }
3770 if (!test_bit(id, used)) 3714 if (!test_bit(id, used))
3771 return id; 3715 return id;
@@ -3825,7 +3769,7 @@ void __init setup_ioapic_dest(void)
3825 return; 3769 return;
3826 3770
3827 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) 3771 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
3828 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 3772 for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
3829 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 3773 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
3830 if (irq_entry == -1) 3774 if (irq_entry == -1)
3831 continue; 3775 continue;
@@ -3896,7 +3840,7 @@ void __init ioapic_and_gsi_init(void)
3896 ioapic_res = ioapic_setup_resources(nr_ioapics); 3840 ioapic_res = ioapic_setup_resources(nr_ioapics);
3897 for (i = 0; i < nr_ioapics; i++) { 3841 for (i = 0; i < nr_ioapics; i++) {
3898 if (smp_found_config) { 3842 if (smp_found_config) {
3899 ioapic_phys = mp_ioapics[i].apicaddr; 3843 ioapic_phys = mpc_ioapic_addr(i);
3900#ifdef CONFIG_X86_32 3844#ifdef CONFIG_X86_32
3901 if (!ioapic_phys) { 3845 if (!ioapic_phys) {
3902 printk(KERN_ERR 3846 printk(KERN_ERR
@@ -3956,8 +3900,9 @@ int mp_find_ioapic(u32 gsi)
3956 3900
3957 /* Find the IOAPIC that manages this GSI. */ 3901 /* Find the IOAPIC that manages this GSI. */
3958 for (i = 0; i < nr_ioapics; i++) { 3902 for (i = 0; i < nr_ioapics; i++) {
3959 if ((gsi >= mp_gsi_routing[i].gsi_base) 3903 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
3960 && (gsi <= mp_gsi_routing[i].gsi_end)) 3904 if ((gsi >= gsi_cfg->gsi_base)
3905 && (gsi <= gsi_cfg->gsi_end))
3961 return i; 3906 return i;
3962 } 3907 }
3963 3908
@@ -3967,12 +3912,16 @@ int mp_find_ioapic(u32 gsi)
3967 3912
3968int mp_find_ioapic_pin(int ioapic, u32 gsi) 3913int mp_find_ioapic_pin(int ioapic, u32 gsi)
3969{ 3914{
3915 struct mp_ioapic_gsi *gsi_cfg;
3916
3970 if (WARN_ON(ioapic == -1)) 3917 if (WARN_ON(ioapic == -1))
3971 return -1; 3918 return -1;
3972 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) 3919
3920 gsi_cfg = mp_ioapic_gsi_routing(ioapic);
3921 if (WARN_ON(gsi > gsi_cfg->gsi_end))
3973 return -1; 3922 return -1;
3974 3923
3975 return gsi - mp_gsi_routing[ioapic].gsi_base; 3924 return gsi - gsi_cfg->gsi_base;
3976} 3925}
3977 3926
3978static __init int bad_ioapic(unsigned long address) 3927static __init int bad_ioapic(unsigned long address)
@@ -3994,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
3994{ 3943{
3995 int idx = 0; 3944 int idx = 0;
3996 int entries; 3945 int entries;
3946 struct mp_ioapic_gsi *gsi_cfg;
3997 3947
3998 if (bad_ioapic(address)) 3948 if (bad_ioapic(address))
3999 return; 3949 return;
4000 3950
4001 idx = nr_ioapics; 3951 idx = nr_ioapics;
4002 3952
4003 mp_ioapics[idx].type = MP_IOAPIC; 3953 ioapics[idx].mp_config.type = MP_IOAPIC;
4004 mp_ioapics[idx].flags = MPC_APIC_USABLE; 3954 ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
4005 mp_ioapics[idx].apicaddr = address; 3955 ioapics[idx].mp_config.apicaddr = address;
4006 3956
4007 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3957 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4008 mp_ioapics[idx].apicid = io_apic_unique_id(id); 3958 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
4009 mp_ioapics[idx].apicver = io_apic_get_version(idx); 3959 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
4010 3960
4011 /* 3961 /*
4012 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 3962 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4013 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 3963 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4014 */ 3964 */
4015 entries = io_apic_get_redir_entries(idx); 3965 entries = io_apic_get_redir_entries(idx);
4016 mp_gsi_routing[idx].gsi_base = gsi_base; 3966 gsi_cfg = mp_ioapic_gsi_routing(idx);
4017 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; 3967 gsi_cfg->gsi_base = gsi_base;
3968 gsi_cfg->gsi_end = gsi_base + entries - 1;
4018 3969
4019 /* 3970 /*
4020 * The number of IO-APIC IRQ registers (== #pins): 3971 * The number of IO-APIC IRQ registers (== #pins):
4021 */ 3972 */
4022 nr_ioapic_registers[idx] = entries; 3973 ioapics[idx].nr_registers = entries;
4023 3974
4024 if (mp_gsi_routing[idx].gsi_end >= gsi_top) 3975 if (gsi_cfg->gsi_end >= gsi_top)
4025 gsi_top = mp_gsi_routing[idx].gsi_end + 1; 3976 gsi_top = gsi_cfg->gsi_end + 1;
4026 3977
4027 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 3978 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4028 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 3979 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
4029 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, 3980 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
4030 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); 3981 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
4031 3982
4032 nr_ioapics++; 3983 nr_ioapics++;
4033} 3984}
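Throughout this file the open-coded mp_ioapics[], nr_ioapic_registers[] and mp_gsi_routing[] arrays are replaced by one ioapics[] array plus small accessors (mpc_ioapic_id/ver/addr, mp_ioapic_gsi_routing). One plausible shape of that consolidation, for orientation only; the real definitions live in io_apic.c and asm/io_apic.h and may differ:

	struct mpc_ioapic_cfg {				/* stand-in for struct mpc_ioapic */
		unsigned char type, apicid, apicver, flags;
		unsigned int  apicaddr;
	};

	struct mp_ioapic_gsi { unsigned int gsi_base, gsi_end; };

	struct ioapic {
		int nr_registers;			/* number of pins */
		struct mpc_ioapic_cfg mp_config;	/* MP-table data for this IO-APIC */
		struct mp_ioapic_gsi  gsi_config;	/* GSI range served by this IO-APIC */
		/* saved_registers, pin_programmed bitmap, ... */
	};

	static struct ioapic ioapics[16];

	static inline unsigned int mpc_ioapic_id(int idx)   { return ioapics[idx].mp_config.apicid; }
	static inline unsigned int mpc_ioapic_ver(int idx)  { return ioapics[idx].mp_config.apicver; }
	static inline unsigned int mpc_ioapic_addr(int idx) { return ioapics[idx].mp_config.apicaddr; }
	static inline struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int idx)
	{
		return &ioapics[idx].gsi_config;
	}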
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 6273eee5134b..c4a61ca1349a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -48,8 +48,6 @@
48#include <asm/e820.h> 48#include <asm/e820.h>
49#include <asm/ipi.h> 49#include <asm/ipi.h>
50 50
51#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
52
53int found_numaq; 51int found_numaq;
54 52
55/* 53/*
@@ -79,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
79static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
80{ 78{
81 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
82 83
83 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
84 85 ret = numa_add_memblk(node, start, end);
85 /* Convert to pages */ 86 BUG_ON(ret < 0);
86 node_start_pfn[node] =
87 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
88
89 node_end_pfn[node] =
90 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
91
92 memblock_x86_register_active_regions(node, node_start_pfn[node],
93 node_end_pfn[node]);
94
95 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
96
97 node_remap_size[node] = node_memmap_size_bytes(node,
98 node_start_pfn[node],
99 node_end_pfn[node]);
100} 87}
101 88
102/* 89/*
103 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
104 * 91 *
105 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
106 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
107 */ 94 */
108static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
109{ 96{
@@ -112,7 +99,6 @@ static void __init smp_dump_qct(void)
112 99
113 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
114 101
115 nodes_clear(node_online_map);
116 for_each_node(node) { 102 for_each_node(node) {
117 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
118 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -282,14 +268,14 @@ static __init void early_check_numaq(void)
282 } 268 }
283} 269}
284 270
285int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
286{ 272{
287 early_check_numaq(); 273 early_check_numaq();
288 if (!found_numaq) 274 if (!found_numaq)
289 return 0; 275 return -ENOENT;
290 smp_dump_qct(); 276 smp_dump_qct();
291 277
292 return 1; 278 return 0;
293} 279}
294 280
295#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -486,8 +472,8 @@ static void numaq_setup_portio_remap(void)
486 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 472 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
487} 473}
488 474
 489/* Use __refdata to quiet a false-positive warning. */ 475/* Use __refdata to quiet a false-positive warning. */
490struct apic __refdata apic_numaq = { 476static struct apic __refdata apic_numaq = {
491 477
492 .name = "NUMAQ", 478 .name = "NUMAQ",
493 .probe = probe_numaq, 479 .probe = probe_numaq,
@@ -551,3 +537,5 @@ struct apic __refdata apic_numaq = {
551 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid, 537 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
552 .x86_32_numa_cpu_node = numaq_numa_cpu_node, 538 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
553}; 539};
540
541apic_driver(apic_numaq);
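numaq_register_node() now hands the quad's memory range to numa_add_memblk() instead of poking node_start_pfn/node_end_pfn directly; the quad config table stores boundaries in megabytes, so they are shifted up to byte addresses first. A toy version of that conversion, with field names shortened and not taken from the real struct eachquadmem:

	#include <stdint.h>

	static void quad_mem_range(unsigned int hi_shrd_start_mb, unsigned int hi_shrd_size_mb,
				   unsigned int priv_size_mb, uint64_t *start, uint64_t *end)
	{
		*start = (uint64_t)(hi_shrd_start_mb - priv_size_mb) << 20;	/* MB -> bytes */
		*end   = (uint64_t)(hi_shrd_start_mb + hi_shrd_size_mb) << 20;
	}

	/* Example: start 1024 MB, shared size 2048 MB, private 256 MB
	 *   -> node memblk [0x30000000, 0xC0000000). */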
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index fc84c7b61108..b5254ad044ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,31 +52,6 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static int default_x86_32_early_logical_apicid(int cpu) 55static int default_x86_32_early_logical_apicid(int cpu)
81{ 56{
82 return 1 << cpu; 57 return 1 << cpu;
@@ -112,7 +87,7 @@ static int probe_default(void)
112 return 1; 87 return 1;
113} 88}
114 89
115struct apic apic_default = { 90static struct apic apic_default = {
116 91
117 .name = "default", 92 .name = "default",
118 .probe = probe_default, 93 .probe = probe_default,
@@ -172,47 +147,24 @@ struct apic apic_default = {
172 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 147 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
173 148
174 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid, 149 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
175 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
176}; 150};
177 151
178extern struct apic apic_numaq; 152apic_driver(apic_default);
179extern struct apic apic_summit;
180extern struct apic apic_bigsmp;
181extern struct apic apic_es7000;
182extern struct apic apic_es7000_cluster;
183 153
184struct apic *apic = &apic_default; 154struct apic *apic = &apic_default;
185EXPORT_SYMBOL_GPL(apic); 155EXPORT_SYMBOL_GPL(apic);
186 156
187static struct apic *apic_probe[] __initdata = {
188#ifdef CONFIG_X86_NUMAQ
189 &apic_numaq,
190#endif
191#ifdef CONFIG_X86_SUMMIT
192 &apic_summit,
193#endif
194#ifdef CONFIG_X86_BIGSMP
195 &apic_bigsmp,
196#endif
197#ifdef CONFIG_X86_ES7000
198 &apic_es7000,
199 &apic_es7000_cluster,
200#endif
201 &apic_default, /* must be last */
202 NULL,
203};
204
205static int cmdline_apic __initdata; 157static int cmdline_apic __initdata;
206static int __init parse_apic(char *arg) 158static int __init parse_apic(char *arg)
207{ 159{
208 int i; 160 struct apic **drv;
209 161
210 if (!arg) 162 if (!arg)
211 return -EINVAL; 163 return -EINVAL;
212 164
213 for (i = 0; apic_probe[i]; i++) { 165 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
214 if (!strcmp(apic_probe[i]->name, arg)) { 166 if (!strcmp((*drv)->name, arg)) {
215 apic = apic_probe[i]; 167 apic = *drv;
216 cmdline_apic = 1; 168 cmdline_apic = 1;
217 return 0; 169 return 0;
218 } 170 }
@@ -223,38 +175,58 @@ static int __init parse_apic(char *arg)
223} 175}
224early_param("apic", parse_apic); 176early_param("apic", parse_apic);
225 177
226void __init generic_bigsmp_probe(void) 178void __init default_setup_apic_routing(void)
227{ 179{
180 int version = apic_version[boot_cpu_physical_apicid];
181
182 if (num_possible_cpus() > 8) {
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_INTEL:
185 if (!APIC_XAPIC(version)) {
186 def_to_bigsmp = 0;
187 break;
188 }
189 /* If P4 and above fall through */
190 case X86_VENDOR_AMD:
191 def_to_bigsmp = 1;
192 }
193 }
194
228#ifdef CONFIG_X86_BIGSMP 195#ifdef CONFIG_X86_BIGSMP
229 /* 196 /*
230 * This routine is used to switch to bigsmp mode when 197 * This is used to switch to bigsmp mode when
231 * - There is no apic= option specified by the user 198 * - There is no apic= option specified by the user
232 * - generic_apic_probe() has chosen apic_default as the sub_arch 199 * - generic_apic_probe() has chosen apic_default as the sub_arch
233 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
234 */ 201 */
235 202
236 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default) {
237 if (apic_bigsmp.probe()) { 204 struct apic *bigsmp = generic_bigsmp_probe();
238 apic = &apic_bigsmp; 205 if (bigsmp) {
206 apic = bigsmp;
239 printk(KERN_INFO "Overriding APIC driver with %s\n", 207 printk(KERN_INFO "Overriding APIC driver with %s\n",
240 apic->name); 208 apic->name);
241 } 209 }
242 } 210 }
243#endif 211#endif
212
213 if (apic->setup_apic_routing)
214 apic->setup_apic_routing();
244} 215}
245 216
246void __init generic_apic_probe(void) 217void __init generic_apic_probe(void)
247{ 218{
248 if (!cmdline_apic) { 219 if (!cmdline_apic) {
249 int i; 220 struct apic **drv;
250 for (i = 0; apic_probe[i]; i++) { 221
251 if (apic_probe[i]->probe()) { 222 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
252 apic = apic_probe[i]; 223 if ((*drv)->probe()) {
224 apic = *drv;
253 break; 225 break;
254 } 226 }
255 } 227 }
256 /* Not visible without early console */ 228 /* Not visible without early console */
257 if (!apic_probe[i]) 229 if (drv == __apicdrivers_end)
258 panic("Didn't find an APIC driver"); 230 panic("Didn't find an APIC driver");
259 } 231 }
260 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 232 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -265,16 +237,16 @@ void __init generic_apic_probe(void)
265int __init 237int __init
266generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 238generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
267{ 239{
268 int i; 240 struct apic **drv;
269 241
270 for (i = 0; apic_probe[i]; ++i) { 242 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
271 if (!apic_probe[i]->mps_oem_check) 243 if (!((*drv)->mps_oem_check))
272 continue; 244 continue;
273 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) 245 if (!(*drv)->mps_oem_check(mpc, oem, productid))
274 continue; 246 continue;
275 247
276 if (!cmdline_apic) { 248 if (!cmdline_apic) {
277 apic = apic_probe[i]; 249 apic = *drv;
278 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 250 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
279 apic->name); 251 apic->name);
280 } 252 }
@@ -285,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
285 257
286int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 258int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
287{ 259{
288 int i; 260 struct apic **drv;
289 261
290 for (i = 0; apic_probe[i]; ++i) { 262 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
291 if (!apic_probe[i]->acpi_madt_oem_check) 263 if (!(*drv)->acpi_madt_oem_check)
292 continue; 264 continue;
293 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) 265 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
294 continue; 266 continue;
295 267
296 if (!cmdline_apic) { 268 if (!cmdline_apic) {
297 apic = apic_probe[i]; 269 apic = *drv;
298 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 270 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
299 apic->name); 271 apic->name);
300 } 272 }
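The probe_32.c changes above replace the hand-maintained, NULL-terminated apic_probe[] array with the apic_driver() registration used by every driver in this series: each file drops a pointer to its struct apic into a dedicated linker section, and the probe loops walk that section from __apicdrivers to __apicdrivers_end. A sketch of the technique, not the exact asm/apic.h macro, with a minimal stand-in struct apic:

	struct apic {
		const char *name;
		int (*probe)(void);
		/* the real struct apic has many more hooks */
	};

	#define apic_driver(sym)						\
		static struct apic *__apicdrivers_##sym				\
		__attribute__((used, section(".apicdrivers"),			\
			       aligned(sizeof(struct apic *)))) = &sym

	extern struct apic *__apicdrivers[], *__apicdrivers_end[];

	static struct apic *probe_first_matching(void)
	{
		struct apic **drv;

		for (drv = __apicdrivers; drv < __apicdrivers_end; drv++)
			if ((*drv)->probe && (*drv)->probe())
				return *drv;	/* link order decides driver priority */
		return NULL;
	}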
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index d8c4a6feb286..3fe986698929 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2xpic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) 26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{ 27{
49 return hard_smp_processor_id() >> index_msb; 28 return hard_smp_processor_id() >> index_msb;
@@ -54,26 +33,20 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 33 */
55void __init default_setup_apic_routing(void) 34void __init default_setup_apic_routing(void)
56{ 35{
36 struct apic **drv;
57 37
58 enable_IR_x2apic(); 38 enable_IR_x2apic();
59 39
60#ifdef CONFIG_X86_X2APIC 40 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
61 if (x2apic_mode 41 if ((*drv)->probe && (*drv)->probe()) {
62#ifdef CONFIG_X86_UV 42 if (apic != *drv) {
63 && apic != &apic_x2apic_uv_x 43 apic = *drv;
64#endif 44 pr_info("Switched APIC routing to %s.\n",
65 ) { 45 apic->name);
66 if (x2apic_phys) 46 }
67 apic = &apic_x2apic_phys; 47 break;
68 else 48 }
69 apic = &apic_x2apic_cluster;
70 } 49 }
71#endif
72
73 if (apic == &apic_flat && num_possible_cpus() > 8)
74 apic = &apic_physflat;
75
76 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
77 50
78 if (is_vsmp_box()) { 51 if (is_vsmp_box()) {
79 /* need to update phys_pkg_id */ 52 /* need to update phys_pkg_id */
@@ -90,13 +63,15 @@ void apic_send_IPI_self(int vector)
90 63
91int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 64int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
92{ 65{
93 int i; 66 struct apic **drv;
94 67
95 for (i = 0; apic_probe[i]; ++i) { 68 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
96 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 69 if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
97 apic = apic_probe[i]; 70 if (apic != *drv) {
98 printk(KERN_INFO "Setting APIC routing to %s.\n", 71 apic = *drv;
99 apic->name); 72 pr_info("Setting APIC routing to %s.\n",
73 apic->name);
74 }
100 return 1; 75 return 1;
101 } 76 }
102 } 77 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index e4b8059b414a..19114423c58c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -491,7 +491,7 @@ void setup_summit(void)
491} 491}
492#endif 492#endif
493 493
494struct apic apic_summit = { 494static struct apic apic_summit = {
495 495
496 .name = "summit", 496 .name = "summit",
497 .probe = probe_summit, 497 .probe = probe_summit,
@@ -551,5 +551,6 @@ struct apic apic_summit = {
551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552 552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid, 553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
554 .x86_32_numa_cpu_node = default_x86_32_numa_cpu_node,
555}; 554};
555
556apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index 90949bbd566d..500795875827 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8#include <linux/cpu.h>
8 9
9#include <asm/smp.h> 10#include <asm/smp.h>
10#include <asm/apic.h> 11#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 12
13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
15static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
14 16
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 17static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 18{
17 return x2apic_enabled(); 19 return x2apic_enabled();
18} 20}
19 21
20/* 22static inline u32 x2apic_cluster(int cpu)
21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
24static const struct cpumask *x2apic_target_cpus(void)
25{ 23{
26 return cpu_online_mask; 24 return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
27}
28
29/*
30 * for now each logical cpu is in its own vector allocation domain.
31 */
32static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
33{
34 cpumask_clear(retmask);
35 cpumask_set_cpu(cpu, retmask);
36} 25}
37 26
38static void 27static void
39 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 28__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
40{ 29{
41 unsigned long cfg; 30 struct cpumask *cpus_in_cluster_ptr;
31 struct cpumask *ipi_mask_ptr;
32 unsigned int cpu, this_cpu;
33 unsigned long flags;
34 u32 dest;
35
36 x2apic_wrmsr_fence();
37
38 local_irq_save(flags);
42 39
43 cfg = __prepare_ICR(0, vector, dest); 40 this_cpu = smp_processor_id();
44 41
45 /* 42 /*
 46 * send the IPI. 43 * We are going to modify the mask, so we need our own copy
44 * and be sure it's manipulated with irq off.
47 */ 45 */
48 native_x2apic_icr_write(cfg, apicid); 46 ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
49} 47 cpumask_copy(ipi_mask_ptr, mask);
50 48
51/* 49 /*
52 * for now, we send the IPI's one by one in the cpumask. 50 * The idea is to send one IPI per cluster.
53 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group 51 */
54 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 52 for_each_cpu(cpu, ipi_mask_ptr) {
55 * writes. 53 unsigned long i;
56 */
57static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 unsigned long query_cpu;
60 unsigned long flags;
61 54
62 x2apic_wrmsr_fence(); 55 cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
56 dest = 0;
63 57
64 local_irq_save(flags); 58 /* Collect cpus in cluster. */
65 for_each_cpu(query_cpu, mask) { 59 for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
66 __x2apic_send_IPI_dest( 60 if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
67 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 61 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
68 vector, apic->dest_logical); 62 }
63
64 if (!dest)
65 continue;
66
67 __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
68 /*
 69 * Cluster sibling cpus should be discarded now so
 70 * we do not send an IPI to them a second time.
71 */
72 cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
69 } 73 }
74
70 local_irq_restore(flags); 75 local_irq_restore(flags);
71} 76}
72 77
78static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
79{
80 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
81}
82
73static void 83static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 85{
76 unsigned long this_cpu = smp_processor_id(); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu == this_cpu)
85 continue;
86 __x2apic_send_IPI_dest(
87 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
88 vector, apic->dest_logical);
89 }
90 local_irq_restore(flags);
91} 87}
92 88
93static void x2apic_send_IPI_allbutself(int vector) 89static void x2apic_send_IPI_allbutself(int vector)
94{ 90{
95 unsigned long this_cpu = smp_processor_id(); 91 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
96 unsigned long query_cpu;
97 unsigned long flags;
98
99 x2apic_wrmsr_fence();
100
101 local_irq_save(flags);
102 for_each_online_cpu(query_cpu) {
103 if (query_cpu == this_cpu)
104 continue;
105 __x2apic_send_IPI_dest(
106 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
107 vector, apic->dest_logical);
108 }
109 local_irq_restore(flags);
110} 92}
111 93
112static void x2apic_send_IPI_all(int vector) 94static void x2apic_send_IPI_all(int vector)
113{ 95{
114 x2apic_send_IPI_mask(cpu_online_mask, vector); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
115}
116
117static int x2apic_apic_id_registered(void)
118{
119 return 1;
120} 97}
121 98
122static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
151 return per_cpu(x86_cpu_to_logical_apicid, cpu); 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152} 129}
153 130
154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 131static void init_x2apic_ldr(void)
155{ 132{
156 unsigned int id; 133 unsigned int this_cpu = smp_processor_id();
134 unsigned int cpu;
157 135
158 id = x; 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
159 return id; 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 }
160} 145}
161 146
162static unsigned long set_apic_id(unsigned int id) 147 /*
 148 * On CPU state changes, update the x2apic cluster sibling info.
149 */
150static int __cpuinit
151update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
163{ 152{
164 unsigned long x; 153 unsigned int this_cpu = (unsigned long)hcpu;
154 unsigned int cpu;
155 int err = 0;
156
157 switch (action) {
158 case CPU_UP_PREPARE:
159 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
160 GFP_KERNEL)) {
161 err = -ENOMEM;
162 } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
163 GFP_KERNEL)) {
164 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
165 err = -ENOMEM;
166 }
167 break;
168 case CPU_UP_CANCELED:
169 case CPU_UP_CANCELED_FROZEN:
170 case CPU_DEAD:
171 for_each_online_cpu(cpu) {
172 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
173 continue;
174 __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
175 __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
176 }
177 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
178 free_cpumask_var(per_cpu(ipi_mask, this_cpu));
179 break;
180 }
165 181
166 x = id; 182 return notifier_from_errno(err);
167 return x;
168} 183}
169 184
170static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 185static struct notifier_block __refdata x2apic_cpu_notifier = {
171{ 186 .notifier_call = update_clusterinfo,
172 return initial_apicid >> index_msb; 187};
173}
174 188
175static void x2apic_send_IPI_self(int vector) 189static int x2apic_init_cpu_notifier(void)
176{ 190{
177 apic_write(APIC_SELF_IPI, vector); 191 int cpu = smp_processor_id();
192
193 zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
194 zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1;
178} 201}
179 202
180static void init_x2apic_ldr(void) 203static int x2apic_cluster_probe(void)
181{ 204{
182 int cpu = smp_processor_id(); 205 if (x2apic_mode)
183 206 return x2apic_init_cpu_notifier();
184 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 207 else
208 return 0;
185} 209}
186 210
187struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
188 212
189 .name = "cluster x2apic", 213 .name = "cluster x2apic",
190 .probe = NULL, 214 .probe = x2apic_cluster_probe,
191 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
192 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
193 217
@@ -211,11 +235,11 @@ struct apic apic_x2apic_cluster = {
211 .setup_portio_remap = NULL, 235 .setup_portio_remap = NULL,
212 .check_phys_apicid_present = default_check_phys_apicid_present, 236 .check_phys_apicid_present = default_check_phys_apicid_present,
213 .enable_apic_mode = NULL, 237 .enable_apic_mode = NULL,
214 .phys_pkg_id = x2apic_cluster_phys_pkg_id, 238 .phys_pkg_id = x2apic_phys_pkg_id,
215 .mps_oem_check = NULL, 239 .mps_oem_check = NULL,
216 240
217 .get_apic_id = x2apic_cluster_phys_get_apic_id, 241 .get_apic_id = x2apic_get_apic_id,
218 .set_apic_id = set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
219 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
220 244
221 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -240,3 +264,5 @@ struct apic apic_x2apic_cluster = {
240 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
241 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 265 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
242}; 266};
267
268apic_driver(apic_x2apic_cluster);
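The new __x2apic_send_IPI_mask() above sends one IPI per cluster instead of one per CPU: logical x2APIC IDs carry the cluster number in bits 31..16, so the logical IDs of all targeted siblings in a cluster can be OR-ed into a single destination and covered by one ICR write. A self-contained model of that coalescing, purely illustrative and not the kernel implementation:

	#include <stdint.h>

	static uint32_t cluster_of(uint32_t logical_id) { return logical_id >> 16; }

	/* ldr[cpu]: logical APIC id; want[cpu]: non-zero if cpu is an IPI target.
	 * Returns the number of ICR writes a cluster-coalescing sender would issue. */
	static int count_cluster_ipis(const uint32_t *ldr, const int *want, int ncpus)
	{
		int done[64] = { 0 };	/* model assumption: ncpus <= 64 */
		int cpu, i, writes = 0;

		for (cpu = 0; cpu < ncpus; cpu++) {
			uint32_t dest = 0;

			if (!want[cpu] || done[cpu])
				continue;
			for (i = 0; i < ncpus; i++) {
				if (!want[i] || cluster_of(ldr[i]) != cluster_of(ldr[cpu]))
					continue;
				dest |= ldr[i];	/* collect every sibling in this cluster */
				done[i] = 1;	/* and make sure it is not IPI'd again */
			}
			if (dest)
				writes++;	/* one logical-mode ICR write per cluster */
		}
		return writes;
	}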
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index c7e6d6645bf4..f5373dfde21e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h> 10#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 11
13int x2apic_phys; 12int x2apic_phys;
14 13
14static struct apic apic_x2apic_phys;
15
15static int set_x2apic_phys_mode(char *arg) 16static int set_x2apic_phys_mode(char *arg)
16{ 17{
17 x2apic_phys = 1; 18 x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 28 return 0;
28} 29}
29 30
30/* 31static void
31 * need to use more than cpu 0, because we need more vectors when 32__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
32 * MSI-X are used.
33 */
34static const struct cpumask *x2apic_target_cpus(void)
35{
36 return cpu_online_mask;
37}
38
39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
40{
41 cpumask_clear(retmask);
42 cpumask_set_cpu(cpu, retmask);
43}
44
45static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 unsigned int dest)
47{
48 unsigned long cfg;
49
50 cfg = __prepare_ICR(0, vector, dest);
51
52 /*
53 * send the IPI.
54 */
55 native_x2apic_icr_write(cfg, apicid);
56}
57
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 33{
60 unsigned long query_cpu; 34 unsigned long query_cpu;
35 unsigned long this_cpu;
61 unsigned long flags; 36 unsigned long flags;
62 37
63 x2apic_wrmsr_fence(); 38 x2apic_wrmsr_fence();
64 39
65 local_irq_save(flags); 40 local_irq_save(flags);
41
42 this_cpu = smp_processor_id();
66 for_each_cpu(query_cpu, mask) { 43 for_each_cpu(query_cpu, mask) {
44 if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
45 continue;
67 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 46 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
68 vector, APIC_DEST_PHYSICAL); 47 vector, APIC_DEST_PHYSICAL);
69 } 48 }
70 local_irq_restore(flags); 49 local_irq_restore(flags);
71} 50}
72 51
52static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
53{
54 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
55}
56
73static void 57static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 58 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 59{
76 unsigned long this_cpu = smp_processor_id(); 60 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu != this_cpu)
85 __x2apic_send_IPI_dest(
86 per_cpu(x86_cpu_to_apicid, query_cpu),
87 vector, APIC_DEST_PHYSICAL);
88 }
89 local_irq_restore(flags);
90} 61}
91 62
92static void x2apic_send_IPI_allbutself(int vector) 63static void x2apic_send_IPI_allbutself(int vector)
93{ 64{
94 unsigned long this_cpu = smp_processor_id(); 65 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
95 unsigned long query_cpu;
96 unsigned long flags;
97
98 x2apic_wrmsr_fence();
99
100 local_irq_save(flags);
101 for_each_online_cpu(query_cpu) {
102 if (query_cpu == this_cpu)
103 continue;
104 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
105 vector, APIC_DEST_PHYSICAL);
106 }
107 local_irq_restore(flags);
108} 66}
109 67
110static void x2apic_send_IPI_all(int vector) 68static void x2apic_send_IPI_all(int vector)
111{ 69{
112 x2apic_send_IPI_mask(cpu_online_mask, vector); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118} 71}
119 72
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
149 return per_cpu(x86_cpu_to_apicid, cpu); 102 return per_cpu(x86_cpu_to_apicid, cpu);
150} 103}
151 104
152static unsigned int x2apic_phys_get_apic_id(unsigned long x) 105static void init_x2apic_ldr(void)
153{
154 return x;
155}
156
157static unsigned long set_apic_id(unsigned int id)
158{
159 return id;
160}
161
162static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
163{ 106{
164 return initial_apicid >> index_msb;
165} 107}
166 108
167static void x2apic_send_IPI_self(int vector) 109static int x2apic_phys_probe(void)
168{ 110{
169 apic_write(APIC_SELF_IPI, vector); 111 if (x2apic_mode && x2apic_phys)
170} 112 return 1;
171 113
172static void init_x2apic_ldr(void) 114 return apic == &apic_x2apic_phys;
173{
174} 115}
175 116
176struct apic apic_x2apic_phys = { 117static struct apic apic_x2apic_phys = {
177 118
178 .name = "physical x2apic", 119 .name = "physical x2apic",
179 .probe = NULL, 120 .probe = x2apic_phys_probe,
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
182 123
@@ -203,8 +144,8 @@ struct apic apic_x2apic_phys = {
203 .phys_pkg_id = x2apic_phys_pkg_id, 144 .phys_pkg_id = x2apic_phys_pkg_id,
204 .mps_oem_check = NULL, 145 .mps_oem_check = NULL,
205 146
206 .get_apic_id = x2apic_phys_get_apic_id, 147 .get_apic_id = x2apic_get_apic_id,
207 .set_apic_id = set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
208 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
209 150
210 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -229,3 +170,5 @@ struct apic apic_x2apic_phys = {
229 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
230 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 171 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
231}; 172};
173
174apic_driver(apic_x2apic_phys);
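In x2apic_phys.c the separate mask, mask-allbutself, allbutself and all senders collapse into one helper parameterized by an "include self or not" flag (APIC_DEST_ALLINC vs APIC_DEST_ALLBUT above). A simplified shape of that refactor, with placeholder names and callback rather than kernel API:

	enum ipi_dest { DEST_ALL_INCLUDING_SELF, DEST_ALL_BUT_SELF };

	static void send_ipi_to_mask(const int *mask, int ncpus, int this_cpu,
				     enum ipi_dest dest, void (*send_one)(int cpu))
	{
		int cpu;

		for (cpu = 0; cpu < ncpus; cpu++) {
			if (!mask[cpu])
				continue;
			if (dest == DEST_ALL_BUT_SELF && cpu == this_cpu)
				continue;	/* "allbutself" variants skip the caller */
			send_one(cpu);		/* per-CPU physical-mode ICR write */
		}
	}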
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 7acd2d2ac965..b511a011b7d0 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -58,6 +58,8 @@ unsigned int uv_apicid_hibits;
58EXPORT_SYMBOL_GPL(uv_apicid_hibits); 58EXPORT_SYMBOL_GPL(uv_apicid_hibits);
59static DEFINE_SPINLOCK(uv_nmi_lock); 59static DEFINE_SPINLOCK(uv_nmi_lock);
60 60
61static struct apic apic_x2apic_uv_x;
62
61static unsigned long __init uv_early_read_mmr(unsigned long addr) 63static unsigned long __init uv_early_read_mmr(unsigned long addr)
62{ 64{
63 unsigned long val, *mmr; 65 unsigned long val, *mmr;
@@ -89,6 +91,10 @@ static int __init early_get_pnodeid(void)
89 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR); 91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
90 uv_min_hub_revision_id = node_id.s.revision; 92 uv_min_hub_revision_id = node_id.s.revision;
91 93
94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96
97 uv_hub_info->hub_revision = uv_min_hub_revision_id;
92 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); 98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
93 return pnode; 99 return pnode;
94} 100}
@@ -110,17 +116,25 @@ static void __init early_get_apic_pnode_shift(void)
110 */ 116 */
111static void __init uv_set_apicid_hibit(void) 117static void __init uv_set_apicid_hibit(void)
112{ 118{
113 union uvh_lb_target_physical_apic_id_mask_u apicid_mask; 119 union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
114 120
115 apicid_mask.v = uv_early_read_mmr(UVH_LB_TARGET_PHYSICAL_APIC_ID_MASK); 121 if (is_uv1_hub()) {
116 uv_apicid_hibits = apicid_mask.s.bit_enables & UV_APICID_HIBIT_MASK; 122 apicid_mask.v =
123 uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
124 uv_apicid_hibits =
125 apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
126 }
117} 127}
118 128
119static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 129static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
120{ 130{
121 int pnodeid; 131 int pnodeid, is_uv1, is_uv2;
122 132
123 if (!strcmp(oem_id, "SGI")) { 133 is_uv1 = !strcmp(oem_id, "SGI");
134 is_uv2 = !strcmp(oem_id, "SGI2");
135 if (is_uv1 || is_uv2) {
136 uv_hub_info->hub_revision =
137 is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
124 pnodeid = early_get_pnodeid(); 138 pnodeid = early_get_pnodeid();
125 early_get_apic_pnode_shift(); 139 early_get_apic_pnode_shift();
126 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 140 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
@@ -326,10 +340,15 @@ static void uv_send_IPI_self(int vector)
326 apic_write(APIC_SELF_IPI, vector); 340 apic_write(APIC_SELF_IPI, vector);
327} 341}
328 342
329struct apic __refdata apic_x2apic_uv_x = { 343static int uv_probe(void)
344{
345 return apic == &apic_x2apic_uv_x;
346}
347
348static struct apic __refdata apic_x2apic_uv_x = {
330 349
331 .name = "UV large system", 350 .name = "UV large system",
332 .probe = NULL, 351 .probe = uv_probe,
333 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 352 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
334 .apic_id_registered = uv_apic_id_registered, 353 .apic_id_registered = uv_apic_id_registered,
335 354
@@ -477,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
477static __init void map_mmioh_high(int max_pnode) 496static __init void map_mmioh_high(int max_pnode)
478{ 497{
479 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 498 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
480 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 499 int shift;
481 500
482 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 501 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
483 if (mmioh.s.enable) 502 if (is_uv1_hub() && mmioh.s1.enable) {
484 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, 503 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
504 map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
485 max_pnode, map_uc); 505 max_pnode, map_uc);
506 }
507 if (is_uv2_hub() && mmioh.s2.enable) {
508 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
509 map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
510 max_pnode, map_uc);
511 }
486} 512}
487 513
488static __init void map_low_mmrs(void) 514static __init void map_low_mmrs(void)
@@ -729,13 +755,14 @@ void __init uv_system_init(void)
729 unsigned long mmr_base, present, paddr; 755 unsigned long mmr_base, present, paddr;
730 unsigned short pnode_mask, pnode_io_mask; 756 unsigned short pnode_mask, pnode_io_mask;
731 757
758 printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
732 map_low_mmrs(); 759 map_low_mmrs();
733 760
734 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR ); 761 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
735 m_val = m_n_config.s.m_skt; 762 m_val = m_n_config.s.m_skt;
736 n_val = m_n_config.s.n_skt; 763 n_val = m_n_config.s.n_skt;
737 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 764 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
738 n_io = mmioh.s.n_io; 765 n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
739 mmr_base = 766 mmr_base =
740 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 767 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
741 ~UV_MMR_ENABLE; 768 ~UV_MMR_ENABLE;
@@ -804,6 +831,8 @@ void __init uv_system_init(void)
804 */ 831 */
805 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask; 832 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
806 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift; 833 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
834 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
835
807 pnode = uv_apicid_to_pnode(apicid); 836 pnode = uv_apicid_to_pnode(apicid);
808 blade = boot_pnode_to_blade(pnode); 837 blade = boot_pnode_to_blade(pnode);
809 lcpu = uv_blade_info[blade].nr_possible_cpus; 838 lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -859,3 +888,5 @@ void __init uv_system_init(void)
859 if (is_kdump_kernel()) 888 if (is_kdump_kernel())
860 reboot_type = BOOT_ACPI; 889 reboot_type = BOOT_ACPI;
861} 890}
891
892apic_driver(apic_x2apic_uv_x);
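
Note: for readers skimming the x2apic_uv_x hunks above, the new OEM-id handling reduces to one decision: "SGI" selects the UV1 hub revision base, "SGI2" the UV2 one, and everything downstream (is_uv1_hub()/is_uv2_hub(), the s1/s2 MMR union members) keys off the stored revision. Below is a minimal standalone sketch of that decision only; the revision-base constants and the hub_revision_for() helper are illustrative placeholders, not the definitions from uv_hub.h.

#include <stdio.h>
#include <string.h>

#define UV1_HUB_REVISION_BASE	1	/* placeholder values, illustration only */
#define UV2_HUB_REVISION_BASE	3

/* Return the hub revision implied by the MADT oem_id, or 0 if not UV. */
static int hub_revision_for(const char *oem_id)
{
	int is_uv1 = !strcmp(oem_id, "SGI");
	int is_uv2 = !strcmp(oem_id, "SGI2");

	if (!is_uv1 && !is_uv2)
		return 0;
	return is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
}

int main(void)
{
	const char *ids[] = { "SGI", "SGI2", "IBM" };
	int i;

	for (i = 0; i < 3; i++)
		printf("%-4s -> hub_revision %d\n", ids[i], hub_revision_for(ids[i]));
	return 0;
}
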
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index adee12e0da1f..965a7666c283 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -361,6 +361,7 @@ struct apm_user {
361 * idle percentage above which bios idle calls are done 361 * idle percentage above which bios idle calls are done
362 */ 362 */
363#ifdef CONFIG_APM_CPU_IDLE 363#ifdef CONFIG_APM_CPU_IDLE
364#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
364#define DEFAULT_IDLE_THRESHOLD 95 365#define DEFAULT_IDLE_THRESHOLD 95
365#else 366#else
366#define DEFAULT_IDLE_THRESHOLD 100 367#define DEFAULT_IDLE_THRESHOLD 100
@@ -904,6 +905,7 @@ static void apm_cpu_idle(void)
904 unsigned int jiffies_since_last_check = jiffies - last_jiffies; 905 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
905 unsigned int bucket; 906 unsigned int bucket;
906 907
908 WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
907recalc: 909recalc:
908 if (jiffies_since_last_check > IDLE_CALC_LIMIT) { 910 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
909 use_apm_idle = 0; 911 use_apm_idle = 0;
@@ -1238,7 +1240,6 @@ static int suspend(int vetoable)
1238 dpm_suspend_noirq(PMSG_SUSPEND); 1240 dpm_suspend_noirq(PMSG_SUSPEND);
1239 1241
1240 local_irq_disable(); 1242 local_irq_disable();
1241 sysdev_suspend(PMSG_SUSPEND);
1242 syscore_suspend(); 1243 syscore_suspend();
1243 1244
1244 local_irq_enable(); 1245 local_irq_enable();
@@ -1258,7 +1259,6 @@ static int suspend(int vetoable)
1258 err = (err == APM_SUCCESS) ? 0 : -EIO; 1259 err = (err == APM_SUCCESS) ? 0 : -EIO;
1259 1260
1260 syscore_resume(); 1261 syscore_resume();
1261 sysdev_resume();
1262 local_irq_enable(); 1262 local_irq_enable();
1263 1263
1264 dpm_resume_noirq(PMSG_RESUME); 1264 dpm_resume_noirq(PMSG_RESUME);
@@ -1282,7 +1282,6 @@ static void standby(void)
1282 dpm_suspend_noirq(PMSG_SUSPEND); 1282 dpm_suspend_noirq(PMSG_SUSPEND);
1283 1283
1284 local_irq_disable(); 1284 local_irq_disable();
1285 sysdev_suspend(PMSG_SUSPEND);
1286 syscore_suspend(); 1285 syscore_suspend();
1287 local_irq_enable(); 1286 local_irq_enable();
1288 1287
@@ -1292,7 +1291,6 @@ static void standby(void)
1292 1291
1293 local_irq_disable(); 1292 local_irq_disable();
1294 syscore_resume(); 1293 syscore_resume();
1295 sysdev_resume();
1296 local_irq_enable(); 1294 local_irq_enable();
1297 1295
1298 dpm_resume_noirq(PMSG_RESUME); 1296 dpm_resume_noirq(PMSG_RESUME);
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a01..6042981d0309 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
30 30
31obj-$(CONFIG_X86_MCE) += mcheck/ 31obj-$(CONFIG_X86_MCE) += mcheck/
32obj-$(CONFIG_MTRR) += mtrr/ 32obj-$(CONFIG_MTRR) += mtrr/
33obj-$(CONFIG_CPU_FREQ) += cpufreq/
34 33
35obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 34obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
36 35
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index 6f9d1f6063e9..b13ed393dfce 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -612,8 +612,11 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
612 } 612 }
613#endif 613#endif
614 614
615 /* As a rule processors have APIC timer running in deep C states */ 615 /*
616 if (c->x86 > 0xf && !cpu_has_amd_erratum(amd_erratum_400)) 616 * Family 0x12 and above processors have APIC timer
617 * running in deep C states.
618 */
619 if (c->x86 > 0x11)
617 set_cpu_cap(c, X86_FEATURE_ARAT); 620 set_cpu_cap(c, X86_FEATURE_ARAT);
618 621
619 /* 622 /*
@@ -629,10 +632,13 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
629 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012 632 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
630 */ 633 */
631 u64 mask; 634 u64 mask;
635 int err;
632 636
633 rdmsrl(MSR_AMD64_MCx_MASK(4), mask); 637 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
634 mask |= (1 << 10); 638 if (err == 0) {
635 wrmsrl(MSR_AMD64_MCx_MASK(4), mask); 639 mask |= (1 << 10);
640 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
641 }
636 } 642 }
637} 643}
638 644
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb3018..525514cf33c3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
19 19
20static int __init no_halt(char *s) 20static int __init no_halt(char *s)
21{ 21{
22 WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
22 boot_cpu_data.hlt_works_ok = 0; 23 boot_cpu_data.hlt_works_ok = 0;
23 return 1; 24 return 1;
24} 25}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index e2ced0074a45..22a073d7fbff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
254} 254}
255#endif 255#endif
256 256
257static int disable_smep __cpuinitdata;
258static __init int setup_disable_smep(char *arg)
259{
260 disable_smep = 1;
261 return 1;
262}
263__setup("nosmep", setup_disable_smep);
264
265static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
266{
267 if (cpu_has(c, X86_FEATURE_SMEP)) {
268 if (unlikely(disable_smep)) {
269 setup_clear_cpu_cap(X86_FEATURE_SMEP);
270 clear_in_cr4(X86_CR4_SMEP);
271 } else
272 set_in_cr4(X86_CR4_SMEP);
273 }
274}
275
257/* 276/*
258 * Some CPU features depend on higher CPUID levels, which may not always 277 * Some CPU features depend on higher CPUID levels, which may not always
259 * be available due to CPUID level capping or broken virtualization 278 * be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
458 if (smp_num_siblings <= 1) 477 if (smp_num_siblings <= 1)
459 goto out; 478 goto out;
460 479
461 if (smp_num_siblings > nr_cpu_ids) {
462 pr_warning("CPU: Unsupported number of siblings %d",
463 smp_num_siblings);
464 smp_num_siblings = 1;
465 return;
466 }
467
468 index_msb = get_count_order(smp_num_siblings); 480 index_msb = get_count_order(smp_num_siblings);
469 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 481 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
470 482
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
565 577
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); 578 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567 579
568 if (eax > 0) 580 c->x86_capability[9] = ebx;
569 c->x86_capability[9] = ebx;
570 } 581 }
571 582
572 /* AMD-defined flags: level 0x80000001 */ 583 /* AMD-defined flags: level 0x80000001 */
@@ -668,6 +679,8 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
668 c->cpu_index = 0; 679 c->cpu_index = 0;
669#endif 680#endif
670 filter_cpuid_features(c, false); 681 filter_cpuid_features(c, false);
682
683 setup_smep(c);
671} 684}
672 685
673void __init early_cpu_init(void) 686void __init early_cpu_init(void)
@@ -753,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
753#endif 766#endif
754 } 767 }
755 768
769 setup_smep(c);
770
756 get_model_name(c); /* Default name */ 771 get_model_name(c); /* Default name */
757 772
758 detect_nopl(c); 773 detect_nopl(c);
@@ -887,7 +902,7 @@ static void vgetcpu_set_mode(void)
887void __init identify_boot_cpu(void) 902void __init identify_boot_cpu(void)
888{ 903{
889 identify_cpu(&boot_cpu_data); 904 identify_cpu(&boot_cpu_data);
890 init_c1e_mask(); 905 init_amd_e400_c1e_mask();
891#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
892 sysenter_setup(); 907 sysenter_setup();
893 enable_sep_cpu(); 908 enable_sep_cpu();
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
27config X86_ACPI_CPUFREQ
28 tristate "ACPI Processor P-States driver"
29 select CPU_FREQ_TABLE
30 depends on ACPI_PROCESSOR
31 help
32 This driver adds a CPUFreq driver which utilizes the ACPI
33 Processor Performance States.
34 This driver also supports Intel Enhanced Speedstep.
35
36 To compile this driver as a module, choose M here: the
37 module will be called acpi-cpufreq.
38
39 For details, take a look at <file:Documentation/cpu-freq/>.
40
41 If in doubt, say N.
42
43config ELAN_CPUFREQ
44 tristate "AMD Elan SC400 and SC410"
45 select CPU_FREQ_TABLE
46 depends on X86_ELAN
47 ---help---
48 This adds the CPUFreq driver for AMD Elan SC400 and SC410
49 processors.
50
51 You need to specify the processor maximum speed as boot
52 parameter: elanfreq=maxspeed (in kHz) or as module
53 parameter "max_freq".
54
55 For details, take a look at <file:Documentation/cpu-freq/>.
56
57 If in doubt, say N.
58
59config SC520_CPUFREQ
60 tristate "AMD Elan SC520"
61 select CPU_FREQ_TABLE
62 depends on X86_ELAN
63 ---help---
64 This adds the CPUFreq driver for AMD Elan SC520 processor.
65
66 For details, take a look at <file:Documentation/cpu-freq/>.
67
68 If in doubt, say N.
69
70
71config X86_POWERNOW_K6
72 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
73 select CPU_FREQ_TABLE
74 depends on X86_32
75 help
76 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
77 AMD K6-3+ processors.
78
79 For details, take a look at <file:Documentation/cpu-freq/>.
80
81 If in doubt, say N.
82
83config X86_POWERNOW_K7
84 tristate "AMD Mobile Athlon/Duron PowerNow!"
85 select CPU_FREQ_TABLE
86 depends on X86_32
87 help
88 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
89
90 For details, take a look at <file:Documentation/cpu-freq/>.
91
92 If in doubt, say N.
93
94config X86_POWERNOW_K7_ACPI
95 bool
96 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
97 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
98 depends on X86_32
99 default y
100
101config X86_POWERNOW_K8
102 tristate "AMD Opteron/Athlon64 PowerNow!"
103 select CPU_FREQ_TABLE
104 depends on ACPI && ACPI_PROCESSOR
105 help
106 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
107
108 To compile this driver as a module, choose M here: the
109 module will be called powernow-k8.
110
111 For details, take a look at <file:Documentation/cpu-freq/>.
112
113config X86_GX_SUSPMOD
114 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
115 depends on X86_32 && PCI
116 help
117 This add the CPUFreq driver for NatSemi Geode processors which
118 support suspend modulation.
119
120 For details, take a look at <file:Documentation/cpu-freq/>.
121
122 If in doubt, say N.
123
124config X86_SPEEDSTEP_CENTRINO
125 tristate "Intel Enhanced SpeedStep (deprecated)"
126 select CPU_FREQ_TABLE
127 select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
128 depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
129 help
130 This is deprecated and this functionality is now merged into
131 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
132 speedstep_centrino.
133 This adds the CPUFreq driver for Enhanced SpeedStep enabled
134 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
135 or 64bit enabled Intel Xeons.
136
137 To compile this driver as a module, choose M here: the
138 module will be called speedstep-centrino.
139
140 For details, take a look at <file:Documentation/cpu-freq/>.
141
142 If in doubt, say N.
143
144config X86_SPEEDSTEP_CENTRINO_TABLE
145 bool "Built-in tables for Banias CPUs"
146 depends on X86_32 && X86_SPEEDSTEP_CENTRINO
147 default y
148 help
149 Use built-in tables for Banias CPUs if ACPI encoding
150 is not available.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_ICH
155 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
156 select CPU_FREQ_TABLE
157 depends on X86_32
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
161 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
162 ICH3 or ICH4 southbridge.
163
164 For details, take a look at <file:Documentation/cpu-freq/>.
165
166 If in doubt, say N.
167
168config X86_SPEEDSTEP_SMI
169 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
170 select CPU_FREQ_TABLE
171 depends on X86_32 && EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for certain mobile Intel Pentium III
174 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
175 on systems which have an Intel 440BX/ZX/MX southbridge.
176
177 For details, take a look at <file:Documentation/cpu-freq/>.
178
179 If in doubt, say N.
180
181config X86_P4_CLOCKMOD
182 tristate "Intel Pentium 4 clock modulation"
183 select CPU_FREQ_TABLE
184 help
185 This adds the CPUFreq driver for Intel Pentium 4 / XEON
186 processors. When enabled it will lower CPU temperature by skipping
187 clocks.
188
189 This driver should be only used in exceptional
190 circumstances when very low power is needed because it causes severe
191 slowdowns and noticeable latencies. Normally Speedstep should be used
192 instead.
193
194 To compile this driver as a module, choose M here: the
195 module will be called p4-clockmod.
196
197 For details, take a look at <file:Documentation/cpu-freq/>.
198
199 Unless you are absolutely sure say N.
200
201config X86_CPUFREQ_NFORCE2
202 tristate "nVidia nForce2 FSB changing"
203 depends on X86_32 && EXPERIMENTAL
204 help
205 This adds the CPUFreq driver for FSB changing on nVidia nForce2
206 platforms.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_LONGRUN
213 tristate "Transmeta LongRun"
214 depends on X86_32
215 help
216 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
217 which support LongRun.
218
219 For details, take a look at <file:Documentation/cpu-freq/>.
220
221 If in doubt, say N.
222
223config X86_LONGHAUL
224 tristate "VIA Cyrix III Longhaul"
225 select CPU_FREQ_TABLE
226 depends on X86_32 && ACPI_PROCESSOR
227 help
228 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
229 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
230 processors.
231
232 For details, take a look at <file:Documentation/cpu-freq/>.
233
234 If in doubt, say N.
235
236config X86_E_POWERSAVER
237 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
238 select CPU_FREQ_TABLE
239 depends on X86_32 && EXPERIMENTAL
240 help
241 This adds the CPUFreq driver for VIA C7 processors. However, this driver
242 does not have any safeguards to prevent operating the CPU out of spec
243 and is thus considered dangerous. Please use the regular ACPI cpufreq
244 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
245
246 If in doubt, say N.
247
248comment "shared options"
249
250config X86_SPEEDSTEP_LIB
251 tristate
252 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
253
254config X86_SPEEDSTEP_RELAXED_CAP_CHECK
255 bool "Relaxed speedstep capability checks"
256 depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
257 help
258 Don't perform all checks for a speedstep capable system which would
259 normally be done. Some ancient or strange systems, though speedstep
260 capable, don't always indicate that they are speedstep capable. This
261 option lets the probing code bypass some of those checks if the
262 parameter "relaxed_check=1" is passed to the module.
263
264endif # CPU_FREQ
265
266endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
11obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
12obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
13obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
14obj-$(CONFIG_X86_LONGRUN) += longrun.o
15obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
16obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
17obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
18obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
19obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
20obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
21obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index a2baafb2fe6d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,776 +0,0 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36#include <linux/slab.h>
37
38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
43#include <acpi/processor.h>
44
45#include <asm/msr.h>
46#include <asm/processor.h>
47#include <asm/cpufeature.h>
48#include "mperf.h"
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg)
52
53MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
54MODULE_DESCRIPTION("ACPI Processor P-States Driver");
55MODULE_LICENSE("GPL");
56
57enum {
58 UNDEFINED_CAPABLE = 0,
59 SYSTEM_INTEL_MSR_CAPABLE,
60 SYSTEM_IO_CAPABLE,
61};
62
63#define INTEL_MSR_RANGE (0xffff)
64
65struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data;
67 struct cpufreq_frequency_table *freq_table;
68 unsigned int resume;
69 unsigned int cpu_feature;
70};
71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73
74/* acpi_perf_data is a pointer to percpu data. */
75static struct acpi_processor_performance __percpu *acpi_perf_data;
76
77static struct cpufreq_driver acpi_cpufreq_driver;
78
79static unsigned int acpi_pstate_strict;
80
81static int check_est_cpu(unsigned int cpuid)
82{
83 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
84
85 return cpu_has(cpu, X86_FEATURE_EST);
86}
87
88static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
89{
90 struct acpi_processor_performance *perf;
91 int i;
92
93 perf = data->acpi_data;
94
95 for (i = 0; i < perf->state_count; i++) {
96 if (value == perf->states[i].status)
97 return data->freq_table[i].frequency;
98 }
99 return 0;
100}
101
102static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
103{
104 int i;
105 struct acpi_processor_performance *perf;
106
107 msr &= INTEL_MSR_RANGE;
108 perf = data->acpi_data;
109
110 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
111 if (msr == perf->states[data->freq_table[i].index].status)
112 return data->freq_table[i].frequency;
113 }
114 return data->freq_table[0].frequency;
115}
116
117static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
118{
119 switch (data->cpu_feature) {
120 case SYSTEM_INTEL_MSR_CAPABLE:
121 return extract_msr(val, data);
122 case SYSTEM_IO_CAPABLE:
123 return extract_io(val, data);
124 default:
125 return 0;
126 }
127}
128
129struct msr_addr {
130 u32 reg;
131};
132
133struct io_addr {
134 u16 port;
135 u8 bit_width;
136};
137
138struct drv_cmd {
139 unsigned int type;
140 const struct cpumask *mask;
141 union {
142 struct msr_addr msr;
143 struct io_addr io;
144 } addr;
145 u32 val;
146};
147
148/* Called via smp_call_function_single(), on the target CPU */
149static void do_drv_read(void *_cmd)
150{
151 struct drv_cmd *cmd = _cmd;
152 u32 h;
153
154 switch (cmd->type) {
155 case SYSTEM_INTEL_MSR_CAPABLE:
156 rdmsr(cmd->addr.msr.reg, cmd->val, h);
157 break;
158 case SYSTEM_IO_CAPABLE:
159 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
160 &cmd->val,
161 (u32)cmd->addr.io.bit_width);
162 break;
163 default:
164 break;
165 }
166}
167
168/* Called via smp_call_function_many(), on the target CPUs */
169static void do_drv_write(void *_cmd)
170{
171 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi;
173
174 switch (cmd->type) {
175 case SYSTEM_INTEL_MSR_CAPABLE:
176 rdmsr(cmd->addr.msr.reg, lo, hi);
177 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
178 wrmsr(cmd->addr.msr.reg, lo, hi);
179 break;
180 case SYSTEM_IO_CAPABLE:
181 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
182 cmd->val,
183 (u32)cmd->addr.io.bit_width);
184 break;
185 default:
186 break;
187 }
188}
189
190static void drv_read(struct drv_cmd *cmd)
191{
192 int err;
193 cmd->val = 0;
194
195 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
196 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 int this_cpu;
202
203 this_cpu = get_cpu();
204 if (cpumask_test_cpu(this_cpu, cmd->mask))
205 do_drv_write(cmd);
206 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
207 put_cpu();
208}
209
210static u32 get_cur_val(const struct cpumask *mask)
211{
212 struct acpi_processor_performance *perf;
213 struct drv_cmd cmd;
214
215 if (unlikely(cpumask_empty(mask)))
216 return 0;
217
218 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
219 case SYSTEM_INTEL_MSR_CAPABLE:
220 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
221 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
222 break;
223 case SYSTEM_IO_CAPABLE:
224 cmd.type = SYSTEM_IO_CAPABLE;
225 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
226 cmd.addr.io.port = perf->control_register.address;
227 cmd.addr.io.bit_width = perf->control_register.bit_width;
228 break;
229 default:
230 return 0;
231 }
232
233 cmd.mask = mask;
234 drv_read(&cmd);
235
236 dprintk("get_cur_val = %u\n", cmd.val);
237
238 return cmd.val;
239}
240
241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
242{
243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
244 unsigned int freq;
245 unsigned int cached_freq;
246
247 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
248
249 if (unlikely(data == NULL ||
250 data->acpi_data == NULL || data->freq_table == NULL)) {
251 return 0;
252 }
253
254 cached_freq = data->freq_table[data->acpi_data->state].frequency;
255 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
256 if (freq != cached_freq) {
257 /*
258 * The dreaded BIOS frequency change behind our back.
259 * Force set the frequency on next target call.
260 */
261 data->resume = 1;
262 }
263
264 dprintk("cur freq = %u\n", freq);
265
266 return freq;
267}
268
269static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
270 struct acpi_cpufreq_data *data)
271{
272 unsigned int cur_freq;
273 unsigned int i;
274
275 for (i = 0; i < 100; i++) {
276 cur_freq = extract_freq(get_cur_val(mask), data);
277 if (cur_freq == freq)
278 return 1;
279 udelay(10);
280 }
281 return 0;
282}
283
284static int acpi_cpufreq_target(struct cpufreq_policy *policy,
285 unsigned int target_freq, unsigned int relation)
286{
287 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
288 struct acpi_processor_performance *perf;
289 struct cpufreq_freqs freqs;
290 struct drv_cmd cmd;
291 unsigned int next_state = 0; /* Index into freq_table */
292 unsigned int next_perf_state = 0; /* Index into perf table */
293 unsigned int i;
294 int result = 0;
295
296 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
297
298 if (unlikely(data == NULL ||
299 data->acpi_data == NULL || data->freq_table == NULL)) {
300 return -ENODEV;
301 }
302
303 perf = data->acpi_data;
304 result = cpufreq_frequency_table_target(policy,
305 data->freq_table,
306 target_freq,
307 relation, &next_state);
308 if (unlikely(result)) {
309 result = -ENODEV;
310 goto out;
311 }
312
313 next_perf_state = data->freq_table[next_state].index;
314 if (perf->state == next_perf_state) {
315 if (unlikely(data->resume)) {
316 dprintk("Called after resume, resetting to P%d\n",
317 next_perf_state);
318 data->resume = 0;
319 } else {
320 dprintk("Already at target state (P%d)\n",
321 next_perf_state);
322 goto out;
323 }
324 }
325
326 switch (data->cpu_feature) {
327 case SYSTEM_INTEL_MSR_CAPABLE:
328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
329 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
330 cmd.val = (u32) perf->states[next_perf_state].control;
331 break;
332 case SYSTEM_IO_CAPABLE:
333 cmd.type = SYSTEM_IO_CAPABLE;
334 cmd.addr.io.port = perf->control_register.address;
335 cmd.addr.io.bit_width = perf->control_register.bit_width;
336 cmd.val = (u32) perf->states[next_perf_state].control;
337 break;
338 default:
339 result = -ENODEV;
340 goto out;
341 }
342
343 /* cpufreq holds the hotplug lock, so we are safe from here on */
344 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
345 cmd.mask = policy->cpus;
346 else
347 cmd.mask = cpumask_of(policy->cpu);
348
349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 }
355
356 drv_write(&cmd);
357
358 if (acpi_pstate_strict) {
359 if (!check_freqs(cmd.mask, freqs.new, data)) {
360 dprintk("acpi_cpufreq_target failed (%d)\n",
361 policy->cpu);
362 result = -EAGAIN;
363 goto out;
364 }
365 }
366
367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 }
371 perf->state = next_perf_state;
372
373out:
374 return result;
375}
376
377static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
378{
379 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
380
381 dprintk("acpi_cpufreq_verify\n");
382
383 return cpufreq_frequency_table_verify(policy, data->freq_table);
384}
385
386static unsigned long
387acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
388{
389 struct acpi_processor_performance *perf = data->acpi_data;
390
391 if (cpu_khz) {
392 /* search the closest match to cpu_khz */
393 unsigned int i;
394 unsigned long freq;
395 unsigned long freqn = perf->states[0].core_frequency * 1000;
396
397 for (i = 0; i < (perf->state_count-1); i++) {
398 freq = freqn;
399 freqn = perf->states[i+1].core_frequency * 1000;
400 if ((2 * cpu_khz) > (freqn + freq)) {
401 perf->state = i;
402 return freq;
403 }
404 }
405 perf->state = perf->state_count-1;
406 return freqn;
407 } else {
408 /* assume CPU is at P0... */
409 perf->state = 0;
410 return perf->states[0].core_frequency * 1000;
411 }
412}
413
414static void free_acpi_perf_data(void)
415{
416 unsigned int i;
417
418 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
419 for_each_possible_cpu(i)
420 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
421 ->shared_cpu_map);
422 free_percpu(acpi_perf_data);
423}
424
425/*
426 * acpi_cpufreq_early_init - initialize ACPI P-States library
427 *
428 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
429 * in order to determine correct frequency and voltage pairings. We can
430 * do _PDC and _PSD and find out the processor dependency for the
431 * actual init that will happen later...
432 */
433static int __init acpi_cpufreq_early_init(void)
434{
435 unsigned int i;
436 dprintk("acpi_cpufreq_early_init\n");
437
438 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
439 if (!acpi_perf_data) {
440 dprintk("Memory allocation error for acpi_perf_data.\n");
441 return -ENOMEM;
442 }
443 for_each_possible_cpu(i) {
444 if (!zalloc_cpumask_var_node(
445 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
446 GFP_KERNEL, cpu_to_node(i))) {
447
448 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
449 free_acpi_perf_data();
450 return -ENOMEM;
451 }
452 }
453
454 /* Do initialization in ACPI core */
455 acpi_processor_preregister_performance(acpi_perf_data);
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460/*
461 * Some BIOSes do SW_ANY coordination internally, either set it up in hw
462 * or do it in BIOS firmware and won't inform about it to OS. If not
463 * detected, this has a side effect of making CPU run at a different speed
464 * than OS intended it to run at. Detect it and handle it cleanly.
465 */
466static int bios_with_sw_any_bug;
467
468static int sw_any_bug_found(const struct dmi_system_id *d)
469{
470 bios_with_sw_any_bug = 1;
471 return 0;
472}
473
474static const struct dmi_system_id sw_any_bug_dmi_table[] = {
475 {
476 .callback = sw_any_bug_found,
477 .ident = "Supermicro Server X6DLP",
478 .matches = {
479 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
480 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
481 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
482 },
483 },
484 { }
485};
486
487static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
488{
489 /* Intel Xeon Processor 7100 Series Specification Update
490 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
491 * AL30: A Machine Check Exception (MCE) Occurring during an
492 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
493 * Both Processor Cores to Lock Up. */
494 if (c->x86_vendor == X86_VENDOR_INTEL) {
495 if ((c->x86 == 15) &&
496 (c->x86_model == 6) &&
497 (c->x86_mask == 8)) {
498 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
499 "Xeon(R) 7100 Errata AL30, processors may "
500 "lock up on frequency changes: disabling "
501 "acpi-cpufreq.\n");
502 return -ENODEV;
503 }
504 }
505 return 0;
506}
507#endif
508
509static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
510{
511 unsigned int i;
512 unsigned int valid_states = 0;
513 unsigned int cpu = policy->cpu;
514 struct acpi_cpufreq_data *data;
515 unsigned int result = 0;
516 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
517 struct acpi_processor_performance *perf;
518#ifdef CONFIG_SMP
519 static int blacklisted;
520#endif
521
522 dprintk("acpi_cpufreq_cpu_init\n");
523
524#ifdef CONFIG_SMP
525 if (blacklisted)
526 return blacklisted;
527 blacklisted = acpi_cpufreq_blacklist(c);
528 if (blacklisted)
529 return blacklisted;
530#endif
531
532 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
533 if (!data)
534 return -ENOMEM;
535
536 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
537 per_cpu(acfreq_data, cpu) = data;
538
539 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
540 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
541
542 result = acpi_processor_register_performance(data->acpi_data, cpu);
543 if (result)
544 goto err_free;
545
546 perf = data->acpi_data;
547 policy->shared_type = perf->shared_type;
548
549 /*
550 * Will let policy->cpus know about dependency only when software
551 * coordination is required.
552 */
553 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
554 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
555 cpumask_copy(policy->cpus, perf->shared_cpu_map);
556 }
557 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
558
559#ifdef CONFIG_SMP
560 dmi_check_system(sw_any_bug_dmi_table);
561 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
562 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
563 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
564 }
565#endif
566
567 /* capability check */
568 if (perf->state_count <= 1) {
569 dprintk("No P-States\n");
570 result = -ENODEV;
571 goto err_unreg;
572 }
573
574 if (perf->control_register.space_id != perf->status_register.space_id) {
575 result = -ENODEV;
576 goto err_unreg;
577 }
578
579 switch (perf->control_register.space_id) {
580 case ACPI_ADR_SPACE_SYSTEM_IO:
581 dprintk("SYSTEM IO addr space\n");
582 data->cpu_feature = SYSTEM_IO_CAPABLE;
583 break;
584 case ACPI_ADR_SPACE_FIXED_HARDWARE:
585 dprintk("HARDWARE addr space\n");
586 if (!check_est_cpu(cpu)) {
587 result = -ENODEV;
588 goto err_unreg;
589 }
590 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
591 break;
592 default:
593 dprintk("Unknown addr space %d\n",
594 (u32) (perf->control_register.space_id));
595 result = -ENODEV;
596 goto err_unreg;
597 }
598
599 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
600 (perf->state_count+1), GFP_KERNEL);
601 if (!data->freq_table) {
602 result = -ENOMEM;
603 goto err_unreg;
604 }
605
606 /* detect transition latency */
607 policy->cpuinfo.transition_latency = 0;
608 for (i = 0; i < perf->state_count; i++) {
609 if ((perf->states[i].transition_latency * 1000) >
610 policy->cpuinfo.transition_latency)
611 policy->cpuinfo.transition_latency =
612 perf->states[i].transition_latency * 1000;
613 }
614
615 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
616 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
617 policy->cpuinfo.transition_latency > 20 * 1000) {
618 policy->cpuinfo.transition_latency = 20 * 1000;
619 printk_once(KERN_INFO
620 "P-state transition latency capped at 20 uS\n");
621 }
622
623 /* table init */
624 for (i = 0; i < perf->state_count; i++) {
625 if (i > 0 && perf->states[i].core_frequency >=
626 data->freq_table[valid_states-1].frequency / 1000)
627 continue;
628
629 data->freq_table[valid_states].index = i;
630 data->freq_table[valid_states].frequency =
631 perf->states[i].core_frequency * 1000;
632 valid_states++;
633 }
634 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
635 perf->state = 0;
636
637 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
638 if (result)
639 goto err_freqfree;
640
641 if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
642 printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
643
644 switch (perf->control_register.space_id) {
645 case ACPI_ADR_SPACE_SYSTEM_IO:
646 /* Current speed is unknown and not detectable by IO port */
647 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
648 break;
649 case ACPI_ADR_SPACE_FIXED_HARDWARE:
650 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
651 policy->cur = get_cur_freq_on_cpu(cpu);
652 break;
653 default:
654 break;
655 }
656
657 /* notify BIOS that we exist */
658 acpi_processor_notify_smm(THIS_MODULE);
659
660 /* Check for APERF/MPERF support in hardware */
661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
663
664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
665 for (i = 0; i < perf->state_count; i++)
666 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
667 (i == perf->state ? '*' : ' '), i,
668 (u32) perf->states[i].core_frequency,
669 (u32) perf->states[i].power,
670 (u32) perf->states[i].transition_latency);
671
672 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
673
674 /*
675 * the first call to ->target() should result in us actually
676 * writing something to the appropriate registers.
677 */
678 data->resume = 1;
679
680 return result;
681
682err_freqfree:
683 kfree(data->freq_table);
684err_unreg:
685 acpi_processor_unregister_performance(perf, cpu);
686err_free:
687 kfree(data);
688 per_cpu(acfreq_data, cpu) = NULL;
689
690 return result;
691}
692
693static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
694{
695 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
696
697 dprintk("acpi_cpufreq_cpu_exit\n");
698
699 if (data) {
700 cpufreq_frequency_table_put_attr(policy->cpu);
701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu);
704 kfree(data->freq_table);
705 kfree(data);
706 }
707
708 return 0;
709}
710
711static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
712{
713 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
714
715 dprintk("acpi_cpufreq_resume\n");
716
717 data->resume = 1;
718
719 return 0;
720}
721
722static struct freq_attr *acpi_cpufreq_attr[] = {
723 &cpufreq_freq_attr_scaling_available_freqs,
724 NULL,
725};
726
727static struct cpufreq_driver acpi_cpufreq_driver = {
728 .verify = acpi_cpufreq_verify,
729 .target = acpi_cpufreq_target,
730 .bios_limit = acpi_processor_get_bios_limit,
731 .init = acpi_cpufreq_cpu_init,
732 .exit = acpi_cpufreq_cpu_exit,
733 .resume = acpi_cpufreq_resume,
734 .name = "acpi-cpufreq",
735 .owner = THIS_MODULE,
736 .attr = acpi_cpufreq_attr,
737};
738
739static int __init acpi_cpufreq_init(void)
740{
741 int ret;
742
743 if (acpi_disabled)
744 return 0;
745
746 dprintk("acpi_cpufreq_init\n");
747
748 ret = acpi_cpufreq_early_init();
749 if (ret)
750 return ret;
751
752 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
753 if (ret)
754 free_acpi_perf_data();
755
756 return ret;
757}
758
759static void __exit acpi_cpufreq_exit(void)
760{
761 dprintk("acpi_cpufreq_exit\n");
762
763 cpufreq_unregister_driver(&acpi_cpufreq_driver);
764
765 free_percpu(acpi_perf_data);
766}
767
768module_param(acpi_pstate_strict, uint, 0644);
769MODULE_PARM_DESC(acpi_pstate_strict,
770 "value 0 or non-zero. non-zero -> strict ACPI checks are "
771 "performed during frequency changes.");
772
773late_initcall(acpi_cpufreq_init);
774module_exit(acpi_cpufreq_exit);
775
776MODULE_ALIAS("acpi");
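
Note: one piece of the file removed above that is easy to misread is acpi_cpufreq_guess_freq(): when the current frequency cannot be read back, it picks the P-state whose table frequency is nearest to cpu_khz by walking the descending frequency table and stopping at the first entry whose midpoint with the next entry lies below cpu_khz. A self-contained sketch of that walk follows; the guess_freq() helper name and the table values are made up for illustration.

#include <stdio.h>

/* khz[] is a descending table of state frequencies in kHz, as built from
 * core_frequency * 1000 in the original driver. */
static unsigned long guess_freq(const unsigned long *khz, int count,
				unsigned long cpu_khz, int *state)
{
	int i;

	for (i = 0; i < count - 1; i++) {
		/* closer to khz[i] than to khz[i + 1]? */
		if (2 * cpu_khz > khz[i] + khz[i + 1]) {
			*state = i;
			return khz[i];
		}
	}
	/* slower than every midpoint: assume the lowest state */
	*state = count - 1;
	return khz[count - 1];
}

int main(void)
{
	unsigned long table[] = { 2400000, 2000000, 1600000, 800000 };
	int state;
	unsigned long freq = guess_freq(table, 4, 1700000, &state);

	printf("cpu_khz=1700000 -> P%d (%lu kHz)\n", state, freq);
	return 0;
}
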
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 141abebc4516..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29/* #define NFORCE2_DELAY 10 */
30
31/*
32 * nforce2_chipset:
33 * FSB is changed using the chipset
34 */
35static struct pci_dev *nforce2_dev;
36
37/* fid:
38 * multiplier * 10
39 */
40static int fid;
41
42/* min_fsb, max_fsb:
43 * minimum and maximum FSB (= FSB at boot time)
44 */
45static int min_fsb;
46static int max_fsb;
47
48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
50MODULE_LICENSE("GPL");
51
52module_param(fid, int, 0444);
53module_param(min_fsb, int, 0444);
54
55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50");
58
59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
62
63/**
64 * nforce2_calc_fsb - calculate FSB
65 * @pll: PLL value
66 *
67 * Calculates FSB from PLL value
68 */
69static int nforce2_calc_fsb(int pll)
70{
71 unsigned char mul, div;
72
73 mul = (pll >> 8) & 0xff;
74 div = pll & 0xff;
75
76 if (div > 0)
77 return NFORCE2_XTAL * mul / div;
78
79 return 0;
80}
81
82/**
83 * nforce2_calc_pll - calculate PLL value
84 * @fsb: FSB
85 *
86 * Calculate PLL value for given FSB
87 */
88static int nforce2_calc_pll(unsigned int fsb)
89{
90 unsigned char xmul, xdiv;
91 unsigned char mul = 0, div = 0;
92 int tried = 0;
93
94 /* Try to calculate multiplier and divider up to 4 times */
95 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
96 for (xdiv = 2; xdiv <= 0x80; xdiv++)
97 for (xmul = 1; xmul <= 0xfe; xmul++)
98 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
99 fsb + tried) {
100 mul = xmul;
101 div = xdiv;
102 }
103 tried++;
104 }
105
106 if ((mul == 0) || (div == 0))
107 return -1;
108
109 return NFORCE2_PLL(mul, div);
110}
111
112/**
113 * nforce2_write_pll - write PLL value to chipset
114 * @pll: PLL value
115 *
116 * Writes new FSB PLL value to chipset
117 */
118static void nforce2_write_pll(int pll)
119{
120 int temp;
121
122 /* Set the pll addr. to 0x00 */
123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
124
125 /* Now write the value in all 64 registers */
126 for (temp = 0; temp <= 0x3f; temp++)
127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
128
129 return;
130}
131
132/**
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
145 PCI_ANY_ID, PCI_ANY_ID, NULL);
146 if (!nforce2_sub5)
147 return 0;
148
149 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
150 fsb /= 1000000;
151
152 /* Check if PLL register is already set */
153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
154
155 if (bootfsb || !temp)
156 return fsb;
157
158 /* Use PLL register FSB value */
159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
160 fsb = nforce2_calc_fsb(temp);
161
162 return fsb;
163}
164
165/**
166 * nforce2_set_fsb - set new FSB
167 * @fsb: New FSB
168 *
169 * Sets new FSB
170 */
171static int nforce2_set_fsb(unsigned int fsb)
172{
173 u32 temp = 0;
174 unsigned int tfsb;
175 int diff;
176 int pll = 0;
177
178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
180 return -EINVAL;
181 }
182
183 tfsb = nforce2_fsb_read(0);
184 if (!tfsb) {
185 printk(KERN_ERR PFX "Error while reading the FSB\n");
186 return -EINVAL;
187 }
188
189 /* First write? Then set actual value */
190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
191 if (!temp) {
192 pll = nforce2_calc_pll(tfsb);
193
194 if (pll < 0)
195 return -EINVAL;
196
197 nforce2_write_pll(pll);
198 }
199
200 /* Enable write access */
201 temp = 0x01;
202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
203
204 diff = tfsb - fsb;
205
206 if (!diff)
207 return 0;
208
209 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
210 if (diff < 0)
211 tfsb++;
212 else
213 tfsb--;
214
215 /* Calculate the PLL reg. value */
216 pll = nforce2_calc_pll(tfsb);
217 if (pll == -1)
218 return -EINVAL;
219
220 nforce2_write_pll(pll);
221#ifdef NFORCE2_DELAY
222 mdelay(NFORCE2_DELAY);
223#endif
224 }
225
226 temp = 0x40;
227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
228
229 return 0;
230}
231
232/**
233 * nforce2_get - get the CPU frequency
234 * @cpu: CPU number
235 *
236 * Returns the CPU frequency
237 */
238static unsigned int nforce2_get(unsigned int cpu)
239{
240 if (cpu)
241 return 0;
242 return nforce2_fsb_read(0) * fid * 100;
243}
244
245/**
246 * nforce2_target - set a new CPUFreq policy
247 * @policy: new policy
248 * @target_freq: the target frequency
249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
251 *
252 * Sets a new CPUFreq policy.
253 */
254static int nforce2_target(struct cpufreq_policy *policy,
255 unsigned int target_freq, unsigned int relation)
256{
257/* unsigned long flags; */
258 struct cpufreq_freqs freqs;
259 unsigned int target_fsb;
260
261 if ((target_freq > policy->max) || (target_freq < policy->min))
262 return -EINVAL;
263
264 target_fsb = target_freq / (fid * 100);
265
266 freqs.old = nforce2_get(policy->cpu);
267 freqs.new = target_fsb * fid * 100;
268 freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
269
270 if (freqs.old == freqs.new)
271 return 0;
272
273 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
274 freqs.old, freqs.new);
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 /* Disable IRQs */
279 /* local_irq_save(flags); */
280
281 if (nforce2_set_fsb(target_fsb) < 0)
282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
283 target_fsb);
284 else
285 dprintk("Changed FSB successfully to %d\n",
286 target_fsb);
287
288 /* Enable IRQs */
289 /* local_irq_restore(flags); */
290
291 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
292
293 return 0;
294}
295
296/**
297 * nforce2_verify - verifies a new CPUFreq policy
298 * @policy: new policy
299 */
300static int nforce2_verify(struct cpufreq_policy *policy)
301{
302 unsigned int fsb_pol_max;
303
304 fsb_pol_max = policy->max / (fid * 100);
305
306 if (policy->min < (fsb_pol_max * fid * 100))
307 policy->max = (fsb_pol_max + 1) * fid * 100;
308
309 cpufreq_verify_within_limits(policy,
310 policy->cpuinfo.min_freq,
311 policy->cpuinfo.max_freq);
312 return 0;
313}
314
315static int nforce2_cpu_init(struct cpufreq_policy *policy)
316{
317 unsigned int fsb;
318 unsigned int rfid;
319
320 /* capability check */
321 if (policy->cpu != 0)
322 return -ENODEV;
323
324 /* Get current FSB */
325 fsb = nforce2_fsb_read(0);
326
327 if (!fsb)
328 return -EIO;
329
330 /* FIX: Get FID from CPU */
331 if (!fid) {
332 if (!cpu_khz) {
333 printk(KERN_WARNING PFX
334 "cpu_khz not set, can't calculate multiplier!\n");
335 return -ENODEV;
336 }
337
338 fid = cpu_khz / (fsb * 100);
339 rfid = fid % 5;
340
341 if (rfid) {
342 if (rfid > 2)
343 fid += 5 - rfid;
344 else
345 fid -= rfid;
346 }
347 }
348
349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
350 fid / 10, fid % 10);
351
352 /* Set maximum FSB to FSB at boot time */
353 max_fsb = nforce2_fsb_read(1);
354
355 if (!max_fsb)
356 return -EIO;
357
358 if (!min_fsb)
359 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
360
361 if (min_fsb < NFORCE2_MIN_FSB)
362 min_fsb = NFORCE2_MIN_FSB;
363
364 /* cpuinfo and default policy values */
365 policy->cpuinfo.min_freq = min_fsb * fid * 100;
366 policy->cpuinfo.max_freq = max_fsb * fid * 100;
367 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
368 policy->cur = nforce2_get(policy->cpu);
369 policy->min = policy->cpuinfo.min_freq;
370 policy->max = policy->cpuinfo.max_freq;
371
372 return 0;
373}
374
375static int nforce2_cpu_exit(struct cpufreq_policy *policy)
376{
377 return 0;
378}
379
380static struct cpufreq_driver nforce2_driver = {
381 .name = "nforce2",
382 .verify = nforce2_verify,
383 .target = nforce2_target,
384 .get = nforce2_get,
385 .init = nforce2_cpu_init,
386 .exit = nforce2_cpu_exit,
387 .owner = THIS_MODULE,
388};
389
390/**
391 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
392 *
393 * Detects nForce2 A2 and C1 stepping
394 *
395 */
396static int nforce2_detect_chipset(void)
397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
400 PCI_ANY_ID, PCI_ANY_ID, NULL);
401
402 if (nforce2_dev == NULL)
403 return -ENODEV;
404
405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
406 nforce2_dev->revision);
407 printk(KERN_INFO PFX
408 "FSB changing is maybe unstable and can lead to "
409 "crashes and data loss.\n");
410
411 return 0;
412}
413
414/**
415 * nforce2_init - initializes the nForce2 CPUFreq driver
416 *
417 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
418 * devices, -EINVAL on problems during initiatization, and zero on
419 * success.
420 */
421static int __init nforce2_init(void)
422{
423 /* TODO: do we need to detect the processor? */
424
425 /* detect chipset */
426 if (nforce2_detect_chipset()) {
427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
428 return -ENODEV;
429 }
430
431 return cpufreq_register_driver(&nforce2_driver);
432}
433
434/**
435 * nforce2_exit - unregisters cpufreq module
436 *
437 * Unregisters nForce2 FSB change support.
438 */
439static void __exit nforce2_exit(void)
440{
441 cpufreq_unregister_driver(&nforce2_driver);
442}
443
444module_init(nforce2_init);
445module_exit(nforce2_exit);
446
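
Note: the arithmetic in the removed nForce2 driver is compact enough to restate. The chipset PLL word packs a multiplier and divider of a 25 MHz crystal (FSB = 25 * mul / div), and the reported CPU frequency in kHz is FSB * fid * 100, where fid holds the CPU multiplier times ten. The sketch below reproduces that math with example values; the calc_fsb() helper name and the sample multiplier/divider are assumptions for illustration.

#include <stdio.h>

#define NFORCE2_XTAL		25
#define NFORCE2_PLL(mul, div)	(0x100000 | ((mul) << 8) | (div))

/* Decode an nForce2-style PLL word back into an FSB clock in MHz. */
static int calc_fsb(int pll)
{
	unsigned char mul = (pll >> 8) & 0xff;
	unsigned char div = pll & 0xff;

	return div ? NFORCE2_XTAL * mul / div : 0;
}

int main(void)
{
	int pll = NFORCE2_PLL(32, 4);	/* 25 * 32 / 4 = 200 MHz FSB */
	int fsb = calc_fsb(pll);
	int fid = 115;			/* 11.5x CPU multiplier, stored as x10 */

	printf("pll=0x%06x fsb=%d MHz cpu=%d kHz\n", pll, fsb, fsb * fid * 100);
	return 0;
}
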
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
18
19#include <asm/msr.h>
20#include <asm/tsc.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26#define EPS_BRAND_C7D 4
27
28struct eps_cpu_data {
29 u32 fsb;
30 struct cpufreq_frequency_table freq_table[];
31};
32
33static struct eps_cpu_data *eps_cpu[NR_CPUS];
34
35
36static unsigned int eps_get(unsigned int cpu)
37{
38 struct eps_cpu_data *centaur;
39 u32 lo, hi;
40
41 if (cpu)
42 return 0;
43 centaur = eps_cpu[cpu];
44 if (centaur == NULL)
45 return 0;
46
47 /* Return current frequency */
48 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
49 return centaur->fsb * ((lo >> 8) & 0xff);
50}
51
52static int eps_set_state(struct eps_cpu_data *centaur,
53 unsigned int cpu,
54 u32 dest_state)
55{
56 struct cpufreq_freqs freqs;
57 u32 lo, hi;
58 int err = 0;
59 int i;
60
61 freqs.old = eps_get(cpu);
62 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
63 freqs.cpu = cpu;
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 /* Wait while CPU is busy */
67 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
68 i = 0;
69 while (lo & ((1 << 16) | (1 << 17))) {
70 udelay(16);
71 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
72 i++;
73 if (unlikely(i > 64)) {
74 err = -ENODEV;
75 goto postchange;
76 }
77 }
78 /* Set new multiplier and voltage */
79 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
80 /* Wait until transition end */
81 i = 0;
82 do {
83 udelay(16);
84 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
85 i++;
86 if (unlikely(i > 64)) {
87 err = -ENODEV;
88 goto postchange;
89 }
90 } while (lo & ((1 << 16) | (1 << 17)));
91
92 /* Return current frequency */
93postchange:
94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
101 /* Print voltage and multiplier */
102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
103 current_voltage = lo & 0xff;
104 printk(KERN_INFO "eps: Current voltage = %dmV\n",
105 current_voltage * 16 + 700);
106 current_multiplier = (lo >> 8) & 0xff;
107 printk(KERN_INFO "eps: Current multiplier = %d\n",
108 current_multiplier);
109 }
110#endif
111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
112 return err;
113}
114
115static int eps_target(struct cpufreq_policy *policy,
116 unsigned int target_freq,
117 unsigned int relation)
118{
119 struct eps_cpu_data *centaur;
120 unsigned int newstate = 0;
121 unsigned int cpu = policy->cpu;
122 unsigned int dest_state;
123 int ret;
124
125 if (unlikely(eps_cpu[cpu] == NULL))
126 return -ENODEV;
127 centaur = eps_cpu[cpu];
128
129 if (unlikely(cpufreq_frequency_table_target(policy,
130 &eps_cpu[cpu]->freq_table[0],
131 target_freq,
132 relation,
133 &newstate))) {
134 return -EINVAL;
135 }
136
137 /* Make frequency transition */
138 dest_state = centaur->freq_table[newstate].index & 0xffff;
139 ret = eps_set_state(centaur, cpu, dest_state);
140 if (ret)
141 printk(KERN_ERR "eps: Timeout!\n");
142 return ret;
143}
144
145static int eps_verify(struct cpufreq_policy *policy)
146{
147 return cpufreq_frequency_table_verify(policy,
148 &eps_cpu[policy->cpu]->freq_table[0]);
149}
150
151static int eps_cpu_init(struct cpufreq_policy *policy)
152{
153 unsigned int i;
154 u32 lo, hi;
155 u64 val;
156 u8 current_multiplier, current_voltage;
157 u8 max_multiplier, max_voltage;
158 u8 min_multiplier, min_voltage;
159 u8 brand = 0;
160 u32 fsb;
161 struct eps_cpu_data *centaur;
162 struct cpuinfo_x86 *c = &cpu_data(0);
163 struct cpufreq_frequency_table *f_table;
164 int k, step, voltage;
165 int ret;
166 int states;
167
168 if (policy->cpu != 0)
169 return -ENODEV;
170
171 /* Check brand */
172 printk(KERN_INFO "eps: Detected VIA ");
173
174 switch (c->x86_model) {
175 case 10:
176 rdmsr(0x1153, lo, hi);
177 brand = (((lo >> 2) ^ lo) >> 18) & 3;
178 printk(KERN_CONT "Model A ");
179 break;
180 case 13:
181 rdmsr(0x1154, lo, hi);
182 brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
183 printk(KERN_CONT "Model D ");
184 break;
185 }
186
187 switch (brand) {
188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n");
190 break;
191 case EPS_BRAND_C7:
192 printk(KERN_CONT "C7\n");
193 break;
194 case EPS_BRAND_EDEN:
195 printk(KERN_CONT "Eden\n");
196 break;
197 case EPS_BRAND_C7D:
198 printk(KERN_CONT "C7-D\n");
199 break;
200 case EPS_BRAND_C3:
201 printk(KERN_CONT "C3\n");
202 return -ENODEV;
203 break;
204 }
205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV;
215 }
216 }
217
218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
223 current_multiplier = (lo >> 8) & 0xff;
224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
225
226 /* Print limits */
227 max_voltage = hi & 0xff;
228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
230 max_multiplier = (hi >> 8) & 0xff;
231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
232 min_voltage = (hi >> 16) & 0xff;
233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
235 min_multiplier = (hi >> 24) & 0xff;
236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
237
238 /* Sanity checks */
239 if (current_multiplier == 0 || max_multiplier == 0
240 || min_multiplier == 0)
241 return -EINVAL;
242 if (current_multiplier > max_multiplier
243 || max_multiplier <= min_multiplier)
244 return -EINVAL;
245 if (current_voltage > 0x1f || max_voltage > 0x1f)
246 return -EINVAL;
247 if (max_voltage < min_voltage)
248 return -EINVAL;
249
250 /* Calc FSB speed */
251 fsb = cpu_khz / current_multiplier;
252 /* Calc number of p-states supported */
253 if (brand == EPS_BRAND_C7M)
254 states = max_multiplier - min_multiplier + 1;
255 else
256 states = 2;
257
258 /* Allocate private data and frequency table for current cpu */
259 centaur = kzalloc(sizeof(struct eps_cpu_data)
260 + (states + 1) * sizeof(struct cpufreq_frequency_table),
261 GFP_KERNEL);
262 if (!centaur)
263 return -ENOMEM;
264 eps_cpu[0] = centaur;
265
266 /* Copy basic values */
267 centaur->fsb = fsb;
268
269 /* Fill frequency and MSR value table */
270 f_table = &centaur->freq_table[0];
271 if (brand != EPS_BRAND_C7M) {
272 f_table[0].frequency = fsb * min_multiplier;
273 f_table[0].index = (min_multiplier << 8) | min_voltage;
274 f_table[1].frequency = fsb * max_multiplier;
275 f_table[1].index = (max_multiplier << 8) | max_voltage;
276 f_table[2].frequency = CPUFREQ_TABLE_END;
277 } else {
278 k = 0;
279 step = ((max_voltage - min_voltage) * 256)
280 / (max_multiplier - min_multiplier);
281 for (i = min_multiplier; i <= max_multiplier; i++) {
282 voltage = (k * step) / 256 + min_voltage;
283 f_table[k].frequency = fsb * i;
284 f_table[k].index = (i << 8) | voltage;
285 k++;
286 }
287 f_table[k].frequency = CPUFREQ_TABLE_END;
288 }
289
290 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
291 policy->cur = fsb * current_multiplier;
292
293 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
294 if (ret) {
295 kfree(centaur);
296 return ret;
297 }
298
299 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
300 return 0;
301}
302
303static int eps_cpu_exit(struct cpufreq_policy *policy)
304{
305 unsigned int cpu = policy->cpu;
306 struct eps_cpu_data *centaur;
307 u32 lo, hi;
308
309 if (eps_cpu[cpu] == NULL)
310 return -ENODEV;
311 centaur = eps_cpu[cpu];
312
313 /* Get max frequency */
314 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
315 /* Set max frequency */
316 eps_set_state(centaur, cpu, hi & 0xffff);
317 /* Bye */
318 cpufreq_frequency_table_put_attr(policy->cpu);
319 kfree(eps_cpu[cpu]);
320 eps_cpu[cpu] = NULL;
321 return 0;
322}
323
324static struct freq_attr *eps_attr[] = {
325 &cpufreq_freq_attr_scaling_available_freqs,
326 NULL,
327};
328
329static struct cpufreq_driver eps_driver = {
330 .verify = eps_verify,
331 .target = eps_target,
332 .init = eps_cpu_init,
333 .exit = eps_cpu_exit,
334 .get = eps_get,
335 .name = "e_powersaver",
336 .owner = THIS_MODULE,
337 .attr = eps_attr,
338};
339
340static int __init eps_init(void)
341{
342 struct cpuinfo_x86 *c = &cpu_data(0);
343
344 /* This driver will work only on Centaur C7 processors with
345 * Enhanced SpeedStep/PowerSaver registers */
346 if (c->x86_vendor != X86_VENDOR_CENTAUR
347 || c->x86 != 6 || c->x86_model < 10)
348 return -ENODEV;
349 if (!cpu_has(c, X86_FEATURE_EST))
350 return -ENODEV;
351
352 if (cpufreq_register_driver(&eps_driver))
353 return -EINVAL;
354 return 0;
355}
356
357static void __exit eps_exit(void)
358{
359 cpufreq_unregister_driver(&eps_driver);
360}
361
362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPUs.");
364MODULE_LICENSE("GPL");
365
366module_init(eps_init);
367module_exit(eps_exit);
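
For reference, here is a minimal sketch, not taken from the file above and using an invented helper name, of how eps_cpu_init() decodes MSR_IA32_PERF_STATUS: the low word holds the current VID and multiplier, the high word holds the highest and lowest limits, and a VID converts to millivolts as VID * 16 + 700.

        /* Illustrative helper only; the driver does this decoding inline. */
        static void eps_decode_perf_status(u32 lo, u32 hi)
        {
                unsigned int cur_vid  = lo & 0xff;              /* current VID */
                unsigned int cur_mult = (lo >> 8) & 0xff;       /* current multiplier */
                unsigned int max_vid  = hi & 0xff;              /* highest VID */
                unsigned int max_mult = (hi >> 8) & 0xff;       /* highest multiplier */
                unsigned int min_vid  = (hi >> 16) & 0xff;      /* lowest VID */
                unsigned int min_mult = (hi >> 24) & 0xff;      /* lowest multiplier */

                pr_info("eps: %u mV at %ux (limits %u-%u mV, %u-%ux)\n",
                        cur_vid * 16 + 700, cur_mult,
                        min_vid * 16 + 700, max_vid * 16 + 700,
                        min_mult, max_mult);
        }

A raw VID of 9, for example, works out to 9 * 16 + 700 = 844 mV, the value the transition_latency comment in eps_cpu_init() refers to.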
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/delay.h>
24#include <linux/cpufreq.h>
25
26#include <asm/msr.h>
27#include <linux/timex.h>
28#include <linux/io.h>
29
30#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
31#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
32
33/* Module parameter */
34static int max_freq;
35
36struct s_elan_multiplier {
37 int clock; /* frequency in kHz */
38 int val40h; /* PMU Force Mode register */
39 int val80h; /* CPU Clock Speed Register */
40};
41
42/*
43 * It is important that the frequencies
44 * are listed in ascending order here!
45 */
46static struct s_elan_multiplier elan_multiplier[] = {
47 {1000, 0x02, 0x18},
48 {2000, 0x02, 0x10},
49 {4000, 0x02, 0x08},
50 {8000, 0x00, 0x00},
51 {16000, 0x00, 0x02},
52 {33000, 0x00, 0x04},
53 {66000, 0x01, 0x04},
54 {99000, 0x01, 0x05}
55};
56
57static struct cpufreq_frequency_table elanfreq_table[] = {
58 {0, 1000},
59 {1, 2000},
60 {2, 4000},
61 {3, 8000},
62 {4, 16000},
63 {5, 33000},
64 {6, 66000},
65 {7, 99000},
66 {0, CPUFREQ_TABLE_END},
67};
68
69
70/**
71 * elanfreq_get_cpu_frequency: determine current cpu speed
72 *
73 * Finds out at which frequency the CPU of the Elan SOC runs
74 * at the moment. Frequencies from 1 to 33 MHz are generated
75 * the normal way, 66 and 99 MHz are called "Hyperspeed Mode"
 76 * and have the rest of the chip running at 33 MHz.
77 */
78
79static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
80{
81 u8 clockspeed_reg; /* Clock Speed Register */
82
83 local_irq_disable();
84 outb_p(0x80, REG_CSCIR);
85 clockspeed_reg = inb_p(REG_CSCDR);
86 local_irq_enable();
87
88 if ((clockspeed_reg & 0xE0) == 0xE0)
89 return 0;
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0)
94 return 66000;
95 else
96 return 99000;
97 }
98
99 /* 33 MHz is not 32 MHz... */
100 if ((clockspeed_reg & 0xE0) == 0xA0)
101 return 33000;
102
103 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
104}
105
106
107/**
108 * elanfreq_set_cpu_frequency: Change the CPU core frequency
109 * @cpu: cpu number
110 * @freq: frequency in kHz
111 *
112 * This function takes a frequency value and changes the CPU frequency
113 * accordingly. Note that the frequency has to be checked by
114 * elanfreq_verify() for correctness!
115 *
116 * There is no return value.
117 */
118
119static void elanfreq_set_cpu_state(unsigned int state)
120{
121 struct cpufreq_freqs freqs;
122
123 freqs.old = elanfreq_get_cpu_frequency(0);
124 freqs.new = elan_multiplier[state].clock;
125 freqs.cpu = 0; /* elanfreq.c is UP only driver */
126
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128
129 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
130 elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
142 * Bit 6 enables Hyperspeed Mode (66/99 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00, REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80, REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40, REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164};
165
166
167/**
168 * elanfreq_verify: test if frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify(struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target(struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
187 target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = &cpu_data(0);
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return result;
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0;
231}
232
233
234static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
235{
236 cpufreq_frequency_table_put_attr(policy->cpu);
237 return 0;
238}
239
240
241#ifndef MODULE
242/**
243 * elanfreq_setup - elanfreq command line parameter parsing
244 *
245 * elanfreq command line parameter. Use:
246 * elanfreq=66000
247 * to set the maximum CPU frequency to 66 MHz. Note that if
248 * you do not give this boot parameter, the maximum
249 * frequency will fall back to the _current_ CPU frequency, which
250 * might be lower. If you build this as a module, use the
251 * max_freq module parameter instead.
252 */
253static int __init elanfreq_setup(char *str)
254{
255 max_freq = simple_strtoul(str, &str, 0);
256 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
257 return 1;
258}
259__setup("elanfreq=", elanfreq_setup);
260#endif
261
262
263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL,
266};
267
268
269static struct cpufreq_driver elanfreq_driver = {
270 .get = elanfreq_get_cpu_frequency,
271 .verify = elanfreq_verify,
272 .target = elanfreq_target,
273 .init = elanfreq_cpu_init,
274 .exit = elanfreq_cpu_exit,
275 .name = "elanfreq",
276 .owner = THIS_MODULE,
277 .attr = elanfreq_attr,
278};
279
280
281static int __init elanfreq_init(void)
282{
283 struct cpuinfo_x86 *c = &cpu_data(0);
284
285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV;
290 }
291 return cpufreq_register_driver(&elanfreq_driver);
292}
293
294
295static void __exit elanfreq_exit(void)
296{
297 cpufreq_unregister_driver(&elanfreq_driver);
298}
299
300
301module_param(max_freq, int, 0444);
302
303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
305 "Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
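
As a self-contained sketch (the helper name is invented, and the mapping is read straight off elanfreq_get_cpu_frequency() above rather than from the Elan datasheet), the clock-speed register decode looks like this:

        /* Illustrative only: map the Elan clock speed register (index 0x80) to kHz. */
        static unsigned int elan_decode_speed(u8 reg)
        {
                switch (reg & 0xE0) {
                case 0xE0:                      /* no valid reading */
                        return 0;
                case 0xC0:                      /* "Hyperspeed" mode */
                        return (reg & 0x01) ? 99000 : 66000;
                case 0xA0:                      /* 33 MHz, not 32 MHz */
                        return 33000;
                default:                        /* 1, 2, 4, 8 or 16 MHz */
                        return (1 << ((reg & 0xE0) >> 5)) * 1000;
                }
        }

A register value of 0x60, for instance, decodes to (1 << 3) * 1000 = 8000 kHz.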
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf84232..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
20 * are based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
 23 * to the CPU (GX1/GXLV) for configurable durations. When SUSP# is
 24 * asserted the CPU enters an idle state; the GX1 stops its core clock
 25 * while SUSP# is asserted, so power consumption is reduced.
26 *
 27 * Suspend Modulation's OFF/ON durations are configurable
 28 * with the 'Suspend Modulation OFF Count Register'
 29 * and the 'Suspend Modulation ON Count Register'.
 30 * These registers are 8-bit counters that represent the number of
 31 * 32us intervals for which the SUSP# pin is asserted (ON) or
 32 * de-asserted (OFF) to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <linux/errno.h>
83#include <linux/slab.h>
84
85#include <asm/processor-cyrix.h>
86
87/* PCI config registers, all at F0 */
88#define PCI_PMER1 0x80 /* power management enable register 1 */
89#define PCI_PMER2 0x81 /* power management enable register 2 */
90#define PCI_PMER3 0x82 /* power management enable register 3 */
91#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
92#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
93#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
94#define PCI_MODON 0x95 /* suspend modulation ON counter register */
95#define PCI_SUSCFG 0x96 /* suspend configuration register */
96
97/* PMER1 bits */
98#define GPM (1<<0) /* global power management */
99#define GIT (1<<1) /* globally enable PM device idle timers */
100#define GTR (1<<2) /* globally enable IO traps */
101#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
102#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
103
104/* SUSCFG bits */
105#define SUSMOD (1<<0) /* enable/disable suspend modulation */
106/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
107#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
108 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
109#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
110/* the below is supported only with cs5530A */
111#define PWRSVE_ISA (1<<3) /* stop ISA clock */
112#define PWRSVE (1<<4) /* active idle */
113
114struct gxfreq_params {
115 u8 on_duration;
116 u8 off_duration;
117 u8 pci_suscfg;
118 u8 pci_pmer1;
119 u8 pci_pmer2;
120 struct pci_dev *cs55x0;
121};
122
123static struct gxfreq_params *gx_params;
124static int stock_freq;
125
126/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
127static int pci_busclk;
128module_param(pci_busclk, int, 0444);
129
130/* maximum duration for which the cpu may be suspended
131 * (32us * MAX_DURATION). If no parameter is given, this defaults
132 * to 255.
133 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
134 * is suspended -- processing power is just 0.39% of what it used to be,
135 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
136static int max_duration = 255;
137module_param(max_duration, int, 0444);
138
139/* For the default policy, we want at least some processing power
140 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
141 */
142#define POLICY_MIN_DIV 20
143
144
145#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
146 "gx-suspmod", msg)
147
148/**
149 * we can detect a core multiplier from dir0_lsb
150 * from GX1 datasheet p.56,
151 * MULT[3:0]:
152 * 0000 = SYSCLK multiplied by 4 (test only)
153 * 0001 = SYSCLK multiplied by 10
154 * 0010 = SYSCLK multiplied by 4
155 * 0011 = SYSCLK multiplied by 6
156 * 0100 = SYSCLK multiplied by 9
157 * 0101 = SYSCLK multiplied by 5
158 * 0110 = SYSCLK multiplied by 7
159 * 0111 = SYSCLK multiplied by 8
160 * of 33.3MHz
161 **/
162static int gx_freq_mult[16] = {
163 4, 10, 4, 6, 9, 5, 7, 8,
164 0, 0, 0, 0, 0, 0, 0, 0
165};
166
167
168/****************************************************************
169 * Low Level chipset interface *
170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 { 0, },
176};
177
178static void gx_write_byte(int reg, int value)
179{
180 pci_write_config_byte(gx_params->cs55x0, reg, value);
181}
182
183/**
184 * gx_detect_chipset:
185 *
186 **/
187static __init struct pci_dev *gx_detect_chipset(void)
188{
189 struct pci_dev *gx_pci = NULL;
190
191 /* check if CPU is a MediaGX or a Geode. */
192 if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
193 (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
194 dprintk("error: no MediaGX/Geode processor found!\n");
195 return NULL;
196 }
197
198 /* detect which companion chip is used */
199 for_each_pci_dev(gx_pci) {
200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
201 return gx_pci;
202 }
203
204 dprintk("error: no supported chipset found!\n");
205 return NULL;
206}
207
208/**
209 * gx_get_cpuspeed:
210 *
211 * Finds out at which effective frequency the Cyrix MediaGX/NatSemi
212 * Geode CPU runs.
213 */
214static unsigned int gx_get_cpuspeed(unsigned int cpu)
215{
216 if ((gx_params->pci_suscfg & SUSMOD) == 0)
217 return stock_freq;
218
219 return (stock_freq * gx_params->off_duration)
220 / (gx_params->on_duration + gx_params->off_duration);
221}
222
223/**
224 * gx_validate_speed:
225 * pick the closest achievable speed and the matching on/off durations
226 *
227 **/
228
229static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
230 u8 *off_duration)
231{
232 unsigned int i;
233 u8 tmp_on, tmp_off;
234 int old_tmp_freq = stock_freq;
235 int tmp_freq;
236
237 *off_duration = 1;
238 *on_duration = 0;
239
240 for (i = max_duration; i > 0; i--) {
241 tmp_off = ((khz * i) / stock_freq) & 0xff;
242 tmp_on = i - tmp_off;
243 tmp_freq = (stock_freq * tmp_off) / i;
244 /* if this relation is closer to khz, use this. If it's equal,
245 * prefer it, too - lower latency */
246 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
247 *on_duration = tmp_on;
248 *off_duration = tmp_off;
249 old_tmp_freq = tmp_freq;
250 }
251 }
252
253 return old_tmp_freq;
254}
255
256
257/**
258 * gx_set_cpuspeed:
259 * set cpu speed in khz.
260 **/
261
262static void gx_set_cpuspeed(unsigned int khz)
263{
264 u8 suscfg, pmer1;
265 unsigned int new_khz;
266 unsigned long flags;
267 struct cpufreq_freqs freqs;
268
269 freqs.cpu = 0;
270 freqs.old = gx_get_cpuspeed(0);
271
272 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
273 &gx_params->off_duration);
274
275 freqs.new = new_khz;
276
277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 local_irq_save(flags);
279
280
281
282 if (new_khz != stock_freq) {
 283 /* if new khz == 100% of CPU speed, it is a special case */
284 switch (gx_params->cs55x0->device) {
285 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
286 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
287 /* FIXME: need to test other values -- Zwane,Miura */
288 /* typical 2 to 4ms */
289 gx_write_byte(PCI_IRQTC, 4);
290 /* typical 50 to 100ms */
291 gx_write_byte(PCI_VIDTC, 100);
292 gx_write_byte(PCI_PMER1, pmer1);
293
294 if (gx_params->cs55x0->revision < 0x10) {
295 /* CS5530(rev 1.2, 1.3) */
296 suscfg = gx_params->pci_suscfg|SUSMOD;
297 } else {
298 /* CS5530A,B.. */
299 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
300 }
301 break;
302 case PCI_DEVICE_ID_CYRIX_5520:
303 case PCI_DEVICE_ID_CYRIX_5510:
304 suscfg = gx_params->pci_suscfg | SUSMOD;
305 break;
306 default:
307 local_irq_restore(flags);
308 dprintk("fatal: try to set unknown chipset.\n");
309 return;
310 }
311 } else {
312 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
313 gx_params->off_duration = 0;
314 gx_params->on_duration = 0;
315 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
316 }
317
318 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
319 gx_write_byte(PCI_MODON, gx_params->on_duration);
320
321 gx_write_byte(PCI_SUSCFG, suscfg);
322 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
323
324 local_irq_restore(flags);
325
326 gx_params->pci_suscfg = suscfg;
327
328 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
329
330 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
331 gx_params->on_duration * 32, gx_params->off_duration * 32);
332 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
333}
334
335/****************************************************************
336 * High level functions *
337 ****************************************************************/
338
339/*
340 * cpufreq_gx_verify: test if frequency range is valid
341 *
342 * This function checks if a given frequency range in kHz is valid
343 * for the hardware supported by the driver.
344 */
345
346static int cpufreq_gx_verify(struct cpufreq_policy *policy)
347{
348 unsigned int tmp_freq = 0;
349 u8 tmp1, tmp2;
350
351 if (!stock_freq || !policy)
352 return -EINVAL;
353
354 policy->cpu = 0;
355 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
356 stock_freq);
357
358 /* it needs to be assured that at least one supported frequency is
359 * within policy->min and policy->max. If it is not, policy->max
 360 * needs to be increased until one frequency is supported.
361 * policy->min may not be decreased, though. This way we guarantee a
362 * specific processing capacity.
363 */
364 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
365 if (tmp_freq < policy->min)
366 tmp_freq += stock_freq / max_duration;
367 policy->min = tmp_freq;
368 if (policy->min > policy->max)
369 policy->max = tmp_freq;
370 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
371 if (tmp_freq > policy->max)
372 tmp_freq -= stock_freq / max_duration;
373 policy->max = tmp_freq;
374 if (policy->max < policy->min)
375 policy->max = policy->min;
376 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
377 stock_freq);
378
379 return 0;
380}
381
382/*
383 * cpufreq_gx_target:
384 *
385 */
386static int cpufreq_gx_target(struct cpufreq_policy *policy,
387 unsigned int target_freq,
388 unsigned int relation)
389{
390 u8 tmp1, tmp2;
391 unsigned int tmp_freq;
392
393 if (!stock_freq || !policy)
394 return -EINVAL;
395
396 policy->cpu = 0;
397
398 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
399 while (tmp_freq < policy->min) {
400 tmp_freq += stock_freq / max_duration;
401 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
402 }
403 while (tmp_freq > policy->max) {
404 tmp_freq -= stock_freq / max_duration;
405 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
406 }
407
408 gx_set_cpuspeed(tmp_freq);
409
410 return 0;
411}
412
413static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
414{
415 unsigned int maxfreq, curfreq;
416
417 if (!policy || policy->cpu != 0)
418 return -ENODEV;
419
420 /* determine maximum frequency */
421 if (pci_busclk)
422 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
423 else if (cpu_khz)
424 maxfreq = cpu_khz;
425 else
426 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
427
428 stock_freq = maxfreq;
429 curfreq = gx_get_cpuspeed(0);
430
431 dprintk("cpu max frequency is %d.\n", maxfreq);
432 dprintk("cpu current frequency is %dkHz.\n", curfreq);
433
434 /* setup basic struct for cpufreq API */
435 policy->cpu = 0;
436
437 if (max_duration < POLICY_MIN_DIV)
438 policy->min = maxfreq / max_duration;
439 else
440 policy->min = maxfreq / POLICY_MIN_DIV;
441 policy->max = maxfreq;
442 policy->cur = curfreq;
443 policy->cpuinfo.min_freq = maxfreq / max_duration;
444 policy->cpuinfo.max_freq = maxfreq;
445 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
446
447 return 0;
448}
449
450/*
451 * cpufreq_gx_init:
452 * MediaGX/Geode GX initialize cpufreq driver
453 */
454static struct cpufreq_driver gx_suspmod_driver = {
455 .get = gx_get_cpuspeed,
456 .verify = cpufreq_gx_verify,
457 .target = cpufreq_gx_target,
458 .init = cpufreq_gx_cpu_init,
459 .name = "gx-suspmod",
460 .owner = THIS_MODULE,
461};
462
463static int __init cpufreq_gx_init(void)
464{
465 int ret;
466 struct gxfreq_params *params;
467 struct pci_dev *gx_pci;
468
469 /* Test if we have the right hardware */
470 gx_pci = gx_detect_chipset();
471 if (gx_pci == NULL)
472 return -ENODEV;
473
474 /* check whether module parameters are sane */
475 if (max_duration > 0xff)
476 max_duration = 0xff;
477
478 dprintk("geode suspend modulation available.\n");
479
480 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
481 if (params == NULL)
482 return -ENOMEM;
483
484 params->cs55x0 = gx_pci;
485 gx_params = params;
486
487 /* keep cs55x0 configurations */
488 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
489 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
490 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
491 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
492 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
493 &(params->off_duration));
494
495 ret = cpufreq_register_driver(&gx_suspmod_driver);
496 if (ret) {
497 kfree(params);
498 return ret; /* register error! */
499 }
500
501 return 0;
502}
503
504static void __exit cpufreq_gx_exit(void)
505{
506 cpufreq_unregister_driver(&gx_suspmod_driver);
507 pci_dev_put(gx_params->cs55x0);
508 kfree(gx_params);
509}
510
511MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
512MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
513MODULE_LICENSE("GPL");
514
515module_init(cpufreq_gx_init);
516module_exit(cpufreq_gx_exit);
517
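
A short worked sketch of the duty-cycle arithmetic described in the header comment and implemented by gx_validate_speed() above; the helper name is hypothetical, and it assumes khz <= stock and duration <= 255:

        /* Illustrative only; mirrors off_duration = (freq * DURATION) / stock_freq. */
        static unsigned int gx_pick_durations(unsigned int khz, unsigned int stock,
                                              unsigned int duration, u8 *on, u8 *off)
        {
                *off = (khz * duration) / stock;        /* off_duration */
                *on  = duration - *off;                 /* on_duration */
                return (stock * *off) / duration;       /* resulting effective kHz */
        }

With stock_freq = 200000 kHz, max_duration = 255 and a 100000 kHz request this gives off = 127, on = 128 and an effective speed of roughly 99607 kHz; the real gx_validate_speed() additionally scans every duration up to max_duration and keeps the closest match.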
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index cf48cdd6907d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
 8 * VIA currently have 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
 17 * It's pretty much the same, feature-wise, as longhaul v2, though
 18 * there is provision for scaling the FSB too, but this doesn't work
 19 * too well in practice so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36
37#include <asm/msr.h>
38#include <acpi/processor.h>
39
40#include "longhaul.h"
41
42#define PFX "longhaul: "
43
44#define TYPE_LONGHAUL_V1 1
45#define TYPE_LONGHAUL_V2 2
46#define TYPE_POWERSAVER 3
47
48#define CPU_SAMUEL 1
49#define CPU_SAMUEL2 2
50#define CPU_EZRA 3
51#define CPU_EZRA_T 4
52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
59static int cpu_model;
60static unsigned int numscales = 16;
61static unsigned int fsb;
62
63static const struct mV_pos *vrm_mV_table;
64static const unsigned char *mV_vrm_table;
65
66static unsigned int highest_speed, lowest_speed; /* kHz */
67static unsigned int minmult, maxmult;
68static int can_scale_voltage;
69static struct acpi_processor *pr;
70static struct acpi_processor_cx *cx;
71static u32 acpi_regs_addr;
72static u8 longhaul_flags;
73static unsigned int longhaul_index;
74
75/* Module parameters */
76static int scale_voltage;
77static int disable_acpi_c3;
78static int revid_errata;
79
80#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
81 "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int mults[32];
86static int eblcr[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue = 0, lo, hi;
126
127 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version == TYPE_LONGHAUL_V2 ||
130 longhaul_version == TYPE_POWERSAVER) {
131 if (lo & (1<<27))
132 invalue += 16;
133 }
134 return eblcr[invalue];
135}
136
137/* For processor with BCR2 MSR */
138
139static void do_longhaul1(unsigned int mults_index)
140{
141 union msr_bcr2 bcr2;
142
143 rdmsrl(MSR_VIA_BCR2, bcr2.val);
144 /* Enable software clock multiplier */
145 bcr2.bits.ESOFTBF = 1;
146 bcr2.bits.CLOCKMUL = mults_index & 0xff;
147
148 /* Sync to timer tick */
149 safe_halt();
150 /* Change frequency on next halt or sleep */
151 wrmsrl(MSR_VIA_BCR2, bcr2.val);
152 /* Invoke transition */
153 ACPI_FLUSH_CPU_CACHE();
154 halt();
155
156 /* Disable software clock multiplier */
157 local_irq_disable();
158 rdmsrl(MSR_VIA_BCR2, bcr2.val);
159 bcr2.bits.ESOFTBF = 0;
160 wrmsrl(MSR_VIA_BCR2, bcr2.val);
161}
162
163/* For processor with Longhaul MSR */
164
165static void do_powersaver(int cx_address, unsigned int mults_index,
166 unsigned int dir)
167{
168 union msr_longhaul longhaul;
169 u32 t;
170
171 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
172 /* Setup new frequency */
173 if (!revid_errata)
174 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
175 else
176 longhaul.bits.RevisionKey = 0;
177 longhaul.bits.SoftBusRatio = mults_index & 0xf;
178 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
179 /* Setup new voltage */
180 if (can_scale_voltage)
181 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && dir) {
186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 }
203
204 /* Change frequency on next halt or sleep */
205 longhaul.bits.EnableSoftBusRatio = 1;
206 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
207 if (!cx_address) {
208 ACPI_FLUSH_CPU_CACHE();
209 halt();
210 } else {
211 ACPI_FLUSH_CPU_CACHE();
212 /* Invoke C3 */
213 inb(cx_address);
214 /* Dummy op - must do something useless after P_LVL3 read */
215 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
216 }
217 /* Disable bus ratio bit */
218 longhaul.bits.EnableSoftBusRatio = 0;
219 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
220
221 /* Reduce voltage if necessary */
222 if (can_scale_voltage && !dir) {
223 longhaul.bits.EnableSoftVID = 1;
224 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
225 /* Change voltage */
226 if (!cx_address) {
227 ACPI_FLUSH_CPU_CACHE();
228 halt();
229 } else {
230 ACPI_FLUSH_CPU_CACHE();
231 /* Invoke C3 */
232 inb(cx_address);
233 /* Dummy op - must do something useless after P_LVL3
234 * read */
235 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
236 }
237 longhaul.bits.EnableSoftVID = 0;
238 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
239 }
240}
241
242/**
243 * longhaul_set_cpu_frequency()
244 * @mults_index : bitpattern of the new multiplier.
245 *
246 * Sets a new clock ratio.
247 */
248
249static void longhaul_setstate(unsigned int table_index)
250{
251 unsigned int mults_index;
252 int speed, mult;
253 struct cpufreq_freqs freqs;
254 unsigned long flags;
255 unsigned int pic1_mask, pic2_mask;
256 u16 bm_status = 0;
257 u32 bm_timeout = 1000;
258 unsigned int dir = 0;
259
260 mults_index = longhaul_table[table_index].index;
261 /* Safety precautions */
262 mult = mults[mults_index & 0x1f];
263 if (mult == -1)
264 return;
265 speed = calc_speed(mult);
266 if ((speed > highest_speed) || (speed < lowest_speed))
267 return;
268 /* Voltage transition before frequency transition? */
269 if (can_scale_voltage && longhaul_index < table_index)
270 dir = 1;
271
272 freqs.old = calc_speed(longhaul_get_cpu_mult());
273 freqs.new = speed;
274 freqs.cpu = 0; /* longhaul.c is UP only driver */
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
279 fsb, mult/10, mult%10, print_speed(speed/1000));
280retry_loop:
281 preempt_disable();
282 local_irq_save(flags);
283
284 pic2_mask = inb(0xA1);
285 pic1_mask = inb(0x21); /* works on C3. save mask. */
286 outb(0xFF, 0xA1); /* Overkill */
287 outb(0xFE, 0x21); /* TMR0 only */
288
289 /* Wait while PCI bus is busy. */
290 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
291 || ((pr != NULL) && pr->flags.bm_control))) {
292 bm_status = inw(acpi_regs_addr);
293 bm_status &= 1 << 4;
294 while (bm_status && bm_timeout) {
295 outw(1 << 4, acpi_regs_addr);
296 bm_timeout--;
297 bm_status = inw(acpi_regs_addr);
298 bm_status &= 1 << 4;
299 }
300 }
301
302 if (longhaul_flags & USE_NORTHBRIDGE) {
303 /* Disable AGP and PCI arbiters */
304 outb(3, 0x22);
305 } else if ((pr != NULL) && pr->flags.bm_control) {
306 /* Disable bus master arbitration */
307 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
308 }
309 switch (longhaul_version) {
310
311 /*
312 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
313 * Software controlled multipliers only.
314 */
315 case TYPE_LONGHAUL_V1:
316 do_longhaul1(mults_index);
317 break;
318
319 /*
320 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
321 *
322 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
323 * Nehemiah can do FSB scaling too, but this has never been proven
324 * to work in practice.
325 */
326 case TYPE_LONGHAUL_V2:
327 case TYPE_POWERSAVER:
328 if (longhaul_flags & USE_ACPI_C3) {
329 /* Don't allow wakeup */
330 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
331 do_powersaver(cx->address, mults_index, dir);
332 } else {
333 do_powersaver(0, mults_index, dir);
334 }
335 break;
336 }
337
338 if (longhaul_flags & USE_NORTHBRIDGE) {
339 /* Enable arbiters */
340 outb(0, 0x22);
341 } else if ((pr != NULL) && pr->flags.bm_control) {
342 /* Enable bus master arbitration */
343 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
344 }
345 outb(pic2_mask, 0xA1); /* restore mask */
346 outb(pic1_mask, 0x21);
347
348 local_irq_restore(flags);
349 preempt_enable();
350
351 freqs.new = calc_speed(longhaul_get_cpu_mult());
352 /* Check if requested frequency is set. */
353 if (unlikely(freqs.new != speed)) {
354 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
 355 /* Revision ID = 1 but the processor expects a revision key
 356 * equal to 0. Jumpers at the bottom of the processor change
 357 * the multiplier and FSB, but do not change bits in the
 358 * Longhaul MSR nor enable voltage scaling. */
359 if (!revid_errata) {
360 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
361 "option.\n");
362 revid_errata = 1;
363 msleep(200);
364 goto retry_loop;
365 }
 366 /* Why ACPI C3 sometimes doesn't work is a mystery to me,
 367 * but it does happen: the processor enters the ACPI C3 state
 368 * yet doesn't change frequency. I tried poking various
 369 * bits in northbridge registers, but without success. */
370 if (longhaul_flags & USE_ACPI_C3) {
371 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
372 longhaul_flags &= ~USE_ACPI_C3;
373 if (revid_errata) {
374 printk(KERN_INFO PFX "Disabling \"Ignore "
375 "Revision ID\" option.\n");
376 revid_errata = 0;
377 }
378 msleep(200);
379 goto retry_loop;
380 }
381 /* This shouldn't happen. Longhaul ver. 2 was reported not
382 * working on processors without voltage scaling, but with
383 * RevID = 1. RevID errata will make things right. Just
384 * to be 100% sure. */
385 if (longhaul_version == TYPE_LONGHAUL_V2) {
386 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
387 longhaul_version = TYPE_LONGHAUL_V1;
388 msleep(200);
389 goto retry_loop;
390 }
391 }
392 /* Report true CPU frequency */
393 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
394
395 if (!bm_timeout)
396 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
397 "idle PCI bus.\n");
398}
399
400/*
401 * Centaur decided to make life a little more tricky.
402 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
403 * Samuel2 and above have to try and guess what the FSB is.
404 * We do this by assuming we booted at maximum multiplier, and interpolate
405 * between that value multiplied by possible FSBs and cpu_mhz which
406 * was calculated at boot time. Really ugly, but no other way to do this.
407 */
408
409#define ROUNDING 0xf
410
411static int guess_fsb(int mult)
412{
413 int speed = cpu_khz / 1000;
414 int i;
415 int speeds[] = { 666, 1000, 1333, 2000 };
416 int f_max, f_min;
417
418 for (i = 0; i < 4; i++) {
419 f_max = ((speeds[i] * mult) + 50) / 100;
420 f_max += (ROUNDING / 2);
421 f_min = f_max - ROUNDING;
422 if ((speed <= f_max) && (speed >= f_min))
423 return speeds[i] / 10;
424 }
425 return 0;
426}
427
428
429static int __cpuinit longhaul_get_ranges(void)
430{
431 unsigned int i, j, k = 0;
432 unsigned int ratio;
433 int mult;
434
435 /* Get current frequency */
436 mult = longhaul_get_cpu_mult();
437 if (mult == -1) {
438 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
439 return -EINVAL;
440 }
441 fsb = guess_fsb(mult);
442 if (fsb == 0) {
443 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
444 return -EINVAL;
445 }
446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */
449 maxmult = mult;
450 /* Get min multiplier */
451 switch (cpu_model) {
452 case CPU_NEHEMIAH:
453 minmult = 50;
454 break;
455 case CPU_NEHEMIAH_C:
456 minmult = 40;
457 break;
458 default:
459 minmult = 30;
460 break;
461 }
462
463 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
464 minmult/10, minmult%10, maxmult/10, maxmult%10);
465
466 highest_speed = calc_speed(maxmult);
467 lowest_speed = calc_speed(minmult);
468 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
469 print_speed(lowest_speed/1000),
470 print_speed(highest_speed/1000));
471
472 if (lowest_speed == highest_speed) {
473 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
474 return -EINVAL;
475 }
476 if (lowest_speed > highest_speed) {
477 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
478 lowest_speed, highest_speed);
479 return -EINVAL;
480 }
481
482 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
483 GFP_KERNEL);
484 if (!longhaul_table)
485 return -ENOMEM;
486
487 for (j = 0; j < numscales; j++) {
488 ratio = mults[j];
489 if (ratio == -1)
490 continue;
491 if (ratio > maxmult || ratio < minmult)
492 continue;
493 longhaul_table[k].frequency = calc_speed(ratio);
494 longhaul_table[k].index = j;
495 k++;
496 }
497 if (k <= 1) {
498 kfree(longhaul_table);
499 return -ENODEV;
500 }
501 /* Sort */
502 for (j = 0; j < k - 1; j++) {
503 unsigned int min_f, min_i;
504 min_f = longhaul_table[j].frequency;
505 min_i = j;
506 for (i = j + 1; i < k; i++) {
507 if (longhaul_table[i].frequency < min_f) {
508 min_f = longhaul_table[i].frequency;
509 min_i = i;
510 }
511 }
512 if (min_i != j) {
513 swap(longhaul_table[j].frequency,
514 longhaul_table[min_i].frequency);
515 swap(longhaul_table[j].index,
516 longhaul_table[min_i].index);
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __cpuinit longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000,
565 maxvid.mV/1000, maxvid.mV%1000);
566 return;
567 }
568
569 if (minvid.mV == maxvid.mV) {
570 printk(KERN_INFO PFX "Claims to support voltage scaling but "
571 "min & max are both %d.%03d. "
572 "Voltage scaling disabled\n",
573 maxvid.mV/1000, maxvid.mV%1000);
574 return;
575 }
576
577 /* How many voltage steps*/
578 numvscales = maxvid.pos - minvid.pos + 1;
579 printk(KERN_INFO PFX
580 "Max VID=%d.%03d "
581 "Min VID=%d.%03d, "
582 "%d possible voltage scales\n",
583 maxvid.mV/1000, maxvid.mV%1000,
584 minvid.mV/1000, minvid.mV%1000,
585 numvscales);
586
587 /* Calculate max frequency at min voltage */
588 j = longhaul.bits.MinMHzBR;
589 if (longhaul.bits.MinMHzBR4)
590 j += 16;
591 min_vid_speed = eblcr[j];
592 if (min_vid_speed == -1)
593 return;
594 switch (longhaul.bits.MinMHzFSB) {
595 case 0:
596 min_vid_speed *= 13333;
597 break;
598 case 1:
599 min_vid_speed *= 10000;
600 break;
601 case 3:
602 min_vid_speed *= 6666;
603 break;
604 default:
605 return;
606 break;
607 }
608 if (min_vid_speed >= highest_speed)
609 return;
610 /* Calculate kHz for one voltage step */
611 kHz_step = (highest_speed - min_vid_speed) / numvscales;
612
613 j = 0;
614 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
615 speed = longhaul_table[j].frequency;
616 if (speed > min_vid_speed)
617 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
618 else
619 pos = minvid.pos;
620 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
621 vid = vrm_mV_table[mV_vrm_table[pos]];
622 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
623 speed, j, vid.mV);
624 j++;
625 }
626
627 can_scale_voltage = 1;
628 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
629}
630
631
632static int longhaul_verify(struct cpufreq_policy *policy)
633{
634 return cpufreq_frequency_table_verify(policy, longhaul_table);
635}
636
637
638static int longhaul_target(struct cpufreq_policy *policy,
639 unsigned int target_freq, unsigned int relation)
640{
641 unsigned int table_index = 0;
642 unsigned int i;
643 unsigned int dir = 0;
644 u8 vid, current_vid;
645
646 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
647 relation, &table_index))
648 return -EINVAL;
649
650 /* Don't set same frequency again */
651 if (longhaul_index == table_index)
652 return 0;
653
654 if (!can_scale_voltage)
655 longhaul_setstate(table_index);
656 else {
 657 /* On the test system, voltage transitions exceeding a single
 658 * step up or down were turning the motherboard off. Both
 659 * "ondemand" and "userspace" are unsafe. The C7 does this
 660 * in hardware; the C3 is old, so we have to do it
 661 * in software. */
662 i = longhaul_index;
663 current_vid = (longhaul_table[longhaul_index].index >> 8);
664 current_vid &= 0x1f;
665 if (table_index > longhaul_index)
666 dir = 1;
667 while (i != table_index) {
668 vid = (longhaul_table[i].index >> 8) & 0x1f;
669 if (vid != current_vid) {
670 longhaul_setstate(i);
671 current_vid = vid;
672 msleep(200);
673 }
674 if (dir)
675 i++;
676 else
677 i--;
678 }
679 longhaul_setstate(table_index);
680 }
681 longhaul_index = table_index;
682 return 0;
683}
684
685
686static unsigned int longhaul_get(unsigned int cpu)
687{
688 if (cpu)
689 return 0;
690 return calc_speed(longhaul_get_cpu_mult());
691}
692
693static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 u32 nesting_level,
695 void *context, void **return_value)
696{
697 struct acpi_device *d;
698
699 if (acpi_bus_get_device(obj_handle, &d))
700 return 0;
701
702 *return_value = acpi_driver_data(d);
703 return 1;
704}
705
706/* VIA don't support PM2 reg, but have something similar */
707static int enable_arbiter_disable(void)
708{
709 struct pci_dev *dev;
710 int status = 1;
711 int reg;
712 u8 pci_cmd;
713
714 /* Find PLE133 host bridge */
715 reg = 0x78;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
717 NULL);
718 /* Find PM133/VT8605 host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA,
721 PCI_DEVICE_ID_VIA_8605_0, NULL);
722 /* Find CLE266 host bridge */
723 if (dev == NULL) {
724 reg = 0x76;
725 dev = pci_get_device(PCI_VENDOR_ID_VIA,
726 PCI_DEVICE_ID_VIA_862X_0, NULL);
727 /* Find CN400 V-Link host bridge */
728 if (dev == NULL)
729 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
730 }
731 if (dev != NULL) {
732 /* Enable access to port 0x22 */
733 pci_read_config_byte(dev, reg, &pci_cmd);
734 if (!(pci_cmd & 1<<7)) {
735 pci_cmd |= 1<<7;
736 pci_write_config_byte(dev, reg, pci_cmd);
737 pci_read_config_byte(dev, reg, &pci_cmd);
738 if (!(pci_cmd & 1<<7)) {
739 printk(KERN_ERR PFX
740 "Can't enable access to port 0x22.\n");
741 status = 0;
742 }
743 }
744 pci_dev_put(dev);
745 return status;
746 }
747 return 0;
748}
749
750static int longhaul_setup_southbridge(void)
751{
752 struct pci_dev *dev;
753 u8 pci_cmd;
754
755 /* Find VT8235 southbridge */
756 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
757 if (dev == NULL)
758 /* Find VT8237 southbridge */
759 dev = pci_get_device(PCI_VENDOR_ID_VIA,
760 PCI_DEVICE_ID_VIA_8237, NULL);
761 if (dev != NULL) {
762 /* Set transition time to max */
763 pci_read_config_byte(dev, 0xec, &pci_cmd);
764 pci_cmd &= ~(1 << 2);
765 pci_write_config_byte(dev, 0xec, pci_cmd);
766 pci_read_config_byte(dev, 0xe4, &pci_cmd);
767 pci_cmd &= ~(1 << 7);
768 pci_write_config_byte(dev, 0xe4, pci_cmd);
769 pci_read_config_byte(dev, 0xe5, &pci_cmd);
770 pci_cmd |= 1 << 7;
771 pci_write_config_byte(dev, 0xe5, pci_cmd);
772 /* Get address of ACPI registers block*/
773 pci_read_config_byte(dev, 0x81, &pci_cmd);
774 if (pci_cmd & 1 << 7) {
775 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
776 acpi_regs_addr &= 0xff00;
777 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
778 acpi_regs_addr);
779 }
780
781 pci_dev_put(dev);
782 return 1;
783 }
784 return 0;
785}
786
787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{
789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL;
791 int ret;
792 u32 lo, hi;
793
794 /* Check what we have on this motherboard */
795 switch (c->x86_model) {
796 case 6:
797 cpu_model = CPU_SAMUEL;
798 cpuname = "C3 'Samuel' [C5A]";
799 longhaul_version = TYPE_LONGHAUL_V1;
800 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
801 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
802 break;
803
804 case 7:
805 switch (c->x86_mask) {
806 case 0:
807 longhaul_version = TYPE_LONGHAUL_V1;
808 cpu_model = CPU_SAMUEL2;
809 cpuname = "C3 'Samuel 2' [C5B]";
810 /* Note, this is not a typo, early Samuel2's had
811 * Samuel1 ratios. */
812 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break;
815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]";
820 } else {
821 cpu_model = CPU_EZRA;
822 cpuname = "C3 'Ezra' [C5C]";
823 }
824 memcpy(mults, ezra_mults, sizeof(ezra_mults));
825 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
826 break;
827 }
828 break;
829
830 case 8:
831 cpu_model = CPU_EZRA_T;
832 cpuname = "C3 'Ezra-T' [C5M]";
833 longhaul_version = TYPE_POWERSAVER;
834 numscales = 32;
835 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
836 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
837 break;
838
839 case 9:
840 longhaul_version = TYPE_POWERSAVER;
841 numscales = 32;
842 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
843 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) {
845 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah A' [C5XLOE]";
848 break;
849 case 2 ... 4:
850 cpu_model = CPU_NEHEMIAH;
851 cpuname = "C3 'Nehemiah B' [C5XLOH]";
852 break;
853 case 5 ... 15:
854 cpu_model = CPU_NEHEMIAH_C;
855 cpuname = "C3 'Nehemiah C' [C5P]";
856 break;
857 }
858 break;
859
860 default:
861 cpuname = "Unknown";
862 break;
863 }
864 /* Check Longhaul ver. 2 */
865 if (longhaul_version == TYPE_LONGHAUL_V2) {
866 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
867 if (lo == 0 && hi == 0)
868 /* Looks like MSR isn't present */
869 longhaul_version = TYPE_LONGHAUL_V1;
870 }
871
872 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2:
876 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break;
878 case TYPE_POWERSAVER:
879 printk(KERN_CONT "Powersaver supported.\n");
880 break;
881 };
882
883 /* Doesn't hurt */
884 longhaul_setup_southbridge();
885
886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr);
890
891 /* Check ACPI support for C3 state */
892 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
893 cx = &pr->power.states[ACPI_STATE_C3];
894 if (cx->address > 0 && cx->latency <= 1000)
895 longhaul_flags |= USE_ACPI_C3;
896 }
897 /* Disable if it isn't working */
898 if (disable_acpi_c3)
899 longhaul_flags &= ~USE_ACPI_C3;
900 /* Check if northbridge is friendly */
901 if (enable_arbiter_disable())
902 longhaul_flags |= USE_NORTHBRIDGE;
903
904 /* Check ACPI support for bus master arbiter disable */
905 if (!(longhaul_flags & USE_ACPI_C3
906 || longhaul_flags & USE_NORTHBRIDGE)
907 && ((pr == NULL) || !(pr->flags.bm_control))) {
908 printk(KERN_ERR PFX
909 "No ACPI support. Unsupported northbridge.\n");
910 return -ENODEV;
911 }
912
913 if (longhaul_flags & USE_NORTHBRIDGE)
914 printk(KERN_INFO PFX "Using northbridge support.\n");
915 if (longhaul_flags & USE_ACPI_C3)
916 printk(KERN_INFO PFX "Using ACPI support.\n");
917
918 ret = longhaul_get_ranges();
919 if (ret != 0)
920 return ret;
921
922 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
923 longhaul_setup_voltagescaling();
924
925 policy->cpuinfo.transition_latency = 200000; /* nsec */
926 policy->cur = calc_speed(longhaul_get_cpu_mult());
927
928 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
929 if (ret)
930 return ret;
931
932 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
933
934 return 0;
935}
936
937static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
938{
939 cpufreq_frequency_table_put_attr(policy->cpu);
940 return 0;
941}
942
943static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL,
946};
947
948static struct cpufreq_driver longhaul_driver = {
949 .verify = longhaul_verify,
950 .target = longhaul_target,
951 .get = longhaul_get,
952 .init = longhaul_cpu_init,
953 .exit = __devexit_p(longhaul_cpu_exit),
954 .name = "longhaul",
955 .owner = THIS_MODULE,
956 .attr = longhaul_attr,
957};
958
959
960static int __init longhaul_init(void)
961{
962 struct cpuinfo_x86 *c = &cpu_data(0);
963
964 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
965 return -ENODEV;
966
967#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, "
970 "longhaul disabled.\n");
971 return -ENODEV;
972 }
973#endif
974#ifdef CONFIG_X86_IO_APIC
975 if (cpu_has_apic) {
976 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
977 "broken in this configuration.\n");
978 return -ENODEV;
979 }
980#endif
981 switch (c->x86_model) {
982 case 6 ... 9:
983 return cpufreq_register_driver(&longhaul_driver);
984 case 10:
985 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
986 default:
987 ;
988 }
989
990 return -ENODEV;
991}
992
993
994static void __exit longhaul_exit(void)
995{
996 int i;
997
998 for (i = 0; i < numscales; i++) {
999 if (mults[i] == maxmult) {
1000 longhaul_setstate(i);
1001 break;
1002 }
1003 }
1004
1005 cpufreq_unregister_driver(&longhaul_driver);
1006 kfree(longhaul_table);
1007}
1008
1009/* Even if the BIOS exports an ACPI C3 state, and it is used
1010 * successfully when the CPU is idle, this state doesn't
1011 * trigger a frequency transition in some cases. */
1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very useful to save
1015 * power, but most VIA C3 processors don't support it. */
1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1018/* Force the revision key to 0 for processors which don't
1019 * support voltage scaling but present themselves as
1020 * such. */
1021module_param(revid_errata, int, 0644);
1022MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1023
1024MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1025MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1026MODULE_LICENSE("GPL");
1027
1028late_initcall(longhaul_init);
1029module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca881..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12		unsigned Reserved:19,	// 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28		Reserved:3,		// 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr values specify the ratio read from the CPU.
53 * The mults values specify what to write to the CPU.
54 */
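/*
 * Editorial sketch (hypothetical helper, not part of the original file):
 * decoding one table entry as described above. Entries of -1 are reserved
 * and must be skipped.
 */
static inline void decode_ratio(int ratio, int *whole, int *tenths)
{
	*whole  = ratio / 10;	/* e.g. 45 -> 4  */
	*tenths = ratio % 10;	/* e.g. 45 -> .5 */
}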
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah
237 */
238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
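/*
 * Editorial example (illustration only, not part of the original source):
 * vrm85_mV[8] is {1650, 24}, i.e. 1650 / 1000 = 1 V plus 1650 % 1000 =
 * 650 mV, so VID position 24 selects 1.650 V on a VRM 8.5 regulator.
 */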
328
329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index d9f51367666b..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/timex.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17
18#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
19 "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
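/*
 * Editorial sketch (hypothetical helper, not part of the original file):
 * the percentage-to-kHz conversion described above, which the original
 * code open-codes in longrun_get_policy() below.
 */
static inline unsigned int longrun_pctg_to_khz(unsigned int pctg)
{
	return longrun_low_freq +
		pctg * ((longrun_high_freq - longrun_low_freq) / 100);
}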
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __cpuinit longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return eax * 1000;
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = &cpu_data(0);
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n",
200 *low_freq, *high_freq);
201
202 if (*low_freq > *high_freq)
203 *low_freq = *high_freq;
204 return 0;
205 }
206
207 /* set the upper border to the value determined during TSC init */
208 *high_freq = (cpu_khz / 1000);
209 *high_freq = *high_freq * 1000;
210 dprintk("high frequency is %u kHz\n", *high_freq);
211
212 /* get current borders */
213 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
214 save_lo = msr_lo & 0x0000007F;
215 save_hi = msr_hi & 0x0000007F;
216
217 /* if current perf_pctg is larger than 90%, we need to decrease the
218 * upper limit to make the calculation more accurate.
219 */
220 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
221 /* try decreasing in 10% steps, some processors react only
222 * on some barrier values */
223 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
224 /* set to 0 to try_hi perf_pctg */
225 msr_lo &= 0xFFFFFF80;
226 msr_hi &= 0xFFFFFF80;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239 * equals
240 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241 *
242 * high_freq * perf_pctg is stored temporarily into "ebx".
243 */
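	/* Editorial note: rearranging the above gives
	 *   low_freq = (cur_freq - high_freq * perf_pctg) / (1 - perf_pctg)
	 * which, with eax = cur_freq (MHz), ebx = high_freq * perf_pctg (MHz)
	 * and ecx = perf_pctg (%), is the ((eax - ebx) * 100) / (100 - ecx)
	 * computation a few lines below. */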
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249 edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
259
260
261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = &cpu_data(0);
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
323 "Efficeon processors.");
324MODULE_LICENSE("GPL");
325
326module_init(longrun_init);
327module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
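/*
 * Editorial sketch (assumption about calc_aperfmperf_ratio(), which is
 * defined elsewhere in the kernel, not in this file): the computation is
 * roughly
 *
 *	ratio = (aperf_delta << APERFMPERF_SHIFT) / mperf_delta;
 *	freq  = (max_freq * ratio) >> APERFMPERF_SHIFT;
 *
 * i.e. a fixed-point APERF/MPERF ratio scaled back out by APERFMPERF_SHIFT.
 */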
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index 52c93648e492..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/cpumask.h>
29#include <linux/timex.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timer.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
39 "p4-clockmod", msg)
40
41/*
42 * Duty Cycle (3 bits); note that DC_DISABLE is not specified in the
43 * Intel docs, it is just used here to mean disable
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) ||
63 (newstate > DC_DISABLE) || (newstate == DC_RESV))
64 return -EINVAL;
65
66 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
67
68 if (l & 0x01)
69 dprintk("CPU#%d currently thermal throttled\n", cpu);
70
71 if (has_N44_O17_errata[cpu] &&
72 (newstate == DC_25PT || newstate == DC_DFLT))
73 newstate = DC_38PT;
74
75 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
76 if (newstate == DC_DISABLE) {
77 dprintk("CPU#%d disabling modulation\n", cpu);
78 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
79 } else {
80 dprintk("CPU#%d setting duty cycle to %d%%\n",
81 cpu, ((125 * newstate) / 10));
82 /* bits 63 - 5 : reserved
83 * bit 4 : enable/disable
84 * bits 3-1 : duty cycle
85 * bit 0 : reserved
86 */
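		/* Editorial example: newstate == DC_50PT (4) prints a 50%
		 * duty cycle and writes (1 << 4) | (4 << 1) = 0x18 below. */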
87 l = (l & ~14);
88 l = l | (1<<4) | ((newstate & 0x7)<<1);
89 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
90 }
91
92 return 0;
93}
94
95
96static struct cpufreq_frequency_table p4clockmod_table[] = {
97 {DC_RESV, CPUFREQ_ENTRY_INVALID},
98 {DC_DFLT, 0},
99 {DC_25PT, 0},
100 {DC_38PT, 0},
101 {DC_50PT, 0},
102 {DC_64PT, 0},
103 {DC_75PT, 0},
104 {DC_88PT, 0},
105 {DC_DISABLE, 0},
106 {DC_RESV, CPUFREQ_TABLE_END},
107};
108
109
110static int cpufreq_p4_target(struct cpufreq_policy *policy,
111 unsigned int target_freq,
112 unsigned int relation)
113{
114 unsigned int newstate = DC_RESV;
115 struct cpufreq_freqs freqs;
116 int i;
117
118 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
119 target_freq, relation, &newstate))
120 return -EINVAL;
121
122 freqs.old = cpufreq_p4_get(policy->cpu);
123 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
124
125 if (freqs.new == freqs.old)
126 return 0;
127
128 /* notifiers */
129 for_each_cpu(i, policy->cpus) {
130 freqs.cpu = i;
131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
132 }
133
134 /* run on each logical CPU,
135 * see section 13.15.3 of IA32 Intel Architecture Software
136 * Developer's Manual, Volume 3
137 */
138 for_each_cpu(i, policy->cpus)
139 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
140
141 /* notifiers */
142 for_each_cpu(i, policy->cpus) {
143 freqs.cpu = i;
144 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
145 }
146
147 return 0;
148}
149
150
151static int cpufreq_p4_verify(struct cpufreq_policy *policy)
152{
153 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
154}
155
156
157static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{
159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST))
161 printk_once(KERN_WARNING PFX "Warning: EST-capable "
162 "CPU detected. The acpi-cpufreq module offers "
163 "voltage scaling in addition to frequency "
164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) {
167 case 0x0E: /* Core */
168 case 0x0F: /* Core Duo */
169 case 0x16: /* Celeron Core */
170 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */
174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
175 /* fall through */
176 case 0x09: /* Pentium M (Banias) */
177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
178 }
179 }
180
181 if (c->x86 != 0xF)
182 return 0;
183
184	/* on P-4s, the TSC runs with constant frequency independent of whether
185 * throttling is active or not. */
186 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
187
188 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
189 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
190 "The speedstep-ich or acpi cpufreq modules offer "
191			"voltage scaling in addition to frequency scaling. "
192 "You should use either one instead of p4-clockmod, "
193 "if possible.\n");
194 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
195 }
196
197 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
198}
199
200
201
202static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203{
204 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
205 int cpuid = 0;
206 unsigned int i;
207
208#ifdef CONFIG_SMP
209 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
210#endif
211
212 /* Errata workaround */
213 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
214 switch (cpuid) {
215 case 0x0f07:
216 case 0x0f0a:
217 case 0x0f11:
218 case 0x0f12:
219 has_N44_O17_errata[policy->cpu] = 1;
220 dprintk("has errata -- disabling low frequencies\n");
221 }
222
223 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
224 c->x86_model < 2) {
225 /* switch to maximum frequency and measure result */
226 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
227 recalibrate_cpu_khz();
228 }
229 /* get max frequency */
230 stock_freq = cpufreq_p4_get_frequency(c);
231 if (!stock_freq)
232 return -EINVAL;
233
234 /* table init */
235 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
236 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
237 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
238 else
239 p4clockmod_table[i].frequency = (stock_freq * i)/8;
240 }
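	/* Editorial example: with stock_freq = 2400000 kHz, entry i = 4
	 * (DC_50PT) becomes 2400000 * 4 / 8 = 1200000 kHz. */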
241 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
242
243 /* cpuinfo and default policy values */
244
245 /* the transition latency is set to be 1 higher than the maximum
246 * transition latency of the ondemand governor */
247 policy->cpuinfo.transition_latency = 10000001;
248 policy->cur = stock_freq;
249
250 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
251}
252
253
254static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
255{
256 cpufreq_frequency_table_put_attr(policy->cpu);
257 return 0;
258}
259
260static unsigned int cpufreq_p4_get(unsigned int cpu)
261{
262 u32 l, h;
263
264 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
265
266 if (l & 0x10) {
267 l = l >> 1;
268 l &= 0x7;
269 } else
270 l = DC_DISABLE;
271
272 if (l != DC_DISABLE)
273 return stock_freq * l / 8;
274
275 return stock_freq;
276}
277
278static struct freq_attr *p4clockmod_attr[] = {
279 &cpufreq_freq_attr_scaling_available_freqs,
280 NULL,
281};
282
283static struct cpufreq_driver p4clockmod_driver = {
284 .verify = cpufreq_p4_verify,
285 .target = cpufreq_p4_target,
286 .init = cpufreq_p4_cpu_init,
287 .exit = cpufreq_p4_cpu_exit,
288 .get = cpufreq_p4_get,
289 .name = "p4-clockmod",
290 .owner = THIS_MODULE,
291 .attr = p4clockmod_attr,
292};
293
294
295static int __init cpufreq_p4_init(void)
296{
297 struct cpuinfo_x86 *c = &cpu_data(0);
298 int ret;
299
300 /*
301 * THERM_CONTROL is architectural for IA32 now, so
302 * we can rely on the capability checks
303 */
304 if (c->x86_vendor != X86_VENDOR_INTEL)
305 return -ENODEV;
306
307 if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
308 !test_cpu_cap(c, X86_FEATURE_ACC))
309 return -ENODEV;
310
311 ret = cpufreq_register_driver(&p4clockmod_driver);
312 if (!ret)
313 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
314 "Modulation available\n");
315
316 return ret;
317}
318
319
320static void __exit cpufreq_p4_exit(void)
321{
322 cpufreq_unregister_driver(&p4clockmod_driver);
323}
324
325
326MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
327MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
328MODULE_LICENSE("GPL");
329
330late_initcall(cpufreq_p4_init);
331module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 755a31e0f5b0..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,624 +0,0 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
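	/* Editorial example: nominal = 2000 (MHz) and an output byte of
	 * 50 (%) give curr_freq = 2000 * 50 / 100 * 1000 = 1000000 kHz. */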
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return 0;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
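	/* Editorial example: target_freq = 1000000 kHz with nominal = 2000 MHz
	 * encodes (1000000 * 100) / (2000 * 1000) = 50 (%) into bits 15:8. */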
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275	}
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 in_params[0].type = ACPI_TYPE_BUFFER;
319 in_params[0].buffer.length = 16;
320 in_params[0].buffer.pointer = OSC_UUID;
321 in_params[1].type = ACPI_TYPE_INTEGER;
322 in_params[1].integer.value = 1;
323 in_params[2].type = ACPI_TYPE_INTEGER;
324 in_params[2].integer.value = 2;
325 in_params[3].type = ACPI_TYPE_BUFFER;
326 in_params[3].buffer.length = 8;
327 in_params[3].buffer.pointer = (u8 *)&capabilities;
328
329 capabilities[0] = OSC_QUERY_ENABLE;
330 capabilities[1] = 0x1;
331
332 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
333 if (ACPI_FAILURE(status))
334 return -ENODEV;
335
336 if (!output.length)
337 return -ENODEV;
338
339 out_obj = output.pointer;
340 if (out_obj->type != ACPI_TYPE_BUFFER) {
341 ret = -ENODEV;
342 goto out_free;
343 }
344
345 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
346 if (errors) {
347 ret = -ENODEV;
348 goto out_free;
349 }
350
351 supported = *((u32 *)(out_obj->buffer.pointer + 4));
352 if (!(supported & 0x1)) {
353 ret = -ENODEV;
354 goto out_free;
355 }
356
357 kfree(output.pointer);
358 capabilities[0] = 0x0;
359 capabilities[1] = 0x1;
360
361 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
362 if (ACPI_FAILURE(status))
363 return -ENODEV;
364
365 if (!output.length)
366 return -ENODEV;
367
368 out_obj = output.pointer;
369 if (out_obj->type != ACPI_TYPE_BUFFER) {
370 ret = -ENODEV;
371 goto out_free;
372 }
373
374 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
375 if (errors) {
376 ret = -ENODEV;
377 goto out_free;
378 }
379
380 supported = *((u32 *)(out_obj->buffer.pointer + 4));
381 if (!(supported & 0x1)) {
382 ret = -ENODEV;
383 goto out_free;
384 }
385
386out_free:
387 kfree(output.pointer);
388 return ret;
389}
390
391static int __init pcc_cpufreq_probe(void)
392{
393 acpi_status status;
394 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
395 struct pcc_memory_resource *mem_resource;
396 struct pcc_register_resource *reg_resource;
397 union acpi_object *out_obj, *member;
398 acpi_handle handle, osc_handle, pcch_handle;
399 int ret = 0;
400
401 status = acpi_get_handle(NULL, "\\_SB", &handle);
402 if (ACPI_FAILURE(status))
403 return -ENODEV;
404
405 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
406 if (ACPI_FAILURE(status))
407 return -ENODEV;
408
409 status = acpi_get_handle(handle, "_OSC", &osc_handle);
410 if (ACPI_SUCCESS(status)) {
411 ret = pcc_cpufreq_do_osc(&osc_handle);
412 if (ret)
413 dprintk("probe: _OSC evaluation did not succeed\n");
414 /* Firmware's use of _OSC is optional */
415 ret = 0;
416 }
417
418 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
419 if (ACPI_FAILURE(status))
420 return -ENODEV;
421
422 out_obj = output.pointer;
423 if (out_obj->type != ACPI_TYPE_PACKAGE) {
424 ret = -ENODEV;
425 goto out_free;
426 }
427
428 member = &out_obj->package.elements[0];
429 if (member->type != ACPI_TYPE_BUFFER) {
430 ret = -ENODEV;
431 goto out_free;
432 }
433
434 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
435
436 dprintk("probe: mem_resource descriptor: 0x%x,"
437 " length: %d, space_id: %d, resource_usage: %d,"
438 " type_specific: %d, granularity: 0x%llx,"
439 " minimum: 0x%llx, maximum: 0x%llx,"
440 " translation_offset: 0x%llx, address_length: 0x%llx\n",
441 mem_resource->descriptor, mem_resource->length,
442 mem_resource->space_id, mem_resource->resource_usage,
443 mem_resource->type_specific, mem_resource->granularity,
444 mem_resource->minimum, mem_resource->maximum,
445 mem_resource->translation_offset,
446 mem_resource->address_length);
447
448 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
449 ret = -ENODEV;
450 goto out_free;
451 }
452
453 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
454 mem_resource->address_length);
455 if (pcch_virt_addr == NULL) {
456 dprintk("probe: could not map shared mem region\n");
457 goto out_free;
458 }
459 pcch_hdr = pcch_virt_addr;
460
461 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
462 dprintk("probe: PCCH header is at physical address: 0x%llx,"
463 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
464 " supported features: 0x%x, command field: 0x%x,"
465 " status field: 0x%x, nominal latency: %d us\n",
466 mem_resource->minimum, ioread32(&pcch_hdr->signature),
467 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
468 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
469 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
470 ioread32(&pcch_hdr->latency));
471
472 dprintk("probe: min time between commands: %d us,"
473 " max time between commands: %d us,"
474 " nominal CPU frequency: %d MHz,"
475 " minimum CPU frequency: %d MHz,"
476 " minimum CPU frequency without throttling: %d MHz\n",
477 ioread32(&pcch_hdr->minimum_time),
478 ioread32(&pcch_hdr->maximum_time),
479 ioread32(&pcch_hdr->nominal),
480 ioread32(&pcch_hdr->throttled_frequency),
481 ioread32(&pcch_hdr->minimum_frequency));
482
483 member = &out_obj->package.elements[1];
484 if (member->type != ACPI_TYPE_BUFFER) {
485 ret = -ENODEV;
486 goto pcch_free;
487 }
488
489 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
490
491 doorbell.space_id = reg_resource->space_id;
492 doorbell.bit_width = reg_resource->bit_width;
493 doorbell.bit_offset = reg_resource->bit_offset;
494 doorbell.access_width = 64;
495 doorbell.address = reg_resource->address;
496
497 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
498 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
499 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
500 doorbell.access_width, reg_resource->address);
501
502 member = &out_obj->package.elements[2];
503 if (member->type != ACPI_TYPE_INTEGER) {
504 ret = -ENODEV;
505 goto pcch_free;
506 }
507
508 doorbell_preserve = member->integer.value;
509
510 member = &out_obj->package.elements[3];
511 if (member->type != ACPI_TYPE_INTEGER) {
512 ret = -ENODEV;
513 goto pcch_free;
514 }
515
516 doorbell_write = member->integer.value;
517
518 dprintk("probe: doorbell_preserve: 0x%llx,"
519 " doorbell_write: 0x%llx\n",
520 doorbell_preserve, doorbell_write);
521
522 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
523 if (!pcc_cpu_info) {
524 ret = -ENOMEM;
525 goto pcch_free;
526 }
527
528 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
529 " limits: %d MHz, %d MHz\n", PCC_VERSION,
530 ioread32(&pcch_hdr->minimum_frequency),
531 ioread32(&pcch_hdr->nominal));
532 kfree(output.pointer);
533 return ret;
534pcch_free:
535 pcc_clear_mapping();
536out_free:
537 kfree(output.pointer);
538 return ret;
539}
540
541static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
542{
543 unsigned int cpu = policy->cpu;
544 unsigned int result = 0;
545
546 if (!pcch_virt_addr) {
547 result = -1;
548 goto out;
549 }
550
551 result = pcc_get_offset(cpu);
552 if (result) {
553 dprintk("init: PCCP evaluation failed\n");
554 goto out;
555 }
556
557 policy->max = policy->cpuinfo.max_freq =
558 ioread32(&pcch_hdr->nominal) * 1000;
559 policy->min = policy->cpuinfo.min_freq =
560 ioread32(&pcch_hdr->minimum_frequency) * 1000;
561 policy->cur = pcc_get_freq(cpu);
562
563 if (!policy->cur) {
564 dprintk("init: Unable to get current CPU frequency\n");
565 result = -EINVAL;
566 goto out;
567 }
568
569 dprintk("init: policy->max is %d, policy->min is %d\n",
570 policy->max, policy->min);
571out:
572 return result;
573}
574
575static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
576{
577 return 0;
578}
579
580static struct cpufreq_driver pcc_cpufreq_driver = {
581 .flags = CPUFREQ_CONST_LOOPS,
582 .get = pcc_get_freq,
583 .verify = pcc_cpufreq_verify,
584 .target = pcc_cpufreq_target,
585 .init = pcc_cpufreq_cpu_init,
586 .exit = pcc_cpufreq_cpu_exit,
587 .name = "pcc-cpufreq",
588 .owner = THIS_MODULE,
589};
590
591static int __init pcc_cpufreq_init(void)
592{
593 int ret;
594
595 if (acpi_disabled)
596 return 0;
597
598 ret = pcc_cpufreq_probe();
599 if (ret) {
600 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
601 return ret;
602 }
603
604 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
605
606 return ret;
607}
608
609static void __exit pcc_cpufreq_exit(void)
610{
611 cpufreq_unregister_driver(&pcc_cpufreq_driver);
612
613 pcc_clear_mapping();
614
615 free_percpu(pcc_cpu_info);
616}
617
618MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
619MODULE_VERSION(PCC_VERSION);
620MODULE_DESCRIPTION("Processor Clocking Control interface driver");
621MODULE_LICENSE("GPL");
622
623late_initcall(pcc_cpufreq_init);
624module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/ioport.h>
16#include <linux/timex.h>
17#include <linux/io.h>
18
19#include <asm/msr.h>
20
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */
23
24#define PFX "powernow-k6: "
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is frequency of the Front-Side Bus multiplied with this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
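/*
 * Editorial note: busfreq is kept in units of 10 kHz and the table values
 * are the multiplier times ten, so busfreq * clock_ratio[i].index is already
 * a kHz value. E.g. a 100 MHz FSB (busfreq = 10000) at 4.5x (table value 45)
 * gives 10000 * 45 = 450000 kHz.
 */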
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state(unsigned int best_i)
71{
72 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR PFX "invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_setpolicy - sets a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency
124 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
125 *
126 * sets a new CPUFreq policy
127 */
128static int powernow_k6_target(struct cpufreq_policy *policy,
129 unsigned int target_freq,
130 unsigned int relation)
131{
132 unsigned int newstate = 0;
133
134 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
135 target_freq, relation, &newstate))
136 return -EINVAL;
137
138 powernow_k6_set_state(newstate);
139
140 return 0;
141}
142
143
144static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
145{
146 unsigned int i, f;
147 int result;
148
149 if (policy->cpu != 0)
150 return -ENODEV;
151
152 /* get frequencies */
153 max_multiplier = powernow_k6_get_cpu_multiplier();
154 busfreq = cpu_khz / max_multiplier;
155
156 /* table init */
157 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
158 f = clock_ratio[i].index;
159 if (f > max_multiplier)
160 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
161 else
162 clock_ratio[i].frequency = busfreq * f;
163 }
164
165 /* cpuinfo and default policy values */
166 policy->cpuinfo.transition_latency = 200000;
167 policy->cur = busfreq * max_multiplier;
168
169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
170 if (result)
171 return result;
172
173 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
174
175 return 0;
176}
177
178
179static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
180{
181 unsigned int i;
182 for (i = 0; i < 8; i++) {
183 if (i == max_multiplier)
184 powernow_k6_set_state(i);
185 }
186 cpufreq_frequency_table_put_attr(policy->cpu);
187 return 0;
188}
189
190static unsigned int powernow_k6_get(unsigned int cpu)
191{
192 unsigned int ret;
193 ret = (busfreq * powernow_k6_get_cpu_multiplier());
194 return ret;
195}
196
197static struct freq_attr *powernow_k6_attr[] = {
198 &cpufreq_freq_attr_scaling_available_freqs,
199 NULL,
200};
201
202static struct cpufreq_driver powernow_k6_driver = {
203 .verify = powernow_k6_verify,
204 .target = powernow_k6_target,
205 .init = powernow_k6_cpu_init,
206 .exit = powernow_k6_cpu_exit,
207 .get = powernow_k6_get,
208 .name = "powernow-k6",
209 .owner = THIS_MODULE,
210 .attr = powernow_k6_attr,
211};
212
213
214/**
215 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
216 *
217 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
218 * devices, -EINVAL or -ENOMEM on problems during initialization, and zero
219 * on success.
220 */
221static int __init powernow_k6_init(void)
222{
223 struct cpuinfo_x86 *c = &cpu_data(0);
224
225 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
226 ((c->x86_model != 12) && (c->x86_model != 13)))
227 return -ENODEV;
228
229 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
230 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
231 return -EIO;
232 }
233
234 if (cpufreq_register_driver(&powernow_k6_driver)) {
235 release_region(POWERNOW_IOPORT, 16);
236 return -EINVAL;
237 }
238
239 return 0;
240}
241
242
243/**
244 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
245 *
246 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
247 */
248static void __exit powernow_k6_exit(void)
249{
250 cpufreq_unregister_driver(&powernow_k6_driver);
251 release_region(POWERNOW_IOPORT, 16);
252}
253
254
255MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
256 "Dominik Brodowski <linux@brodo.de>");
257MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
258MODULE_LICENSE("GPL");
259
260module_init(powernow_k6_init);
261module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41ba..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5:
10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
15 */
16
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/init.h>
21#include <linux/cpufreq.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
29#include <asm/msr.h>
30#include <asm/system.h>
31
32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
33#include <linux/acpi.h>
34#include <acpi/processor.h>
35#endif
36
37#include "powernow-k7.h"
38
39#define PFX "powernow: "
40
41
42struct psb_s {
43 u8 signature[10];
44 u8 tableversion;
45 u8 flags;
46 u16 settlingtime;
47 u8 reserved1;
48 u8 numpst;
49};
50
51struct pst_s {
52 u32 cpuid;
53 u8 fsbspeed;
54 u8 maxfid;
55 u8 startvid;
56 u8 numpstates;
57};
58
59#ifdef CONFIG_X86_POWERNOW_K7_ACPI
60union powernow_acpi_control_t {
61 struct {
62 unsigned long fid:5,
63 vid:5,
64 sgtc:20,
65 res1:2;
66 } bits;
67 unsigned long val;
68};
69#endif
70
71#ifdef CONFIG_CPU_FREQ_DEBUG
72/* divide by 1000 to get VCore voltage in V. */
73static const int mobile_vid_table[32] = {
74 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
75 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
76 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
77 1075, 1050, 1025, 1000, 975, 950, 925, 0,
78};
79#endif
80
81/* divide by 10 to get FID. */
82static const int fid_codes[32] = {
83 110, 115, 120, 125, 50, 55, 60, 65,
84 70, 75, 80, 85, 90, 95, 100, 105,
85 30, 190, 40, 200, 130, 135, 140, 210,
86 150, 225, 160, 165, 170, 180, -1, -1,
87};
88
89/* This parameter is used to force ACPI instead of the legacy method for
90 * configuration purposes.
91 */
92
93static int acpi_force;
94
95static struct cpufreq_frequency_table *powernow_table;
96
97static unsigned int can_scale_bus;
98static unsigned int can_scale_vid;
99static unsigned int minimum_speed = -1;
100static unsigned int maximum_speed;
101static unsigned int number_scales;
102static unsigned int fsb;
103static unsigned int latency;
104static char have_a0;
105
106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
108
109static int check_fsb(unsigned int fsbspeed)
110{
111 int delta;
112 unsigned int f = fsb / 1000;
113
114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
115 return delta < 5;
116}
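/*
 * Illustrative only, not part of the original driver: with a measured
 * fsb of 100230 kHz, f = 100 MHz; a PST-reported fsbspeed of 100 MHz
 * gives delta = 0 (< 5, accepted), while 133 MHz gives delta = 33 and
 * the entry is rejected.
 */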
117
118static int check_powernow(void)
119{
120 struct cpuinfo_x86 *c = &cpu_data(0);
121 unsigned int maxei, eax, ebx, ecx, edx;
122
123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
124#ifdef MODULE
125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
127#endif
128 return 0;
129 }
130
131 /* Get maximum capabilities */
132 maxei = cpuid_eax(0x80000000);
133 if (maxei < 0x80000007) { /* Any powernow info ? */
134#ifdef MODULE
135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
136#endif
137 return 0;
138 }
139
140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
143 have_a0 = 1;
144 }
145
146 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
147
148 /* Check we can actually do something before we say anything. */
149 if (!(edx & (1 << 1 | 1 << 2)))
150 return 0;
151
152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
153
154 if (edx & 1 << 1) {
155 printk("frequency");
156 can_scale_bus = 1;
157 }
158
159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
160 printk(" and ");
161
162 if (edx & 1 << 2) {
163 printk("voltage");
164 can_scale_vid = 1;
165 }
166
167 printk(".\n");
168 return 1;
169}
170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
172static void invalidate_entry(unsigned int entry)
173{
174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
175}
176#endif
177
178static int get_ranges(unsigned char *pst)
179{
180 unsigned int j;
181 unsigned int speed;
182 u8 fid, vid;
183
184 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
185 (number_scales + 1)), GFP_KERNEL);
186 if (!powernow_table)
187 return -ENOMEM;
188
189 for (j = 0 ; j < number_scales; j++) {
190 fid = *pst++;
191
192 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
193 powernow_table[j].index = fid; /* lower 8 bits */
194
195 speed = powernow_table[j].frequency;
196
197 if ((fid_codes[fid] % 10) == 5) {
198#ifdef CONFIG_X86_POWERNOW_K7_ACPI
199 if (have_a0 == 1)
200 invalidate_entry(j);
201#endif
202 }
203
204 if (speed < minimum_speed)
205 minimum_speed = speed;
206 if (speed > maximum_speed)
207 maximum_speed = speed;
208
209 vid = *pst++;
210 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
211
212 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
213 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
214 fid_codes[fid] % 10, speed/1000, vid,
215 mobile_vid_table[vid]/1000,
216 mobile_vid_table[vid]%1000);
217 }
218 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
219 powernow_table[number_scales].index = 0;
220
221 return 0;
222}
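/*
 * Sketch of the index packing used above (illustrative, assuming the
 * 16-bit layout the driver relies on: fid in the low byte, vid in the
 * high byte):
 *
 *	index = fid | (vid << 8);
 *	fid   = index & 0xFF;
 *	vid   = (index & 0xFF00) >> 8;
 *
 * change_speed() below unpacks the value exactly this way.
 */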
223
224
225static void change_FID(int fid)
226{
227 union msr_fidvidctl fidvidctl;
228
229 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
230 if (fidvidctl.bits.FID != fid) {
231 fidvidctl.bits.SGTC = latency;
232 fidvidctl.bits.FID = fid;
233 fidvidctl.bits.VIDC = 0;
234 fidvidctl.bits.FIDC = 1;
235 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
236 }
237}
238
239
240static void change_VID(int vid)
241{
242 union msr_fidvidctl fidvidctl;
243
244 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 }
252}
253
254
255static void change_speed(unsigned int index)
256{
257 u8 fid, vid;
258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus;
260 int cfid;
261
262 /* fid are the lower 8 bits of the index we stored into
263 * the cpufreq frequency table in powernow_decode_bios,
264 * vid are the upper 8 bits.
265 */
266
267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269
270 freqs.cpu = 0;
271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10;
275
276 freqs.new = powernow_table[index].frequency;
277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279
280 /* Now do the magic poking into the MSRs. */
281
282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable();
284
285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */
287 change_FID(fid);
288 change_VID(vid);
289 } else {
290 /* Going up, so change VID first */
291 change_VID(vid);
292 change_FID(fid);
293 }
294
295
296 if (have_a0 == 1)
297 local_irq_enable();
298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300}
301
302
303#ifdef CONFIG_X86_POWERNOW_K7_ACPI
304
305static struct acpi_processor_performance *acpi_processor_perf;
306
307static int powernow_acpi_init(void)
308{
309 int i;
310 int retval = 0;
311 union powernow_acpi_control_t pc;
312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL;
315 goto err0;
316 }
317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL);
320 if (!acpi_processor_perf) {
321 retval = -ENOMEM;
322 goto err0;
323 }
324
325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) {
327 retval = -ENOMEM;
328 goto err05;
329 }
330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO;
333 goto err1;
334 }
335
336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV;
339 goto err2;
340 }
341
342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV;
345 goto err2;
346 }
347
348 number_scales = acpi_processor_perf->state_count;
349
350 if (number_scales < 2) {
351 retval = -ENODEV;
352 goto err2;
353 }
354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) {
358 retval = -ENOMEM;
359 goto err2;
360 }
361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid;
365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz;
368
369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i,
372 (u32) state->core_frequency,
373 (u32) state->power,
374 (u32) state->transition_latency,
375 (u32) state->control,
376 pc.bits.sgtc);
377
378 vid = pc.bits.vid;
379 fid = pc.bits.fid;
380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384
385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000;
387
388 /* processor_perflib will multiply the MHz value by 1000 to
389 * get a KHz value (e.g. 1266000). However, powernow-k7 works
390 * with true KHz values (e.g. 1266768). To ensure that all
391 * powernow frequencies are available, we must ensure that
392 * ACPI doesn't restrict them, so we round up the MHz value
393 * to ensure that perflib's computed KHz value is greater than
394 * or equal to powernow's KHz value.
395 */
396 if (speed % 1000 > 0)
397 speed_mhz++;
398
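 /*
 * Worked example with a hypothetical entry: speed = 1266768 kHz gives
 * speed_mhz = 1266, rounded up to 1267; perflib then reports
 * 1267000 kHz, which is >= 1266768 kHz, so the powernow frequency is
 * not filtered out.
 */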
399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1)
401 invalidate_entry(i);
402 }
403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000);
409
410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz);
414 }
415
416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc;
418
419 if (speed < minimum_speed)
420 minimum_speed = speed;
421 if (speed > maximum_speed)
422 maximum_speed = speed;
423 }
424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0;
427
428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE);
430
431 return 0;
432
433err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437err05:
438 kfree(acpi_processor_perf);
439err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n");
442 acpi_processor_perf = NULL;
443 return retval;
444}
445#else
446static int powernow_acpi_init(void)
447{
448 printk(KERN_INFO PFX "no support for ACPI processor found."
449 " Please recompile your kernel with ACPI processor\n");
450 return -EINVAL;
451}
452#endif
453
454static void print_pst_entry(struct pst_s *pst, unsigned int j)
455{
456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459}
460
461static int powernow_decode_bios(int maxfid, int startvid)
462{
463 struct psb_s *psb;
464 struct pst_s *pst;
465 unsigned int i, j;
466 unsigned char *p;
467 unsigned int etuple;
468 unsigned int ret;
469
470 etuple = cpuid_eax(0x80000001);
471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473
474 p = phys_to_virt(i);
475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n");
483 return -ENODEV;
484 }
485
486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n");
489 else
490 dprintk("Desktop voltage regulator\n");
491
492 latency = psb->settlingtime;
493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. "
496 "Should be at least 100. "
497 "Correcting.\n", latency);
498 latency = 100;
499 }
500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n",
504 psb->numpst);
505
506 p += sizeof(struct psb_s);
507
508 pst = (struct pst_s *) p;
509
510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates;
513
514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p);
521 return ret;
522 } else {
523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++)
526 p += 2;
527 }
528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n");
533
534 return -EINVAL;
535 }
536 p++;
537 }
538
539 return -ENODEV;
540}
541
542
543static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq,
545 unsigned int relation)
546{
547 unsigned int newstate;
548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate))
551 return -EINVAL;
552
553 change_speed(newstate);
554
555 return 0;
556}
557
558
559static int powernow_verify(struct cpufreq_policy *policy)
560{
561 return cpufreq_frequency_table_verify(policy, powernow_table);
562}
563
564/*
565 * We use the fact that the bus frequency is somehow
566 * a multiple of 100000/3 kHz, and compute sgtc according
567 * to this multiple.
568 * That way we more closely match how AMD intends this to work,
569 * and we get the same kind of behaviour already tested under
570 * the "well-known" other OS.
571 */
572static int __cpuinit fixup_sgtc(void)
573{
574 unsigned int sgtc;
575 unsigned int m;
576
577 m = fsb / 3333;
578 if ((m % 10) >= 5)
579 m += 5;
580
581 m /= 10;
582
583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff;
588 }
589 return sgtc;
590}
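/*
 * Worked example with hypothetical values: fsb = 100000 kHz and a BIOS
 * settling time of latency = 100 give m = 100000 / 3333 = 30, reduced
 * to 3 after the rounding and the division by 10, so
 * sgtc = (100 * 3 * 100) / 3 = 10000, comfortably below the 0xfffff cap.
 */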
591
592static unsigned int powernow_get(unsigned int cpu)
593{
594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid;
596
597 if (cpu)
598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID;
601
602 return fsb * fid_codes[cfid] / 10;
603}
604
605
606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{
608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident);
611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0;
617}
618
619/*
620 * Some Athlon laptops have hopelessly broken PST tables.
621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq.
623 */
624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 {
626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 },
632 },
633 { }
634};
635
636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{
638 union msr_fidvidstatus fidvidstatus;
639 int result;
640
641 if (policy->cpu != 0)
642 return -ENODEV;
643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645
646 recalibrate_cpu_khz();
647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL;
652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000);
654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n");
658 result = powernow_acpi_init();
659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID);
662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0;
665 minimum_speed = -1;
666 latency = 0;
667 result = powernow_acpi_init();
668 if (result) {
669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n");
671 }
672 } else {
673 /* SGTC uses the bus clock as its timer */
674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 }
677 }
678
679 if (result)
680 return result;
681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000);
684
685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency);
687
688 policy->cur = powernow_get(0);
689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693}
694
695static int powernow_cpu_exit(struct cpufreq_policy *policy)
696{
697 cpufreq_frequency_table_put_attr(policy->cpu);
698
699#ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf);
704 }
705#endif
706
707 kfree(powernow_table);
708 return 0;
709}
710
711static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL,
714};
715
716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify,
718 .target = powernow_target,
719 .get = powernow_get,
720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .bios_limit = acpi_processor_get_bios_limit,
722#endif
723 .init = powernow_cpu_init,
724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
728};
729
730static int __init powernow_init(void)
731{
732 if (check_powernow() == 0)
733 return -ENODEV;
734 return cpufreq_register_driver(&powernow_driver);
735}
736
737
738static void __exit powernow_exit(void)
739{
740 cpufreq_unregister_driver(&powernow_driver);
741}
742
743module_param(acpi_force, int, 0444);
744MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
745
746MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
747MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
748MODULE_LICENSE("GPL");
749
750late_initcall(powernow_init);
751module_exit(powernow_exit);
752
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * (C) 2003 Dave Jones.
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * AMD-specific information
7 *
8 */
9
10union msr_fidvidctl {
11 struct {
12 unsigned FID:5, // 4:0
13 reserved1:3, // 7:5
14 VID:5, // 12:8
15 reserved2:3, // 15:13
16 FIDC:1, // 16
17 VIDC:1, // 17
18 reserved3:2, // 19:18
19 FIDCHGRATIO:1, // 20
20 reserved4:11, // 31:21
21 SGTC:20, // 51:32
22 reserved5:12; // 63:52
23 } bits;
24 unsigned long long val;
25};
26
27union msr_fidvidstatus {
28 struct {
29 unsigned CFID:5, // 4:0
30 reserved1:3, // 7:5
31 SFID:5, // 12:8
32 reserved2:3, // 15:13
33 MFID:5, // 20:16
34 reserved3:11, // 31:21
35 CVID:5, // 36:32
36 reserved4:3, // 39:37
37 SVID:5, // 44:40
38 reserved5:3, // 47:45
39 MVID:5, // 52:48
40 reserved6:11; // 63:53
41 } bits;
42 unsigned long long val;
43};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 2368e38327b3..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1607 +0,0 @@
1/*
2 * (c) 2003-2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
38
39#include <asm/msr.h>
40
41#include <linux/acpi.h>
42#include <linux/mutex.h>
43#include <acpi/processor.h>
44
45#define PFX "powernow-k8: "
46#define VERSION "version 2.20.00"
47#include "powernow-k8.h"
48#include "mperf.h"
49
50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex);
52
53static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54
55static int cpu_family = CPU_OPTERON;
56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
63#ifndef CONFIG_SMP
64static inline const struct cpumask *cpu_core_mask(int cpu)
65{
66 return cpumask_of(0);
67}
68#endif
69
70/* Return a frequency in MHz, given an input fid */
71static u32 find_freq_from_fid(u32 fid)
72{
73 return 800 + (fid * 100);
74}
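/* For example, fid 0 maps to 800 MHz and fid 14 to 800 + 1400 = 2200 MHz. */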
75
76/* Return a frequency in KHz, given an input fid */
77static u32 find_khz_freq_from_fid(u32 fid)
78{
79 return 1000 * find_freq_from_fid(fid);
80}
81
82static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
83 u32 pstate)
84{
85 return data[pstate].frequency;
86}
87
88/* Return the vco fid for an input fid
89 *
90 * Each "low" fid has corresponding "high" fid, and you can get to "low" fids
91 * only from corresponding high fids. This returns "high" fid corresponding to
92 * "low" one.
93 */
94static u32 convert_fid_to_vco_fid(u32 fid)
95{
96 if (fid < HI_FID_TABLE_BOTTOM)
97 return 8 + (2 * fid);
98 else
99 return fid;
100}
101
102/*
103 * Return 1 if the pending bit is set. Unless we just instructed the processor
104 * to transition to a new state, seeing this bit set is really bad news.
105 */
106static int pending_bit_stuck(void)
107{
108 u32 lo, hi;
109
110 if (cpu_family == CPU_HW_PSTATE)
111 return 0;
112
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
115}
116
117/*
118 * Update the global current fid / vid values from the status msr.
119 * Returns 1 on error.
120 */
121static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
122{
123 u32 lo, hi;
124 u32 i = 0;
125
126 if (cpu_family == CPU_HW_PSTATE) {
127 rdmsr(MSR_PSTATE_STATUS, lo, hi);
128 i = lo & HW_PSTATE_MASK;
129 data->currpstate = i;
130
131 /*
132 * a workaround for family 11h erratum 311 might cause
133 * an "out-of-range Pstate" if the core is in Pstate-0
134 */
135 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
136 data->currpstate = HW_PSTATE_0;
137
138 return 0;
139 }
140 do {
141 if (i++ > 10000) {
142 dprintk("detected change pending stuck\n");
143 return 1;
144 }
145 rdmsr(MSR_FIDVID_STATUS, lo, hi);
146 } while (lo & MSR_S_LO_CHANGE_PENDING);
147
148 data->currvid = hi & MSR_S_HI_CURRENT_VID;
149 data->currfid = lo & MSR_S_LO_CURRENT_FID;
150
151 return 0;
152}
153
154/* the isochronous relief time */
155static void count_off_irt(struct powernow_k8_data *data)
156{
157 udelay((1 << data->irt) * 10);
158 return;
159}
160
161/* the voltage stabilization time */
162static void count_off_vst(struct powernow_k8_data *data)
163{
164 udelay(data->vstable * VST_UNITS_20US);
165 return;
166}
167
168/* need to init the control msr to a safe value (for each cpu) */
169static void fidvid_msr_init(void)
170{
171 u32 lo, hi;
172 u8 fid, vid;
173
174 rdmsr(MSR_FIDVID_STATUS, lo, hi);
175 vid = hi & MSR_S_HI_CURRENT_VID;
176 fid = lo & MSR_S_LO_CURRENT_FID;
177 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
178 hi = MSR_C_HI_STP_GNT_BENIGN;
179 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
180 wrmsr(MSR_FIDVID_CTL, lo, hi);
181}
182
183/* write the new fid value along with the other control fields to the msr */
184static int write_new_fid(struct powernow_k8_data *data, u32 fid)
185{
186 u32 lo;
187 u32 savevid = data->currvid;
188 u32 i = 0;
189
190 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
191 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
192 return 1;
193 }
194
195 lo = fid;
196 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
197 lo |= MSR_C_LO_INIT_FID_VID;
198
199 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
200 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
201
202 do {
203 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
204 if (i++ > 100) {
205 printk(KERN_ERR PFX
206 "Hardware error - pending bit very stuck - "
207 "no further pstate changes possible\n");
208 return 1;
209 }
210 } while (query_current_values_with_pending_wait(data));
211
212 count_off_irt(data);
213
214 if (savevid != data->currvid) {
215 printk(KERN_ERR PFX
216 "vid change on fid trans, old 0x%x, new 0x%x\n",
217 savevid, data->currvid);
218 return 1;
219 }
220
221 if (fid != data->currfid) {
222 printk(KERN_ERR PFX
223 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
224 data->currfid);
225 return 1;
226 }
227
228 return 0;
229}
230
231/* Write a new vid to the hardware */
232static int write_new_vid(struct powernow_k8_data *data, u32 vid)
233{
234 u32 lo;
235 u32 savefid = data->currfid;
236 int i = 0;
237
238 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
239 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
240 return 1;
241 }
242
243 lo = data->currfid;
244 lo |= (vid << MSR_C_LO_VID_SHIFT);
245 lo |= MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit "
254 "very stuck - no further pstate "
255 "changes possible\n");
256 return 1;
257 }
258 } while (query_current_values_with_pending_wait(data));
259
260 if (savefid != data->currfid) {
261 printk(KERN_ERR PFX "fid changed on vid trans, old "
262 "0x%x new 0x%x\n",
263 savefid, data->currfid);
264 return 1;
265 }
266
267 if (vid != data->currvid) {
268 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
269 "curr 0x%x\n",
270 vid, data->currvid);
271 return 1;
272 }
273
274 return 0;
275}
276
277/*
278 * Reduce the vid towards reqvid, stepping down by at most "step" vid codes per call.
279 * Decreasing vid codes represent increasing voltages:
280 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
281 */
282static int decrease_vid_code_by_step(struct powernow_k8_data *data,
283 u32 reqvid, u32 step)
284{
285 if ((data->currvid - reqvid) > step)
286 reqvid = data->currvid - step;
287
288 if (write_new_vid(data, reqvid))
289 return 1;
290
291 count_off_vst(data);
292
293 return 0;
294}
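/*
 * Illustrative example with hypothetical vid codes: currvid = 0x12,
 * reqvid = 0x08 and step = 2 give a difference of 0x0a > step, so only
 * 0x10 is written on this call; the caller keeps looping until the
 * requested vid is reached.
 */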
295
296/* Change hardware pstate by single MSR write */
297static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
298{
299 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
300 data->currpstate = pstate;
301 return 0;
302}
303
304/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
305static int transition_fid_vid(struct powernow_k8_data *data,
306 u32 reqfid, u32 reqvid)
307{
308 if (core_voltage_pre_transition(data, reqvid, reqfid))
309 return 1;
310
311 if (core_frequency_transition(data, reqfid))
312 return 1;
313
314 if (core_voltage_post_transition(data, reqvid))
315 return 1;
316
317 if (query_current_values_with_pending_wait(data))
318 return 1;
319
320 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
321 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
322 "curr 0x%x 0x%x\n",
323 smp_processor_id(),
324 reqfid, reqvid, data->currfid, data->currvid);
325 return 1;
326 }
327
328 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
329 smp_processor_id(), data->currfid, data->currvid);
330
331 return 0;
332}
333
334/* Phase 1 - core voltage transition ... setup voltage */
335static int core_voltage_pre_transition(struct powernow_k8_data *data,
336 u32 reqvid, u32 reqfid)
337{
338 u32 rvosteps = data->rvo;
339 u32 savefid = data->currfid;
340 u32 maxvid, lo, rvomult = 1;
341
342 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
343 "reqvid 0x%x, rvo 0x%x\n",
344 smp_processor_id(),
345 data->currfid, data->currvid, reqvid, data->rvo);
346
347 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
348 rvomult = 2;
349 rvosteps *= rvomult;
350 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
351 maxvid = 0x1f & (maxvid >> 16);
352 dprintk("ph1 maxvid=0x%x\n", maxvid);
353 if (reqvid < maxvid) /* lower numbers are higher voltages */
354 reqvid = maxvid;
355
356 while (data->currvid > reqvid) {
357 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
358 data->currvid, reqvid);
359 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
360 return 1;
361 }
362
363 while ((rvosteps > 0) &&
364 ((rvomult * data->rvo + data->currvid) > reqvid)) {
365 if (data->currvid == maxvid) {
366 rvosteps = 0;
367 } else {
368 dprintk("ph1: changing vid for rvo, req 0x%x\n",
369 data->currvid - 1);
370 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
371 return 1;
372 rvosteps--;
373 }
374 }
375
376 if (query_current_values_with_pending_wait(data))
377 return 1;
378
379 if (savefid != data->currfid) {
380 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
381 data->currfid);
382 return 1;
383 }
384
385 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
386 data->currfid, data->currvid);
387
388 return 0;
389}
390
391/* Phase 2 - core frequency transition */
392static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
393{
394 u32 vcoreqfid, vcocurrfid, vcofiddiff;
395 u32 fid_interval, savevid = data->currvid;
396
397 if (data->currfid == reqfid) {
398 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
399 data->currfid);
400 return 0;
401 }
402
403 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
404 "reqfid 0x%x\n",
405 smp_processor_id(),
406 data->currfid, data->currvid, reqfid);
407
408 vcoreqfid = convert_fid_to_vco_fid(reqfid);
409 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
410 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
411 : vcoreqfid - vcocurrfid;
412
413 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
414 vcofiddiff = 0;
415
416 while (vcofiddiff > 2) {
417 fid_interval = (data->currfid & 1) ? 1 : 2;
418
419 if (reqfid > data->currfid) {
420 if (data->currfid > LO_FID_TABLE_TOP) {
421 if (write_new_fid(data,
422 data->currfid + fid_interval))
423 return 1;
424 } else {
425 if (write_new_fid
426 (data,
427 2 + convert_fid_to_vco_fid(data->currfid)))
428 return 1;
429 }
430 } else {
431 if (write_new_fid(data, data->currfid - fid_interval))
432 return 1;
433 }
434
435 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
436 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
437 : vcoreqfid - vcocurrfid;
438 }
439
440 if (write_new_fid(data, reqfid))
441 return 1;
442
443 if (query_current_values_with_pending_wait(data))
444 return 1;
445
446 if (data->currfid != reqfid) {
447 printk(KERN_ERR PFX
448 "ph2: mismatch, failed fid transition, "
449 "curr 0x%x, req 0x%x\n",
450 data->currfid, reqfid);
451 return 1;
452 }
453
454 if (savevid != data->currvid) {
455 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
456 savevid, data->currvid);
457 return 1;
458 }
459
460 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
461 data->currfid, data->currvid);
462
463 return 0;
464}
465
466/* Phase 3 - core voltage transition flow ... jump to the final vid. */
467static int core_voltage_post_transition(struct powernow_k8_data *data,
468 u32 reqvid)
469{
470 u32 savefid = data->currfid;
471 u32 savereqvid = reqvid;
472
473 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
474 smp_processor_id(),
475 data->currfid, data->currvid);
476
477 if (reqvid != data->currvid) {
478 if (write_new_vid(data, reqvid))
479 return 1;
480
481 if (savefid != data->currfid) {
482 printk(KERN_ERR PFX
483 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
484 savefid, data->currfid);
485 return 1;
486 }
487
488 if (data->currvid != reqvid) {
489 printk(KERN_ERR PFX
490 "ph3: failed vid transition, "
491 "req 0x%x, curr 0x%x\n",
492 reqvid, data->currvid);
493 return 1;
494 }
495 }
496
497 if (query_current_values_with_pending_wait(data))
498 return 1;
499
500 if (savereqvid != data->currvid) {
501 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
502 return 1;
503 }
504
505 if (savefid != data->currfid) {
506 dprintk("ph3 failed, currfid changed 0x%x\n",
507 data->currfid);
508 return 1;
509 }
510
511 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
512 data->currfid, data->currvid);
513
514 return 0;
515}
516
517static void check_supported_cpu(void *_rc)
518{
519 u32 eax, ebx, ecx, edx;
520 int *rc = _rc;
521
522 *rc = -ENODEV;
523
524 if (__this_cpu_read(cpu_info.x86_vendor) != X86_VENDOR_AMD)
525 return;
526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
528 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
529 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
530 return;
531
532 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
533 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
534 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
535 printk(KERN_INFO PFX
536 "Processor cpuid %x not supported\n", eax);
537 return;
538 }
539
540 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
541 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
542 printk(KERN_INFO PFX
543 "No frequency change capabilities detected\n");
544 return;
545 }
546
547 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
548 if ((edx & P_STATE_TRANSITION_CAPABLE)
549 != P_STATE_TRANSITION_CAPABLE) {
550 printk(KERN_INFO PFX
551 "Power state transitions not supported\n");
552 return;
553 }
554 } else { /* must be a HW Pstate capable processor */
555 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
556 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
557 cpu_family = CPU_HW_PSTATE;
558 else
559 return;
560 }
561
562 *rc = 0;
563}
564
565static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
566 u8 maxvid)
567{
568 unsigned int j;
569 u8 lastfid = 0xff;
570
571 for (j = 0; j < data->numps; j++) {
572 if (pst[j].vid > LEAST_VID) {
573 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
574 j, pst[j].vid);
575 return -EINVAL;
576 }
577 if (pst[j].vid < data->rvo) {
578 /* vid + rvo >= 0 */
579 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
580 " %d\n", j);
581 return -ENODEV;
582 }
583 if (pst[j].vid < maxvid + data->rvo) {
584 /* vid + rvo >= maxvid */
585 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
586 " %d\n", j);
587 return -ENODEV;
588 }
589 if (pst[j].fid > MAX_FID) {
590 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
591 " %d\n", j);
592 return -ENODEV;
593 }
594 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
595 /* Only first fid is allowed to be in "low" range */
596 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
597 "0x%x\n", j, pst[j].fid);
598 return -EINVAL;
599 }
600 if (pst[j].fid < lastfid)
601 lastfid = pst[j].fid;
602 }
603 if (lastfid & 1) {
604 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
605 return -EINVAL;
606 }
607 if (lastfid > LO_FID_TABLE_TOP)
608 printk(KERN_INFO FW_BUG PFX
609 "first fid not from lo freq table\n");
610
611 return 0;
612}
613
614static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
615 unsigned int entry)
616{
617 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
618}
619
620static void print_basics(struct powernow_k8_data *data)
621{
622 int j;
623 for (j = 0; j < data->numps; j++) {
624 if (data->powernow_table[j].frequency !=
625 CPUFREQ_ENTRY_INVALID) {
626 if (cpu_family == CPU_HW_PSTATE) {
627 printk(KERN_INFO PFX
628 " %d : pstate %d (%d MHz)\n", j,
629 data->powernow_table[j].index,
630 data->powernow_table[j].frequency/1000);
631 } else {
632 printk(KERN_INFO PFX
633 "fid 0x%x (%d MHz), vid 0x%x\n",
634 data->powernow_table[j].index & 0xff,
635 data->powernow_table[j].frequency/1000,
636 data->powernow_table[j].index >> 8);
637 }
638 }
639 }
640 if (data->batps)
641 printk(KERN_INFO PFX "Only %d pstates on battery\n",
642 data->batps);
643}
644
645static u32 freq_from_fid_did(u32 fid, u32 did)
646{
647 u32 mhz = 0;
648
649 if (boot_cpu_data.x86 == 0x10)
650 mhz = (100 * (fid + 0x10)) >> did;
651 else if (boot_cpu_data.x86 == 0x11)
652 mhz = (100 * (fid + 8)) >> did;
653 else
654 BUG();
655
656 return mhz * 1000;
657}
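/*
 * Worked example with hypothetical MSR fields: on family 0x10, fid = 6
 * and did = 0 give (100 * (6 + 0x10)) >> 0 = 2200 MHz, i.e. 2200000 kHz;
 * did = 1 would halve that to 1100000 kHz.
 */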
658
659static int fill_powernow_table(struct powernow_k8_data *data,
660 struct pst_s *pst, u8 maxvid)
661{
662 struct cpufreq_frequency_table *powernow_table;
663 unsigned int j;
664
665 if (data->batps) {
666 /* use ACPI support to get full speed on mains power */
667 printk(KERN_WARNING PFX
668 "Only %d pstates usable (use ACPI driver for full "
669 "range\n", data->batps);
670 data->numps = data->batps;
671 }
672
673 for (j = 1; j < data->numps; j++) {
674 if (pst[j-1].fid >= pst[j].fid) {
675 printk(KERN_ERR PFX "PST out of sequence\n");
676 return -EINVAL;
677 }
678 }
679
680 if (data->numps < 2) {
681 printk(KERN_ERR PFX "no p states to transition\n");
682 return -ENODEV;
683 }
684
685 if (check_pst_table(data, pst, maxvid))
686 return -EINVAL;
687
688 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
689 * (data->numps + 1)), GFP_KERNEL);
690 if (!powernow_table) {
691 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
692 return -ENOMEM;
693 }
694
695 for (j = 0; j < data->numps; j++) {
696 int freq;
697 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
698 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
699 freq = find_khz_freq_from_fid(pst[j].fid);
700 powernow_table[j].frequency = freq;
701 }
702 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
703 powernow_table[data->numps].index = 0;
704
705 if (query_current_values_with_pending_wait(data)) {
706 kfree(powernow_table);
707 return -EIO;
708 }
709
710 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
711 data->powernow_table = powernow_table;
712 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
713 print_basics(data);
714
715 for (j = 0; j < data->numps; j++)
716 if ((pst[j].fid == data->currfid) &&
717 (pst[j].vid == data->currvid))
718 return 0;
719
720 dprintk("currfid/vid do not match PST, ignoring\n");
721 return 0;
722}
723
724/* Find and validate the PSB/PST table in BIOS. */
725static int find_psb_table(struct powernow_k8_data *data)
726{
727 struct psb_s *psb;
728 unsigned int i;
729 u32 mvs;
730 u8 maxvid;
731 u32 cpst = 0;
732 u32 thiscpuid;
733
734 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
735 /* Scan BIOS looking for the signature. */
736 /* It can not be at ffff0 - it is too big. */
737
738 psb = phys_to_virt(i);
739 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
740 continue;
741
742 dprintk("found PSB header at 0x%p\n", psb);
743
744 dprintk("table vers: 0x%x\n", psb->tableversion);
745 if (psb->tableversion != PSB_VERSION_1_4) {
746 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
747 return -ENODEV;
748 }
749
750 dprintk("flags: 0x%x\n", psb->flags1);
751 if (psb->flags1) {
752 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
753 return -ENODEV;
754 }
755
756 data->vstable = psb->vstable;
757 dprintk("voltage stabilization time: %d(*20us)\n",
758 data->vstable);
759
760 dprintk("flags2: 0x%x\n", psb->flags2);
761 data->rvo = psb->flags2 & 3;
762 data->irt = ((psb->flags2) >> 2) & 3;
763 mvs = ((psb->flags2) >> 4) & 3;
764 data->vidmvs = 1 << mvs;
765 data->batps = ((psb->flags2) >> 6) & 3;
766
767 dprintk("ramp voltage offset: %d\n", data->rvo);
768 dprintk("isochronous relief time: %d\n", data->irt);
769 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
770
771 dprintk("numpst: 0x%x\n", psb->num_tables);
772 cpst = psb->num_tables;
773 if ((psb->cpuid == 0x00000fc0) ||
774 (psb->cpuid == 0x00000fe0)) {
775 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
776 if ((thiscpuid == 0x00000fc0) ||
777 (thiscpuid == 0x00000fe0))
778 cpst = 1;
779 }
780 if (cpst != 1) {
781 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
782 return -ENODEV;
783 }
784
785 data->plllock = psb->plllocktime;
786 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
787 dprintk("maxfid: 0x%x\n", psb->maxfid);
788 dprintk("maxvid: 0x%x\n", psb->maxvid);
789 maxvid = psb->maxvid;
790
791 data->numps = psb->numps;
792 dprintk("numpstates: 0x%x\n", data->numps);
793 return fill_powernow_table(data,
794 (struct pst_s *)(psb+1), maxvid);
795 }
796 /*
797 * If you see this message, complain to BIOS manufacturer. If
798 * he tells you "we do not support Linux" or some similar
799 * nonsense, remember that Windows 2000 uses the same legacy
800 * mechanism that the old Linux PSB driver uses. Tell them it
801 * is broken with Windows 2000.
802 *
803 * The reference to the AMD documentation is chapter 9 in the
804 * BIOS and Kernel Developer's Guide, which is available on
805 * www.amd.com
806 */
807 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
808 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
809 " and Cool'N'Quiet support is enabled in BIOS setup\n");
810 return -ENODEV;
811}
812
813static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
814 unsigned int index)
815{
816 u64 control;
817
818 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
819 return;
820
821 control = data->acpi_data.states[index].control;
822 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
823 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
824 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
825 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
826 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
827 data->vstable = (control >> VST_SHIFT) & VST_MASK;
828}
829
830static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
831{
832 struct cpufreq_frequency_table *powernow_table;
833 int ret_val = -ENODEV;
834 u64 control, status;
835
836 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
837 dprintk("register performance failed: bad ACPI data\n");
838 return -EIO;
839 }
840
841 /* verify the data contained in the ACPI structures */
842 if (data->acpi_data.state_count <= 1) {
843 dprintk("No ACPI P-States\n");
844 goto err_out;
845 }
846
847 control = data->acpi_data.control_register.space_id;
848 status = data->acpi_data.status_register.space_id;
849
850 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
851 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
852 dprintk("Invalid control/status registers (%x - %x)\n",
853 control, status);
854 goto err_out;
855 }
856
857 /* fill in data->powernow_table */
858 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
859 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
860 if (!powernow_table) {
861 dprintk("powernow_table memory alloc failure\n");
862 goto err_out;
863 }
864
865 /* fill in data */
866 data->numps = data->acpi_data.state_count;
867 powernow_k8_acpi_pst_values(data, 0);
868
869 if (cpu_family == CPU_HW_PSTATE)
870 ret_val = fill_powernow_table_pstate(data, powernow_table);
871 else
872 ret_val = fill_powernow_table_fidvid(data, powernow_table);
873 if (ret_val)
874 goto err_out_mem;
875
876 powernow_table[data->acpi_data.state_count].frequency =
877 CPUFREQ_TABLE_END;
878 powernow_table[data->acpi_data.state_count].index = 0;
879 data->powernow_table = powernow_table;
880
881 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
882 print_basics(data);
883
884 /* notify BIOS that we exist */
885 acpi_processor_notify_smm(THIS_MODULE);
886
887 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
888 printk(KERN_ERR PFX
889 "unable to alloc powernow_k8_data cpumask\n");
890 ret_val = -ENOMEM;
891 goto err_out_mem;
892 }
893
894 return 0;
895
896err_out_mem:
897 kfree(powernow_table);
898
899err_out:
900 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
901
902 /* data->acpi_data.state_count informs us at ->exit()
903 * whether ACPI was used */
904 data->acpi_data.state_count = 0;
905
906 return ret_val;
907}
908
909static int fill_powernow_table_pstate(struct powernow_k8_data *data,
910 struct cpufreq_frequency_table *powernow_table)
911{
912 int i;
913 u32 hi = 0, lo = 0;
914 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
915 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
916
917 for (i = 0; i < data->acpi_data.state_count; i++) {
918 u32 index;
919
920 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
921 if (index > data->max_hw_pstate) {
922 printk(KERN_ERR PFX "invalid pstate %d - "
923 "bad value %d.\n", i, index);
924 printk(KERN_ERR PFX "Please report to BIOS "
925 "manufacturer\n");
926 invalidate_entry(powernow_table, i);
927 continue;
928 }
929 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
930 if (!(hi & HW_PSTATE_VALID_MASK)) {
931 dprintk("invalid pstate %d, ignoring\n", index);
932 invalidate_entry(powernow_table, i);
933 continue;
934 }
935
936 powernow_table[i].index = index;
937
938 /* Frequency may be rounded for these */
939 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
940 || boot_cpu_data.x86 == 0x11) {
941 powernow_table[i].frequency =
942 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
943 } else
944 powernow_table[i].frequency =
945 data->acpi_data.states[i].core_frequency * 1000;
946 }
947 return 0;
948}
949
950static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
951 struct cpufreq_frequency_table *powernow_table)
952{
953 int i;
954
955 for (i = 0; i < data->acpi_data.state_count; i++) {
956 u32 fid;
957 u32 vid;
958 u32 freq, index;
959 u64 status, control;
960
961 if (data->exttype) {
962 status = data->acpi_data.states[i].status;
963 fid = status & EXT_FID_MASK;
964 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
965 } else {
966 control = data->acpi_data.states[i].control;
967 fid = control & FID_MASK;
968 vid = (control >> VID_SHIFT) & VID_MASK;
969 }
970
971 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
972
973 index = fid | (vid<<8);
974 powernow_table[i].index = index;
975
976 freq = find_khz_freq_from_fid(fid);
977 powernow_table[i].frequency = freq;
978
979 /* verify frequency is OK */
980 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
981 dprintk("invalid freq %u kHz, ignoring\n", freq);
982 invalidate_entry(powernow_table, i);
983 continue;
984 }
985
986 /* verify voltage is OK -
987 * BIOSs are using "off" to indicate invalid */
988 if (vid == VID_OFF) {
989 dprintk("invalid vid %u, ignoring\n", vid);
990 invalidate_entry(powernow_table, i);
991 continue;
992 }
993
994 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
995 printk(KERN_INFO PFX "invalid freq entries "
996 "%u kHz vs. %u kHz\n", freq,
997 (unsigned int)
998 (data->acpi_data.states[i].core_frequency
999 * 1000));
1000 invalidate_entry(powernow_table, i);
1001 continue;
1002 }
1003 }
1004 return 0;
1005}
1006
1007static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1008{
1009 if (data->acpi_data.state_count)
1010 acpi_processor_unregister_performance(&data->acpi_data,
1011 data->cpu);
1012 free_cpumask_var(data->acpi_data.shared_cpu_map);
1013}
1014
1015static int get_transition_latency(struct powernow_k8_data *data)
1016{
1017 int max_latency = 0;
1018 int i;
1019 for (i = 0; i < data->acpi_data.state_count; i++) {
1020 int cur_latency = data->acpi_data.states[i].transition_latency
1021 + data->acpi_data.states[i].bus_master_latency;
1022 if (cur_latency > max_latency)
1023 max_latency = cur_latency;
1024 }
1025 if (max_latency == 0) {
1026 /*
1027 * Fam 11h and later may return 0 as transition latency. This
1028 * is intended and means "very fast". While cpufreq core and
1029 * governors currently can handle that gracefully, better set it
1030 * to 1 to avoid problems in the future.
1031 */
1032 if (boot_cpu_data.x86 < 0x11)
1033 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1034 "latency\n");
1035 max_latency = 1;
1036 }
1037 /* value in usecs, needs to be in nanoseconds */
1038 return 1000 * max_latency;
1039}
1040
1041/* Take a frequency, and issue the fid/vid transition command */
1042static int transition_frequency_fidvid(struct powernow_k8_data *data,
1043 unsigned int index)
1044{
1045 u32 fid = 0;
1046 u32 vid = 0;
1047 int res, i;
1048 struct cpufreq_freqs freqs;
1049
1050 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1051
1052 /* fid/vid correctness check for k8 */
1053 /* fid are the lower 8 bits of the index we stored into
1054 * the cpufreq frequency table in find_psb_table, vid
1055 * are the upper 8 bits.
1056 */
1057 fid = data->powernow_table[index].index & 0xFF;
1058 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1059
1060 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1061
1062 if (query_current_values_with_pending_wait(data))
1063 return 1;
1064
1065 if ((data->currvid == vid) && (data->currfid == fid)) {
1066 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1067 fid, vid);
1068 return 0;
1069 }
1070
1071 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1072 smp_processor_id(), fid, vid);
1073 freqs.old = find_khz_freq_from_fid(data->currfid);
1074 freqs.new = find_khz_freq_from_fid(fid);
1075
1076 for_each_cpu(i, data->available_cores) {
1077 freqs.cpu = i;
1078 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1079 }
1080
1081 res = transition_fid_vid(data, fid, vid);
1082 freqs.new = find_khz_freq_from_fid(data->currfid);
1083
1084 for_each_cpu(i, data->available_cores) {
1085 freqs.cpu = i;
1086 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1087 }
1088 return res;
1089}
1090
1091/* Take a frequency, and issue the hardware pstate transition command */
1092static int transition_frequency_pstate(struct powernow_k8_data *data,
1093 unsigned int index)
1094{
1095 u32 pstate = 0;
1096 int res, i;
1097 struct cpufreq_freqs freqs;
1098
1099 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1100
1101 /* get MSR index for hardware pstate transition */
1102 pstate = index & HW_PSTATE_MASK;
1103 if (pstate > data->max_hw_pstate)
1104 return 0;
1105 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1106 data->currpstate);
1107 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1108
1109 for_each_cpu(i, data->available_cores) {
1110 freqs.cpu = i;
1111 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1112 }
1113
1114 res = transition_pstate(data, pstate);
1115 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1116
1117 for_each_cpu(i, data->available_cores) {
1118 freqs.cpu = i;
1119 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1120 }
1121 return res;
1122}
1123
1124/* Driver entry point to switch to the target frequency */
1125static int powernowk8_target(struct cpufreq_policy *pol,
1126 unsigned targfreq, unsigned relation)
1127{
1128 cpumask_var_t oldmask;
1129 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1130 u32 checkfid;
1131 u32 checkvid;
1132 unsigned int newstate;
1133 int ret = -EIO;
1134
1135 if (!data)
1136 return -EINVAL;
1137
1138 checkfid = data->currfid;
1139 checkvid = data->currvid;
1140
1141 /* only run on specific CPU from here on. */
1142 /* This is poor form: use a workqueue or smp_call_function_single */
1143 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1144 return -ENOMEM;
1145
1146 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1147 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1148
1149 if (smp_processor_id() != pol->cpu) {
1150 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1151 goto err_out;
1152 }
1153
1154 if (pending_bit_stuck()) {
1155 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1156 goto err_out;
1157 }
1158
1159 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1160 pol->cpu, targfreq, pol->min, pol->max, relation);
1161
1162 if (query_current_values_with_pending_wait(data))
1163 goto err_out;
1164
1165 if (cpu_family != CPU_HW_PSTATE) {
1166 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1167 data->currfid, data->currvid);
1168
1169 if ((checkvid != data->currvid) ||
1170 (checkfid != data->currfid)) {
1171 printk(KERN_INFO PFX
1172 "error - out of sync, fid 0x%x 0x%x, "
1173 "vid 0x%x 0x%x\n",
1174 checkfid, data->currfid,
1175 checkvid, data->currvid);
1176 }
1177 }
1178
1179 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1180 targfreq, relation, &newstate))
1181 goto err_out;
1182
1183 mutex_lock(&fidvid_mutex);
1184
1185 powernow_k8_acpi_pst_values(data, newstate);
1186
1187 if (cpu_family == CPU_HW_PSTATE)
1188 ret = transition_frequency_pstate(data, newstate);
1189 else
1190 ret = transition_frequency_fidvid(data, newstate);
1191 if (ret) {
1192 printk(KERN_ERR PFX "transition frequency failed\n");
1193 ret = 1;
1194 mutex_unlock(&fidvid_mutex);
1195 goto err_out;
1196 }
1197 mutex_unlock(&fidvid_mutex);
1198
1199 if (cpu_family == CPU_HW_PSTATE)
1200 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1201 newstate);
1202 else
1203 pol->cur = find_khz_freq_from_fid(data->currfid);
1204 ret = 0;
1205
1206err_out:
1207 set_cpus_allowed_ptr(current, oldmask);
1208 free_cpumask_var(oldmask);
1209 return ret;
1210}
1211
1212/* Driver entry point to verify the policy and range of frequencies */
1213static int powernowk8_verify(struct cpufreq_policy *pol)
1214{
1215 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1216
1217 if (!data)
1218 return -EINVAL;
1219
1220 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1221}
1222
1223struct init_on_cpu {
1224 struct powernow_k8_data *data;
1225 int rc;
1226};
1227
1228static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1229{
1230 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1231
1232 if (pending_bit_stuck()) {
1233 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1234 init_on_cpu->rc = -ENODEV;
1235 return;
1236 }
1237
1238 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1239 init_on_cpu->rc = -ENODEV;
1240 return;
1241 }
1242
1243 if (cpu_family == CPU_OPTERON)
1244 fidvid_msr_init();
1245
1246 init_on_cpu->rc = 0;
1247}
1248
1249/* per CPU init entry point to the driver */
1250static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1251{
1252 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1253 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1254 FW_BUG PFX "Try again with latest BIOS.\n";
1255 struct powernow_k8_data *data;
1256 struct init_on_cpu init_on_cpu;
1257 int rc;
1258 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1259
1260 if (!cpu_online(pol->cpu))
1261 return -ENODEV;
1262
1263 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1264 if (rc)
1265 return -ENODEV;
1266
1267 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1268 if (!data) {
1269 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1270 return -ENOMEM;
1271 }
1272
1273 data->cpu = pol->cpu;
1274 data->currpstate = HW_PSTATE_INVALID;
1275
1276 if (powernow_k8_cpu_init_acpi(data)) {
1277 /*
1278 * Use the PSB BIOS structure. This is only available on
1279 * an UP version, and is deprecated by AMD.
1280 */
1281 if (num_online_cpus() != 1) {
1282 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1283 goto err_out;
1284 }
1285 if (pol->cpu != 0) {
1286 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1287 "CPU other than CPU0. Complain to your BIOS "
1288 "vendor.\n");
1289 goto err_out;
1290 }
1291 rc = find_psb_table(data);
1292 if (rc)
1293 goto err_out;
1294
1295	/* Take a crude guess here.
1296	 * That guess is in microseconds, so multiply by 1000 */
1297 pol->cpuinfo.transition_latency = (
1298 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1299 ((1 << data->irt) * 30)) * 1000;
1300 } else /* ACPI _PSS objects available */
1301 pol->cpuinfo.transition_latency = get_transition_latency(data);
1302
1303 /* only run on specific CPU from here on */
1304 init_on_cpu.data = data;
1305 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1306 &init_on_cpu, 1);
1307 rc = init_on_cpu.rc;
1308 if (rc != 0)
1309 goto err_out_exit_acpi;
1310
1311 if (cpu_family == CPU_HW_PSTATE)
1312 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1313 else
1314 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1315 data->available_cores = pol->cpus;
1316
1317 if (cpu_family == CPU_HW_PSTATE)
1318 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1319 data->currpstate);
1320 else
1321 pol->cur = find_khz_freq_from_fid(data->currfid);
1322 dprintk("policy current frequency %d kHz\n", pol->cur);
1323
1324 /* min/max the cpu is capable of */
1325 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1326 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1327 powernow_k8_cpu_exit_acpi(data);
1328 kfree(data->powernow_table);
1329 kfree(data);
1330 return -EINVAL;
1331 }
1332
1333 /* Check for APERF/MPERF support in hardware */
1334 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1335 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1336
1337 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1338
1339 if (cpu_family == CPU_HW_PSTATE)
1340 dprintk("cpu_init done, current pstate 0x%x\n",
1341 data->currpstate);
1342 else
1343 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1344 data->currfid, data->currvid);
1345
1346 per_cpu(powernow_data, pol->cpu) = data;
1347
1348 return 0;
1349
1350err_out_exit_acpi:
1351 powernow_k8_cpu_exit_acpi(data);
1352
1353err_out:
1354 kfree(data);
1355 return -ENODEV;
1356}
1357
1358static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1359{
1360 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1361
1362 if (!data)
1363 return -EINVAL;
1364
1365 powernow_k8_cpu_exit_acpi(data);
1366
1367 cpufreq_frequency_table_put_attr(pol->cpu);
1368
1369 kfree(data->powernow_table);
1370 kfree(data);
1371 per_cpu(powernow_data, pol->cpu) = NULL;
1372
1373 return 0;
1374}
1375
1376static void query_values_on_cpu(void *_err)
1377{
1378 int *err = _err;
1379 struct powernow_k8_data *data = __this_cpu_read(powernow_data);
1380
1381 *err = query_current_values_with_pending_wait(data);
1382}
1383
1384static unsigned int powernowk8_get(unsigned int cpu)
1385{
1386 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1387 unsigned int khz = 0;
1388 int err;
1389
1390 if (!data)
1391 return 0;
1392
1393 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1394 if (err)
1395 goto out;
1396
1397 if (cpu_family == CPU_HW_PSTATE)
1398 khz = find_khz_freq_from_pstate(data->powernow_table,
1399 data->currpstate);
1400 else
1401 khz = find_khz_freq_from_fid(data->currfid);
1402
1403
1404out:
1405 return khz;
1406}
1407
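/*
 * Note: MSR_K7_HWCR bit 25 is the core performance boost *disable* bit,
 * so enabling boosting below means clearing that bit on every online core
 * and disabling boosting means setting it.
 */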
1408static void _cpb_toggle_msrs(bool t)
1409{
1410 int cpu;
1411
1412 get_online_cpus();
1413
1414 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1415
1416 for_each_cpu(cpu, cpu_online_mask) {
1417 struct msr *reg = per_cpu_ptr(msrs, cpu);
1418 if (t)
1419 reg->l &= ~BIT(25);
1420 else
1421 reg->l |= BIT(25);
1422 }
1423 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1424
1425 put_online_cpus();
1426}
1427
1428/*
1429 * Switch on/off core performance boosting.
1430 *
1431 * 0=disable
1432 * 1=enable.
1433 */
1434static void cpb_toggle(bool t)
1435{
1436 if (!cpb_capable)
1437 return;
1438
1439 if (t && !cpb_enabled) {
1440 cpb_enabled = true;
1441 _cpb_toggle_msrs(t);
1442 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1443 } else if (!t && cpb_enabled) {
1444 cpb_enabled = false;
1445 _cpb_toggle_msrs(t);
1446 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1447 }
1448}
1449
1450static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1451 size_t count)
1452{
1453 int ret = -EINVAL;
1454 unsigned long val = 0;
1455
1456 ret = strict_strtoul(buf, 10, &val);
1457 if (!ret && (val == 0 || val == 1) && cpb_capable)
1458 cpb_toggle(val);
1459 else
1460 return -EINVAL;
1461
1462 return count;
1463}
1464
1465static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1466{
1467 return sprintf(buf, "%u\n", cpb_enabled);
1468}
1469
1470#define define_one_rw(_name) \
1471static struct freq_attr _name = \
1472__ATTR(_name, 0644, show_##_name, store_##_name)
1473
1474define_one_rw(cpb);
1475
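/*
 * The "cpb" freq_attr defined above shows up as a per-policy sysfs file
 * (typically /sys/devices/system/cpu/cpuN/cpufreq/cpb); writing 0 or 1
 * to it goes through store_cpb() and toggles boosting on all cores.
 */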
1476static struct freq_attr *powernow_k8_attr[] = {
1477 &cpufreq_freq_attr_scaling_available_freqs,
1478 &cpb,
1479 NULL,
1480};
1481
1482static struct cpufreq_driver cpufreq_amd64_driver = {
1483 .verify = powernowk8_verify,
1484 .target = powernowk8_target,
1485 .bios_limit = acpi_processor_get_bios_limit,
1486 .init = powernowk8_cpu_init,
1487 .exit = __devexit_p(powernowk8_cpu_exit),
1488 .get = powernowk8_get,
1489 .name = "powernow-k8",
1490 .owner = THIS_MODULE,
1491 .attr = powernow_k8_attr,
1492};
1493
1494/*
1495 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1496 * cannot block the remaining ones from boosting. On the CPU_UP path we
1497 * simply keep the boost-disable flag in sync with the current global
1498 * state.
1499 */
1500static int cpb_notify(struct notifier_block *nb, unsigned long action,
1501 void *hcpu)
1502{
1503 unsigned cpu = (long)hcpu;
1504 u32 lo, hi;
1505
1506 switch (action) {
1507 case CPU_UP_PREPARE:
1508 case CPU_UP_PREPARE_FROZEN:
1509
1510 if (!cpb_enabled) {
1511 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1512 lo |= BIT(25);
1513 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1514 }
1515 break;
1516
1517 case CPU_DOWN_PREPARE:
1518 case CPU_DOWN_PREPARE_FROZEN:
1519 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1520 lo &= ~BIT(25);
1521 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1522 break;
1523
1524 default:
1525 break;
1526 }
1527
1528 return NOTIFY_OK;
1529}
1530
1531static struct notifier_block cpb_nb = {
1532 .notifier_call = cpb_notify,
1533};
1534
1535/* driver entry point for init */
1536static int __cpuinit powernowk8_init(void)
1537{
1538 unsigned int i, supported_cpus = 0, cpu;
1539 int rv;
1540
1541 for_each_online_cpu(i) {
1542 int rc;
1543 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1544 if (rc == 0)
1545 supported_cpus++;
1546 }
1547
1548 if (supported_cpus != num_online_cpus())
1549 return -ENODEV;
1550
1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 msrs = msrs_alloc();
1559 if (!msrs) {
1560 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1561 return -ENOMEM;
1562 }
1563
1564 register_cpu_notifier(&cpb_nb);
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570			cpb_enabled |= !(reg->l & BIT(25));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1575 }
1576
1577 rv = cpufreq_register_driver(&cpufreq_amd64_driver);
1578 if (rv < 0 && boot_cpu_has(X86_FEATURE_CPB)) {
1579 unregister_cpu_notifier(&cpb_nb);
1580 msrs_free(msrs);
1581 msrs = NULL;
1582 }
1583 return rv;
1584}
1585
1586/* driver entry point for term */
1587static void __exit powernowk8_exit(void)
1588{
1589 dprintk("exit\n");
1590
1591 if (boot_cpu_has(X86_FEATURE_CPB)) {
1592 msrs_free(msrs);
1593 msrs = NULL;
1594
1595 unregister_cpu_notifier(&cpb_nb);
1596 }
1597
1598 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1599}
1600
1601MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1602 "Mark Langsdorf <mark.langsdorf@amd.com>");
1603MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1604MODULE_LICENSE("GPL");
1605
1606late_initcall(powernowk8_init);
1607module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8enum pstate {
9 HW_PSTATE_INVALID = 0xff,
10 HW_PSTATE_0 = 0,
11 HW_PSTATE_1 = 1,
12 HW_PSTATE_2 = 2,
13 HW_PSTATE_3 = 3,
14 HW_PSTATE_4 = 4,
15 HW_PSTATE_5 = 5,
16 HW_PSTATE_6 = 6,
17 HW_PSTATE_7 = 7,
18};
19
20struct powernow_k8_data {
21 unsigned int cpu;
22
23 u32 numps; /* number of p-states */
24 u32 batps; /* number of p-states supported on battery */
25 u32 max_hw_pstate; /* maximum legal hardware pstate */
26
27 /* these values are constant when the PSB is used to determine
28 * vid/fid pairings, but are modified during the ->target() call
29 * when ACPI is used */
30 u32 rvo; /* ramp voltage offset */
31 u32 irt; /* isochronous relief time */
32 u32 vidmvs; /* usable value calculated from mvs */
33 u32 vstable; /* voltage stabilization time, units 20 us */
34 u32 plllock; /* pll lock time, units 1 us */
35 u32 exttype; /* extended interface = 1 */
36
37 /* keep track of the current fid / vid or pstate */
38 u32 currvid;
39 u32 currfid;
40 enum pstate currpstate;
41
42 /* the powernow_table includes all frequency and vid/fid pairings:
43 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
44 * frequency is in kHz */
45 struct cpufreq_frequency_table *powernow_table;
46
47 /* the acpi table needs to be kept. it's only available if ACPI was
48 * used to determine valid frequency/vid/fid states */
49 struct acpi_processor_performance acpi_data;
50
51 /* we need to keep track of associated cores, but let cpufreq
52 * handle hotplug events - so just point at cpufreq pol->cpus
53 * structure */
54 struct cpumask *available_cores;
55};
56
57/* processor's cpuid instruction support */
58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
59#define CPUID_XFAM 0x0ff00000 /* extended family */
60#define CPUID_XFAM_K8 0
61#define CPUID_XMOD 0x000f0000 /* extended model */
62#define CPUID_XMOD_REV_MASK 0x000c0000
63#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
64#define CPUID_USE_XFAM_XMOD 0x00000f00
65#define CPUID_GET_MAX_CAPABILITIES 0x80000000
66#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
67#define P_STATE_TRANSITION_CAPABLE 6
68
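/*
 * Illustrative sketch (the helper name is hypothetical): how the constants
 * above are combined to test for p-state transition capability, using the
 * standard cpuid_eax()/cpuid_edx() helpers from <asm/processor.h>.
 */
static inline int k8_can_transition_pstates(void)
{
	if (cpuid_eax(CPUID_GET_MAX_CAPABILITIES) < CPUID_FREQ_VOLT_CAPABILITIES)
		return 0;

	/* both the fid and the vid control bits must be present */
	return (cpuid_edx(CPUID_FREQ_VOLT_CAPABILITIES) &
		P_STATE_TRANSITION_CAPABLE) == P_STATE_TRANSITION_CAPABLE;
}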
69/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
70/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
71/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
72/* the register number is placed in ecx, and the data is returned in edx:eax. */
73
74#define MSR_FIDVID_CTL 0xc0010041
75#define MSR_FIDVID_STATUS 0xc0010042
76
77/* Field definitions within the FID VID Low Control MSR : */
78#define MSR_C_LO_INIT_FID_VID 0x00010000
79#define MSR_C_LO_NEW_VID 0x00003f00
80#define MSR_C_LO_NEW_FID 0x0000003f
81#define MSR_C_LO_VID_SHIFT 8
82
83/* Field definitions within the FID VID High Control MSR : */
84#define MSR_C_HI_STP_GNT_TO 0x000fffff
85
86/* Field definitions within the FID VID Low Status MSR : */
87#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
88#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
89#define MSR_S_LO_MAX_FID 0x003f0000
90#define MSR_S_LO_START_FID 0x00003f00
91#define MSR_S_LO_CURRENT_FID 0x0000003f
92
93/* Field definitions within the FID VID High Status MSR : */
94#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
95#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
96#define MSR_S_HI_START_VID 0x00003f00
97#define MSR_S_HI_CURRENT_VID 0x0000003f
98#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
99
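/*
 * Sketch (the helper name is hypothetical) of how the current fid/vid are
 * read back with the masks above; lo/hi are the two 32-bit halves returned
 * by rdmsr(), per the read convention described above.
 */
static inline void fidvid_read_current(u32 *fid, u32 *vid)
{
	u32 lo, hi;

	rdmsr(MSR_FIDVID_STATUS, lo, hi);
	*fid = lo & MSR_S_LO_CURRENT_FID;	/* low word carries the fid */
	*vid = hi & MSR_S_HI_CURRENT_VID;	/* high word carries the vid */
}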
100
101/* Hardware Pstate _PSS and MSR definitions */
102#define USE_HW_PSTATE 0x00000080
103#define HW_PSTATE_MASK 0x00000007
104#define HW_PSTATE_VALID_MASK 0x80000000
105#define HW_PSTATE_MAX_MASK 0x000000f0
106#define HW_PSTATE_MAX_SHIFT 4
107#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
108#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
109#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
110#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
111
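/*
 * Sketch (the helper name is hypothetical): on hardware-pstate parts the
 * current p-state number is simply the low bits of the status MSR.
 */
static inline enum pstate hw_pstate_current(void)
{
	u32 lo, hi;

	rdmsr(MSR_PSTATE_STATUS, lo, hi);
	return lo & HW_PSTATE_MASK;
}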
112/* define the two driver architectures */
113#define CPU_OPTERON 0
114#define CPU_HW_PSTATE 1
115
116
117/*
118 * There are restrictions that frequencies have to follow:
119 * - only 1 entry in the low fid table ( <=1.4GHz )
120 * - lowest entry in the high fid table must be >= 2 * the entry in the
121 * low fid table
122 * - lowest entry in the high fid table must be <= 200 MHz + 2 * the entry
123 * in the low fid table
124 * - the parts can only step at <= 200 MHz intervals; odd fid values are
125 * supported in revision G and later revisions.
126 * - lowest frequency must be >= interprocessor hypertransport link speed
127 * (only applies to MP systems obviously)
128 */
129
130/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
131#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
132#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
133
134#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
135#define HI_VCOFREQ_TABLE_BOTTOM 1600
136
137#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
138
139#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
140#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
141
142#define MIN_FREQ 800 /* Min and max freqs, per spec */
143#define MAX_FREQ 5000
144
145#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
146#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
147
148#define VID_OFF 0x3f
149
150#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
151
152#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
153
154#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
155#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
156
157/*
158 * Most values of interest are encoded in a single field of the _PSS
159 * entries: the "control" value.
160 */
161
162#define IRT_SHIFT 30
163#define RVO_SHIFT 28
164#define EXT_TYPE_SHIFT 27
165#define PLL_L_SHIFT 20
166#define MVS_SHIFT 18
167#define VST_SHIFT 11
168#define VID_SHIFT 6
169#define IRT_MASK 3
170#define RVO_MASK 3
171#define EXT_TYPE_MASK 1
172#define PLL_L_MASK 0x7f
173#define MVS_MASK 3
174#define VST_MASK 0x7f
175#define VID_MASK 0x1f
176#define FID_MASK 0x1f
177#define EXT_VID_MASK 0x3f
178#define EXT_FID_MASK 0x3f
179
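/*
 * Sketch of how a _PSS "control" value is unpacked with the shifts and
 * masks above (variable names are illustrative only; with the extended
 * interface, the EXT_*_MASK variants apply to the fid/vid instead):
 *
 *	irt     = (control >> IRT_SHIFT)      & IRT_MASK;
 *	rvo     = (control >> RVO_SHIFT)      & RVO_MASK;
 *	exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
 *	plllock = (control >> PLL_L_SHIFT)    & PLL_L_MASK;
 *	mvs     = (control >> MVS_SHIFT)      & MVS_MASK;
 *	vstable = (control >> VST_SHIFT)      & VST_MASK;
 *	vid     = (control >> VID_SHIFT)      & VID_MASK;
 *	fid     =  control                    & FID_MASK;
 */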
180
181/*
182 * Version 1.4 of the PSB table. This table is constructed by BIOS and is
183 * to tell the OS's power management driver which VIDs and FIDs are
184 * supported by this particular processor.
185 * If the data in the PSB / PST is wrong, then this driver will program the
186 * wrong values into hardware, which is very likely to lead to a crash.
187 */
188
189#define PSB_ID_STRING "AMDK7PNOW!"
190#define PSB_ID_STRING_LEN 10
191
192#define PSB_VERSION_1_4 0x14
193
194struct psb_s {
195 u8 signature[10];
196 u8 tableversion;
197 u8 flags1;
198 u16 vstable;
199 u8 flags2;
200 u8 num_tables;
201 u32 cpuid;
202 u8 plllocktime;
203 u8 maxfid;
204 u8 maxvid;
205 u8 numps;
206};
207
208/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
209struct pst_s {
210 u8 fid;
211 u8 vid;
212};
213
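/*
 * Sketch (the helper name is hypothetical) of how the PSB is located: the
 * BIOS places it somewhere in the 0xc0000-0xfffff ROM area, so it can be
 * found by scanning that region in 16-byte steps for the signature above.
 */
static struct psb_s *psb_find(void)
{
	unsigned int i;

	for (i = 0xc0000; i < 0xffff0; i += 0x10) {
		struct psb_s *psb = phys_to_virt(i);

		if (memcmp(psb->signature, PSB_ID_STRING,
			   PSB_ID_STRING_LEN) == 0 &&
		    psb->tableversion == PSB_VERSION_1_4)
			return psb;
	}

	return NULL;
}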
214#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
215
216static int core_voltage_pre_transition(struct powernow_k8_data *data,
217		u32 reqvid, u32 reqfid);
218static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
219static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
220
221static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
222
223static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
224static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
24
25#include <asm/msr.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
35
36static struct cpufreq_frequency_table sc520_freq_table[] = {
37 {0x01, 100000},
38 {0x02, 133000},
39 {0, CPUFREQ_TABLE_END},
40};
41
42static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43{
44 u8 clockspeed_reg = *cpuctl;
45
46 switch (clockspeed_reg & 0x03) {
47 default:
48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
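		/* fall through: report the 100 MHz setting on unexpected values */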
50 case 0x01:
51 return 100000;
52 case 0x02:
53 return 133000;
54 }
55}
56
57static void sc520_freq_set_cpu_state(unsigned int state)
58{
59
60 struct cpufreq_freqs freqs;
61 u8 clockspeed_reg;
62
63 freqs.old = sc520_freq_get_cpu_frequency(0);
64 freqs.new = sc520_freq_table[state].frequency;
65 freqs.cpu = 0; /* AMD Elan is UP */
66
67 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
68
69 dprintk("attempting to set frequency to %i kHz\n",
70 sc520_freq_table[state].frequency);
71
72 local_irq_disable();
73
74 clockspeed_reg = *cpuctl & ~0x03;
75 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
76
77 local_irq_enable();
78
79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
80}
81
82static int sc520_freq_verify(struct cpufreq_policy *policy)
83{
84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
85}
86
87static int sc520_freq_target(struct cpufreq_policy *policy,
88 unsigned int target_freq,
89 unsigned int relation)
90{
91 unsigned int newstate = 0;
92
93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
95 return -EINVAL;
96
97 sc520_freq_set_cpu_state(newstate);
98
99 return 0;
100}
101
102
103/*
104 * Module init and exit code
105 */
106
107static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
108{
109 struct cpuinfo_x86 *c = &cpu_data(0);
110 int result;
111
112 /* capability check */
113 if (c->x86_vendor != X86_VENDOR_AMD ||
114 c->x86 != 4 || c->x86_model != 9)
115 return -ENODEV;
116
117 /* cpuinfo and default policy values */
118 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
119 policy->cur = sc520_freq_get_cpu_frequency(0);
120
121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
122 if (result)
123 return result;
124
125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
126
127 return 0;
128}
129
130
131static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
132{
133 cpufreq_frequency_table_put_attr(policy->cpu);
134 return 0;
135}
136
137
138static struct freq_attr *sc520_freq_attr[] = {
139 &cpufreq_freq_attr_scaling_available_freqs,
140 NULL,
141};
142
143
144static struct cpufreq_driver sc520_freq_driver = {
145 .get = sc520_freq_get_cpu_frequency,
146 .verify = sc520_freq_verify,
147 .target = sc520_freq_target,
148 .init = sc520_freq_cpu_init,
149 .exit = sc520_freq_cpu_exit,
150 .name = "sc520_freq",
151 .owner = THIS_MODULE,
152 .attr = sc520_freq_attr,
153};
154
155
156static int __init sc520_freq_init(void)
157{
158 struct cpuinfo_x86 *c = &cpu_data(0);
159 int err;
160
161 /* Test if we have the right hardware */
162 if (c->x86_vendor != X86_VENDOR_AMD ||
163 c->x86 != 4 || c->x86_model != 9) {
164 dprintk("no Elan SC520 processor found!\n");
165 return -ENODEV;
166 }
167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
168 if (!cpuctl) {
169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
170 return -ENOMEM;
171 }
172
173 err = cpufreq_register_driver(&sc520_freq_driver);
174 if (err)
175 iounmap(cpuctl);
176
177 return err;
178}
179
180
181static void __exit sc520_freq_exit(void)
182{
183 cpufreq_unregister_driver(&sc520_freq_driver);
184 iounmap(cpuctl);
185}
186
187
188MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Sean Young <sean@mess.org>");
190MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
191
192module_init(sc520_freq_init);
193module_exit(sc520_freq_exit);
194
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23#include <linux/gfp.h>
24
25#include <asm/msr.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28
29#define PFX "speedstep-centrino: "
30#define MAINTAINER "cpufreq@vger.kernel.org"
31
32#define dprintk(msg...) \
33 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
34
35#define INTEL_MSR_RANGE (0xffff)
36
37struct cpu_id
38{
39 __u8 x86; /* CPU family */
40 __u8 x86_model; /* model */
41 __u8 x86_mask; /* stepping */
42};
43
44enum {
45 CPU_BANIAS,
46 CPU_DOTHAN_A1,
47 CPU_DOTHAN_A2,
48 CPU_DOTHAN_B0,
49 CPU_MP4HT_D0,
50 CPU_MP4HT_E0,
51};
52
53static const struct cpu_id cpu_ids[] = {
54 [CPU_BANIAS] = { 6, 9, 5 },
55 [CPU_DOTHAN_A1] = { 6, 13, 1 },
56 [CPU_DOTHAN_A2] = { 6, 13, 2 },
57 [CPU_DOTHAN_B0] = { 6, 13, 6 },
58 [CPU_MP4HT_D0] = {15, 3, 4 },
59 [CPU_MP4HT_E0] = {15, 4, 1 },
60};
61#define N_IDS ARRAY_SIZE(cpu_ids)
62
63struct cpu_model
64{
65 const struct cpu_id *cpu_id;
66 const char *model_name;
67 unsigned max_freq; /* max clock in kHz */
68
69 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
70};
71static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
72 const struct cpu_id *x);
73
74/* Operating points for current CPU */
75static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
76static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
77
78static struct cpufreq_driver centrino_driver;
79
80#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
81
82/* Computes the correct form for IA32_PERF_CTL MSR for a particular
83 frequency/voltage operating point; frequency in MHz, volts in mV.
84 This is stored as "index" in the structure. */
85#define OP(mhz, mv) \
86 { \
87 .frequency = (mhz) * 1000, \
88 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
89 }
90
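/*
 * Worked example: OP(600, 844) expands to .frequency = 600000 (kHz) and
 * .index = ((600/100) << 8) | ((844 - 700) / 16) = 0x0609, i.e. the bus
 * ratio in bits 15:8 and the voltage ID in bits 7:0 of IA32_PERF_CTL.
 */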
91/*
92 * These voltage tables were derived from the Intel Pentium M
93 * datasheet, document 25261202.pdf, Table 5. I have verified they
94 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
95 * M.
96 */
97
98/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
99static struct cpufreq_frequency_table banias_900[] =
100{
101 OP(600, 844),
102 OP(800, 988),
103 OP(900, 1004),
104 { .frequency = CPUFREQ_TABLE_END }
105};
106
107/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
108static struct cpufreq_frequency_table banias_1000[] =
109{
110 OP(600, 844),
111 OP(800, 972),
112 OP(900, 988),
113 OP(1000, 1004),
114 { .frequency = CPUFREQ_TABLE_END }
115};
116
117/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
118static struct cpufreq_frequency_table banias_1100[] =
119{
120 OP( 600, 956),
121 OP( 800, 1020),
122 OP( 900, 1100),
123 OP(1000, 1164),
124 OP(1100, 1180),
125 { .frequency = CPUFREQ_TABLE_END }
126};
127
128
129/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
130static struct cpufreq_frequency_table banias_1200[] =
131{
132 OP( 600, 956),
133 OP( 800, 1004),
134 OP( 900, 1020),
135 OP(1000, 1100),
136 OP(1100, 1164),
137 OP(1200, 1180),
138 { .frequency = CPUFREQ_TABLE_END }
139};
140
141/* Intel Pentium M processor 1.30GHz (Banias) */
142static struct cpufreq_frequency_table banias_1300[] =
143{
144 OP( 600, 956),
145 OP( 800, 1260),
146 OP(1000, 1292),
147 OP(1200, 1356),
148 OP(1300, 1388),
149 { .frequency = CPUFREQ_TABLE_END }
150};
151
152/* Intel Pentium M processor 1.40GHz (Banias) */
153static struct cpufreq_frequency_table banias_1400[] =
154{
155 OP( 600, 956),
156 OP( 800, 1180),
157 OP(1000, 1308),
158 OP(1200, 1436),
159 OP(1400, 1484),
160 { .frequency = CPUFREQ_TABLE_END }
161};
162
163/* Intel Pentium M processor 1.50GHz (Banias) */
164static struct cpufreq_frequency_table banias_1500[] =
165{
166 OP( 600, 956),
167 OP( 800, 1116),
168 OP(1000, 1228),
169 OP(1200, 1356),
170 OP(1400, 1452),
171 OP(1500, 1484),
172 { .frequency = CPUFREQ_TABLE_END }
173};
174
175/* Intel Pentium M processor 1.60GHz (Banias) */
176static struct cpufreq_frequency_table banias_1600[] =
177{
178 OP( 600, 956),
179 OP( 800, 1036),
180 OP(1000, 1164),
181 OP(1200, 1276),
182 OP(1400, 1420),
183 OP(1600, 1484),
184 { .frequency = CPUFREQ_TABLE_END }
185};
186
187/* Intel Pentium M processor 1.70GHz (Banias) */
188static struct cpufreq_frequency_table banias_1700[] =
189{
190 OP( 600, 956),
191 OP( 800, 1004),
192 OP(1000, 1116),
193 OP(1200, 1228),
194 OP(1400, 1308),
195 OP(1700, 1484),
196 { .frequency = CPUFREQ_TABLE_END }
197};
198#undef OP
199
200#define _BANIAS(cpuid, max, name) \
201{ .cpu_id = cpuid, \
202 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
203 .max_freq = (max)*1000, \
204 .op_points = banias_##max, \
205}
206#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
207
208/* CPU models, their operating frequency range, and freq/voltage
209 operating points */
210static struct cpu_model models[] =
211{
212 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
213 BANIAS(1000),
214 BANIAS(1100),
215 BANIAS(1200),
216 BANIAS(1300),
217 BANIAS(1400),
218 BANIAS(1500),
219 BANIAS(1600),
220 BANIAS(1700),
221
222 /* NULL model_name is a wildcard */
223 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
225 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
227 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
228
229 { NULL, }
230};
231#undef _BANIAS
232#undef BANIAS
233
234static int centrino_cpu_init_table(struct cpufreq_policy *policy)
235{
236 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
237 struct cpu_model *model;
238
239 for(model = models; model->cpu_id != NULL; model++)
240 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
241 (model->model_name == NULL ||
242 strcmp(cpu->x86_model_id, model->model_name) == 0))
243 break;
244
245 if (model->cpu_id == NULL) {
246 /* No match at all */
247 dprintk("no support for CPU model \"%s\": "
248 "send /proc/cpuinfo to " MAINTAINER "\n",
249 cpu->x86_model_id);
250 return -ENOENT;
251 }
252
253 if (model->op_points == NULL) {
254 /* Matched a non-match */
255 dprintk("no table support for CPU model \"%s\"\n",
256 cpu->x86_model_id);
257 dprintk("try using the acpi-cpufreq driver\n");
258 return -ENOENT;
259 }
260
261 per_cpu(centrino_model, policy->cpu) = model;
262
263 dprintk("found \"%s\": max frequency: %dkHz\n",
264 model->model_name, model->max_freq);
265
266 return 0;
267}
268
269#else
270static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
271{
272 return -ENODEV;
273}
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
277 const struct cpu_id *x)
278{
279 if ((c->x86 == x->x86) &&
280 (c->x86_model == x->x86_model) &&
281 (c->x86_mask == x->x86_mask))
282 return 1;
283 return 0;
284}
285
286/* To be called only after centrino_model is initialized */
287static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
288{
289 int i;
290
291 /*
292 * Extract clock in kHz from PERF_CTL value
293 * for centrino, as some DSDTs are buggy.
294 * Ideally, this can be done using the acpi_data structure.
295 */
296 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
298 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
299 msr = (msr >> 8) & 0xff;
300 return msr * 100000;
301 }
302
303 if ((!per_cpu(centrino_model, cpu)) ||
304 (!per_cpu(centrino_model, cpu)->op_points))
305 return 0;
306
307 msr &= 0xffff;
308 for (i = 0;
309 per_cpu(centrino_model, cpu)->op_points[i].frequency
310 != CPUFREQ_TABLE_END;
311 i++) {
312 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
313 return per_cpu(centrino_model, cpu)->
314 op_points[i].frequency;
315 }
316 if (failsafe)
317 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
318 else
319 return 0;
320}
321
322/* Return the current CPU frequency in kHz */
323static unsigned int get_cur_freq(unsigned int cpu)
324{
325 unsigned l, h;
326 unsigned clock_freq;
327
328 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341 return clock_freq;
342}
343
344
345static int centrino_cpu_init(struct cpufreq_policy *policy)
346{
347 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
348 unsigned freq;
349 unsigned l, h;
350 int ret;
351 int i;
352
353 /* Only Intel makes Enhanced Speedstep-capable CPUs */
354 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
355 !cpu_has(cpu, X86_FEATURE_EST))
356 return -ENODEV;
357
358 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
359 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
360
361 if (policy->cpu != 0)
362 return -ENODEV;
363
364 for (i = 0; i < N_IDS; i++)
365 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
366 break;
367
368 if (i != N_IDS)
369 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
370
371 if (!per_cpu(centrino_cpu, policy->cpu)) {
372 dprintk("found unsupported CPU with "
373 "Enhanced SpeedStep: send /proc/cpuinfo to "
374 MAINTAINER "\n");
375 return -ENODEV;
376 }
377
378 if (centrino_cpu_init_table(policy)) {
379 return -ENODEV;
380 }
381
382 /* Check to see if Enhanced SpeedStep is enabled, and try to
383 enable it if not. */
384 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
387 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
388 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
389 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
390
391 /* check to see if it stuck */
392 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 printk(KERN_INFO PFX
395 "couldn't enable Enhanced SpeedStep\n");
396 return -ENODEV;
397 }
398 }
399
400 freq = get_cur_freq(policy->cpu);
401 policy->cpuinfo.transition_latency = 10000;
402 /* 10uS transition latency */
403 policy->cur = freq;
404
405 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
406
407 ret = cpufreq_frequency_table_cpuinfo(policy,
408 per_cpu(centrino_model, policy->cpu)->op_points);
409 if (ret)
410 return (ret);
411
412 cpufreq_frequency_table_get_attr(
413 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
414
415 return 0;
416}
417
418static int centrino_cpu_exit(struct cpufreq_policy *policy)
419{
420 unsigned int cpu = policy->cpu;
421
422 if (!per_cpu(centrino_model, cpu))
423 return -ENODEV;
424
425 cpufreq_frequency_table_put_attr(cpu);
426
427 per_cpu(centrino_model, cpu) = NULL;
428
429 return 0;
430}
431
432/**
433 * centrino_verify - verifies a new CPUFreq policy
434 * @policy: new policy
435 *
436 * Limit must be within this model's frequency range, with at least one
437 * border included.
438 */
439static int centrino_verify (struct cpufreq_policy *policy)
440{
441 return cpufreq_frequency_table_verify(policy,
442 per_cpu(centrino_model, policy->cpu)->op_points);
443}
444
445/**
446 * centrino_target - set a new CPUFreq policy
447 * @policy: new policy
448 * @target_freq: the target frequency
449 * @relation: how that frequency relates to achieved frequency
450 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
451 *
452 * Sets a new CPUFreq policy.
453 */
454static int centrino_target (struct cpufreq_policy *policy,
455 unsigned int target_freq,
456 unsigned int relation)
457{
458 unsigned int newstate = 0;
459 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
460 struct cpufreq_freqs freqs;
461 int retval = 0;
462 unsigned int j, k, first_cpu, tmp;
463 cpumask_var_t covered_cpus;
464
465 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
466 return -ENOMEM;
467
468 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
469 retval = -ENODEV;
470 goto out;
471 }
472
473 if (unlikely(cpufreq_frequency_table_target(policy,
474 per_cpu(centrino_model, cpu)->op_points,
475 target_freq,
476 relation,
477 &newstate))) {
478 retval = -EINVAL;
479 goto out;
480 }
481
482 first_cpu = 1;
483 for_each_cpu(j, policy->cpus) {
484 int good_cpu;
485
486 /* cpufreq holds the hotplug lock, so we are safe here */
487 if (!cpu_online(j))
488 continue;
489
490 /*
491 * Support for SMP systems.
492		 * Make sure we are running on the CPU that wants to change the freq
493 */
494 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
495 good_cpu = cpumask_any_and(policy->cpus,
496 cpu_online_mask);
497 else
498 good_cpu = j;
499
500 if (good_cpu >= nr_cpu_ids) {
501 dprintk("couldn't limit to CPUs in this domain\n");
502 retval = -EAGAIN;
503 if (first_cpu) {
504 /* We haven't started the transition yet. */
505 goto out;
506 }
507 break;
508 }
509
510 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
511
512 if (first_cpu) {
513 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
514 if (msr == (oldmsr & 0xffff)) {
515 dprintk("no change needed - msr was and needs "
516 "to be %x\n", oldmsr);
517 retval = 0;
518 goto out;
519 }
520
521 freqs.old = extract_clock(oldmsr, cpu, 0);
522 freqs.new = extract_clock(msr, cpu, 0);
523
524 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
525 target_freq, freqs.old, freqs.new, msr);
526
527 for_each_cpu(k, policy->cpus) {
528 if (!cpu_online(k))
529 continue;
530 freqs.cpu = k;
531 cpufreq_notify_transition(&freqs,
532 CPUFREQ_PRECHANGE);
533 }
534
535 first_cpu = 0;
536 /* all but 16 LSB are reserved, treat them with care */
537 oldmsr &= ~0xffff;
538 msr &= 0xffff;
539 oldmsr |= msr;
540 }
541
542 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
543 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
544 break;
545
546 cpumask_set_cpu(j, covered_cpus);
547 }
548
549 for_each_cpu(k, policy->cpus) {
550 if (!cpu_online(k))
551 continue;
552 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
554 }
555
556 if (unlikely(retval)) {
557 /*
558 * We have failed halfway through the frequency change.
559 * We have sent callbacks to policy->cpus and
560		 * MSRs have already been written on covered_cpus.
561		 * Best-effort undo.
562 */
563
564 for_each_cpu(j, covered_cpus)
565 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
566
567 tmp = freqs.new;
568 freqs.new = freqs.old;
569 freqs.old = tmp;
570 for_each_cpu(j, policy->cpus) {
571 if (!cpu_online(j))
572 continue;
573 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
574 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
575 }
576 }
577 retval = 0;
578
579out:
580 free_cpumask_var(covered_cpus);
581 return retval;
582}
583
584static struct freq_attr* centrino_attr[] = {
585 &cpufreq_freq_attr_scaling_available_freqs,
586 NULL,
587};
588
589static struct cpufreq_driver centrino_driver = {
590 .name = "centrino", /* should be speedstep-centrino,
591 but there's a 16 char limit */
592 .init = centrino_cpu_init,
593 .exit = centrino_cpu_exit,
594 .verify = centrino_verify,
595 .target = centrino_target,
596 .get = get_cur_freq,
597 .attr = centrino_attr,
598 .owner = THIS_MODULE,
599};
600
601
602/**
603 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
604 *
605 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
606 * unsupported devices, -ENOENT if there's no voltage table for this
607 * particular CPU model, -EINVAL on problems during initialization,
608 * and zero on success.
609 *
610 * This is quite picky. Not only does the CPU have to advertise the
611 * "est" flag in the cpuid capability flags, we look for a specific
612 * CPU model and stepping, and we need to have the exact model name in
613 * our voltage tables. That is, be paranoid about not releasing
614 * someone's valuable magic smoke.
615 */
616static int __init centrino_init(void)
617{
618 struct cpuinfo_x86 *cpu = &cpu_data(0);
619
620 if (!cpu_has(cpu, X86_FEATURE_EST))
621 return -ENODEV;
622
623 return cpufreq_register_driver(&centrino_driver);
624}
625
626static void __exit centrino_exit(void)
627{
628 cpufreq_unregister_driver(&centrino_driver);
629}
630
631MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
632MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
633MODULE_LICENSE ("GPL");
634
635late_initcall(centrino_init);
636module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/sched.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static enum speedstep_processor speedstep_processor;
42
43static u32 pmbase;
44
45/*
46 * There are only two frequency states for each processor. Values
47 * are in kHz for the time being.
48 */
49static struct cpufreq_frequency_table speedstep_freqs[] = {
50 {SPEEDSTEP_HIGH, 0},
51 {SPEEDSTEP_LOW, 0},
52 {0, CPUFREQ_TABLE_END},
53};
54
55
56#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
57 "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register(void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state. Can be called from
92 * smp_call_function_single.
93 */
94static void speedstep_set_state(unsigned int state)
95{
96 u8 pm2_blk;
97 u8 value;
98 unsigned long flags;
99
100 if (state > 0x1)
101 return;
102
103 /* Disable IRQs */
104 local_irq_save(flags);
105
106 /* read state */
107 value = inb(pmbase + 0x50);
108
109 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
110
111 /* write new state */
112 value &= 0xFE;
113 value |= state;
114
115 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
116
117 /* Disable bus master arbitration */
118 pm2_blk = inb(pmbase + 0x20);
119 pm2_blk |= 0x01;
120 outb(pm2_blk, (pmbase + 0x20));
121
122 /* Actual transition */
123 outb(value, (pmbase + 0x50));
124
125 /* Restore bus master arbitration */
126 pm2_blk &= 0xfe;
127 outb(pm2_blk, (pmbase + 0x20));
128
129 /* check if transition was successful */
130 value = inb(pmbase + 0x50);
131
132 /* Enable IRQs */
133 local_irq_restore(flags);
134
135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
136
137 if (state == (value & 0x1))
138 dprintk("change to %u MHz succeeded\n",
139 speedstep_get_frequency(speedstep_processor) / 1000);
140 else
141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
142
143 return;
144}
145
146/* Wrapper for smp_call_function_single. */
147static void _speedstep_set_state(void *_state)
148{
149 speedstep_set_state(*(unsigned int *)_state);
150}
151
152/**
153 * speedstep_activate - activate SpeedStep control in the chipset
154 *
155 * Tries to activate the SpeedStep status and control registers.
156 * Returns -EINVAL on an unsupported chipset, and zero on success.
157 */
158static int speedstep_activate(void)
159{
160 u16 value = 0;
161
162 if (!speedstep_chipset_dev)
163 return -EINVAL;
164
165 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
166 if (!(value & 0x08)) {
167 value |= 0x08;
168 dprintk("activating SpeedStep (TM) registers\n");
169 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
170 }
171
172 return 0;
173}
174
175
176/**
177 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
178 *
179 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
180 * the LPC bridge / PM module which contains all power-management
181 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
182 * chipset, or zero on failure.
183 */
184static unsigned int speedstep_detect_chipset(void)
185{
186 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
187 PCI_DEVICE_ID_INTEL_82801DB_12,
188 PCI_ANY_ID, PCI_ANY_ID,
189 NULL);
190 if (speedstep_chipset_dev)
191 return 4; /* 4-M */
192
193 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
194 PCI_DEVICE_ID_INTEL_82801CA_12,
195 PCI_ANY_ID, PCI_ANY_ID,
196 NULL);
197 if (speedstep_chipset_dev)
198 return 3; /* 3-M */
199
200
201 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
202 PCI_DEVICE_ID_INTEL_82801BA_10,
203 PCI_ANY_ID, PCI_ANY_ID,
204 NULL);
205 if (speedstep_chipset_dev) {
206 /* speedstep.c causes lockups on Dell Inspirons 8000 and
207 * 8100 which use a pretty old revision of the 82815
208	 * host bridge. Abort on these systems.
209 */
210 static struct pci_dev *hostbridge;
211
212 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82815_MC,
214 PCI_ANY_ID, PCI_ANY_ID,
215 NULL);
216
217 if (!hostbridge)
218 return 2; /* 2-M */
219
220 if (hostbridge->revision < 5) {
221 dprintk("hostbridge does not support speedstep\n");
222 speedstep_chipset_dev = NULL;
223 pci_dev_put(hostbridge);
224 return 0;
225 }
226
227 pci_dev_put(hostbridge);
228 return 2; /* 2-M */
229 }
230
231 return 0;
232}
233
234static void get_freq_data(void *_speed)
235{
236 unsigned int *speed = _speed;
237
238 *speed = speedstep_get_frequency(speedstep_processor);
239}
240
241static unsigned int speedstep_get(unsigned int cpu)
242{
243 unsigned int speed;
244
245 /* You're supposed to ensure CPU is online. */
246 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
247 BUG();
248
249 dprintk("detected %u kHz as current frequency\n", speed);
250 return speed;
251}
252
253/**
254 * speedstep_target - set a new CPUFreq policy
255 * @policy: new policy
256 * @target_freq: the target frequency
257 * @relation: how that frequency relates to achieved frequency
258 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
259 *
260 * Sets a new CPUFreq policy.
261 */
262static int speedstep_target(struct cpufreq_policy *policy,
263 unsigned int target_freq,
264 unsigned int relation)
265{
266 unsigned int newstate = 0, policy_cpu;
267 struct cpufreq_freqs freqs;
268 int i;
269
270 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
271 target_freq, relation, &newstate))
272 return -EINVAL;
273
274 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
275 freqs.old = speedstep_get(policy_cpu);
276 freqs.new = speedstep_freqs[newstate].frequency;
277 freqs.cpu = policy->cpu;
278
279 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
280
281 /* no transition necessary */
282 if (freqs.old == freqs.new)
283 return 0;
284
285 for_each_cpu(i, policy->cpus) {
286 freqs.cpu = i;
287 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
288 }
289
290 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
291 true);
292
293 for_each_cpu(i, policy->cpus) {
294 freqs.cpu = i;
295 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
296 }
297
298 return 0;
299}
300
301
302/**
303 * speedstep_verify - verifies a new CPUFreq policy
304 * @policy: new policy
305 *
306 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
307 * at least one border included.
308 */
309static int speedstep_verify(struct cpufreq_policy *policy)
310{
311 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
312}
313
314struct get_freqs {
315 struct cpufreq_policy *policy;
316 int ret;
317};
318
319static void get_freqs_on_cpu(void *_get_freqs)
320{
321 struct get_freqs *get_freqs = _get_freqs;
322
323 get_freqs->ret =
324 speedstep_get_freqs(speedstep_processor,
325 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
326 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
327 &get_freqs->policy->cpuinfo.transition_latency,
328 &speedstep_set_state);
329}
330
331static int speedstep_cpu_init(struct cpufreq_policy *policy)
332{
333 int result;
334 unsigned int policy_cpu, speed;
335 struct get_freqs gf;
336
337 /* only run on CPU to be set, or on its sibling */
338#ifdef CONFIG_SMP
339 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
340#endif
341 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
342
343 /* detect low and high frequency and transition latency */
344 gf.policy = policy;
345 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
346 if (gf.ret)
347 return gf.ret;
348
349 /* get current speed setting */
350 speed = speedstep_get(policy_cpu);
351 if (!speed)
352 return -EIO;
353
354 dprintk("currently at %s speed setting - %i MHz\n",
355 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
356 ? "low" : "high",
357 (speed / 1000));
358
359 /* cpuinfo and default policy values */
360 policy->cur = speed;
361
362 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
363 if (result)
364 return result;
365
366 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
367
368 return 0;
369}
370
371
372static int speedstep_cpu_exit(struct cpufreq_policy *policy)
373{
374 cpufreq_frequency_table_put_attr(policy->cpu);
375 return 0;
376}
377
378static struct freq_attr *speedstep_attr[] = {
379 &cpufreq_freq_attr_scaling_available_freqs,
380 NULL,
381};
382
383
384static struct cpufreq_driver speedstep_driver = {
385 .name = "speedstep-ich",
386 .verify = speedstep_verify,
387 .target = speedstep_target,
388 .init = speedstep_cpu_init,
389 .exit = speedstep_cpu_exit,
390 .get = speedstep_get,
391 .owner = THIS_MODULE,
392 .attr = speedstep_attr,
393};
394
395
396/**
397 * speedstep_init - initializes the SpeedStep CPUFreq driver
398 *
399 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
400 * devices, -EINVAL on problems during initialization, and zero on
401 * success.
402 */
403static int __init speedstep_init(void)
404{
405 /* detect processor */
406 speedstep_processor = speedstep_detect_processor();
407 if (!speedstep_processor) {
408 dprintk("Intel(R) SpeedStep(TM) capable processor "
409 "not found\n");
410 return -ENODEV;
411 }
412
413 /* detect chipset */
414 if (!speedstep_detect_chipset()) {
415 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
416 "(yet) available.\n");
417 return -ENODEV;
418 }
419
420 /* activate speedstep support */
421 if (speedstep_activate()) {
422 pci_dev_put(speedstep_chipset_dev);
423 return -EINVAL;
424 }
425
426 if (speedstep_find_register())
427 return -ENODEV;
428
429 return cpufreq_register_driver(&speedstep_driver);
430}
431
432
433/**
434 * speedstep_exit - unregisters SpeedStep support
435 *
436 * Unregisters SpeedStep support.
437 */
438static void __exit speedstep_exit(void)
439{
440 pci_dev_put(speedstep_chipset_dev);
441 cpufreq_unregister_driver(&speedstep_driver);
442}
443
444
445MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
446 "Dominik Brodowski <linux@brodo.de>");
447MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
448 "with ICH-M southbridges.");
449MODULE_LICENSE("GPL");
450
451module_init(speedstep_init);
452module_exit(speedstep_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16
17#include <asm/msr.h>
18#include <asm/tsc.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
22 "speedstep-lib", msg)
23
24#define PFX "speedstep-lib: "
25
26#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
27static int relaxed_check;
28#else
29#define relaxed_check 0
30#endif
31
32/*********************************************************************
33 * GET PROCESSOR CORE SPEED IN KHZ *
34 *********************************************************************/
35
36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
37{
38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
39 struct {
40 unsigned int ratio; /* Frequency Multiplier (x10) */
41 u8 bitmap; /* power on configuration bits
42 [27, 25:22] (in MSR 0x2a) */
43 } msr_decode_mult[] = {
44 { 30, 0x01 },
45 { 35, 0x05 },
46 { 40, 0x02 },
47 { 45, 0x06 },
48 { 50, 0x00 },
49 { 55, 0x04 },
50 { 60, 0x0b },
51 { 65, 0x0f },
52 { 70, 0x09 },
53 { 75, 0x0d },
54 { 80, 0x0a },
55 { 85, 0x26 },
56 { 90, 0x20 },
57 { 100, 0x2b },
58 { 0, 0xff } /* error or unknown value */
59 };
60
61 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
62 struct {
63 unsigned int value; /* Front Side Bus speed in MHz */
64		u8 bitmap; /* power on configuration bits [19:18]
65 (in MSR 0x2a) */
66 } msr_decode_fsb[] = {
67 { 66, 0x0 },
68 { 100, 0x2 },
69 { 133, 0x1 },
70 { 0, 0xff}
71 };
72
73 u32 msr_lo, msr_tmp;
74 int i = 0, j = 0;
75
76 /* read MSR 0x2a - we only need the low 32 bits */
77 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
78 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
79 msr_tmp = msr_lo;
80
81 /* decode the FSB */
82 msr_tmp &= 0x00c0000;
83 msr_tmp >>= 18;
84 while (msr_tmp != msr_decode_fsb[i].bitmap) {
85 if (msr_decode_fsb[i].bitmap == 0xff)
86 return 0;
87 i++;
88 }
89
90 /* decode the multiplier */
91 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
92 dprintk("workaround for early PIIIs\n");
93 msr_lo &= 0x03c00000;
94 } else
95 msr_lo &= 0x0bc00000;
96 msr_lo >>= 22;
97 while (msr_lo != msr_decode_mult[j].bitmap) {
98 if (msr_decode_mult[j].bitmap == 0xff)
99 return 0;
100 j++;
101 }
102
103 dprintk("speed is %u\n",
104 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
105
106 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
107}
108
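/*
 * Worked example: power-on bits 0x2 in msr_decode_fsb select a 100 MHz
 * FSB and bits 0x00 in msr_decode_mult a x5.0 multiplier (ratio 50), so
 * the function above reports 50 * 100 * 100 = 500000 kHz, i.e. 500 MHz.
 */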
109
110static unsigned int pentiumM_get_frequency(void)
111{
112 u32 msr_lo, msr_tmp;
113
114 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
115 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
116
117 /* see table B-2 of 24547212.pdf */
118 if (msr_lo & 0x00040000) {
119 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
120 msr_lo, msr_tmp);
121 return 0;
122 }
123
124 msr_tmp = (msr_lo >> 22) & 0x1f;
125 dprintk("bits 22-26 are 0x%x, speed is %u\n",
126 msr_tmp, (msr_tmp * 100 * 1000));
127
128 return msr_tmp * 100 * 1000;
129}
130
131static unsigned int pentium_core_get_frequency(void)
132{
133 u32 fsb = 0;
134 u32 msr_lo, msr_tmp;
135 int ret;
136
137 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
138 /* see table B-2 of 25366920.pdf */
139 switch (msr_lo & 0x07) {
140 case 5:
141 fsb = 100000;
142 break;
143 case 1:
144 fsb = 133333;
145 break;
146 case 3:
147 fsb = 166667;
148 break;
149 case 2:
150 fsb = 200000;
151 break;
152 case 0:
153 fsb = 266667;
154 break;
155 case 4:
156 fsb = 333333;
157 break;
158 default:
159		printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value\n");
160 }
161
162 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
163 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
164 msr_lo, msr_tmp);
165
166 msr_tmp = (msr_lo >> 22) & 0x1f;
167 dprintk("bits 22-26 are 0x%x, speed is %u\n",
168 msr_tmp, (msr_tmp * fsb));
169
170 ret = (msr_tmp * fsb);
171 return ret;
172}
173
174
175static unsigned int pentium4_get_frequency(void)
176{
177 struct cpuinfo_x86 *c = &boot_cpu_data;
178 u32 msr_lo, msr_hi, mult;
179 unsigned int fsb = 0;
180 unsigned int ret;
181 u8 fsb_code;
182
183 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
184 * to System Bus Frequency Ratio Field in the Processor Frequency
185 * Configuration Register of the MSR. Therefore the current
186 * frequency cannot be calculated and has to be measured.
187 */
188 if (c->x86_model < 2)
189 return cpu_khz;
190
191 rdmsr(0x2c, msr_lo, msr_hi);
192
193 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
194
195 /* decode the FSB: see IA-32 Intel (C) Architecture Software
196	 * Developer's Manual, Volume 3: System Programming Guide,
197 * revision #12 in Table B-1: MSRs in the Pentium 4 and
198 * Intel Xeon Processors, on page B-4 and B-5.
199 */
200 fsb_code = (msr_lo >> 16) & 0x7;
201 switch (fsb_code) {
202 case 0:
203 fsb = 100 * 1000;
204 break;
205 case 1:
206 fsb = 13333 * 10;
207 break;
208 case 2:
209 fsb = 200 * 1000;
210 break;
211 }
212
213 if (!fsb)
214 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
215 "Please send an e-mail to <linux@brodo.de>\n");
216
217 /* Multiplier. */
218 mult = msr_lo >> 24;
219
220 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
221 fsb, mult, (fsb * mult));
222
223 ret = (fsb * mult);
224 return ret;
225}
226
227
228/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
230{
231 switch (processor) {
232 case SPEEDSTEP_CPU_PCORE:
233 return pentium_core_get_frequency();
234 case SPEEDSTEP_CPU_PM:
235 return pentiumM_get_frequency();
236 case SPEEDSTEP_CPU_P4D:
237 case SPEEDSTEP_CPU_P4M:
238 return pentium4_get_frequency();
239 case SPEEDSTEP_CPU_PIII_T:
240 case SPEEDSTEP_CPU_PIII_C:
241 case SPEEDSTEP_CPU_PIII_C_EARLY:
242 return pentium3_get_frequency(processor);
243 default:
244 return 0;
245	}
246 return 0;
247}
248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
249
250
251/*********************************************************************
252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
253 *********************************************************************/
254
255unsigned int speedstep_detect_processor(void)
256{
257 struct cpuinfo_x86 *c = &cpu_data(0);
258 u32 ebx, msr_lo, msr_hi;
259
260 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
261
262 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
263 ((c->x86 != 6) && (c->x86 != 0xF)))
264 return 0;
265
266 if (c->x86 == 0xF) {
267 /* Intel Mobile Pentium 4-M
268 * or Intel Mobile Pentium 4 with 533 MHz FSB */
269 if (c->x86_model != 2)
270 return 0;
271
272 ebx = cpuid_ebx(0x00000001);
273 ebx &= 0x000000FF;
274
275 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
276
277 switch (c->x86_mask) {
278 case 4:
279 /*
280 * B-stepping [M-P4-M]
281 * sample has ebx = 0x0f, production has 0x0e.
282 */
283 if ((ebx == 0x0e) || (ebx == 0x0f))
284 return SPEEDSTEP_CPU_P4M;
285 break;
286 case 7:
287 /*
288 * C-stepping [M-P4-M]
289 * needs to have ebx=0x0e, else it's a celeron:
290 * cf. 25130917.pdf / page 7, footnote 5 even
291 * though 25072120.pdf / page 7 doesn't say
292 * samples are only of B-stepping...
293 */
294 if (ebx == 0x0e)
295 return SPEEDSTEP_CPU_P4M;
296 break;
297 case 9:
298 /*
299 * D-stepping [M-P4-M or M-P4/533]
300 *
301 * this is totally strange: CPUID 0x0F29 is
302 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
303 * The latter need to be sorted out as they don't
304 * support speedstep.
305 * Celerons with CPUID 0x0F29 may have either
306 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
307 * specific.
308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
310 * also, M-P4M HTs have ebx=0x8, too
311 * For now, they are distinguished by the model_id
312 * string
313 */
314 if ((ebx == 0x0e) ||
315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
318 break;
319 default:
320 break;
321 }
322 return 0;
323 }
324
325 switch (c->x86_model) {
326 case 0x0B: /* Intel PIII [Tualatin] */
327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
329 ebx = cpuid_ebx(0x00000001);
330 dprintk("ebx is %x\n", ebx);
331
332 ebx &= 0x000000FF;
333
334 if (ebx != 0x06)
335 return 0;
336
337 /* So far all PIII-M processors support SpeedStep. See
338 * Intel's 24540640.pdf of June 2003
339 */
340 return SPEEDSTEP_CPU_PIII_T;
341
342 case 0x08: /* Intel PIII [Coppermine] */
343
344 /* all mobile PIII Coppermines have FSB 100 MHz
345 * ==> sort out a few desktop PIIIs. */
346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
349 msr_lo &= 0x00c0000;
350 if (msr_lo != 0x0080000)
351 return 0;
352
353 /*
354 * If the processor is a mobile version,
355	 * the platform ID has bit 50 set;
356	 * it has SpeedStep technology if either
357	 * bit 56 or 57 is set.
358 */
359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
364 if (c->x86_mask == 0x01) {
365 dprintk("early PIII version\n");
366 return SPEEDSTEP_CPU_PIII_C_EARLY;
367 } else
368 return SPEEDSTEP_CPU_PIII_C;
369 }
370
371 default:
372 return 0;
373 }
374}
375EXPORT_SYMBOL_GPL(speedstep_detect_processor);
376
377
378/*********************************************************************
379 * DETECT SPEEDSTEP SPEEDS *
380 *********************************************************************/
381
382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
383 unsigned int *low_speed,
384 unsigned int *high_speed,
385 unsigned int *transition_latency,
386 void (*set_state) (unsigned int state))
387{
388 unsigned int prev_speed;
389 unsigned int ret = 0;
390 unsigned long flags;
391 struct timeval tv1, tv2;
392
393 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
394 return -EINVAL;
395
396 dprintk("trying to determine both speeds\n");
397
398 /* get current speed */
399 prev_speed = speedstep_get_frequency(processor);
400 if (!prev_speed)
401 return -EIO;
402
403 dprintk("previous speed is %u\n", prev_speed);
404
405 local_irq_save(flags);
406
407 /* switch to low state */
408 set_state(SPEEDSTEP_LOW);
409 *low_speed = speedstep_get_frequency(processor);
410 if (!*low_speed) {
411 ret = -EIO;
412 goto out;
413 }
414
415 dprintk("low speed is %u\n", *low_speed);
416
417 /* start latency measurement */
418 if (transition_latency)
419 do_gettimeofday(&tv1);
420
421 /* switch to high state */
422 set_state(SPEEDSTEP_HIGH);
423
424 /* end latency measurement */
425 if (transition_latency)
426 do_gettimeofday(&tv2);
427
428 *high_speed = speedstep_get_frequency(processor);
429 if (!*high_speed) {
430 ret = -EIO;
431 goto out;
432 }
433
434 dprintk("high speed is %u\n", *high_speed);
435
436 if (*low_speed == *high_speed) {
437 ret = -ENODEV;
438 goto out;
439 }
440
441 /* switch to previous state, if necessary */
442 if (*high_speed != prev_speed)
443 set_state(SPEEDSTEP_LOW);
444
445 if (transition_latency) {
446 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
447 tv2.tv_usec - tv1.tv_usec;
448 dprintk("transition latency is %u uSec\n", *transition_latency);
449
450 /* convert uSec to nSec and add 20% for safety reasons */
451 *transition_latency *= 1200;
452
453 /* check if the latency measurement is too high or too low
454 * and set it to a safe value (500uSec) in that case
455 */
456 if (*transition_latency > 10000000 ||
457 *transition_latency < 50000) {
458			printk(KERN_WARNING PFX "measured frequency "
459					"transition latency seems out of range "
460					"(%u nSec), falling back to a safe value of "
461					"%u nSec.\n",
462 *transition_latency, 500000);
463 *transition_latency = 500000;
464 }
465 }
466
467out:
468 local_irq_restore(flags);
469 return ret;
470}
471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
472
473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
474module_param(relaxed_check, int, 0444);
475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
477#endif
478
479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
481MODULE_LICENSE("GPL");
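
Note on the removed library: pentium3_get_frequency() returns kHz as the multiplier (stored times ten) times the FSB speed in MHz times 100, after matching the power-on configuration bits against the two lookup tables above. The following user-space sketch restates that table-driven decode with two sample entries taken from those tables (ratio 80 for bitmap 0x0a, 100 MHz for bitmap 0x2); it is an illustration, not a drop-in replacement for the deleted code.

#include <stdio.h>

struct mult { unsigned int ratio; unsigned char bitmap; };	/* ratio is x10 */
struct fsb  { unsigned int mhz;   unsigned char bitmap; };

static const struct mult mults[] = { { 80, 0x0a }, { 0, 0xff } };
static const struct fsb  fsbs[]  = { { 100, 0x2 }, { 0, 0xff } };

static unsigned int decode_khz(unsigned char mult_bits, unsigned char fsb_bits)
{
	int i = 0, j = 0;

	while (fsb_bits != fsbs[i].bitmap) {		/* find the FSB entry */
		if (fsbs[i].bitmap == 0xff)
			return 0;
		i++;
	}
	while (mult_bits != mults[j].bitmap) {		/* find the multiplier entry */
		if (mults[j].bitmap == 0xff)
			return 0;
		j++;
	}
	/* e.g. ratio 80 (8.0x) * 100 MHz * 100 = 800000 kHz */
	return mults[j].ratio * fsbs[i].mhz * 100;
}

int main(void)
{
	printf("%u kHz\n", decode_khz(0x0a, 0x2));	/* prints "800000 kHz" */
	return 0;
}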
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14enum speedstep_processor {
15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19/* the following processors are not speedstep-capable and are not auto-detected
20 * in speedstep_detect_processor(). However, their speed can be detected using
21 * the speedstep_get_frequency() call. */
22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern enum speedstep_processor speedstep_detect_processor(void);
35
36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state's first argument is either SPEEDSTEP_HIGH or
42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated.
44 */
45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 91bc25b67bc1..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/delay.h>
21#include <linux/io.h>
22#include <asm/ist.h>
23
24#include "speedstep-lib.h"
25
26/* speedstep system management interface port/command.
27 *
28 * These parameters are obtained from the IST-SMI BIOS call.
29 * If the user supplies them, the supplied values are used instead.
30 *
31 */
32static int smi_port;
33static int smi_cmd;
34static unsigned int smi_sig;
35
36/* info about the processor */
37static enum speedstep_processor speedstep_processor;
38
39/*
40 * There are only two frequency states for each processor. Values
41 * are in kHz for the time being.
42 */
43static struct cpufreq_frequency_table speedstep_freqs[] = {
44 {SPEEDSTEP_HIGH, 0},
45 {SPEEDSTEP_LOW, 0},
46 {0, CPUFREQ_TABLE_END},
47};
48
49#define GET_SPEEDSTEP_OWNER 0
50#define GET_SPEEDSTEP_STATE 1
51#define SET_SPEEDSTEP_STATE 2
52#define GET_SPEEDSTEP_FREQS 4
53
54/* how often the SMI call should be retried if it fails, e.g. because
55 * DMA activity is going on */
56#define SMI_TRIES 5
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
59 "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership - obtain ownership of the SMI interface
63 */
64static int speedstep_smi_ownership(void)
65{
66 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n",
74 command, smi_port);
75
76 __asm__ __volatile__(
77 "push %%ebp\n"
78 "out %%al, (%%dx)\n"
79 "pop %%ebp\n"
80 : "=D" (result),
81 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
82 "=S" (dummy)
83 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
84 "D" (0), "S" (magic)
85 : "memory"
86 );
87
88 dprintk("result is %x\n", result);
89
90 return result;
91}
92
93/**
94 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
95 * @low: the low frequency value is placed here
96 * @high: the high frequency value is placed here
97 *
98 * Only available on later SpeedStep-enabled systems, returns false results or
99 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
100 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
101 */
102static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
103{
104 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
105 u32 state = 0;
106 u32 function = GET_SPEEDSTEP_FREQS;
107
108 if (!(ist_info.event & 0xFFFF)) {
109 dprintk("bug #1422 -- can't read freqs from BIOS\n");
110 return -ENODEV;
111 }
112
113 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
114
115 dprintk("trying to determine frequencies with command %x at port %x\n",
116 command, smi_port);
117
118 __asm__ __volatile__(
119 "push %%ebp\n"
120 "out %%al, (%%dx)\n"
121 "pop %%ebp"
122 : "=a" (result),
123 "=b" (high_mhz),
124 "=c" (low_mhz),
125 "=d" (state), "=D" (edi), "=S" (dummy)
126 : "a" (command),
127 "b" (function),
128 "c" (state),
129 "d" (smi_port), "S" (0), "D" (0)
130 );
131
132 dprintk("result %x, low_freq %u, high_freq %u\n",
133 result, low_mhz, high_mhz);
134
135 /* abort if results are obviously incorrect... */
136 if ((high_mhz + low_mhz) < 600)
137 return -EINVAL;
138
139 *high = high_mhz * 1000;
140 *low = low_mhz * 1000;
141
142 return result;
143}
144
145/**
146 * speedstep_get_state - read the current SpeedStep state
147 * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
148 *
149 */
150static int speedstep_get_state(void)
151{
152 u32 function = GET_SPEEDSTEP_STATE;
153 u32 result, state, edi, command, dummy;
154
155 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
156
157 dprintk("trying to determine current setting with command %x "
158 "at port %x\n", command, smi_port);
159
160 __asm__ __volatile__(
161 "push %%ebp\n"
162 "out %%al, (%%dx)\n"
163 "pop %%ebp\n"
164 : "=a" (result),
165 "=b" (state), "=D" (edi),
166 "=c" (dummy), "=d" (dummy), "=S" (dummy)
167 : "a" (command), "b" (function), "c" (0),
168 "d" (smi_port), "S" (0), "D" (0)
169 );
170
171 dprintk("state is %x, result is %x\n", state, result);
172
173 return state & 1;
174}
175
176
177/**
178 * speedstep_set_state - set the SpeedStep state
179 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
180 *
181 */
182static void speedstep_set_state(unsigned int state)
183{
184 unsigned int result = 0, command, new_state, dummy;
185 unsigned long flags;
186 unsigned int function = SET_SPEEDSTEP_STATE;
187 unsigned int retry = 0;
188
189 if (state > 0x1)
190 return;
191
192 /* Disable IRQs */
193 local_irq_save(flags);
194
195 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
196
197 dprintk("trying to set frequency to state %u "
198 "with command %x at port %x\n",
199 state, command, smi_port);
200
201 do {
202 if (retry) {
203 dprintk("retry %u, previous result %u, waiting...\n",
204 retry, result);
205 mdelay(retry * 50);
206 }
207 retry++;
208 __asm__ __volatile__(
209 "push %%ebp\n"
210 "out %%al, (%%dx)\n"
211 "pop %%ebp"
212 : "=b" (new_state), "=D" (result),
213 "=c" (dummy), "=a" (dummy),
214 "=d" (dummy), "=S" (dummy)
215 : "a" (command), "b" (function), "c" (state),
216 "d" (smi_port), "S" (0), "D" (0)
217 );
218 } while ((new_state != state) && (retry <= SMI_TRIES));
219
220 /* enable IRQs */
221 local_irq_restore(flags);
222
223 if (new_state == state)
224 dprintk("change to %u MHz succeeded after %u tries "
225 "with result %u\n",
226 (speedstep_freqs[new_state].frequency / 1000),
227 retry, result);
228 else
229 printk(KERN_ERR "cpufreq: change to state %u "
230 "failed with new_state %u and result %u\n",
231 state, new_state, result);
232
233 return;
234}
235
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: new freq
241 * @relation:
242 *
243 * Sets a new CPUFreq policy/freq.
244 */
245static int speedstep_target(struct cpufreq_policy *policy,
246 unsigned int target_freq, unsigned int relation)
247{
248 unsigned int newstate = 0;
249 struct cpufreq_freqs freqs;
250
251 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
252 target_freq, relation, &newstate))
253 return -EINVAL;
254
255 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
256 freqs.new = speedstep_freqs[newstate].frequency;
257 freqs.cpu = 0; /* speedstep.c is UP only driver */
258
259 if (freqs.old == freqs.new)
260 return 0;
261
262 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
263 speedstep_set_state(newstate);
264 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
265
266 return 0;
267}
268
269
270/**
271 * speedstep_verify - verifies a new CPUFreq policy
272 * @policy: new policy
273 *
274 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
275 * at least one border included.
276 */
277static int speedstep_verify(struct cpufreq_policy *policy)
278{
279 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
280}
281
282
283static int speedstep_cpu_init(struct cpufreq_policy *policy)
284{
285 int result;
286 unsigned int speed, state;
287 unsigned int *low, *high;
288
289 /* capability check */
290 if (policy->cpu != 0)
291 return -ENODEV;
292
293 result = speedstep_smi_ownership();
294 if (result) {
295		dprintk("failed to acquire ownership of the SMI interface.\n");
296 return -EINVAL;
297 }
298
299 /* detect low and high frequency */
300 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
301 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
302
303 result = speedstep_smi_get_freqs(low, high);
304 if (result) {
305		/* fall back to the speedstep-lib.c detection mechanism:
306 * try both states out */
307 dprintk("could not detect low and high frequencies "
308 "by SMI call.\n");
309 result = speedstep_get_freqs(speedstep_processor,
310 low, high,
311 NULL,
312 &speedstep_set_state);
313
314 if (result) {
315 dprintk("could not detect two different speeds"
316 " -- aborting.\n");
317 return result;
318 } else
319 dprintk("workaround worked.\n");
320 }
321
322 /* get current speed setting */
323 state = speedstep_get_state();
324 speed = speedstep_freqs[state].frequency;
325
326 dprintk("currently at %s speed setting - %i MHz\n",
327 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
328 ? "low" : "high",
329 (speed / 1000));
330
331 /* cpuinfo and default policy values */
332 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
333 policy->cur = speed;
334
335 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
336 if (result)
337 return result;
338
339 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
340
341 return 0;
342}
343
344static int speedstep_cpu_exit(struct cpufreq_policy *policy)
345{
346 cpufreq_frequency_table_put_attr(policy->cpu);
347 return 0;
348}
349
350static unsigned int speedstep_get(unsigned int cpu)
351{
352 if (cpu)
353 return -ENODEV;
354 return speedstep_get_frequency(speedstep_processor);
355}
356
357
358static int speedstep_resume(struct cpufreq_policy *policy)
359{
360 int result = speedstep_smi_ownership();
361
362 if (result)
363		dprintk("failed to re-acquire ownership of the SMI interface.\n");
364
365 return result;
366}
367
368static struct freq_attr *speedstep_attr[] = {
369 &cpufreq_freq_attr_scaling_available_freqs,
370 NULL,
371};
372
373static struct cpufreq_driver speedstep_driver = {
374 .name = "speedstep-smi",
375 .verify = speedstep_verify,
376 .target = speedstep_target,
377 .init = speedstep_cpu_init,
378 .exit = speedstep_cpu_exit,
379 .get = speedstep_get,
380 .resume = speedstep_resume,
381 .owner = THIS_MODULE,
382 .attr = speedstep_attr,
383};
384
385/**
386 * speedstep_init - initializes the SpeedStep CPUFreq driver
387 *
388 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
389 * BIOS, -EINVAL on problems during initialization, and zero on
390 * success.
391 */
392static int __init speedstep_init(void)
393{
394 speedstep_processor = speedstep_detect_processor();
395
396 switch (speedstep_processor) {
397 case SPEEDSTEP_CPU_PIII_T:
398 case SPEEDSTEP_CPU_PIII_C:
399 case SPEEDSTEP_CPU_PIII_C_EARLY:
400 break;
401 default:
402 speedstep_processor = 0;
403 }
404
405 if (!speedstep_processor) {
406 dprintk("No supported Intel CPU detected.\n");
407 return -ENODEV;
408 }
409
410 dprintk("signature:0x%.8lx, command:0x%.8lx, "
411 "event:0x%.8lx, perf_level:0x%.8lx.\n",
412 ist_info.signature, ist_info.command,
413 ist_info.event, ist_info.perf_level);
414
415	/* Error out if there is no IST-SMI BIOS and no port/command was given.
416	   The signature 'ISGE' stands for 'Intel Speedstep Gate E'. */
417 if ((ist_info.signature != 0x47534943) && (
418 (smi_port == 0) || (smi_cmd == 0)))
419 return -ENODEV;
420
421 if (smi_sig == 1)
422 smi_sig = 0x47534943;
423 else
424 smi_sig = ist_info.signature;
425
426	/* set up smi_port from the module parameter or the BIOS */
427 if ((smi_port > 0xff) || (smi_port < 0))
428 return -EINVAL;
429 else if (smi_port == 0)
430 smi_port = ist_info.command & 0xff;
431
432 if ((smi_cmd > 0xff) || (smi_cmd < 0))
433 return -EINVAL;
434 else if (smi_cmd == 0)
435 smi_cmd = (ist_info.command >> 16) & 0xff;
436
437 return cpufreq_register_driver(&speedstep_driver);
438}
439
440
441/**
442 * speedstep_exit - unregisters SpeedStep support
443 *
444 * Unregisters SpeedStep support.
445 */
446static void __exit speedstep_exit(void)
447{
448 cpufreq_unregister_driver(&speedstep_driver);
449}
450
451module_param(smi_port, int, 0444);
452module_param(smi_cmd, int, 0444);
453module_param(smi_sig, uint, 0444);
454
455MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
456 "-- Intel's default setting is 0xb2");
457MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
458 "-- Intel's default setting is 0x82");
459MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
460 "SMI interface.");
461
462MODULE_AUTHOR("Hiroshi Miura");
463MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
464MODULE_LICENSE("GPL");
465
466module_init(speedstep_init);
467module_exit(speedstep_exit);
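
The removed SMI driver builds its command word by keeping the upper 24 bits of the BIOS signature and substituting the SMI command into the low byte. A tiny sketch of that composition follows, using the signature constant tested in speedstep_init() and the default command value 0x82 quoted in the module parameter description; it only prints the resulting word.

#include <stdio.h>

int main(void)
{
	unsigned int smi_sig = 0x47534943;	/* signature tested in speedstep_init() */
	unsigned int smi_cmd = 0x82;		/* Intel's default IST command */
	unsigned int command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);

	printf("command word: 0x%08x\n", command);	/* prints 0x47534982 */
	return 0;
}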
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index df86bc8c859d..1edf5ba4fb2b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
29 29
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 31{
32 u64 misc_enable;
33
32 /* Unmask CPUID levels if masked: */ 34 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { 35 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37 37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { 38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
118 * (model 2) with the same problem. 118 * (model 2) with the same problem.
119 */ 119 */
120 if (c->x86 == 15) { 120 if (c->x86 == 15) {
121 u64 misc_enable;
122
123 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 121 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
124 122
125 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { 123 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
130 } 128 }
131 } 129 }
132#endif 130#endif
131
132 /*
133 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
134 * clear the fast string and enhanced fast string CPU capabilities.
135 */
136 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
137 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
138 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
139 printk(KERN_INFO "Disabled fast string operations\n");
140 setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
141 setup_clear_cpu_cap(X86_FEATURE_ERMS);
142 }
143 }
133} 144}
134 145
135#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
@@ -400,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
400 411
401 switch (c->x86_model) { 412 switch (c->x86_model) {
402 case 5: 413 case 5:
403 if (c->x86_mask == 0) { 414 if (l2 == 0)
404 if (l2 == 0) 415 p = "Celeron (Covington)";
405 p = "Celeron (Covington)"; 416 else if (l2 == 256)
406 else if (l2 == 256) 417 p = "Mobile Pentium II (Dixon)";
407 p = "Mobile Pentium II (Dixon)";
408 }
409 break; 418 break;
410 419
411 case 6: 420 case 6:
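
The new hunk in early_init_intel() clears X86_FEATURE_REP_GOOD and X86_FEATURE_ERMS whenever the fast-string enable bit in IA32_MISC_ENABLE is off. The fragment below is a user-space sketch for inspecting that same bit through the msr driver; it assumes /dev/cpu/0/msr is available (CONFIG_X86_MSR, run as root), that IA32_MISC_ENABLE is MSR 0x1a0 and that fast strings are bit 0 as documented in the SDM. It is not derived from this patch.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	uint64_t misc_enable;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0 || pread(fd, &misc_enable, sizeof(misc_enable), 0x1a0) != sizeof(misc_enable)) {
		perror("msr");
		return 1;
	}
	printf("fast string operations %sabled\n", (misc_enable & 1) ? "en" : "dis");
	close(fd);
	return 0;
}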
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 1ce1af2899df..c105c533ed94 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -327,7 +327,6 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
329 329
330 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
331 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
332} 331}
333 332
@@ -454,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
454{ 453{
455 int ret = 0; 454 int ret = 0;
456 455
457#define SUBCACHE_MASK (3UL << 20) 456 /* check if @slot is already used or the index is already disabled */
458#define SUBCACHE_INDEX 0xfff
459
460 /*
461 * check whether this slot is already used or
462 * the index is already disabled
463 */
464 ret = amd_get_l3_disable_slot(l3, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
465 if (ret >= 0) 458 if (ret >= 0)
466 return -EINVAL; 459 return -EINVAL;
467 460
468 /* 461 if (index > l3->indices)
469 * check whether the other slot has disabled the
470 * same index already
471 */
472 if (index == amd_get_l3_disable_slot(l3, !slot))
473 return -EINVAL; 462 return -EINVAL;
474 463
475 /* do not allow writes outside of allowed bits */ 464 /* check whether the other slot has disabled the same index already */
476 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 465 if (index == amd_get_l3_disable_slot(l3, !slot))
477 ((index & SUBCACHE_INDEX) > l3->indices))
478 return -EINVAL; 466 return -EINVAL;
479 467
480 amd_l3_disable_index(l3, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3385ea26f684..ff1ae9b6464d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -105,20 +105,6 @@ static int cpu_missing;
105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
107 107
108static int default_decode_mce(struct notifier_block *nb, unsigned long val,
109 void *data)
110{
111 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
112 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
113
114 return NOTIFY_STOP;
115}
116
117static struct notifier_block mce_dec_nb = {
118 .notifier_call = default_decode_mce,
119 .priority = -1,
120};
121
122/* MCA banks polled by the period polling timer for corrected events */ 108/* MCA banks polled by the period polling timer for corrected events */
123DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 109DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
124 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 110 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -212,6 +198,8 @@ void mce_log(struct mce *mce)
212 198
213static void print_mce(struct mce *m) 199static void print_mce(struct mce *m)
214{ 200{
201 int ret = 0;
202
215 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 203 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
216 m->extcpu, m->mcgstatus, m->bank, m->status); 204 m->extcpu, m->mcgstatus, m->bank, m->status);
217 205
@@ -239,7 +227,11 @@ static void print_mce(struct mce *m)
239 * Print out human-readable details about the MCE error, 227 * Print out human-readable details about the MCE error,
240 * (if the CPU has an implementation for that) 228 * (if the CPU has an implementation for that)
241 */ 229 */
242 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 230 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231 if (ret == NOTIFY_STOP)
232 return;
233
234 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
243} 235}
244 236
245#define PANIC_TIMEOUT 5 /* 5 seconds */ 237#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -590,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
590 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
591 mce_log(&m); 583 mce_log(&m);
592 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 584 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
593 add_taint(TAINT_MACHINE_CHECK);
594 } 585 }
595 586
596 /* 587 /*
@@ -1722,8 +1713,6 @@ __setup("mce", mcheck_enable);
1722 1713
1723int __init mcheck_init(void) 1714int __init mcheck_init(void)
1724{ 1715{
1725 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1726
1727 mcheck_intel_therm_init(); 1716 mcheck_intel_therm_init();
1728 1717
1729 return 0; 1718 return 0;
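
After this change print_mce() inspects the return value of the decoder chain and prints the rate-limited "mcelog --ascii" hint itself when no decoder claims the record, so the built-in default_decode_mce() notifier can be dropped. The sketch below shows how an out-of-tree decoder module could hook x86_mce_decoder_chain and suppress that fallback by returning NOTIFY_STOP; all names prefixed my_ are illustrative and not taken from the patch.

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/mce.h>

static int my_decode_mce(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	struct mce *m = data;

	pr_emerg("decoded MCE on CPU %d, bank %d\n", m->extcpu, m->bank);
	return NOTIFY_STOP;	/* handled: skip the generic mcelog hint */
}

static struct notifier_block my_mce_dec_nb = {
	.notifier_call	= my_decode_mce,
};

static int __init my_dec_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec_nb);
	return 0;
}

static void __exit my_dec_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &my_mce_dec_nb);
}

module_init(my_dec_init);
module_exit(my_dec_exit);
MODULE_LICENSE("GPL");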
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 0f034460260d..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -187,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
187 this_cpu, 187 this_cpu,
188 level == CORE_LEVEL ? "Core" : "Package", 188 level == CORE_LEVEL ? "Core" : "Package",
189 state->count); 189 state->count);
190
191 add_taint(TAINT_MACHINE_CHECK);
192 return 1; 190 return 1;
193 } 191 }
194 if (old_event) { 192 if (old_event) {
@@ -355,7 +353,6 @@ static void notify_thresholds(__u64 msr_val)
355static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
356{ 354{
357 __u64 msr_val; 355 __u64 msr_val;
358 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
359 356
360 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
361 358
@@ -367,19 +364,19 @@ static void intel_thermal_interrupt(void)
367 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
368 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
369 366
370 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
371 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
372 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
373 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
374 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
375 372
376 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
377 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
378 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
379 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
380 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
381 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
382 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
383 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
384 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
385 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
@@ -393,7 +390,6 @@ static void unexpected_thermal_interrupt(void)
393{ 390{
394 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n", 391 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
395 smp_processor_id()); 392 smp_processor_id());
396 add_taint(TAINT_MACHINE_CHECK);
397} 393}
398 394
399static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; 395static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index e638689279d3..3a0338b4b179 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -31,6 +31,7 @@
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h> 33#include <asm/smp.h>
34#include <asm/alternative.h>
34 35
35#if 0 36#if 0
36#undef wrmsrl 37#undef wrmsrl
@@ -363,12 +364,18 @@ again:
363 return new_raw_count; 364 return new_raw_count;
364} 365}
365 366
366/* using X86_FEATURE_PERFCTR_CORE to later implement ALTERNATIVE() here */
367static inline int x86_pmu_addr_offset(int index) 367static inline int x86_pmu_addr_offset(int index)
368{ 368{
369 if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE)) 369 int offset;
370 return index << 1; 370
371 return index; 371 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372 alternative_io(ASM_NOP2,
373 "shll $1, %%eax",
374 X86_FEATURE_PERFCTR_CORE,
375 "=a" (offset),
376 "a" (index));
377
378 return offset;
372} 379}
373 380
374static inline unsigned int x86_pmu_config_addr(int index) 381static inline unsigned int x86_pmu_config_addr(int index)
@@ -1766,17 +1773,6 @@ static struct pmu pmu = {
1766 * callchain support 1773 * callchain support
1767 */ 1774 */
1768 1775
1769static void
1770backtrace_warning_symbol(void *data, char *msg, unsigned long symbol)
1771{
1772 /* Ignore warnings */
1773}
1774
1775static void backtrace_warning(void *data, char *msg)
1776{
1777 /* Ignore warnings */
1778}
1779
1780static int backtrace_stack(void *data, char *name) 1776static int backtrace_stack(void *data, char *name)
1781{ 1777{
1782 return 0; 1778 return 0;
@@ -1790,8 +1786,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1790} 1786}
1791 1787
1792static const struct stacktrace_ops backtrace_ops = { 1788static const struct stacktrace_ops backtrace_ops = {
1793 .warning = backtrace_warning,
1794 .warning_symbol = backtrace_warning_symbol,
1795 .stack = backtrace_stack, 1789 .stack = backtrace_stack,
1796 .address = backtrace_address, 1790 .address = backtrace_address,
1797 .walk_stack = print_context_stack_bp, 1791 .walk_stack = print_context_stack_bp,
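
The x86_pmu_addr_offset() change replaces a runtime boot_cpu_has() test with alternative_io(): at boot the ASM_NOP2 placeholder is patched to "shll $1, %eax" only on CPUs that report X86_FEATURE_PERFCTR_CORE, so the common path carries no conditional branch. The small user-space sketch below merely restates what the two variants compute (on such CPUs the counter MSRs are interleaved, so the per-index offset doubles); it is illustrative and not part of the patch.

#include <stdio.h>

static int addr_offset(int index, int has_perfctr_core)
{
	/* same result the patched/unpatched code paths produce */
	return has_perfctr_core ? index << 1 : index;
}

int main(void)
{
	printf("%d %d\n", addr_offset(3, 0), addr_offset(3, 1));	/* prints "3 6" */
	return 0;
}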
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index cf4e369cea67..fe29c1d2219e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids
96 */ 96 */
97static const u64 amd_perfmon_event_map[] = 97static const u64 amd_perfmon_event_map[] =
98{ 98{
99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, 103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, 104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
105 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
106 [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
105}; 107};
106 108
107static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index 447a28de6f09..41178c826c48 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -36,7 +36,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
37}; 37};
38 38
39static struct event_constraint intel_core_event_constraints[] = 39static struct event_constraint intel_core_event_constraints[] __read_mostly =
40{ 40{
41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] =
47 EVENT_CONSTRAINT_END 47 EVENT_CONSTRAINT_END
48}; 48};
49 49
50static struct event_constraint intel_core2_event_constraints[] = 50static struct event_constraint intel_core2_event_constraints[] __read_mostly =
51{ 51{
52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] =
70 EVENT_CONSTRAINT_END 70 EVENT_CONSTRAINT_END
71}; 71};
72 72
73static struct event_constraint intel_nehalem_event_constraints[] = 73static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
74{ 74{
75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] =
86 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
87}; 87};
88 88
89static struct extra_reg intel_nehalem_extra_regs[] = 89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{ 90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END 92 EVENT_EXTRA_END
93}; 93};
94 94
95static struct event_constraint intel_nehalem_percore_constraints[] = 95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{ 96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0), 97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END 98 EVENT_CONSTRAINT_END
99}; 99};
100 100
101static struct event_constraint intel_westmere_event_constraints[] = 101static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
102{ 102{
103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] =
110 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
111}; 111};
112 112
113static struct event_constraint intel_snb_event_constraints[] = 113static struct event_constraint intel_snb_event_constraints[] __read_mostly =
114{ 114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] =
123 EVENT_CONSTRAINT_END 123 EVENT_CONSTRAINT_END
124}; 124};
125 125
126static struct extra_reg intel_westmere_extra_regs[] = 126static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{ 127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), 128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), 129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END 130 EVENT_EXTRA_END
131}; 131};
132 132
133static struct event_constraint intel_westmere_percore_constraints[] = 133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
134{ 134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0), 135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0), 136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END 137 EVENT_CONSTRAINT_END
138}; 138};
139 139
140static struct event_constraint intel_gen_event_constraints[] = 140static struct event_constraint intel_gen_event_constraints[] __read_mostly =
141{ 141{
142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -1440,6 +1440,11 @@ static __init int intel_pmu_init(void)
1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs; 1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442 1442
1443 /* UOPS_ISSUED.STALLED_CYCLES */
1444 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1445 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1446 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1447
1443 if (ebx & 0x40) { 1448 if (ebx & 0x40) {
1444 /* 1449 /*
1445 * Erratum AAJ80 detected, we work it around by using 1450 * Erratum AAJ80 detected, we work it around by using
@@ -1480,6 +1485,12 @@ static __init int intel_pmu_init(void)
1480 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1481 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; 1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1482 x86_pmu.extra_regs = intel_westmere_extra_regs; 1487 x86_pmu.extra_regs = intel_westmere_extra_regs;
1488
1489 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1491 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1492 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1493
1483 pr_cont("Westmere events, "); 1494 pr_cont("Westmere events, ");
1484 break; 1495 break;
1485 1496
@@ -1491,6 +1502,12 @@ static __init int intel_pmu_init(void)
1491 1502
1492 x86_pmu.event_constraints = intel_snb_event_constraints; 1503 x86_pmu.event_constraints = intel_snb_event_constraints;
1493 x86_pmu.pebs_constraints = intel_snb_pebs_events; 1504 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1505
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1508 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1509 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
1510
1494 pr_cont("SandyBridge events, "); 1511 pr_cont("SandyBridge events, ");
1495 break; 1512 break;
1496 1513
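
With the mappings added above, the generic events PERF_COUNT_HW_STALLED_CYCLES_FRONTEND and PERF_COUNT_HW_STALLED_CYCLES_BACKEND become usable on Nehalem, Westmere and SandyBridge. A minimal user-space counting sketch with perf_event_open(2) follows; error handling is pared down and the workload placeholder is hypothetical.

#include <linux/perf_event.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count = 0;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_HARDWARE;
	attr.config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND;

	/* count this task on any CPU; no group leader, no flags */
	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0)
		return 1;

	/* ... run the code to be measured here ... */

	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("frontend stall cycles: %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}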
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index e93fcd55fae1..ead584fb6a7d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -468,7 +468,7 @@ static struct p4_event_bind p4_event_bind_map[] = {
468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask = 470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS), 471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
472 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
473 }, 473 },
474 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
@@ -912,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
912 int idx, handled = 0; 912 int idx, handled = 0;
913 u64 val; 913 u64 val;
914 914
915 data.addr = 0; 915 perf_sample_data_init(&data, 0);
916 data.raw = NULL;
917 916
918 cpuc = &__get_cpu_var(cpu_hw_events); 917 cpuc = &__get_cpu_var(cpu_hw_events);
919 918
@@ -1197,7 +1196,7 @@ static __init int p4_pmu_init(void)
1197{ 1196{
1198 unsigned int low, high; 1197 unsigned int low, high;
1199 1198
1200 /* If we get stripped -- indexig fails */ 1199 /* If we get stripped -- indexing fails */
1201 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1200 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
1202 1201
1203 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1202 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index e90f08458e6b..690bc8461835 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -369,6 +369,7 @@ static struct of_ioapic_type of_ioapic_type[] =
369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize, 369static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
370 u32 *out_hwirq, u32 *out_type) 370 u32 *out_hwirq, u32 *out_type)
371{ 371{
372 struct mp_ioapic_gsi *gsi_cfg;
372 struct io_apic_irq_attr attr; 373 struct io_apic_irq_attr attr;
373 struct of_ioapic_type *it; 374 struct of_ioapic_type *it;
374 u32 line, idx, type; 375 u32 line, idx, type;
@@ -378,7 +379,8 @@ static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
378 379
379 line = *intspec; 380 line = *intspec;
380 idx = (u32) id->priv; 381 idx = (u32) id->priv;
381 *out_hwirq = line + mp_gsi_routing[idx].gsi_base; 382 gsi_cfg = mp_ioapic_gsi_routing(idx);
383 *out_hwirq = line + gsi_cfg->gsi_base;
382 384
383 intspec++; 385 intspec++;
384 type = *intspec; 386 type = *intspec;
@@ -407,7 +409,7 @@ static void __init ioapic_add_ofnode(struct device_node *np)
407 } 409 }
408 410
409 for (i = 0; i < nr_ioapics; i++) { 411 for (i = 0; i < nr_ioapics; i++) {
410 if (r.start == mp_ioapics[i].apicaddr) { 412 if (r.start == mpc_ioapic_addr(i)) {
411 struct irq_domain *id; 413 struct irq_domain *id;
412 414
413 id = kzalloc(sizeof(*id), GFP_KERNEL); 415 id = kzalloc(sizeof(*id), GFP_KERNEL);
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index e2a3f0606da4..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
135} 135}
136EXPORT_SYMBOL_GPL(print_context_stack_bp); 136EXPORT_SYMBOL_GPL(print_context_stack_bp);
137 137
138
139static void
140print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
141{
142 printk(data);
143 print_symbol(msg, symbol);
144 printk("\n");
145}
146
147static void print_trace_warning(void *data, char *msg)
148{
149 printk("%s%s\n", (char *)data, msg);
150}
151
152static int print_trace_stack(void *data, char *name) 138static int print_trace_stack(void *data, char *name)
153{ 139{
154 printk("%s <%s> ", (char *)data, name); 140 printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
166} 152}
167 153
168static const struct stacktrace_ops print_trace_ops = { 154static const struct stacktrace_ops print_trace_ops = {
169 .warning = print_trace_warning,
170 .warning_symbol = print_trace_warning_symbol,
171 .stack = print_trace_stack, 155 .stack = print_trace_stack,
172 .address = print_trace_address, 156 .address = print_trace_address,
173 .walk_stack = print_context_stack, 157 .walk_stack = print_context_stack,
@@ -279,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
279 printk("DEBUG_PAGEALLOC"); 263 printk("DEBUG_PAGEALLOC");
280#endif 264#endif
281 printk("\n"); 265 printk("\n");
282 sysfs_printk_last_file();
283 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
284 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
285 return 1; 268 return 1;
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index a93742a57468..c9a281f272fd 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -123,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
123static atomic_t nmi_running = ATOMIC_INIT(0); 123static atomic_t nmi_running = ATOMIC_INIT(0);
124static int mod_code_status; /* holds return value of text write */ 124static int mod_code_status; /* holds return value of text write */
125static void *mod_code_ip; /* holds the IP to write to */ 125static void *mod_code_ip; /* holds the IP to write to */
126static void *mod_code_newcode; /* holds the text to write to the IP */ 126static const void *mod_code_newcode; /* holds the text to write to the IP */
127 127
128static unsigned nmi_wait_count; 128static unsigned nmi_wait_count;
129static atomic_t nmi_update_count = ATOMIC_INIT(0); 129static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -225,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
225} 225}
226 226
227static int 227static int
228do_ftrace_mod_code(unsigned long ip, void *new_code) 228do_ftrace_mod_code(unsigned long ip, const void *new_code)
229{ 229{
230 /* 230 /*
231 * On x86_64, kernel text mappings are mapped read-only with 231 * On x86_64, kernel text mappings are mapped read-only with
@@ -260,14 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
260 return mod_code_status; 260 return mod_code_status;
261} 261}
262 262
263static unsigned char *ftrace_nop_replace(void) 263static const unsigned char *ftrace_nop_replace(void)
264{ 264{
265 return ideal_nop5; 265 return ideal_nops[NOP_ATOMIC5];
266} 266}
267 267
268static int 268static int
269ftrace_modify_code(unsigned long ip, unsigned char *old_code, 269ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
270 unsigned char *new_code) 270 unsigned const char *new_code)
271{ 271{
272 unsigned char replaced[MCOUNT_INSN_SIZE]; 272 unsigned char replaced[MCOUNT_INSN_SIZE];
273 273
@@ -301,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
301int ftrace_make_nop(struct module *mod, 301int ftrace_make_nop(struct module *mod,
302 struct dyn_ftrace *rec, unsigned long addr) 302 struct dyn_ftrace *rec, unsigned long addr)
303{ 303{
304 unsigned char *new, *old; 304 unsigned const char *new, *old;
305 unsigned long ip = rec->ip; 305 unsigned long ip = rec->ip;
306 306
307 old = ftrace_call_replace(ip, addr); 307 old = ftrace_call_replace(ip, addr);
@@ -312,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
312 312
313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
314{ 314{
315 unsigned char *new, *old; 315 unsigned const char *new, *old;
316 unsigned long ip = rec->ip; 316 unsigned long ip = rec->ip;
317 317
318 old = ftrace_nop_replace(); 318 old = ftrace_nop_replace();
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index d6d6bb361931..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -23,7 +23,6 @@
23static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
24{ 24{
25 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
26 x86_init.resources.probe_roms = probe_roms;
27 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
28 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
29 28
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index bfe8f729e086..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -217,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
217/* 217/*
218 * Common hpet info 218 * Common hpet info
219 */ 219 */
220static unsigned long hpet_period; 220static unsigned long hpet_freq;
221 221
222static void hpet_legacy_set_mode(enum clock_event_mode mode, 222static void hpet_legacy_set_mode(enum clock_event_mode mode,
223 struct clock_event_device *evt); 223 struct clock_event_device *evt);
@@ -232,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
233 .set_mode = hpet_legacy_set_mode, 233 .set_mode = hpet_legacy_set_mode,
234 .set_next_event = hpet_legacy_next_event, 234 .set_next_event = hpet_legacy_next_event,
235 .shift = 32,
236 .irq = 0, 235 .irq = 0,
237 .rating = 50, 236 .rating = 50,
238}; 237};
@@ -290,28 +289,12 @@ static void hpet_legacy_clockevent_register(void)
290 hpet_enable_legacy_int(); 289 hpet_enable_legacy_int();
291 290
292 /* 291 /*
293 * The mult factor is defined as (include/linux/clockchips.h)
294 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
295 * hpet_period is in units of femtoseconds (per cycle), so
296 * mult/2^shift = cyc/ns = 10^6/hpet_period
297 * mult = (10^6 * 2^shift)/hpet_period
298 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
299 */
300 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
301 hpet_period, hpet_clockevent.shift);
302 /* Calculate the min / max delta */
303 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
304 &hpet_clockevent);
305 /* Setup minimum reprogramming delta. */
306 hpet_clockevent.min_delta_ns = clockevent_delta2ns(HPET_MIN_PROG_DELTA,
307 &hpet_clockevent);
308
309 /*
310 * Start hpet with the boot cpu mask and make it 292 * Start hpet with the boot cpu mask and make it
311 * global after the IO_APIC has been initialized. 293 * global after the IO_APIC has been initialized.
312 */ 294 */
313 hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); 295 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
314 clockevents_register_device(&hpet_clockevent); 296 clockevents_config_and_register(&hpet_clockevent, hpet_freq,
297 HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
315 global_clock_event = &hpet_clockevent; 298 global_clock_event = &hpet_clockevent;
316 printk(KERN_DEBUG "hpet clockevent registered\n"); 299 printk(KERN_DEBUG "hpet clockevent registered\n");
317} 300}
@@ -549,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
549static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) 532static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
550{ 533{
551 struct clock_event_device *evt = &hdev->evt; 534 struct clock_event_device *evt = &hdev->evt;
552 uint64_t hpet_freq;
553 535
554 WARN_ON(cpu != smp_processor_id()); 536 WARN_ON(cpu != smp_processor_id());
555 if (!(hdev->flags & HPET_DEV_VALID)) 537 if (!(hdev->flags & HPET_DEV_VALID))
@@ -571,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
571 553
572 evt->set_mode = hpet_msi_set_mode; 554 evt->set_mode = hpet_msi_set_mode;
573 evt->set_next_event = hpet_msi_next_event; 555 evt->set_next_event = hpet_msi_next_event;
574 evt->shift = 32;
575
576 /*
577 * The period is a femto seconds value. We need to calculate the
578 * scaled math multiplication factor for nanosecond to hpet tick
579 * conversion.
580 */
581 hpet_freq = FSEC_PER_SEC;
582 do_div(hpet_freq, hpet_period);
583 evt->mult = div_sc((unsigned long) hpet_freq,
584 NSEC_PER_SEC, evt->shift);
585 /* Calculate the max delta */
586 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
587 /* 5 usec minimum reprogramming delta. */
588 evt->min_delta_ns = 5000;
589
590 evt->cpumask = cpumask_of(hdev->cpu); 556 evt->cpumask = cpumask_of(hdev->cpu);
591 clockevents_register_device(evt); 557
558 clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
559 0x7FFFFFFF);
592} 560}
593 561
594#ifdef CONFIG_HPET 562#ifdef CONFIG_HPET
@@ -792,7 +760,6 @@ static struct clocksource clocksource_hpet = {
792static int hpet_clocksource_register(void) 760static int hpet_clocksource_register(void)
793{ 761{
794 u64 start, now; 762 u64 start, now;
795 u64 hpet_freq;
796 cycle_t t1; 763 cycle_t t1;
797 764
798 /* Start the counter */ 765 /* Start the counter */
@@ -819,24 +786,7 @@ static int hpet_clocksource_register(void)
819 return -ENODEV; 786 return -ENODEV;
820 } 787 }
821 788
822 /*
823 * The definition of mult is (include/linux/clocksource.h)
824 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
825 * so we first need to convert hpet_period to ns/cyc units:
826 * mult/2^shift = ns/cyc = hpet_period/10^6
827 * mult = (hpet_period * 2^shift)/10^6
828 * mult = (hpet_period << shift)/FSEC_PER_NSEC
829 */
830
831 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
832 *
833 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
834 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
835 */
836 hpet_freq = FSEC_PER_SEC;
837 do_div(hpet_freq, hpet_period);
838 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); 789 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
839
840 return 0; 790 return 0;
841} 791}
842 792
@@ -845,7 +795,9 @@ static int hpet_clocksource_register(void)
845 */ 795 */
846int __init hpet_enable(void) 796int __init hpet_enable(void)
847{ 797{
798 unsigned long hpet_period;
848 unsigned int id; 799 unsigned int id;
800 u64 freq;
849 int i; 801 int i;
850 802
851 if (!is_hpet_capable()) 803 if (!is_hpet_capable())
@@ -884,6 +836,14 @@ int __init hpet_enable(void)
884 goto out_nohpet; 836 goto out_nohpet;
885 837
886 /* 838 /*
839 * The period is a femtoseconds value. Convert it to a
840 * frequency.
841 */
842 freq = FSEC_PER_SEC;
843 do_div(freq, hpet_period);
844 hpet_freq = freq;
845
846 /*
887 * Read the HPET ID register to retrieve the IRQ routing 847 * Read the HPET ID register to retrieve the IRQ routing
888 * information and the number of channels 848 * information and the number of channels
889 */ 849 */
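
The hpet.c hunks above drop the hand-rolled mult/shift math and instead compute hpet_freq once in hpet_enable(), letting clockevents_config_and_register() derive the scaling factors. The two formulations agree up to integer rounding; a small standalone check, using an illustrative ~14.31818 MHz period and a local stand-in for the kernel's div_sc() helper (the clockevents core is free to pick a different shift than the old hard-coded 32):

#include <stdio.h>
#include <stdint.h>

/* Local stand-in for the kernel's div_sc() helper, for illustration only. */
static uint64_t div_sc(uint64_t ticks, uint64_t nsec, int shift)
{
	return (ticks << shift) / nsec;
}

int main(void)
{
	uint64_t hpet_period = 69841279ULL;	/* illustrative: ~14.31818 MHz, fs per cycle */
	uint64_t mult_old, mult_new, hpet_freq;

	/* Old path: mult for the hard-coded shift of 32, straight from the period. */
	mult_old = div_sc(1000000ULL /* FSEC_PER_NSEC */, hpet_period, 32);

	/* New path: convert the period to Hz, as hpet_enable() now does ... */
	hpet_freq = 1000000000000000ULL /* FSEC_PER_SEC */ / hpet_period;
	/* ... and derive mult the way the clockevents core would for shift 32. */
	mult_new = div_sc(hpet_freq, 1000000000ULL /* NSEC_PER_SEC */, 32);

	printf("freq = %llu Hz, mult(old) = %llu, mult(new) = %llu\n",
	       (unsigned long long)hpet_freq,
	       (unsigned long long)mult_old,
	       (unsigned long long)mult_new);
	return 0;
}
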
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer, 94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event, 95 .set_next_event = pit_next_event,
96 .shift = 32,
97 .irq = 0, 96 .irq = 0,
98}; 97};
99 98
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
108 * IO_APIC has been initialized. 107 * IO_APIC has been initialized.
109 */ 108 */
110 pit_ce.cpumask = cpumask_of(smp_processor_id()); 109 pit_ce.cpumask = cpumask_of(smp_processor_id());
111 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
112 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
113 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
114 110
115 clockevents_register_device(&pit_ce); 111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
116 global_clock_event = &pit_ce; 112 global_clock_event = &pit_ce;
117} 113}
118 114
119#ifndef CONFIG_X86_64 115#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191 .mult = 0,
192 .shift = 20,
193};
194
195static int __init init_pit_clocksource(void) 116static int __init init_pit_clocksource(void)
196{ 117{
197 /* 118 /*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
205 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
206 return 0; 127 return 0;
207 128
208 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); 129 return clocksource_i8253_init();
209
210 return clocksource_register(&pit_cs);
211} 130}
212arch_initcall(init_pit_clocksource); 131arch_initcall(init_pit_clocksource);
213
214#endif /* !CONFIG_X86_64 */ 132#endif /* !CONFIG_X86_64 */
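
With the i8253.c hunks above, setup_pit_timer() hands the core the raw input clock and the min/max programmable deltas in timer ticks; the nanosecond limits that used to be computed by hand follow directly from CLOCK_TICK_RATE (1193182 Hz on x86). A quick back-of-the-envelope check:

#include <stdio.h>

int main(void)
{
	double rate = 1193182.0;	/* PIT input clock, Hz */

	printf("min delta: %.1f us\n", 0xF / rate * 1e6);	/* ~12.6 us  */
	printf("max delta: %.2f ms\n", 0x7FFF / rate * 1e3);	/* ~27.46 ms */
	return 0;
}
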
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 1cb0b9fc78dc..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -249,7 +249,7 @@ void fixup_irqs(void)
249 249
250 data = irq_desc_get_irq_data(desc); 250 data = irq_desc_get_irq_data(desc);
251 affinity = data->affinity; 251 affinity = data->affinity;
252 if (!irq_has_action(irq) || 252 if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
253 cpumask_subset(affinity, cpu_online_mask)) { 253 cpumask_subset(affinity, cpu_online_mask)) {
254 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
255 continue; 255 continue;
@@ -276,7 +276,8 @@ void fixup_irqs(void)
276 else if (!(warned++)) 276 else if (!(warned++))
277 set_affinity = 0; 277 set_affinity = 0;
278 278
279 if (!irqd_can_move_in_process_context(data) && chip->irq_unmask) 279 if (!irqd_can_move_in_process_context(data) &&
280 !irqd_irq_disabled(data) && chip->irq_unmask)
280 chip->irq_unmask(data); 281 chip->irq_unmask(data);
281 282
282 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
index 961b6b30ba90..3fee346ef545 100644
--- a/arch/x86/kernel/jump_label.c
+++ b/arch/x86/kernel/jump_label.c
@@ -34,7 +34,7 @@ void arch_jump_label_transform(struct jump_entry *entry,
34 code.offset = entry->target - 34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE); 35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else 36 } else
37 memcpy(&code, ideal_nop5, JUMP_LABEL_NOP_SIZE); 37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
38 get_online_cpus(); 38 get_online_cpus();
39 mutex_lock(&text_mutex); 39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE); 40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
@@ -44,7 +44,8 @@ void arch_jump_label_transform(struct jump_entry *entry,
44 44
45void arch_jump_label_text_poke_early(jump_label_t addr) 45void arch_jump_label_text_poke_early(jump_label_t addr)
46{ 46{
47 text_poke_early((void *)addr, ideal_nop5, JUMP_LABEL_NOP_SIZE); 47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
48} 49}
49 50
50#endif 51#endif
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index f98d3eafe07a..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
26#include <asm/x86_init.h> 26#include <asm/x86_init.h>
27#include <asm/reboot.h> 27#include <asm/reboot.h>
28 28
29#define KVM_SCALE 22
30
31static int kvmclock = 1; 29static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 30static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 31static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,8 +118,6 @@ static struct clocksource kvm_clock = {
120 .read = kvm_clock_get_cycles, 118 .read = kvm_clock_get_cycles,
121 .rating = 400, 119 .rating = 400,
122 .mask = CLOCKSOURCE_MASK(64), 120 .mask = CLOCKSOURCE_MASK(64),
123 .mult = 1 << KVM_SCALE,
124 .shift = KVM_SCALE,
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 121 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 122};
127 123
@@ -203,7 +199,7 @@ void __init kvmclock_init(void)
203 machine_ops.crash_shutdown = kvm_crash_shutdown; 199 machine_ops.crash_shutdown = kvm_crash_shutdown;
204#endif 200#endif
205 kvm_get_preset_lpj(); 201 kvm_get_preset_lpj();
206 clocksource_register(&kvm_clock); 202 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
207 pv_info.paravirt_enabled = 1; 203 pv_info.paravirt_enabled = 1;
208 pv_info.name = "KVM"; 204 pv_info.name = "KVM";
209 205
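
The kvmclock change above works because kvm_clock_get_cycles() already reports time in nanoseconds: registering the clocksource at NSEC_PER_SEC asks the core for a mult/shift pair whose ratio is 1, which is exactly what the old hard-coded KVM_SCALE pair (1 << 22, shift 22) encoded; the core may pick a different but equivalent pair. The conversion identity in isolation:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* clocksource conversion: ns = (cycles * mult) >> shift */
	uint64_t cycles = 123456789ULL;
	uint32_t mult = 1u << 22, shift = 22;	/* the old KVM_SCALE pair */

	printf("%llu cycles -> %llu ns\n",
	       (unsigned long long)cycles,
	       (unsigned long long)((cycles * mult) >> shift));	/* identical */
	return 0;
}
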
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index ab23f1ad4bf1..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/page.h> 30#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5a532ce646bf..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -285,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
285 intsrc.type = MP_INTSRC; 285 intsrc.type = MP_INTSRC;
286 intsrc.irqflag = 0; /* conforming */ 286 intsrc.irqflag = 0; /* conforming */
287 intsrc.srcbus = 0; 287 intsrc.srcbus = 0;
288 intsrc.dstapic = mp_ioapics[0].apicid; 288 intsrc.dstapic = mpc_ioapic_id(0);
289 289
290 intsrc.irqtype = mp_INT; 290 intsrc.irqtype = mp_INT;
291 291
@@ -715,17 +715,15 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
715 } 715 }
716} 716}
717 717
718static int 718static int __init
719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
720{ 720{
721 int ret = 0;
722
723 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
724 WARN(1, "update_mptable: No spare slots (length: %x)\n", count); 722 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
725 return -1; 723 return -1;
726 } 724 }
727 725
728 return ret; 726 return 0;
729} 727}
730#else /* CONFIG_X86_IO_APIC */ 728#else /* CONFIG_X86_IO_APIC */
731static 729static
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9ea999a4dcc1..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -68,74 +68,10 @@ int dma_set_mask(struct device *dev, u64 mask)
68} 68}
69EXPORT_SYMBOL(dma_set_mask); 69EXPORT_SYMBOL(dma_set_mask);
70 70
71#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
72static __initdata void *dma32_bootmem_ptr;
73static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
74
75static int __init parse_dma32_size_opt(char *p)
76{
77 if (!p)
78 return -EINVAL;
79 dma32_bootmem_size = memparse(p, &p);
80 return 0;
81}
82early_param("dma32_size", parse_dma32_size_opt);
83
84void __init dma32_reserve_bootmem(void)
85{
86 unsigned long size, align;
87 if (max_pfn <= MAX_DMA32_PFN)
88 return;
89
90 /*
91 * check aperture_64.c allocate_aperture() for reason about
92 * using 512M as goal
93 */
94 align = 64ULL<<20;
95 size = roundup(dma32_bootmem_size, align);
96 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
97 512ULL<<20);
98 /*
99 * Kmemleak should not scan this block as it may not be mapped via the
100 * kernel direct mapping.
101 */
102 kmemleak_ignore(dma32_bootmem_ptr);
103 if (dma32_bootmem_ptr)
104 dma32_bootmem_size = size;
105 else
106 dma32_bootmem_size = 0;
107}
108static void __init dma32_free_bootmem(void)
109{
110
111 if (max_pfn <= MAX_DMA32_PFN)
112 return;
113
114 if (!dma32_bootmem_ptr)
115 return;
116
117 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
118
119 dma32_bootmem_ptr = NULL;
120 dma32_bootmem_size = 0;
121}
122#else
123void __init dma32_reserve_bootmem(void)
124{
125}
126static void __init dma32_free_bootmem(void)
127{
128}
129
130#endif
131
132void __init pci_iommu_alloc(void) 71void __init pci_iommu_alloc(void)
133{ 72{
134 struct iommu_table_entry *p; 73 struct iommu_table_entry *p;
135 74
136 /* free the range so iommu could get some range less than 4G */
137 dma32_free_bootmem();
138
139 sort_iommu_table(__iommu_table, __iommu_table_end); 75 sort_iommu_table(__iommu_table, __iommu_table_end);
140 check_iommu_entries(__iommu_table, __iommu_table_end); 76 check_iommu_entries(__iommu_table, __iommu_table_end);
141 77
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
index 55d745ec1181..35ccf75696eb 100644
--- a/arch/x86/kernel/pci-iommu_table.c
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -50,20 +50,14 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish) 50 struct iommu_table_entry *finish)
51{ 51{
52 struct iommu_table_entry *p, *q, *x; 52 struct iommu_table_entry *p, *q, *x;
53 char sym_p[KSYM_SYMBOL_LEN];
54 char sym_q[KSYM_SYMBOL_LEN];
55 53
56 /* Simple cyclic dependency checker. */ 54 /* Simple cyclic dependency checker. */
57 for (p = start; p < finish; p++) { 55 for (p = start; p < finish; p++) {
58 q = find_dependents_of(start, finish, p); 56 q = find_dependents_of(start, finish, p);
59 x = find_dependents_of(start, finish, q); 57 x = find_dependents_of(start, finish, q);
60 if (p == x) { 58 if (p == x) {
61 sprint_symbol(sym_p, (unsigned long)p->detect); 59 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
62 sprint_symbol(sym_q, (unsigned long)q->detect); 60 p->detect, q->detect);
63
64 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %s depends" \
65 " on %s and vice-versa. BREAKING IT.\n",
66 sym_p, sym_q);
67 /* Heavy handed way..*/ 61 /* Heavy handed way..*/
68 x->depend = 0; 62 x->depend = 0;
69 } 63 }
@@ -72,12 +66,8 @@ void __init check_iommu_entries(struct iommu_table_entry *start,
72 for (p = start; p < finish; p++) { 66 for (p = start; p < finish; p++) {
73 q = find_dependents_of(p, finish, p); 67 q = find_dependents_of(p, finish, p);
74 if (q && q > p) { 68 if (q && q > p) {
75 sprint_symbol(sym_p, (unsigned long)p->detect); 69 printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
76 sprint_symbol(sym_q, (unsigned long)q->detect); 70 p->detect, q->detect);
77
78 printk(KERN_ERR "EXECUTION ORDER INVALID! %s "\
79 "should be called before %s!\n",
80 sym_p, sym_q);
81 } 71 }
82 } 72 }
83} 73}
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}; 74};
75 75
76/* does this oprom support the given pci device, or any of the devices
77 * that the driver supports?
78 */
79static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
80{
81 struct pci_driver *drv = pdev->driver;
82 const struct pci_device_id *id;
83
84 if (pdev->vendor == vendor && pdev->device == device)
85 return true;
86
87 for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
88 if (id->vendor == vendor && id->device == device)
89 break;
90
91 return id && id->vendor;
92}
93
94static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
95 const unsigned char *rom_list)
96{
97 unsigned short device;
98
99 do {
100 if (probe_kernel_address(rom_list, device) != 0)
101 device = 0;
102
103 if (device && match_id(pdev, vendor, device))
104 break;
105
106 rom_list += 2;
107 } while (device);
108
109 return !!device;
110}
111
112static struct resource *find_oprom(struct pci_dev *pdev)
113{
114 struct resource *oprom = NULL;
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
118 struct resource *res = &adapter_rom_resources[i];
119 unsigned short offset, vendor, device, list, rev;
120 const unsigned char *rom;
121
122 if (res->end == 0)
123 break;
124
125 rom = isa_bus_to_virt(res->start);
126 if (probe_kernel_address(rom + 0x18, offset) != 0)
127 continue;
128
129 if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
130 continue;
131
132 if (probe_kernel_address(rom + offset + 0x6, device) != 0)
133 continue;
134
135 if (match_id(pdev, vendor, device)) {
136 oprom = res;
137 break;
138 }
139
140 if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
141 probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
142 rev >= 3 && list &&
143 probe_list(pdev, vendor, rom + offset + list)) {
144 oprom = res;
145 break;
146 }
147 }
148
149 return oprom;
150}
151
152void *pci_map_biosrom(struct pci_dev *pdev)
153{
154 struct resource *oprom = find_oprom(pdev);
155
156 if (!oprom)
157 return NULL;
158
159 return ioremap(oprom->start, resource_size(oprom));
160}
161EXPORT_SYMBOL(pci_map_biosrom);
162
163void pci_unmap_biosrom(void __iomem *image)
164{
165 iounmap(image);
166}
167EXPORT_SYMBOL(pci_unmap_biosrom);
168
169size_t pci_biosrom_size(struct pci_dev *pdev)
170{
171 struct resource *oprom = find_oprom(pdev);
172
173 return oprom ? resource_size(oprom) : 0;
174}
175EXPORT_SYMBOL(pci_biosrom_size);
176
76#define ROMSIGNATURE 0xaa55 177#define ROMSIGNATURE 0xaa55
77 178
78static int __init romsignature(const unsigned char *rom) 179static int __init romsignature(const unsigned char *rom)
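
probe_roms.c now exports a small option-ROM API: pci_biosrom_size(), pci_map_biosrom() and pci_unmap_biosrom(). A hypothetical driver fragment showing the intended call sequence (the helper name and buffer handling are illustrative, not taken from any in-tree user; the prototypes are assumed to come from the new asm/probe_roms.h header):

#include <linux/pci.h>
#include <linux/io.h>

/* Hypothetical driver helper: copy this device's option ROM, if present. */
static int example_read_oprom(struct pci_dev *pdev, void *buf, size_t len)
{
	size_t size = pci_biosrom_size(pdev);
	void __iomem *rom;

	if (!size || size > len)
		return -ENODEV;

	/* pci_map_biosrom() is declared to return void *, hence the cast. */
	rom = (void __iomem *)pci_map_biosrom(pdev);
	if (!rom)
		return -ENODEV;

	memcpy_fromio(buf, rom, size);
	pci_unmap_biosrom(rom);
	return 0;
}
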
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index d46cbe46b7ab..2e4928d45a2d 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -337,7 +337,9 @@ EXPORT_SYMBOL(boot_option_idle_override);
337 * Powermanagement idle function, if any.. 337 * Powermanagement idle function, if any..
338 */ 338 */
339void (*pm_idle)(void); 339void (*pm_idle)(void);
340#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
340EXPORT_SYMBOL(pm_idle); 341EXPORT_SYMBOL(pm_idle);
342#endif
341 343
342#ifdef CONFIG_X86_32 344#ifdef CONFIG_X86_32
343/* 345/*
@@ -397,7 +399,7 @@ void default_idle(void)
397 cpu_relax(); 399 cpu_relax();
398 } 400 }
399} 401}
400#ifdef CONFIG_APM_MODULE 402#if defined(CONFIG_APM_MODULE) && defined(CONFIG_APM_CPU_IDLE)
401EXPORT_SYMBOL(default_idle); 403EXPORT_SYMBOL(default_idle);
402#endif 404#endif
403 405
@@ -449,7 +451,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
449void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 451void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
450{ 452{
451 if (!need_resched()) { 453 if (!need_resched()) {
452 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 454 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
453 clflush((void *)&current_thread_info()->flags); 455 clflush((void *)&current_thread_info()->flags);
454 456
455 __monitor((void *)&current_thread_info()->flags, 0, 0); 457 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -465,7 +467,7 @@ static void mwait_idle(void)
465 if (!need_resched()) { 467 if (!need_resched()) {
466 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 468 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
467 trace_cpu_idle(1, smp_processor_id()); 469 trace_cpu_idle(1, smp_processor_id());
468 if (cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLUSH_MONITOR)) 470 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
469 clflush((void *)&current_thread_info()->flags); 471 clflush((void *)&current_thread_info()->flags);
470 472
471 __monitor((void *)&current_thread_info()->flags, 0, 0); 473 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -535,45 +537,45 @@ int mwait_usable(const struct cpuinfo_x86 *c)
535 return (edx & MWAIT_EDX_C1); 537 return (edx & MWAIT_EDX_C1);
536} 538}
537 539
538bool c1e_detected; 540bool amd_e400_c1e_detected;
539EXPORT_SYMBOL(c1e_detected); 541EXPORT_SYMBOL(amd_e400_c1e_detected);
540 542
541static cpumask_var_t c1e_mask; 543static cpumask_var_t amd_e400_c1e_mask;
542 544
543void c1e_remove_cpu(int cpu) 545void amd_e400_remove_cpu(int cpu)
544{ 546{
545 if (c1e_mask != NULL) 547 if (amd_e400_c1e_mask != NULL)
546 cpumask_clear_cpu(cpu, c1e_mask); 548 cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
547} 549}
548 550
549/* 551/*
550 * C1E aware idle routine. We check for C1E active in the interrupt 552 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
551 * pending message MSR. If we detect C1E, then we handle it the same 553 * pending message MSR. If we detect C1E, then we handle it the same
552 * way as C3 power states (local apic timer and TSC stop) 554 * way as C3 power states (local apic timer and TSC stop)
553 */ 555 */
554static void c1e_idle(void) 556static void amd_e400_idle(void)
555{ 557{
556 if (need_resched()) 558 if (need_resched())
557 return; 559 return;
558 560
559 if (!c1e_detected) { 561 if (!amd_e400_c1e_detected) {
560 u32 lo, hi; 562 u32 lo, hi;
561 563
562 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 564 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
563 565
564 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 566 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
565 c1e_detected = true; 567 amd_e400_c1e_detected = true;
566 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 568 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
567 mark_tsc_unstable("TSC halt in AMD C1E"); 569 mark_tsc_unstable("TSC halt in AMD C1E");
568 printk(KERN_INFO "System has AMD C1E enabled\n"); 570 printk(KERN_INFO "System has AMD C1E enabled\n");
569 } 571 }
570 } 572 }
571 573
572 if (c1e_detected) { 574 if (amd_e400_c1e_detected) {
573 int cpu = smp_processor_id(); 575 int cpu = smp_processor_id();
574 576
575 if (!cpumask_test_cpu(cpu, c1e_mask)) { 577 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
576 cpumask_set_cpu(cpu, c1e_mask); 578 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
577 /* 579 /*
578 * Force broadcast so ACPI can not interfere. 580 * Force broadcast so ACPI can not interfere.
579 */ 581 */
@@ -616,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
616 pm_idle = mwait_idle; 618 pm_idle = mwait_idle;
617 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 619 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
618 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 620 /* E400: APIC timer interrupt does not wake up CPU from C1e */
619 printk(KERN_INFO "using C1E aware idle routine\n"); 621 printk(KERN_INFO "using AMD E400 aware idle routine\n");
620 pm_idle = c1e_idle; 622 pm_idle = amd_e400_idle;
621 } else 623 } else
622 pm_idle = default_idle; 624 pm_idle = default_idle;
623} 625}
624 626
625void __init init_c1e_mask(void) 627void __init init_amd_e400_c1e_mask(void)
626{ 628{
627 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 629 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
628 if (pm_idle == c1e_idle) 630 if (pm_idle == amd_e400_idle)
629 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); 631 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
630} 632}
631 633
632static int __init idle_setup(char *str) 634static int __init idle_setup(char *str)
@@ -640,6 +642,7 @@ static int __init idle_setup(char *str)
640 boot_option_idle_override = IDLE_POLL; 642 boot_option_idle_override = IDLE_POLL;
641 } else if (!strcmp(str, "mwait")) { 643 } else if (!strcmp(str, "mwait")) {
642 boot_option_idle_override = IDLE_FORCE_MWAIT; 644 boot_option_idle_override = IDLE_FORCE_MWAIT;
645 WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
643 } else if (!strcmp(str, "halt")) { 646 } else if (!strcmp(str, "halt")) {
644 /* 647 /*
645 * When the boot option of idle=halt is added, halt is 648 * When the boot option of idle=halt is added, halt is
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index f65e5b521dbd..807c2a2b80f1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1363,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1363 * We must return the syscall number to actually look up in the table. 1363 * We must return the syscall number to actually look up in the table.
1364 * This can be -1L to skip running any syscall at all. 1364 * This can be -1L to skip running any syscall at all.
1365 */ 1365 */
1366asmregparm long syscall_trace_enter(struct pt_regs *regs) 1366long syscall_trace_enter(struct pt_regs *regs)
1367{ 1367{
1368 long ret = 0; 1368 long ret = 0;
1369 1369
@@ -1408,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1408 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1409} 1409}
1410 1410
1411asmregparm void syscall_trace_leave(struct pt_regs *regs) 1411void syscall_trace_leave(struct pt_regs *regs)
1412{ 1412{
1413 bool step; 1413 bool step;
1414 1414
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 08c44b08bf5b..0c016f727695 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -36,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
36 36
37static const struct desc_ptr no_idt = {}; 37static const struct desc_ptr no_idt = {};
38static int reboot_mode; 38static int reboot_mode;
39enum reboot_type reboot_type = BOOT_KBD; 39enum reboot_type reboot_type = BOOT_ACPI;
40int reboot_force; 40int reboot_force;
41 41
42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -478,9 +478,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
478{ 478{
479} 479}
480 480
481/*
482 * Windows compatible x86 hardware expects the following on reboot:
483 *
484 * 1) If the FADT has the ACPI reboot register flag set, try it
485 * 2) If still alive, write to the keyboard controller
486 * 3) If still alive, write to the ACPI reboot register again
487 * 4) If still alive, write to the keyboard controller again
488 *
489 * If the machine is still alive at this stage, it gives up. We default to
490 * following the same pattern, except that if we're still alive after (4) we'll
491 * try to force a triple fault and then cycle between hitting the keyboard
492 * controller and doing that.
493 */
481static void native_machine_emergency_restart(void) 494static void native_machine_emergency_restart(void)
482{ 495{
483 int i; 496 int i;
497 int attempt = 0;
498 int orig_reboot_type = reboot_type;
484 499
485 if (reboot_emergency) 500 if (reboot_emergency)
486 emergency_vmx_disable_all(); 501 emergency_vmx_disable_all();
@@ -502,6 +517,13 @@ static void native_machine_emergency_restart(void)
502 outb(0xfe, 0x64); /* pulse reset low */ 517 outb(0xfe, 0x64); /* pulse reset low */
503 udelay(50); 518 udelay(50);
504 } 519 }
520 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
521 attempt = 1;
522 reboot_type = BOOT_ACPI;
523 } else {
524 reboot_type = BOOT_TRIPLE;
525 }
526 break;
505 527
506 case BOOT_TRIPLE: 528 case BOOT_TRIPLE:
507 load_idt(&no_idt); 529 load_idt(&no_idt);
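
Taken together with the existing BOOT_ACPI and BOOT_TRIPLE cases (not shown in this hunk, but assumed to fall through to the keyboard method as in the surrounding switch), the new BOOT_KBD logic above produces the retry chain the comment describes. A standalone simulation of the first few transitions, assuming every method fails to actually reset the machine:

#include <stdio.h>

enum reboot_type { BOOT_ACPI, BOOT_KBD, BOOT_TRIPLE };

int main(void)
{
	enum reboot_type type = BOOT_ACPI, orig = type;
	int attempt = 0, i;

	for (i = 0; i < 7; i++) {
		switch (type) {
		case BOOT_ACPI:
			printf("try ACPI reset register\n");
			type = BOOT_KBD;
			break;
		case BOOT_KBD:
			printf("try keyboard controller\n");
			if (attempt == 0 && orig == BOOT_ACPI) {
				attempt = 1;
				type = BOOT_ACPI;	/* second ACPI try */
			} else {
				type = BOOT_TRIPLE;
			}
			break;
		case BOOT_TRIPLE:
			printf("force triple fault\n");
			type = BOOT_KBD;	/* then alternate KBD/triple */
			break;
		}
	}
	return 0;
}
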
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 4be9b398470e..afaf38447ef5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow);
691 691
692void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
693{ 693{
694 unsigned long flags;
695
696#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
697 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
698 visws_early_detect(); 696 visws_early_detect();
@@ -912,6 +910,13 @@ void __init setup_arch(char **cmdline_p)
912 memblock.current_limit = get_max_mapped(); 910 memblock.current_limit = get_max_mapped();
913 memblock_x86_fill(); 911 memblock_x86_fill();
914 912
913 /*
914 * The EFI specification says that boot service code won't be called
915 * after ExitBootServices(). This is, in fact, a lie.
916 */
917 if (efi_enabled)
918 efi_reserve_boot_services();
919
915 /* preallocate 4k for mptable mpc */ 920 /* preallocate 4k for mptable mpc */
916 early_reserve_e820_mpc_new(); 921 early_reserve_e820_mpc_new();
917 922
@@ -948,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
948 if (init_ohci1394_dma_early) 953 if (init_ohci1394_dma_early)
949 init_ohci1394_dma_on_all_controllers(); 954 init_ohci1394_dma_on_all_controllers();
950#endif 955#endif
956 /* Allocate bigger log buffer */
957 setup_log_buf(1);
951 958
952 reserve_initrd(); 959 reserve_initrd();
953 960
@@ -966,7 +973,6 @@ void __init setup_arch(char **cmdline_p)
966 973
967 initmem_init(); 974 initmem_init();
968 memblock_find_dma_reserve(); 975 memblock_find_dma_reserve();
969 dma32_reserve_bootmem();
970 976
971#ifdef CONFIG_KVM_CLOCK 977#ifdef CONFIG_KVM_CLOCK
972 kvmclock_init(); 978 kvmclock_init();
@@ -1041,9 +1047,7 @@ void __init setup_arch(char **cmdline_p)
1041 1047
1042 mcheck_init(); 1048 mcheck_init();
1043 1049
1044 local_irq_save(flags); 1050 arch_init_ideal_nops();
1045 arch_init_ideal_nop5();
1046 local_irq_restore(flags);
1047} 1051}
1048 1052
1049#ifdef CONFIG_X86_32 1053#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..40a24932a8a1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
601 goto badframe; 601 goto badframe;
602 602
603 sigdelsetmask(&set, ~_BLOCKABLE); 603 sigdelsetmask(&set, ~_BLOCKABLE);
604 spin_lock_irq(&current->sighand->siglock); 604 set_current_blocked(&set);
605 current->blocked = set;
606 recalc_sigpending();
607 spin_unlock_irq(&current->sighand->siglock);
608 605
609 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 606 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
610 goto badframe; 607 goto badframe;
@@ -682,6 +679,7 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 sigset_t *oldset, struct pt_regs *regs) 680 sigset_t *oldset, struct pt_regs *regs)
684{ 681{
682 sigset_t blocked;
685 int ret; 683 int ret;
686 684
687 /* Are we from a system call? */ 685 /* Are we from a system call? */
@@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
741 */ 739 */
742 regs->flags &= ~X86_EFLAGS_TF; 740 regs->flags &= ~X86_EFLAGS_TF;
743 741
744 spin_lock_irq(&current->sighand->siglock); 742 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
745 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
746 if (!(ka->sa.sa_flags & SA_NODEFER)) 743 if (!(ka->sa.sa_flags & SA_NODEFER))
747 sigaddset(&current->blocked, sig); 744 sigaddset(&blocked, sig);
748 recalc_sigpending(); 745 set_current_blocked(&blocked);
749 spin_unlock_irq(&current->sighand->siglock);
750 746
751 tracehook_signal_handler(sig, info, ka, regs, 747 tracehook_signal_handler(sig, info, ka, regs,
752 test_thread_flag(TIF_SINGLESTEP)); 748 test_thread_flag(TIF_SINGLESTEP));
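
The signal.c hunks replace the open-coded siglock/recalc_sigpending() sequence with set_current_blocked(). Based purely on the lines removed above, the new helper has to do essentially the following under the hood (a sketch mirroring the removed code, not the actual kernel/signal.c implementation, which may also retarget shared pending signals):

#include <linux/sched.h>
#include <linux/signal.h>

/* Sketch only: the sequence the call sites used to open-code. */
static void set_current_blocked_sketch(const sigset_t *newset)
{
	struct task_struct *tsk = current;

	spin_lock_irq(&tsk->sighand->siglock);
	tsk->blocked = *newset;
	recalc_sigpending();
	spin_unlock_irq(&tsk->sighand->siglock);
}
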
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 513deac7228d..013e7eba83bb 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -194,14 +194,13 @@ static void native_stop_other_cpus(int wait)
194} 194}
195 195
196/* 196/*
197 * Reschedule call back. Nothing to do, 197 * Reschedule call back.
198 * all the work is done automatically when
199 * we return from the interrupt.
200 */ 198 */
201void smp_reschedule_interrupt(struct pt_regs *regs) 199void smp_reschedule_interrupt(struct pt_regs *regs)
202{ 200{
203 ack_APIC_irq(); 201 ack_APIC_irq();
204 inc_irq_stat(irq_resched_count); 202 inc_irq_stat(irq_resched_count);
203 scheduler_ipi();
205 /* 204 /*
206 * KVM uses this interrupt to force a cpu out of guest mode 205 * KVM uses this interrupt to force a cpu out of guest mode
207 */ 206 */
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index c2871d3c71b6..33a0c11797de 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -1307,7 +1307,7 @@ void play_dead_common(void)
1307{ 1307{
1308 idle_task_exit(); 1308 idle_task_exit();
1309 reset_lazy_tlbstate(); 1309 reset_lazy_tlbstate();
1310 c1e_remove_cpu(raw_smp_processor_id()); 1310 amd_e400_remove_cpu(raw_smp_processor_id());
1311 1311
1312 mb(); 1312 mb();
1313 /* Ack it */ 1313 /* Ack it */
@@ -1332,9 +1332,9 @@ static inline void mwait_play_dead(void)
1332 void *mwait_ptr; 1332 void *mwait_ptr;
1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); 1333 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1334 1334
1335 if (!(cpu_has(c, X86_FEATURE_MWAIT) && mwait_usable(c))) 1335 if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
1336 return; 1336 return;
1337 if (!cpu_has(__this_cpu_ptr(&cpu_info), X86_FEATURE_CLFLSH)) 1337 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1338 return; 1338 return;
1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF) 1339 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1340 return; 1340 return;
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index 6515733a289d..55d9bc03f696 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
11 11
12static void save_stack_warning(void *data, char *msg)
13{
14}
15
16static void
17save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
18{
19}
20
21static int save_stack_stack(void *data, char *name) 12static int save_stack_stack(void *data, char *name)
22{ 13{
23 return 0; 14 return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 44}
54 45
55static const struct stacktrace_ops save_stack_ops = { 46static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 47 .stack = save_stack_stack,
59 .address = save_stack_address, 48 .address = save_stack_address,
60 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
61}; 50};
62 51
63static const struct stacktrace_ops save_stack_ops_nosched = { 52static const struct stacktrace_ops save_stack_ops_nosched = {
64 .warning = save_stack_warning,
65 .warning_symbol = save_stack_warning_symbol,
66 .stack = save_stack_stack, 53 .stack = save_stack_stack,
67 .address = save_stack_address_nosched, 54 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack, 55 .walk_stack = print_context_stack,
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index abce34d5c79d..fbb0a045a1a2 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -344,3 +344,5 @@ ENTRY(sys_call_table)
344 .long sys_open_by_handle_at 344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime 345 .long sys_clock_adjtime
346 .long sys_syncfs 346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index 998e972f3b1a..30ac65df7d4e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = {
110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
113 .cpu_vm_mask = CPU_MASK_ALL,
114}; 113};
115 114
116static inline void switch_to_tboot_pt(void) 115static inline void switch_to_tboot_pt(void)
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd1..3f92ce07e525 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
161 } 161 }
162 162
163#endif 163#endif
164 return 0; 164 return ret;
165} 165}
166 166
167static void test_exit(void) 167static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25a28a245937..00cbb272627f 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -23,7 +23,7 @@
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
26volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
27#endif 27#endif
28 28
29unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 9335bf7dd2e7..6cc6922262af 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -763,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
763 ret : clocksource_tsc.cycle_last; 763 ret : clocksource_tsc.cycle_last;
764} 764}
765 765
766#ifdef CONFIG_X86_64
767static cycle_t __vsyscall_fn vread_tsc(void)
768{
769 cycle_t ret;
770
771 /*
772 * Surround the RDTSC by barriers, to make sure it's not
773 * speculated to outside the seqlock critical section and
774 * does not cause time warps:
775 */
776 rdtsc_barrier();
777 ret = (cycle_t)vget_cycles();
778 rdtsc_barrier();
779
780 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
781 ret : __vsyscall_gtod_data.clock.cycle_last;
782}
783#endif
784
785static void resume_tsc(struct clocksource *cs) 766static void resume_tsc(struct clocksource *cs)
786{ 767{
787 clocksource_tsc.cycle_last = 0; 768 clocksource_tsc.cycle_last = 0;
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 624a2016198e..89aed99aafce 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -161,6 +161,12 @@ SECTIONS
161 161
162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
164 170
165 . = ALIGN(4096); 171 . = ALIGN(4096);
166 __vsyscall_0 = .; 172 __vsyscall_0 = .;
@@ -175,18 +181,6 @@ SECTIONS
175 *(.vsyscall_fn) 181 *(.vsyscall_fn)
176 } 182 }
177 183
178 . = ALIGN(L1_CACHE_BYTES);
179 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
180 *(.vsyscall_gtod_data)
181 }
182
183 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
184 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
185 *(.vsyscall_clock)
186 }
187 vsyscall_clock = VVIRT(.vsyscall_clock);
188
189
190 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { 184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
191 *(.vsyscall_1) 185 *(.vsyscall_1)
192 } 186 }
@@ -194,21 +188,14 @@ SECTIONS
194 *(.vsyscall_2) 188 *(.vsyscall_2)
195 } 189 }
196 190
197 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
198 *(.vgetcpu_mode)
199 }
200 vgetcpu_mode = VVIRT(.vgetcpu_mode);
201
202 . = ALIGN(L1_CACHE_BYTES);
203 .jiffies : AT(VLOAD(.jiffies)) {
204 *(.jiffies)
205 }
206 jiffies = VVIRT(.jiffies);
207
208 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
209 *(.vsyscall_3) 192 *(.vsyscall_3)
210 } 193 }
211 194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198
212 . = __vsyscall_0 + PAGE_SIZE; 199 . = __vsyscall_0 + PAGE_SIZE;
213 200
214#undef VSYSCALL_ADDR 201#undef VSYSCALL_ADDR
@@ -216,6 +203,7 @@ SECTIONS
216#undef VLOAD 203#undef VLOAD
217#undef VVIRT_OFFSET 204#undef VVIRT_OFFSET
218#undef VVIRT 205#undef VVIRT
206#undef EMIT_VVAR
219 207
220#endif /* CONFIG_X86_64 */ 208#endif /* CONFIG_X86_64 */
221 209
@@ -306,6 +294,13 @@ SECTIONS
306 } 294 }
307 295
308 . = ALIGN(8); 296 . = ALIGN(8);
297 .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
298 __apicdrivers = .;
299 *(.apicdrivers);
300 __apicdrivers_end = .;
301 }
302
303 . = ALIGN(8);
309 /* 304 /*
310 * .exit.text is discard at runtime, not link time, to deal with 305 * .exit.text is discard at runtime, not link time, to deal with
311 * references from .altinstructions and .eh_frame 306 * references from .altinstructions and .eh_frame
@@ -319,7 +314,7 @@ SECTIONS
319 } 314 }
320 315
321#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 316#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
322 PERCPU(INTERNODE_CACHE_BYTES, PAGE_SIZE) 317 PERCPU_SECTION(INTERNODE_CACHE_BYTES)
323#endif 318#endif
324 319
325 . = ALIGN(PAGE_SIZE); 320 . = ALIGN(PAGE_SIZE);
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 000000000000..a81aa9e9894c
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
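
The closing comment in vread_tsc() is worth calling out: the empty asm statement exists only to keep gcc from if-converting the fallback path into a cmov, so the common case stays a well-predicted branch with no data dependence on the cycle_last load. The same idiom in isolation (whether gcc would actually emit cmov without it depends on compiler version and flags):

#include <stdio.h>

#define likely(x) __builtin_expect(!!(x), 1)

static unsigned long clamp_monotonic(unsigned long val, unsigned long last)
{
	if (likely(val >= last))
		return val;

	asm volatile ("");	/* empty asm: discourages if-conversion to cmov */
	return last;
}

int main(void)
{
	printf("%lu %lu\n", clamp_monotonic(10, 5), clamp_monotonic(5, 10));
	return 0;
}
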
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b694..3e682184d76c 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,17 +49,10 @@
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory" 50#define __syscall_clobber "r11","cx","memory"
51 51
52/* 52DEFINE_VVAR(int, vgetcpu_mode);
53 * vsyscall_gtod_data contains data that is : 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54 * - readonly from vsyscalls
55 * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
56 * Try to keep this structure as small as possible to avoid cache line ping pongs
57 */
58int __vgetcpu_mode __section_vgetcpu_mode;
59
60struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
61{ 54{
62 .lock = SEQLOCK_UNLOCKED, 55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
63 .sysctl_enabled = 1, 56 .sysctl_enabled = 1,
64}; 57};
65 58
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
97 */ 90 */
98static __always_inline void do_get_tz(struct timezone * tz) 91static __always_inline void do_get_tz(struct timezone * tz)
99{ 92{
100 *tz = __vsyscall_gtod_data.sys_tz; 93 *tz = VVAR(vsyscall_gtod_data).sys_tz;
101} 94}
102 95
103static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
126 unsigned long mult, shift, nsec; 119 unsigned long mult, shift, nsec;
127 cycle_t (*vread)(void); 120 cycle_t (*vread)(void);
128 do { 121 do {
129 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
130 123
131 vread = __vsyscall_gtod_data.clock.vread; 124 vread = VVAR(vsyscall_gtod_data).clock.vread;
132 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { 125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
133 gettimeofday(tv,NULL); 127 gettimeofday(tv,NULL);
134 return; 128 return;
135 } 129 }
136 130
137 now = vread(); 131 now = vread();
138 base = __vsyscall_gtod_data.clock.cycle_last; 132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
139 mask = __vsyscall_gtod_data.clock.mask; 133 mask = VVAR(vsyscall_gtod_data).clock.mask;
140 mult = __vsyscall_gtod_data.clock.mult; 134 mult = VVAR(vsyscall_gtod_data).clock.mult;
141 shift = __vsyscall_gtod_data.clock.shift; 135 shift = VVAR(vsyscall_gtod_data).clock.shift;
142 136
143 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; 137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
144 nsec = __vsyscall_gtod_data.wall_time_nsec; 138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
145 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
146 140
147 /* calculate interval: */ 141 /* calculate interval: */
148 cycle_delta = (now - base) & mask; 142 cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
171{ 165{
172 unsigned seq; 166 unsigned seq;
173 time_t result; 167 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
175 return time_syscall(t); 169 return time_syscall(t);
176 170
177 do { 171 do {
178 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
179 173
180 result = __vsyscall_gtod_data.wall_time_sec; 174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
181 175
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
183 177
184 if (t) 178 if (t)
185 *t = result; 179 *t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
208 We do this here because otherwise user space would do it on 202 We do this here because otherwise user space would do it on
209 its own in a likely inferior way (no access to jiffies). 203 its own in a likely inferior way (no access to jiffies).
210 If you don't like it pass NULL. */ 204 If you don't like it pass NULL. */
211 if (tcache && tcache->blob[0] == (j = __jiffies)) { 205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
212 p = tcache->blob[1]; 206 p = tcache->blob[1];
213 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
214 /* Load per CPU data from RDTSCP */ 208 /* Load per CPU data from RDTSCP */
215 native_read_tscp(&p); 209 native_read_tscp(&p);
216 } else { 210 } else {
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 75ef4b18e9b7..6f164bd5e14d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -35,7 +35,7 @@ void iommu_shutdown_noop(void) { }
35struct x86_init_ops x86_init __initdata = { 35struct x86_init_ops x86_init __initdata = {
36 36
37 .resources = { 37 .resources = {
38 .probe_roms = x86_init_noop, 38 .probe_roms = probe_roms,
39 .reserve_resources = reserve_standard_io_resources, 39 .reserve_resources = reserve_standard_io_resources,
40 .memory_setup = default_machine_specific_memory_setup, 40 .memory_setup = default_machine_specific_memory_setup,
41 }, 41 },
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0ad47b819a8b..d6e2477feb18 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -73,9 +73,14 @@
73#define MemAbs (1<<11) /* Memory operand is absolute displacement */ 73#define MemAbs (1<<11) /* Memory operand is absolute displacement */
74#define String (1<<12) /* String instruction (rep capable) */ 74#define String (1<<12) /* String instruction (rep capable) */
75#define Stack (1<<13) /* Stack instruction (push/pop) */ 75#define Stack (1<<13) /* Stack instruction (push/pop) */
76#define GroupMask (7<<14) /* Opcode uses one of the group mechanisms */
76#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 77#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
77#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 78#define GroupDual (2<<14) /* Alternate decoding of mod == 3 */
79#define Prefix (3<<14) /* Instruction varies with 66/f2/f3 prefix */
80#define RMExt (4<<14) /* Opcode extension in ModRM r/m if mod == 3 */
81#define Sse (1<<17) /* SSE Vector instruction */
78/* Misc flags */ 82/* Misc flags */
83#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
79#define VendorSpecific (1<<22) /* Vendor specific instruction */ 84#define VendorSpecific (1<<22) /* Vendor specific instruction */
80#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */ 85#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
81#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */ 86#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
@@ -102,11 +107,14 @@
102 107
103struct opcode { 108struct opcode {
104 u32 flags; 109 u32 flags;
110 u8 intercept;
105 union { 111 union {
106 int (*execute)(struct x86_emulate_ctxt *ctxt); 112 int (*execute)(struct x86_emulate_ctxt *ctxt);
107 struct opcode *group; 113 struct opcode *group;
108 struct group_dual *gdual; 114 struct group_dual *gdual;
115 struct gprefix *gprefix;
109 } u; 116 } u;
117 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
110}; 118};
111 119
112struct group_dual { 120struct group_dual {
@@ -114,6 +122,13 @@ struct group_dual {
114 struct opcode mod3[8]; 122 struct opcode mod3[8];
115}; 123};
116 124
125struct gprefix {
126 struct opcode pfx_no;
127 struct opcode pfx_66;
128 struct opcode pfx_f2;
129 struct opcode pfx_f3;
130};
131
117/* EFLAGS bit definitions. */ 132/* EFLAGS bit definitions. */
118#define EFLG_ID (1<<21) 133#define EFLG_ID (1<<21)
119#define EFLG_VIP (1<<20) 134#define EFLG_VIP (1<<20)
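
With the flag changes above, the group-decoding variants stop being independent bits and become values of a single 3-bit field selected by GroupMask, which makes room for the new Prefix and RMExt mechanisms. A standalone illustration of the encoding (whether x86_decode_insn() dispatches with exactly this switch is not shown in the hunk):

#include <stdio.h>

#define GroupMask (7u << 14)	/* opcode uses one of the group mechanisms */
#define Group     (1u << 14)	/* ModRM reg bits extend the opcode */
#define GroupDual (2u << 14)	/* alternate decoding when mod == 3 */
#define Prefix    (3u << 14)	/* instruction varies with 66/F2/F3 prefix */
#define RMExt     (4u << 14)	/* extension in ModRM r/m if mod == 3 */

int main(void)
{
	unsigned int flags = GroupDual;	/* ORed with other, non-overlapping flag bits */

	switch (flags & GroupMask) {
	case Group:	puts("group");			break;
	case GroupDual:	puts("group dual");		break;
	case Prefix:	puts("prefix");			break;
	case RMExt:	puts("rm extension");		break;
	default:	puts("no group mechanism");	break;
	}
	return 0;
}
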
@@ -248,42 +263,42 @@ struct group_dual {
248 "w", "r", _LO32, "r", "", "r") 263 "w", "r", _LO32, "r", "", "r")
249 264
250/* Instruction has three operands and one operand is stored in ECX register */ 265/* Instruction has three operands and one operand is stored in ECX register */
251#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 266#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
252 do { \ 267 do { \
253 unsigned long _tmp; \ 268 unsigned long _tmp; \
254 _type _clv = (_cl).val; \ 269 _type _clv = (_cl).val; \
255 _type _srcv = (_src).val; \ 270 _type _srcv = (_src).val; \
256 _type _dstv = (_dst).val; \ 271 _type _dstv = (_dst).val; \
257 \ 272 \
258 __asm__ __volatile__ ( \ 273 __asm__ __volatile__ ( \
259 _PRE_EFLAGS("0", "5", "2") \ 274 _PRE_EFLAGS("0", "5", "2") \
260 _op _suffix " %4,%1 \n" \ 275 _op _suffix " %4,%1 \n" \
261 _POST_EFLAGS("0", "5", "2") \ 276 _POST_EFLAGS("0", "5", "2") \
262 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 277 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
263 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 278 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
264 ); \ 279 ); \
265 \ 280 \
266 (_cl).val = (unsigned long) _clv; \ 281 (_cl).val = (unsigned long) _clv; \
267 (_src).val = (unsigned long) _srcv; \ 282 (_src).val = (unsigned long) _srcv; \
268 (_dst).val = (unsigned long) _dstv; \ 283 (_dst).val = (unsigned long) _dstv; \
269 } while (0) 284 } while (0)
270 285
271#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 286#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
272 do { \ 287 do { \
273 switch ((_dst).bytes) { \ 288 switch ((_dst).bytes) { \
274 case 2: \ 289 case 2: \
275 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 290 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
276 "w", unsigned short); \ 291 "w", unsigned short); \
277 break; \ 292 break; \
278 case 4: \ 293 case 4: \
279 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 294 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
280 "l", unsigned int); \ 295 "l", unsigned int); \
281 break; \ 296 break; \
282 case 8: \ 297 case 8: \
283 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 298 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
284 "q", unsigned long)); \ 299 "q", unsigned long)); \
285 break; \ 300 break; \
286 } \ 301 } \
287 } while (0) 302 } while (0)
288 303
289#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 304#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -346,13 +361,25 @@ struct group_dual {
346 } while (0) 361 } while (0)
347 362
348/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */ 363/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
349#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \ 364#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
350 do { \ 365 do { \
351 switch((_src).bytes) { \ 366 switch((_src).bytes) { \
352 case 1: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "b"); break; \ 367 case 1: \
353 case 2: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "w"); break; \ 368 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
354 case 4: __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "l"); break; \ 369 _eflags, "b"); \
355 case 8: ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, "q")); break; \ 370 break; \
371 case 2: \
372 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
373 _eflags, "w"); \
374 break; \
375 case 4: \
376 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
377 _eflags, "l"); \
378 break; \
379 case 8: \
380 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
381 _eflags, "q")); \
382 break; \
356 } \ 383 } \
357 } while (0) 384 } while (0)
358 385
@@ -388,13 +415,33 @@ struct group_dual {
388 (_type)_x; \ 415 (_type)_x; \
389}) 416})
390 417
391#define insn_fetch_arr(_arr, _size, _eip) \ 418#define insn_fetch_arr(_arr, _size, _eip) \
392({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ 419({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
393 if (rc != X86EMUL_CONTINUE) \ 420 if (rc != X86EMUL_CONTINUE) \
394 goto done; \ 421 goto done; \
395 (_eip) += (_size); \ 422 (_eip) += (_size); \
396}) 423})
397 424
425static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
426 enum x86_intercept intercept,
427 enum x86_intercept_stage stage)
428{
429 struct x86_instruction_info info = {
430 .intercept = intercept,
431 .rep_prefix = ctxt->decode.rep_prefix,
432 .modrm_mod = ctxt->decode.modrm_mod,
433 .modrm_reg = ctxt->decode.modrm_reg,
434 .modrm_rm = ctxt->decode.modrm_rm,
435 .src_val = ctxt->decode.src.val64,
436 .src_bytes = ctxt->decode.src.bytes,
437 .dst_bytes = ctxt->decode.dst.bytes,
438 .ad_bytes = ctxt->decode.ad_bytes,
439 .next_rip = ctxt->eip,
440 };
441
442 return ctxt->ops->intercept(ctxt, &info, stage);
443}
444
398static inline unsigned long ad_mask(struct decode_cache *c) 445static inline unsigned long ad_mask(struct decode_cache *c)
399{ 446{
400 return (1UL << (c->ad_bytes << 3)) - 1; 447 return (1UL << (c->ad_bytes << 3)) - 1;
@@ -430,6 +477,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
430 register_address_increment(c, &c->eip, rel); 477 register_address_increment(c, &c->eip, rel);
431} 478}
432 479
480static u32 desc_limit_scaled(struct desc_struct *desc)
481{
482 u32 limit = get_desc_limit(desc);
483
484 return desc->g ? (limit << 12) | 0xfff : limit;
485}
486
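A minimal standalone sketch (assuming the 20-bit raw limit has already been pulled out of the descriptor) of the scaling desc_limit_scaled() performs above: with the granularity bit set the limit counts 4 KiB pages, so it is shifted left by 12 with the low bits filled in.

#include <stdio.h>
#include <stdint.h>

/* Sketch of desc_limit_scaled(): with G=1 the 20-bit limit is in
 * 4 KiB units, so shift left by 12 and fill the low bits; with G=0
 * the limit is in bytes and used as-is. */
static uint32_t scale_limit(uint32_t raw_limit, int g)
{
	return g ? (raw_limit << 12) | 0xfff : raw_limit;
}

int main(void)
{
	printf("%#x\n", scale_limit(0xfffff, 1));	/* 0xffffffff */
	printf("%#x\n", scale_limit(0xfffff, 0));	/* 0xfffff */
	return 0;
}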
433static void set_seg_override(struct decode_cache *c, int seg) 487static void set_seg_override(struct decode_cache *c, int seg)
434{ 488{
435 c->has_seg_override = true; 489 c->has_seg_override = true;
@@ -442,11 +496,10 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
442 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 496 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
443 return 0; 497 return 0;
444 498
445 return ops->get_cached_segment_base(seg, ctxt->vcpu); 499 return ops->get_cached_segment_base(ctxt, seg);
446} 500}
447 501
448static unsigned seg_override(struct x86_emulate_ctxt *ctxt, 502static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
449 struct x86_emulate_ops *ops,
450 struct decode_cache *c) 503 struct decode_cache *c)
451{ 504{
452 if (!c->has_seg_override) 505 if (!c->has_seg_override)
@@ -455,18 +508,6 @@ static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
455 return c->seg_override; 508 return c->seg_override;
456} 509}
457 510
458static ulong linear(struct x86_emulate_ctxt *ctxt,
459 struct segmented_address addr)
460{
461 struct decode_cache *c = &ctxt->decode;
462 ulong la;
463
464 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
465 if (c->ad_bytes != 8)
466 la &= (u32)-1;
467 return la;
468}
469
470static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 511static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
471 u32 error, bool valid) 512 u32 error, bool valid)
472{ 513{
@@ -476,11 +517,21 @@ static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
476 return X86EMUL_PROPAGATE_FAULT; 517 return X86EMUL_PROPAGATE_FAULT;
477} 518}
478 519
520static int emulate_db(struct x86_emulate_ctxt *ctxt)
521{
522 return emulate_exception(ctxt, DB_VECTOR, 0, false);
523}
524
479static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 525static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
480{ 526{
481 return emulate_exception(ctxt, GP_VECTOR, err, true); 527 return emulate_exception(ctxt, GP_VECTOR, err, true);
482} 528}
483 529
530static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
531{
532 return emulate_exception(ctxt, SS_VECTOR, err, true);
533}
534
484static int emulate_ud(struct x86_emulate_ctxt *ctxt) 535static int emulate_ud(struct x86_emulate_ctxt *ctxt)
485{ 536{
486 return emulate_exception(ctxt, UD_VECTOR, 0, false); 537 return emulate_exception(ctxt, UD_VECTOR, 0, false);
@@ -496,6 +547,128 @@ static int emulate_de(struct x86_emulate_ctxt *ctxt)
496 return emulate_exception(ctxt, DE_VECTOR, 0, false); 547 return emulate_exception(ctxt, DE_VECTOR, 0, false);
497} 548}
498 549
550static int emulate_nm(struct x86_emulate_ctxt *ctxt)
551{
552 return emulate_exception(ctxt, NM_VECTOR, 0, false);
553}
554
555static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
556{
557 u16 selector;
558 struct desc_struct desc;
559
560 ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
561 return selector;
562}
563
564static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
565 unsigned seg)
566{
567 u16 dummy;
568 u32 base3;
569 struct desc_struct desc;
570
571 ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
572 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
573}
574
575static int __linearize(struct x86_emulate_ctxt *ctxt,
576 struct segmented_address addr,
577 unsigned size, bool write, bool fetch,
578 ulong *linear)
579{
580 struct decode_cache *c = &ctxt->decode;
581 struct desc_struct desc;
582 bool usable;
583 ulong la;
584 u32 lim;
585 u16 sel;
586 unsigned cpl, rpl;
587
588 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
589 switch (ctxt->mode) {
590 case X86EMUL_MODE_REAL:
591 break;
592 case X86EMUL_MODE_PROT64:
593 if (((signed long)la << 16) >> 16 != la)
594 return emulate_gp(ctxt, 0);
595 break;
596 default:
597 usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
598 addr.seg);
599 if (!usable)
600 goto bad;
601 /* code segment or read-only data segment */
602 if (((desc.type & 8) || !(desc.type & 2)) && write)
603 goto bad;
604 /* unreadable code segment */
605 if (!fetch && (desc.type & 8) && !(desc.type & 2))
606 goto bad;
607 lim = desc_limit_scaled(&desc);
608 if ((desc.type & 8) || !(desc.type & 4)) {
609 /* expand-up segment */
610 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
611 goto bad;
612 } else {
 613 /* expand-down segment */
614 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
615 goto bad;
616 lim = desc.d ? 0xffffffff : 0xffff;
617 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
618 goto bad;
619 }
620 cpl = ctxt->ops->cpl(ctxt);
621 rpl = sel & 3;
622 cpl = max(cpl, rpl);
623 if (!(desc.type & 8)) {
624 /* data segment */
625 if (cpl > desc.dpl)
626 goto bad;
627 } else if ((desc.type & 8) && !(desc.type & 4)) {
628 /* nonconforming code segment */
629 if (cpl != desc.dpl)
630 goto bad;
631 } else if ((desc.type & 8) && (desc.type & 4)) {
632 /* conforming code segment */
633 if (cpl < desc.dpl)
634 goto bad;
635 }
636 break;
637 }
638 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
639 la &= (u32)-1;
640 *linear = la;
641 return X86EMUL_CONTINUE;
642bad:
643 if (addr.seg == VCPU_SREG_SS)
644 return emulate_ss(ctxt, addr.seg);
645 else
646 return emulate_gp(ctxt, addr.seg);
647}
648
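A minimal standalone sketch of the canonical-address test used in the X86EMUL_MODE_PROT64 branch of __linearize() above, assuming 48-bit virtual addresses (bits 63..47 must all be copies of bit 47):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Shift the address up by 16 and back down arithmetically; only a
 * canonical address survives the round trip unchanged, which is what
 * the PROT64 check in __linearize() relies on. */
static bool is_canonical(uint64_t la)
{
	return ((int64_t)(la << 16) >> 16) == (int64_t)la;
}

int main(void)
{
	printf("%d\n", is_canonical(0x00007fffffffffffULL));	/* 1 */
	printf("%d\n", is_canonical(0xffff800000000000ULL));	/* 1 */
	printf("%d\n", is_canonical(0x0000800000000000ULL));	/* 0 */
	return 0;
}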
649static int linearize(struct x86_emulate_ctxt *ctxt,
650 struct segmented_address addr,
651 unsigned size, bool write,
652 ulong *linear)
653{
654 return __linearize(ctxt, addr, size, write, false, linear);
655}
656
657
658static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
659 struct segmented_address addr,
660 void *data,
661 unsigned size)
662{
663 int rc;
664 ulong linear;
665
666 rc = linearize(ctxt, addr, size, false, &linear);
667 if (rc != X86EMUL_CONTINUE)
668 return rc;
669 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
670}
671
499static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 672static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
500 struct x86_emulate_ops *ops, 673 struct x86_emulate_ops *ops,
501 unsigned long eip, u8 *dest) 674 unsigned long eip, u8 *dest)
@@ -505,10 +678,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
505 int size, cur_size; 678 int size, cur_size;
506 679
507 if (eip == fc->end) { 680 if (eip == fc->end) {
681 unsigned long linear;
682 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
508 cur_size = fc->end - fc->start; 683 cur_size = fc->end - fc->start;
509 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 684 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
510 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 685 rc = __linearize(ctxt, addr, size, false, true, &linear);
511 size, ctxt->vcpu, &ctxt->exception); 686 if (rc != X86EMUL_CONTINUE)
687 return rc;
688 rc = ops->fetch(ctxt, linear, fc->data + cur_size,
689 size, &ctxt->exception);
512 if (rc != X86EMUL_CONTINUE) 690 if (rc != X86EMUL_CONTINUE)
513 return rc; 691 return rc;
514 fc->end += size; 692 fc->end += size;
@@ -551,7 +729,6 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
551} 729}
552 730
553static int read_descriptor(struct x86_emulate_ctxt *ctxt, 731static int read_descriptor(struct x86_emulate_ctxt *ctxt,
554 struct x86_emulate_ops *ops,
555 struct segmented_address addr, 732 struct segmented_address addr,
556 u16 *size, unsigned long *address, int op_bytes) 733 u16 *size, unsigned long *address, int op_bytes)
557{ 734{
@@ -560,13 +737,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
560 if (op_bytes == 2) 737 if (op_bytes == 2)
561 op_bytes = 3; 738 op_bytes = 3;
562 *address = 0; 739 *address = 0;
563 rc = ops->read_std(linear(ctxt, addr), (unsigned long *)size, 2, 740 rc = segmented_read_std(ctxt, addr, size, 2);
564 ctxt->vcpu, &ctxt->exception);
565 if (rc != X86EMUL_CONTINUE) 741 if (rc != X86EMUL_CONTINUE)
566 return rc; 742 return rc;
567 addr.ea += 2; 743 addr.ea += 2;
568 rc = ops->read_std(linear(ctxt, addr), address, op_bytes, 744 rc = segmented_read_std(ctxt, addr, address, op_bytes);
569 ctxt->vcpu, &ctxt->exception);
570 return rc; 745 return rc;
571} 746}
572 747
@@ -623,7 +798,63 @@ static void fetch_register_operand(struct operand *op)
623 } 798 }
624} 799}
625 800
626static void decode_register_operand(struct operand *op, 801static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
802{
803 ctxt->ops->get_fpu(ctxt);
804 switch (reg) {
805 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
806 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
807 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
808 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
809 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
810 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
811 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
812 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
813#ifdef CONFIG_X86_64
814 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
815 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
816 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
817 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
818 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
819 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
820 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
821 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
822#endif
823 default: BUG();
824 }
825 ctxt->ops->put_fpu(ctxt);
826}
827
828static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
829 int reg)
830{
831 ctxt->ops->get_fpu(ctxt);
832 switch (reg) {
833 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
834 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
835 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
836 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
837 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
838 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
839 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
840 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
841#ifdef CONFIG_X86_64
842 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
843 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
844 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
845 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
846 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
847 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
848 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
849 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
850#endif
851 default: BUG();
852 }
853 ctxt->ops->put_fpu(ctxt);
854}
855
856static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
857 struct operand *op,
627 struct decode_cache *c, 858 struct decode_cache *c,
628 int inhibit_bytereg) 859 int inhibit_bytereg)
629{ 860{
@@ -632,6 +863,15 @@ static void decode_register_operand(struct operand *op,
632 863
633 if (!(c->d & ModRM)) 864 if (!(c->d & ModRM))
634 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 865 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
866
867 if (c->d & Sse) {
868 op->type = OP_XMM;
869 op->bytes = 16;
870 op->addr.xmm = reg;
871 read_sse_reg(ctxt, &op->vec_val, reg);
872 return;
873 }
874
635 op->type = OP_REG; 875 op->type = OP_REG;
636 if ((c->d & ByteOp) && !inhibit_bytereg) { 876 if ((c->d & ByteOp) && !inhibit_bytereg) {
637 op->addr.reg = decode_register(reg, c->regs, highbyte_regs); 877 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
@@ -671,6 +911,13 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
671 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 911 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
672 op->addr.reg = decode_register(c->modrm_rm, 912 op->addr.reg = decode_register(c->modrm_rm,
673 c->regs, c->d & ByteOp); 913 c->regs, c->d & ByteOp);
914 if (c->d & Sse) {
915 op->type = OP_XMM;
916 op->bytes = 16;
917 op->addr.xmm = c->modrm_rm;
918 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
919 return rc;
920 }
674 fetch_register_operand(op); 921 fetch_register_operand(op);
675 return rc; 922 return rc;
676 } 923 }
@@ -819,8 +1066,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
819 if (mc->pos < mc->end) 1066 if (mc->pos < mc->end)
820 goto read_cached; 1067 goto read_cached;
821 1068
822 rc = ops->read_emulated(addr, mc->data + mc->end, n, 1069 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
823 &ctxt->exception, ctxt->vcpu); 1070 &ctxt->exception);
824 if (rc != X86EMUL_CONTINUE) 1071 if (rc != X86EMUL_CONTINUE)
825 return rc; 1072 return rc;
826 mc->end += n; 1073 mc->end += n;
@@ -834,6 +1081,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
834 return X86EMUL_CONTINUE; 1081 return X86EMUL_CONTINUE;
835} 1082}
836 1083
1084static int segmented_read(struct x86_emulate_ctxt *ctxt,
1085 struct segmented_address addr,
1086 void *data,
1087 unsigned size)
1088{
1089 int rc;
1090 ulong linear;
1091
1092 rc = linearize(ctxt, addr, size, false, &linear);
1093 if (rc != X86EMUL_CONTINUE)
1094 return rc;
1095 return read_emulated(ctxt, ctxt->ops, linear, data, size);
1096}
1097
1098static int segmented_write(struct x86_emulate_ctxt *ctxt,
1099 struct segmented_address addr,
1100 const void *data,
1101 unsigned size)
1102{
1103 int rc;
1104 ulong linear;
1105
1106 rc = linearize(ctxt, addr, size, true, &linear);
1107 if (rc != X86EMUL_CONTINUE)
1108 return rc;
1109 return ctxt->ops->write_emulated(ctxt, linear, data, size,
1110 &ctxt->exception);
1111}
1112
1113static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1114 struct segmented_address addr,
1115 const void *orig_data, const void *data,
1116 unsigned size)
1117{
1118 int rc;
1119 ulong linear;
1120
1121 rc = linearize(ctxt, addr, size, true, &linear);
1122 if (rc != X86EMUL_CONTINUE)
1123 return rc;
1124 return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
1125 size, &ctxt->exception);
1126}
1127
837static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1128static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
838 struct x86_emulate_ops *ops, 1129 struct x86_emulate_ops *ops,
839 unsigned int size, unsigned short port, 1130 unsigned int size, unsigned short port,
@@ -854,7 +1145,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
854 if (n == 0) 1145 if (n == 0)
855 n = 1; 1146 n = 1;
856 rc->pos = rc->end = 0; 1147 rc->pos = rc->end = 0;
857 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) 1148 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
858 return 0; 1149 return 0;
859 rc->end = n * size; 1150 rc->end = n * size;
860 } 1151 }
@@ -864,28 +1155,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
864 return 1; 1155 return 1;
865} 1156}
866 1157
867static u32 desc_limit_scaled(struct desc_struct *desc)
868{
869 u32 limit = get_desc_limit(desc);
870
871 return desc->g ? (limit << 12) | 0xfff : limit;
872}
873
874static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1158static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
875 struct x86_emulate_ops *ops, 1159 struct x86_emulate_ops *ops,
876 u16 selector, struct desc_ptr *dt) 1160 u16 selector, struct desc_ptr *dt)
877{ 1161{
878 if (selector & 1 << 2) { 1162 if (selector & 1 << 2) {
879 struct desc_struct desc; 1163 struct desc_struct desc;
1164 u16 sel;
1165
880 memset (dt, 0, sizeof *dt); 1166 memset (dt, 0, sizeof *dt);
881 if (!ops->get_cached_descriptor(&desc, NULL, VCPU_SREG_LDTR, 1167 if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
882 ctxt->vcpu))
883 return; 1168 return;
884 1169
885 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 1170 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
886 dt->address = get_desc_base(&desc); 1171 dt->address = get_desc_base(&desc);
887 } else 1172 } else
888 ops->get_gdt(dt, ctxt->vcpu); 1173 ops->get_gdt(ctxt, dt);
889} 1174}
890 1175
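A small illustrative sketch of the selector fields consulted here: get_descriptor_table_ptr() tests bit 2 (the table indicator) to choose the LDT over the GDT, while read/write_segment_descriptor() use selector >> 3 as the index into 8-byte entries, with the low two bits carrying the RPL. The selector value below is only an example.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t selector = 0x002b;		/* example selector value */
	unsigned rpl   = selector & 3;		/* requested privilege level */
	unsigned ti    = (selector >> 2) & 1;	/* 0 = GDT, 1 = LDT */
	unsigned index = selector >> 3;		/* descriptor table index */

	printf("rpl=%u table=%s index=%u offset=%u\n",
	       rpl, ti ? "LDT" : "GDT", index, index * 8);
	return 0;
}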
 891/* allowed only for 8-byte segments */ 1176
@@ -903,8 +1188,7 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
903 if (dt.size < index * 8 + 7) 1188 if (dt.size < index * 8 + 7)
904 return emulate_gp(ctxt, selector & 0xfffc); 1189 return emulate_gp(ctxt, selector & 0xfffc);
905 addr = dt.address + index * 8; 1190 addr = dt.address + index * 8;
906 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, 1191 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
907 &ctxt->exception);
908 1192
909 return ret; 1193 return ret;
910} 1194}
@@ -925,8 +1209,7 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
925 return emulate_gp(ctxt, selector & 0xfffc); 1209 return emulate_gp(ctxt, selector & 0xfffc);
926 1210
927 addr = dt.address + index * 8; 1211 addr = dt.address + index * 8;
928 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, 1212 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
929 &ctxt->exception);
930 1213
931 return ret; 1214 return ret;
932} 1215}
@@ -986,7 +1269,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
986 1269
987 rpl = selector & 3; 1270 rpl = selector & 3;
988 dpl = seg_desc.dpl; 1271 dpl = seg_desc.dpl;
989 cpl = ops->cpl(ctxt->vcpu); 1272 cpl = ops->cpl(ctxt);
990 1273
991 switch (seg) { 1274 switch (seg) {
992 case VCPU_SREG_SS: 1275 case VCPU_SREG_SS:
@@ -1042,8 +1325,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1042 return ret; 1325 return ret;
1043 } 1326 }
1044load: 1327load:
1045 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1328 ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1046 ops->set_cached_descriptor(&seg_desc, 0, seg, ctxt->vcpu);
1047 return X86EMUL_CONTINUE; 1329 return X86EMUL_CONTINUE;
1048exception: 1330exception:
1049 emulate_exception(ctxt, err_vec, err_code, true); 1331 emulate_exception(ctxt, err_vec, err_code, true);
@@ -1069,8 +1351,7 @@ static void write_register_operand(struct operand *op)
1069 } 1351 }
1070} 1352}
1071 1353
1072static inline int writeback(struct x86_emulate_ctxt *ctxt, 1354static int writeback(struct x86_emulate_ctxt *ctxt)
1073 struct x86_emulate_ops *ops)
1074{ 1355{
1075 int rc; 1356 int rc;
1076 struct decode_cache *c = &ctxt->decode; 1357 struct decode_cache *c = &ctxt->decode;
@@ -1081,23 +1362,22 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1081 break; 1362 break;
1082 case OP_MEM: 1363 case OP_MEM:
1083 if (c->lock_prefix) 1364 if (c->lock_prefix)
1084 rc = ops->cmpxchg_emulated( 1365 rc = segmented_cmpxchg(ctxt,
1085 linear(ctxt, c->dst.addr.mem), 1366 c->dst.addr.mem,
1086 &c->dst.orig_val, 1367 &c->dst.orig_val,
1087 &c->dst.val, 1368 &c->dst.val,
1088 c->dst.bytes, 1369 c->dst.bytes);
1089 &ctxt->exception,
1090 ctxt->vcpu);
1091 else 1370 else
1092 rc = ops->write_emulated( 1371 rc = segmented_write(ctxt,
1093 linear(ctxt, c->dst.addr.mem), 1372 c->dst.addr.mem,
1094 &c->dst.val, 1373 &c->dst.val,
1095 c->dst.bytes, 1374 c->dst.bytes);
1096 &ctxt->exception,
1097 ctxt->vcpu);
1098 if (rc != X86EMUL_CONTINUE) 1375 if (rc != X86EMUL_CONTINUE)
1099 return rc; 1376 return rc;
1100 break; 1377 break;
1378 case OP_XMM:
1379 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
1380 break;
1101 case OP_NONE: 1381 case OP_NONE:
1102 /* no writeback */ 1382 /* no writeback */
1103 break; 1383 break;
@@ -1107,21 +1387,21 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1107 return X86EMUL_CONTINUE; 1387 return X86EMUL_CONTINUE;
1108} 1388}
1109 1389
1110static inline void emulate_push(struct x86_emulate_ctxt *ctxt, 1390static int em_push(struct x86_emulate_ctxt *ctxt)
1111 struct x86_emulate_ops *ops)
1112{ 1391{
1113 struct decode_cache *c = &ctxt->decode; 1392 struct decode_cache *c = &ctxt->decode;
1393 struct segmented_address addr;
1114 1394
1115 c->dst.type = OP_MEM;
1116 c->dst.bytes = c->op_bytes;
1117 c->dst.val = c->src.val;
1118 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1395 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1119 c->dst.addr.mem.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1396 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1120 c->dst.addr.mem.seg = VCPU_SREG_SS; 1397 addr.seg = VCPU_SREG_SS;
1398
1399 /* Disable writeback. */
1400 c->dst.type = OP_NONE;
1401 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
1121} 1402}
1122 1403
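A toy userspace sketch of the sequence em_push() now performs directly (with writeback disabled): decrement the stack pointer by the operand size, then write the source value at the new SS:RSP. The flat byte-array stack is only an illustration, not how the emulator addresses guest memory.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

/* Toy model of em_push(): adjust the stack pointer first, then store
 * the operand at the new top of stack. */
static void toy_push(uint8_t *stack, unsigned long *rsp,
		     const void *val, unsigned op_bytes)
{
	*rsp -= op_bytes;
	memcpy(stack + *rsp, val, op_bytes);
}

int main(void)
{
	uint8_t stack[16] = { 0 };
	unsigned long rsp = sizeof(stack);
	uint32_t val = 0x00000202;		/* e.g. EFLAGS for em_pushf */

	toy_push(stack, &rsp, &val, sizeof(val));
	printf("rsp=%lu top=%02x %02x %02x %02x\n", rsp,
	       stack[rsp], stack[rsp + 1], stack[rsp + 2], stack[rsp + 3]);
	return 0;
}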
1123static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1404static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1124 struct x86_emulate_ops *ops,
1125 void *dest, int len) 1405 void *dest, int len)
1126{ 1406{
1127 struct decode_cache *c = &ctxt->decode; 1407 struct decode_cache *c = &ctxt->decode;
@@ -1130,7 +1410,7 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1130 1410
1131 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]); 1411 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1132 addr.seg = VCPU_SREG_SS; 1412 addr.seg = VCPU_SREG_SS;
1133 rc = read_emulated(ctxt, ops, linear(ctxt, addr), dest, len); 1413 rc = segmented_read(ctxt, addr, dest, len);
1134 if (rc != X86EMUL_CONTINUE) 1414 if (rc != X86EMUL_CONTINUE)
1135 return rc; 1415 return rc;
1136 1416
@@ -1138,6 +1418,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1138 return rc; 1418 return rc;
1139} 1419}
1140 1420
1421static int em_pop(struct x86_emulate_ctxt *ctxt)
1422{
1423 struct decode_cache *c = &ctxt->decode;
1424
1425 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1426}
1427
1141static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1428static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1142 struct x86_emulate_ops *ops, 1429 struct x86_emulate_ops *ops,
1143 void *dest, int len) 1430 void *dest, int len)
@@ -1145,9 +1432,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1145 int rc; 1432 int rc;
1146 unsigned long val, change_mask; 1433 unsigned long val, change_mask;
1147 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1434 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1148 int cpl = ops->cpl(ctxt->vcpu); 1435 int cpl = ops->cpl(ctxt);
1149 1436
1150 rc = emulate_pop(ctxt, ops, &val, len); 1437 rc = emulate_pop(ctxt, &val, len);
1151 if (rc != X86EMUL_CONTINUE) 1438 if (rc != X86EMUL_CONTINUE)
1152 return rc; 1439 return rc;
1153 1440
@@ -1179,14 +1466,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1179 return rc; 1466 return rc;
1180} 1467}
1181 1468
1182static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1469static int em_popf(struct x86_emulate_ctxt *ctxt)
1183 struct x86_emulate_ops *ops, int seg)
1184{ 1470{
1185 struct decode_cache *c = &ctxt->decode; 1471 struct decode_cache *c = &ctxt->decode;
1186 1472
1187 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); 1473 c->dst.type = OP_REG;
1474 c->dst.addr.reg = &ctxt->eflags;
1475 c->dst.bytes = c->op_bytes;
1476 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1477}
1188 1478
1189 emulate_push(ctxt, ops); 1479static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1480 struct x86_emulate_ops *ops, int seg)
1481{
1482 struct decode_cache *c = &ctxt->decode;
1483
1484 c->src.val = get_segment_selector(ctxt, seg);
1485
1486 return em_push(ctxt);
1190} 1487}
1191 1488
1192static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1489static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1196,7 +1493,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1196 unsigned long selector; 1493 unsigned long selector;
1197 int rc; 1494 int rc;
1198 1495
1199 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1496 rc = emulate_pop(ctxt, &selector, c->op_bytes);
1200 if (rc != X86EMUL_CONTINUE) 1497 if (rc != X86EMUL_CONTINUE)
1201 return rc; 1498 return rc;
1202 1499
@@ -1204,8 +1501,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1204 return rc; 1501 return rc;
1205} 1502}
1206 1503
1207static int emulate_pusha(struct x86_emulate_ctxt *ctxt, 1504static int em_pusha(struct x86_emulate_ctxt *ctxt)
1208 struct x86_emulate_ops *ops)
1209{ 1505{
1210 struct decode_cache *c = &ctxt->decode; 1506 struct decode_cache *c = &ctxt->decode;
1211 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1507 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1216,23 +1512,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1216 (reg == VCPU_REGS_RSP) ? 1512 (reg == VCPU_REGS_RSP) ?
1217 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1513 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1218 1514
1219 emulate_push(ctxt, ops); 1515 rc = em_push(ctxt);
1220
1221 rc = writeback(ctxt, ops);
1222 if (rc != X86EMUL_CONTINUE) 1516 if (rc != X86EMUL_CONTINUE)
1223 return rc; 1517 return rc;
1224 1518
1225 ++reg; 1519 ++reg;
1226 } 1520 }
1227 1521
1228 /* Disable writeback. */
1229 c->dst.type = OP_NONE;
1230
1231 return rc; 1522 return rc;
1232} 1523}
1233 1524
1234static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1525static int em_pushf(struct x86_emulate_ctxt *ctxt)
1235 struct x86_emulate_ops *ops) 1526{
1527 struct decode_cache *c = &ctxt->decode;
1528
1529 c->src.val = (unsigned long)ctxt->eflags;
1530 return em_push(ctxt);
1531}
1532
1533static int em_popa(struct x86_emulate_ctxt *ctxt)
1236{ 1534{
1237 struct decode_cache *c = &ctxt->decode; 1535 struct decode_cache *c = &ctxt->decode;
1238 int rc = X86EMUL_CONTINUE; 1536 int rc = X86EMUL_CONTINUE;
@@ -1245,7 +1543,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1245 --reg; 1543 --reg;
1246 } 1544 }
1247 1545
1248 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1546 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
1249 if (rc != X86EMUL_CONTINUE) 1547 if (rc != X86EMUL_CONTINUE)
1250 break; 1548 break;
1251 --reg; 1549 --reg;
@@ -1265,37 +1563,32 @@ int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1265 1563
1266 /* TODO: Add limit checks */ 1564 /* TODO: Add limit checks */
1267 c->src.val = ctxt->eflags; 1565 c->src.val = ctxt->eflags;
1268 emulate_push(ctxt, ops); 1566 rc = em_push(ctxt);
1269 rc = writeback(ctxt, ops);
1270 if (rc != X86EMUL_CONTINUE) 1567 if (rc != X86EMUL_CONTINUE)
1271 return rc; 1568 return rc;
1272 1569
1273 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC); 1570 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1274 1571
1275 c->src.val = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 1572 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1276 emulate_push(ctxt, ops); 1573 rc = em_push(ctxt);
1277 rc = writeback(ctxt, ops);
1278 if (rc != X86EMUL_CONTINUE) 1574 if (rc != X86EMUL_CONTINUE)
1279 return rc; 1575 return rc;
1280 1576
1281 c->src.val = c->eip; 1577 c->src.val = c->eip;
1282 emulate_push(ctxt, ops); 1578 rc = em_push(ctxt);
1283 rc = writeback(ctxt, ops);
1284 if (rc != X86EMUL_CONTINUE) 1579 if (rc != X86EMUL_CONTINUE)
1285 return rc; 1580 return rc;
1286 1581
1287 c->dst.type = OP_NONE; 1582 ops->get_idt(ctxt, &dt);
1288
1289 ops->get_idt(&dt, ctxt->vcpu);
1290 1583
1291 eip_addr = dt.address + (irq << 2); 1584 eip_addr = dt.address + (irq << 2);
1292 cs_addr = dt.address + (irq << 2) + 2; 1585 cs_addr = dt.address + (irq << 2) + 2;
1293 1586
1294 rc = ops->read_std(cs_addr, &cs, 2, ctxt->vcpu, &ctxt->exception); 1587 rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
1295 if (rc != X86EMUL_CONTINUE) 1588 if (rc != X86EMUL_CONTINUE)
1296 return rc; 1589 return rc;
1297 1590
1298 rc = ops->read_std(eip_addr, &eip, 2, ctxt->vcpu, &ctxt->exception); 1591 rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
1299 if (rc != X86EMUL_CONTINUE) 1592 if (rc != X86EMUL_CONTINUE)
1300 return rc; 1593 return rc;
1301 1594
@@ -1339,7 +1632,7 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1339 1632
1340 /* TODO: Add stack limit check */ 1633 /* TODO: Add stack limit check */
1341 1634
1342 rc = emulate_pop(ctxt, ops, &temp_eip, c->op_bytes); 1635 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
1343 1636
1344 if (rc != X86EMUL_CONTINUE) 1637 if (rc != X86EMUL_CONTINUE)
1345 return rc; 1638 return rc;
@@ -1347,12 +1640,12 @@ static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1347 if (temp_eip & ~0xffff) 1640 if (temp_eip & ~0xffff)
1348 return emulate_gp(ctxt, 0); 1641 return emulate_gp(ctxt, 0);
1349 1642
1350 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1643 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1351 1644
1352 if (rc != X86EMUL_CONTINUE) 1645 if (rc != X86EMUL_CONTINUE)
1353 return rc; 1646 return rc;
1354 1647
1355 rc = emulate_pop(ctxt, ops, &temp_eflags, c->op_bytes); 1648 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
1356 1649
1357 if (rc != X86EMUL_CONTINUE) 1650 if (rc != X86EMUL_CONTINUE)
1358 return rc; 1651 return rc;
@@ -1394,15 +1687,31 @@ static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1394 } 1687 }
1395} 1688}
1396 1689
1397static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1690static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1398 struct x86_emulate_ops *ops) 1691{
1692 struct decode_cache *c = &ctxt->decode;
1693 int rc;
1694 unsigned short sel;
1695
1696 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1697
1698 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
1699 if (rc != X86EMUL_CONTINUE)
1700 return rc;
1701
1702 c->eip = 0;
1703 memcpy(&c->eip, c->src.valptr, c->op_bytes);
1704 return X86EMUL_CONTINUE;
1705}
1706
1707static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1399{ 1708{
1400 struct decode_cache *c = &ctxt->decode; 1709 struct decode_cache *c = &ctxt->decode;
1401 1710
1402 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1711 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1403} 1712}
1404 1713
1405static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1714static int em_grp2(struct x86_emulate_ctxt *ctxt)
1406{ 1715{
1407 struct decode_cache *c = &ctxt->decode; 1716 struct decode_cache *c = &ctxt->decode;
1408 switch (c->modrm_reg) { 1717 switch (c->modrm_reg) {
@@ -1429,10 +1738,10 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1429 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1738 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1430 break; 1739 break;
1431 } 1740 }
1741 return X86EMUL_CONTINUE;
1432} 1742}
1433 1743
1434static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, 1744static int em_grp3(struct x86_emulate_ctxt *ctxt)
1435 struct x86_emulate_ops *ops)
1436{ 1745{
1437 struct decode_cache *c = &ctxt->decode; 1746 struct decode_cache *c = &ctxt->decode;
1438 unsigned long *rax = &c->regs[VCPU_REGS_RAX]; 1747 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
@@ -1471,10 +1780,10 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1471 return X86EMUL_CONTINUE; 1780 return X86EMUL_CONTINUE;
1472} 1781}
1473 1782
1474static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1783static int em_grp45(struct x86_emulate_ctxt *ctxt)
1475 struct x86_emulate_ops *ops)
1476{ 1784{
1477 struct decode_cache *c = &ctxt->decode; 1785 struct decode_cache *c = &ctxt->decode;
1786 int rc = X86EMUL_CONTINUE;
1478 1787
1479 switch (c->modrm_reg) { 1788 switch (c->modrm_reg) {
1480 case 0: /* inc */ 1789 case 0: /* inc */
@@ -1488,21 +1797,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1488 old_eip = c->eip; 1797 old_eip = c->eip;
1489 c->eip = c->src.val; 1798 c->eip = c->src.val;
1490 c->src.val = old_eip; 1799 c->src.val = old_eip;
1491 emulate_push(ctxt, ops); 1800 rc = em_push(ctxt);
1492 break; 1801 break;
1493 } 1802 }
1494 case 4: /* jmp abs */ 1803 case 4: /* jmp abs */
1495 c->eip = c->src.val; 1804 c->eip = c->src.val;
1496 break; 1805 break;
1806 case 5: /* jmp far */
1807 rc = em_jmp_far(ctxt);
1808 break;
1497 case 6: /* push */ 1809 case 6: /* push */
1498 emulate_push(ctxt, ops); 1810 rc = em_push(ctxt);
1499 break; 1811 break;
1500 } 1812 }
1501 return X86EMUL_CONTINUE; 1813 return rc;
1502} 1814}
1503 1815
1504static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1816static int em_grp9(struct x86_emulate_ctxt *ctxt)
1505 struct x86_emulate_ops *ops)
1506{ 1817{
1507 struct decode_cache *c = &ctxt->decode; 1818 struct decode_cache *c = &ctxt->decode;
1508 u64 old = c->dst.orig_val64; 1819 u64 old = c->dst.orig_val64;
@@ -1528,12 +1839,12 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1528 int rc; 1839 int rc;
1529 unsigned long cs; 1840 unsigned long cs;
1530 1841
1531 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1842 rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
1532 if (rc != X86EMUL_CONTINUE) 1843 if (rc != X86EMUL_CONTINUE)
1533 return rc; 1844 return rc;
1534 if (c->op_bytes == 4) 1845 if (c->op_bytes == 4)
1535 c->eip = (u32)c->eip; 1846 c->eip = (u32)c->eip;
1536 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1847 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1537 if (rc != X86EMUL_CONTINUE) 1848 if (rc != X86EMUL_CONTINUE)
1538 return rc; 1849 return rc;
1539 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1850 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
@@ -1562,8 +1873,10 @@ setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1562 struct x86_emulate_ops *ops, struct desc_struct *cs, 1873 struct x86_emulate_ops *ops, struct desc_struct *cs,
1563 struct desc_struct *ss) 1874 struct desc_struct *ss)
1564{ 1875{
1876 u16 selector;
1877
1565 memset(cs, 0, sizeof(struct desc_struct)); 1878 memset(cs, 0, sizeof(struct desc_struct));
1566 ops->get_cached_descriptor(cs, NULL, VCPU_SREG_CS, ctxt->vcpu); 1879 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1567 memset(ss, 0, sizeof(struct desc_struct)); 1880 memset(ss, 0, sizeof(struct desc_struct));
1568 1881
1569 cs->l = 0; /* will be adjusted later */ 1882 cs->l = 0; /* will be adjusted later */
@@ -1593,44 +1906,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1593 struct desc_struct cs, ss; 1906 struct desc_struct cs, ss;
1594 u64 msr_data; 1907 u64 msr_data;
1595 u16 cs_sel, ss_sel; 1908 u16 cs_sel, ss_sel;
1909 u64 efer = 0;
1596 1910
1597 /* syscall is not available in real mode */ 1911 /* syscall is not available in real mode */
1598 if (ctxt->mode == X86EMUL_MODE_REAL || 1912 if (ctxt->mode == X86EMUL_MODE_REAL ||
1599 ctxt->mode == X86EMUL_MODE_VM86) 1913 ctxt->mode == X86EMUL_MODE_VM86)
1600 return emulate_ud(ctxt); 1914 return emulate_ud(ctxt);
1601 1915
1916 ops->get_msr(ctxt, MSR_EFER, &efer);
1602 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1917 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1603 1918
1604 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1919 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1605 msr_data >>= 32; 1920 msr_data >>= 32;
1606 cs_sel = (u16)(msr_data & 0xfffc); 1921 cs_sel = (u16)(msr_data & 0xfffc);
1607 ss_sel = (u16)(msr_data + 8); 1922 ss_sel = (u16)(msr_data + 8);
1608 1923
1609 if (is_long_mode(ctxt->vcpu)) { 1924 if (efer & EFER_LMA) {
1610 cs.d = 0; 1925 cs.d = 0;
1611 cs.l = 1; 1926 cs.l = 1;
1612 } 1927 }
1613 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 1928 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1614 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1929 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1615 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1616 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1617 1930
1618 c->regs[VCPU_REGS_RCX] = c->eip; 1931 c->regs[VCPU_REGS_RCX] = c->eip;
1619 if (is_long_mode(ctxt->vcpu)) { 1932 if (efer & EFER_LMA) {
1620#ifdef CONFIG_X86_64 1933#ifdef CONFIG_X86_64
1621 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1934 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1622 1935
1623 ops->get_msr(ctxt->vcpu, 1936 ops->get_msr(ctxt,
1624 ctxt->mode == X86EMUL_MODE_PROT64 ? 1937 ctxt->mode == X86EMUL_MODE_PROT64 ?
1625 MSR_LSTAR : MSR_CSTAR, &msr_data); 1938 MSR_LSTAR : MSR_CSTAR, &msr_data);
1626 c->eip = msr_data; 1939 c->eip = msr_data;
1627 1940
1628 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1941 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1629 ctxt->eflags &= ~(msr_data | EFLG_RF); 1942 ctxt->eflags &= ~(msr_data | EFLG_RF);
1630#endif 1943#endif
1631 } else { 1944 } else {
1632 /* legacy mode */ 1945 /* legacy mode */
1633 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1946 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1634 c->eip = (u32)msr_data; 1947 c->eip = (u32)msr_data;
1635 1948
1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1949 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
@@ -1646,7 +1959,9 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1646 struct desc_struct cs, ss; 1959 struct desc_struct cs, ss;
1647 u64 msr_data; 1960 u64 msr_data;
1648 u16 cs_sel, ss_sel; 1961 u16 cs_sel, ss_sel;
1962 u64 efer = 0;
1649 1963
1964 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
1650 /* inject #GP if in real mode */ 1965 /* inject #GP if in real mode */
1651 if (ctxt->mode == X86EMUL_MODE_REAL) 1966 if (ctxt->mode == X86EMUL_MODE_REAL)
1652 return emulate_gp(ctxt, 0); 1967 return emulate_gp(ctxt, 0);
@@ -1659,7 +1974,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1659 1974
1660 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1975 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1661 1976
1662 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1977 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1663 switch (ctxt->mode) { 1978 switch (ctxt->mode) {
1664 case X86EMUL_MODE_PROT32: 1979 case X86EMUL_MODE_PROT32:
1665 if ((msr_data & 0xfffc) == 0x0) 1980 if ((msr_data & 0xfffc) == 0x0)
@@ -1676,21 +1991,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1676 cs_sel &= ~SELECTOR_RPL_MASK; 1991 cs_sel &= ~SELECTOR_RPL_MASK;
1677 ss_sel = cs_sel + 8; 1992 ss_sel = cs_sel + 8;
1678 ss_sel &= ~SELECTOR_RPL_MASK; 1993 ss_sel &= ~SELECTOR_RPL_MASK;
1679 if (ctxt->mode == X86EMUL_MODE_PROT64 1994 if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
1680 || is_long_mode(ctxt->vcpu)) {
1681 cs.d = 0; 1995 cs.d = 0;
1682 cs.l = 1; 1996 cs.l = 1;
1683 } 1997 }
1684 1998
1685 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 1999 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1686 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2000 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1687 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1688 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1689 2001
1690 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2002 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
1691 c->eip = msr_data; 2003 c->eip = msr_data;
1692 2004
1693 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2005 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
1694 c->regs[VCPU_REGS_RSP] = msr_data; 2006 c->regs[VCPU_REGS_RSP] = msr_data;
1695 2007
1696 return X86EMUL_CONTINUE; 2008 return X86EMUL_CONTINUE;
@@ -1719,7 +2031,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1719 2031
1720 cs.dpl = 3; 2032 cs.dpl = 3;
1721 ss.dpl = 3; 2033 ss.dpl = 3;
1722 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2034 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
1723 switch (usermode) { 2035 switch (usermode) {
1724 case X86EMUL_MODE_PROT32: 2036 case X86EMUL_MODE_PROT32:
1725 cs_sel = (u16)(msr_data + 16); 2037 cs_sel = (u16)(msr_data + 16);
@@ -1739,10 +2051,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1739 cs_sel |= SELECTOR_RPL_MASK; 2051 cs_sel |= SELECTOR_RPL_MASK;
1740 ss_sel |= SELECTOR_RPL_MASK; 2052 ss_sel |= SELECTOR_RPL_MASK;
1741 2053
1742 ops->set_cached_descriptor(&cs, 0, VCPU_SREG_CS, ctxt->vcpu); 2054 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1743 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2055 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1744 ops->set_cached_descriptor(&ss, 0, VCPU_SREG_SS, ctxt->vcpu);
1745 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1746 2056
1747 c->eip = c->regs[VCPU_REGS_RDX]; 2057 c->eip = c->regs[VCPU_REGS_RDX];
1748 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2058 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
@@ -1759,7 +2069,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
1759 if (ctxt->mode == X86EMUL_MODE_VM86) 2069 if (ctxt->mode == X86EMUL_MODE_VM86)
1760 return true; 2070 return true;
1761 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2071 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1762 return ops->cpl(ctxt->vcpu) > iopl; 2072 return ops->cpl(ctxt) > iopl;
1763} 2073}
1764 2074
1765static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2075static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -1769,11 +2079,11 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1769 struct desc_struct tr_seg; 2079 struct desc_struct tr_seg;
1770 u32 base3; 2080 u32 base3;
1771 int r; 2081 int r;
1772 u16 io_bitmap_ptr, perm, bit_idx = port & 0x7; 2082 u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
1773 unsigned mask = (1 << len) - 1; 2083 unsigned mask = (1 << len) - 1;
1774 unsigned long base; 2084 unsigned long base;
1775 2085
1776 ops->get_cached_descriptor(&tr_seg, &base3, VCPU_SREG_TR, ctxt->vcpu); 2086 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
1777 if (!tr_seg.p) 2087 if (!tr_seg.p)
1778 return false; 2088 return false;
1779 if (desc_limit_scaled(&tr_seg) < 103) 2089 if (desc_limit_scaled(&tr_seg) < 103)
@@ -1782,13 +2092,12 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1782#ifdef CONFIG_X86_64 2092#ifdef CONFIG_X86_64
1783 base |= ((u64)base3) << 32; 2093 base |= ((u64)base3) << 32;
1784#endif 2094#endif
1785 r = ops->read_std(base + 102, &io_bitmap_ptr, 2, ctxt->vcpu, NULL); 2095 r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
1786 if (r != X86EMUL_CONTINUE) 2096 if (r != X86EMUL_CONTINUE)
1787 return false; 2097 return false;
1788 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 2098 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
1789 return false; 2099 return false;
1790 r = ops->read_std(base + io_bitmap_ptr + port/8, &perm, 2, ctxt->vcpu, 2100 r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
1791 NULL);
1792 if (r != X86EMUL_CONTINUE) 2101 if (r != X86EMUL_CONTINUE)
1793 return false; 2102 return false;
1794 if ((perm >> bit_idx) & mask) 2103 if ((perm >> bit_idx) & mask)
@@ -1829,11 +2138,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
1829 tss->si = c->regs[VCPU_REGS_RSI]; 2138 tss->si = c->regs[VCPU_REGS_RSI];
1830 tss->di = c->regs[VCPU_REGS_RDI]; 2139 tss->di = c->regs[VCPU_REGS_RDI];
1831 2140
1832 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2141 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
1833 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2142 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
1834 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2143 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
1835 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2144 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
1836 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2145 tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
1837} 2146}
1838 2147
1839static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2148static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
@@ -1858,11 +2167,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
1858 * SDM says that segment selectors are loaded before segment 2167 * SDM says that segment selectors are loaded before segment
1859 * descriptors 2168 * descriptors
1860 */ 2169 */
1861 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); 2170 set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
1862 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2171 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
1863 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2172 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
1864 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2173 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
1865 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2174 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
1866 2175
1867 /* 2176 /*
 1868 * Now load segment descriptors. If a fault happens at this stage 2177
@@ -1896,7 +2205,7 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1896 int ret; 2205 int ret;
1897 u32 new_tss_base = get_desc_base(new_desc); 2206 u32 new_tss_base = get_desc_base(new_desc);
1898 2207
1899 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2208 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
1900 &ctxt->exception); 2209 &ctxt->exception);
1901 if (ret != X86EMUL_CONTINUE) 2210 if (ret != X86EMUL_CONTINUE)
1902 /* FIXME: need to provide precise fault address */ 2211 /* FIXME: need to provide precise fault address */
@@ -1904,13 +2213,13 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1904 2213
1905 save_state_to_tss16(ctxt, ops, &tss_seg); 2214 save_state_to_tss16(ctxt, ops, &tss_seg);
1906 2215
1907 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2216 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
1908 &ctxt->exception); 2217 &ctxt->exception);
1909 if (ret != X86EMUL_CONTINUE) 2218 if (ret != X86EMUL_CONTINUE)
1910 /* FIXME: need to provide precise fault address */ 2219 /* FIXME: need to provide precise fault address */
1911 return ret; 2220 return ret;
1912 2221
1913 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2222 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
1914 &ctxt->exception); 2223 &ctxt->exception);
1915 if (ret != X86EMUL_CONTINUE) 2224 if (ret != X86EMUL_CONTINUE)
1916 /* FIXME: need to provide precise fault address */ 2225 /* FIXME: need to provide precise fault address */
@@ -1919,10 +2228,10 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
1919 if (old_tss_sel != 0xffff) { 2228 if (old_tss_sel != 0xffff) {
1920 tss_seg.prev_task_link = old_tss_sel; 2229 tss_seg.prev_task_link = old_tss_sel;
1921 2230
1922 ret = ops->write_std(new_tss_base, 2231 ret = ops->write_std(ctxt, new_tss_base,
1923 &tss_seg.prev_task_link, 2232 &tss_seg.prev_task_link,
1924 sizeof tss_seg.prev_task_link, 2233 sizeof tss_seg.prev_task_link,
1925 ctxt->vcpu, &ctxt->exception); 2234 &ctxt->exception);
1926 if (ret != X86EMUL_CONTINUE) 2235 if (ret != X86EMUL_CONTINUE)
1927 /* FIXME: need to provide precise fault address */ 2236 /* FIXME: need to provide precise fault address */
1928 return ret; 2237 return ret;
@@ -1937,7 +2246,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
1937{ 2246{
1938 struct decode_cache *c = &ctxt->decode; 2247 struct decode_cache *c = &ctxt->decode;
1939 2248
1940 tss->cr3 = ops->get_cr(3, ctxt->vcpu); 2249 tss->cr3 = ops->get_cr(ctxt, 3);
1941 tss->eip = c->eip; 2250 tss->eip = c->eip;
1942 tss->eflags = ctxt->eflags; 2251 tss->eflags = ctxt->eflags;
1943 tss->eax = c->regs[VCPU_REGS_RAX]; 2252 tss->eax = c->regs[VCPU_REGS_RAX];
@@ -1949,13 +2258,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
1949 tss->esi = c->regs[VCPU_REGS_RSI]; 2258 tss->esi = c->regs[VCPU_REGS_RSI];
1950 tss->edi = c->regs[VCPU_REGS_RDI]; 2259 tss->edi = c->regs[VCPU_REGS_RDI];
1951 2260
1952 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2261 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
1953 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2262 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
1954 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2263 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
1955 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2264 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
1956 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); 2265 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
1957 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); 2266 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
1958 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2267 tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
1959} 2268}
1960 2269
1961static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2270static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
@@ -1965,7 +2274,7 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
1965 struct decode_cache *c = &ctxt->decode; 2274 struct decode_cache *c = &ctxt->decode;
1966 int ret; 2275 int ret;
1967 2276
1968 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) 2277 if (ops->set_cr(ctxt, 3, tss->cr3))
1969 return emulate_gp(ctxt, 0); 2278 return emulate_gp(ctxt, 0);
1970 c->eip = tss->eip; 2279 c->eip = tss->eip;
1971 ctxt->eflags = tss->eflags | 2; 2280 ctxt->eflags = tss->eflags | 2;
@@ -1982,13 +2291,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
1982 * SDM says that segment selectors are loaded before segment 2291 * SDM says that segment selectors are loaded before segment
1983 * descriptors 2292 * descriptors
1984 */ 2293 */
1985 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); 2294 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
1986 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2295 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
1987 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2296 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
1988 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2297 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
1989 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2298 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
1990 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); 2299 set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
1991 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); 2300 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
1992 2301
1993 /* 2302 /*
 1994 * Now load segment descriptors. If a fault happens at this stage 2303
@@ -2028,7 +2337,7 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2028 int ret; 2337 int ret;
2029 u32 new_tss_base = get_desc_base(new_desc); 2338 u32 new_tss_base = get_desc_base(new_desc);
2030 2339
2031 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2340 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2032 &ctxt->exception); 2341 &ctxt->exception);
2033 if (ret != X86EMUL_CONTINUE) 2342 if (ret != X86EMUL_CONTINUE)
2034 /* FIXME: need to provide precise fault address */ 2343 /* FIXME: need to provide precise fault address */
@@ -2036,13 +2345,13 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2036 2345
2037 save_state_to_tss32(ctxt, ops, &tss_seg); 2346 save_state_to_tss32(ctxt, ops, &tss_seg);
2038 2347
2039 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2348 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2040 &ctxt->exception); 2349 &ctxt->exception);
2041 if (ret != X86EMUL_CONTINUE) 2350 if (ret != X86EMUL_CONTINUE)
2042 /* FIXME: need to provide precise fault address */ 2351 /* FIXME: need to provide precise fault address */
2043 return ret; 2352 return ret;
2044 2353
2045 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2354 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2046 &ctxt->exception); 2355 &ctxt->exception);
2047 if (ret != X86EMUL_CONTINUE) 2356 if (ret != X86EMUL_CONTINUE)
2048 /* FIXME: need to provide precise fault address */ 2357 /* FIXME: need to provide precise fault address */
@@ -2051,10 +2360,10 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2051 if (old_tss_sel != 0xffff) { 2360 if (old_tss_sel != 0xffff) {
2052 tss_seg.prev_task_link = old_tss_sel; 2361 tss_seg.prev_task_link = old_tss_sel;
2053 2362
2054 ret = ops->write_std(new_tss_base, 2363 ret = ops->write_std(ctxt, new_tss_base,
2055 &tss_seg.prev_task_link, 2364 &tss_seg.prev_task_link,
2056 sizeof tss_seg.prev_task_link, 2365 sizeof tss_seg.prev_task_link,
2057 ctxt->vcpu, &ctxt->exception); 2366 &ctxt->exception);
2058 if (ret != X86EMUL_CONTINUE) 2367 if (ret != X86EMUL_CONTINUE)
2059 /* FIXME: need to provide precise fault address */ 2368 /* FIXME: need to provide precise fault address */
2060 return ret; 2369 return ret;
@@ -2070,9 +2379,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2070{ 2379{
2071 struct desc_struct curr_tss_desc, next_tss_desc; 2380 struct desc_struct curr_tss_desc, next_tss_desc;
2072 int ret; 2381 int ret;
2073 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2382 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
2074 ulong old_tss_base = 2383 ulong old_tss_base =
2075 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); 2384 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2076 u32 desc_limit; 2385 u32 desc_limit;
2077 2386
2078 /* FIXME: old_tss_base == ~0 ? */ 2387 /* FIXME: old_tss_base == ~0 ? */
@@ -2088,7 +2397,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2088 2397
2089 if (reason != TASK_SWITCH_IRET) { 2398 if (reason != TASK_SWITCH_IRET) {
2090 if ((tss_selector & 3) > next_tss_desc.dpl || 2399 if ((tss_selector & 3) > next_tss_desc.dpl ||
2091 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) 2400 ops->cpl(ctxt) > next_tss_desc.dpl)
2092 return emulate_gp(ctxt, 0); 2401 return emulate_gp(ctxt, 0);
2093 } 2402 }
2094 2403
@@ -2132,9 +2441,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2132 &next_tss_desc); 2441 &next_tss_desc);
2133 } 2442 }
2134 2443
2135 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2444 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2136 ops->set_cached_descriptor(&next_tss_desc, 0, VCPU_SREG_TR, ctxt->vcpu); 2445 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2137 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2138 2446
2139 if (has_error_code) { 2447 if (has_error_code) {
2140 struct decode_cache *c = &ctxt->decode; 2448 struct decode_cache *c = &ctxt->decode;
@@ -2142,7 +2450,7 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2142 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2450 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2143 c->lock_prefix = 0; 2451 c->lock_prefix = 0;
2144 c->src.val = (unsigned long) error_code; 2452 c->src.val = (unsigned long) error_code;
2145 emulate_push(ctxt, ops); 2453 ret = em_push(ctxt);
2146 } 2454 }
2147 2455
2148 return ret; 2456 return ret;
@@ -2162,13 +2470,10 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2162 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2470 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2163 has_error_code, error_code); 2471 has_error_code, error_code);
2164 2472
2165 if (rc == X86EMUL_CONTINUE) { 2473 if (rc == X86EMUL_CONTINUE)
2166 rc = writeback(ctxt, ops); 2474 ctxt->eip = c->eip;
2167 if (rc == X86EMUL_CONTINUE)
2168 ctxt->eip = c->eip;
2169 }
2170 2475
2171 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2476 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2172} 2477}
2173 2478
2174static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg, 2479static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
@@ -2182,12 +2487,6 @@ static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2182 op->addr.mem.seg = seg; 2487 op->addr.mem.seg = seg;
2183} 2488}
2184 2489
2185static int em_push(struct x86_emulate_ctxt *ctxt)
2186{
2187 emulate_push(ctxt, ctxt->ops);
2188 return X86EMUL_CONTINUE;
2189}
2190
2191static int em_das(struct x86_emulate_ctxt *ctxt) 2490static int em_das(struct x86_emulate_ctxt *ctxt)
2192{ 2491{
2193 struct decode_cache *c = &ctxt->decode; 2492 struct decode_cache *c = &ctxt->decode;
@@ -2234,7 +2533,7 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
2234 ulong old_eip; 2533 ulong old_eip;
2235 int rc; 2534 int rc;
2236 2535
2237 old_cs = ctxt->ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2536 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2238 old_eip = c->eip; 2537 old_eip = c->eip;
2239 2538
2240 memcpy(&sel, c->src.valptr + c->op_bytes, 2); 2539 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
@@ -2245,20 +2544,12 @@ static int em_call_far(struct x86_emulate_ctxt *ctxt)
2245 memcpy(&c->eip, c->src.valptr, c->op_bytes); 2544 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2246 2545
2247 c->src.val = old_cs; 2546 c->src.val = old_cs;
2248 emulate_push(ctxt, ctxt->ops); 2547 rc = em_push(ctxt);
2249 rc = writeback(ctxt, ctxt->ops);
2250 if (rc != X86EMUL_CONTINUE) 2548 if (rc != X86EMUL_CONTINUE)
2251 return rc; 2549 return rc;
2252 2550
2253 c->src.val = old_eip; 2551 c->src.val = old_eip;
2254 emulate_push(ctxt, ctxt->ops); 2552 return em_push(ctxt);
2255 rc = writeback(ctxt, ctxt->ops);
2256 if (rc != X86EMUL_CONTINUE)
2257 return rc;
2258
2259 c->dst.type = OP_NONE;
2260
2261 return X86EMUL_CONTINUE;
2262} 2553}
2263 2554
2264static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt) 2555static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
@@ -2269,13 +2560,79 @@ static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2269 c->dst.type = OP_REG; 2560 c->dst.type = OP_REG;
2270 c->dst.addr.reg = &c->eip; 2561 c->dst.addr.reg = &c->eip;
2271 c->dst.bytes = c->op_bytes; 2562 c->dst.bytes = c->op_bytes;
2272 rc = emulate_pop(ctxt, ctxt->ops, &c->dst.val, c->op_bytes); 2563 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
2273 if (rc != X86EMUL_CONTINUE) 2564 if (rc != X86EMUL_CONTINUE)
2274 return rc; 2565 return rc;
2275 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val); 2566 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2276 return X86EMUL_CONTINUE; 2567 return X86EMUL_CONTINUE;
2277} 2568}
2278 2569
2570static int em_add(struct x86_emulate_ctxt *ctxt)
2571{
2572 struct decode_cache *c = &ctxt->decode;
2573
2574 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2575 return X86EMUL_CONTINUE;
2576}
2577
2578static int em_or(struct x86_emulate_ctxt *ctxt)
2579{
2580 struct decode_cache *c = &ctxt->decode;
2581
2582 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2583 return X86EMUL_CONTINUE;
2584}
2585
2586static int em_adc(struct x86_emulate_ctxt *ctxt)
2587{
2588 struct decode_cache *c = &ctxt->decode;
2589
2590 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2591 return X86EMUL_CONTINUE;
2592}
2593
2594static int em_sbb(struct x86_emulate_ctxt *ctxt)
2595{
2596 struct decode_cache *c = &ctxt->decode;
2597
2598 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2599 return X86EMUL_CONTINUE;
2600}
2601
2602static int em_and(struct x86_emulate_ctxt *ctxt)
2603{
2604 struct decode_cache *c = &ctxt->decode;
2605
2606 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2607 return X86EMUL_CONTINUE;
2608}
2609
2610static int em_sub(struct x86_emulate_ctxt *ctxt)
2611{
2612 struct decode_cache *c = &ctxt->decode;
2613
2614 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2615 return X86EMUL_CONTINUE;
2616}
2617
2618static int em_xor(struct x86_emulate_ctxt *ctxt)
2619{
2620 struct decode_cache *c = &ctxt->decode;
2621
2622 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2623 return X86EMUL_CONTINUE;
2624}
2625
2626static int em_cmp(struct x86_emulate_ctxt *ctxt)
2627{
2628 struct decode_cache *c = &ctxt->decode;
2629
2630 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2631 /* Disable writeback. */
2632 c->dst.type = OP_NONE;
2633 return X86EMUL_CONTINUE;
2634}
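The em_add() .. em_cmp() wrappers above exist so the ALU opcodes can be routed through the common c->execute callback instead of the large opcode switch in x86_emulate_insn(); the dispatch site, which appears later in this diff, is roughly the fragment below (a context sketch, not new code in this patch):

	if (c->execute) {
		rc = c->execute(ctxt);		/* e.g. em_add(), em_cmp() */
		if (rc != X86EMUL_CONTINUE)
			goto done;
		goto writeback;
	}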
2635
2279static int em_imul(struct x86_emulate_ctxt *ctxt) 2636static int em_imul(struct x86_emulate_ctxt *ctxt)
2280{ 2637{
2281 struct decode_cache *c = &ctxt->decode; 2638 struct decode_cache *c = &ctxt->decode;
@@ -2306,13 +2663,10 @@ static int em_cwd(struct x86_emulate_ctxt *ctxt)
2306 2663
2307static int em_rdtsc(struct x86_emulate_ctxt *ctxt) 2664static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2308{ 2665{
2309 unsigned cpl = ctxt->ops->cpl(ctxt->vcpu);
2310 struct decode_cache *c = &ctxt->decode; 2666 struct decode_cache *c = &ctxt->decode;
2311 u64 tsc = 0; 2667 u64 tsc = 0;
2312 2668
2313 if (cpl > 0 && (ctxt->ops->get_cr(4, ctxt->vcpu) & X86_CR4_TSD)) 2669 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2314 return emulate_gp(ctxt, 0);
2315 ctxt->ops->get_msr(ctxt->vcpu, MSR_IA32_TSC, &tsc);
2316 c->regs[VCPU_REGS_RAX] = (u32)tsc; 2670 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2317 c->regs[VCPU_REGS_RDX] = tsc >> 32; 2671 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2318 return X86EMUL_CONTINUE; 2672 return X86EMUL_CONTINUE;
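The CPL/CR4.TSD test dropped from em_rdtsc() above reappears as check_rdtsc() further down and is attached through the new 0f 31 table entry IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc), which, per the IIP() macro added below, expands to roughly:

	{ .flags      = ImplicitOps,
	  .u.execute  = em_rdtsc,
	  .intercept  = x86_intercept_rdtsc,
	  .check_perm = check_rdtsc },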
@@ -2325,22 +2679,375 @@ static int em_mov(struct x86_emulate_ctxt *ctxt)
2325 return X86EMUL_CONTINUE; 2679 return X86EMUL_CONTINUE;
2326} 2680}
2327 2681
2682static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2683{
2684 struct decode_cache *c = &ctxt->decode;
2685 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2686 return X86EMUL_CONTINUE;
2687}
2688
2689static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2690{
2691 struct decode_cache *c = &ctxt->decode;
2692 int rc;
2693 ulong linear;
2694
2695 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
2696 if (rc == X86EMUL_CONTINUE)
2697 ctxt->ops->invlpg(ctxt, linear);
2698 /* Disable writeback. */
2699 c->dst.type = OP_NONE;
2700 return X86EMUL_CONTINUE;
2701}
2702
2703static int em_clts(struct x86_emulate_ctxt *ctxt)
2704{
2705 ulong cr0;
2706
2707 cr0 = ctxt->ops->get_cr(ctxt, 0);
2708 cr0 &= ~X86_CR0_TS;
2709 ctxt->ops->set_cr(ctxt, 0, cr0);
2710 return X86EMUL_CONTINUE;
2711}
2712
2713static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2714{
2715 struct decode_cache *c = &ctxt->decode;
2716 int rc;
2717
2718 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2719 return X86EMUL_UNHANDLEABLE;
2720
2721 rc = ctxt->ops->fix_hypercall(ctxt);
2722 if (rc != X86EMUL_CONTINUE)
2723 return rc;
2724
2725 /* Let the processor re-execute the fixed hypercall */
2726 c->eip = ctxt->eip;
2727 /* Disable writeback. */
2728 c->dst.type = OP_NONE;
2729 return X86EMUL_CONTINUE;
2730}
2731
2732static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2733{
2734 struct decode_cache *c = &ctxt->decode;
2735 struct desc_ptr desc_ptr;
2736 int rc;
2737
2738 rc = read_descriptor(ctxt, c->src.addr.mem,
2739 &desc_ptr.size, &desc_ptr.address,
2740 c->op_bytes);
2741 if (rc != X86EMUL_CONTINUE)
2742 return rc;
2743 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2744 /* Disable writeback. */
2745 c->dst.type = OP_NONE;
2746 return X86EMUL_CONTINUE;
2747}
2748
2749static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2750{
2751 struct decode_cache *c = &ctxt->decode;
2752 int rc;
2753
2754 rc = ctxt->ops->fix_hypercall(ctxt);
2755
2756 /* Disable writeback. */
2757 c->dst.type = OP_NONE;
2758 return rc;
2759}
2760
2761static int em_lidt(struct x86_emulate_ctxt *ctxt)
2762{
2763 struct decode_cache *c = &ctxt->decode;
2764 struct desc_ptr desc_ptr;
2765 int rc;
2766
2767 rc = read_descriptor(ctxt, c->src.addr.mem,
2768 &desc_ptr.size, &desc_ptr.address,
2769 c->op_bytes);
2770 if (rc != X86EMUL_CONTINUE)
2771 return rc;
2772 ctxt->ops->set_idt(ctxt, &desc_ptr);
2773 /* Disable writeback. */
2774 c->dst.type = OP_NONE;
2775 return X86EMUL_CONTINUE;
2776}
2777
2778static int em_smsw(struct x86_emulate_ctxt *ctxt)
2779{
2780 struct decode_cache *c = &ctxt->decode;
2781
2782 c->dst.bytes = 2;
2783 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2784 return X86EMUL_CONTINUE;
2785}
2786
2787static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2788{
2789 struct decode_cache *c = &ctxt->decode;
2790 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2791 | (c->src.val & 0x0f));
2792 c->dst.type = OP_NONE;
2793 return X86EMUL_CONTINUE;
2794}
2795
2796static bool valid_cr(int nr)
2797{
2798 switch (nr) {
2799 case 0:
2800 case 2 ... 4:
2801 case 8:
2802 return true;
2803 default:
2804 return false;
2805 }
2806}
2807
2808static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2809{
2810 struct decode_cache *c = &ctxt->decode;
2811
2812 if (!valid_cr(c->modrm_reg))
2813 return emulate_ud(ctxt);
2814
2815 return X86EMUL_CONTINUE;
2816}
2817
2818static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2819{
2820 struct decode_cache *c = &ctxt->decode;
2821 u64 new_val = c->src.val64;
2822 int cr = c->modrm_reg;
2823 u64 efer = 0;
2824
2825 static u64 cr_reserved_bits[] = {
2826 0xffffffff00000000ULL,
2827 0, 0, 0, /* CR3 checked later */
2828 CR4_RESERVED_BITS,
2829 0, 0, 0,
2830 CR8_RESERVED_BITS,
2831 };
2832
2833 if (!valid_cr(cr))
2834 return emulate_ud(ctxt);
2835
2836 if (new_val & cr_reserved_bits[cr])
2837 return emulate_gp(ctxt, 0);
2838
2839 switch (cr) {
2840 case 0: {
2841 u64 cr4;
2842 if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
2843 ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
2844 return emulate_gp(ctxt, 0);
2845
2846 cr4 = ctxt->ops->get_cr(ctxt, 4);
2847 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2848
2849 if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
2850 !(cr4 & X86_CR4_PAE))
2851 return emulate_gp(ctxt, 0);
2852
2853 break;
2854 }
2855 case 3: {
2856 u64 rsvd = 0;
2857
2858 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2859 if (efer & EFER_LMA)
2860 rsvd = CR3_L_MODE_RESERVED_BITS;
2861 else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
2862 rsvd = CR3_PAE_RESERVED_BITS;
2863 else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
2864 rsvd = CR3_NONPAE_RESERVED_BITS;
2865
2866 if (new_val & rsvd)
2867 return emulate_gp(ctxt, 0);
2868
2869 break;
2870 }
2871 case 4: {
2872 u64 cr4;
2873
2874 cr4 = ctxt->ops->get_cr(ctxt, 4);
2875 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2876
2877 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
2878 return emulate_gp(ctxt, 0);
2879
2880 break;
2881 }
2882 }
2883
2884 return X86EMUL_CONTINUE;
2885}
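check_cr_write() front-loads the architectural validity tests so an illegal value never reaches the host. A minimal sketch of the reserved-bit path using the cr_reserved_bits[] table above (the guest value is illustrative only):

	/* e.g. a guest MOV to CR0 with bit 32 set */
	u64 new_val = 1ULL << 32;
	if (new_val & cr_reserved_bits[0])	/* 0xffffffff00000000ULL */
		return emulate_gp(ctxt, 0);	/* taken: inject #GP(0) */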
2886
2887static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2888{
2889 unsigned long dr7;
2890
2891 ctxt->ops->get_dr(ctxt, 7, &dr7);
2892
2893 /* Check if DR7.Global_Enable is set */
2894 return dr7 & (1 << 13);
2895}
2896
2897static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2898{
2899 struct decode_cache *c = &ctxt->decode;
2900 int dr = c->modrm_reg;
2901 u64 cr4;
2902
2903 if (dr > 7)
2904 return emulate_ud(ctxt);
2905
2906 cr4 = ctxt->ops->get_cr(ctxt, 4);
2907 if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
2908 return emulate_ud(ctxt);
2909
2910 if (check_dr7_gd(ctxt))
2911 return emulate_db(ctxt);
2912
2913 return X86EMUL_CONTINUE;
2914}
2915
2916static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2917{
2918 struct decode_cache *c = &ctxt->decode;
2919 u64 new_val = c->src.val64;
2920 int dr = c->modrm_reg;
2921
2922 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2923 return emulate_gp(ctxt, 0);
2924
2925 return check_dr_read(ctxt);
2926}
2927
2928static int check_svme(struct x86_emulate_ctxt *ctxt)
2929{
2930 u64 efer;
2931
2932 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2933
2934 if (!(efer & EFER_SVME))
2935 return emulate_ud(ctxt);
2936
2937 return X86EMUL_CONTINUE;
2938}
2939
2940static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2941{
2942 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
2943
2944 /* Valid physical address? */
2945 if (rax & 0xffff000000000000ULL)
2946 return emulate_gp(ctxt, 0);
2947
2948 return check_svme(ctxt);
2949}
2950
2951static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2952{
2953 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2954
2955 if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
2956 return emulate_ud(ctxt);
2957
2958 return X86EMUL_CONTINUE;
2959}
2960
2961static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2962{
2963 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2964 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
2965
2966 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2967 (rcx > 3))
2968 return emulate_gp(ctxt, 0);
2969
2970 return X86EMUL_CONTINUE;
2971}
2972
2973static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2974{
2975 struct decode_cache *c = &ctxt->decode;
2976
2977 c->dst.bytes = min(c->dst.bytes, 4u);
2978 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2979 return emulate_gp(ctxt, 0);
2980
2981 return X86EMUL_CONTINUE;
2982}
2983
2984static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2985{
2986 struct decode_cache *c = &ctxt->decode;
2987
2988 c->src.bytes = min(c->src.bytes, 4u);
2989 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2990 return emulate_gp(ctxt, 0);
2991
2992 return X86EMUL_CONTINUE;
2993}
2994
2328#define D(_y) { .flags = (_y) } 2995#define D(_y) { .flags = (_y) }
2996#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
2997#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
2998 .check_perm = (_p) }
2329#define N D(0) 2999#define N D(0)
3000#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
2330#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) } 3001#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
2331#define GD(_f, _g) { .flags = ((_f) | Group | GroupDual), .u.gdual = (_g) } 3002#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
2332#define I(_f, _e) { .flags = (_f), .u.execute = (_e) } 3003#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3004#define II(_f, _e, _i) \
3005 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3006#define IIP(_f, _e, _i, _p) \
3007 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
3008 .check_perm = (_p) }
3009#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
2333 3010
2334#define D2bv(_f) D((_f) | ByteOp), D(_f) 3011#define D2bv(_f) D((_f) | ByteOp), D(_f)
3012#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
2335#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e) 3013#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
2336 3014
2337#define D6ALU(_f) D2bv((_f) | DstMem | SrcReg | ModRM), \ 3015#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
2338 D2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock), \ 3016 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
2339 D2bv(((_f) & ~Lock) | DstAcc | SrcImm) 3017 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
2340 3018
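I6ALU() replaces D6ALU(): each use stamps out the six classic ALU encodings (r/m,reg and reg,r/m in byte and word/dword forms plus the two accumulator-immediate forms) and binds them all to a single em_*() handler. For example, I6ALU(Lock, em_add) expands to roughly:

	I(Lock | DstMem | SrcReg | ModRM | ByteOp, em_add),	/* 0x00 */
	I(Lock | DstMem | SrcReg | ModRM,          em_add),	/* 0x01 */
	I(DstReg | SrcMem | ModRM | ByteOp,        em_add),	/* 0x02 */
	I(DstReg | SrcMem | ModRM,                 em_add),	/* 0x03 */
	I(DstAcc | SrcImm | ByteOp,                em_add),	/* 0x04 */
	I(DstAcc | SrcImm,                         em_add),	/* 0x05 */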
3019static struct opcode group7_rm1[] = {
3020 DI(SrcNone | ModRM | Priv, monitor),
3021 DI(SrcNone | ModRM | Priv, mwait),
3022 N, N, N, N, N, N,
3023};
3024
3025static struct opcode group7_rm3[] = {
3026 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
3027 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
3028 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
3029 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
3030 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
3031 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
3032 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
3033 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
3034};
3035
3036static struct opcode group7_rm7[] = {
3037 N,
3038 DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
3039 N, N, N, N, N, N,
3040};
2341 3041
2342static struct opcode group1[] = { 3042static struct opcode group1[] = {
2343 X7(D(Lock)), N 3043 I(Lock, em_add),
3044 I(Lock, em_or),
3045 I(Lock, em_adc),
3046 I(Lock, em_sbb),
3047 I(Lock, em_and),
3048 I(Lock, em_sub),
3049 I(Lock, em_xor),
3050 I(0, em_cmp),
2344}; 3051};
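Filling in group1[] lets the Grp1 encodings (0x80-0x83, sub-operation selected by the ModRM /reg field) reuse the same em_*() ALU helpers; the matching open-coded goto chain is removed from x86_emulate_insn() later in this patch. In the Group case of the decode loop the lookup is simply:

	goffset = (c->modrm >> 3) & 7;		/* e.g. /5 for sub */
	opcode  = opcode.u.group[goffset];	/* group1[5] == I(Lock, em_sub) */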
2345 3052
2346static struct opcode group1A[] = { 3053static struct opcode group1A[] = {
@@ -2366,16 +3073,28 @@ static struct opcode group5[] = {
2366 D(SrcMem | ModRM | Stack), N, 3073 D(SrcMem | ModRM | Stack), N,
2367}; 3074};
2368 3075
3076static struct opcode group6[] = {
3077 DI(ModRM | Prot, sldt),
3078 DI(ModRM | Prot, str),
3079 DI(ModRM | Prot | Priv, lldt),
3080 DI(ModRM | Prot | Priv, ltr),
3081 N, N, N, N,
3082};
3083
2369static struct group_dual group7 = { { 3084static struct group_dual group7 = { {
2370 N, N, D(ModRM | SrcMem | Priv), D(ModRM | SrcMem | Priv), 3085 DI(ModRM | Mov | DstMem | Priv, sgdt),
2371 D(SrcNone | ModRM | DstMem | Mov), N, 3086 DI(ModRM | Mov | DstMem | Priv, sidt),
2372 D(SrcMem16 | ModRM | Mov | Priv), 3087 II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
2373 D(SrcMem | ModRM | ByteOp | Priv | NoAccess), 3088 II(ModRM | SrcMem | Priv, em_lidt, lidt),
3089 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3090 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
3091 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
2374}, { 3092}, {
2375 D(SrcNone | ModRM | Priv | VendorSpecific), N, 3093 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
2376 N, D(SrcNone | ModRM | Priv | VendorSpecific), 3094 EXT(0, group7_rm1),
2377 D(SrcNone | ModRM | DstMem | Mov), N, 3095 N, EXT(0, group7_rm3),
2378 D(SrcMem16 | ModRM | Mov | Priv), N, 3096 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3097 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
2379} }; 3098} };
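group7 is now resolved in up to three steps: GroupDual selects the mod==3 table, and the EXT() entries hand individual /rm encodings to their own sub-tables above. A hedged sketch of the lookups the new GroupMask loop in x86_decode_insn() ends up performing for the byte sequence 0f 01 d8 (VMRUN):

	u8 modrm = 0xd8;				/* mod=3, reg=3, rm=0 */
	struct opcode op = twobyte_table[0x01];		/* GD(0, &group7) */
	op = op.u.gdual->mod3[(modrm >> 3) & 7];	/* /3 -> EXT(0, group7_rm3) */
	op = op.u.group[modrm & 7];			/* rm=0 -> the vmrun entry */
	/* op.check_perm == check_svme_pa, op.intercept == x86_intercept_vmrun */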
2380 3099
2381static struct opcode group8[] = { 3100static struct opcode group8[] = {
@@ -2394,35 +3113,40 @@ static struct opcode group11[] = {
2394 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)), 3113 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
2395}; 3114};
2396 3115
3116static struct gprefix pfx_0f_6f_0f_7f = {
3117 N, N, N, I(Sse, em_movdqu),
3118};
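The GP()/gprefix mechanism lets 0f 6f and 0f 7f dispatch on the mandatory SSE prefix; only the f3-prefixed form (movdqu) is emulated here, and the Sse flag later forces a 16-byte operand. A sketch of the resolution for f3 0f 6f, assuming struct gprefix declares its fields in pfx_no/pfx_66/pfx_f2/pfx_f3 order:

	simd_prefix = op_prefix ? 0x66 : c->rep_prefix;	/* 0xf3 for movdqu */
	/* case 0xf3: */
	opcode = opcode.u.gprefix->pfx_f3;		/* I(Sse, em_movdqu) */
	/* and later, because the Sse flag is set: */
	c->op_bytes = 16;				/* a full XMM register */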
3119
2397static struct opcode opcode_table[256] = { 3120static struct opcode opcode_table[256] = {
2398 /* 0x00 - 0x07 */ 3121 /* 0x00 - 0x07 */
2399 D6ALU(Lock), 3122 I6ALU(Lock, em_add),
2400 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3123 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2401 /* 0x08 - 0x0F */ 3124 /* 0x08 - 0x0F */
2402 D6ALU(Lock), 3125 I6ALU(Lock, em_or),
2403 D(ImplicitOps | Stack | No64), N, 3126 D(ImplicitOps | Stack | No64), N,
2404 /* 0x10 - 0x17 */ 3127 /* 0x10 - 0x17 */
2405 D6ALU(Lock), 3128 I6ALU(Lock, em_adc),
2406 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3129 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2407 /* 0x18 - 0x1F */ 3130 /* 0x18 - 0x1F */
2408 D6ALU(Lock), 3131 I6ALU(Lock, em_sbb),
2409 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3132 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
2410 /* 0x20 - 0x27 */ 3133 /* 0x20 - 0x27 */
2411 D6ALU(Lock), N, N, 3134 I6ALU(Lock, em_and), N, N,
2412 /* 0x28 - 0x2F */ 3135 /* 0x28 - 0x2F */
2413 D6ALU(Lock), N, I(ByteOp | DstAcc | No64, em_das), 3136 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
2414 /* 0x30 - 0x37 */ 3137 /* 0x30 - 0x37 */
2415 D6ALU(Lock), N, N, 3138 I6ALU(Lock, em_xor), N, N,
2416 /* 0x38 - 0x3F */ 3139 /* 0x38 - 0x3F */
2417 D6ALU(0), N, N, 3140 I6ALU(0, em_cmp), N, N,
2418 /* 0x40 - 0x4F */ 3141 /* 0x40 - 0x4F */
2419 X16(D(DstReg)), 3142 X16(D(DstReg)),
2420 /* 0x50 - 0x57 */ 3143 /* 0x50 - 0x57 */
2421 X8(I(SrcReg | Stack, em_push)), 3144 X8(I(SrcReg | Stack, em_push)),
2422 /* 0x58 - 0x5F */ 3145 /* 0x58 - 0x5F */
2423 X8(D(DstReg | Stack)), 3146 X8(I(DstReg | Stack, em_pop)),
2424 /* 0x60 - 0x67 */ 3147 /* 0x60 - 0x67 */
2425 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64), 3148 I(ImplicitOps | Stack | No64, em_pusha),
3149 I(ImplicitOps | Stack | No64, em_popa),
2426 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ , 3150 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
2427 N, N, N, N, 3151 N, N, N, N,
2428 /* 0x68 - 0x6F */ 3152 /* 0x68 - 0x6F */
@@ -2430,8 +3154,8 @@ static struct opcode opcode_table[256] = {
2430 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op), 3154 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
2431 I(SrcImmByte | Mov | Stack, em_push), 3155 I(SrcImmByte | Mov | Stack, em_push),
2432 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op), 3156 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
2433 D2bv(DstDI | Mov | String), /* insb, insw/insd */ 3157 D2bvIP(DstDI | Mov | String, ins, check_perm_in), /* insb, insw/insd */
2434 D2bv(SrcSI | ImplicitOps | String), /* outsb, outsw/outsd */ 3158 D2bvIP(SrcSI | ImplicitOps | String, outs, check_perm_out), /* outsb, outsw/outsd */
2435 /* 0x70 - 0x7F */ 3159 /* 0x70 - 0x7F */
2436 X16(D(SrcImmByte)), 3160 X16(D(SrcImmByte)),
2437 /* 0x80 - 0x87 */ 3161 /* 0x80 - 0x87 */
@@ -2446,21 +3170,22 @@ static struct opcode opcode_table[256] = {
2446 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg), 3170 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
2447 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A), 3171 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
2448 /* 0x90 - 0x97 */ 3172 /* 0x90 - 0x97 */
2449 X8(D(SrcAcc | DstReg)), 3173 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
2450 /* 0x98 - 0x9F */ 3174 /* 0x98 - 0x9F */
2451 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd), 3175 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
2452 I(SrcImmFAddr | No64, em_call_far), N, 3176 I(SrcImmFAddr | No64, em_call_far), N,
2453 D(ImplicitOps | Stack), D(ImplicitOps | Stack), N, N, 3177 II(ImplicitOps | Stack, em_pushf, pushf),
3178 II(ImplicitOps | Stack, em_popf, popf), N, N,
2454 /* 0xA0 - 0xA7 */ 3179 /* 0xA0 - 0xA7 */
2455 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov), 3180 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
2456 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov), 3181 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
2457 I2bv(SrcSI | DstDI | Mov | String, em_mov), 3182 I2bv(SrcSI | DstDI | Mov | String, em_mov),
2458 D2bv(SrcSI | DstDI | String), 3183 I2bv(SrcSI | DstDI | String, em_cmp),
2459 /* 0xA8 - 0xAF */ 3184 /* 0xA8 - 0xAF */
2460 D2bv(DstAcc | SrcImm), 3185 D2bv(DstAcc | SrcImm),
2461 I2bv(SrcAcc | DstDI | Mov | String, em_mov), 3186 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
2462 I2bv(SrcSI | DstAcc | Mov | String, em_mov), 3187 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
2463 D2bv(SrcAcc | DstDI | String), 3188 I2bv(SrcAcc | DstDI | String, em_cmp),
2464 /* 0xB0 - 0xB7 */ 3189 /* 0xB0 - 0xB7 */
2465 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)), 3190 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
2466 /* 0xB8 - 0xBF */ 3191 /* 0xB8 - 0xBF */
@@ -2473,7 +3198,8 @@ static struct opcode opcode_table[256] = {
2473 G(ByteOp, group11), G(0, group11), 3198 G(ByteOp, group11), G(0, group11),
2474 /* 0xC8 - 0xCF */ 3199 /* 0xC8 - 0xCF */
2475 N, N, N, D(ImplicitOps | Stack), 3200 N, N, N, D(ImplicitOps | Stack),
2476 D(ImplicitOps), D(SrcImmByte), D(ImplicitOps | No64), D(ImplicitOps), 3201 D(ImplicitOps), DI(SrcImmByte, intn),
3202 D(ImplicitOps | No64), DI(ImplicitOps, iret),
2477 /* 0xD0 - 0xD7 */ 3203 /* 0xD0 - 0xD7 */
2478 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM), 3204 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
2479 N, N, N, N, 3205 N, N, N, N,
@@ -2481,14 +3207,17 @@ static struct opcode opcode_table[256] = {
2481 N, N, N, N, N, N, N, N, 3207 N, N, N, N, N, N, N, N,
2482 /* 0xE0 - 0xE7 */ 3208 /* 0xE0 - 0xE7 */
2483 X4(D(SrcImmByte)), 3209 X4(D(SrcImmByte)),
2484 D2bv(SrcImmUByte | DstAcc), D2bv(SrcAcc | DstImmUByte), 3210 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3211 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
2485 /* 0xE8 - 0xEF */ 3212 /* 0xE8 - 0xEF */
2486 D(SrcImm | Stack), D(SrcImm | ImplicitOps), 3213 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
2487 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps), 3214 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
2488 D2bv(SrcNone | DstAcc), D2bv(SrcAcc | ImplicitOps), 3215 D2bvIP(SrcNone | DstAcc, in, check_perm_in),
3216 D2bvIP(SrcAcc | ImplicitOps, out, check_perm_out),
2489 /* 0xF0 - 0xF7 */ 3217 /* 0xF0 - 0xF7 */
2490 N, N, N, N, 3218 N, DI(ImplicitOps, icebp), N, N,
2491 D(ImplicitOps | Priv), D(ImplicitOps), G(ByteOp, group3), G(0, group3), 3219 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3220 G(ByteOp, group3), G(0, group3),
2492 /* 0xF8 - 0xFF */ 3221 /* 0xF8 - 0xFF */
2493 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), 3222 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
2494 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5), 3223 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
@@ -2496,20 +3225,24 @@ static struct opcode opcode_table[256] = {
2496 3225
2497static struct opcode twobyte_table[256] = { 3226static struct opcode twobyte_table[256] = {
2498 /* 0x00 - 0x0F */ 3227 /* 0x00 - 0x0F */
2499 N, GD(0, &group7), N, N, 3228 G(0, group6), GD(0, &group7), N, N,
2500 N, D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv), N, 3229 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
2501 D(ImplicitOps | Priv), D(ImplicitOps | Priv), N, N, 3230 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
2502 N, D(ImplicitOps | ModRM), N, N, 3231 N, D(ImplicitOps | ModRM), N, N,
2503 /* 0x10 - 0x1F */ 3232 /* 0x10 - 0x1F */
2504 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N, 3233 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
2505 /* 0x20 - 0x2F */ 3234 /* 0x20 - 0x2F */
2506 D(ModRM | DstMem | Priv | Op3264), D(ModRM | DstMem | Priv | Op3264), 3235 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
2507 D(ModRM | SrcMem | Priv | Op3264), D(ModRM | SrcMem | Priv | Op3264), 3236 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
3237 DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
3238 DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
2508 N, N, N, N, 3239 N, N, N, N,
2509 N, N, N, N, N, N, N, N, 3240 N, N, N, N, N, N, N, N,
2510 /* 0x30 - 0x3F */ 3241 /* 0x30 - 0x3F */
2511 D(ImplicitOps | Priv), I(ImplicitOps, em_rdtsc), 3242 DI(ImplicitOps | Priv, wrmsr),
2512 D(ImplicitOps | Priv), N, 3243 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3244 DI(ImplicitOps | Priv, rdmsr),
3245 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
2513 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific), 3246 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
2514 N, N, 3247 N, N,
2515 N, N, N, N, N, N, N, N, 3248 N, N, N, N, N, N, N, N,
@@ -2518,21 +3251,27 @@ static struct opcode twobyte_table[256] = {
2518 /* 0x50 - 0x5F */ 3251 /* 0x50 - 0x5F */
2519 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3252 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
2520 /* 0x60 - 0x6F */ 3253 /* 0x60 - 0x6F */
2521 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3254 N, N, N, N,
3255 N, N, N, N,
3256 N, N, N, N,
3257 N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
2522 /* 0x70 - 0x7F */ 3258 /* 0x70 - 0x7F */
2523 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, 3259 N, N, N, N,
3260 N, N, N, N,
3261 N, N, N, N,
3262 N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
2524 /* 0x80 - 0x8F */ 3263 /* 0x80 - 0x8F */
2525 X16(D(SrcImm)), 3264 X16(D(SrcImm)),
2526 /* 0x90 - 0x9F */ 3265 /* 0x90 - 0x9F */
2527 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)), 3266 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
2528 /* 0xA0 - 0xA7 */ 3267 /* 0xA0 - 0xA7 */
2529 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3268 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2530 N, D(DstMem | SrcReg | ModRM | BitOp), 3269 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
2531 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3270 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2532 D(DstMem | SrcReg | Src2CL | ModRM), N, N, 3271 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
2533 /* 0xA8 - 0xAF */ 3272 /* 0xA8 - 0xAF */
2534 D(ImplicitOps | Stack), D(ImplicitOps | Stack), 3273 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
2535 N, D(DstMem | SrcReg | ModRM | BitOp | Lock), 3274 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
2536 D(DstMem | SrcReg | Src2ImmByte | ModRM), 3275 D(DstMem | SrcReg | Src2ImmByte | ModRM),
2537 D(DstMem | SrcReg | Src2CL | ModRM), 3276 D(DstMem | SrcReg | Src2CL | ModRM),
2538 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul), 3277 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
@@ -2564,10 +3303,13 @@ static struct opcode twobyte_table[256] = {
2564#undef G 3303#undef G
2565#undef GD 3304#undef GD
2566#undef I 3305#undef I
3306#undef GP
3307#undef EXT
2567 3308
2568#undef D2bv 3309#undef D2bv
3310#undef D2bvIP
2569#undef I2bv 3311#undef I2bv
2570#undef D6ALU 3312#undef I6ALU
2571 3313
2572static unsigned imm_size(struct decode_cache *c) 3314static unsigned imm_size(struct decode_cache *c)
2573{ 3315{
@@ -2625,8 +3367,9 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2625 struct decode_cache *c = &ctxt->decode; 3367 struct decode_cache *c = &ctxt->decode;
2626 int rc = X86EMUL_CONTINUE; 3368 int rc = X86EMUL_CONTINUE;
2627 int mode = ctxt->mode; 3369 int mode = ctxt->mode;
2628 int def_op_bytes, def_ad_bytes, dual, goffset; 3370 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
2629 struct opcode opcode, *g_mod012, *g_mod3; 3371 bool op_prefix = false;
3372 struct opcode opcode;
2630 struct operand memop = { .type = OP_NONE }; 3373 struct operand memop = { .type = OP_NONE };
2631 3374
2632 c->eip = ctxt->eip; 3375 c->eip = ctxt->eip;
@@ -2634,7 +3377,6 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2634 c->fetch.end = c->fetch.start + insn_len; 3377 c->fetch.end = c->fetch.start + insn_len;
2635 if (insn_len > 0) 3378 if (insn_len > 0)
2636 memcpy(c->fetch.data, insn, insn_len); 3379 memcpy(c->fetch.data, insn, insn_len);
2637 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
2638 3380
2639 switch (mode) { 3381 switch (mode) {
2640 case X86EMUL_MODE_REAL: 3382 case X86EMUL_MODE_REAL:
@@ -2662,6 +3404,7 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2662 for (;;) { 3404 for (;;) {
2663 switch (c->b = insn_fetch(u8, 1, c->eip)) { 3405 switch (c->b = insn_fetch(u8, 1, c->eip)) {
2664 case 0x66: /* operand-size override */ 3406 case 0x66: /* operand-size override */
3407 op_prefix = true;
2665 /* switch between 2/4 bytes */ 3408 /* switch between 2/4 bytes */
2666 c->op_bytes = def_op_bytes ^ 6; 3409 c->op_bytes = def_op_bytes ^ 6;
2667 break; 3410 break;
@@ -2692,10 +3435,8 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2692 c->lock_prefix = 1; 3435 c->lock_prefix = 1;
2693 break; 3436 break;
2694 case 0xf2: /* REPNE/REPNZ */ 3437 case 0xf2: /* REPNE/REPNZ */
2695 c->rep_prefix = REPNE_PREFIX;
2696 break;
2697 case 0xf3: /* REP/REPE/REPZ */ 3438 case 0xf3: /* REP/REPE/REPZ */
2698 c->rep_prefix = REPE_PREFIX; 3439 c->rep_prefix = c->b;
2699 break; 3440 break;
2700 default: 3441 default:
2701 goto done_prefixes; 3442 goto done_prefixes;
@@ -2722,29 +3463,49 @@ done_prefixes:
2722 } 3463 }
2723 c->d = opcode.flags; 3464 c->d = opcode.flags;
2724 3465
2725 if (c->d & Group) { 3466 while (c->d & GroupMask) {
2726 dual = c->d & GroupDual; 3467 switch (c->d & GroupMask) {
2727 c->modrm = insn_fetch(u8, 1, c->eip); 3468 case Group:
2728 --c->eip; 3469 c->modrm = insn_fetch(u8, 1, c->eip);
2729 3470 --c->eip;
2730 if (c->d & GroupDual) { 3471 goffset = (c->modrm >> 3) & 7;
2731 g_mod012 = opcode.u.gdual->mod012; 3472 opcode = opcode.u.group[goffset];
2732 g_mod3 = opcode.u.gdual->mod3; 3473 break;
2733 } else 3474 case GroupDual:
2734 g_mod012 = g_mod3 = opcode.u.group; 3475 c->modrm = insn_fetch(u8, 1, c->eip);
2735 3476 --c->eip;
2736 c->d &= ~(Group | GroupDual); 3477 goffset = (c->modrm >> 3) & 7;
2737 3478 if ((c->modrm >> 6) == 3)
2738 goffset = (c->modrm >> 3) & 7; 3479 opcode = opcode.u.gdual->mod3[goffset];
3480 else
3481 opcode = opcode.u.gdual->mod012[goffset];
3482 break;
3483 case RMExt:
3484 goffset = c->modrm & 7;
3485 opcode = opcode.u.group[goffset];
3486 break;
3487 case Prefix:
3488 if (c->rep_prefix && op_prefix)
3489 return X86EMUL_UNHANDLEABLE;
3490 simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
3491 switch (simd_prefix) {
3492 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3493 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
3494 case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
3495 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
3496 }
3497 break;
3498 default:
3499 return X86EMUL_UNHANDLEABLE;
3500 }
2739 3501
2740 if ((c->modrm >> 6) == 3) 3502 c->d &= ~GroupMask;
2741 opcode = g_mod3[goffset];
2742 else
2743 opcode = g_mod012[goffset];
2744 c->d |= opcode.flags; 3503 c->d |= opcode.flags;
2745 } 3504 }
2746 3505
2747 c->execute = opcode.u.execute; 3506 c->execute = opcode.u.execute;
3507 c->check_perm = opcode.check_perm;
3508 c->intercept = opcode.intercept;
2748 3509
2749 /* Unrecognised? */ 3510 /* Unrecognised? */
2750 if (c->d == 0 || (c->d & Undefined)) 3511 if (c->d == 0 || (c->d & Undefined))
@@ -2763,6 +3524,9 @@ done_prefixes:
2763 c->op_bytes = 4; 3524 c->op_bytes = 4;
2764 } 3525 }
2765 3526
3527 if (c->d & Sse)
3528 c->op_bytes = 16;
3529
2766 /* ModRM and SIB bytes. */ 3530 /* ModRM and SIB bytes. */
2767 if (c->d & ModRM) { 3531 if (c->d & ModRM) {
2768 rc = decode_modrm(ctxt, ops, &memop); 3532 rc = decode_modrm(ctxt, ops, &memop);
@@ -2776,7 +3540,7 @@ done_prefixes:
2776 if (!c->has_seg_override) 3540 if (!c->has_seg_override)
2777 set_seg_override(c, VCPU_SREG_DS); 3541 set_seg_override(c, VCPU_SREG_DS);
2778 3542
2779 memop.addr.mem.seg = seg_override(ctxt, ops, c); 3543 memop.addr.mem.seg = seg_override(ctxt, c);
2780 3544
2781 if (memop.type == OP_MEM && c->ad_bytes != 8) 3545 if (memop.type == OP_MEM && c->ad_bytes != 8)
2782 memop.addr.mem.ea = (u32)memop.addr.mem.ea; 3546 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
@@ -2792,7 +3556,7 @@ done_prefixes:
2792 case SrcNone: 3556 case SrcNone:
2793 break; 3557 break;
2794 case SrcReg: 3558 case SrcReg:
2795 decode_register_operand(&c->src, c, 0); 3559 decode_register_operand(ctxt, &c->src, c, 0);
2796 break; 3560 break;
2797 case SrcMem16: 3561 case SrcMem16:
2798 memop.bytes = 2; 3562 memop.bytes = 2;
@@ -2836,7 +3600,7 @@ done_prefixes:
2836 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
2837 c->src.addr.mem.ea = 3601 c->src.addr.mem.ea =
2838 register_address(c, c->regs[VCPU_REGS_RSI]); 3602 register_address(c, c->regs[VCPU_REGS_RSI]);
2839 c->src.addr.mem.seg = seg_override(ctxt, ops, c), 3603 c->src.addr.mem.seg = seg_override(ctxt, c);
2840 c->src.val = 0; 3604 c->src.val = 0;
2841 break; 3605 break;
2842 case SrcImmFAddr: 3606 case SrcImmFAddr:
@@ -2883,7 +3647,7 @@ done_prefixes:
2883 /* Decode and fetch the destination operand: register or memory. */ 3647 /* Decode and fetch the destination operand: register or memory. */
2884 switch (c->d & DstMask) { 3648 switch (c->d & DstMask) {
2885 case DstReg: 3649 case DstReg:
2886 decode_register_operand(&c->dst, c, 3650 decode_register_operand(ctxt, &c->dst, c,
2887 c->twobyte && (c->b == 0xb6 || c->b == 0xb7)); 3651 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
2888 break; 3652 break;
2889 case DstImmUByte: 3653 case DstImmUByte:
@@ -2926,7 +3690,7 @@ done_prefixes:
2926 } 3690 }
2927 3691
2928done: 3692done:
2929 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 3693 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2930} 3694}
2931 3695
2932static bool string_insn_completed(struct x86_emulate_ctxt *ctxt) 3696static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
@@ -2979,12 +3743,51 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
2979 goto done; 3743 goto done;
2980 } 3744 }
2981 3745
3746 if ((c->d & Sse)
3747 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3748 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3749 rc = emulate_ud(ctxt);
3750 goto done;
3751 }
3752
3753 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3754 rc = emulate_nm(ctxt);
3755 goto done;
3756 }
3757
3758 if (unlikely(ctxt->guest_mode) && c->intercept) {
3759 rc = emulator_check_intercept(ctxt, c->intercept,
3760 X86_ICPT_PRE_EXCEPT);
3761 if (rc != X86EMUL_CONTINUE)
3762 goto done;
3763 }
3764
2982 /* Privileged instruction can be executed only in CPL=0 */ 3765 /* Privileged instruction can be executed only in CPL=0 */
2983 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3766 if ((c->d & Priv) && ops->cpl(ctxt)) {
2984 rc = emulate_gp(ctxt, 0); 3767 rc = emulate_gp(ctxt, 0);
2985 goto done; 3768 goto done;
2986 } 3769 }
2987 3770
3771 /* Instruction can only be executed in protected mode */
3772 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3773 rc = emulate_ud(ctxt);
3774 goto done;
3775 }
3776
3777 /* Do instruction specific permission checks */
3778 if (c->check_perm) {
3779 rc = c->check_perm(ctxt);
3780 if (rc != X86EMUL_CONTINUE)
3781 goto done;
3782 }
3783
3784 if (unlikely(ctxt->guest_mode) && c->intercept) {
3785 rc = emulator_check_intercept(ctxt, c->intercept,
3786 X86_ICPT_POST_EXCEPT);
3787 if (rc != X86EMUL_CONTINUE)
3788 goto done;
3789 }
3790
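Execution now runs a fixed ladder of checks before the opcode body: SSE availability (#UD/#NM), a pre-exception intercept for nested guests, the CPL test for Priv, the protected-mode test for Prot, the per-opcode ->check_perm hook, a post-exception intercept, and, at the special_insn label below, a post-memory-access intercept. A minimal sketch of what a ->check_perm hook looks like under this contract (check_example_perm is illustrative and not part of the patch):

	static int check_example_perm(struct x86_emulate_ctxt *ctxt)
	{
		if (ctxt->ops->cpl(ctxt) != 0)		/* require CPL 0 */
			return emulate_gp(ctxt, 0);	/* abort, inject #GP(0) */
		return X86EMUL_CONTINUE;		/* proceed to execution */
	}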
2988 if (c->rep_prefix && (c->d & String)) { 3791 if (c->rep_prefix && (c->d & String)) {
2989 /* All REP prefixes have the same first termination condition */ 3792 /* All REP prefixes have the same first termination condition */
2990 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3793 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
@@ -2994,16 +3797,16 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
2994 } 3797 }
2995 3798
2996 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) { 3799 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2997 rc = read_emulated(ctxt, ops, linear(ctxt, c->src.addr.mem), 3800 rc = segmented_read(ctxt, c->src.addr.mem,
2998 c->src.valptr, c->src.bytes); 3801 c->src.valptr, c->src.bytes);
2999 if (rc != X86EMUL_CONTINUE) 3802 if (rc != X86EMUL_CONTINUE)
3000 goto done; 3803 goto done;
3001 c->src.orig_val64 = c->src.val64; 3804 c->src.orig_val64 = c->src.val64;
3002 } 3805 }
3003 3806
3004 if (c->src2.type == OP_MEM) { 3807 if (c->src2.type == OP_MEM) {
3005 rc = read_emulated(ctxt, ops, linear(ctxt, c->src2.addr.mem), 3808 rc = segmented_read(ctxt, c->src2.addr.mem,
3006 &c->src2.val, c->src2.bytes); 3809 &c->src2.val, c->src2.bytes);
3007 if (rc != X86EMUL_CONTINUE) 3810 if (rc != X86EMUL_CONTINUE)
3008 goto done; 3811 goto done;
3009 } 3812 }
@@ -3014,7 +3817,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3014 3817
3015 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3818 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
3016 /* optimisation - avoid slow emulated read if Mov */ 3819 /* optimisation - avoid slow emulated read if Mov */
3017 rc = read_emulated(ctxt, ops, linear(ctxt, c->dst.addr.mem), 3820 rc = segmented_read(ctxt, c->dst.addr.mem,
3018 &c->dst.val, c->dst.bytes); 3821 &c->dst.val, c->dst.bytes);
3019 if (rc != X86EMUL_CONTINUE) 3822 if (rc != X86EMUL_CONTINUE)
3020 goto done; 3823 goto done;
@@ -3023,6 +3826,13 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3023 3826
3024special_insn: 3827special_insn:
3025 3828
3829 if (unlikely(ctxt->guest_mode) && c->intercept) {
3830 rc = emulator_check_intercept(ctxt, c->intercept,
3831 X86_ICPT_POST_MEMACCESS);
3832 if (rc != X86EMUL_CONTINUE)
3833 goto done;
3834 }
3835
3026 if (c->execute) { 3836 if (c->execute) {
3027 rc = c->execute(ctxt); 3837 rc = c->execute(ctxt);
3028 if (rc != X86EMUL_CONTINUE) 3838 if (rc != X86EMUL_CONTINUE)
@@ -3034,75 +3844,33 @@ special_insn:
3034 goto twobyte_insn; 3844 goto twobyte_insn;
3035 3845
3036 switch (c->b) { 3846 switch (c->b) {
3037 case 0x00 ... 0x05:
3038 add: /* add */
3039 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
3040 break;
3041 case 0x06: /* push es */ 3847 case 0x06: /* push es */
3042 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3848 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
3043 break; 3849 break;
3044 case 0x07: /* pop es */ 3850 case 0x07: /* pop es */
3045 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3851 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
3046 break; 3852 break;
3047 case 0x08 ... 0x0d:
3048 or: /* or */
3049 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
3050 break;
3051 case 0x0e: /* push cs */ 3853 case 0x0e: /* push cs */
3052 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3854 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
3053 break;
3054 case 0x10 ... 0x15:
3055 adc: /* adc */
3056 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
3057 break; 3855 break;
3058 case 0x16: /* push ss */ 3856 case 0x16: /* push ss */
3059 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3857 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
3060 break; 3858 break;
3061 case 0x17: /* pop ss */ 3859 case 0x17: /* pop ss */
3062 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3860 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
3063 break; 3861 break;
3064 case 0x18 ... 0x1d:
3065 sbb: /* sbb */
3066 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
3067 break;
3068 case 0x1e: /* push ds */ 3862 case 0x1e: /* push ds */
3069 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3863 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
3070 break; 3864 break;
3071 case 0x1f: /* pop ds */ 3865 case 0x1f: /* pop ds */
3072 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3866 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
3073 break; 3867 break;
3074 case 0x20 ... 0x25:
3075 and: /* and */
3076 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
3077 break;
3078 case 0x28 ... 0x2d:
3079 sub: /* sub */
3080 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
3081 break;
3082 case 0x30 ... 0x35:
3083 xor: /* xor */
3084 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
3085 break;
3086 case 0x38 ... 0x3d:
3087 cmp: /* cmp */
3088 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
3089 break;
3090 case 0x40 ... 0x47: /* inc r16/r32 */ 3868 case 0x40 ... 0x47: /* inc r16/r32 */
3091 emulate_1op("inc", c->dst, ctxt->eflags); 3869 emulate_1op("inc", c->dst, ctxt->eflags);
3092 break; 3870 break;
3093 case 0x48 ... 0x4f: /* dec r16/r32 */ 3871 case 0x48 ... 0x4f: /* dec r16/r32 */
3094 emulate_1op("dec", c->dst, ctxt->eflags); 3872 emulate_1op("dec", c->dst, ctxt->eflags);
3095 break; 3873 break;
3096 case 0x58 ... 0x5f: /* pop reg */
3097 pop_instruction:
3098 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
3099 break;
3100 case 0x60: /* pusha */
3101 rc = emulate_pusha(ctxt, ops);
3102 break;
3103 case 0x61: /* popa */
3104 rc = emulate_popa(ctxt, ops);
3105 break;
3106 case 0x63: /* movsxd */ 3874 case 0x63: /* movsxd */
3107 if (ctxt->mode != X86EMUL_MODE_PROT64) 3875 if (ctxt->mode != X86EMUL_MODE_PROT64)
3108 goto cannot_emulate; 3876 goto cannot_emulate;
@@ -3121,26 +3889,6 @@ special_insn:
3121 if (test_cc(c->b, ctxt->eflags)) 3889 if (test_cc(c->b, ctxt->eflags))
3122 jmp_rel(c, c->src.val); 3890 jmp_rel(c, c->src.val);
3123 break; 3891 break;
3124 case 0x80 ... 0x83: /* Grp1 */
3125 switch (c->modrm_reg) {
3126 case 0:
3127 goto add;
3128 case 1:
3129 goto or;
3130 case 2:
3131 goto adc;
3132 case 3:
3133 goto sbb;
3134 case 4:
3135 goto and;
3136 case 5:
3137 goto sub;
3138 case 6:
3139 goto xor;
3140 case 7:
3141 goto cmp;
3142 }
3143 break;
3144 case 0x84 ... 0x85: 3892 case 0x84 ... 0x85:
3145 test: 3893 test:
3146 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 3894 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
@@ -3162,7 +3910,7 @@ special_insn:
3162 rc = emulate_ud(ctxt); 3910 rc = emulate_ud(ctxt);
3163 goto done; 3911 goto done;
3164 } 3912 }
3165 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3913 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
3166 break; 3914 break;
3167 case 0x8d: /* lea r16/r32, m */ 3915 case 0x8d: /* lea r16/r32, m */
3168 c->dst.val = c->src.addr.mem.ea; 3916 c->dst.val = c->src.addr.mem.ea;
@@ -3187,7 +3935,7 @@ special_insn:
3187 break; 3935 break;
3188 } 3936 }
3189 case 0x8f: /* pop (sole member of Grp1a) */ 3937 case 0x8f: /* pop (sole member of Grp1a) */
3190 rc = emulate_grp1a(ctxt, ops); 3938 rc = em_grp1a(ctxt);
3191 break; 3939 break;
3192 case 0x90 ... 0x97: /* nop / xchg reg, rax */ 3940 case 0x90 ... 0x97: /* nop / xchg reg, rax */
3193 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX]) 3941 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
@@ -3200,31 +3948,17 @@ special_insn:
3200 case 8: c->dst.val = (s32)c->dst.val; break; 3948 case 8: c->dst.val = (s32)c->dst.val; break;
3201 } 3949 }
3202 break; 3950 break;
3203 case 0x9c: /* pushf */
3204 c->src.val = (unsigned long) ctxt->eflags;
3205 emulate_push(ctxt, ops);
3206 break;
3207 case 0x9d: /* popf */
3208 c->dst.type = OP_REG;
3209 c->dst.addr.reg = &ctxt->eflags;
3210 c->dst.bytes = c->op_bytes;
3211 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
3212 break;
3213 case 0xa6 ... 0xa7: /* cmps */
3214 c->dst.type = OP_NONE; /* Disable writeback. */
3215 goto cmp;
3216 case 0xa8 ... 0xa9: /* test ax, imm */ 3951 case 0xa8 ... 0xa9: /* test ax, imm */
3217 goto test; 3952 goto test;
3218 case 0xae ... 0xaf: /* scas */
3219 goto cmp;
3220 case 0xc0 ... 0xc1: 3953 case 0xc0 ... 0xc1:
3221 emulate_grp2(ctxt); 3954 rc = em_grp2(ctxt);
3222 break; 3955 break;
3223 case 0xc3: /* ret */ 3956 case 0xc3: /* ret */
3224 c->dst.type = OP_REG; 3957 c->dst.type = OP_REG;
3225 c->dst.addr.reg = &c->eip; 3958 c->dst.addr.reg = &c->eip;
3226 c->dst.bytes = c->op_bytes; 3959 c->dst.bytes = c->op_bytes;
3227 goto pop_instruction; 3960 rc = em_pop(ctxt);
3961 break;
3228 case 0xc4: /* les */ 3962 case 0xc4: /* les */
3229 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES); 3963 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
3230 break; 3964 break;
@@ -3252,11 +3986,11 @@ special_insn:
3252 rc = emulate_iret(ctxt, ops); 3986 rc = emulate_iret(ctxt, ops);
3253 break; 3987 break;
3254 case 0xd0 ... 0xd1: /* Grp2 */ 3988 case 0xd0 ... 0xd1: /* Grp2 */
3255 emulate_grp2(ctxt); 3989 rc = em_grp2(ctxt);
3256 break; 3990 break;
3257 case 0xd2 ... 0xd3: /* Grp2 */ 3991 case 0xd2 ... 0xd3: /* Grp2 */
3258 c->src.val = c->regs[VCPU_REGS_RCX]; 3992 c->src.val = c->regs[VCPU_REGS_RCX];
3259 emulate_grp2(ctxt); 3993 rc = em_grp2(ctxt);
3260 break; 3994 break;
3261 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */ 3995 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
3262 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 3996 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
@@ -3278,23 +4012,14 @@ special_insn:
3278 long int rel = c->src.val; 4012 long int rel = c->src.val;
3279 c->src.val = (unsigned long) c->eip; 4013 c->src.val = (unsigned long) c->eip;
3280 jmp_rel(c, rel); 4014 jmp_rel(c, rel);
3281 emulate_push(ctxt, ops); 4015 rc = em_push(ctxt);
3282 break; 4016 break;
3283 } 4017 }
3284 case 0xe9: /* jmp rel */ 4018 case 0xe9: /* jmp rel */
3285 goto jmp; 4019 goto jmp;
3286 case 0xea: { /* jmp far */ 4020 case 0xea: /* jmp far */
3287 unsigned short sel; 4021 rc = em_jmp_far(ctxt);
3288 jump_far:
3289 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
3290
3291 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
3292 goto done;
3293
3294 c->eip = 0;
3295 memcpy(&c->eip, c->src.valptr, c->op_bytes);
3296 break; 4022 break;
3297 }
3298 case 0xeb: 4023 case 0xeb:
3299 jmp: /* jmp rel short */ 4024 jmp: /* jmp rel short */
3300 jmp_rel(c, c->src.val); 4025 jmp_rel(c, c->src.val);
@@ -3304,11 +4029,6 @@ special_insn:
3304 case 0xed: /* in (e/r)ax,dx */ 4029 case 0xed: /* in (e/r)ax,dx */
3305 c->src.val = c->regs[VCPU_REGS_RDX]; 4030 c->src.val = c->regs[VCPU_REGS_RDX];
3306 do_io_in: 4031 do_io_in:
3307 c->dst.bytes = min(c->dst.bytes, 4u);
3308 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
3309 rc = emulate_gp(ctxt, 0);
3310 goto done;
3311 }
3312 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 4032 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
3313 &c->dst.val)) 4033 &c->dst.val))
3314 goto done; /* IO is needed */ 4034 goto done; /* IO is needed */
@@ -3317,25 +4037,19 @@ special_insn:
3317 case 0xef: /* out dx,(e/r)ax */ 4037 case 0xef: /* out dx,(e/r)ax */
3318 c->dst.val = c->regs[VCPU_REGS_RDX]; 4038 c->dst.val = c->regs[VCPU_REGS_RDX];
3319 do_io_out: 4039 do_io_out:
3320 c->src.bytes = min(c->src.bytes, 4u); 4040 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
3321 if (!emulator_io_permited(ctxt, ops, c->dst.val, 4041 &c->src.val, 1);
3322 c->src.bytes)) {
3323 rc = emulate_gp(ctxt, 0);
3324 goto done;
3325 }
3326 ops->pio_out_emulated(c->src.bytes, c->dst.val,
3327 &c->src.val, 1, ctxt->vcpu);
3328 c->dst.type = OP_NONE; /* Disable writeback. */ 4042 c->dst.type = OP_NONE; /* Disable writeback. */
3329 break; 4043 break;
3330 case 0xf4: /* hlt */ 4044 case 0xf4: /* hlt */
3331 ctxt->vcpu->arch.halt_request = 1; 4045 ctxt->ops->halt(ctxt);
3332 break; 4046 break;
3333 case 0xf5: /* cmc */ 4047 case 0xf5: /* cmc */
3334 /* complement carry flag from eflags reg */ 4048 /* complement carry flag from eflags reg */
3335 ctxt->eflags ^= EFLG_CF; 4049 ctxt->eflags ^= EFLG_CF;
3336 break; 4050 break;
3337 case 0xf6 ... 0xf7: /* Grp3 */ 4051 case 0xf6 ... 0xf7: /* Grp3 */
3338 rc = emulate_grp3(ctxt, ops); 4052 rc = em_grp3(ctxt);
3339 break; 4053 break;
3340 case 0xf8: /* clc */ 4054 case 0xf8: /* clc */
3341 ctxt->eflags &= ~EFLG_CF; 4055 ctxt->eflags &= ~EFLG_CF;
@@ -3366,13 +4080,11 @@ special_insn:
3366 ctxt->eflags |= EFLG_DF; 4080 ctxt->eflags |= EFLG_DF;
3367 break; 4081 break;
3368 case 0xfe: /* Grp4 */ 4082 case 0xfe: /* Grp4 */
3369 grp45: 4083 rc = em_grp45(ctxt);
3370 rc = emulate_grp45(ctxt, ops);
3371 break; 4084 break;
3372 case 0xff: /* Grp5 */ 4085 case 0xff: /* Grp5 */
3373 if (c->modrm_reg == 5) 4086 rc = em_grp45(ctxt);
3374 goto jump_far; 4087 break;
3375 goto grp45;
3376 default: 4088 default:
3377 goto cannot_emulate; 4089 goto cannot_emulate;
3378 } 4090 }
@@ -3381,7 +4093,7 @@ special_insn:
3381 goto done; 4093 goto done;
3382 4094
3383writeback: 4095writeback:
3384 rc = writeback(ctxt, ops); 4096 rc = writeback(ctxt);
3385 if (rc != X86EMUL_CONTINUE) 4097 if (rc != X86EMUL_CONTINUE)
3386 goto done; 4098 goto done;
3387 4099
@@ -3392,7 +4104,7 @@ writeback:
3392 c->dst.type = saved_dst_type; 4104 c->dst.type = saved_dst_type;
3393 4105
3394 if ((c->d & SrcMask) == SrcSI) 4106 if ((c->d & SrcMask) == SrcSI)
3395 string_addr_inc(ctxt, seg_override(ctxt, ops, c), 4107 string_addr_inc(ctxt, seg_override(ctxt, c),
3396 VCPU_REGS_RSI, &c->src); 4108 VCPU_REGS_RSI, &c->src);
3397 4109
3398 if ((c->d & DstMask) == DstDI) 4110 if ((c->d & DstMask) == DstDI)
@@ -3427,115 +4139,34 @@ writeback:
3427done: 4139done:
3428 if (rc == X86EMUL_PROPAGATE_FAULT) 4140 if (rc == X86EMUL_PROPAGATE_FAULT)
3429 ctxt->have_exception = true; 4141 ctxt->have_exception = true;
4142 if (rc == X86EMUL_INTERCEPTED)
4143 return EMULATION_INTERCEPTED;
4144
3430 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK; 4145 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3431 4146
3432twobyte_insn: 4147twobyte_insn:
3433 switch (c->b) { 4148 switch (c->b) {
3434 case 0x01: /* lgdt, lidt, lmsw */
3435 switch (c->modrm_reg) {
3436 u16 size;
3437 unsigned long address;
3438
3439 case 0: /* vmcall */
3440 if (c->modrm_mod != 3 || c->modrm_rm != 1)
3441 goto cannot_emulate;
3442
3443 rc = kvm_fix_hypercall(ctxt->vcpu);
3444 if (rc != X86EMUL_CONTINUE)
3445 goto done;
3446
3447 /* Let the processor re-execute the fixed hypercall */
3448 c->eip = ctxt->eip;
3449 /* Disable writeback. */
3450 c->dst.type = OP_NONE;
3451 break;
3452 case 2: /* lgdt */
3453 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3454 &size, &address, c->op_bytes);
3455 if (rc != X86EMUL_CONTINUE)
3456 goto done;
3457 realmode_lgdt(ctxt->vcpu, size, address);
3458 /* Disable writeback. */
3459 c->dst.type = OP_NONE;
3460 break;
3461 case 3: /* lidt/vmmcall */
3462 if (c->modrm_mod == 3) {
3463 switch (c->modrm_rm) {
3464 case 1:
3465 rc = kvm_fix_hypercall(ctxt->vcpu);
3466 break;
3467 default:
3468 goto cannot_emulate;
3469 }
3470 } else {
3471 rc = read_descriptor(ctxt, ops, c->src.addr.mem,
3472 &size, &address,
3473 c->op_bytes);
3474 if (rc != X86EMUL_CONTINUE)
3475 goto done;
3476 realmode_lidt(ctxt->vcpu, size, address);
3477 }
3478 /* Disable writeback. */
3479 c->dst.type = OP_NONE;
3480 break;
3481 case 4: /* smsw */
3482 c->dst.bytes = 2;
3483 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3484 break;
3485 case 6: /* lmsw */
3486 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0eul) |
3487 (c->src.val & 0x0f), ctxt->vcpu);
3488 c->dst.type = OP_NONE;
3489 break;
3490 case 5: /* not defined */
3491 emulate_ud(ctxt);
3492 rc = X86EMUL_PROPAGATE_FAULT;
3493 goto done;
3494 case 7: /* invlpg*/
3495 emulate_invlpg(ctxt->vcpu,
3496 linear(ctxt, c->src.addr.mem));
3497 /* Disable writeback. */
3498 c->dst.type = OP_NONE;
3499 break;
3500 default:
3501 goto cannot_emulate;
3502 }
3503 break;
3504 case 0x05: /* syscall */ 4149 case 0x05: /* syscall */
3505 rc = emulate_syscall(ctxt, ops); 4150 rc = emulate_syscall(ctxt, ops);
3506 break; 4151 break;
3507 case 0x06: 4152 case 0x06:
3508 emulate_clts(ctxt->vcpu); 4153 rc = em_clts(ctxt);
3509 break; 4154 break;
3510 case 0x09: /* wbinvd */ 4155 case 0x09: /* wbinvd */
3511 kvm_emulate_wbinvd(ctxt->vcpu); 4156 (ctxt->ops->wbinvd)(ctxt);
3512 break; 4157 break;
3513 case 0x08: /* invd */ 4158 case 0x08: /* invd */
3514 case 0x0d: /* GrpP (prefetch) */ 4159 case 0x0d: /* GrpP (prefetch) */
3515 case 0x18: /* Grp16 (prefetch/nop) */ 4160 case 0x18: /* Grp16 (prefetch/nop) */
3516 break; 4161 break;
3517 case 0x20: /* mov cr, reg */ 4162 case 0x20: /* mov cr, reg */
3518 switch (c->modrm_reg) { 4163 c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
3519 case 1:
3520 case 5 ... 7:
3521 case 9 ... 15:
3522 emulate_ud(ctxt);
3523 rc = X86EMUL_PROPAGATE_FAULT;
3524 goto done;
3525 }
3526 c->dst.val = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3527 break; 4164 break;
3528 case 0x21: /* mov from dr to reg */ 4165 case 0x21: /* mov from dr to reg */
3529 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4166 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
3530 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3531 emulate_ud(ctxt);
3532 rc = X86EMUL_PROPAGATE_FAULT;
3533 goto done;
3534 }
3535 ops->get_dr(c->modrm_reg, &c->dst.val, ctxt->vcpu);
3536 break; 4167 break;
3537 case 0x22: /* mov reg, cr */ 4168 case 0x22: /* mov reg, cr */
3538 if (ops->set_cr(c->modrm_reg, c->src.val, ctxt->vcpu)) { 4169 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
3539 emulate_gp(ctxt, 0); 4170 emulate_gp(ctxt, 0);
3540 rc = X86EMUL_PROPAGATE_FAULT; 4171 rc = X86EMUL_PROPAGATE_FAULT;
3541 goto done; 4172 goto done;
@@ -3543,16 +4174,9 @@ twobyte_insn:
3543 c->dst.type = OP_NONE; 4174 c->dst.type = OP_NONE;
3544 break; 4175 break;
3545 case 0x23: /* mov from reg to dr */ 4176 case 0x23: /* mov from reg to dr */
3546 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4177 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
3547 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3548 emulate_ud(ctxt);
3549 rc = X86EMUL_PROPAGATE_FAULT;
3550 goto done;
3551 }
3552
3553 if (ops->set_dr(c->modrm_reg, c->src.val &
3554 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4178 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3555 ~0ULL : ~0U), ctxt->vcpu) < 0) { 4179 ~0ULL : ~0U)) < 0) {
3556 /* #UD condition is already handled by the code above */ 4180 /* #UD condition is already handled by the code above */
3557 emulate_gp(ctxt, 0); 4181 emulate_gp(ctxt, 0);
3558 rc = X86EMUL_PROPAGATE_FAULT; 4182 rc = X86EMUL_PROPAGATE_FAULT;
@@ -3565,7 +4189,7 @@ twobyte_insn:
3565 /* wrmsr */ 4189 /* wrmsr */
3566 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4190 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3567 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4191 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3568 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 4192 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
3569 emulate_gp(ctxt, 0); 4193 emulate_gp(ctxt, 0);
3570 rc = X86EMUL_PROPAGATE_FAULT; 4194 rc = X86EMUL_PROPAGATE_FAULT;
3571 goto done; 4195 goto done;
@@ -3574,7 +4198,7 @@ twobyte_insn:
3574 break; 4198 break;
3575 case 0x32: 4199 case 0x32:
3576 /* rdmsr */ 4200 /* rdmsr */
3577 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 4201 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
3578 emulate_gp(ctxt, 0); 4202 emulate_gp(ctxt, 0);
3579 rc = X86EMUL_PROPAGATE_FAULT; 4203 rc = X86EMUL_PROPAGATE_FAULT;
3580 goto done; 4204 goto done;
@@ -3603,7 +4227,7 @@ twobyte_insn:
3603 c->dst.val = test_cc(c->b, ctxt->eflags); 4227 c->dst.val = test_cc(c->b, ctxt->eflags);
3604 break; 4228 break;
3605 case 0xa0: /* push fs */ 4229 case 0xa0: /* push fs */
3606 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4230 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3607 break; 4231 break;
3608 case 0xa1: /* pop fs */ 4232 case 0xa1: /* pop fs */
3609 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4233 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
@@ -3620,7 +4244,7 @@ twobyte_insn:
3620 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4244 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3621 break; 4245 break;
3622 case 0xa8: /* push gs */ 4246 case 0xa8: /* push gs */
3623 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4247 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3624 break; 4248 break;
3625 case 0xa9: /* pop gs */ 4249 case 0xa9: /* pop gs */
3626 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4250 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
@@ -3727,7 +4351,7 @@ twobyte_insn:
3727 (u64) c->src.val; 4351 (u64) c->src.val;
3728 break; 4352 break;
3729 case 0xc7: /* Grp9 (cmpxchg8b) */ 4353 case 0xc7: /* Grp9 (cmpxchg8b) */
3730 rc = emulate_grp9(ctxt, ops); 4354 rc = em_grp9(ctxt);
3731 break; 4355 break;
3732 default: 4356 default:
3733 goto cannot_emulate; 4357 goto cannot_emulate;
@@ -3739,5 +4363,5 @@ twobyte_insn:
3739 goto writeback; 4363 goto writeback;
3740 4364
3741cannot_emulate: 4365cannot_emulate:
3742 return -1; 4366 return EMULATION_FAILED;
3743} 4367}
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48f..51a97426e791 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
33}; 33};
34 34
35struct kvm_pit { 35struct kvm_pit {
36 unsigned long base_addresss;
37 struct kvm_io_device dev; 36 struct kvm_io_device dev;
38 struct kvm_io_device speaker_dev; 37 struct kvm_io_device speaker_dev;
39 struct kvm *kvm; 38 struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
51#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 50#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
52#define KVM_PIT_CHANNEL_MASK 0x3 51#define KVM_PIT_CHANNEL_MASK 0x3
53 52
54void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
55void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
56struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
57void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index ba910d149410..53e2d084bffb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
75void kvm_destroy_pic(struct kvm *kvm); 75void kvm_destroy_pic(struct kvm *kvm);
76int kvm_pic_read_irq(struct kvm *kvm); 76int kvm_pic_read_irq(struct kvm *kvm);
77void kvm_pic_update_irq(struct kvm_pic *s); 77void kvm_pic_update_irq(struct kvm_pic *s);
78void kvm_pic_clear_isr_ack(struct kvm *kvm);
79 78
80static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 79static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
81{ 80{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
100void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 99void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
101void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 100void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
102 101
103int pit_has_pending_timer(struct kvm_vcpu *vcpu);
104int apic_has_pending_timer(struct kvm_vcpu *vcpu); 102int apic_has_pending_timer(struct kvm_vcpu *vcpu);
105 103
106#endif 104#endif
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 22fae7593ee7..bd14bb4c8594 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -1206,7 +1206,7 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1206 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu, 1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte, 1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte, unsigned long mmu_seq) 1209 const void *pte)
1210{ 1210{
1211 WARN_ON(1); 1211 WARN_ON(1);
1212} 1212}
@@ -3163,9 +3163,8 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
3163} 3163}
3164 3164
3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3166 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp, u64 *spte,
3167 u64 *spte, 3167 const void *new)
3168 const void *new, unsigned long mmu_seq)
3169{ 3168{
3170 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
3171 ++vcpu->kvm->stat.mmu_pde_zapped; 3170 ++vcpu->kvm->stat.mmu_pde_zapped;
@@ -3173,7 +3172,7 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
3173 } 3172 }
3174 3173
3175 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
3176 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new, mmu_seq); 3175 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
3177} 3176}
3178 3177
3179static bool need_remote_flush(u64 old, u64 new) 3178static bool need_remote_flush(u64 old, u64 new)
@@ -3229,7 +3228,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3229 struct kvm_mmu_page *sp; 3228 struct kvm_mmu_page *sp;
3230 struct hlist_node *node; 3229 struct hlist_node *node;
3231 LIST_HEAD(invalid_list); 3230 LIST_HEAD(invalid_list);
3232 unsigned long mmu_seq;
3233 u64 entry, gentry, *spte; 3231 u64 entry, gentry, *spte;
3234 unsigned pte_size, page_offset, misaligned, quadrant, offset; 3232 unsigned pte_size, page_offset, misaligned, quadrant, offset;
3235 int level, npte, invlpg_counter, r, flooded = 0; 3233 int level, npte, invlpg_counter, r, flooded = 0;
@@ -3271,9 +3269,6 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3271 break; 3269 break;
3272 } 3270 }
3273 3271
3274 mmu_seq = vcpu->kvm->mmu_notifier_seq;
3275 smp_rmb();
3276
3277 spin_lock(&vcpu->kvm->mmu_lock); 3272 spin_lock(&vcpu->kvm->mmu_lock);
3278 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3273 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
3279 gentry = 0; 3274 gentry = 0;
@@ -3345,8 +3340,7 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
3345 if (gentry && 3340 if (gentry &&
3346 !((sp->role.word ^ vcpu->arch.mmu.base_role.word) 3341 !((sp->role.word ^ vcpu->arch.mmu.base_role.word)
3347 & mask.word)) 3342 & mask.word))
3348 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry, 3343 mmu_pte_write_new_pte(vcpu, sp, spte, &gentry);
3349 mmu_seq);
3350 if (!remote_flush && need_remote_flush(entry, *spte)) 3344 if (!remote_flush && need_remote_flush(entry, *spte))
3351 remote_flush = true; 3345 remote_flush = true;
3352 ++spte; 3346 ++spte;
@@ -3551,10 +3545,11 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3551 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3545 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3552} 3546}
3553 3547
3554static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3548static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3555{ 3549{
3556 struct kvm *kvm; 3550 struct kvm *kvm;
3557 struct kvm *kvm_freed = NULL; 3551 struct kvm *kvm_freed = NULL;
3552 int nr_to_scan = sc->nr_to_scan;
3558 3553
3559 if (nr_to_scan == 0) 3554 if (nr_to_scan == 0)
3560 goto out; 3555 goto out;
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index c6397795d865..6c4dc010c4cb 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -78,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
79} 79}
80 80
81static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
82 gfn_t table_gfn, unsigned index, 82 pt_element_t __user *ptep_user, unsigned index,
83 pt_element_t orig_pte, pt_element_t new_pte) 83 pt_element_t orig_pte, pt_element_t new_pte)
84{ 84{
85 int npages;
85 pt_element_t ret; 86 pt_element_t ret;
86 pt_element_t *table; 87 pt_element_t *table;
87 struct page *page; 88 struct page *page;
88 89
89 page = gfn_to_page(kvm, table_gfn); 90 npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
 91	/* Bail out if the guest PTE's user page could not be pinned. */
92 if (unlikely(npages != 1))
93 return -EFAULT;
90 94
91 table = kmap_atomic(page, KM_USER0); 95 table = kmap_atomic(page, KM_USER0);
92 ret = CMPXCHG(&table[index], orig_pte, new_pte); 96 ret = CMPXCHG(&table[index], orig_pte, new_pte);
@@ -117,6 +121,7 @@ static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 gva_t addr, u32 access) 121 gva_t addr, u32 access)
118{ 122{
119 pt_element_t pte; 123 pt_element_t pte;
124 pt_element_t __user *ptep_user;
120 gfn_t table_gfn; 125 gfn_t table_gfn;
121 unsigned index, pt_access, uninitialized_var(pte_access); 126 unsigned index, pt_access, uninitialized_var(pte_access);
122 gpa_t pte_gpa; 127 gpa_t pte_gpa;
@@ -152,6 +157,9 @@ walk:
152 pt_access = ACC_ALL; 157 pt_access = ACC_ALL;
153 158
154 for (;;) { 159 for (;;) {
160 gfn_t real_gfn;
161 unsigned long host_addr;
162
155 index = PT_INDEX(addr, walker->level); 163 index = PT_INDEX(addr, walker->level);
156 164
157 table_gfn = gpte_to_gfn(pte); 165 table_gfn = gpte_to_gfn(pte);
@@ -160,43 +168,64 @@ walk:
160 walker->table_gfn[walker->level - 1] = table_gfn; 168 walker->table_gfn[walker->level - 1] = table_gfn;
161 walker->pte_gpa[walker->level - 1] = pte_gpa; 169 walker->pte_gpa[walker->level - 1] = pte_gpa;
162 170
163 if (kvm_read_guest_page_mmu(vcpu, mmu, table_gfn, &pte, 171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
164 offset, sizeof(pte), 172 PFERR_USER_MASK|PFERR_WRITE_MASK);
165 PFERR_USER_MASK|PFERR_WRITE_MASK)) { 173 if (unlikely(real_gfn == UNMAPPED_GVA)) {
174 present = false;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn);
178
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) {
181 present = false;
182 break;
183 }
184
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
166 present = false; 187 present = false;
167 break; 188 break;
168 } 189 }
169 190
170 trace_kvm_mmu_paging_element(pte, walker->level); 191 trace_kvm_mmu_paging_element(pte, walker->level);
171 192
172 if (!is_present_gpte(pte)) { 193 if (unlikely(!is_present_gpte(pte))) {
173 present = false; 194 present = false;
174 break; 195 break;
175 } 196 }
176 197
177 if (is_rsvd_bits_set(&vcpu->arch.mmu, pte, walker->level)) { 198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) {
178 rsvd_fault = true; 200 rsvd_fault = true;
179 break; 201 break;
180 } 202 }
181 203
182 if (write_fault && !is_writable_pte(pte)) 204 if (unlikely(write_fault && !is_writable_pte(pte)
183 if (user_fault || is_write_protection(vcpu)) 205 && (user_fault || is_write_protection(vcpu))))
184 eperm = true; 206 eperm = true;
185 207
186 if (user_fault && !(pte & PT_USER_MASK)) 208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
187 eperm = true; 209 eperm = true;
188 210
189#if PTTYPE == 64 211#if PTTYPE == 64
190 if (fetch_fault && (pte & PT64_NX_MASK)) 212 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
191 eperm = true; 213 eperm = true;
192#endif 214#endif
193 215
194 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { 216 if (!eperm && !rsvd_fault
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret;
195 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 219 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
196 sizeof(pte)); 220 sizeof(pte));
197 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
198 index, pte, pte|PT_ACCESSED_MASK)) 222 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) {
224 present = false;
225 break;
226 } else if (ret)
199 goto walk; 227 goto walk;
228
200 mark_page_dirty(vcpu->kvm, table_gfn); 229 mark_page_dirty(vcpu->kvm, table_gfn);
201 pte |= PT_ACCESSED_MASK; 230 pte |= PT_ACCESSED_MASK;
202 } 231 }
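
The walker now resolves the gpte's user-space address itself (mmu->translate_gpa(), then gfn_to_hva(), then __copy_from_user()) instead of going through kvm_read_guest_page_mmu(); keeping ptep_user around is what lets the accessed/dirty updates below call FNAME(cmpxchg_gpte)() directly on the user mapping.
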
@@ -241,17 +270,21 @@ walk:
241 --walker->level; 270 --walker->level;
242 } 271 }
243 272
244 if (!present || eperm || rsvd_fault) 273 if (unlikely(!present || eperm || rsvd_fault))
245 goto error; 274 goto error;
246 275
247 if (write_fault && !is_dirty_gpte(pte)) { 276 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
248 bool ret; 277 int ret;
249 278
250 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
251 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
252 pte|PT_DIRTY_MASK); 281 pte, pte|PT_DIRTY_MASK);
253 if (ret) 282 if (unlikely(ret < 0)) {
283 present = false;
284 goto error;
285 } else if (ret)
254 goto walk; 286 goto walk;
287
255 mark_page_dirty(vcpu->kvm, table_gfn); 288 mark_page_dirty(vcpu->kvm, table_gfn);
256 pte |= PT_DIRTY_MASK; 289 pte |= PT_DIRTY_MASK;
257 walker->ptes[walker->level - 1] = pte; 290 walker->ptes[walker->level - 1] = pte;
@@ -325,7 +358,7 @@ no_present:
325} 358}
326 359
327static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 360static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
328 u64 *spte, const void *pte, unsigned long mmu_seq) 361 u64 *spte, const void *pte)
329{ 362{
330 pt_element_t gpte; 363 pt_element_t gpte;
331 unsigned pte_access; 364 unsigned pte_access;
@@ -342,8 +375,6 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
342 kvm_release_pfn_clean(pfn); 375 kvm_release_pfn_clean(pfn);
343 return; 376 return;
344 } 377 }
345 if (mmu_notifier_retry(vcpu, mmu_seq))
346 return;
347 378
348 /* 379 /*
349 * we call mmu_set_spte() with host_writable = true because that 380 * we call mmu_set_spte() with host_writable = true because that
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 6bb15d583e47..506e4fe23adc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -63,6 +63,10 @@ MODULE_LICENSE("GPL");
63 63
64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
65 65
66#define TSC_RATIO_RSVD 0xffffff0000000000ULL
67#define TSC_RATIO_MIN 0x0000000000000001ULL
68#define TSC_RATIO_MAX 0x000000ffffffffffULL
69
66static bool erratum_383_found __read_mostly; 70static bool erratum_383_found __read_mostly;
67 71
68static const u32 host_save_user_msrs[] = { 72static const u32 host_save_user_msrs[] = {
@@ -93,14 +97,6 @@ struct nested_state {
93 /* A VMEXIT is required but not yet emulated */ 97 /* A VMEXIT is required but not yet emulated */
94 bool exit_required; 98 bool exit_required;
95 99
96 /*
97 * If we vmexit during an instruction emulation we need this to restore
98 * the l1 guest rip after the emulation
99 */
100 unsigned long vmexit_rip;
101 unsigned long vmexit_rsp;
102 unsigned long vmexit_rax;
103
104 /* cache for intercepts of the guest */ 100 /* cache for intercepts of the guest */
105 u32 intercept_cr; 101 u32 intercept_cr;
106 u32 intercept_dr; 102 u32 intercept_dr;
@@ -144,8 +140,13 @@ struct vcpu_svm {
144 unsigned int3_injected; 140 unsigned int3_injected;
145 unsigned long int3_rip; 141 unsigned long int3_rip;
146 u32 apf_reason; 142 u32 apf_reason;
143
144 u64 tsc_ratio;
147}; 145};
148 146
147static DEFINE_PER_CPU(u64, current_tsc_ratio);
148#define TSC_RATIO_DEFAULT 0x0100000000ULL
149
149#define MSR_INVALID 0xffffffffU 150#define MSR_INVALID 0xffffffffU
150 151
151static struct svm_direct_access_msrs { 152static struct svm_direct_access_msrs {
@@ -190,6 +191,7 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
190static int nested_svm_vmexit(struct vcpu_svm *svm); 191static int nested_svm_vmexit(struct vcpu_svm *svm);
191static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 192static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
192 bool has_error_code, u32 error_code); 193 bool has_error_code, u32 error_code);
194static u64 __scale_tsc(u64 ratio, u64 tsc);
193 195
194enum { 196enum {
195 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset, 197 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
@@ -376,7 +378,6 @@ struct svm_cpu_data {
376}; 378};
377 379
378static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 380static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
379static uint32_t svm_features;
380 381
381struct svm_init_data { 382struct svm_init_data {
382 int cpu; 383 int cpu;
@@ -569,6 +570,10 @@ static int has_svm(void)
569 570
570static void svm_hardware_disable(void *garbage) 571static void svm_hardware_disable(void *garbage)
571{ 572{
573 /* Make sure we clean up behind us */
574 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
575 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
576
572 cpu_svm_disable(); 577 cpu_svm_disable();
573} 578}
574 579
@@ -610,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
610 615
611 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 616 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
612 617
618 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
619 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
620 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
621 }
622
613 svm_init_erratum_383(); 623 svm_init_erratum_383();
614 624
615 return 0; 625 return 0;
@@ -791,6 +801,23 @@ static __init int svm_hardware_setup(void)
791 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 801 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
792 kvm_enable_efer_bits(EFER_FFXSR); 802 kvm_enable_efer_bits(EFER_FFXSR);
793 803
804 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
805 u64 max;
806
807 kvm_has_tsc_control = true;
808
809 /*
810 * Make sure the user can only configure tsc_khz values that
811 * fit into a signed integer.
 812	 * A min value is not needed because it will always
813 * be 1 on all machines and a value of 0 is used to disable
814 * tsc-scaling for the vcpu.
815 */
816 max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
817
818 kvm_max_guest_tsc_khz = max;
819 }
820
794 if (nested) { 821 if (nested) {
795 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 822 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
796 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 823 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
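
A worked bound, assuming a hypothetical 2.6 GHz host TSC: TSC_RATIO_MAX is an 8.32 fixed-point multiplier just below 256, so __scale_tsc(tsc_khz, TSC_RATIO_MAX) comes out to roughly 2,600,000 kHz * 256 ~= 665,600,000 kHz. That is already below the 0x7fffffff (INT_MAX) clamp, so the clamp only bites on hosts whose TSC runs faster than about 8.4 GHz.
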
@@ -802,8 +829,6 @@ static __init int svm_hardware_setup(void)
802 goto err; 829 goto err;
803 } 830 }
804 831
805 svm_features = cpuid_edx(SVM_CPUID_FUNC);
806
807 if (!boot_cpu_has(X86_FEATURE_NPT)) 832 if (!boot_cpu_has(X86_FEATURE_NPT))
808 npt_enabled = false; 833 npt_enabled = false;
809 834
@@ -854,6 +879,64 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
854 seg->base = 0; 879 seg->base = 0;
855} 880}
856 881
882static u64 __scale_tsc(u64 ratio, u64 tsc)
883{
884 u64 mult, frac, _tsc;
885
886 mult = ratio >> 32;
887 frac = ratio & ((1ULL << 32) - 1);
888
889 _tsc = tsc;
890 _tsc *= mult;
891 _tsc += (tsc >> 32) * frac;
892 _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
893
894 return _tsc;
895}
896
897static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900 u64 _tsc = tsc;
901
902 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
903 _tsc = __scale_tsc(svm->tsc_ratio, tsc);
904
905 return _tsc;
906}
907
908static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
909{
910 struct vcpu_svm *svm = to_svm(vcpu);
911 u64 ratio;
912 u64 khz;
913
914 /* TSC scaling supported? */
915 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
916 return;
917
918 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
919 if (user_tsc_khz == 0) {
920 vcpu->arch.virtual_tsc_khz = 0;
921 svm->tsc_ratio = TSC_RATIO_DEFAULT;
922 return;
923 }
924
925 khz = user_tsc_khz;
926
927 /* TSC scaling required - calculate ratio */
928 ratio = khz << 32;
929 do_div(ratio, tsc_khz);
930
931 if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
932 WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
933 user_tsc_khz);
934 return;
935 }
936 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
937 svm->tsc_ratio = ratio;
938}
939
857static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 940static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
858{ 941{
859 struct vcpu_svm *svm = to_svm(vcpu); 942 struct vcpu_svm *svm = to_svm(vcpu);
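
The ratio written to MSR_AMD64_TSC_RATIO is an 8.32 fixed-point multiplier, and __scale_tsc() splits it into its integer and fractional halves so that (ratio * tsc) >> 32 can be formed without a 128-bit intermediate. A minimal user-space sketch of the same arithmetic, with hypothetical frequencies (an illustration, not kernel code):

        #include <stdio.h>
        #include <stdint.h>

        /* Same decomposition as __scale_tsc(): ratio is 8.32 fixed point,
         * so the result is (ratio * tsc) >> 32 built from 64-bit products. */
        static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
        {
                uint64_t mult = ratio >> 32;            /* integer part    */
                uint64_t frac = ratio & 0xffffffffULL;  /* fractional part */

                return tsc * mult
                     + (tsc >> 32) * frac
                     + (((tsc & 0xffffffffULL) * frac) >> 32);
        }

        int main(void)
        {
                /* Hypothetical: guest asks for 1.0 GHz on a 2.6 GHz host. */
                uint64_t tsc_khz = 2600000, user_tsc_khz = 1000000;
                uint64_t ratio = (user_tsc_khz << 32) / tsc_khz;  /* as in svm_set_tsc_khz() */

                printf("ratio = %#llx\n", (unsigned long long)ratio);
                printf("host tick 2600000 kHz scales to %llu kHz\n",
                       (unsigned long long)scale_tsc(ratio, tsc_khz));
                return 0;
        }
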
@@ -880,6 +963,15 @@ static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
880 mark_dirty(svm->vmcb, VMCB_INTERCEPTS); 963 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
881} 964}
882 965
966static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
967{
968 u64 tsc;
969
970 tsc = svm_scale_tsc(vcpu, native_read_tsc());
971
972 return target_tsc - tsc;
973}
974
883static void init_vmcb(struct vcpu_svm *svm) 975static void init_vmcb(struct vcpu_svm *svm)
884{ 976{
885 struct vmcb_control_area *control = &svm->vmcb->control; 977 struct vmcb_control_area *control = &svm->vmcb->control;
@@ -975,7 +1067,7 @@ static void init_vmcb(struct vcpu_svm *svm)
975 svm_set_efer(&svm->vcpu, 0); 1067 svm_set_efer(&svm->vcpu, 0);
976 save->dr6 = 0xffff0ff0; 1068 save->dr6 = 0xffff0ff0;
977 save->dr7 = 0x400; 1069 save->dr7 = 0x400;
978 save->rflags = 2; 1070 kvm_set_rflags(&svm->vcpu, 2);
979 save->rip = 0x0000fff0; 1071 save->rip = 0x0000fff0;
980 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1072 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
981 1073
@@ -1048,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
1048 goto out; 1140 goto out;
1049 } 1141 }
1050 1142
1143 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1144
1051 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 1145 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
1052 if (err) 1146 if (err)
1053 goto free_svm; 1147 goto free_svm;
@@ -1141,6 +1235,12 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1141 1235
1142 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1236 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
1143 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1237 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1238
1239 if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1240 svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
1241 __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
1242 wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1243 }
1144} 1244}
1145 1245
1146static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1246static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -1365,31 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1365{ 1465{
1366 struct vcpu_svm *svm = to_svm(vcpu); 1466 struct vcpu_svm *svm = to_svm(vcpu);
1367 1467
1368 if (is_guest_mode(vcpu)) {
1369 /*
1370 * We are here because we run in nested mode, the host kvm
1371 * intercepts cr0 writes but the l1 hypervisor does not.
1372 * But the L1 hypervisor may intercept selective cr0 writes.
1373 * This needs to be checked here.
1374 */
1375 unsigned long old, new;
1376
1377 /* Remove bits that would trigger a real cr0 write intercept */
1378 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1379 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1380
1381 if (old == new) {
1382 /* cr0 write with ts and mp unchanged */
1383 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1384 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE) {
1385 svm->nested.vmexit_rip = kvm_rip_read(vcpu);
1386 svm->nested.vmexit_rsp = kvm_register_read(vcpu, VCPU_REGS_RSP);
1387 svm->nested.vmexit_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
1388 return;
1389 }
1390 }
1391 }
1392
1393#ifdef CONFIG_X86_64 1468#ifdef CONFIG_X86_64
1394 if (vcpu->arch.efer & EFER_LME) { 1469 if (vcpu->arch.efer & EFER_LME) {
1395 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1470 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -2127,7 +2202,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2127 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu); 2202 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
2128 nested_vmcb->save.cr2 = vmcb->save.cr2; 2203 nested_vmcb->save.cr2 = vmcb->save.cr2;
2129 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2204 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
2130 nested_vmcb->save.rflags = vmcb->save.rflags; 2205 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
2131 nested_vmcb->save.rip = vmcb->save.rip; 2206 nested_vmcb->save.rip = vmcb->save.rip;
2132 nested_vmcb->save.rsp = vmcb->save.rsp; 2207 nested_vmcb->save.rsp = vmcb->save.rsp;
2133 nested_vmcb->save.rax = vmcb->save.rax; 2208 nested_vmcb->save.rax = vmcb->save.rax;
@@ -2184,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
2184 svm->vmcb->save.ds = hsave->save.ds; 2259 svm->vmcb->save.ds = hsave->save.ds;
2185 svm->vmcb->save.gdtr = hsave->save.gdtr; 2260 svm->vmcb->save.gdtr = hsave->save.gdtr;
2186 svm->vmcb->save.idtr = hsave->save.idtr; 2261 svm->vmcb->save.idtr = hsave->save.idtr;
2187 svm->vmcb->save.rflags = hsave->save.rflags; 2262 kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
2188 svm_set_efer(&svm->vcpu, hsave->save.efer); 2263 svm_set_efer(&svm->vcpu, hsave->save.efer);
2189 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 2264 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
2190 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 2265 svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -2312,7 +2387,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2312 hsave->save.efer = svm->vcpu.arch.efer; 2387 hsave->save.efer = svm->vcpu.arch.efer;
2313 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2388 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2314 hsave->save.cr4 = svm->vcpu.arch.cr4; 2389 hsave->save.cr4 = svm->vcpu.arch.cr4;
2315 hsave->save.rflags = vmcb->save.rflags; 2390 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2316 hsave->save.rip = kvm_rip_read(&svm->vcpu); 2391 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2317 hsave->save.rsp = vmcb->save.rsp; 2392 hsave->save.rsp = vmcb->save.rsp;
2318 hsave->save.rax = vmcb->save.rax; 2393 hsave->save.rax = vmcb->save.rax;
@@ -2323,7 +2398,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2323 2398
2324 copy_vmcb_control_area(hsave, vmcb); 2399 copy_vmcb_control_area(hsave, vmcb);
2325 2400
2326 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 2401 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2327 svm->vcpu.arch.hflags |= HF_HIF_MASK; 2402 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2328 else 2403 else
2329 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2404 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
@@ -2341,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2341 svm->vmcb->save.ds = nested_vmcb->save.ds; 2416 svm->vmcb->save.ds = nested_vmcb->save.ds;
2342 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 2417 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2343 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 2418 svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2344 svm->vmcb->save.rflags = nested_vmcb->save.rflags; 2419 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2345 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 2420 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2346 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 2421 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2347 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 2422 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2443,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
2443 if (nested_svm_check_permissions(svm)) 2518 if (nested_svm_check_permissions(svm))
2444 return 1; 2519 return 1;
2445 2520
2446 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2447 skip_emulated_instruction(&svm->vcpu);
2448
2449 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2521 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2450 if (!nested_vmcb) 2522 if (!nested_vmcb)
2451 return 1; 2523 return 1;
2452 2524
2525 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2526 skip_emulated_instruction(&svm->vcpu);
2527
2453 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2528 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2454 nested_svm_unmap(page); 2529 nested_svm_unmap(page);
2455 2530
@@ -2464,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
2464 if (nested_svm_check_permissions(svm)) 2539 if (nested_svm_check_permissions(svm))
2465 return 1; 2540 return 1;
2466 2541
2467 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2468 skip_emulated_instruction(&svm->vcpu);
2469
2470 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2542 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2471 if (!nested_vmcb) 2543 if (!nested_vmcb)
2472 return 1; 2544 return 1;
2473 2545
2546 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2547 skip_emulated_instruction(&svm->vcpu);
2548
2474 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2549 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2475 nested_svm_unmap(page); 2550 nested_svm_unmap(page);
2476 2551
@@ -2676,6 +2751,29 @@ static int emulate_on_interception(struct vcpu_svm *svm)
2676 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE; 2751 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2677} 2752}
2678 2753
2754bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2755{
2756 unsigned long cr0 = svm->vcpu.arch.cr0;
2757 bool ret = false;
2758 u64 intercept;
2759
2760 intercept = svm->nested.intercept;
2761
2762 if (!is_guest_mode(&svm->vcpu) ||
2763 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2764 return false;
2765
2766 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2767 val &= ~SVM_CR0_SELECTIVE_MASK;
2768
2769 if (cr0 ^ val) {
2770 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2771 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2772 }
2773
2774 return ret;
2775}
2776
2679#define CR_VALID (1ULL << 63) 2777#define CR_VALID (1ULL << 63)
2680 2778
2681static int cr_interception(struct vcpu_svm *svm) 2779static int cr_interception(struct vcpu_svm *svm)
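
As an illustration of check_selective_cr0_intercepted(): SVM_CR0_SELECTIVE_MASK covers CR0.TS (0x8) and CR0.MP (0x2), so a guest-mode write that only toggles TS or MP leaves cr0 ^ val zero after masking and falls through to kvm_set_cr0(), while a write that also changes another bit, say CR0.CD (0x40000000), leaves a nonzero difference and hands SVM_EXIT_CR0_SEL_WRITE to nested_svm_exit_handled(), provided L1 enabled the selective CR0 intercept.
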
@@ -2699,7 +2797,11 @@ static int cr_interception(struct vcpu_svm *svm)
2699 val = kvm_register_read(&svm->vcpu, reg); 2797 val = kvm_register_read(&svm->vcpu, reg);
2700 switch (cr) { 2798 switch (cr) {
2701 case 0: 2799 case 0:
2702 err = kvm_set_cr0(&svm->vcpu, val); 2800 if (!check_selective_cr0_intercepted(svm, val))
2801 err = kvm_set_cr0(&svm->vcpu, val);
2802 else
2803 return 1;
2804
2703 break; 2805 break;
2704 case 3: 2806 case 3:
2705 err = kvm_set_cr3(&svm->vcpu, val); 2807 err = kvm_set_cr3(&svm->vcpu, val);
@@ -2744,23 +2846,6 @@ static int cr_interception(struct vcpu_svm *svm)
2744 return 1; 2846 return 1;
2745} 2847}
2746 2848
2747static int cr0_write_interception(struct vcpu_svm *svm)
2748{
2749 struct kvm_vcpu *vcpu = &svm->vcpu;
2750 int r;
2751
2752 r = cr_interception(svm);
2753
2754 if (svm->nested.vmexit_rip) {
2755 kvm_register_write(vcpu, VCPU_REGS_RIP, svm->nested.vmexit_rip);
2756 kvm_register_write(vcpu, VCPU_REGS_RSP, svm->nested.vmexit_rsp);
2757 kvm_register_write(vcpu, VCPU_REGS_RAX, svm->nested.vmexit_rax);
2758 svm->nested.vmexit_rip = 0;
2759 }
2760
2761 return r;
2762}
2763
2764static int dr_interception(struct vcpu_svm *svm) 2849static int dr_interception(struct vcpu_svm *svm)
2765{ 2850{
2766 int reg, dr; 2851 int reg, dr;
@@ -2813,7 +2898,9 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2813 case MSR_IA32_TSC: { 2898 case MSR_IA32_TSC: {
2814 struct vmcb *vmcb = get_host_vmcb(svm); 2899 struct vmcb *vmcb = get_host_vmcb(svm);
2815 2900
2816 *data = vmcb->control.tsc_offset + native_read_tsc(); 2901 *data = vmcb->control.tsc_offset +
2902 svm_scale_tsc(vcpu, native_read_tsc());
2903
2817 break; 2904 break;
2818 } 2905 }
2819 case MSR_STAR: 2906 case MSR_STAR:
@@ -3048,7 +3135,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3048 [SVM_EXIT_READ_CR4] = cr_interception, 3135 [SVM_EXIT_READ_CR4] = cr_interception,
3049 [SVM_EXIT_READ_CR8] = cr_interception, 3136 [SVM_EXIT_READ_CR8] = cr_interception,
3050 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3137 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
3051 [SVM_EXIT_WRITE_CR0] = cr0_write_interception, 3138 [SVM_EXIT_WRITE_CR0] = cr_interception,
3052 [SVM_EXIT_WRITE_CR3] = cr_interception, 3139 [SVM_EXIT_WRITE_CR3] = cr_interception,
3053 [SVM_EXIT_WRITE_CR4] = cr_interception, 3140 [SVM_EXIT_WRITE_CR4] = cr_interception,
3054 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3141 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
@@ -3104,97 +3191,109 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
3104 [SVM_EXIT_NPF] = pf_interception, 3191 [SVM_EXIT_NPF] = pf_interception,
3105}; 3192};
3106 3193
3107void dump_vmcb(struct kvm_vcpu *vcpu) 3194static void dump_vmcb(struct kvm_vcpu *vcpu)
3108{ 3195{
3109 struct vcpu_svm *svm = to_svm(vcpu); 3196 struct vcpu_svm *svm = to_svm(vcpu);
3110 struct vmcb_control_area *control = &svm->vmcb->control; 3197 struct vmcb_control_area *control = &svm->vmcb->control;
3111 struct vmcb_save_area *save = &svm->vmcb->save; 3198 struct vmcb_save_area *save = &svm->vmcb->save;
3112 3199
3113 pr_err("VMCB Control Area:\n"); 3200 pr_err("VMCB Control Area:\n");
3114 pr_err("cr_read: %04x\n", control->intercept_cr & 0xffff); 3201 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
3115 pr_err("cr_write: %04x\n", control->intercept_cr >> 16); 3202 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
3116 pr_err("dr_read: %04x\n", control->intercept_dr & 0xffff); 3203 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
3117 pr_err("dr_write: %04x\n", control->intercept_dr >> 16); 3204 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
3118 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3205 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
3119 pr_err("intercepts: %016llx\n", control->intercept); 3206 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
3120 pr_err("pause filter count: %d\n", control->pause_filter_count); 3207 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
3121 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); 3208 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
3122 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); 3209 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
3123 pr_err("tsc_offset: %016llx\n", control->tsc_offset); 3210 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
3124 pr_err("asid: %d\n", control->asid); 3211 pr_err("%-20s%d\n", "asid:", control->asid);
3125 pr_err("tlb_ctl: %d\n", control->tlb_ctl); 3212 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
3126 pr_err("int_ctl: %08x\n", control->int_ctl); 3213 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
3127 pr_err("int_vector: %08x\n", control->int_vector); 3214 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
3128 pr_err("int_state: %08x\n", control->int_state); 3215 pr_err("%-20s%08x\n", "int_state:", control->int_state);
3129 pr_err("exit_code: %08x\n", control->exit_code); 3216 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
3130 pr_err("exit_info1: %016llx\n", control->exit_info_1); 3217 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
3131 pr_err("exit_info2: %016llx\n", control->exit_info_2); 3218 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
3132 pr_err("exit_int_info: %08x\n", control->exit_int_info); 3219 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
3133 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); 3220 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
3134 pr_err("nested_ctl: %lld\n", control->nested_ctl); 3221 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
3135 pr_err("nested_cr3: %016llx\n", control->nested_cr3); 3222 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
3136 pr_err("event_inj: %08x\n", control->event_inj); 3223 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
3137 pr_err("event_inj_err: %08x\n", control->event_inj_err); 3224 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
3138 pr_err("lbr_ctl: %lld\n", control->lbr_ctl); 3225 pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
3139 pr_err("next_rip: %016llx\n", control->next_rip); 3226 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
3140 pr_err("VMCB State Save Area:\n"); 3227 pr_err("VMCB State Save Area:\n");
3141 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", 3228 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3142 save->es.selector, save->es.attrib, 3229 "es:",
3143 save->es.limit, save->es.base); 3230 save->es.selector, save->es.attrib,
3144 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", 3231 save->es.limit, save->es.base);
3145 save->cs.selector, save->cs.attrib, 3232 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3146 save->cs.limit, save->cs.base); 3233 "cs:",
3147 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", 3234 save->cs.selector, save->cs.attrib,
3148 save->ss.selector, save->ss.attrib, 3235 save->cs.limit, save->cs.base);
3149 save->ss.limit, save->ss.base); 3236 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3150 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", 3237 "ss:",
3151 save->ds.selector, save->ds.attrib, 3238 save->ss.selector, save->ss.attrib,
3152 save->ds.limit, save->ds.base); 3239 save->ss.limit, save->ss.base);
3153 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", 3240 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3154 save->fs.selector, save->fs.attrib, 3241 "ds:",
3155 save->fs.limit, save->fs.base); 3242 save->ds.selector, save->ds.attrib,
3156 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", 3243 save->ds.limit, save->ds.base);
3157 save->gs.selector, save->gs.attrib, 3244 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3158 save->gs.limit, save->gs.base); 3245 "fs:",
3159 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", 3246 save->fs.selector, save->fs.attrib,
3160 save->gdtr.selector, save->gdtr.attrib, 3247 save->fs.limit, save->fs.base);
3161 save->gdtr.limit, save->gdtr.base); 3248 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3162 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", 3249 "gs:",
3163 save->ldtr.selector, save->ldtr.attrib, 3250 save->gs.selector, save->gs.attrib,
3164 save->ldtr.limit, save->ldtr.base); 3251 save->gs.limit, save->gs.base);
3165 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", 3252 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3166 save->idtr.selector, save->idtr.attrib, 3253 "gdtr:",
3167 save->idtr.limit, save->idtr.base); 3254 save->gdtr.selector, save->gdtr.attrib,
3168 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", 3255 save->gdtr.limit, save->gdtr.base);
3169 save->tr.selector, save->tr.attrib, 3256 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3170 save->tr.limit, save->tr.base); 3257 "ldtr:",
3258 save->ldtr.selector, save->ldtr.attrib,
3259 save->ldtr.limit, save->ldtr.base);
3260 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3261 "idtr:",
3262 save->idtr.selector, save->idtr.attrib,
3263 save->idtr.limit, save->idtr.base);
3264 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3265 "tr:",
3266 save->tr.selector, save->tr.attrib,
3267 save->tr.limit, save->tr.base);
3171 pr_err("cpl: %d efer: %016llx\n", 3268 pr_err("cpl: %d efer: %016llx\n",
3172 save->cpl, save->efer); 3269 save->cpl, save->efer);
3173 pr_err("cr0: %016llx cr2: %016llx\n", 3270 pr_err("%-15s %016llx %-13s %016llx\n",
3174 save->cr0, save->cr2); 3271 "cr0:", save->cr0, "cr2:", save->cr2);
3175 pr_err("cr3: %016llx cr4: %016llx\n", 3272 pr_err("%-15s %016llx %-13s %016llx\n",
3176 save->cr3, save->cr4); 3273 "cr3:", save->cr3, "cr4:", save->cr4);
3177 pr_err("dr6: %016llx dr7: %016llx\n", 3274 pr_err("%-15s %016llx %-13s %016llx\n",
3178 save->dr6, save->dr7); 3275 "dr6:", save->dr6, "dr7:", save->dr7);
3179 pr_err("rip: %016llx rflags: %016llx\n", 3276 pr_err("%-15s %016llx %-13s %016llx\n",
3180 save->rip, save->rflags); 3277 "rip:", save->rip, "rflags:", save->rflags);
3181 pr_err("rsp: %016llx rax: %016llx\n", 3278 pr_err("%-15s %016llx %-13s %016llx\n",
3182 save->rsp, save->rax); 3279 "rsp:", save->rsp, "rax:", save->rax);
3183 pr_err("star: %016llx lstar: %016llx\n", 3280 pr_err("%-15s %016llx %-13s %016llx\n",
3184 save->star, save->lstar); 3281 "star:", save->star, "lstar:", save->lstar);
3185 pr_err("cstar: %016llx sfmask: %016llx\n", 3282 pr_err("%-15s %016llx %-13s %016llx\n",
3186 save->cstar, save->sfmask); 3283 "cstar:", save->cstar, "sfmask:", save->sfmask);
3187 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", 3284 pr_err("%-15s %016llx %-13s %016llx\n",
3188 save->kernel_gs_base, save->sysenter_cs); 3285 "kernel_gs_base:", save->kernel_gs_base,
3189 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", 3286 "sysenter_cs:", save->sysenter_cs);
3190 save->sysenter_esp, save->sysenter_eip); 3287 pr_err("%-15s %016llx %-13s %016llx\n",
3191 pr_err("gpat: %016llx dbgctl: %016llx\n", 3288 "sysenter_esp:", save->sysenter_esp,
3192 save->g_pat, save->dbgctl); 3289 "sysenter_eip:", save->sysenter_eip);
3193 pr_err("br_from: %016llx br_to: %016llx\n", 3290 pr_err("%-15s %016llx %-13s %016llx\n",
3194 save->br_from, save->br_to); 3291 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
3195 pr_err("excp_from: %016llx excp_to: %016llx\n", 3292 pr_err("%-15s %016llx %-13s %016llx\n",
3196 save->last_excp_from, save->last_excp_to); 3293 "br_from:", save->br_from, "br_to:", save->br_to);
3197 3294 pr_err("%-15s %016llx %-13s %016llx\n",
3295 "excp_from:", save->last_excp_from,
3296 "excp_to:", save->last_excp_to);
3198} 3297}
3199 3298
3200static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2) 3299static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
@@ -3384,7 +3483,7 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
3384 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 3483 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
3385 return 0; 3484 return 0;
3386 3485
3387 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3486 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3388 3487
3389 if (is_guest_mode(vcpu)) 3488 if (is_guest_mode(vcpu))
3390 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3489 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
@@ -3871,6 +3970,186 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3871 update_cr0_intercept(svm); 3970 update_cr0_intercept(svm);
3872} 3971}
3873 3972
3973#define PRE_EX(exit) { .exit_code = (exit), \
3974 .stage = X86_ICPT_PRE_EXCEPT, }
3975#define POST_EX(exit) { .exit_code = (exit), \
3976 .stage = X86_ICPT_POST_EXCEPT, }
3977#define POST_MEM(exit) { .exit_code = (exit), \
3978 .stage = X86_ICPT_POST_MEMACCESS, }
3979
3980static struct __x86_intercept {
3981 u32 exit_code;
3982 enum x86_intercept_stage stage;
3983} x86_intercept_map[] = {
3984 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
3985 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
3986 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
3987 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
3988 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3989 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
3990 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
3991 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
3992 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
3993 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
3994 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
3995 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
3996 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
3997 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
3998 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
3999 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4000 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4001 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4002 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4003 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4004 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4005 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4006 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
4007 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4008 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4009 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
4010 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4011 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4012 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4013 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4014 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4015 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4016 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4017 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4018 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
4019 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4020 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4021 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4022 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4023 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4024 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4025 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
4026 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4027 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4028 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4029 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
4030};
4031
4032#undef PRE_EX
4033#undef POST_EX
4034#undef POST_MEM
4035
4036static int svm_check_intercept(struct kvm_vcpu *vcpu,
4037 struct x86_instruction_info *info,
4038 enum x86_intercept_stage stage)
4039{
4040 struct vcpu_svm *svm = to_svm(vcpu);
4041 int vmexit, ret = X86EMUL_CONTINUE;
4042 struct __x86_intercept icpt_info;
4043 struct vmcb *vmcb = svm->vmcb;
4044
4045 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4046 goto out;
4047
4048 icpt_info = x86_intercept_map[info->intercept];
4049
4050 if (stage != icpt_info.stage)
4051 goto out;
4052
4053 switch (icpt_info.exit_code) {
4054 case SVM_EXIT_READ_CR0:
4055 if (info->intercept == x86_intercept_cr_read)
4056 icpt_info.exit_code += info->modrm_reg;
4057 break;
4058 case SVM_EXIT_WRITE_CR0: {
4059 unsigned long cr0, val;
4060 u64 intercept;
4061
4062 if (info->intercept == x86_intercept_cr_write)
4063 icpt_info.exit_code += info->modrm_reg;
4064
4065 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
4066 break;
4067
4068 intercept = svm->nested.intercept;
4069
4070 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4071 break;
4072
4073 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4074 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4075
4076 if (info->intercept == x86_intercept_lmsw) {
4077 cr0 &= 0xfUL;
4078 val &= 0xfUL;
4079 /* lmsw can't clear PE - catch this here */
4080 if (cr0 & X86_CR0_PE)
4081 val |= X86_CR0_PE;
4082 }
4083
4084 if (cr0 ^ val)
4085 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4086
4087 break;
4088 }
4089 case SVM_EXIT_READ_DR0:
4090 case SVM_EXIT_WRITE_DR0:
4091 icpt_info.exit_code += info->modrm_reg;
4092 break;
4093 case SVM_EXIT_MSR:
4094 if (info->intercept == x86_intercept_wrmsr)
4095 vmcb->control.exit_info_1 = 1;
4096 else
4097 vmcb->control.exit_info_1 = 0;
4098 break;
4099 case SVM_EXIT_PAUSE:
4100 /*
4101 * We get this for NOP only, but pause
 4102		 * is 'rep nop'; check for the rep prefix here
4103 */
4104 if (info->rep_prefix != REPE_PREFIX)
4105 goto out;
4106 case SVM_EXIT_IOIO: {
4107 u64 exit_info;
4108 u32 bytes;
4109
4110 exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
4111
4112 if (info->intercept == x86_intercept_in ||
4113 info->intercept == x86_intercept_ins) {
4114 exit_info |= SVM_IOIO_TYPE_MASK;
4115 bytes = info->src_bytes;
4116 } else {
4117 bytes = info->dst_bytes;
4118 }
4119
4120 if (info->intercept == x86_intercept_outs ||
4121 info->intercept == x86_intercept_ins)
4122 exit_info |= SVM_IOIO_STR_MASK;
4123
4124 if (info->rep_prefix)
4125 exit_info |= SVM_IOIO_REP_MASK;
4126
4127 bytes = min(bytes, 4u);
4128
4129 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4130
4131 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4132
4133 vmcb->control.exit_info_1 = exit_info;
4134 vmcb->control.exit_info_2 = info->next_rip;
4135
4136 break;
4137 }
4138 default:
4139 break;
4140 }
4141
4142 vmcb->control.next_rip = info->next_rip;
4143 vmcb->control.exit_code = icpt_info.exit_code;
4144 vmexit = nested_svm_exit_handled(svm);
4145
4146 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4147 : X86EMUL_CONTINUE;
4148
4149out:
4150 return ret;
4151}
4152
3874static struct kvm_x86_ops svm_x86_ops = { 4153static struct kvm_x86_ops svm_x86_ops = {
3875 .cpu_has_kvm_support = has_svm, 4154 .cpu_has_kvm_support = has_svm,
3876 .disabled_by_bios = is_disabled, 4155 .disabled_by_bios = is_disabled,
@@ -3952,10 +4231,14 @@ static struct kvm_x86_ops svm_x86_ops = {
3952 4231
3953 .has_wbinvd_exit = svm_has_wbinvd_exit, 4232 .has_wbinvd_exit = svm_has_wbinvd_exit,
3954 4233
4234 .set_tsc_khz = svm_set_tsc_khz,
3955 .write_tsc_offset = svm_write_tsc_offset, 4235 .write_tsc_offset = svm_write_tsc_offset,
3956 .adjust_tsc_offset = svm_adjust_tsc_offset, 4236 .adjust_tsc_offset = svm_adjust_tsc_offset,
4237 .compute_tsc_offset = svm_compute_tsc_offset,
3957 4238
3958 .set_tdp_cr3 = set_tdp_cr3, 4239 .set_tdp_cr3 = set_tdp_cr3,
4240
4241 .check_intercept = svm_check_intercept,
3959}; 4242};
3960 4243
3961static int __init svm_init(void) 4244static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 5b4cdcbd154c..4c3fa0f67469 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -128,8 +128,11 @@ struct vcpu_vmx {
128 unsigned long host_rsp; 128 unsigned long host_rsp;
129 int launched; 129 int launched;
130 u8 fail; 130 u8 fail;
131 u8 cpl;
132 bool nmi_known_unmasked;
131 u32 exit_intr_info; 133 u32 exit_intr_info;
132 u32 idt_vectoring_info; 134 u32 idt_vectoring_info;
135 ulong rflags;
133 struct shared_msr_entry *guest_msrs; 136 struct shared_msr_entry *guest_msrs;
134 int nmsrs; 137 int nmsrs;
135 int save_nmsrs; 138 int save_nmsrs;
@@ -159,6 +162,10 @@ struct vcpu_vmx {
159 u32 ar; 162 u32 ar;
160 } tr, es, ds, fs, gs; 163 } tr, es, ds, fs, gs;
161 } rmode; 164 } rmode;
165 struct {
166 u32 bitmask; /* 4 bits per segment (1 bit per field) */
167 struct kvm_save_segment seg[8];
168 } segment_cache;
162 int vpid; 169 int vpid;
163 bool emulation_required; 170 bool emulation_required;
164 171
@@ -171,6 +178,15 @@ struct vcpu_vmx {
171 bool rdtscp_enabled; 178 bool rdtscp_enabled;
172}; 179};
173 180
181enum segment_cache_field {
182 SEG_FIELD_SEL = 0,
183 SEG_FIELD_BASE = 1,
184 SEG_FIELD_LIMIT = 2,
185 SEG_FIELD_AR = 3,
186
187 SEG_FIELD_NR = 4
188};
189
174static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 190static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
175{ 191{
176 return container_of(vcpu, struct vcpu_vmx, vcpu); 192 return container_of(vcpu, struct vcpu_vmx, vcpu);
@@ -643,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
643 vmcs_writel(field, vmcs_readl(field) | mask); 659 vmcs_writel(field, vmcs_readl(field) | mask);
644} 660}
645 661
662static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
663{
664 vmx->segment_cache.bitmask = 0;
665}
666
667static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
668 unsigned field)
669{
670 bool ret;
671 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
672
673 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
674 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
675 vmx->segment_cache.bitmask = 0;
676 }
677 ret = vmx->segment_cache.bitmask & mask;
678 vmx->segment_cache.bitmask |= mask;
679 return ret;
680}
681
682static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
683{
684 u16 *p = &vmx->segment_cache.seg[seg].selector;
685
686 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
687 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
688 return *p;
689}
690
691static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
692{
693 ulong *p = &vmx->segment_cache.seg[seg].base;
694
695 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
696 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
697 return *p;
698}
699
700static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
701{
702 u32 *p = &vmx->segment_cache.seg[seg].limit;
703
704 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
705 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
706 return *p;
707}
708
709static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
710{
711 u32 *p = &vmx->segment_cache.seg[seg].ar;
712
713 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
714 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
715 return *p;
716}
717
646static void update_exception_bitmap(struct kvm_vcpu *vcpu) 718static void update_exception_bitmap(struct kvm_vcpu *vcpu)
647{ 719{
648 u32 eb; 720 u32 eb;
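
The new segment cache keeps one validity bit per (segment, field) pair: 8 segments times SEG_FIELD_NR (4) fields fills the 32-bit bitmask exactly, and vmx_segment_cache_test_set() reports whether a field was already cached while marking it valid for later reads. A tiny user-space sketch of the bit bookkeeping (illustrative values only):

        #include <stdio.h>

        enum { SEG_FIELD_SEL, SEG_FIELD_BASE, SEG_FIELD_LIMIT, SEG_FIELD_AR, SEG_FIELD_NR };

        int main(void)
        {
                unsigned int bitmask = 0;   /* invalidated whenever guest segment state may change */
                unsigned int seg = 2;       /* e.g. SS in KVM's VCPU_SREG_* ordering */
                unsigned int mask = 1u << (seg * SEG_FIELD_NR + SEG_FIELD_LIMIT);

                int was_cached = !!(bitmask & mask);  /* 0: first read must hit the VMCS */
                bitmask |= mask;                      /* subsequent reads use the cache  */

                printf("bit %u, was_cached=%d\n",
                       seg * SEG_FIELD_NR + SEG_FIELD_LIMIT, was_cached);
                return 0;
        }
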
@@ -970,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
970{ 1042{
971 unsigned long rflags, save_rflags; 1043 unsigned long rflags, save_rflags;
972 1044
973 rflags = vmcs_readl(GUEST_RFLAGS); 1045 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
974 if (to_vmx(vcpu)->rmode.vm86_active) { 1046 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
975 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1047 rflags = vmcs_readl(GUEST_RFLAGS);
976 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1048 if (to_vmx(vcpu)->rmode.vm86_active) {
977 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1049 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1050 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1051 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1052 }
1053 to_vmx(vcpu)->rflags = rflags;
978 } 1054 }
979 return rflags; 1055 return to_vmx(vcpu)->rflags;
980} 1056}
981 1057
982static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
983{ 1059{
1060 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1061 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1062 to_vmx(vcpu)->rflags = rflags;
984 if (to_vmx(vcpu)->rmode.vm86_active) { 1063 if (to_vmx(vcpu)->rmode.vm86_active) {
985 to_vmx(vcpu)->rmode.save_rflags = rflags; 1064 to_vmx(vcpu)->rmode.save_rflags = rflags;
986 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1065 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
@@ -1053,7 +1132,10 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1053 } 1132 }
1054 1133
1055 if (vmx->rmode.vm86_active) { 1134 if (vmx->rmode.vm86_active) {
1056 if (kvm_inject_realmode_interrupt(vcpu, nr) != EMULATE_DONE) 1135 int inc_eip = 0;
1136 if (kvm_exception_is_soft(nr))
1137 inc_eip = vcpu->arch.event_exit_inst_len;
1138 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1057 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 1139 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1058 return; 1140 return;
1059 } 1141 }
@@ -1151,6 +1233,16 @@ static u64 guest_read_tsc(void)
1151} 1233}
1152 1234
1153/* 1235/*
1236 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
1237 * ioctl. In this case the call-back should update internal vmx state to make
1238 * the changes effective.
1239 */
1240static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1241{
1242 /* Nothing to do here */
1243}
1244
1245/*
1154 * writes 'offset' into guest's timestamp counter offset register 1246 * writes 'offset' into guest's timestamp counter offset register
1155 */ 1247 */
1156static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset) 1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
@@ -1164,6 +1256,11 @@ static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1164 vmcs_write64(TSC_OFFSET, offset + adjustment); 1256 vmcs_write64(TSC_OFFSET, offset + adjustment);
1165} 1257}
1166 1258
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1260{
1261 return target_tsc - native_read_tsc();
1262}
1263
1167/* 1264/*
1168 * Reads an msr value (of 'msr_index') into 'pdata'. 1265 * Reads an msr value (of 'msr_index') into 'pdata'.
1169 * Returns 0 on success, non-0 otherwise. 1266 * Returns 0 on success, non-0 otherwise.
@@ -1243,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1243 break; 1340 break;
1244#ifdef CONFIG_X86_64 1341#ifdef CONFIG_X86_64
1245 case MSR_FS_BASE: 1342 case MSR_FS_BASE:
1343 vmx_segment_cache_clear(vmx);
1246 vmcs_writel(GUEST_FS_BASE, data); 1344 vmcs_writel(GUEST_FS_BASE, data);
1247 break; 1345 break;
1248 case MSR_GS_BASE: 1346 case MSR_GS_BASE:
1347 vmx_segment_cache_clear(vmx);
1249 vmcs_writel(GUEST_GS_BASE, data); 1348 vmcs_writel(GUEST_GS_BASE, data);
1250 break; 1349 break;
1251 case MSR_KERNEL_GS_BASE: 1350 case MSR_KERNEL_GS_BASE:
@@ -1689,6 +1788,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1689 vmx->emulation_required = 1; 1788 vmx->emulation_required = 1;
1690 vmx->rmode.vm86_active = 0; 1789 vmx->rmode.vm86_active = 0;
1691 1790
1791 vmx_segment_cache_clear(vmx);
1792
1692 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector); 1793 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1693 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1794 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1694 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1795 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
@@ -1712,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1712 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 1813 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1713 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 1814 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1714 1815
1816 vmx_segment_cache_clear(vmx);
1817
1715 vmcs_write16(GUEST_SS_SELECTOR, 0); 1818 vmcs_write16(GUEST_SS_SELECTOR, 0);
1716 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1819 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1717 1820
@@ -1775,6 +1878,8 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1775 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); 1878 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1776 } 1879 }
1777 1880
1881 vmx_segment_cache_clear(vmx);
1882
1778 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR); 1883 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1779 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1884 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1780 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1885 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
@@ -1851,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1851{ 1956{
1852 u32 guest_tr_ar; 1957 u32 guest_tr_ar;
1853 1958
1959 vmx_segment_cache_clear(to_vmx(vcpu));
1960
1854 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 1961 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1855 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 1962 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1856 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 1963 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1998,6 +2105,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1998 vmcs_writel(CR0_READ_SHADOW, cr0); 2105 vmcs_writel(CR0_READ_SHADOW, cr0);
1999 vmcs_writel(GUEST_CR0, hw_cr0); 2106 vmcs_writel(GUEST_CR0, hw_cr0);
2000 vcpu->arch.cr0 = cr0; 2107 vcpu->arch.cr0 = cr0;
2108 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2001} 2109}
2002 2110
2003static u64 construct_eptp(unsigned long root_hpa) 2111static u64 construct_eptp(unsigned long root_hpa)
@@ -2053,7 +2161,6 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2053 struct kvm_segment *var, int seg) 2161 struct kvm_segment *var, int seg)
2054{ 2162{
2055 struct vcpu_vmx *vmx = to_vmx(vcpu); 2163 struct vcpu_vmx *vmx = to_vmx(vcpu);
2056 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2057 struct kvm_save_segment *save; 2164 struct kvm_save_segment *save;
2058 u32 ar; 2165 u32 ar;
2059 2166
@@ -2075,13 +2182,13 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2075 var->limit = save->limit; 2182 var->limit = save->limit;
2076 ar = save->ar; 2183 ar = save->ar;
2077 if (seg == VCPU_SREG_TR 2184 if (seg == VCPU_SREG_TR
2078 || var->selector == vmcs_read16(sf->selector)) 2185 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2079 goto use_saved_rmode_seg; 2186 goto use_saved_rmode_seg;
2080 } 2187 }
2081 var->base = vmcs_readl(sf->base); 2188 var->base = vmx_read_guest_seg_base(vmx, seg);
2082 var->limit = vmcs_read32(sf->limit); 2189 var->limit = vmx_read_guest_seg_limit(vmx, seg);
2083 var->selector = vmcs_read16(sf->selector); 2190 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2084 ar = vmcs_read32(sf->ar_bytes); 2191 ar = vmx_read_guest_seg_ar(vmx, seg);
2085use_saved_rmode_seg: 2192use_saved_rmode_seg:
2086 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2193 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2087 ar = 0; 2194 ar = 0;
@@ -2098,27 +2205,37 @@ use_saved_rmode_seg:
2098 2205
2099static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg) 2206static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2100{ 2207{
2101 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2102 struct kvm_segment s; 2208 struct kvm_segment s;
2103 2209
2104 if (to_vmx(vcpu)->rmode.vm86_active) { 2210 if (to_vmx(vcpu)->rmode.vm86_active) {
2105 vmx_get_segment(vcpu, &s, seg); 2211 vmx_get_segment(vcpu, &s, seg);
2106 return s.base; 2212 return s.base;
2107 } 2213 }
2108 return vmcs_readl(sf->base); 2214 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
2109} 2215}
2110 2216
2111static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2217static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
2112{ 2218{
2113 if (!is_protmode(vcpu)) 2219 if (!is_protmode(vcpu))
2114 return 0; 2220 return 0;
2115 2221
2116 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 2222 if (!is_long_mode(vcpu)
2223 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
2117 return 3; 2224 return 3;
2118 2225
2119 return vmcs_read16(GUEST_CS_SELECTOR) & 3; 2226 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
2120} 2227}
2121 2228
2229static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2230{
2231 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
2232 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2233 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
2234 }
2235 return to_vmx(vcpu)->cpl;
2236}
2237
2238
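__vmx_get_cpl() above encodes the usual privilege rules: KVM treats real mode as CPL 0, virtual-8086 code always runs at CPL 3, and otherwise the CPL is the RPL in the low two bits of the CS selector; the vmx_get_cpl() wrapper then memoises the result under VCPU_EXREG_CPL until a CR0, RFLAGS or segment write clears the bit. A stand-alone restatement of the derivation rule, with illustrative names:

    static int cpl_from_guest_state(int protected_mode, int vm86, unsigned short cs)
    {
    	if (!protected_mode)
    		return 0;	/* real mode is handled as CPL 0 */
    	if (vm86)
    		return 3;	/* virtual-8086 code always runs at CPL 3 */
    	return cs & 3;		/* otherwise CPL is the RPL of the CS selector */
    }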
2122static u32 vmx_segment_access_rights(struct kvm_segment *var) 2239static u32 vmx_segment_access_rights(struct kvm_segment *var)
2123{ 2240{
2124 u32 ar; 2241 u32 ar;
@@ -2148,6 +2265,8 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2148 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2265 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2149 u32 ar; 2266 u32 ar;
2150 2267
2268 vmx_segment_cache_clear(vmx);
2269
2151 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2270 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2152 vmcs_write16(sf->selector, var->selector); 2271 vmcs_write16(sf->selector, var->selector);
2153 vmx->rmode.tr.selector = var->selector; 2272 vmx->rmode.tr.selector = var->selector;
@@ -2184,11 +2303,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2184 ar |= 0x1; /* Accessed */ 2303 ar |= 0x1; /* Accessed */
2185 2304
2186 vmcs_write32(sf->ar_bytes, ar); 2305 vmcs_write32(sf->ar_bytes, ar);
2306 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2187} 2307}
2188 2308
2189static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2309static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2190{ 2310{
2191 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); 2311 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
2192 2312
2193 *db = (ar >> 14) & 1; 2313 *db = (ar >> 14) & 1;
2194 *l = (ar >> 13) & 1; 2314 *l = (ar >> 13) & 1;
@@ -2775,6 +2895,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2775 if (ret != 0) 2895 if (ret != 0)
2776 goto out; 2896 goto out;
2777 2897
2898 vmx_segment_cache_clear(vmx);
2899
2778 seg_setup(VCPU_SREG_CS); 2900 seg_setup(VCPU_SREG_CS);
2779 /* 2901 /*
2780 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2902 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2904,7 +3026,10 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2904 3026
2905 ++vcpu->stat.irq_injections; 3027 ++vcpu->stat.irq_injections;
2906 if (vmx->rmode.vm86_active) { 3028 if (vmx->rmode.vm86_active) {
2907 if (kvm_inject_realmode_interrupt(vcpu, irq) != EMULATE_DONE) 3029 int inc_eip = 0;
3030 if (vcpu->arch.interrupt.soft)
3031 inc_eip = vcpu->arch.event_exit_inst_len;
3032 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
2908 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3033 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2909 return; 3034 return;
2910 } 3035 }
@@ -2937,8 +3062,9 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2937 } 3062 }
2938 3063
2939 ++vcpu->stat.nmi_injections; 3064 ++vcpu->stat.nmi_injections;
3065 vmx->nmi_known_unmasked = false;
2940 if (vmx->rmode.vm86_active) { 3066 if (vmx->rmode.vm86_active) {
2941 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR) != EMULATE_DONE) 3067 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
2942 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 3068 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2943 return; 3069 return;
2944 } 3070 }
@@ -2961,6 +3087,8 @@ static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2961{ 3087{
2962 if (!cpu_has_virtual_nmis()) 3088 if (!cpu_has_virtual_nmis())
2963 return to_vmx(vcpu)->soft_vnmi_blocked; 3089 return to_vmx(vcpu)->soft_vnmi_blocked;
3090 if (to_vmx(vcpu)->nmi_known_unmasked)
3091 return false;
2964 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 3092 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2965} 3093}
2966 3094
@@ -2974,6 +3102,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2974 vmx->vnmi_blocked_time = 0; 3102 vmx->vnmi_blocked_time = 0;
2975 } 3103 }
2976 } else { 3104 } else {
3105 vmx->nmi_known_unmasked = !masked;
2977 if (masked) 3106 if (masked)
2978 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3107 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2979 GUEST_INTR_STATE_NMI); 3108 GUEST_INTR_STATE_NMI);
@@ -3091,7 +3220,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3091 enum emulation_result er; 3220 enum emulation_result er;
3092 3221
3093 vect_info = vmx->idt_vectoring_info; 3222 vect_info = vmx->idt_vectoring_info;
3094 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3223 intr_info = vmx->exit_intr_info;
3095 3224
3096 if (is_machine_check(intr_info)) 3225 if (is_machine_check(intr_info))
3097 return handle_machine_check(vcpu); 3226 return handle_machine_check(vcpu);
@@ -3122,7 +3251,6 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3122 } 3251 }
3123 3252
3124 error_code = 0; 3253 error_code = 0;
3125 rip = kvm_rip_read(vcpu);
3126 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 3254 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3127 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 3255 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3128 if (is_page_fault(intr_info)) { 3256 if (is_page_fault(intr_info)) {
@@ -3169,6 +3297,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3169 vmx->vcpu.arch.event_exit_inst_len = 3297 vmx->vcpu.arch.event_exit_inst_len =
3170 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3298 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3171 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3299 kvm_run->exit_reason = KVM_EXIT_DEBUG;
3300 rip = kvm_rip_read(vcpu);
3172 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 3301 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3173 kvm_run->debug.arch.exception = ex_no; 3302 kvm_run->debug.arch.exception = ex_no;
3174 break; 3303 break;
@@ -3505,9 +3634,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3505 switch (type) { 3634 switch (type) {
3506 case INTR_TYPE_NMI_INTR: 3635 case INTR_TYPE_NMI_INTR:
3507 vcpu->arch.nmi_injected = false; 3636 vcpu->arch.nmi_injected = false;
3508 if (cpu_has_virtual_nmis()) 3637 vmx_set_nmi_mask(vcpu, true);
3509 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3510 GUEST_INTR_STATE_NMI);
3511 break; 3638 break;
3512 case INTR_TYPE_EXT_INTR: 3639 case INTR_TYPE_EXT_INTR:
3513 case INTR_TYPE_SOFT_INTR: 3640 case INTR_TYPE_SOFT_INTR:
@@ -3867,12 +3994,17 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3867 3994
3868static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx) 3995static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3869{ 3996{
3870 u32 exit_intr_info = vmx->exit_intr_info; 3997 u32 exit_intr_info;
3998
3999 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
4000 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
4001 return;
4002
4003 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4004 exit_intr_info = vmx->exit_intr_info;
3871 4005
3872 /* Handle machine checks before interrupts are enabled */ 4006 /* Handle machine checks before interrupts are enabled */
3873 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 4007 if (is_machine_check(exit_intr_info))
3874 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3875 && is_machine_check(exit_intr_info)))
3876 kvm_machine_check(); 4008 kvm_machine_check();
3877 4009
3878 /* We need to handle NMIs before interrupts are enabled */ 4010 /* We need to handle NMIs before interrupts are enabled */
@@ -3886,7 +4018,7 @@ static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3886 4018
3887static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx) 4019static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3888{ 4020{
3889 u32 exit_intr_info = vmx->exit_intr_info; 4021 u32 exit_intr_info;
3890 bool unblock_nmi; 4022 bool unblock_nmi;
3891 u8 vector; 4023 u8 vector;
3892 bool idtv_info_valid; 4024 bool idtv_info_valid;
@@ -3894,6 +4026,13 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3894 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK; 4026 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3895 4027
3896 if (cpu_has_virtual_nmis()) { 4028 if (cpu_has_virtual_nmis()) {
4029 if (vmx->nmi_known_unmasked)
4030 return;
4031 /*
4032 * Can't use vmx->exit_intr_info since we're not sure what
4033 * the exit reason is.
4034 */
4035 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3897 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 4036 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3898 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 4037 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3899 /* 4038 /*
@@ -3910,6 +4049,10 @@ static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
3910 vector != DF_VECTOR && !idtv_info_valid) 4049 vector != DF_VECTOR && !idtv_info_valid)
3911 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4050 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3912 GUEST_INTR_STATE_NMI); 4051 GUEST_INTR_STATE_NMI);
4052 else
4053 vmx->nmi_known_unmasked =
4054 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
4055 & GUEST_INTR_STATE_NMI);
3913 } else if (unlikely(vmx->soft_vnmi_blocked)) 4056 } else if (unlikely(vmx->soft_vnmi_blocked))
3914 vmx->vnmi_blocked_time += 4057 vmx->vnmi_blocked_time +=
3915 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 4058 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
@@ -3946,8 +4089,7 @@ static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
3946 * Clear bit "block by NMI" before VM entry if a NMI 4089 * Clear bit "block by NMI" before VM entry if a NMI
3947 * delivery faulted. 4090 * delivery faulted.
3948 */ 4091 */
3949 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4092 vmx_set_nmi_mask(&vmx->vcpu, false);
3950 GUEST_INTR_STATE_NMI);
3951 break; 4093 break;
3952 case INTR_TYPE_SOFT_EXCEPTION: 4094 case INTR_TYPE_SOFT_EXCEPTION:
3953 vmx->vcpu.arch.event_exit_inst_len = 4095 vmx->vcpu.arch.event_exit_inst_len =
@@ -4124,7 +4266,10 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4124 ); 4266 );
4125 4267
4126 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4268 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4269 | (1 << VCPU_EXREG_RFLAGS)
4270 | (1 << VCPU_EXREG_CPL)
4127 | (1 << VCPU_EXREG_PDPTR) 4271 | (1 << VCPU_EXREG_PDPTR)
4272 | (1 << VCPU_EXREG_SEGMENTS)
4128 | (1 << VCPU_EXREG_CR3)); 4273 | (1 << VCPU_EXREG_CR3));
4129 vcpu->arch.regs_dirty = 0; 4274 vcpu->arch.regs_dirty = 0;
4130 4275
@@ -4134,7 +4279,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
4134 vmx->launched = 1; 4279 vmx->launched = 1;
4135 4280
4136 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 4281 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4137 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4138 4282
4139 vmx_complete_atomic_exit(vmx); 4283 vmx_complete_atomic_exit(vmx);
4140 vmx_recover_nmi_blocking(vmx); 4284 vmx_recover_nmi_blocking(vmx);
@@ -4195,8 +4339,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4195 goto free_vcpu; 4339 goto free_vcpu;
4196 4340
4197 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 4341 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4342 err = -ENOMEM;
4198 if (!vmx->guest_msrs) { 4343 if (!vmx->guest_msrs) {
4199 err = -ENOMEM;
4200 goto uninit_vcpu; 4344 goto uninit_vcpu;
4201 } 4345 }
4202 4346
@@ -4215,7 +4359,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4215 if (err) 4359 if (err)
4216 goto free_vmcs; 4360 goto free_vmcs;
4217 if (vm_need_virtualize_apic_accesses(kvm)) 4361 if (vm_need_virtualize_apic_accesses(kvm))
4218 if (alloc_apic_access_page(kvm) != 0) 4362 err = alloc_apic_access_page(kvm);
4363 if (err)
4219 goto free_vmcs; 4364 goto free_vmcs;
4220 4365
4221 if (enable_ept) { 4366 if (enable_ept) {
@@ -4368,6 +4513,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4368{ 4513{
4369} 4514}
4370 4515
4516static int vmx_check_intercept(struct kvm_vcpu *vcpu,
4517 struct x86_instruction_info *info,
4518 enum x86_intercept_stage stage)
4519{
4520 return X86EMUL_CONTINUE;
4521}
4522
4371static struct kvm_x86_ops vmx_x86_ops = { 4523static struct kvm_x86_ops vmx_x86_ops = {
4372 .cpu_has_kvm_support = cpu_has_kvm_support, 4524 .cpu_has_kvm_support = cpu_has_kvm_support,
4373 .disabled_by_bios = vmx_disabled_by_bios, 4525 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4449,10 +4601,14 @@ static struct kvm_x86_ops vmx_x86_ops = {
4449 4601
4450 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4602 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4451 4603
4604 .set_tsc_khz = vmx_set_tsc_khz,
4452 .write_tsc_offset = vmx_write_tsc_offset, 4605 .write_tsc_offset = vmx_write_tsc_offset,
4453 .adjust_tsc_offset = vmx_adjust_tsc_offset, 4606 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4607 .compute_tsc_offset = vmx_compute_tsc_offset,
4454 4608
4455 .set_tdp_cr3 = vmx_set_cr3, 4609 .set_tdp_cr3 = vmx_set_cr3,
4610
4611 .check_intercept = vmx_check_intercept,
4456}; 4612};
4457 4613
4458static int __init vmx_init(void) 4614static int __init vmx_init(void)
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 934b4c6b0bf9..77c9d8673dc4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -60,22 +60,12 @@
60#include <asm/div64.h> 60#include <asm/div64.h>
61 61
62#define MAX_IO_MSRS 256 62#define MAX_IO_MSRS 256
63#define CR0_RESERVED_BITS \
64 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
65 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
66 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
67#define CR4_RESERVED_BITS \
68 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
69 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
70 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
71 | X86_CR4_OSXSAVE \
72 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
73
74#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
75
76#define KVM_MAX_MCE_BANKS 32 63#define KVM_MAX_MCE_BANKS 32
77#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P) 64#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
78 65
66#define emul_to_vcpu(ctxt) \
67 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
68
79/* EFER defaults: 69/* EFER defaults:
 80 * - enable syscall per default because it's emulated by KVM 70 * - enable syscall per default because it's emulated by KVM
81 * - enable LME and LMA per default on 64 bit KVM 71 * - enable LME and LMA per default on 64 bit KVM
@@ -100,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
100int ignore_msrs = 0; 90int ignore_msrs = 0;
101module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 91module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
102 92
93bool kvm_has_tsc_control;
94EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
95u32 kvm_max_guest_tsc_khz;
96EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
97
103#define KVM_NR_SHARED_MSRS 16 98#define KVM_NR_SHARED_MSRS 16
104 99
105struct kvm_shared_msrs_global { 100struct kvm_shared_msrs_global {
@@ -157,6 +152,8 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
157 152
158u64 __read_mostly host_xcr0; 153u64 __read_mostly host_xcr0;
159 154
155int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
156
160static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu) 157static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
161{ 158{
162 int i; 159 int i;
@@ -361,8 +358,8 @@ void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
361 358
362void kvm_inject_nmi(struct kvm_vcpu *vcpu) 359void kvm_inject_nmi(struct kvm_vcpu *vcpu)
363{ 360{
364 kvm_make_request(KVM_REQ_NMI, vcpu);
365 kvm_make_request(KVM_REQ_EVENT, vcpu); 361 kvm_make_request(KVM_REQ_EVENT, vcpu);
362 vcpu->arch.nmi_pending = 1;
366} 363}
367EXPORT_SYMBOL_GPL(kvm_inject_nmi); 364EXPORT_SYMBOL_GPL(kvm_inject_nmi);
368 365
@@ -982,7 +979,15 @@ static inline int kvm_tsc_changes_freq(void)
982 return ret; 979 return ret;
983} 980}
984 981
985static inline u64 nsec_to_cycles(u64 nsec) 982static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
983{
984 if (vcpu->arch.virtual_tsc_khz)
985 return vcpu->arch.virtual_tsc_khz;
986 else
987 return __this_cpu_read(cpu_tsc_khz);
988}
989
990static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
986{ 991{
987 u64 ret; 992 u64 ret;
988 993
@@ -990,25 +995,24 @@ static inline u64 nsec_to_cycles(u64 nsec)
990 if (kvm_tsc_changes_freq()) 995 if (kvm_tsc_changes_freq())
991 printk_once(KERN_WARNING 996 printk_once(KERN_WARNING
992 "kvm: unreliable cycle conversion on adjustable rate TSC\n"); 997 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
993 ret = nsec * __this_cpu_read(cpu_tsc_khz); 998 ret = nsec * vcpu_tsc_khz(vcpu);
994 do_div(ret, USEC_PER_SEC); 999 do_div(ret, USEC_PER_SEC);
995 return ret; 1000 return ret;
996} 1001}
997 1002
998static void kvm_arch_set_tsc_khz(struct kvm *kvm, u32 this_tsc_khz) 1003static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
999{ 1004{
1000 /* Compute a scale to convert nanoseconds in TSC cycles */ 1005 /* Compute a scale to convert nanoseconds in TSC cycles */
1001 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000, 1006 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1002 &kvm->arch.virtual_tsc_shift, 1007 &vcpu->arch.tsc_catchup_shift,
1003 &kvm->arch.virtual_tsc_mult); 1008 &vcpu->arch.tsc_catchup_mult);
1004 kvm->arch.virtual_tsc_khz = this_tsc_khz;
1005} 1009}
1006 1010
1007static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) 1011static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1008{ 1012{
1009 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec, 1013 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1010 vcpu->kvm->arch.virtual_tsc_mult, 1014 vcpu->arch.tsc_catchup_mult,
1011 vcpu->kvm->arch.virtual_tsc_shift); 1015 vcpu->arch.tsc_catchup_shift);
1012 tsc += vcpu->arch.last_tsc_write; 1016 tsc += vcpu->arch.last_tsc_write;
1013 return tsc; 1017 return tsc;
1014} 1018}
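With the per-vcpu rate in place, nsec_to_cycles() scales by vcpu_tsc_khz(): since the frequency is in kHz, cycles = ns * tsc_khz / USEC_PER_SEC (the kernel uses do_div() because this also runs on 32-bit hosts). A quick worked example with an assumed 2.8 GHz guest clock:

    #include <linux/types.h>

    static u64 example_nsec_to_cycles(void)
    {
    	u64 ns      = 1000000;		/* 1 ms expressed in nanoseconds      */
    	u64 tsc_khz = 2800000;		/* assumed guest rate: 2.8 GHz in kHz */

    	/* 1e6 ns * 2.8e6 kHz / 1e6 = 2,800,000 cycles per millisecond */
    	return ns * tsc_khz / 1000000;	/* 1000000 == USEC_PER_SEC */
    }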
@@ -1021,7 +1025,7 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021 s64 sdiff; 1025 s64 sdiff;
1022 1026
1023 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); 1027 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1024 offset = data - native_read_tsc(); 1028 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1025 ns = get_kernel_ns(); 1029 ns = get_kernel_ns();
1026 elapsed = ns - kvm->arch.last_tsc_nsec; 1030 elapsed = ns - kvm->arch.last_tsc_nsec;
1027 sdiff = data - kvm->arch.last_tsc_write; 1031 sdiff = data - kvm->arch.last_tsc_write;
@@ -1037,13 +1041,13 @@ void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1037 * In that case, for a reliable TSC, we can match TSC offsets, 1041 * In that case, for a reliable TSC, we can match TSC offsets,
1038 * or make a best guess using elapsed value. 1042 * or make a best guess using elapsed value.
1039 */ 1043 */
1040 if (sdiff < nsec_to_cycles(5ULL * NSEC_PER_SEC) && 1044 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1041 elapsed < 5ULL * NSEC_PER_SEC) { 1045 elapsed < 5ULL * NSEC_PER_SEC) {
1042 if (!check_tsc_unstable()) { 1046 if (!check_tsc_unstable()) {
1043 offset = kvm->arch.last_tsc_offset; 1047 offset = kvm->arch.last_tsc_offset;
1044 pr_debug("kvm: matched tsc offset for %llu\n", data); 1048 pr_debug("kvm: matched tsc offset for %llu\n", data);
1045 } else { 1049 } else {
1046 u64 delta = nsec_to_cycles(elapsed); 1050 u64 delta = nsec_to_cycles(vcpu, elapsed);
1047 offset += delta; 1051 offset += delta;
1048 pr_debug("kvm: adjusted tsc offset by %llu\n", delta); 1052 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1049 } 1053 }
@@ -1075,8 +1079,7 @@ static int kvm_guest_time_update(struct kvm_vcpu *v)
1075 local_irq_save(flags); 1079 local_irq_save(flags);
1076 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp); 1080 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1077 kernel_ns = get_kernel_ns(); 1081 kernel_ns = get_kernel_ns();
1078 this_tsc_khz = __this_cpu_read(cpu_tsc_khz); 1082 this_tsc_khz = vcpu_tsc_khz(v);
1079
1080 if (unlikely(this_tsc_khz == 0)) { 1083 if (unlikely(this_tsc_khz == 0)) {
1081 local_irq_restore(flags); 1084 local_irq_restore(flags);
1082 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v); 1085 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
@@ -1993,6 +1996,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1993 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1996 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1994 case KVM_CAP_XSAVE: 1997 case KVM_CAP_XSAVE:
1995 case KVM_CAP_ASYNC_PF: 1998 case KVM_CAP_ASYNC_PF:
1999 case KVM_CAP_GET_TSC_KHZ:
1996 r = 1; 2000 r = 1;
1997 break; 2001 break;
1998 case KVM_CAP_COALESCED_MMIO: 2002 case KVM_CAP_COALESCED_MMIO:
@@ -2019,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext)
2019 case KVM_CAP_XCRS: 2023 case KVM_CAP_XCRS:
2020 r = cpu_has_xsave; 2024 r = cpu_has_xsave;
2021 break; 2025 break;
2026 case KVM_CAP_TSC_CONTROL:
2027 r = kvm_has_tsc_control;
2028 break;
2022 default: 2029 default:
2023 r = 0; 2030 r = 0;
2024 break; 2031 break;
@@ -2120,8 +2127,13 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
2120 kvm_x86_ops->vcpu_load(vcpu, cpu); 2127 kvm_x86_ops->vcpu_load(vcpu, cpu);
2121 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) { 2128 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
2122 /* Make sure TSC doesn't go backwards */ 2129 /* Make sure TSC doesn't go backwards */
2123 s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : 2130 s64 tsc_delta;
2124 native_read_tsc() - vcpu->arch.last_host_tsc; 2131 u64 tsc;
2132
2133 kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
2134 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2135 tsc - vcpu->arch.last_guest_tsc;
2136
2125 if (tsc_delta < 0) 2137 if (tsc_delta < 0)
2126 mark_tsc_unstable("KVM discovered backwards TSC"); 2138 mark_tsc_unstable("KVM discovered backwards TSC");
2127 if (check_tsc_unstable()) { 2139 if (check_tsc_unstable()) {
@@ -2139,7 +2151,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
2139{ 2151{
2140 kvm_x86_ops->vcpu_put(vcpu); 2152 kvm_x86_ops->vcpu_put(vcpu);
2141 kvm_put_guest_fpu(vcpu); 2153 kvm_put_guest_fpu(vcpu);
2142 vcpu->arch.last_host_tsc = native_read_tsc(); 2154 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
2143} 2155}
2144 2156
2145static int is_efer_nx(void) 2157static int is_efer_nx(void)
@@ -2324,6 +2336,12 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2324 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) | 2336 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2325 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM); 2337 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2326 2338
2339 /* cpuid 0xC0000001.edx */
2340 const u32 kvm_supported_word5_x86_features =
2341 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN);
2344
2327 /* all calls to cpuid_count() should be made on the same cpu */ 2345 /* all calls to cpuid_count() should be made on the same cpu */
2328 get_cpu(); 2346 get_cpu();
2329 do_cpuid_1_ent(entry, function, index); 2347 do_cpuid_1_ent(entry, function, index);
@@ -2418,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2418 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2436 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2419 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2437 (1 << KVM_FEATURE_NOP_IO_DELAY) |
2420 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2438 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) |
2421 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2422 entry->ebx = 0; 2441 entry->ebx = 0;
2423 entry->ecx = 0; 2442 entry->ecx = 0;
@@ -2432,6 +2451,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2432 entry->ecx &= kvm_supported_word6_x86_features; 2451 entry->ecx &= kvm_supported_word6_x86_features;
2433 cpuid_mask(&entry->ecx, 6); 2452 cpuid_mask(&entry->ecx, 6);
2434 break; 2453 break;
2454 /*Add support for Centaur's CPUID instruction*/
2455 case 0xC0000000:
2456 /*Just support up to 0xC0000004 now*/
2457 entry->eax = min(entry->eax, 0xC0000004);
2458 break;
2459 case 0xC0000001:
2460 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5);
2462 break;
2463 case 0xC0000002:
2464 case 0xC0000003:
2465 case 0xC0000004:
2466 /*Now nothing to do, reserved for the future*/
2467 break;
2435 } 2468 }
2436 2469
2437 kvm_x86_ops->set_supported_cpuid(function, entry); 2470 kvm_x86_ops->set_supported_cpuid(function, entry);
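The Centaur (VIA) leaves added here live at 0xC0000000-0xC0000004: leaf 0xC0000000 reports the highest supported Centaur leaf in EAX, and leaf 0xC0000001 EDX carries the PadLock feature bits that kvm_supported_word5_x86_features whitelists. A guest-side probe might look like this sketch (uses GCC's __cpuid macro from <cpuid.h>; illustrative only):

    #include <stdio.h>
    #include <cpuid.h>

    static void probe_centaur_leaves(void)
    {
    	unsigned int eax, ebx, ecx, edx;

    	__cpuid(0xC0000000, eax, ebx, ecx, edx);	/* max Centaur leaf */
    	if (eax < 0xC0000001)
    		return;					/* range not exposed */

    	__cpuid(0xC0000001, eax, ebx, ecx, edx);
    	/* edx holds XSTORE/XCRYPT/ACE2/PHE/PMM bits, already masked by KVM */
    	printf("PadLock feature word: 0x%08x\n", edx);
    }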
@@ -2478,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2478 if (nent >= cpuid->nent) 2511 if (nent >= cpuid->nent)
2479 goto out_free; 2512 goto out_free;
2480 2513
2514 /* Add support for Centaur's CPUID instruction. */
2515 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2516 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2517 &nent, cpuid->nent);
2518
2519 r = -E2BIG;
2520 if (nent >= cpuid->nent)
2521 goto out_free;
2522
2523 limit = cpuid_entries[nent - 1].eax;
2524 for (func = 0xC0000001;
2525 func <= limit && nent < cpuid->nent; ++func)
2526 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2527 &nent, cpuid->nent);
2528
2529 r = -E2BIG;
2530 if (nent >= cpuid->nent)
2531 goto out_free;
2532 }
2533
2481 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2534 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2482 cpuid->nent); 2535 cpuid->nent);
2483 2536
@@ -3046,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
3046 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3099 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
3047 break; 3100 break;
3048 } 3101 }
3102 case KVM_SET_TSC_KHZ: {
3103 u32 user_tsc_khz;
3104
3105 r = -EINVAL;
3106 if (!kvm_has_tsc_control)
3107 break;
3108
3109 user_tsc_khz = (u32)arg;
3110
3111 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3112 goto out;
3113
3114 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
3115
3116 r = 0;
3117 goto out;
3118 }
3119 case KVM_GET_TSC_KHZ: {
3120 r = -EIO;
3121 if (check_tsc_unstable())
3122 goto out;
3123
3124 r = vcpu_tsc_khz(vcpu);
3125
3126 goto out;
3127 }
3049 default: 3128 default:
3050 r = -EINVAL; 3129 r = -EINVAL;
3051 } 3130 }
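From userspace the two new vcpu ioctls are used roughly as sketched below. This is a hedged example: it assumes a vcpu fd obtained via KVM_CREATE_VCPU and a <linux/kvm.h> that already defines KVM_GET_TSC_KHZ/KVM_SET_TSC_KHZ; per the code above, KVM_SET_TSC_KHZ is only honoured when KVM_CAP_TSC_CONTROL is reported and the requested rate stays below the advertised maximum.

    #include <stdio.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    static void probe_and_set_tsc_khz(int vcpu_fd)
    {
    	int khz = ioctl(vcpu_fd, KVM_GET_TSC_KHZ, 0);	/* <0 if the TSC is unstable */

    	if (khz > 0)
    		printf("guest TSC currently runs at %d kHz\n", khz);

    	/* Request a 1 GHz guest TSC; fails with EINVAL unless the host
    	 * supports hardware TSC scaling (KVM_CAP_TSC_CONTROL). */
    	if (ioctl(vcpu_fd, KVM_SET_TSC_KHZ, 1000000UL) < 0)
    		perror("KVM_SET_TSC_KHZ");
    }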
@@ -3595,20 +3674,43 @@ static void kvm_init_msr_list(void)
3595static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3674static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3596 const void *v) 3675 const void *v)
3597{ 3676{
3598 if (vcpu->arch.apic && 3677 int handled = 0;
3599 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3678 int n;
3600 return 0;
3601 3679
3602 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3680 do {
3681 n = min(len, 8);
3682 if (!(vcpu->arch.apic &&
3683 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3684 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3685 break;
3686 handled += n;
3687 addr += n;
3688 len -= n;
3689 v += n;
3690 } while (len);
3691
3692 return handled;
3603} 3693}
3604 3694
3605static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3695static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3606{ 3696{
3607 if (vcpu->arch.apic && 3697 int handled = 0;
3608 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3698 int n;
3609 return 0; 3699
3700 do {
3701 n = min(len, 8);
3702 if (!(vcpu->arch.apic &&
3703 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3704 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3705 break;
3706 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3707 handled += n;
3708 addr += n;
3709 len -= n;
3710 v += n;
3711 } while (len);
3610 3712
3611 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3713 return handled;
3612} 3714}
3613 3715
3614static void kvm_set_segment(struct kvm_vcpu *vcpu, 3716static void kvm_set_segment(struct kvm_vcpu *vcpu,
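vcpu_mmio_write()/vcpu_mmio_read() now walk the request in chunks of at most eight bytes and return how many bytes an in-kernel device actually handled; emulator_read/write_emulated below use that count to forward only the unhandled tail to userspace as an MMIO exit. The generic shape of the loop, with placeholder names:

    static int chunked_mmio_write(unsigned long addr, const void *buf, int len,
    			      int (*dev_write)(unsigned long addr, int n,
    					       const void *data))
    {
    	const unsigned char *p = buf;
    	int handled = 0;

    	while (len) {
    		int n = len < 8 ? len : 8;

    		if (dev_write(addr, n, p))	/* non-zero: no device claimed it */
    			break;
    		handled += n;
    		addr += n;
    		p += n;
    		len -= n;
    	}
    	return handled;	/* bytes consumed in kernel; the rest exits to userspace */
    }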
@@ -3703,37 +3805,43 @@ out:
3703} 3805}
3704 3806
3705/* used for instruction fetching */ 3807/* used for instruction fetching */
3706static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3808static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3707 struct kvm_vcpu *vcpu, 3809 gva_t addr, void *val, unsigned int bytes,
3708 struct x86_exception *exception) 3810 struct x86_exception *exception)
3709{ 3811{
3812 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3710 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3813 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3814
3711 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3815 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3712 access | PFERR_FETCH_MASK, 3816 access | PFERR_FETCH_MASK,
3713 exception); 3817 exception);
3714} 3818}
3715 3819
3716static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3717 struct kvm_vcpu *vcpu, 3821 gva_t addr, void *val, unsigned int bytes,
3718 struct x86_exception *exception) 3822 struct x86_exception *exception)
3719{ 3823{
3824 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3720 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3825 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3826
3721 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3722 exception); 3828 exception);
3723} 3829}
3724 3830
3725static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3726 struct kvm_vcpu *vcpu, 3832 gva_t addr, void *val, unsigned int bytes,
3727 struct x86_exception *exception) 3833 struct x86_exception *exception)
3728{ 3834{
3835 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3729 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception); 3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3730} 3837}
3731 3838
3732static int kvm_write_guest_virt_system(gva_t addr, void *val, 3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val,
3733 unsigned int bytes, 3841 unsigned int bytes,
3734 struct kvm_vcpu *vcpu,
3735 struct x86_exception *exception) 3842 struct x86_exception *exception)
3736{ 3843{
3844 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3737 void *data = val; 3845 void *data = val;
3738 int r = X86EMUL_CONTINUE; 3846 int r = X86EMUL_CONTINUE;
3739 3847
@@ -3761,13 +3869,15 @@ out:
3761 return r; 3869 return r;
3762} 3870}
3763 3871
3764static int emulator_read_emulated(unsigned long addr, 3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr,
3765 void *val, 3874 void *val,
3766 unsigned int bytes, 3875 unsigned int bytes,
3767 struct x86_exception *exception, 3876 struct x86_exception *exception)
3768 struct kvm_vcpu *vcpu)
3769{ 3877{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3770 gpa_t gpa; 3879 gpa_t gpa;
3880 int handled;
3771 3881
3772 if (vcpu->mmio_read_completed) { 3882 if (vcpu->mmio_read_completed) {
3773 memcpy(val, vcpu->mmio_data, bytes); 3883 memcpy(val, vcpu->mmio_data, bytes);
@@ -3786,7 +3896,7 @@ static int emulator_read_emulated(unsigned long addr,
3786 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3787 goto mmio; 3897 goto mmio;
3788 3898
3789 if (kvm_read_guest_virt(addr, val, bytes, vcpu, exception) 3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
3790 == X86EMUL_CONTINUE) 3900 == X86EMUL_CONTINUE)
3791 return X86EMUL_CONTINUE; 3901 return X86EMUL_CONTINUE;
3792 3902
@@ -3794,18 +3904,24 @@ mmio:
3794 /* 3904 /*
3795 * Is this MMIO handled locally? 3905 * Is this MMIO handled locally?
3796 */ 3906 */
3797 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3907 handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
3798 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3908
3909 if (handled == bytes)
3799 return X86EMUL_CONTINUE; 3910 return X86EMUL_CONTINUE;
3800 } 3911
3912 gpa += handled;
3913 bytes -= handled;
3914 val += handled;
3801 3915
3802 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3916 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3803 3917
3804 vcpu->mmio_needed = 1; 3918 vcpu->mmio_needed = 1;
3805 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3919 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3806 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3920 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3807 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3921 vcpu->mmio_size = bytes;
3922 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3808 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3923 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3924 vcpu->mmio_index = 0;
3809 3925
3810 return X86EMUL_IO_NEEDED; 3926 return X86EMUL_IO_NEEDED;
3811} 3927}
@@ -3829,6 +3945,7 @@ static int emulator_write_emulated_onepage(unsigned long addr,
3829 struct kvm_vcpu *vcpu) 3945 struct kvm_vcpu *vcpu)
3830{ 3946{
3831 gpa_t gpa; 3947 gpa_t gpa;
3948 int handled;
3832 3949
3833 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception); 3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3834 3951
@@ -3847,25 +3964,35 @@ mmio:
3847 /* 3964 /*
3848 * Is this MMIO handled locally? 3965 * Is this MMIO handled locally?
3849 */ 3966 */
3850 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3967 handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
3968 if (handled == bytes)
3851 return X86EMUL_CONTINUE; 3969 return X86EMUL_CONTINUE;
3852 3970
3971 gpa += handled;
3972 bytes -= handled;
3973 val += handled;
3974
3853 vcpu->mmio_needed = 1; 3975 vcpu->mmio_needed = 1;
3976 memcpy(vcpu->mmio_data, val, bytes);
3854 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3977 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3855 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3978 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3856 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3979 vcpu->mmio_size = bytes;
3980 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3857 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3981 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3858 memcpy(vcpu->run->mmio.data, val, bytes); 3982 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
3983 vcpu->mmio_index = 0;
3859 3984
3860 return X86EMUL_CONTINUE; 3985 return X86EMUL_CONTINUE;
3861} 3986}
3862 3987
3863int emulator_write_emulated(unsigned long addr, 3988int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
3989 unsigned long addr,
3864 const void *val, 3990 const void *val,
3865 unsigned int bytes, 3991 unsigned int bytes,
3866 struct x86_exception *exception, 3992 struct x86_exception *exception)
3867 struct kvm_vcpu *vcpu)
3868{ 3993{
3994 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3995
3869 /* Crossing a page boundary? */ 3996 /* Crossing a page boundary? */
3870 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3997 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3871 int rc, now; 3998 int rc, now;
@@ -3893,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr,
3893 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4020 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3894#endif 4021#endif
3895 4022
3896static int emulator_cmpxchg_emulated(unsigned long addr, 4023static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4024 unsigned long addr,
3897 const void *old, 4025 const void *old,
3898 const void *new, 4026 const void *new,
3899 unsigned int bytes, 4027 unsigned int bytes,
3900 struct x86_exception *exception, 4028 struct x86_exception *exception)
3901 struct kvm_vcpu *vcpu)
3902{ 4029{
4030 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3903 gpa_t gpa; 4031 gpa_t gpa;
3904 struct page *page; 4032 struct page *page;
3905 char *kaddr; 4033 char *kaddr;
@@ -3955,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3955emul_write: 4083emul_write:
3956 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4084 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3957 4085
3958 return emulator_write_emulated(addr, new, bytes, exception, vcpu); 4086 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
3959} 4087}
3960 4088
3961static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4089static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3974,9 +4102,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3974} 4102}
3975 4103
3976 4104
3977static int emulator_pio_in_emulated(int size, unsigned short port, void *val, 4105static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
3978 unsigned int count, struct kvm_vcpu *vcpu) 4106 int size, unsigned short port, void *val,
4107 unsigned int count)
3979{ 4108{
4109 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4110
3980 if (vcpu->arch.pio.count) 4111 if (vcpu->arch.pio.count)
3981 goto data_avail; 4112 goto data_avail;
3982 4113
@@ -4004,10 +4135,12 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
4004 return 0; 4135 return 0;
4005} 4136}
4006 4137
4007static int emulator_pio_out_emulated(int size, unsigned short port, 4138static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
4008 const void *val, unsigned int count, 4139 int size, unsigned short port,
4009 struct kvm_vcpu *vcpu) 4140 const void *val, unsigned int count)
4010{ 4141{
4142 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4143
4011 trace_kvm_pio(1, port, size, count); 4144 trace_kvm_pio(1, port, size, count);
4012 4145
4013 vcpu->arch.pio.port = port; 4146 vcpu->arch.pio.port = port;
@@ -4037,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
4037 return kvm_x86_ops->get_segment_base(vcpu, seg); 4170 return kvm_x86_ops->get_segment_base(vcpu, seg);
4038} 4171}
4039 4172
4040int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 4173static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
4041{ 4174{
4042 kvm_mmu_invlpg(vcpu, address); 4175 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
4043 return X86EMUL_CONTINUE;
4044} 4176}
4045 4177
4046int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4178int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -4062,22 +4194,20 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
4062} 4194}
4063EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4195EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
4064 4196
4065int emulate_clts(struct kvm_vcpu *vcpu) 4197static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
4066{ 4198{
4067 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4199 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
4068 kvm_x86_ops->fpu_activate(vcpu);
4069 return X86EMUL_CONTINUE;
4070} 4200}
4071 4201
4072int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 4202int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
4073{ 4203{
4074 return _kvm_get_dr(vcpu, dr, dest); 4204 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
4075} 4205}
4076 4206
4077int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 4207int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
4078{ 4208{
4079 4209
4080 return __kvm_set_dr(vcpu, dr, value); 4210 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
4081} 4211}
4082 4212
4083static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4213static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -4085,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
4085 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4215 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
4086} 4216}
4087 4217
4088static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 4218static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
4089{ 4219{
4220 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4090 unsigned long value; 4221 unsigned long value;
4091 4222
4092 switch (cr) { 4223 switch (cr) {
@@ -4113,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
4113 return value; 4244 return value;
4114} 4245}
4115 4246
4116static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 4247static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
4117{ 4248{
4249 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4118 int res = 0; 4250 int res = 0;
4119 4251
4120 switch (cr) { 4252 switch (cr) {
@@ -4141,33 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
4141 return res; 4273 return res;
4142} 4274}
4143 4275
4144static int emulator_get_cpl(struct kvm_vcpu *vcpu) 4276static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
4277{
4278 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
4279}
4280
4281static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4282{
4283 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
4284}
4285
4286static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4145{ 4287{
4146 return kvm_x86_ops->get_cpl(vcpu); 4288 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
4147} 4289}
4148 4290
4149static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4291static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4150{ 4292{
4151 kvm_x86_ops->get_gdt(vcpu, dt); 4293 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4152} 4294}
4153 4295
4154static void emulator_get_idt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4296static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4155{ 4297{
4156 kvm_x86_ops->get_idt(vcpu, dt); 4298 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4157} 4299}
4158 4300
4159static unsigned long emulator_get_cached_segment_base(int seg, 4301static unsigned long emulator_get_cached_segment_base(
4160 struct kvm_vcpu *vcpu) 4302 struct x86_emulate_ctxt *ctxt, int seg)
4161{ 4303{
4162 return get_segment_base(vcpu, seg); 4304 return get_segment_base(emul_to_vcpu(ctxt), seg);
4163} 4305}
4164 4306
4165static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3, 4307static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4166 int seg, struct kvm_vcpu *vcpu) 4308 struct desc_struct *desc, u32 *base3,
4309 int seg)
4167{ 4310{
4168 struct kvm_segment var; 4311 struct kvm_segment var;
4169 4312
4170 kvm_get_segment(vcpu, &var, seg); 4313 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4314 *selector = var.selector;
4171 4315
4172 if (var.unusable) 4316 if (var.unusable)
4173 return false; 4317 return false;
@@ -4192,14 +4336,14 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, u32 *base3,
4192 return true; 4336 return true;
4193} 4337}
4194 4338
4195static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3, 4339static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
4196 int seg, struct kvm_vcpu *vcpu) 4340 struct desc_struct *desc, u32 base3,
4341 int seg)
4197{ 4342{
4343 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4198 struct kvm_segment var; 4344 struct kvm_segment var;
4199 4345
4200 /* needed to preserve selector */ 4346 var.selector = selector;
4201 kvm_get_segment(vcpu, &var, seg);
4202
4203 var.base = get_desc_base(desc); 4347 var.base = get_desc_base(desc);
4204#ifdef CONFIG_X86_64 4348#ifdef CONFIG_X86_64
4205 var.base |= ((u64)base3) << 32; 4349 var.base |= ((u64)base3) << 32;
@@ -4223,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, u32 base3,
4223 return; 4367 return;
4224} 4368}
4225 4369
4226static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 4370static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4371 u32 msr_index, u64 *pdata)
4227{ 4372{
4228 struct kvm_segment kvm_seg; 4373 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4374}
4229 4375
4230 kvm_get_segment(vcpu, &kvm_seg, seg); 4376static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
4231 return kvm_seg.selector; 4377 u32 msr_index, u64 data)
4378{
4379 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4232} 4380}
4233 4381
4234static void emulator_set_segment_selector(u16 sel, int seg, 4382static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4235 struct kvm_vcpu *vcpu)
4236{ 4383{
4237 struct kvm_segment kvm_seg; 4384 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4385}
4238 4386
4239 kvm_get_segment(vcpu, &kvm_seg, seg); 4387static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4240 kvm_seg.selector = sel; 4388{
4241 kvm_set_segment(vcpu, &kvm_seg, seg); 4389 preempt_disable();
4390 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4391 /*
4392 * CR0.TS may reference the host fpu state, not the guest fpu state,
4393 * so it may be clear at this point.
4394 */
4395 clts();
4396}
4397
4398static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
4399{
4400 preempt_enable();
4401}
4402
4403static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
4404 struct x86_instruction_info *info,
4405 enum x86_intercept_stage stage)
4406{
4407 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
4242} 4408}
4243 4409
4244static struct x86_emulate_ops emulate_ops = { 4410static struct x86_emulate_ops emulate_ops = {
@@ -4248,22 +4414,29 @@ static struct x86_emulate_ops emulate_ops = {
4248 .read_emulated = emulator_read_emulated, 4414 .read_emulated = emulator_read_emulated,
4249 .write_emulated = emulator_write_emulated, 4415 .write_emulated = emulator_write_emulated,
4250 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4416 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4417 .invlpg = emulator_invlpg,
4251 .pio_in_emulated = emulator_pio_in_emulated, 4418 .pio_in_emulated = emulator_pio_in_emulated,
4252 .pio_out_emulated = emulator_pio_out_emulated, 4419 .pio_out_emulated = emulator_pio_out_emulated,
4253 .get_cached_descriptor = emulator_get_cached_descriptor, 4420 .get_segment = emulator_get_segment,
4254 .set_cached_descriptor = emulator_set_cached_descriptor, 4421 .set_segment = emulator_set_segment,
4255 .get_segment_selector = emulator_get_segment_selector,
4256 .set_segment_selector = emulator_set_segment_selector,
4257 .get_cached_segment_base = emulator_get_cached_segment_base, 4422 .get_cached_segment_base = emulator_get_cached_segment_base,
4258 .get_gdt = emulator_get_gdt, 4423 .get_gdt = emulator_get_gdt,
4259 .get_idt = emulator_get_idt, 4424 .get_idt = emulator_get_idt,
4425 .set_gdt = emulator_set_gdt,
4426 .set_idt = emulator_set_idt,
4260 .get_cr = emulator_get_cr, 4427 .get_cr = emulator_get_cr,
4261 .set_cr = emulator_set_cr, 4428 .set_cr = emulator_set_cr,
4262 .cpl = emulator_get_cpl, 4429 .cpl = emulator_get_cpl,
4263 .get_dr = emulator_get_dr, 4430 .get_dr = emulator_get_dr,
4264 .set_dr = emulator_set_dr, 4431 .set_dr = emulator_set_dr,
4265 .set_msr = kvm_set_msr, 4432 .set_msr = emulator_set_msr,
4266 .get_msr = kvm_get_msr, 4433 .get_msr = emulator_get_msr,
4434 .halt = emulator_halt,
4435 .wbinvd = emulator_wbinvd,
4436 .fix_hypercall = emulator_fix_hypercall,
4437 .get_fpu = emulator_get_fpu,
4438 .put_fpu = emulator_put_fpu,
4439 .intercept = emulator_intercept,
4267}; 4440};
4268 4441
4269static void cache_all_regs(struct kvm_vcpu *vcpu) 4442static void cache_all_regs(struct kvm_vcpu *vcpu)
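Every emulator callback now takes only the x86_emulate_ctxt and recovers its vcpu through emul_to_vcpu(), a container_of() over the emulate_ctxt embedded in kvm_vcpu_arch (the macro is defined near the top of this x86.c hunk). A minimal stand-alone illustration of that idiom, with invented struct names:

    #include <linux/kernel.h>	/* container_of() */

    struct ctx {
    	int mode;
    };

    struct owner {
    	int id;
    	struct ctx ctx;		/* embedded, like vcpu->arch.emulate_ctxt */
    };

    /* Given only the embedded member, walk back to the enclosing object --
     * the same trick emul_to_vcpu(ctxt) uses to find the struct kvm_vcpu. */
    static struct owner *owner_of(struct ctx *c)
    {
    	return container_of(c, struct owner, ctx);
    }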
@@ -4305,12 +4478,17 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4305 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4306 int cs_db, cs_l; 4479 int cs_db, cs_l;
4307 4480
4481 /*
4482 * TODO: fix emulate.c to use guest_read/write_register
 4483 * instead of direct ->regs accesses, can save hundreds of cycles
 4484 * on Intel for instructions that don't read/change RSP,
 4485 * for example.
4486 */
4308 cache_all_regs(vcpu); 4487 cache_all_regs(vcpu);
4309 4488
4310 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4311 4490
4312 vcpu->arch.emulate_ctxt.vcpu = vcpu; 4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
4313 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
4314 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu); 4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4315 vcpu->arch.emulate_ctxt.mode = 4493 vcpu->arch.emulate_ctxt.mode =
4316 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : 4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
@@ -4318,11 +4496,13 @@ static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4318 ? X86EMUL_MODE_VM86 : cs_l 4496 ? X86EMUL_MODE_VM86 : cs_l
4319 ? X86EMUL_MODE_PROT64 : cs_db 4497 ? X86EMUL_MODE_PROT64 : cs_db
4320 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
4321 memset(c, 0, sizeof(struct decode_cache)); 4500 memset(c, 0, sizeof(struct decode_cache));
4322 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4323} 4503}
4324 4504
4325int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq) 4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4326{ 4506{
4327 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4328 int ret; 4508 int ret;
@@ -4331,7 +4511,8 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4331 4511
4332 vcpu->arch.emulate_ctxt.decode.op_bytes = 2; 4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4333 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2; 4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4334 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip; 4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
4515 inc_eip;
4335 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq); 4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4336 4517
4337 if (ret != X86EMUL_CONTINUE) 4518 if (ret != X86EMUL_CONTINUE)
@@ -4340,7 +4521,7 @@ int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq)
4340 vcpu->arch.emulate_ctxt.eip = c->eip; 4521 vcpu->arch.emulate_ctxt.eip = c->eip;
4341 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4342 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4343 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4344 4525
4345 if (irq == NMI_VECTOR) 4526 if (irq == NMI_VECTOR)
4346 vcpu->arch.nmi_pending = false; 4527 vcpu->arch.nmi_pending = false;
@@ -4402,16 +4583,9 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4402{ 4583{
4403 int r; 4584 int r;
4404 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4586 bool writeback = true;
4405 4587
4406 kvm_clear_exception_queue(vcpu); 4588 kvm_clear_exception_queue(vcpu);
4407 vcpu->arch.mmio_fault_cr2 = cr2;
4408 /*
4409 * TODO: fix emulate.c to use guest_read/write_register
4410 * instead of direct ->regs accesses, can save hundred cycles
4411 * on Intel for instructions that don't read/change RSP, for
4412 * for example.
4413 */
4414 cache_all_regs(vcpu);
4415 4589
4416 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
4417 init_emulate_ctxt(vcpu); 4591 init_emulate_ctxt(vcpu);
@@ -4442,13 +4616,19 @@ int x86_emulate_instruction(struct kvm_vcpu *vcpu,
4442 return EMULATE_DONE; 4616 return EMULATE_DONE;
4443 } 4617 }
4444 4618
4445 /* this is needed for vmware backdor interface to work since it 4619 /* this is needed for vmware backdoor interface to work since it
4446 changes registers values during IO operation */ 4620 changes registers values during IO operation */
4447 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4624 }
4448 4625
4449restart: 4626restart:
4450 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt); 4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4451 4628
4629 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE;
4631
4452 if (r == EMULATION_FAILED) { 4632 if (r == EMULATION_FAILED) {
4453 if (reexecute_instruction(vcpu, cr2)) 4633 if (reexecute_instruction(vcpu, cr2))
4454 return EMULATE_DONE; 4634 return EMULATE_DONE;
@@ -4462,21 +4642,28 @@ restart:
4462 } else if (vcpu->arch.pio.count) { 4642 } else if (vcpu->arch.pio.count) {
4463 if (!vcpu->arch.pio.in) 4643 if (!vcpu->arch.pio.in)
4464 vcpu->arch.pio.count = 0; 4644 vcpu->arch.pio.count = 0;
4645 else
4646 writeback = false;
4465 r = EMULATE_DO_MMIO; 4647 r = EMULATE_DO_MMIO;
4466 } else if (vcpu->mmio_needed) { 4648 } else if (vcpu->mmio_needed) {
4467 if (vcpu->mmio_is_write) 4649 if (!vcpu->mmio_is_write)
4468 vcpu->mmio_needed = 0; 4650 writeback = false;
4469 r = EMULATE_DO_MMIO; 4651 r = EMULATE_DO_MMIO;
4470 } else if (r == EMULATION_RESTART) 4652 } else if (r == EMULATION_RESTART)
4471 goto restart; 4653 goto restart;
4472 else 4654 else
4473 r = EMULATE_DONE; 4655 r = EMULATE_DONE;
4474 4656
4475 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4657 if (writeback) {
4476 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 4658 toggle_interruptibility(vcpu,
4477 kvm_make_request(KVM_REQ_EVENT, vcpu); 4659 vcpu->arch.emulate_ctxt.interruptibility);
4478 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4479 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 4661 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4665 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4480 4667
4481 return r; 4668 return r;
4482} 4669}
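The writeback flag introduced above defers copying emulator state back into the vcpu while a PIO-in or MMIO read is still outstanding, and records that fact in emulate_regs_need_sync_to_vcpu so the state is committed once the data arrives. A rough sketch of the pattern, detached from the kernel types (struct layout is simplified for illustration):

    /* Sketch of the deferred-writeback pattern used above (illustrative only). */
    #include <string.h>

    struct ctx  { unsigned long regs[16], rip, rflags; };
    struct vcpu { unsigned long regs[16], rip, rflags; int need_sync_to_vcpu; };

    static void finish_emulation(struct vcpu *v, struct ctx *c, int waiting_for_data)
    {
        if (!waiting_for_data) {
            /* instruction fully emulated: commit registers, rip and flags */
            memcpy(v->regs, c->regs, sizeof(v->regs));
            v->rip = c->rip;
            v->rflags = c->rflags;
            v->need_sync_to_vcpu = 0;
        } else {
            /* PIO-in/MMIO read pending: emulator copy stays authoritative */
            v->need_sync_to_vcpu = 1;
        }
    }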
@@ -4485,7 +4672,8 @@ EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4485int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4672int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4486{ 4673{
4487 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4674 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4488 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); 4675 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4676 size, port, &val, 1);
4489 /* do not return to emulator after return from userspace */ 4677 /* do not return to emulator after return from userspace */
4490 vcpu->arch.pio.count = 0; 4678 vcpu->arch.pio.count = 0;
4491 return ret; 4679 return ret;
@@ -4879,8 +5067,9 @@ out:
4879} 5067}
4880EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5068EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
4881 5069
4882int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 5070int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
4883{ 5071{
5072 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4884 char instruction[3]; 5073 char instruction[3];
4885 unsigned long rip = kvm_rip_read(vcpu); 5074 unsigned long rip = kvm_rip_read(vcpu);
4886 5075
@@ -4893,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4893 5082
4894 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5083 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4895 5084
4896 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
4897} 5086 rip, instruction, 3, NULL);
4898
4899void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4900{
4901 struct desc_ptr dt = { limit, base };
4902
4903 kvm_x86_ops->set_gdt(vcpu, &dt);
4904}
4905
4906void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4907{
4908 struct desc_ptr dt = { limit, base };
4909
4910 kvm_x86_ops->set_idt(vcpu, &dt);
4911} 5087}
4912 5088
4913static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -5170,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
5170static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5346static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5171{ 5347{
5172 int r; 5348 int r;
5349 bool nmi_pending;
5173 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5350 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
5174 vcpu->run->request_interrupt_window; 5351 vcpu->run->request_interrupt_window;
5175 5352
@@ -5207,19 +5384,25 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
5207 r = 1; 5384 r = 1;
5208 goto out; 5385 goto out;
5209 } 5386 }
5210 if (kvm_check_request(KVM_REQ_NMI, vcpu))
5211 vcpu->arch.nmi_pending = true;
5212 } 5387 }
5213 5388
5214 r = kvm_mmu_reload(vcpu); 5389 r = kvm_mmu_reload(vcpu);
5215 if (unlikely(r)) 5390 if (unlikely(r))
5216 goto out; 5391 goto out;
5217 5392
5393 /*
5394 * An NMI can be injected between local nmi_pending read and
5395 * vcpu->arch.nmi_pending read inside inject_pending_event().
5396 * But in that case, KVM_REQ_EVENT will be set, which makes
5397 * the race described above benign.
5398 */
5399 nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5400
5218 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) { 5401 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5219 inject_pending_event(vcpu); 5402 inject_pending_event(vcpu);
5220 5403
5221 /* enable NMI/IRQ window open exits if needed */ 5404 /* enable NMI/IRQ window open exits if needed */
5222 if (vcpu->arch.nmi_pending) 5405 if (nmi_pending)
5223 kvm_x86_ops->enable_nmi_window(vcpu); 5406 kvm_x86_ops->enable_nmi_window(vcpu);
5224 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win) 5407 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5225 kvm_x86_ops->enable_irq_window(vcpu); 5408 kvm_x86_ops->enable_irq_window(vcpu);
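The new comment explains why nmi_pending is read once into a local before inject_pending_event() runs: the NMI/IRQ-window decision must agree with what the injection path saw, and an NMI arriving after the snapshot still sets KVM_REQ_EVENT, so the stale value is harmless. A minimal illustration of the single-read idiom, using a volatile cast where the kernel uses ACCESS_ONCE():

    /* Read a racy flag exactly once and base all later decisions on the copy. */
    #define READ_FLAG_ONCE(x) (*(volatile int *)&(x))

    int nmi_flag;                                 /* may be set asynchronously */

    void decide_windows(void)
    {
        int nmi_pending = READ_FLAG_ONCE(nmi_flag);   /* single snapshot */

        /* ... injection work that must agree with the snapshot ... */

        if (nmi_pending)
            ;   /* open the NMI window */
        else
            ;   /* open the IRQ window instead */
    }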
@@ -5399,6 +5582,41 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
5399 return r; 5582 return r;
5400} 5583}
5401 5584
5585static int complete_mmio(struct kvm_vcpu *vcpu)
5586{
5587 struct kvm_run *run = vcpu->run;
5588 int r;
5589
5590 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5591 return 1;
5592
5593 if (vcpu->mmio_needed) {
5594 vcpu->mmio_needed = 0;
5595 if (!vcpu->mmio_is_write)
5596 memcpy(vcpu->mmio_data + vcpu->mmio_index,
5597 run->mmio.data, 8);
5598 vcpu->mmio_index += 8;
5599 if (vcpu->mmio_index < vcpu->mmio_size) {
5600 run->exit_reason = KVM_EXIT_MMIO;
5601 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
5602 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
5603 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5604 run->mmio.is_write = vcpu->mmio_is_write;
5605 vcpu->mmio_needed = 1;
5606 return 0;
5607 }
5608 if (vcpu->mmio_is_write)
5609 return 1;
5610 vcpu->mmio_read_completed = 1;
5611 }
5612 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5613 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5614 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5615 if (r != EMULATE_DONE)
5616 return 0;
5617 return 1;
5618}
5619
5402int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5620int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5403{ 5621{
5404 int r; 5622 int r;
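complete_mmio() above walks a large MMIO access in 8-byte pieces: each KVM_EXIT_MMIO round copies one chunk between run->mmio.data and the vcpu's staging buffer, advances mmio_index, and re-exits to userspace until mmio_size is consumed, after which the read result is fed back into the emulator. A user-space style sketch of the chunking arithmetic (buffer names are illustrative, not the kernel's):

    /* Sketch: consume one 8-byte chunk of a split MMIO read per exit. */
    #include <string.h>

    #define CHUNK 8

    size_t consume_chunk(unsigned char *staging, size_t size, size_t index,
                         const unsigned char *run_data)
    {
        size_t len = (size - index < CHUNK) ? size - index : CHUNK;

        memcpy(staging + index, run_data, len);   /* data from the last exit */
        return index + CHUNK;                     /* where the next exit resumes */
    }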
@@ -5425,20 +5643,10 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
5425 } 5643 }
5426 } 5644 }
5427 5645
5428 if (vcpu->arch.pio.count || vcpu->mmio_needed) { 5646 r = complete_mmio(vcpu);
5429 if (vcpu->mmio_needed) { 5647 if (r <= 0)
5430 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 5648 goto out;
5431 vcpu->mmio_read_completed = 1; 5649
5432 vcpu->mmio_needed = 0;
5433 }
5434 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5435 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5436 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5437 if (r != EMULATE_DONE) {
5438 r = 0;
5439 goto out;
5440 }
5441 }
5442 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 5650 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
5443 kvm_register_write(vcpu, VCPU_REGS_RAX, 5651 kvm_register_write(vcpu, VCPU_REGS_RAX,
5444 kvm_run->hypercall.ret); 5652 kvm_run->hypercall.ret);
@@ -5455,6 +5663,18 @@ out:
5455 5663
5456int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5664int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5457{ 5665{
5666 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5667 /*
5668 * We are here if userspace calls get_regs() in the middle of
 5669 * instruction emulation. Register state needs to be copied
 5670 * back from emulation context to vcpu. Userspace shouldn't do
 5671 * that usually, but some badly designed PV devices (vmware
 5672 * backdoor interface) need this to work.
5673 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 }
5458 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
5459 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5679 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
5460 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5680 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
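With the two sync flags, register state may live either in the vcpu or in the emulation context: get_regs() pulls it back from the emulator when emulation was interrupted, and set_regs() marks the emulator copy stale so it is re-read before the next emulated instruction. A tiny sketch of the two directions (field names are placeholders):

    /* Sketch of the two sync directions controlled by the flags above. */
    #include <string.h>

    struct state   { unsigned long regs[16]; };
    struct machine {
        struct state vcpu, emul;
        int need_sync_to_vcpu;      /* emulator copy is newer */
        int need_sync_from_vcpu;    /* vcpu copy is newer */
    };

    void get_regs(struct machine *m, struct state *out)
    {
        if (m->need_sync_to_vcpu) {                  /* emulation in progress */
            memcpy(&m->vcpu, &m->emul, sizeof(m->vcpu));
            m->need_sync_to_vcpu = 0;
        }
        *out = m->vcpu;
    }

    void set_regs(struct machine *m, const struct state *in)
    {
        m->vcpu = *in;
        m->need_sync_from_vcpu = 1;                  /* emulator must re-read */
        m->need_sync_to_vcpu = 0;
    }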
@@ -5482,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5482 5702
5483int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5703int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
5484{ 5704{
5705 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
5706 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5707
5485 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 5708 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
5486 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 5709 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
5487 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 5710 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -5592,7 +5815,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5592 5815
5593 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5594 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5595 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5596 kvm_make_request(KVM_REQ_EVENT, vcpu); 5819 kvm_make_request(KVM_REQ_EVENT, vcpu);
5597 return EMULATE_DONE; 5820 return EMULATE_DONE;
5598} 5821}
@@ -5974,8 +6197,7 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5974 } 6197 }
5975 vcpu->arch.pio_data = page_address(page); 6198 vcpu->arch.pio_data = page_address(page);
5976 6199
5977 if (!kvm->arch.virtual_tsc_khz) 6200 kvm_init_tsc_catchup(vcpu, max_tsc_khz);
5978 kvm_arch_set_tsc_khz(kvm, max_tsc_khz);
5979 6201
5980 r = kvm_mmu_create(vcpu); 6202 r = kvm_mmu_create(vcpu);
5981 if (r < 0) 6203 if (r < 0)
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index c600da830ce0..e407ed3df817 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -77,7 +77,7 @@ static inline u32 bit(int bitno)
77 77
78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq); 80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81 81
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data); 82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
83 83
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 1cd608973ce5..db832fd65ecb 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,7 +7,7 @@
7 * kernel and insert a module (lg.ko) which allows us to run other Linux 7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host, 8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests 9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/lguest/lguest.c) is called the 10 * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
11 * Launcher. 11 * Launcher.
12 * 12 *
13 * Secondly, we only run specially modified Guests, not normal kernels: setting 13 * Secondly, we only run specially modified Guests, not normal kernels: setting
@@ -913,8 +913,6 @@ static struct clocksource lguest_clock = {
913 .rating = 200, 913 .rating = 200,
914 .read = lguest_clock_read, 914 .read = lguest_clock_read,
915 .mask = CLOCKSOURCE_MASK(64), 915 .mask = CLOCKSOURCE_MASK(64),
916 .mult = 1 << 22,
917 .shift = 22,
918 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 916 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
919}; 917};
920 918
@@ -995,9 +993,10 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
995static void lguest_time_init(void) 993static void lguest_time_init(void)
996{ 994{
997 /* Set up the timer interrupt (0) to go to our simple timer routine */ 995 /* Set up the timer interrupt (0) to go to our simple timer routine */
996 lguest_setup_irq(0);
998 irq_set_handler(0, lguest_time_irq); 997 irq_set_handler(0, lguest_time_irq);
999 998
1000 clocksource_register(&lguest_clock); 999 clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
1001 1000
1002 /* We can't set cpumask in the initializer: damn C limitations! Set it 1001 /* We can't set cpumask in the initializer: damn C limitations! Set it
1003 * here and register our timer device. */ 1002 * here and register our timer device. */
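The lguest change drops the hand-coded mult/shift pair and lets clocksource_register_hz() derive them from the declared frequency; since this clocksource already counts nanoseconds, it is registered at NSEC_PER_SEC. The removed values expressed the identity conversion by hand, as this small sketch of the general relation shows:

    /* A clocksource's mult/shift encode ns = (cycles * mult) >> shift.
     * For a counter that ticks in nanoseconds, mult = 1 << shift (here
     * 1 << 22 with shift 22) is simply the identity. */
    static unsigned long long cyc2ns(unsigned long long cycles,
                                     unsigned int mult, unsigned int shift)
    {
        return (cycles * mult) >> shift;
    }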
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/alternative-asm.h>
3 4
4/* 5/*
5 * Zero a page. 6 * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
14 CFI_ENDPROC 15 CFI_ENDPROC
15ENDPROC(clear_page_c) 16ENDPROC(clear_page_c)
16 17
18ENTRY(clear_page_c_e)
19 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26
17ENTRY(clear_page) 27ENTRY(clear_page)
18 CFI_STARTPROC 28 CFI_STARTPROC
19 xorl %eax,%eax 29 xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
38.Lclear_page_end: 48.Lclear_page_end:
39ENDPROC(clear_page) 49ENDPROC(clear_page)
40 50
41 /* Some CPUs run faster using the string instructions. 51 /*
42 It is also a lot simpler. Use this when possible */ 52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
43 58
44#include <asm/cpufeature.h> 59#include <asm/cpufeature.h>
45 60
46 .section .altinstr_replacement,"ax" 61 .section .altinstr_replacement,"ax"
471: .byte 0xeb /* jmp <disp8> */ 621: .byte 0xeb /* jmp <disp8> */
48 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
492: 642: .byte 0xeb /* jmp <disp8> */
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
663:
50 .previous 67 .previous
51 .section .altinstructions,"a" 68 .section .altinstructions,"a"
52 .align 8 69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
53 .quad clear_page 70 .Lclear_page_end-clear_page, 2b-1b
54 .quad 1b 71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
55 .word X86_FEATURE_REP_GOOD 72 .Lclear_page_end-clear_page,3b-2b
56 .byte .Lclear_page_end - clear_page
57 .byte 2b - 1b
58 .previous 73 .previous
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 99e482615195..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
15#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
18#include <asm/alternative-asm.h>
18 19
19 .macro ALTERNATIVE_JUMP feature,orig,alt 20/*
21 * By placing feature2 after feature1 in altinstructions section, we logically
22 * implement:
23 * If CPU has feature2, jmp to alt2 is used
24 * else if CPU has feature1, jmp to alt1 is used
25 * else jmp to orig is used.
26 */
27 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
200: 280:
21 .byte 0xe9 /* 32bit jump */ 29 .byte 0xe9 /* 32bit jump */
22 .long \orig-1f /* by default jump to orig */ 30 .long \orig-1f /* by default jump to orig */
231: 311:
24 .section .altinstr_replacement,"ax" 32 .section .altinstr_replacement,"ax"
252: .byte 0xe9 /* near jump with 32bit immediate */ 332: .byte 0xe9 /* near jump with 32bit immediate */
26 .long \alt-1b /* offset */ /* or alternatively to alt */ 34 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
353: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
27 .previous 37 .previous
38
28 .section .altinstructions,"a" 39 .section .altinstructions,"a"
29 .align 8 40 altinstruction_entry 0b,2b,\feature1,5,5
30 .quad 0b 41 altinstruction_entry 0b,3b,\feature2,5,5
31 .quad 2b
32 .word \feature /* when feature is set */
33 .byte 5
34 .byte 5
35 .previous 42 .previous
36 .endm 43 .endm
37 44
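The comment above spells out why feature2's entry follows feature1's in .altinstructions: alternatives are patched in order, so a CPU with both features ends up with the feature2 jump because it is written last. The effective runtime choice is therefore a simple priority chain, roughly:

    /* Rough C equivalent of the patch-order priority implemented above. */
    typedef void (*copy_fn)(void);

    copy_fn pick_variant(int has_feature1, int has_feature2,
                         copy_fn orig, copy_fn alt1, copy_fn alt2)
    {
        if (has_feature2)
            return alt2;    /* e.g. X86_FEATURE_ERMS: enhanced REP MOVSB/STOSB */
        if (has_feature1)
            return alt1;    /* e.g. X86_FEATURE_REP_GOOD: fast string copy */
        return orig;        /* unrolled fallback */
    }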
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
72 addq %rdx,%rcx 79 addq %rdx,%rcx
73 jc bad_to_user 80 jc bad_to_user
74 cmpq TI_addr_limit(%rax),%rcx 81 cmpq TI_addr_limit(%rax),%rcx
75 jae bad_to_user 82 ja bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 83 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
84 copy_user_generic_unrolled,copy_user_generic_string, \
85 copy_user_enhanced_fast_string
77 CFI_ENDPROC 86 CFI_ENDPROC
78ENDPROC(_copy_to_user) 87ENDPROC(_copy_to_user)
79 88
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
85 addq %rdx,%rcx 94 addq %rdx,%rcx
86 jc bad_from_user 95 jc bad_from_user
87 cmpq TI_addr_limit(%rax),%rcx 96 cmpq TI_addr_limit(%rax),%rcx
88 jae bad_from_user 97 ja bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 98 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
99 copy_user_generic_unrolled,copy_user_generic_string, \
100 copy_user_enhanced_fast_string
90 CFI_ENDPROC 101 CFI_ENDPROC
91ENDPROC(_copy_from_user) 102ENDPROC(_copy_from_user)
92 103
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
255 .previous 266 .previous
256 CFI_ENDPROC 267 CFI_ENDPROC
257ENDPROC(copy_user_generic_string) 268ENDPROC(copy_user_generic_string)
269
270/*
271 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
272 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
273 *
274 * Input:
275 * rdi destination
276 * rsi source
277 * rdx count
278 *
279 * Output:
280 * eax uncopied bytes or 0 if successful.
281 */
282ENTRY(copy_user_enhanced_fast_string)
283 CFI_STARTPROC
284 andl %edx,%edx
285 jz 2f
286 movl %edx,%ecx
2871: rep
288 movsb
2892: xorl %eax,%eax
290 ret
291
292 .section .fixup,"ax"
29312: movl %ecx,%edx /* ecx is zerorest also */
294 jmp copy_user_handle_tail
295 .previous
296
297 .section __ex_table,"a"
298 .align 8
299 .quad 1b,12b
300 .previous
301 CFI_ENDPROC
302ENDPROC(copy_user_enhanced_fast_string)
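copy_user_enhanced_fast_string above is essentially a single REP MOVSB with a fixup that reports how many bytes were left when a fault hit. Ignoring the fault path, its contract is the familiar "return 0 on success, otherwise the uncopied byte count"; a plain C restatement (not the kernel implementation):

    #include <string.h>

    /* Plain-C restatement of the copy contract: returns bytes NOT copied. */
    static unsigned int copy_bytes(void *dst, const void *src, unsigned int len)
    {
        if (len == 0)
            return 0;           /* mirrors the early "andl %edx,%edx; jz" test */
        memcpy(dst, src, len);  /* the asm does this with one rep movsb */
        return 0;               /* a fault would instead return the remainder */
    }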
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 75ef61e35e38..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
4 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 6#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h>
7 8
8/* 9/*
9 * memcpy - Copy a memory block. 10 * memcpy - Copy a memory block.
@@ -37,6 +38,23 @@
37.Lmemcpy_e: 38.Lmemcpy_e:
38 .previous 39 .previous
39 40
41/*
42 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
43 * memcpy_c. Use memcpy_c_e when possible.
44 *
45 * This gets patched over the unrolled variant (below) via the
46 * alternative instructions framework:
47 */
48 .section .altinstr_replacement, "ax", @progbits
49.Lmemcpy_c_e:
50 movq %rdi, %rax
51
52 movl %edx, %ecx
53 rep movsb
54 ret
55.Lmemcpy_e_e:
56 .previous
57
40ENTRY(__memcpy) 58ENTRY(__memcpy)
41ENTRY(memcpy) 59ENTRY(memcpy)
42 CFI_STARTPROC 60 CFI_STARTPROC
@@ -49,7 +67,7 @@ ENTRY(memcpy)
49 jb .Lhandle_tail 67 jb .Lhandle_tail
50 68
51 /* 69 /*
52 * We check whether memory false dependece could occur, 70 * We check whether memory false dependence could occur,
53 * then jump to corresponding copy mode. 71 * then jump to corresponding copy mode.
54 */ 72 */
55 cmp %dil, %sil 73 cmp %dil, %sil
@@ -171,21 +189,22 @@ ENDPROC(memcpy)
171ENDPROC(__memcpy) 189ENDPROC(__memcpy)
172 190
173 /* 191 /*
174 * Some CPUs run faster using the string copy instructions. 192 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
175 * It is also a lot simpler. Use this when possible: 193 * If the feature is supported, memcpy_c_e() is the first choice.
176 */ 194 * If enhanced rep movsb copy is not available, use fast string copy
177 195 * memcpy_c() when possible. This is faster and code is simpler than
178 .section .altinstructions, "a" 196 * original memcpy().
179 .align 8 197 * Otherwise, original memcpy() is used.
180 .quad memcpy 198 * In .altinstructions section, ERMS feature is placed after REG_GOOD
181 .quad .Lmemcpy_c 199 * feature to implement the right patch order.
182 .word X86_FEATURE_REP_GOOD 200 *
183
184 /*
185 * Replace only beginning, memcpy is used to apply alternatives, 201 * Replace only beginning, memcpy is used to apply alternatives,
186 * so it is silly to overwrite itself with nops - reboot is the 202 * so it is silly to overwrite itself with nops - reboot is the
187 * only outcome... 203 * only outcome...
188 */ 204 */
189 .byte .Lmemcpy_e - .Lmemcpy_c 205 .section .altinstructions, "a"
190 .byte .Lmemcpy_e - .Lmemcpy_c 206 altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
207 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
208 altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
209 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
191 .previous 210 .previous
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 0ecb8433e5a8..d0ec9c2936d7 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -8,6 +8,7 @@
8#define _STRING_C 8#define _STRING_C
9#include <linux/linkage.h> 9#include <linux/linkage.h>
10#include <asm/dwarf2.h> 10#include <asm/dwarf2.h>
11#include <asm/cpufeature.h>
11 12
12#undef memmove 13#undef memmove
13 14
@@ -24,6 +25,7 @@
24 */ 25 */
25ENTRY(memmove) 26ENTRY(memmove)
26 CFI_STARTPROC 27 CFI_STARTPROC
28
27 /* Handle more 32bytes in loop */ 29 /* Handle more 32bytes in loop */
28 mov %rdi, %rax 30 mov %rdi, %rax
29 cmp $0x20, %rdx 31 cmp $0x20, %rdx
@@ -31,8 +33,13 @@ ENTRY(memmove)
31 33
32 /* Decide forward/backward copy mode */ 34 /* Decide forward/backward copy mode */
33 cmp %rdi, %rsi 35 cmp %rdi, %rsi
34 jb 2f 36 jge .Lmemmove_begin_forward
37 mov %rsi, %r8
38 add %rdx, %r8
39 cmp %rdi, %r8
40 jg 2f
35 41
42.Lmemmove_begin_forward:
36 /* 43 /*
37 * movsq instruction have many startup latency 44 * movsq instruction have many startup latency
38 * so we handle small size by general register. 45 * so we handle small size by general register.
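The added comparison decides between the forward and backward paths: forward copy is taken when the source lies at or above the destination, or when the two ranges do not overlap at all; only when the destination falls inside the source range does memmove fall through to the backward copy. The same test in C, as a hedged sketch:

    /* C restatement of the direction choice added above. */
    #include <stddef.h>

    static int use_forward_copy(const char *dst, const char *src, size_t len)
    {
        if (src >= dst)
            return 1;           /* source above (or equal to) destination */
        if (src + len <= dst)
            return 1;           /* ranges do not overlap: forward still safe */
        return 0;               /* dst inside the source range: copy backward */
    }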
@@ -78,6 +85,8 @@ ENTRY(memmove)
78 rep movsq 85 rep movsq
79 movq %r11, (%r10) 86 movq %r11, (%r10)
80 jmp 13f 87 jmp 13f
88.Lmemmove_end_forward:
89
81 /* 90 /*
82 * Handle data backward by movsq. 91 * Handle data backward by movsq.
83 */ 92 */
@@ -194,4 +203,22 @@ ENTRY(memmove)
19413: 20313:
195 retq 204 retq
196 CFI_ENDPROC 205 CFI_ENDPROC
206
207 .section .altinstr_replacement,"ax"
208.Lmemmove_begin_forward_efs:
209 /* Forward moving data. */
210 movq %rdx, %rcx
211 rep movsb
212 retq
213.Lmemmove_end_forward_efs:
214 .previous
215
216 .section .altinstructions,"a"
217 .align 8
218 .quad .Lmemmove_begin_forward
219 .quad .Lmemmove_begin_forward_efs
220 .word X86_FEATURE_ERMS
221 .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
222 .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
223 .previous
197ENDPROC(memmove) 224ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h>
5 7
6/* 8/*
7 * ISO C memset - set a memory block to a byte value. 9 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is
 11 * simpler and shorter than the original function as well.
8 * 12 *
9 * rdi destination 13 * rdi destination
10 * rsi value (char) 14 * rsi value (char)
@@ -31,6 +35,28 @@
31.Lmemset_e: 35.Lmemset_e:
32 .previous 36 .previous
33 37
38/*
39 * ISO C memset - set a memory block to a byte value. This function uses
40 * enhanced rep stosb to override the fast string function.
41 * The code is simpler and shorter than the fast string function as well.
42 *
43 * rdi destination
44 * rsi value (char)
45 * rdx count (bytes)
46 *
47 * rax original destination
48 */
49 .section .altinstr_replacement, "ax", @progbits
50.Lmemset_c_e:
51 movq %rdi,%r9
52 movb %sil,%al
53 movl %edx,%ecx
54 rep stosb
55 movq %r9,%rax
56 ret
57.Lmemset_e_e:
58 .previous
59
34ENTRY(memset) 60ENTRY(memset)
35ENTRY(__memset) 61ENTRY(__memset)
36 CFI_STARTPROC 62 CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
112ENDPROC(memset) 138ENDPROC(memset)
113ENDPROC(__memset) 139ENDPROC(__memset)
114 140
115 /* Some CPUs run faster using the string instructions. 141 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
116 It is also a lot simpler. Use this when possible */ 142 * It is recommended to use this when possible.
117 143 *
118#include <asm/cpufeature.h> 144 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
119 145 * instructions.
146 *
147 * Otherwise, use original memset function.
148 *
 149 * In .altinstructions section, ERMS feature is placed after REP_GOOD
150 * feature to implement the right patch order.
151 */
120 .section .altinstructions,"a" 152 .section .altinstructions,"a"
121 .align 8 153 altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
122 .quad memset 154 .Lfinal-memset,.Lmemset_e-.Lmemset_c
123 .quad .Lmemset_c 155 altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
124 .word X86_FEATURE_REP_GOOD 156 .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
125 .byte .Lfinal - memset
126 .byte .Lmemset_e - .Lmemset_c
127 .previous 157 .previous
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index 3e608edf9958..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,8 +23,8 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_AMD_NUMA) += amdtopology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat.o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o 28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
29 29
30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o 30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
diff --git a/arch/x86/mm/amdtopology_64.c b/arch/x86/mm/amdtopology.c
index 0919c26820d4..5247d01329ca 100644
--- a/arch/x86/mm/amdtopology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -12,6 +12,7 @@
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/memblock.h> 14#include <linux/memblock.h>
15#include <linux/bootmem.h>
15 16
16#include <asm/io.h> 17#include <asm/io.h>
17#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
@@ -69,10 +70,10 @@ static __init void early_get_boot_cpu_id(void)
69 70
70int __init amd_numa_init(void) 71int __init amd_numa_init(void)
71{ 72{
72 unsigned long start = PFN_PHYS(0); 73 u64 start = PFN_PHYS(0);
73 unsigned long end = PFN_PHYS(max_pfn); 74 u64 end = PFN_PHYS(max_pfn);
74 unsigned numnodes; 75 unsigned numnodes;
75 unsigned long prevbase; 76 u64 prevbase;
76 int i, j, nb; 77 int i, j, nb;
77 u32 nodeid, reg; 78 u32 nodeid, reg;
78 unsigned int bits, cores, apicid_base; 79 unsigned int bits, cores, apicid_base;
@@ -95,7 +96,7 @@ int __init amd_numa_init(void)
95 96
96 prevbase = 0; 97 prevbase = 0;
97 for (i = 0; i < 8; i++) { 98 for (i = 0; i < 8; i++) {
98 unsigned long base, limit; 99 u64 base, limit;
99 100
100 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
101 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
@@ -107,18 +108,18 @@ int __init amd_numa_init(void)
107 continue; 108 continue;
108 } 109 }
109 if (nodeid >= numnodes) { 110 if (nodeid >= numnodes) {
110 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, 111 pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
111 base, limit); 112 base, limit);
112 continue; 113 continue;
113 } 114 }
114 115
115 if (!limit) { 116 if (!limit) {
116 pr_info("Skipping node entry %d (base %lx)\n", 117 pr_info("Skipping node entry %d (base %Lx)\n",
117 i, base); 118 i, base);
118 continue; 119 continue;
119 } 120 }
120 if ((base >> 8) & 3 || (limit >> 8) & 3) { 121 if ((base >> 8) & 3 || (limit >> 8) & 3) {
121 pr_err("Node %d using interleaving mode %lx/%lx\n", 122 pr_err("Node %d using interleaving mode %Lx/%Lx\n",
122 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 123 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
123 return -EINVAL; 124 return -EINVAL;
124 } 125 }
@@ -150,19 +151,19 @@ int __init amd_numa_init(void)
150 continue; 151 continue;
151 } 152 }
152 if (limit < base) { 153 if (limit < base) {
153 pr_err("Node %d bogus settings %lx-%lx.\n", 154 pr_err("Node %d bogus settings %Lx-%Lx.\n",
154 nodeid, base, limit); 155 nodeid, base, limit);
155 continue; 156 continue;
156 } 157 }
157 158
158 /* Could sort here, but pun for now. Should not happen anyroads. */ 159 /* Could sort here, but pun for now. Should not happen anyroads. */
159 if (prevbase > base) { 160 if (prevbase > base) {
160 pr_err("Node map not sorted %lx,%lx\n", 161 pr_err("Node map not sorted %Lx,%Lx\n",
161 prevbase, base); 162 prevbase, base);
162 return -EINVAL; 163 return -EINVAL;
163 } 164 }
164 165
165 pr_info("Node %d MemBase %016lx Limit %016lx\n", 166 pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
166 nodeid, base, limit); 167 nodeid, base, limit);
167 168
168 prevbase = base; 169 prevbase = base;
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 20e3f8702d1e..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -12,6 +12,7 @@
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */ 14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */
15 16
16#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
17#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -822,16 +823,30 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
822 force_sig_info_fault(SIGBUS, code, address, tsk, fault); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
823} 824}
824 825
825static noinline void 826static noinline int
826mm_fault_error(struct pt_regs *regs, unsigned long error_code, 827mm_fault_error(struct pt_regs *regs, unsigned long error_code,
827 unsigned long address, unsigned int fault) 828 unsigned long address, unsigned int fault)
828{ 829{
830 /*
831 * Pagefault was interrupted by SIGKILL. We have no reason to
832 * continue pagefault.
833 */
834 if (fatal_signal_pending(current)) {
835 if (!(fault & VM_FAULT_RETRY))
836 up_read(&current->mm->mmap_sem);
837 if (!(error_code & PF_USER))
838 no_context(regs, error_code, address);
839 return 1;
840 }
841 if (!(fault & VM_FAULT_ERROR))
842 return 0;
843
829 if (fault & VM_FAULT_OOM) { 844 if (fault & VM_FAULT_OOM) {
830 /* Kernel mode? Handle exceptions or die: */ 845 /* Kernel mode? Handle exceptions or die: */
831 if (!(error_code & PF_USER)) { 846 if (!(error_code & PF_USER)) {
832 up_read(&current->mm->mmap_sem); 847 up_read(&current->mm->mmap_sem);
833 no_context(regs, error_code, address); 848 no_context(regs, error_code, address);
834 return; 849 return 1;
835 } 850 }
836 851
837 out_of_memory(regs, error_code, address); 852 out_of_memory(regs, error_code, address);
@@ -842,6 +857,7 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
842 else 857 else
843 BUG(); 858 BUG();
844 } 859 }
860 return 1;
845} 861}
846 862
847static int spurious_fault_check(unsigned long error_code, pte_t *pte) 863static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -964,7 +980,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
964 struct mm_struct *mm; 980 struct mm_struct *mm;
965 int fault; 981 int fault;
966 int write = error_code & PF_WRITE; 982 int write = error_code & PF_WRITE;
967 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | 983 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
968 (write ? FAULT_FLAG_WRITE : 0); 984 (write ? FAULT_FLAG_WRITE : 0);
969 985
970 tsk = current; 986 tsk = current;
@@ -1132,9 +1148,9 @@ good_area:
1132 */ 1148 */
1133 fault = handle_mm_fault(mm, vma, address, flags); 1149 fault = handle_mm_fault(mm, vma, address, flags);
1134 1150
1135 if (unlikely(fault & VM_FAULT_ERROR)) { 1151 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1136 mm_fault_error(regs, error_code, address, fault); 1152 if (mm_fault_error(regs, error_code, address, fault))
1137 return; 1153 return;
1138 } 1154 }
1139 1155
1140 /* 1156 /*
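With this change mm_fault_error() gains a return value: 1 means the fault has been fully dealt with (fatal signal, OOM, SIGBUS), 0 means the caller should fall through to the normal retry path; do_page_fault() now calls it for VM_FAULT_RETRY as well, so a SIGKILL arriving during a killable wait aborts the fault early. The caller-side shape is roughly as follows (names below are placeholders, not the kernel's):

    /* Sketch of the new caller contract around handle_mm_fault()'s result. */
    #define FAULT_RETRY 0x1
    #define FAULT_ERROR 0x2

    int fault_error_handled(unsigned int fault);   /* 1: done, 0: keep going */

    void after_handle_mm_fault(unsigned int fault)
    {
        if (fault & (FAULT_RETRY | FAULT_ERROR)) {
            if (fault_error_handled(fault))
                return;     /* fatal signal, OOM or SIGBUS already handled */
        }
        /* otherwise continue with the usual retry / completion path */
    }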
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index d4203988504a..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
72 if (!vma_shareable(vma, addr)) 72 if (!vma_shareable(vma, addr))
73 return; 73 return;
74 74
75 spin_lock(&mapping->i_mmap_lock); 75 mutex_lock(&mapping->i_mmap_mutex);
76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { 76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
77 if (svma == vma) 77 if (svma == vma)
78 continue; 78 continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
97 put_page(virt_to_page(spte)); 97 put_page(virt_to_page(spte));
98 spin_unlock(&mm->page_table_lock); 98 spin_unlock(&mm->page_table_lock);
99out: 99out:
100 spin_unlock(&mapping->i_mmap_lock); 100 mutex_unlock(&mapping->i_mmap_mutex);
101} 101}
102 102
103/* 103/*
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 37b8b0fe8320..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -16,8 +16,6 @@
16#include <asm/tlb.h> 16#include <asm/tlb.h>
17#include <asm/proto.h> 17#include <asm/proto.h>
18 18
19DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
20
21unsigned long __initdata pgt_buf_start; 19unsigned long __initdata pgt_buf_start;
22unsigned long __meminitdata pgt_buf_end; 20unsigned long __meminitdata pgt_buf_end;
23unsigned long __meminitdata pgt_buf_top; 21unsigned long __meminitdata pgt_buf_top;
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 80088f994193..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -678,8 +678,10 @@ static void __init zone_sizes_init(void)
678{ 678{
679 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
681 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
682 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
683 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
684#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
685 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -716,6 +718,7 @@ void __init paging_init(void)
716 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
717 */ 719 */
718 olpc_dt_build_devicetree(); 720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
719 sparse_init(); 722 sparse_init();
720 zone_sizes_init(); 723 zone_sizes_init();
721} 724}
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 794233587287..d865c4aeec55 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -616,7 +616,9 @@ void __init paging_init(void)
616 unsigned long max_zone_pfns[MAX_NR_ZONES]; 616 unsigned long max_zone_pfns[MAX_NR_ZONES];
617 617
618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 618 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
619#ifdef CONFIG_ZONE_DMA
619 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 620 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
621#endif
620 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 622 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
621 max_zone_pfns[ZONE_NORMAL] = max_pfn; 623 max_zone_pfns[ZONE_NORMAL] = max_pfn;
622 624
@@ -679,14 +681,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
679} 681}
680EXPORT_SYMBOL_GPL(arch_add_memory); 682EXPORT_SYMBOL_GPL(arch_add_memory);
681 683
682#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
683int memory_add_physaddr_to_nid(u64 start)
684{
685 return 0;
686}
687EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
688#endif
689
690#endif /* CONFIG_MEMORY_HOTPLUG */ 684#endif /* CONFIG_MEMORY_HOTPLUG */
691 685
692static struct kcore_list kcore_vsyscall; 686static struct kcore_list kcore_vsyscall;
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 0369843511dc..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 745258dfc4dc..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,11 +1,39 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
5#include <asm/numa.h> 7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
6#include <asm/acpi.h> 18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
7 22
8int __initdata numa_off; 23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
9 37
10static __init int numa_setup(char *opt) 38static __init int numa_setup(char *opt)
11{ 39{
@@ -32,6 +60,15 @@ s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
32 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE 60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
33}; 61};
34 62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
35cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
36EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
37 74
@@ -95,6 +132,407 @@ void __init setup_node_to_cpumask_map(void)
95 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
96} 133}
97 134
135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
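setup_node_data() above tries three allocators in turn for the per-node pg_data_t: the remap allocator, then node-local memblock memory above the DMA zone, then any node as a last resort. A condensed view of that fallback chain (the helper names below stand in for the memblock calls used in the patch):

    /* Condensed fallback order for allocating NODE_DATA (illustrative). */
    void *try_remap_alloc(int nid, unsigned long size);
    void *try_alloc_on_node(int nid, unsigned long size);
    void *try_alloc_anywhere(unsigned long size);

    void *alloc_node_data(int nid, unsigned long size)
    {
        void *p;

        p = try_remap_alloc(nid, size);          /* 1) remap allocator */
        if (p)
            return p;
        p = try_alloc_on_node(nid, size);        /* 2) node-local, above DMA zone */
        if (p)
            return p;
        return try_alloc_anywhere(size);         /* 3) any node as a last resort */
    }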
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
 261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
295 * See whether there are overlapping blocks. Whine
296 * about but allow overlaps of the same nid. They
297 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
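The merge loop in numa_cleanup_meminfo() joins two blocks only when they belong to the same node and the span covering both does not touch memory of any other node; same-nid overlaps are tolerated with a warning, cross-nid overlaps are fatal. The merge condition in isolation, with the struct layout simplified for illustration:

    /* Simplified version of the "can these two blocks be merged?" test above. */
    struct blk { unsigned long long start, end; int nid; };

    static int can_merge(const struct blk *a, const struct blk *b,
                         const struct blk *all, int n)
    {
        unsigned long long start, end;
        int k;

        if (a->nid != b->nid)
            return 0;                        /* never merge across nodes */

        start = a->start < b->start ? a->start : b->start;
        end   = a->end   > b->end   ? a->end   : b->end;

        for (k = 0; k < n; k++)              /* covering span must not hit other nodes */
            if (all[k].nid != a->nid &&
                start < all[k].end && end > all[k].start)
                return 0;
        return 1;
    }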
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
420 * numa_set_distance - Set NUMA distance from one NUMA to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
425 * Set the distance from node @from to @to to @distance. If distance table
426 * doesn't exist, one which is large enough to accommodate all the currently
427 * known nodes will be created.
428 *
429 * If such table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
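The distance table is a flat cnt x cnt byte array indexed as from * cnt + to, pre-filled with LOCAL_DISTANCE on the diagonal and REMOTE_DISTANCE elsewhere; __node_distance() falls back to those defaults whenever an index is out of range. A standalone sketch of the lookup:

    /* Standalone sketch of the flat NUMA distance table lookup above. */
    #define LOCAL_DISTANCE  10
    #define REMOTE_DISTANCE 20

    static unsigned char *dist_table;   /* cnt * cnt entries, row-major */
    static int dist_cnt;

    int node_distance(int from, int to)
    {
        if (from >= dist_cnt || to >= dist_cnt)
            return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
        return dist_table[from * dist_cnt + to];
    }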
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
98/* 536/*
99 * There are unfortunately some poorly designed mainboards around that 537 * There are unfortunately some poorly designed mainboards around that
100 * only connect memory to a single CPU. This breaks the 1:1 cpu->node 538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
@@ -102,7 +540,7 @@ void __init setup_node_to_cpumask_map(void)
102 * as the number of CPUs is not known yet. We round robin the existing 540 * as the number of CPUs is not known yet. We round robin the existing
103 * nodes. 541 * nodes.
104 */ 542 */
105void __init numa_init_array(void) 543static void __init numa_init_array(void)
106{ 544{
107 int rr, i; 545 int rr, i;
108 546
@@ -117,6 +555,95 @@ void __init numa_init_array(void)
117 } 555 }
118} 556}
119 557
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
 624 * last fallback is a dummy single-node config encompassing the whole memory
 625 * and never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
120static __init int find_near_online_node(int node) 647static __init int find_near_online_node(int node)
121{ 648{
122 int n, val; 649 int n, val;
@@ -282,3 +809,18 @@ const struct cpumask *cpumask_of_node(int node)
282EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
283 810
284#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
826#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index bde3906420df..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,39 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/memblock.h> 26#include <linux/memblock.h>
28#include <linux/mmzone.h>
29#include <linux/highmem.h>
30#include <linux/initrd.h>
31#include <linux/nodemask.h>
32#include <linux/module.h> 27#include <linux/module.h>
33#include <linux/kexec.h>
34#include <linux/pfn.h>
35#include <linux/swap.h>
36#include <linux/acpi.h>
37
38#include <asm/e820.h>
39#include <asm/setup.h>
40#include <asm/mmzone.h>
41#include <asm/bios_ebda.h>
42#include <asm/proto.h>
43
44struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
45EXPORT_SYMBOL(node_data);
46
47/*
48 * numa interface - we expect the numa architecture specific code to have
49 * populated the following initialisation.
50 *
51 * 1) node_online_map - the map of all nodes configured (online) in the system
52 * 2) node_start_pfn - the starting page frame number for a node
53 * 3) node_end_pfn - the ending page fram number for a node
54 */
55unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
56unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
57 28
29#include "numa_internal.h"
58 30
59#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
60/* 32/*
@@ -99,108 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
99} 71}
100#endif 72#endif
101 73
102extern unsigned long find_max_low_pfn(void);
103extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
104 75
105#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
106 77
107unsigned long node_remap_size[MAX_NUMNODES];
108static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
109void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
110 80
111static unsigned long kva_start_pfn;
112static unsigned long kva_pages;
113
114int __cpuinit numa_cpu_node(int cpu)
115{
116 return apic->x86_32_numa_cpu_node(cpu);
117}
118
119/*
120 * FLAT - support for basic PC memory model with discontig enabled, essentially
121 * a single node with all available processors in it with a flat
122 * memory map.
123 */
124int __init get_memcfg_numa_flat(void)
125{
126 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
127
128 node_start_pfn[0] = 0;
129 node_end_pfn[0] = max_pfn;
130 memblock_x86_register_active_regions(0, 0, max_pfn);
131 memory_present(0, 0, max_pfn);
132 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
133
134 /* Indicate there is one node available. */
135 nodes_clear(node_online_map);
136 node_set_online(0);
137 return 1;
138}
139
140/*
141 * Find the highest page frame number we have available for the node
142 */
143static void __init propagate_e820_map_node(int nid)
144{
145 if (node_end_pfn[nid] > max_pfn)
146 node_end_pfn[nid] = max_pfn;
147 /*
148 * if a user has given mem=XXXX, then we need to make sure
149 * that the node _starts_ before that, too, not just ends
150 */
151 if (node_start_pfn[nid] > max_pfn)
152 node_start_pfn[nid] = max_pfn;
153 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
154}
155
156/*
157 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
158 * method. For node zero take this from the bottom of memory, for
159 * subsequent nodes place them at node_remap_start_vaddr which contains
160 * node local data in physically node local memory. See setup_memory()
161 * for details.
162 */
163static void __init allocate_pgdat(int nid)
164{
165 char buf[16];
166
167 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
168 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
169 else {
170 unsigned long pgdat_phys;
171 pgdat_phys = memblock_find_in_range(min_low_pfn<<PAGE_SHIFT,
172 max_pfn_mapped<<PAGE_SHIFT,
173 sizeof(pg_data_t),
174 PAGE_SIZE);
175 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
176 memset(buf, 0, sizeof(buf));
177 sprintf(buf, "NODE_DATA %d", nid);
178 memblock_x86_reserve_range(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
179 }
180 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
181 nid, (unsigned long)NODE_DATA(nid));
182}
183
184/* 81/*
185 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
186 * virtual address space (KVA) is reserved and portions of nodes are mapped
187 * using it. This is to allow node-local memory to be allocated for
188 * structures that would normally require ZONE_NORMAL. The memory is
189 * allocated with alloc_remap() and callers should be prepared to allocate
190 * from the bootmem allocator instead.
191 */ 83 */
192static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
193static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
194static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
195static unsigned long node_remap_offset[MAX_NUMNODES];
196 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
95 * only the callers considered there should call this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
197void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
198{ 108{
199 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
200 110
201 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
202 112
203 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
204 return NULL; 114 return NULL;
205 115
206 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -209,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
209 return allocation; 119 return allocation;
210} 120}
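The comment on alloc_remap() above spells out a simple contract: a bump allocator over a fixed remap window that returns NULL when exhausted, with callers expected to fall back to another allocator. A minimal stand-alone model of that behaviour, assuming a fixed 4KB window purely for illustration:

#include <stddef.h>
#include <stdio.h>

#define L1_CACHE_BYTES 64
#define ALIGN(x, a)    (((x) + (a) - 1) & ~((size_t)(a) - 1))

static char remap_area[4096];
static char *alloc_vaddr = remap_area;
static char *end_vaddr   = remap_area + sizeof(remap_area);

/* Bump-allocate from the remap window; return NULL once it is exhausted. */
static void *remap_alloc(size_t size)
{
	void *p = alloc_vaddr;

	size = ALIGN(size, L1_CACHE_BYTES);
	if (size > (size_t)(end_vaddr - alloc_vaddr))
		return NULL;		/* caller falls back to another allocator */
	alloc_vaddr += size;
	return p;
}

int main(void)
{
	void *a = remap_alloc(100);	/* fits */
	void *b = remap_alloc(4000);	/* window exhausted: NULL, fall back */
	void *c = remap_alloc(100);	/* smaller request still fits */

	printf("%p %p %p\n", a, b, c);
	return 0;
}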
211 121
212static void __init remap_numa_kva(void)
213{
214 void *vaddr;
215 unsigned long pfn;
216 int node;
217
218 for_each_online_node(node) {
219 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
220 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
221 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
222 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
223 (unsigned long)vaddr,
224 node_remap_start_pfn[node] + pfn);
225 set_pmd_pfn((ulong) vaddr,
226 node_remap_start_pfn[node] + pfn,
227 PAGE_KERNEL_LARGE);
228 }
229 }
230}
231
232#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
233/** 123/**
234 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -240,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
240 int node; 130 int node;
241 131
242 for_each_online_node(node) { 132 for_each_online_node(node) {
243 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
244 134
245 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
246 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
247 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
248 139
249 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
250 141
251 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
252 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
253 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
254 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -264,132 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
264} 155}
265#endif 156#endif
266 157
267static __init unsigned long calculate_numa_remap_pages(void) 158/**
159 * init_alloc_remap - Initialize remap allocator for a NUMA node
 160 * @nid: NUMA node to initialize remap allocator for
161 *
162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
163 * memmap on a different node with lowmem is inefficient, a special
164 * remap allocator is implemented which can be used by alloc_remap().
165 *
166 * For each node, the amount of memory which will be necessary for
 167 * pgdat and memmap is calculated and two memory areas of that size are
168 * allocated - one in the node and the other in lowmem; then, the area
169 * in the node is remapped to the lowmem area.
170 *
171 * As pgdat and memmap must be allocated in lowmem anyway, this
172 * doesn't waste lowmem address space; however, the actual lowmem
173 * which gets remapped over is wasted. The amount shouldn't be
 174 * problematic on machines where this feature will be used.
175 *
176 * Initialization failure isn't fatal. alloc_remap() is used
177 * opportunistically and the callers will fall back to other memory
178 * allocation mechanisms on failure.
179 */
180void __init init_alloc_remap(int nid, u64 start, u64 end)
268{ 181{
269 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
270 unsigned long size, reserve_pages = 0; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
271 184 unsigned long size, pfn;
272 for_each_online_node(nid) { 185 u64 node_pa, remap_pa;
273 u64 node_kva_target; 186 void *remap_va;
274 u64 node_kva_final;
275
276 /*
277 * The acpi/srat node info can show hot-add memroy zones
278 * where memory could be added but not currently present.
279 */
280 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
281 nid, node_start_pfn[nid], node_end_pfn[nid]);
282 if (node_start_pfn[nid] > max_pfn)
283 continue;
284 if (!node_end_pfn[nid])
285 continue;
286 if (node_end_pfn[nid] > max_pfn)
287 node_end_pfn[nid] = max_pfn;
288
289 /* ensure the remap includes space for the pgdat. */
290 size = node_remap_size[nid] + sizeof(pg_data_t);
291
292 /* convert size to large (pmd size) pages, rounding up */
293 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
294 /* now the roundup is correct, convert to PAGE_SIZE pages */
295 size = size * PTRS_PER_PTE;
296
297 node_kva_target = round_down(node_end_pfn[nid] - size,
298 PTRS_PER_PTE);
299 node_kva_target <<= PAGE_SHIFT;
300 do {
301 node_kva_final = memblock_find_in_range(node_kva_target,
302 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
303 ((u64)size)<<PAGE_SHIFT,
304 LARGE_PAGE_BYTES);
305 node_kva_target -= LARGE_PAGE_BYTES;
306 } while (node_kva_final == MEMBLOCK_ERROR &&
307 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
308
309 if (node_kva_final == MEMBLOCK_ERROR)
310 panic("Can not get kva ram\n");
311
312 node_remap_size[nid] = size;
313 node_remap_offset[nid] = reserve_pages;
314 reserve_pages += size;
315 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
316 " node %d at %llx\n",
317 size, nid, node_kva_final>>PAGE_SHIFT);
318
319 /*
320 * prevent kva address below max_low_pfn want it on system
321 * with less memory later.
322 * layout will be: KVA address , KVA RAM
323 *
324 * we are supposed to only record the one less then max_low_pfn
325 * but we could have some hole in high memory, and it will only
326 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
327 * to use it as free.
328 * So memblock_x86_reserve_range here, hope we don't run out of that array
329 */
330 memblock_x86_reserve_range(node_kva_final,
331 node_kva_final+(((u64)size)<<PAGE_SHIFT),
332 "KVA RAM");
333
334 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
335 }
336 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
337 reserve_pages);
338 return reserve_pages;
339}
340 187
341static void init_remap_allocator(int nid) 188 /*
342{                                                    189         * The acpi/srat node info can show hot-add memory zones where
343 node_remap_start_vaddr[nid] = pfn_to_kaddr( 190 * memory could be added but not currently present.
344 kva_start_pfn + node_remap_offset[nid]); 191 */
345 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] + 192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
346 (node_remap_size[nid] * PAGE_SIZE); 193 nid, start_pfn, end_pfn);
347 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] + 194
348 ALIGN(sizeof(pg_data_t), PAGE_SIZE); 195 /* calculate the necessary space aligned to large page size */
349 196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
350 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid, 197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
351 (ulong) node_remap_start_vaddr[nid], 198 size = ALIGN(size, LARGE_PAGE_BYTES);
352 (ulong) node_remap_end_vaddr[nid]); 199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
353} 235}
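The size calculation in init_alloc_remap() above (memmap for the node plus one page-aligned pg_data_t, rounded up to the large-page granularity used by the remap) can be modelled in isolation. In the sketch below the struct sizes and the non-PAE 4MB large page are assumed values, for illustration only:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE        4096ULL
#define PAGE_SHIFT       12
#define PTRS_PER_PTE     1024ULL			/* assumed non-PAE x86-32 */
#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)	/* 4MB in this model */
#define ALIGN(x, a)      (((x) + (a) - 1) & ~((a) - 1))

#define SIZEOF_PAGE      32ULL		/* assumed sizeof(struct page) */
#define SIZEOF_PGDAT     8192ULL	/* assumed sizeof(pg_data_t) */

int main(void)
{
	uint64_t start = 0, end = 1ULL << 30;		/* a 1GB node */
	uint64_t pages = (end - start) >> PAGE_SHIFT;
	uint64_t size;

	size  = pages * SIZEOF_PAGE;			/* node_memmap_size_bytes() stand-in */
	size += ALIGN(SIZEOF_PGDAT, PAGE_SIZE);
	size  = ALIGN(size, LARGE_PAGE_BYTES);

	printf("remap area: %llu bytes\n", (unsigned long long)size);
	return 0;
}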
354 236
355void __init initmem_init(void) 237void __init initmem_init(void)
356{ 238{
357 int nid; 239 x86_numa_init();
358 long kva_target_pfn;
359
360 /*
361 * When mapping a NUMA machine we allocate the node_mem_map arrays
362 * from node local memory. They are then mapped directly into KVA
363 * between zone normal and vmalloc space. Calculate the size of
364 * this space and use it to adjust the boundary between ZONE_NORMAL
365 * and ZONE_HIGHMEM.
366 */
367
368 get_memcfg_numa();
369 numa_init_array();
370
371 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE);
372 240
373 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
374 do {
375 kva_start_pfn = memblock_find_in_range(kva_target_pfn<<PAGE_SHIFT,
376 max_low_pfn<<PAGE_SHIFT,
377 kva_pages<<PAGE_SHIFT,
378 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
379 kva_target_pfn -= PTRS_PER_PTE;
380 } while (kva_start_pfn == MEMBLOCK_ERROR && kva_target_pfn > min_low_pfn);
381
382 if (kva_start_pfn == MEMBLOCK_ERROR)
383 panic("Can not get kva space\n");
384
385 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
386 kva_start_pfn, max_low_pfn);
387 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
388
389 /* avoid clash with initrd */
390 memblock_x86_reserve_range(kva_start_pfn<<PAGE_SHIFT,
391 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
392 "KVA PG");
393#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
394 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
395 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -409,51 +257,9 @@ void __init initmem_init(void)
409 257
410 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
411 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
412 for_each_online_node(nid) {
413 init_remap_allocator(nid);
414
415 allocate_pgdat(nid);
416 }
417 remap_numa_kva();
418 260
419 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
420 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
421 for_each_online_node(nid)
422 propagate_e820_map_node(nid);
423
424 for_each_online_node(nid) {
425 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
426 NODE_DATA(nid)->node_id = nid;
427 }
428 263
429 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
430} 265}
431
432#ifdef CONFIG_MEMORY_HOTPLUG
433static int paddr_to_nid(u64 addr)
434{
435 int nid;
436 unsigned long pfn = PFN_DOWN(addr);
437
438 for_each_node(nid)
439 if (node_start_pfn[nid] <= pfn &&
440 pfn < node_end_pfn[nid])
441 return nid;
442
443 return -1;
444}
445
446/*
447 * This function is used to ask node id BEFORE memmap and mem_section's
448 * initialization (pfn_to_nid() can't be used yet).
449 * If _PXM is not defined on ACPI's DSDT, node id must be found by this.
450 */
451int memory_add_physaddr_to_nid(u64 addr)
452{
453 int nid = paddr_to_nid(addr);
454 return (nid >= 0) ? nid : 0;
455}
456
457EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
458#endif
459
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 85b52fc03084..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,646 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/memblock.h>
11#include <linux/mmzone.h>
12#include <linux/ctype.h>
13#include <linux/module.h>
14#include <linux/nodemask.h>
15#include <linux/sched.h>
16#include <linux/acpi.h>
17
18#include <asm/e820.h>
19#include <asm/proto.h>
20#include <asm/dma.h>
21#include <asm/acpi.h>
22#include <asm/amd_nb.h>
23 6
24#include "numa_internal.h" 7#include "numa_internal.h"
25 8
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29nodemask_t numa_nodes_parsed __initdata;
30
31struct memnode memnode;
32
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36static struct numa_meminfo numa_meminfo __initdata;
37
38static int numa_distance_cnt;
39static u8 *numa_distance;
40
41/*
42 * Given a shift value, try to populate memnodemap[]
43 * Returns :
44 * 1 if OK
45 * 0 if memnodmap[] too small (of shift too small)
46 * -1 if node overlap or lost ram (shift too big)
47 */
48static int __init populate_memnodemap(const struct numa_meminfo *mi, int shift)
49{
50 unsigned long addr, end;
51 int i, res = -1;
52
53 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
54 for (i = 0; i < mi->nr_blks; i++) {
55 addr = mi->blk[i].start;
56 end = mi->blk[i].end;
57 if (addr >= end)
58 continue;
59 if ((end >> shift) >= memnodemapsize)
60 return 0;
61 do {
62 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
63 return -1;
64 memnodemap[addr >> shift] = mi->blk[i].nid;
65 addr += (1UL << shift);
66 } while (addr < end);
67 res = 1;
68 }
69 return res;
70}
71
72static int __init allocate_cachealigned_memnodemap(void)
73{
74 unsigned long addr;
75
76 memnodemap = memnode.embedded_map;
77 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
78 return 0;
79
80 addr = 0x8000;
81 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
82 nodemap_addr = memblock_find_in_range(addr, get_max_mapped(),
83 nodemap_size, L1_CACHE_BYTES);
84 if (nodemap_addr == MEMBLOCK_ERROR) {
85 printk(KERN_ERR
86 "NUMA: Unable to allocate Memory to Node hash map\n");
87 nodemap_addr = nodemap_size = 0;
88 return -1;
89 }
90 memnodemap = phys_to_virt(nodemap_addr);
91 memblock_x86_reserve_range(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
92
93 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
94 nodemap_addr, nodemap_addr + nodemap_size);
95 return 0;
96}
97
98/*
99 * The LSB of all start and end addresses in the node map is the value of the
100 * maximum possible shift.
101 */
102static int __init extract_lsb_from_nodes(const struct numa_meminfo *mi)
103{
104 int i, nodes_used = 0;
105 unsigned long start, end;
106 unsigned long bitfield = 0, memtop = 0;
107
108 for (i = 0; i < mi->nr_blks; i++) {
109 start = mi->blk[i].start;
110 end = mi->blk[i].end;
111 if (start >= end)
112 continue;
113 bitfield |= start;
114 nodes_used++;
115 if (end > memtop)
116 memtop = end;
117 }
118 if (nodes_used <= 1)
119 i = 63;
120 else
121 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
122 memnodemapsize = (memtop >> i)+1;
123 return i;
124}
125
126static int __init compute_hash_shift(const struct numa_meminfo *mi)
127{
128 int shift;
129
130 shift = extract_lsb_from_nodes(mi);
131 if (allocate_cachealigned_memnodemap())
132 return -1;
133 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
134 shift);
135
136 if (populate_memnodemap(mi, shift) != 1) {
137 printk(KERN_INFO "Your memory is not aligned you need to "
138 "rebuild your kernel with a bigger NODEMAPSIZE "
139 "shift=%d\n", shift);
140 return -1;
141 }
142 return shift;
143}
144
145int __meminit __early_pfn_to_nid(unsigned long pfn)
146{
147 return phys_to_nid(pfn << PAGE_SHIFT);
148}
149
150static void * __init early_node_mem(int nodeid, unsigned long start,
151 unsigned long end, unsigned long size,
152 unsigned long align)
153{
154 unsigned long mem;
155
156 /*
157 * put it on high as possible
158 * something will go with NODE_DATA
159 */
160 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
161 start = MAX_DMA_PFN<<PAGE_SHIFT;
162 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
163 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
164 start = MAX_DMA32_PFN<<PAGE_SHIFT;
165 mem = memblock_x86_find_in_range_node(nodeid, start, end, size, align);
166 if (mem != MEMBLOCK_ERROR)
167 return __va(mem);
168
169 /* extend the search scope */
170 end = max_pfn_mapped << PAGE_SHIFT;
171 start = MAX_DMA_PFN << PAGE_SHIFT;
172 mem = memblock_find_in_range(start, end, size, align);
173 if (mem != MEMBLOCK_ERROR)
174 return __va(mem);
175
176 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
177 size, nodeid);
178
179 return NULL;
180}
181
182static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
183 struct numa_meminfo *mi)
184{
185 /* ignore zero length blks */
186 if (start == end)
187 return 0;
188
189 /* whine about and ignore invalid blks */
190 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
191 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
192 nid, start, end);
193 return 0;
194 }
195
196 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
197 pr_err("NUMA: too many memblk ranges\n");
198 return -EINVAL;
199 }
200
201 mi->blk[mi->nr_blks].start = start;
202 mi->blk[mi->nr_blks].end = end;
203 mi->blk[mi->nr_blks].nid = nid;
204 mi->nr_blks++;
205 return 0;
206}
207
208/**
209 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
210 * @idx: Index of memblk to remove
211 * @mi: numa_meminfo to remove memblk from
212 *
213 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
214 * decrementing @mi->nr_blks.
215 */
216void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
217{
218 mi->nr_blks--;
219 memmove(&mi->blk[idx], &mi->blk[idx + 1],
220 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
221}
222
223/**
224 * numa_add_memblk - Add one numa_memblk to numa_meminfo
225 * @nid: NUMA node ID of the new memblk
226 * @start: Start address of the new memblk
227 * @end: End address of the new memblk
228 *
229 * Add a new memblk to the default numa_meminfo.
230 *
231 * RETURNS:
232 * 0 on success, -errno on failure.
233 */
234int __init numa_add_memblk(int nid, u64 start, u64 end)
235{
236 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
237}
238
239/* Initialize bootmem allocator for a node */
240void __init
241setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
242{
243 unsigned long start_pfn, last_pfn, nodedata_phys;
244 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
245 int nid;
246
247 if (!end)
248 return;
249
250 /*
251 * Don't confuse VM with a node that doesn't have the
252 * minimum amount of memory:
253 */
254 if (end && (end - start) < NODE_MIN_SIZE)
255 return;
256
257 start = roundup(start, ZONE_ALIGN);
258
259 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
260 start, end);
261
262 start_pfn = start >> PAGE_SHIFT;
263 last_pfn = end >> PAGE_SHIFT;
264
265 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
266 SMP_CACHE_BYTES);
267 if (node_data[nodeid] == NULL)
268 return;
269 nodedata_phys = __pa(node_data[nodeid]);
270 memblock_x86_reserve_range(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
271 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
272 nodedata_phys + pgdat_size - 1);
273 nid = phys_to_nid(nodedata_phys);
274 if (nid != nodeid)
275 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
276
277 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
278 NODE_DATA(nodeid)->node_id = nodeid;
279 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
280 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
281
282 node_set_online(nodeid);
283}
284
285/**
286 * numa_cleanup_meminfo - Cleanup a numa_meminfo
287 * @mi: numa_meminfo to clean up
288 *
289 * Sanitize @mi by merging and removing unncessary memblks. Also check for
290 * conflicts and clear unused memblks.
291 *
292 * RETURNS:
293 * 0 on success, -errno on failure.
294 */
295int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
296{
297 const u64 low = 0;
298 const u64 high = (u64)max_pfn << PAGE_SHIFT;
299 int i, j, k;
300
301 for (i = 0; i < mi->nr_blks; i++) {
302 struct numa_memblk *bi = &mi->blk[i];
303
304 /* make sure all blocks are inside the limits */
305 bi->start = max(bi->start, low);
306 bi->end = min(bi->end, high);
307
308 /* and there's no empty block */
309 if (bi->start >= bi->end) {
310 numa_remove_memblk_from(i--, mi);
311 continue;
312 }
313
314 for (j = i + 1; j < mi->nr_blks; j++) {
315 struct numa_memblk *bj = &mi->blk[j];
316 unsigned long start, end;
317
318 /*
319 * See whether there are overlapping blocks. Whine
320 * about but allow overlaps of the same nid. They
321 * will be merged below.
322 */
323 if (bi->end > bj->start && bi->start < bj->end) {
324 if (bi->nid != bj->nid) {
325 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
326 bi->nid, bi->start, bi->end,
327 bj->nid, bj->start, bj->end);
328 return -EINVAL;
329 }
330 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
331 bi->nid, bi->start, bi->end,
332 bj->start, bj->end);
333 }
334
335 /*
336 * Join together blocks on the same node, holes
337 * between which don't overlap with memory on other
338 * nodes.
339 */
340 if (bi->nid != bj->nid)
341 continue;
342 start = max(min(bi->start, bj->start), low);
343 end = min(max(bi->end, bj->end), high);
344 for (k = 0; k < mi->nr_blks; k++) {
345 struct numa_memblk *bk = &mi->blk[k];
346
347 if (bi->nid == bk->nid)
348 continue;
349 if (start < bk->end && end > bk->start)
350 break;
351 }
352 if (k < mi->nr_blks)
353 continue;
354 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
355 bi->nid, bi->start, bi->end, bj->start, bj->end,
356 start, end);
357 bi->start = start;
358 bi->end = end;
359 numa_remove_memblk_from(j--, mi);
360 }
361 }
362
363 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
364 mi->blk[i].start = mi->blk[i].end = 0;
365 mi->blk[i].nid = NUMA_NO_NODE;
366 }
367
368 return 0;
369}
370
371/*
372 * Set nodes, which have memory in @mi, in *@nodemask.
373 */
374static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
375 const struct numa_meminfo *mi)
376{
377 int i;
378
379 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
380 if (mi->blk[i].start != mi->blk[i].end &&
381 mi->blk[i].nid != NUMA_NO_NODE)
382 node_set(mi->blk[i].nid, *nodemask);
383}
384
385/**
386 * numa_reset_distance - Reset NUMA distance table
387 *
388 * The current table is freed. The next numa_set_distance() call will
389 * create a new one.
390 */
391void __init numa_reset_distance(void)
392{
393 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
394
395 /* numa_distance could be 1LU marking allocation failure, test cnt */
396 if (numa_distance_cnt)
397 memblock_x86_free_range(__pa(numa_distance),
398 __pa(numa_distance) + size);
399 numa_distance_cnt = 0;
400 numa_distance = NULL; /* enable table creation */
401}
402
403static int __init numa_alloc_distance(void)
404{
405 nodemask_t nodes_parsed;
406 size_t size;
407 int i, j, cnt = 0;
408 u64 phys;
409
410 /* size the new table and allocate it */
411 nodes_parsed = numa_nodes_parsed;
412 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
413
414 for_each_node_mask(i, nodes_parsed)
415 cnt = i;
416 cnt++;
417 size = cnt * cnt * sizeof(numa_distance[0]);
418
419 phys = memblock_find_in_range(0, (u64)max_pfn_mapped << PAGE_SHIFT,
420 size, PAGE_SIZE);
421 if (phys == MEMBLOCK_ERROR) {
422 pr_warning("NUMA: Warning: can't allocate distance table!\n");
423 /* don't retry until explicitly reset */
424 numa_distance = (void *)1LU;
425 return -ENOMEM;
426 }
427 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
428
429 numa_distance = __va(phys);
430 numa_distance_cnt = cnt;
431
432 /* fill with the default distances */
433 for (i = 0; i < cnt; i++)
434 for (j = 0; j < cnt; j++)
435 numa_distance[i * cnt + j] = i == j ?
436 LOCAL_DISTANCE : REMOTE_DISTANCE;
437 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
438
439 return 0;
440}
441
442/**
443 * numa_set_distance - Set NUMA distance from one NUMA to another
444 * @from: the 'from' node to set distance
445 * @to: the 'to' node to set distance
446 * @distance: NUMA distance
447 *
448 * Set the distance from node @from to @to to @distance. If distance table
449 * doesn't exist, one which is large enough to accommodate all the currently
450 * known nodes will be created.
451 *
452 * If such table cannot be allocated, a warning is printed and further
453 * calls are ignored until the distance table is reset with
454 * numa_reset_distance().
455 *
456 * If @from or @to is higher than the highest known node at the time of
457 * table creation or @distance doesn't make sense, the call is ignored.
458 * This is to allow simplification of specific NUMA config implementations.
459 */
460void __init numa_set_distance(int from, int to, int distance)
461{
462 if (!numa_distance && numa_alloc_distance() < 0)
463 return;
464
465 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
466 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
467 from, to, distance);
468 return;
469 }
470
471 if ((u8)distance != distance ||
472 (from == to && distance != LOCAL_DISTANCE)) {
473 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
474 from, to, distance);
475 return;
476 }
477
478 numa_distance[from * numa_distance_cnt + to] = distance;
479}
480
481int __node_distance(int from, int to)
482{
483 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
484 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
485 return numa_distance[from * numa_distance_cnt + to];
486}
487EXPORT_SYMBOL(__node_distance);
488
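As the numa_set_distance() comment above describes, the distance table is a flat cnt*cnt byte array indexed as from * cnt + to, prefilled with LOCAL_DISTANCE on the diagonal and REMOTE_DISTANCE elsewhere, and out-of-range lookups fall back to those defaults. A self-contained model of that layout, assuming the conventional SLIT values 10 and 20 for the two constants:

#include <stdio.h>

#define LOCAL_DISTANCE  10	/* conventional SLIT values, assumed here */
#define REMOTE_DISTANCE 20
#define MAX_NODES       4

static unsigned char dist[MAX_NODES * MAX_NODES];

static void dist_init(int cnt)
{
	int i, j;

	for (i = 0; i < cnt; i++)
		for (j = 0; j < cnt; j++)
			dist[i * cnt + j] = (i == j) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
}

static int node_distance(int cnt, int from, int to)
{
	if (from >= cnt || to >= cnt)
		return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
	return dist[from * cnt + to];
}

int main(void)
{
	dist_init(MAX_NODES);
	dist[0 * MAX_NODES + 2] = 31;	/* like numa_set_distance(0, 2, 31) */
	printf("%d %d\n", node_distance(MAX_NODES, 0, 2), node_distance(MAX_NODES, 1, 1));
	return 0;
}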
489/*
490 * Sanity check to catch more bad NUMA configurations (they are amazingly
491 * common). Make sure the nodes cover all memory.
492 */
493static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
494{
495 unsigned long numaram, e820ram;
496 int i;
497
498 numaram = 0;
499 for (i = 0; i < mi->nr_blks; i++) {
500 unsigned long s = mi->blk[i].start >> PAGE_SHIFT;
501 unsigned long e = mi->blk[i].end >> PAGE_SHIFT;
502 numaram += e - s;
503 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
504 if ((long)numaram < 0)
505 numaram = 0;
506 }
507
508 e820ram = max_pfn - (memblock_x86_hole_size(0,
509 max_pfn << PAGE_SHIFT) >> PAGE_SHIFT);
510 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
511 if ((long)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
512 printk(KERN_ERR "NUMA: nodes only cover %luMB of your %luMB e820 RAM. Not used.\n",
513 (numaram << PAGE_SHIFT) >> 20,
514 (e820ram << PAGE_SHIFT) >> 20);
515 return false;
516 }
517 return true;
518}
519
520static int __init numa_register_memblks(struct numa_meminfo *mi)
521{
522 int i, nid;
523
524 /* Account for nodes with cpus and no memory */
525 node_possible_map = numa_nodes_parsed;
526 numa_nodemask_from_meminfo(&node_possible_map, mi);
527 if (WARN_ON(nodes_empty(node_possible_map)))
528 return -EINVAL;
529
530 memnode_shift = compute_hash_shift(mi);
531 if (memnode_shift < 0) {
532 printk(KERN_ERR "NUMA: No NUMA node hash function found. Contact maintainer\n");
533 return -EINVAL;
534 }
535
536 for (i = 0; i < mi->nr_blks; i++)
537 memblock_x86_register_active_regions(mi->blk[i].nid,
538 mi->blk[i].start >> PAGE_SHIFT,
539 mi->blk[i].end >> PAGE_SHIFT);
540
541 /* for out of order entries */
542 sort_node_map();
543 if (!numa_meminfo_cover_memory(mi))
544 return -EINVAL;
545
546 /* Finally register nodes. */
547 for_each_node_mask(nid, node_possible_map) {
548 u64 start = (u64)max_pfn << PAGE_SHIFT;
549 u64 end = 0;
550
551 for (i = 0; i < mi->nr_blks; i++) {
552 if (nid != mi->blk[i].nid)
553 continue;
554 start = min(mi->blk[i].start, start);
555 end = max(mi->blk[i].end, end);
556 }
557
558 if (start < end)
559 setup_node_bootmem(nid, start, end);
560 }
561
562 return 0;
563}
564
565/**
566 * dummy_numma_init - Fallback dummy NUMA init
567 *
568 * Used if there's no underlying NUMA architecture, NUMA initialization
569 * fails, or NUMA is disabled on the command line.
570 *
571 * Must online at least one node and add memory blocks that cover all
572 * allowed memory. This function must not fail.
573 */
574static int __init dummy_numa_init(void)
575{
576 printk(KERN_INFO "%s\n",
577 numa_off ? "NUMA turned off" : "No NUMA configuration found");
578 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
579 0LU, max_pfn << PAGE_SHIFT);
580
581 node_set(0, numa_nodes_parsed);
582 numa_add_memblk(0, 0, (u64)max_pfn << PAGE_SHIFT);
583
584 return 0;
585}
586
587static int __init numa_init(int (*init_func)(void))
588{
589 int i;
590 int ret;
591
592 for (i = 0; i < MAX_LOCAL_APIC; i++)
593 set_apicid_to_node(i, NUMA_NO_NODE);
594
595 nodes_clear(numa_nodes_parsed);
596 nodes_clear(node_possible_map);
597 nodes_clear(node_online_map);
598 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
599 remove_all_active_ranges();
600 numa_reset_distance();
601
602 ret = init_func();
603 if (ret < 0)
604 return ret;
605 ret = numa_cleanup_meminfo(&numa_meminfo);
606 if (ret < 0)
607 return ret;
608
609 numa_emulation(&numa_meminfo, numa_distance_cnt);
610
611 ret = numa_register_memblks(&numa_meminfo);
612 if (ret < 0)
613 return ret;
614
615 for (i = 0; i < nr_cpu_ids; i++) {
616 int nid = early_cpu_to_node(i);
617
618 if (nid == NUMA_NO_NODE)
619 continue;
620 if (!node_online(nid))
621 numa_clear_node(i);
622 }
623 numa_init_array();
624 return 0;
625}
626
627void __init initmem_init(void) 9void __init initmem_init(void)
628{ 10{
629 int ret; 11 x86_numa_init();
630
631 if (!numa_off) {
632#ifdef CONFIG_ACPI_NUMA
633 ret = numa_init(x86_acpi_numa_init);
634 if (!ret)
635 return;
636#endif
637#ifdef CONFIG_AMD_NUMA
638 ret = numa_init(amd_numa_init);
639 if (!ret)
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645} 12}
646 13
647unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -656,12 +23,3 @@ unsigned long __init numa_free_all_bootmem(void)
656 23
657 return pages; 24 return pages;
658} 25}
659
660int __cpuinit numa_cpu_node(int cpu)
661{
662 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
663
664 if (apicid != BAD_APICID)
665 return __apicid_to_node[apicid];
666 return NUMA_NO_NODE;
667}
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
index de84cc140379..d0ed086b6247 100644
--- a/arch/x86/mm/numa_emulation.c
+++ b/arch/x86/mm/numa_emulation.c
@@ -5,6 +5,7 @@
5#include <linux/errno.h> 5#include <linux/errno.h>
6#include <linux/topology.h> 6#include <linux/topology.h>
7#include <linux/memblock.h> 7#include <linux/memblock.h>
8#include <linux/bootmem.h>
8#include <asm/dma.h> 9#include <asm/dma.h>
9 10
10#include "numa_internal.h" 11#include "numa_internal.h"
@@ -84,7 +85,13 @@ static int __init split_nodes_interleave(struct numa_meminfo *ei,
84 nr_nodes = MAX_NUMNODES; 85 nr_nodes = MAX_NUMNODES;
85 } 86 }
86 87
87 size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) / nr_nodes; 88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
88 /* 95 /*
89 * Calculate the number of big nodes that can be allocated as a result 96 * Calculate the number of big nodes that can be allocated as a result
90 * of consolidating the remainder. 97 * of consolidating the remainder.
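The new comment above notes that dividing a u64 directly would drag __udivdi3() into 32-bit builds, so the span is shifted down to pages (which fit an unsigned long), divided, and converted back with PFN_PHYS(). A trivial stand-alone illustration of that round trip, with an arbitrary 7GB span assumed:

#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PFN_PHYS(pfn) ((uint64_t)(pfn) << PAGE_SHIFT)

int main(void)
{
	uint64_t span = 7ULL << 30;		/* 7GB of usable address space */
	int nr_nodes = 3;
	/* divide in pages (fits unsigned long even on 32-bit), then convert back */
	uint64_t node_size = PFN_PHYS((unsigned long)(span >> PAGE_SHIFT) / nr_nodes);

	printf("%llu MB per emulated node\n", (unsigned long long)(node_size >> 20));
	return 0;
}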
@@ -226,7 +233,7 @@ static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
226 */ 233 */
227 while (nodes_weight(physnode_mask)) { 234 while (nodes_weight(physnode_mask)) {
228 for_each_node_mask(i, physnode_mask) { 235 for_each_node_mask(i, physnode_mask) {
229 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT; 236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
230 u64 start, limit, end; 237 u64 start, limit, end;
231 int phys_blk; 238 int phys_blk;
232 239
@@ -298,7 +305,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
298{ 305{
299 static struct numa_meminfo ei __initdata; 306 static struct numa_meminfo ei __initdata;
300 static struct numa_meminfo pi __initdata; 307 static struct numa_meminfo pi __initdata;
301 const u64 max_addr = max_pfn << PAGE_SHIFT; 308 const u64 max_addr = PFN_PHYS(max_pfn);
302 u8 *phys_dist = NULL; 309 u8 *phys_dist = NULL;
303 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]); 310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
304 int max_emu_nid, dfl_phys_nid; 311 int max_emu_nid, dfl_phys_nid;
@@ -342,8 +349,7 @@ void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
342 if (numa_dist_cnt) { 349 if (numa_dist_cnt) {
343 u64 phys; 350 u64 phys;
344 351
345 phys = memblock_find_in_range(0, 352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
346 (u64)max_pfn_mapped << PAGE_SHIFT,
347 phys_size, PAGE_SIZE); 353 phys_size, PAGE_SIZE);
348 if (phys == MEMBLOCK_ERROR) { 354 if (phys == MEMBLOCK_ERROR) {
349 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n"); 355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
index ef2d97377d7c..7178c3afe05e 100644
--- a/arch/x86/mm/numa_internal.h
+++ b/arch/x86/mm/numa_internal.h
@@ -19,6 +19,14 @@ void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi); 19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void); 20void __init numa_reset_distance(void);
21 21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
22#ifdef CONFIG_NUMA_EMU 30#ifdef CONFIG_NUMA_EMU
23void __init numa_emulation(struct numa_meminfo *numa_meminfo, 31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
24 int numa_dist_cnt); 32 int numa_dist_cnt);
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c497..9f0614daea85 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
414 unsigned char *p; 414 unsigned char *p;
415 struct prefix_bits prf; 415 struct prefix_bits prf;
416 int i; 416 int i;
417 unsigned long rv;
418 417
419 p = (unsigned char *)ins_addr; 418 p = (unsigned char *)ins_addr;
420 p += skip_prefix(p, &prf); 419 p += skip_prefix(p, &prf);
421 p += get_opcode(p, &opcode); 420 p += get_opcode(p, &opcode);
422 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 421 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
423 if (reg_rop[i] == opcode) { 422 if (reg_rop[i] == opcode)
424 rv = REG_READ;
425 goto do_work; 423 goto do_work;
426 }
427 424
428 for (i = 0; i < ARRAY_SIZE(reg_wop); i++) 425 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
429 if (reg_wop[i] == opcode) { 426 if (reg_wop[i] == opcode)
430 rv = REG_WRITE;
431 goto do_work; 427 goto do_work;
432 }
433 428
434 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " 429 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
435 "0x%02x\n", opcode); 430 "0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
474 unsigned char *p; 469 unsigned char *p;
475 struct prefix_bits prf; 470 struct prefix_bits prf;
476 int i; 471 int i;
477 unsigned long rv;
478 472
479 p = (unsigned char *)ins_addr; 473 p = (unsigned char *)ins_addr;
480 p += skip_prefix(p, &prf); 474 p += skip_prefix(p, &prf);
481 p += get_opcode(p, &opcode); 475 p += get_opcode(p, &opcode);
482 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 476 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
483 if (imm_wop[i] == opcode) { 477 if (imm_wop[i] == opcode)
484 rv = IMM_WRITE;
485 goto do_work; 478 goto do_work;
486 }
487 479
488 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " 480 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
489 "0x%02x\n", opcode); 481 "0x%02x\n", opcode);
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat.c
index 8e9d3394f6d4..81dbfdeb080d 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat.c
@@ -26,8 +26,6 @@
26 26
27int acpi_numa __initdata; 27int acpi_numa __initdata;
28 28
29static struct bootnode nodes_add[MAX_NUMNODES];
30
31static __init int setup_node(int pxm) 29static __init int setup_node(int pxm)
32{ 30{
33 return acpi_map_pxm_to_node(pxm); 31 return acpi_map_pxm_to_node(pxm);
@@ -37,7 +35,6 @@ static __init void bad_srat(void)
37{ 35{
38 printk(KERN_ERR "SRAT: SRAT not used.\n"); 36 printk(KERN_ERR "SRAT: SRAT not used.\n");
39 acpi_numa = -1; 37 acpi_numa = -1;
40 memset(nodes_add, 0, sizeof(nodes_add));
41} 38}
42 39
43static __init inline int srat_disabled(void) 40static __init inline int srat_disabled(void)
@@ -131,73 +128,17 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
131 pxm, apic_id, node); 128 pxm, apic_id, node);
132} 129}
133 130
134#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE 131#ifdef CONFIG_MEMORY_HOTPLUG
135static inline int save_add_info(void) {return 1;} 132static inline int save_add_info(void) {return 1;}
136#else 133#else
137static inline int save_add_info(void) {return 0;} 134static inline int save_add_info(void) {return 0;}
138#endif 135#endif
139/*
140 * Update nodes_add[]
141 * This code supports one contiguous hot add area per node
142 */
143static void __init
144update_nodes_add(int node, unsigned long start, unsigned long end)
145{
146 unsigned long s_pfn = start >> PAGE_SHIFT;
147 unsigned long e_pfn = end >> PAGE_SHIFT;
148 int changed = 0;
149 struct bootnode *nd = &nodes_add[node];
150
151 /* I had some trouble with strange memory hotadd regions breaking
152 the boot. Be very strict here and reject anything unexpected.
153 If you want working memory hotadd write correct SRATs.
154
155 The node size check is a basic sanity check to guard against
156 mistakes */
157 if ((signed long)(end - start) < NODE_MIN_SIZE) {
158 printk(KERN_ERR "SRAT: Hotplug area too small\n");
159 return;
160 }
161
162 /* This check might be a bit too strict, but I'm keeping it for now. */
163 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
164 printk(KERN_ERR
165 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
166 s_pfn, e_pfn);
167 return;
168 }
169
170 /* Looks good */
171
172 if (nd->start == nd->end) {
173 nd->start = start;
174 nd->end = end;
175 changed = 1;
176 } else {
177 if (nd->start == end) {
178 nd->start = start;
179 changed = 1;
180 }
181 if (nd->end == start) {
182 nd->end = end;
183 changed = 1;
184 }
185 if (!changed)
186 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
187 }
188
189 if (changed) {
190 node_set(node, numa_nodes_parsed);
191 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
192 nd->start, nd->end);
193 }
194}
195 136
196/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
197void __init 138void __init
198acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) 139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
199{ 140{
200 unsigned long start, end; 141 u64 start, end;
201 int node, pxm; 142 int node, pxm;
202 143
203 if (srat_disabled()) 144 if (srat_disabled())
@@ -226,11 +167,8 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
226 return; 167 return;
227 } 168 }
228 169
229 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
230 start, end); 171 start, end);
231
232 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE)
233 update_nodes_add(node, start, end);
234} 172}
235 173
236void __init acpi_numa_arch_fixup(void) {} 174void __init acpi_numa_arch_fixup(void) {}
@@ -244,17 +182,3 @@ int __init x86_acpi_numa_init(void)
244 return ret; 182 return ret;
245 return srat_disabled() ? -EINVAL : 0; 183 return srat_disabled() ? -EINVAL : 0;
246} 184}
247
248#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
249int memory_add_physaddr_to_nid(u64 start)
250{
251 int i, ret = 0;
252
253 for_each_node(i)
254 if (nodes_add[i].start <= start && nodes_add[i].end > start)
255 ret = i;
256
257 return ret;
258}
259EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
260#endif
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 364f36bdfad8..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,288 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/memblock.h>
29#include <linux/mmzone.h>
30#include <linux/acpi.h>
31#include <linux/nodemask.h>
32#include <asm/srat.h>
33#include <asm/topology.h>
34#include <asm/smp.h>
35#include <asm/e820.h>
36
37/*
38 * proximity macros and definitions
39 */
40#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
41#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
42#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
43#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
44/* bitmap length; _PXM is at most 255 */
45#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
46static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
47
48#define MAX_CHUNKS_PER_NODE 3
49#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
50struct node_memory_chunk_s {
51 unsigned long start_pfn;
52 unsigned long end_pfn;
53 u8 pxm; // proximity domain of node
54 u8 nid; // which cnode contains this chunk?
55 u8 bank; // which mem bank on this node
56};
57static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
58
59static int __initdata num_memory_chunks; /* total number of memory chunks */
60static u8 __initdata apicid_to_pxm[MAX_LOCAL_APIC];
61
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 /* don't need to check apic_id here, because it is always 8 bits */
95 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
96
97 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
98 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
99}
100
101/*
102 * Identify memory proximity domains and hot-remove capabilities.
103 * Fill node memory chunk list structure.
104 */
105void __init
106acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
107{
108 unsigned long long paddr, size;
109 unsigned long start_pfn, end_pfn;
110 u8 pxm;
111 struct node_memory_chunk_s *p, *q, *pend;
112
113 if (srat_disabled())
114 return;
115 if (memory_affinity->header.length !=
116 sizeof(struct acpi_srat_mem_affinity)) {
117 bad_srat();
118 return;
119 }
120
121 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
122 return; /* empty entry */
123
124 pxm = memory_affinity->proximity_domain & 0xff;
125
126 /* mark this node as "seen" in node bitmap */
127 BMAP_SET(pxm_bitmap, pxm);
128
129 /* calculate info for memory chunk structure */
130 paddr = memory_affinity->base_address;
131 size = memory_affinity->length;
132
133 start_pfn = paddr >> PAGE_SHIFT;
134 end_pfn = (paddr + size) >> PAGE_SHIFT;
135
136
137 if (num_memory_chunks >= MAXCHUNKS) {
138 printk(KERN_WARNING "Too many mem chunks in SRAT."
139 " Ignoring %lld MBytes at %llx\n",
140 size/(1024*1024), paddr);
141 return;
142 }
143
144 /* Insertion sort based on base address */
145 pend = &node_memory_chunk[num_memory_chunks];
146 for (p = &node_memory_chunk[0]; p < pend; p++) {
147 if (start_pfn < p->start_pfn)
148 break;
149 }
150 if (p < pend) {
151 for (q = pend; q >= p; q--)
152 *(q + 1) = *q;
153 }
154 p->start_pfn = start_pfn;
155 p->end_pfn = end_pfn;
156 p->pxm = pxm;
157
158 num_memory_chunks++;
159
160 printk(KERN_DEBUG "Memory range %08lx to %08lx"
161 " in proximity domain %02x %s\n",
162 start_pfn, end_pfn,
163 pxm,
164 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
165 "enabled and removable" : "enabled" ) );
166}
167
168/* Callback for SLIT parsing */
169void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
170{
171}
172
173void acpi_numa_arch_fixup(void)
174{
175}
176/*
177 * The SRAT table always lists ascending addresses, so can always
178 * assume that the first "start" address that you see is the real
179 * start of the node, and that the current "end" address is after
180 * the previous one.
181 */
182static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
183{
184 /*
185 * Only add present memory as told by the e820.
186 * There is no guarantee from the SRAT that the memory it
187 * enumerates is present at boot time because it represents
188 * *possible* memory hotplug areas the same as normal RAM.
189 */
190 if (memory_chunk->start_pfn >= max_pfn) {
191 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
192 memory_chunk->start_pfn, memory_chunk->end_pfn);
193 return -1;
194 }
195 if (memory_chunk->nid != nid)
196 return -1;
197
198 if (!node_has_online_mem(nid))
199 node_start_pfn[nid] = memory_chunk->start_pfn;
200
201 if (node_start_pfn[nid] > memory_chunk->start_pfn)
202 node_start_pfn[nid] = memory_chunk->start_pfn;
203
204 if (node_end_pfn[nid] < memory_chunk->end_pfn)
205 node_end_pfn[nid] = memory_chunk->end_pfn;
206
207 return 0;
208}
209
210int __init get_memcfg_from_srat(void)
211{
212 int i, j, nid;
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (acpi_numa_init() < 0)
218 goto out_fail;
219
220 if (num_memory_chunks == 0) {
221 printk(KERN_DEBUG
222 "could not find any ACPI SRAT memory areas.\n");
223 goto out_fail;
224 }
225
226 /* Calculate total number of nodes in system from PXM bitmap and create
227 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
228 * to specify the range of _PXM values.)
229 */
230 /*
231 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
232 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
233 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
234 * approaches MAX_PXM_DOMAINS for i386.
235 */
236 nodes_clear(node_online_map);
237 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
238 if (BMAP_TEST(pxm_bitmap, i)) {
239 int nid = acpi_map_pxm_to_node(i);
240 node_set_online(nid);
241 }
242 }
243 BUG_ON(num_online_nodes() == 0);
244
245 /* set cnode id in memory chunk structure */
246 for (i = 0; i < num_memory_chunks; i++)
247 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
248
249 printk(KERN_DEBUG "pxm bitmap: ");
250 for (i = 0; i < sizeof(pxm_bitmap); i++) {
251 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
252 }
253 printk(KERN_CONT "\n");
254 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
255 num_online_nodes());
256 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
257 num_memory_chunks);
258
259 for (i = 0; i < MAX_LOCAL_APIC; i++)
260 set_apicid_to_node(i, pxm_to_node(apicid_to_pxm[i]));
261
262 for (j = 0; j < num_memory_chunks; j++){
263 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
264 printk(KERN_DEBUG
265 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
266 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
267 if (node_read_chunk(chunk->nid, chunk))
268 continue;
269
270 memblock_x86_register_active_regions(chunk->nid, chunk->start_pfn,
271 min(chunk->end_pfn, max_pfn));
272 }
273 /* for out of order entries in SRAT */
274 sort_node_map();
275
276 for_each_online_node(nid) {
277 unsigned long start = node_start_pfn[nid];
278 unsigned long end = min(node_end_pfn[nid], max_pfn);
279
280 memory_present(nid, start, end);
281 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
282 }
283 return 1;
284out_fail:
285 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
286 " table\n");
287 return 0;
288}
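As an aside, the insertion step above can be exercised in isolation; the following is a minimal userspace sketch of the same sort-by-base-address logic. struct chunk, MAXCHUNKS and insert_chunk() are illustrative names, not the kernel's definitions.

/* Userspace sketch: insertion sort of memory chunks by base address,
 * mirroring the loop in acpi_numa_memory_affinity_init() above. */
#include <stdio.h>

#define MAXCHUNKS 8

struct chunk {
	unsigned long start_pfn;
	unsigned long end_pfn;
};

static struct chunk chunks[MAXCHUNKS + 1];	/* +1: the shift loop writes one past the end */
static int num_chunks;

static void insert_chunk(unsigned long start_pfn, unsigned long end_pfn)
{
	struct chunk *p, *q, *pend = &chunks[num_chunks];

	if (num_chunks >= MAXCHUNKS)
		return;
	for (p = &chunks[0]; p < pend; p++)	/* find the first chunk that starts above us */
		if (start_pfn < p->start_pfn)
			break;
	if (p < pend)				/* shift the tail up by one slot */
		for (q = pend; q >= p; q--)
			*(q + 1) = *q;
	p->start_pfn = start_pfn;
	p->end_pfn = end_pfn;
	num_chunks++;
}

int main(void)
{
	int i;

	insert_chunk(0x200, 0x300);
	insert_chunk(0x000, 0x100);
	insert_chunk(0x100, 0x200);
	for (i = 0; i < num_chunks; i++)	/* prints the chunks in ascending order */
		printf("chunk %d: %#lx-%#lx\n", i,
		       chunks[i].start_pfn, chunks[i].end_pfn);
	return 0;
}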
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 000000000000..90568c33ddb0
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,4 @@
1#
2# Arch-specific network modules
3#
4obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
new file mode 100644
index 000000000000..66870223f8c5
--- /dev/null
+++ b/arch/x86/net/bpf_jit.S
@@ -0,0 +1,140 @@
1/* bpf_jit.S : BPF JIT helper functions
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/linkage.h>
11#include <asm/dwarf2.h>
12
13/*
14 * Calling convention :
15 * rdi : skb pointer
16 * esi : offset of byte(s) to fetch in skb (can be scratched)
17 * r8 : copy of skb->data
18 * r9d : hlen = skb->len - skb->data_len
19 */
20#define SKBDATA %r8
21
22sk_load_word_ind:
23 .globl sk_load_word_ind
24
25 add %ebx,%esi /* offset += X */
26# test %esi,%esi /* if (offset < 0) goto bpf_error; */
27 js bpf_error
28
29sk_load_word:
30 .globl sk_load_word
31
32 mov %r9d,%eax # hlen
33 sub %esi,%eax # hlen - offset
34 cmp $3,%eax
35 jle bpf_slow_path_word
36 mov (SKBDATA,%rsi),%eax
37 bswap %eax /* ntohl() */
38 ret
39
40
41sk_load_half_ind:
42 .globl sk_load_half_ind
43
44 add %ebx,%esi /* offset += X */
45 js bpf_error
46
47sk_load_half:
48 .globl sk_load_half
49
50 mov %r9d,%eax
51 sub %esi,%eax # hlen - offset
52 cmp $1,%eax
53 jle bpf_slow_path_half
54 movzwl (SKBDATA,%rsi),%eax
55 rol $8,%ax # ntohs()
56 ret
57
58sk_load_byte_ind:
59 .globl sk_load_byte_ind
60 add %ebx,%esi /* offset += X */
61 js bpf_error
62
63sk_load_byte:
64 .globl sk_load_byte
65
66 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
67 jle bpf_slow_path_byte
68 movzbl (SKBDATA,%rsi),%eax
69 ret
70
71/**
72 * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
73 *
74 * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
75 * Must preserve A accumulator (%eax)
76 * Inputs : %esi is the offset value, already known positive
77 */
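/* Illustrative note: the classic use of this helper is the BPF idiom
 * "ldxb 4*([14]&0xf)" - with offset pointing at the first byte of an IPv4
 * header, the low nibble is the IHL field, so 4*(byte & 0xf) is the IP
 * header length in bytes, ready to be used as X for an indirect load. */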
78ENTRY(sk_load_byte_msh)
79 CFI_STARTPROC
80 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
81 jle bpf_slow_path_byte_msh
82 movzbl (SKBDATA,%rsi),%ebx
83 and $15,%bl
84 shl $2,%bl
85 ret
86 CFI_ENDPROC
87ENDPROC(sk_load_byte_msh)
88
89bpf_error:
90# force a return 0 from jit handler
91 xor %eax,%eax
92 mov -8(%rbp),%rbx
93 leaveq
94 ret
95
96/* rsi contains offset and can be scratched */
97#define bpf_slow_path_common(LEN) \
98 push %rdi; /* save skb */ \
99 push %r9; \
100 push SKBDATA; \
101/* rsi already has offset */ \
102 mov $LEN,%ecx; /* len */ \
103 lea -12(%rbp),%rdx; \
104 call skb_copy_bits; \
105 test %eax,%eax; \
106 pop SKBDATA; \
107 pop %r9; \
108 pop %rdi
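/* The common slow path saves the registers the JIT keeps live (skb in %rdi,
 * hlen in %r9, skb->data in %r8), asks skb_copy_bits() to write the requested
 * bytes to the scratch slot at -12(%rbp), and leaves the sign flag set on
 * failure so the "js bpf_error" at each call site takes the error path. */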
109
110
111bpf_slow_path_word:
112 bpf_slow_path_common(4)
113 js bpf_error
114 mov -12(%rbp),%eax
115 bswap %eax
116 ret
117
118bpf_slow_path_half:
119 bpf_slow_path_common(2)
120 js bpf_error
121 mov -12(%rbp),%ax
122 rol $8,%ax
123 movzwl %ax,%eax
124 ret
125
126bpf_slow_path_byte:
127 bpf_slow_path_common(1)
128 js bpf_error
129 movzbl -12(%rbp),%eax
130 ret
131
132bpf_slow_path_byte_msh:
133	xchg	%eax,%ebx /* don't lose A, X is about to be scratched */
134 bpf_slow_path_common(1)
135 js bpf_error
136 movzbl -12(%rbp),%eax
137 and $15,%al
138 shl $2,%al
139 xchg %eax,%ebx
140 ret
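To make the fast path above concrete, here is a rough C rendering of what sk_load_word does when the requested bytes fit in the skb's linear header; load_word(), pkt[] and the -1 error return are illustrative only, and the real slow path calls skb_copy_bits() rather than failing.

/* Sketch of the sk_load_word fast path: bounds check against hlen, direct
 * load, then byte swap to host order. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <arpa/inet.h>

static int load_word(const uint8_t *data, int hlen, int offset, uint32_t *res)
{
	uint32_t v;

	if (offset < 0 || hlen - offset < 4)
		return -1;			/* the JIT branches to bpf_slow_path_word here */
	memcpy(&v, data + offset, 4);		/* mov (SKBDATA,%rsi),%eax */
	*res = ntohl(v);			/* bswap %eax */
	return 0;
}

int main(void)
{
	const uint8_t pkt[] = { 0x12, 0x34, 0x56, 0x78, 0x9a, 0xbc };
	uint32_t w;

	if (load_word(pkt, sizeof(pkt), 2, &w) == 0)
		printf("word at offset 2 = 0x%08x\n", (unsigned)w);	/* 0x56789abc */
	return 0;
}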
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 000000000000..bfab3fa10edc
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,654 @@
1/* bpf_jit_comp.c : BPF JIT compiler
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/moduleloader.h>
11#include <asm/cacheflush.h>
12#include <linux/netdevice.h>
13#include <linux/filter.h>
14
15/*
16 * Conventions :
17 * EAX : BPF A accumulator
18 * EBX : BPF X accumulator
19 * RDI : pointer to skb (first argument given to JIT function)
20 * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
21 * ECX,EDX,ESI : scratch registers
22 * r9d : skb->len - skb->data_len (headlen)
23 * r8 : skb->data
24 * -8(RBP) : saved RBX value
25 * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
26 */
27int bpf_jit_enable __read_mostly;
28
29/*
30 * assembly code in arch/x86/net/bpf_jit.S
31 */
32extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
33extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
34
35static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
36{
37 if (len == 1)
38 *ptr = bytes;
39 else if (len == 2)
40 *(u16 *)ptr = bytes;
41 else {
42 *(u32 *)ptr = bytes;
43 barrier();
44 }
45 return ptr + len;
46}
47
48#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
49
50#define EMIT1(b1) EMIT(b1, 1)
51#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
52#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
53#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
54#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
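/* For example, EMIT3(0x83, 0xc0, K) appends the three bytes 83 c0 K to the
 * image ("add $K,%eax" with an 8-bit immediate), while EMIT1_off32(0x05, K)
 * appends opcode 05 followed by K as a little-endian 32-bit immediate
 * ("add $K,%eax" with imm32), as used in the BPF_S_ALU_ADD_K case below. */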
55
56#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
57#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
58
59static inline bool is_imm8(int value)
60{
61 return value <= 127 && value >= -128;
62}
63
64static inline bool is_near(int offset)
65{
66 return offset <= 127 && offset >= -128;
67}
68
69#define EMIT_JMP(offset) \
70do { \
71 if (offset) { \
72 if (is_near(offset)) \
73 EMIT2(0xeb, offset); /* jmp .+off8 */ \
74 else \
75 EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
76 } \
77} while (0)
78
79/* list of x86 conditional jump opcodes (. + s8)
80 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
81 */
82#define X86_JB 0x72
83#define X86_JAE 0x73
84#define X86_JE 0x74
85#define X86_JNE 0x75
86#define X86_JBE 0x76
87#define X86_JA 0x77
88
89#define EMIT_COND_JMP(op, offset) \
90do { \
91 if (is_near(offset)) \
92 EMIT2(op, offset); /* jxx .+off8 */ \
93 else { \
94 EMIT2(0x0f, op + 0x10); \
95 EMIT(offset, 4); /* jxx .+off32 */ \
96 } \
97} while (0)
98
99#define COND_SEL(CODE, TOP, FOP) \
100 case CODE: \
101 t_op = TOP; \
102 f_op = FOP; \
103 goto cond_branch
104
105
106#define SEEN_DATAREF 1 /* might call external helpers */
107#define SEEN_XREG 2 /* ebx is used */
108#define SEEN_MEM 4 /* use mem[] for temporary storage */
109
110static inline void bpf_flush_icache(void *start, void *end)
111{
112 mm_segment_t old_fs = get_fs();
113
114 set_fs(KERNEL_DS);
115 smp_wmb();
116 flush_icache_range((unsigned long)start, (unsigned long)end);
117 set_fs(old_fs);
118}
119
120
121void bpf_jit_compile(struct sk_filter *fp)
122{
123 u8 temp[64];
124 u8 *prog;
125 unsigned int proglen, oldproglen = 0;
126 int ilen, i;
127 int t_offset, f_offset;
128 u8 t_op, f_op, seen = 0, pass;
129 u8 *image = NULL;
130 u8 *func;
131 int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
132 unsigned int cleanup_addr; /* epilogue code offset */
133 unsigned int *addrs;
134 const struct sock_filter *filter = fp->insns;
135 int flen = fp->len;
136
137 if (!bpf_jit_enable)
138 return;
139
140 addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
141 if (addrs == NULL)
142 return;
143
144	/* Before the first pass, make a rough estimate of addrs[]:
145	 * each BPF instruction is assumed to translate to at most 64 bytes
146 */
147 for (proglen = 0, i = 0; i < flen; i++) {
148 proglen += 64;
149 addrs[i] = proglen;
150 }
151 cleanup_addr = proglen; /* epilogue address */
152
153 for (pass = 0; pass < 10; pass++) {
154 /* no prologue/epilogue for trivial filters (RET something) */
155 proglen = 0;
156 prog = temp;
157
158 if (seen) {
159 EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
160 EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
161 /* note : must save %rbx in case bpf_error is hit */
162 if (seen & (SEEN_XREG | SEEN_DATAREF))
163 EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
164 if (seen & SEEN_XREG)
165				CLEAR_X(); /* make sure we don't leak kernel memory */
166
167 /*
168 * If this filter needs to access skb data,
169			 * load r9 and r8 with:
170 * r9 = skb->len - skb->data_len
171 * r8 = skb->data
172 */
173 if (seen & SEEN_DATAREF) {
174 if (offsetof(struct sk_buff, len) <= 127)
175 /* mov off8(%rdi),%r9d */
176 EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
177 else {
178 /* mov off32(%rdi),%r9d */
179 EMIT3(0x44, 0x8b, 0x8f);
180 EMIT(offsetof(struct sk_buff, len), 4);
181 }
182 if (is_imm8(offsetof(struct sk_buff, data_len)))
183 /* sub off8(%rdi),%r9d */
184 EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
185 else {
186 EMIT3(0x44, 0x2b, 0x8f);
187 EMIT(offsetof(struct sk_buff, data_len), 4);
188 }
189
190 if (is_imm8(offsetof(struct sk_buff, data)))
191 /* mov off8(%rdi),%r8 */
192 EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
193 else {
194 /* mov off32(%rdi),%r8 */
195 EMIT3(0x4c, 0x8b, 0x87);
196 EMIT(offsetof(struct sk_buff, data), 4);
197 }
198 }
199 }
200
201 switch (filter[0].code) {
202 case BPF_S_RET_K:
203 case BPF_S_LD_W_LEN:
204 case BPF_S_ANC_PROTOCOL:
205 case BPF_S_ANC_IFINDEX:
206 case BPF_S_ANC_MARK:
207 case BPF_S_ANC_RXHASH:
208 case BPF_S_ANC_CPU:
209 case BPF_S_ANC_QUEUE:
210 case BPF_S_LD_W_ABS:
211 case BPF_S_LD_H_ABS:
212 case BPF_S_LD_B_ABS:
213 /* first instruction sets A register (or is RET 'constant') */
214 break;
215 default:
216			/* make sure we don't leak kernel information to userspace */
217 CLEAR_A(); /* A = 0 */
218 }
219
220 for (i = 0; i < flen; i++) {
221 unsigned int K = filter[i].k;
222
223 switch (filter[i].code) {
224 case BPF_S_ALU_ADD_X: /* A += X; */
225 seen |= SEEN_XREG;
226 EMIT2(0x01, 0xd8); /* add %ebx,%eax */
227 break;
228 case BPF_S_ALU_ADD_K: /* A += K; */
229 if (!K)
230 break;
231 if (is_imm8(K))
232 EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
233 else
234 EMIT1_off32(0x05, K); /* add imm32,%eax */
235 break;
236 case BPF_S_ALU_SUB_X: /* A -= X; */
237 seen |= SEEN_XREG;
238 EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
239 break;
240 case BPF_S_ALU_SUB_K: /* A -= K */
241 if (!K)
242 break;
243 if (is_imm8(K))
244 EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
245 else
246 EMIT1_off32(0x2d, K); /* sub imm32,%eax */
247 break;
248 case BPF_S_ALU_MUL_X: /* A *= X; */
249 seen |= SEEN_XREG;
250 EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
251 break;
252 case BPF_S_ALU_MUL_K: /* A *= K */
253 if (is_imm8(K))
254 EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
255 else {
256 EMIT2(0x69, 0xc0); /* imul imm32,%eax */
257 EMIT(K, 4);
258 }
259 break;
260 case BPF_S_ALU_DIV_X: /* A /= X; */
261 seen |= SEEN_XREG;
262 EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
263 if (pc_ret0 != -1)
264 EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
265 else {
266 EMIT_COND_JMP(X86_JNE, 2 + 5);
267 CLEAR_A();
268 EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
269 }
270 EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
271 break;
272 case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
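				/* Illustrative note: K is expected to hold the
				 * reciprocal of the original divisor (precomputed
				 * when the filter was checked), so the imul/shr
				 * pair below is meant to compute
				 * reciprocal_divide(A, K) = (u32)(((u64)A * K) >> 32). */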
273 EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
274 EMIT(K, 4);
275 EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
276 break;
277 case BPF_S_ALU_AND_X:
278 seen |= SEEN_XREG;
279 EMIT2(0x21, 0xd8); /* and %ebx,%eax */
280 break;
281 case BPF_S_ALU_AND_K:
282 if (K >= 0xFFFFFF00) {
283 EMIT2(0x24, K & 0xFF); /* and imm8,%al */
284 } else if (K >= 0xFFFF0000) {
285 EMIT2(0x66, 0x25); /* and imm16,%ax */
286					EMIT(K, 2); /* emit K as a 16-bit immediate */
287 } else {
288 EMIT1_off32(0x25, K); /* and imm32,%eax */
289 }
290 break;
291 case BPF_S_ALU_OR_X:
292 seen |= SEEN_XREG;
293 EMIT2(0x09, 0xd8); /* or %ebx,%eax */
294 break;
295 case BPF_S_ALU_OR_K:
296 if (is_imm8(K))
297 EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
298 else
299 EMIT1_off32(0x0d, K); /* or imm32,%eax */
300 break;
301 case BPF_S_ALU_LSH_X: /* A <<= X; */
302 seen |= SEEN_XREG;
303 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
304 break;
305 case BPF_S_ALU_LSH_K:
306 if (K == 0)
307 break;
308 else if (K == 1)
309 EMIT2(0xd1, 0xe0); /* shl %eax */
310 else
311 EMIT3(0xc1, 0xe0, K);
312 break;
313 case BPF_S_ALU_RSH_X: /* A >>= X; */
314 seen |= SEEN_XREG;
315 EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
316 break;
317 case BPF_S_ALU_RSH_K: /* A >>= K; */
318 if (K == 0)
319 break;
320 else if (K == 1)
321 EMIT2(0xd1, 0xe8); /* shr %eax */
322 else
323 EMIT3(0xc1, 0xe8, K);
324 break;
325 case BPF_S_ALU_NEG:
326 EMIT2(0xf7, 0xd8); /* neg %eax */
327 break;
328 case BPF_S_RET_K:
329 if (!K) {
330 if (pc_ret0 == -1)
331 pc_ret0 = i;
332 CLEAR_A();
333 } else {
334 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
335 }
336			/* fall through */
337 case BPF_S_RET_A:
338 if (seen) {
339 if (i != flen - 1) {
340 EMIT_JMP(cleanup_addr - addrs[i]);
341 break;
342 }
343 if (seen & SEEN_XREG)
344 EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
345 EMIT1(0xc9); /* leaveq */
346 }
347 EMIT1(0xc3); /* ret */
348 break;
349 case BPF_S_MISC_TAX: /* X = A */
350 seen |= SEEN_XREG;
351 EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
352 break;
353 case BPF_S_MISC_TXA: /* A = X */
354 seen |= SEEN_XREG;
355 EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
356 break;
357 case BPF_S_LD_IMM: /* A = K */
358 if (!K)
359 CLEAR_A();
360 else
361 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
362 break;
363 case BPF_S_LDX_IMM: /* X = K */
364 seen |= SEEN_XREG;
365 if (!K)
366 CLEAR_X();
367 else
368 EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
369 break;
370 case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
371 seen |= SEEN_MEM;
372 EMIT3(0x8b, 0x45, 0xf0 - K*4);
373 break;
374 case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
375 seen |= SEEN_XREG | SEEN_MEM;
376 EMIT3(0x8b, 0x5d, 0xf0 - K*4);
377 break;
378 case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
379 seen |= SEEN_MEM;
380 EMIT3(0x89, 0x45, 0xf0 - K*4);
381 break;
382 case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
383 seen |= SEEN_XREG | SEEN_MEM;
384 EMIT3(0x89, 0x5d, 0xf0 - K*4);
385 break;
386 case BPF_S_LD_W_LEN: /* A = skb->len; */
387 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
388 if (is_imm8(offsetof(struct sk_buff, len)))
389 /* mov off8(%rdi),%eax */
390 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
391 else {
392 EMIT2(0x8b, 0x87);
393 EMIT(offsetof(struct sk_buff, len), 4);
394 }
395 break;
396 case BPF_S_LDX_W_LEN: /* X = skb->len; */
397 seen |= SEEN_XREG;
398 if (is_imm8(offsetof(struct sk_buff, len)))
399 /* mov off8(%rdi),%ebx */
400 EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
401 else {
402 EMIT2(0x8b, 0x9f);
403 EMIT(offsetof(struct sk_buff, len), 4);
404 }
405 break;
406 case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
407 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
408 if (is_imm8(offsetof(struct sk_buff, protocol))) {
409 /* movzwl off8(%rdi),%eax */
410 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
411 } else {
412 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
413 EMIT(offsetof(struct sk_buff, protocol), 4);
414 }
415 EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
416 break;
417 case BPF_S_ANC_IFINDEX:
418 if (is_imm8(offsetof(struct sk_buff, dev))) {
419 /* movq off8(%rdi),%rax */
420 EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
421 } else {
422 EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
423 EMIT(offsetof(struct sk_buff, dev), 4);
424 }
425 EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
426 EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
427 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
428 EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
429 EMIT(offsetof(struct net_device, ifindex), 4);
430 break;
431 case BPF_S_ANC_MARK:
432 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
433 if (is_imm8(offsetof(struct sk_buff, mark))) {
434 /* mov off8(%rdi),%eax */
435 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
436 } else {
437 EMIT2(0x8b, 0x87);
438 EMIT(offsetof(struct sk_buff, mark), 4);
439 }
440 break;
441 case BPF_S_ANC_RXHASH:
442 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
443 if (is_imm8(offsetof(struct sk_buff, rxhash))) {
444 /* mov off8(%rdi),%eax */
445 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash));
446 } else {
447 EMIT2(0x8b, 0x87);
448 EMIT(offsetof(struct sk_buff, rxhash), 4);
449 }
450 break;
451 case BPF_S_ANC_QUEUE:
452 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
453 if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
454 /* movzwl off8(%rdi),%eax */
455 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
456 } else {
457 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
458 EMIT(offsetof(struct sk_buff, queue_mapping), 4);
459 }
460 break;
461 case BPF_S_ANC_CPU:
462#ifdef CONFIG_SMP
463 EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
464 EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
465#else
466 CLEAR_A();
467#endif
468 break;
469 case BPF_S_LD_W_ABS:
470 func = sk_load_word;
471common_load: seen |= SEEN_DATAREF;
472 if ((int)K < 0)
473 goto out;
474 t_offset = func - (image + addrs[i]);
475 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
476 EMIT1_off32(0xe8, t_offset); /* call */
477 break;
478 case BPF_S_LD_H_ABS:
479 func = sk_load_half;
480 goto common_load;
481 case BPF_S_LD_B_ABS:
482 func = sk_load_byte;
483 goto common_load;
484 case BPF_S_LDX_B_MSH:
485 if ((int)K < 0) {
486 if (pc_ret0 != -1) {
487 EMIT_JMP(addrs[pc_ret0] - addrs[i]);
488 break;
489 }
490 CLEAR_A();
491 EMIT_JMP(cleanup_addr - addrs[i]);
492 break;
493 }
494 seen |= SEEN_DATAREF | SEEN_XREG;
495 t_offset = sk_load_byte_msh - (image + addrs[i]);
496 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
497 EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
498 break;
499 case BPF_S_LD_W_IND:
500 func = sk_load_word_ind;
501common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
502 t_offset = func - (image + addrs[i]);
503 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
504 EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
505 break;
506 case BPF_S_LD_H_IND:
507 func = sk_load_half_ind;
508 goto common_load_ind;
509 case BPF_S_LD_B_IND:
510 func = sk_load_byte_ind;
511 goto common_load_ind;
512 case BPF_S_JMP_JA:
513 t_offset = addrs[i + K] - addrs[i];
514 EMIT_JMP(t_offset);
515 break;
516 COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
517 COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
518 COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
519 COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
520 COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
521 COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
522 COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
523 COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
524
525cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
526 t_offset = addrs[i + filter[i].jt] - addrs[i];
527
528 /* same targets, can avoid doing the test :) */
529 if (filter[i].jt == filter[i].jf) {
530 EMIT_JMP(t_offset);
531 break;
532 }
533
534 switch (filter[i].code) {
535 case BPF_S_JMP_JGT_X:
536 case BPF_S_JMP_JGE_X:
537 case BPF_S_JMP_JEQ_X:
538 seen |= SEEN_XREG;
539 EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
540 break;
541 case BPF_S_JMP_JSET_X:
542 seen |= SEEN_XREG;
543 EMIT2(0x85, 0xd8); /* test %ebx,%eax */
544 break;
545 case BPF_S_JMP_JEQ_K:
546 if (K == 0) {
547 EMIT2(0x85, 0xc0); /* test %eax,%eax */
548 break;
549 }
550 case BPF_S_JMP_JGT_K:
551 case BPF_S_JMP_JGE_K:
552 if (K <= 127)
553 EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
554 else
555 EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
556 break;
557 case BPF_S_JMP_JSET_K:
558 if (K <= 0xFF)
559 EMIT2(0xa8, K); /* test imm8,%al */
560 else if (!(K & 0xFFFF00FF))
561 EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
562 else if (K <= 0xFFFF) {
563 EMIT2(0x66, 0xa9); /* test imm16,%ax */
564 EMIT(K, 2);
565 } else {
566 EMIT1_off32(0xa9, K); /* test imm32,%eax */
567 }
568 break;
569 }
570 if (filter[i].jt != 0) {
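					/* When both branches are emitted, the conditional
					 * jump to the true target is followed by an
					 * unconditional jmp to the false target, so the
					 * true offset must skip over that jmp
					 * (2 bytes for a short jmp, 5 for a near jmp). */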
571					if (filter[i].jf && f_offset)
572						t_offset += is_near(f_offset) ? 2 : 5;
573 EMIT_COND_JMP(t_op, t_offset);
574 if (filter[i].jf)
575 EMIT_JMP(f_offset);
576 break;
577 }
578 EMIT_COND_JMP(f_op, f_offset);
579 break;
580 default:
581				/* filter too complex: give up and fall back to the interpreter */
582 goto out;
583 }
584 ilen = prog - temp;
585 if (image) {
586 if (unlikely(proglen + ilen > oldproglen)) {
587				pr_err("bpf_jit_compile fatal error\n");
588 kfree(addrs);
589 module_free(NULL, image);
590 return;
591 }
592 memcpy(image + proglen, temp, ilen);
593 }
594 proglen += ilen;
595 addrs[i] = proglen;
596 prog = temp;
597 }
598		/* The last BPF instruction is always a RET:
599		 * use it to locate the cleanup (epilogue) instructions
600 */
601 cleanup_addr = proglen - 1; /* ret */
602 if (seen)
603 cleanup_addr -= 1; /* leaveq */
604 if (seen & SEEN_XREG)
605 cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
606
607 if (image) {
608 WARN_ON(proglen != oldproglen);
609 break;
610 }
611 if (proglen == oldproglen) {
612 image = module_alloc(max_t(unsigned int,
613 proglen,
614 sizeof(struct work_struct)));
615 if (!image)
616 goto out;
617 }
618 oldproglen = proglen;
619 }
620 if (bpf_jit_enable > 1)
621 pr_err("flen=%d proglen=%u pass=%d image=%p\n",
622 flen, proglen, pass, image);
623
624 if (image) {
625 if (bpf_jit_enable > 1)
626 print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
627 16, 1, image, proglen, false);
628
629 bpf_flush_icache(image, image + proglen);
630
631 fp->bpf_func = (void *)image;
632 }
633out:
634 kfree(addrs);
635 return;
636}
637
638static void jit_free_defer(struct work_struct *arg)
639{
640 module_free(NULL, arg);
641}
642
643/* bpf_jit_free() can run from softirq context, so we must use a work_struct
644 * to call module_free() from process context
645 */
646void bpf_jit_free(struct sk_filter *fp)
647{
648 if (fp->bpf_func != sk_run_filter) {
649 struct work_struct *work = (struct work_struct *)fp->bpf_func;
650
651 INIT_WORK(work, jit_free_defer);
652 schedule_work(work);
653 }
654}
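The interesting part of bpf_jit_compile() is the pass-until-stable sizing scheme: addrs[] starts from a generous 64-bytes-per-instruction estimate and is refined on every pass until two passes agree, at which point the image is allocated and filled. A toy model of that convergence, with a made-up length rule standing in for real instruction encoding, might look like this:

/* Toy model of the size-convergence loop in bpf_jit_compile() above.
 * insn_len() is a made-up stand-in for real code generation: it pretends
 * every instruction jumps to the last one and costs 2 bytes for a short
 * jump or 5 for a near jump. */
#include <stdio.h>

#define FLEN 4

static unsigned int insn_len(int idx, const unsigned int *addrs)
{
	unsigned int off = addrs[FLEN - 1] - addrs[idx];

	return (off <= 127) ? 2 : 5;
}

int main(void)
{
	unsigned int addrs[FLEN], proglen = 0, oldproglen = 0;
	int i, pass;

	for (i = 0; i < FLEN; i++) {		/* rough first estimate, as in the kernel */
		proglen += 64;
		addrs[i] = proglen;
	}
	for (pass = 0; pass < 10; pass++) {
		proglen = 0;
		for (i = 0; i < FLEN; i++) {
			proglen += insn_len(i, addrs);
			addrs[i] = proglen;	/* offset of the end of instruction i */
		}
		printf("pass %d: proglen=%u\n", pass, proglen);
		if (proglen == oldproglen)	/* stable: the real JIT now allocates the image
						 * and runs one more pass to fill it */
			break;
		oldproglen = proglen;
	}
	return 0;
}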
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 2d49d4e19a36..a5b64ab4cd6e 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -16,17 +16,6 @@
16#include <asm/stacktrace.h> 16#include <asm/stacktrace.h>
17#include <linux/compat.h> 17#include <linux/compat.h>
18 18
19static void backtrace_warning_symbol(void *data, char *msg,
20 unsigned long symbol)
21{
22 /* Ignore warnings */
23}
24
25static void backtrace_warning(void *data, char *msg)
26{
27 /* Ignore warnings */
28}
29
30static int backtrace_stack(void *data, char *name) 19static int backtrace_stack(void *data, char *name)
31{ 20{
32 /* Yes, we want all stacks */ 21 /* Yes, we want all stacks */
@@ -42,8 +31,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
42} 31}
43 32
44static struct stacktrace_ops backtrace_ops = { 33static struct stacktrace_ops backtrace_ops = {
45 .warning = backtrace_warning,
46 .warning_symbol = backtrace_warning_symbol,
47 .stack = backtrace_stack, 34 .stack = backtrace_stack,
48 .address = backtrace_address, 35 .address = backtrace_address,
49 .walk_stack = print_context_stack, 36 .walk_stack = print_context_stack,
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd33620b0071..e6fd8473fb7b 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -280,12 +280,9 @@ void __init pci_direct_init(int type)
280 280
281int __init pci_direct_probe(void) 281int __init pci_direct_probe(void)
282{ 282{
283 struct resource *region, *region2;
284
285 if ((pci_probe & PCI_PROBE_CONF1) == 0) 283 if ((pci_probe & PCI_PROBE_CONF1) == 0)
286 goto type2; 284 goto type2;
287 region = request_region(0xCF8, 8, "PCI conf1"); 285 if (!request_region(0xCF8, 8, "PCI conf1"))
288 if (!region)
289 goto type2; 286 goto type2;
290 287
291 if (pci_check_type1()) { 288 if (pci_check_type1()) {
@@ -293,16 +290,14 @@ int __init pci_direct_probe(void)
293 port_cf9_safe = true; 290 port_cf9_safe = true;
294 return 1; 291 return 1;
295 } 292 }
296 release_resource(region); 293 release_region(0xCF8, 8);
297 294
298 type2: 295 type2:
299 if ((pci_probe & PCI_PROBE_CONF2) == 0) 296 if ((pci_probe & PCI_PROBE_CONF2) == 0)
300 return 0; 297 return 0;
301 region = request_region(0xCF8, 4, "PCI conf2"); 298 if (!request_region(0xCF8, 4, "PCI conf2"))
302 if (!region)
303 return 0; 299 return 0;
304 region2 = request_region(0xC000, 0x1000, "PCI conf2"); 300 if (!request_region(0xC000, 0x1000, "PCI conf2"))
305 if (!region2)
306 goto fail2; 301 goto fail2;
307 302
308 if (pci_check_type2()) { 303 if (pci_check_type2()) {
@@ -311,8 +306,8 @@ int __init pci_direct_probe(void)
311 return 2; 306 return 2;
312 } 307 }
313 308
314 release_resource(region2); 309 release_region(0xC000, 0x1000);
315 fail2: 310 fail2:
316 release_resource(region); 311 release_region(0xCF8, 4);
317 return 0; 312 return 0;
318} 313}
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 8201165bae28..372e9b8989b3 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -602,7 +602,9 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN && 602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) 603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN && 604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)) { 605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
606 || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
607 device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
606 r->name = "PIIX/ICH"; 608 r->name = "PIIX/ICH";
607 r->get = pirq_piix_get; 609 r->get = pirq_piix_get;
608 r->set = pirq_piix_set; 610 r->set = pirq_piix_set;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index e282886616a0..750c346ef50a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -606,6 +606,16 @@ static void __init __pci_mmcfg_init(int early)
606 if (list_empty(&pci_mmcfg_list)) 606 if (list_empty(&pci_mmcfg_list))
607 return; 607 return;
608 608
609 if (pcibios_last_bus < 0) {
610 const struct pci_mmcfg_region *cfg;
611
612 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
613 if (cfg->segment)
614 break;
615 pcibios_last_bus = cfg->end_bus;
616 }
617 }
618
609 if (pci_mmcfg_arch_init()) 619 if (pci_mmcfg_arch_init())
610 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 620 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
611 else { 621 else {
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
index e37b407a0ee8..8214724ce54d 100644
--- a/arch/x86/pci/xen.c
+++ b/arch/x86/pci/xen.c
@@ -108,7 +108,8 @@ static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
108 } 108 }
109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0, 109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
110 (type == PCI_CAP_ID_MSIX) ? 110 (type == PCI_CAP_ID_MSIX) ?
111 "msi-x" : "msi"); 111 "msi-x" : "msi",
112 DOMID_SELF);
112 if (irq < 0) 113 if (irq < 0)
113 goto error; 114 goto error;
114 dev_dbg(&dev->dev, 115 dev_dbg(&dev->dev,
@@ -148,7 +149,8 @@ static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
148 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0, 149 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
149 (type == PCI_CAP_ID_MSIX) ? 150 (type == PCI_CAP_ID_MSIX) ?
150 "pcifront-msi-x" : 151 "pcifront-msi-x" :
151 "pcifront-msi"); 152 "pcifront-msi",
153 DOMID_SELF);
152 if (irq < 0) 154 if (irq < 0)
153 goto free; 155 goto free;
154 i++; 156 i++;
@@ -190,9 +192,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
190 192
191 list_for_each_entry(msidesc, &dev->msi_list, list) { 193 list_for_each_entry(msidesc, &dev->msi_list, list) {
192 struct physdev_map_pirq map_irq; 194 struct physdev_map_pirq map_irq;
195 domid_t domid;
196
197 domid = ret = xen_find_device_domain_owner(dev);
198 /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
199 * hence check ret value for < 0. */
200 if (ret < 0)
201 domid = DOMID_SELF;
193 202
194 memset(&map_irq, 0, sizeof(map_irq)); 203 memset(&map_irq, 0, sizeof(map_irq));
195 map_irq.domid = DOMID_SELF; 204 map_irq.domid = domid;
196 map_irq.type = MAP_PIRQ_TYPE_MSI; 205 map_irq.type = MAP_PIRQ_TYPE_MSI;
197 map_irq.index = -1; 206 map_irq.index = -1;
198 map_irq.pirq = -1; 207 map_irq.pirq = -1;
@@ -215,14 +224,16 @@ static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
215 224
216 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq); 225 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
217 if (ret) { 226 if (ret) {
218 dev_warn(&dev->dev, "xen map irq failed %d\n", ret); 227 dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
228 ret, domid);
219 goto out; 229 goto out;
220 } 230 }
221 231
222 ret = xen_bind_pirq_msi_to_irq(dev, msidesc, 232 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
223 map_irq.pirq, map_irq.index, 233 map_irq.pirq, map_irq.index,
224 (type == PCI_CAP_ID_MSIX) ? 234 (type == PCI_CAP_ID_MSIX) ?
225 "msi-x" : "msi"); 235 "msi-x" : "msi",
236 domid);
226 if (ret < 0) 237 if (ret < 0)
227 goto out; 238 goto out;
228 } 239 }
@@ -461,3 +472,78 @@ void __init xen_setup_pirqs(void)
461 } 472 }
462} 473}
463#endif 474#endif
475
476#ifdef CONFIG_XEN_DOM0
477struct xen_device_domain_owner {
478 domid_t domain;
479 struct pci_dev *dev;
480 struct list_head list;
481};
482
483static DEFINE_SPINLOCK(dev_domain_list_spinlock);
484static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
485
486static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
487{
488 struct xen_device_domain_owner *owner;
489
490 list_for_each_entry(owner, &dev_domain_list, list) {
491 if (owner->dev == dev)
492 return owner;
493 }
494 return NULL;
495}
496
497int xen_find_device_domain_owner(struct pci_dev *dev)
498{
499 struct xen_device_domain_owner *owner;
500 int domain = -ENODEV;
501
502 spin_lock(&dev_domain_list_spinlock);
503 owner = find_device(dev);
504 if (owner)
505 domain = owner->domain;
506 spin_unlock(&dev_domain_list_spinlock);
507 return domain;
508}
509EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
510
511int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
512{
513 struct xen_device_domain_owner *owner;
514
515 owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
516 if (!owner)
517 return -ENODEV;
518
519 spin_lock(&dev_domain_list_spinlock);
520 if (find_device(dev)) {
521 spin_unlock(&dev_domain_list_spinlock);
522 kfree(owner);
523 return -EEXIST;
524 }
525 owner->domain = domain;
526 owner->dev = dev;
527 list_add_tail(&owner->list, &dev_domain_list);
528 spin_unlock(&dev_domain_list_spinlock);
529 return 0;
530}
531EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
532
533int xen_unregister_device_domain_owner(struct pci_dev *dev)
534{
535 struct xen_device_domain_owner *owner;
536
537 spin_lock(&dev_domain_list_spinlock);
538 owner = find_device(dev);
539 if (!owner) {
540 spin_unlock(&dev_domain_list_spinlock);
541 return -ENODEV;
542 }
543 list_del(&owner->list);
544 spin_unlock(&dev_domain_list_spinlock);
545 kfree(owner);
546 return 0;
547}
548EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
549#endif
diff --git a/arch/x86/platform/efi/efi.c b/arch/x86/platform/efi/efi.c
index 0fe27d7c6258..0d3a4fa34560 100644
--- a/arch/x86/platform/efi/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -145,17 +145,6 @@ static void virt_efi_reset_system(int reset_type,
145 data_size, data); 145 data_size, data);
146} 146}
147 147
148static efi_status_t virt_efi_set_virtual_address_map(
149 unsigned long memory_map_size,
150 unsigned long descriptor_size,
151 u32 descriptor_version,
152 efi_memory_desc_t *virtual_map)
153{
154 return efi_call_virt4(set_virtual_address_map,
155 memory_map_size, descriptor_size,
156 descriptor_version, virtual_map);
157}
158
159static efi_status_t __init phys_efi_set_virtual_address_map( 148static efi_status_t __init phys_efi_set_virtual_address_map(
160 unsigned long memory_map_size, 149 unsigned long memory_map_size,
161 unsigned long descriptor_size, 150 unsigned long descriptor_size,
@@ -315,6 +304,40 @@ static void __init print_efi_memmap(void)
315} 304}
316#endif /* EFI_DEBUG */ 305#endif /* EFI_DEBUG */
317 306
307void __init efi_reserve_boot_services(void)
308{
309 void *p;
310
311 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
312 efi_memory_desc_t *md = p;
313 unsigned long long start = md->phys_addr;
314 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
315
316 if (md->type != EFI_BOOT_SERVICES_CODE &&
317 md->type != EFI_BOOT_SERVICES_DATA)
318 continue;
319
320 memblock_x86_reserve_range(start, start + size, "EFI Boot");
321 }
322}
323
324static void __init efi_free_boot_services(void)
325{
326 void *p;
327
328 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
329 efi_memory_desc_t *md = p;
330 unsigned long long start = md->phys_addr;
331 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
332
333 if (md->type != EFI_BOOT_SERVICES_CODE &&
334 md->type != EFI_BOOT_SERVICES_DATA)
335 continue;
336
337 free_bootmem_late(start, size);
338 }
339}
340
318void __init efi_init(void) 341void __init efi_init(void)
319{ 342{
320 efi_config_table_t *config_tables; 343 efi_config_table_t *config_tables;
@@ -468,11 +491,25 @@ void __init efi_init(void)
468#endif 491#endif
469} 492}
470 493
494void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
495{
496 u64 addr, npages;
497
498 addr = md->virt_addr;
499 npages = md->num_pages;
500
501 memrange_efi_to_native(&addr, &npages);
502
503 if (executable)
504 set_memory_x(addr, npages);
505 else
506 set_memory_nx(addr, npages);
507}
508
471static void __init runtime_code_page_mkexec(void) 509static void __init runtime_code_page_mkexec(void)
472{ 510{
473 efi_memory_desc_t *md; 511 efi_memory_desc_t *md;
474 void *p; 512 void *p;
475 u64 addr, npages;
476 513
477 /* Make EFI runtime service code area executable */ 514 /* Make EFI runtime service code area executable */
478 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 515 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -481,10 +518,7 @@ static void __init runtime_code_page_mkexec(void)
481 if (md->type != EFI_RUNTIME_SERVICES_CODE) 518 if (md->type != EFI_RUNTIME_SERVICES_CODE)
482 continue; 519 continue;
483 520
484 addr = md->virt_addr; 521 efi_set_executable(md, true);
485 npages = md->num_pages;
486 memrange_efi_to_native(&addr, &npages);
487 set_memory_x(addr, npages);
488 } 522 }
489} 523}
490 524
@@ -498,16 +532,47 @@ static void __init runtime_code_page_mkexec(void)
498 */ 532 */
499void __init efi_enter_virtual_mode(void) 533void __init efi_enter_virtual_mode(void)
500{ 534{
501 efi_memory_desc_t *md; 535 efi_memory_desc_t *md, *prev_md = NULL;
502 efi_status_t status; 536 efi_status_t status;
503 unsigned long size; 537 unsigned long size;
504 u64 end, systab, addr, npages, end_pfn; 538 u64 end, systab, addr, npages, end_pfn;
505 void *p, *va; 539 void *p, *va, *new_memmap = NULL;
540 int count = 0;
506 541
507 efi.systab = NULL; 542 efi.systab = NULL;
543
544 /* Merge contiguous regions of the same type and attribute */
508 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 545 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
546 u64 prev_size;
509 md = p; 547 md = p;
510 if (!(md->attribute & EFI_MEMORY_RUNTIME)) 548
549 if (!prev_md) {
550 prev_md = md;
551 continue;
552 }
553
554 if (prev_md->type != md->type ||
555 prev_md->attribute != md->attribute) {
556 prev_md = md;
557 continue;
558 }
559
560 prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
561
562 if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
563 prev_md->num_pages += md->num_pages;
564 md->type = EFI_RESERVED_TYPE;
565 md->attribute = 0;
566 continue;
567 }
568 prev_md = md;
569 }
570
571 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
572 md = p;
573 if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
574 md->type != EFI_BOOT_SERVICES_CODE &&
575 md->type != EFI_BOOT_SERVICES_DATA)
511 continue; 576 continue;
512 577
513 size = md->num_pages << EFI_PAGE_SHIFT; 578 size = md->num_pages << EFI_PAGE_SHIFT;
@@ -541,15 +606,21 @@ void __init efi_enter_virtual_mode(void)
541 systab += md->virt_addr - md->phys_addr; 606 systab += md->virt_addr - md->phys_addr;
542 efi.systab = (efi_system_table_t *) (unsigned long) systab; 607 efi.systab = (efi_system_table_t *) (unsigned long) systab;
543 } 608 }
609 new_memmap = krealloc(new_memmap,
610 (count + 1) * memmap.desc_size,
611 GFP_KERNEL);
612 memcpy(new_memmap + (count * memmap.desc_size), md,
613 memmap.desc_size);
614 count++;
544 } 615 }
545 616
546 BUG_ON(!efi.systab); 617 BUG_ON(!efi.systab);
547 618
548 status = phys_efi_set_virtual_address_map( 619 status = phys_efi_set_virtual_address_map(
549 memmap.desc_size * memmap.nr_map, 620 memmap.desc_size * count,
550 memmap.desc_size, 621 memmap.desc_size,
551 memmap.desc_version, 622 memmap.desc_version,
552 memmap.phys_map); 623 (efi_memory_desc_t *)__pa(new_memmap));
553 624
554 if (status != EFI_SUCCESS) { 625 if (status != EFI_SUCCESS) {
555 printk(KERN_ALERT "Unable to switch EFI into virtual mode " 626 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
@@ -558,6 +629,13 @@ void __init efi_enter_virtual_mode(void)
558 } 629 }
559 630
560 /* 631 /*
632 * Thankfully, it does seem that no runtime services other than
633 * SetVirtualAddressMap() will touch boot services code, so we can
634 * get rid of it all at this point
635 */
636 efi_free_boot_services();
637
638 /*
561 * Now that EFI is in virtual mode, update the function 639 * Now that EFI is in virtual mode, update the function
562 * pointers in the runtime service table to the new virtual addresses. 640 * pointers in the runtime service table to the new virtual addresses.
563 * 641 *
@@ -572,11 +650,12 @@ void __init efi_enter_virtual_mode(void)
572 efi.set_variable = virt_efi_set_variable; 650 efi.set_variable = virt_efi_set_variable;
573 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; 651 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
574 efi.reset_system = virt_efi_reset_system; 652 efi.reset_system = virt_efi_reset_system;
575 efi.set_virtual_address_map = virt_efi_set_virtual_address_map; 653 efi.set_virtual_address_map = NULL;
576 if (__supported_pte_mask & _PAGE_NX) 654 if (__supported_pte_mask & _PAGE_NX)
577 runtime_code_page_mkexec(); 655 runtime_code_page_mkexec();
578 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); 656 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
579 memmap.map = NULL; 657 memmap.map = NULL;
658 kfree(new_memmap);
580} 659}
581 660
582/* 661/*
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3d..ac3aa54e2654 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,22 +41,7 @@
41static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
42static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
43 43
44static void __init early_mapping_set_exec(unsigned long start, 44static void __init early_code_mapping_set_exec(int executable)
45 unsigned long end,
46 int executable)
47{
48 unsigned long num_pages;
49
50 start &= PMD_MASK;
51 end = (end + PMD_SIZE - 1) & PMD_MASK;
52 num_pages = (end - start) >> PAGE_SHIFT;
53 if (executable)
54 set_memory_x((unsigned long)__va(start), num_pages);
55 else
56 set_memory_nx((unsigned long)__va(start), num_pages);
57}
58
59static void __init early_runtime_code_mapping_set_exec(int executable)
60{ 45{
61 efi_memory_desc_t *md; 46 efi_memory_desc_t *md;
62 void *p; 47 void *p;
@@ -64,14 +49,12 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
64 if (!(__supported_pte_mask & _PAGE_NX)) 49 if (!(__supported_pte_mask & _PAGE_NX))
65 return; 50 return;
66 51
67 /* Make EFI runtime service code area executable */ 52 /* Make EFI service code area executable */
68 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 53 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
69 md = p; 54 md = p;
70 if (md->type == EFI_RUNTIME_SERVICES_CODE) { 55 if (md->type == EFI_RUNTIME_SERVICES_CODE ||
71 unsigned long end; 56 md->type == EFI_BOOT_SERVICES_CODE)
72 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); 57 efi_set_executable(md, executable);
73 early_mapping_set_exec(md->phys_addr, end, executable);
74 }
75 } 58 }
76} 59}
77 60
@@ -79,7 +62,7 @@ void __init efi_call_phys_prelog(void)
79{ 62{
80 unsigned long vaddress; 63 unsigned long vaddress;
81 64
82 early_runtime_code_mapping_set_exec(1); 65 early_code_mapping_set_exec(1);
83 local_irq_save(efi_flags); 66 local_irq_save(efi_flags);
84 vaddress = (unsigned long)__va(0x0UL); 67 vaddress = (unsigned long)__va(0x0UL);
85 save_pgd = *pgd_offset_k(0x0UL); 68 save_pgd = *pgd_offset_k(0x0UL);
@@ -95,7 +78,7 @@ void __init efi_call_phys_epilog(void)
95 set_pgd(pgd_offset_k(0x0UL), save_pgd); 78 set_pgd(pgd_offset_k(0x0UL), save_pgd);
96 __flush_tlb_all(); 79 __flush_tlb_all();
97 local_irq_restore(efi_flags); 80 local_irq_restore(efi_flags);
98 early_runtime_code_mapping_set_exec(0); 81 early_code_mapping_set_exec(0);
99} 82}
100 83
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, 84void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
@@ -107,8 +90,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
107 return ioremap(phys_addr, size); 90 return ioremap(phys_addr, size);
108 91
109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 92 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 93 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
111 return NULL; 94 unsigned long top = last_map_pfn << PAGE_SHIFT;
95 efi_ioremap(top, size - (top - phys_addr), type);
96 }
112 97
113 return (void __iomem *)__va(phys_addr); 98 return (void __iomem *)__va(phys_addr);
114} 99}
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
index 275dbc19e2cf..7000e74b3087 100644
--- a/arch/x86/platform/mrst/mrst.c
+++ b/arch/x86/platform/mrst/mrst.c
@@ -194,7 +194,7 @@ static unsigned long __init mrst_calibrate_tsc(void)
194 return 0; 194 return 0;
195} 195}
196 196
197void __init mrst_time_init(void) 197static void __init mrst_time_init(void)
198{ 198{
199 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr); 199 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
200 switch (mrst_timer_options) { 200 switch (mrst_timer_options) {
@@ -216,7 +216,7 @@ void __init mrst_time_init(void)
216 apbt_time_init(); 216 apbt_time_init();
217} 217}
218 218
219void __cpuinit mrst_arch_setup(void) 219static void __cpuinit mrst_arch_setup(void)
220{ 220{
221 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27) 221 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
222 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL; 222 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
index c2a8cab65e5d..81c5e2165c24 100644
--- a/arch/x86/platform/olpc/Makefile
+++ b/arch/x86/platform/olpc/Makefile
@@ -1,4 +1,2 @@
1obj-$(CONFIG_OLPC) += olpc.o 1obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o 2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
3obj-$(CONFIG_OLPC) += olpc_ofw.o
4obj-$(CONFIG_OF_PROMTREE) += olpc_dt.o
diff --git a/arch/x86/platform/olpc/olpc.c b/arch/x86/platform/olpc/olpc.c
index edaf3fe8dc5e..0060fd59ea00 100644
--- a/arch/x86/platform/olpc/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -18,6 +18,7 @@
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/platform_device.h> 20#include <linux/platform_device.h>
21#include <linux/of.h>
21 22
22#include <asm/geode.h> 23#include <asm/geode.h>
23#include <asm/setup.h> 24#include <asm/setup.h>
@@ -187,41 +188,43 @@ err:
187} 188}
188EXPORT_SYMBOL_GPL(olpc_ec_cmd); 189EXPORT_SYMBOL_GPL(olpc_ec_cmd);
189 190
190static bool __init check_ofw_architecture(void) 191static bool __init check_ofw_architecture(struct device_node *root)
191{ 192{
192 size_t propsize; 193 const char *olpc_arch;
193 char olpc_arch[5]; 194 int propsize;
194 const void *args[] = { NULL, "architecture", olpc_arch, (void *)5 };
195 void *res[] = { &propsize };
196 195
197 if (olpc_ofw("getprop", args, res)) { 196 olpc_arch = of_get_property(root, "architecture", &propsize);
198 printk(KERN_ERR "ofw: getprop call failed!\n");
199 return false;
200 }
201 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0; 197 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
202} 198}
203 199
204static u32 __init get_board_revision(void) 200static u32 __init get_board_revision(struct device_node *root)
205{ 201{
206 size_t propsize; 202 int propsize;
207 __be32 rev; 203 const __be32 *rev;
208 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 }; 204
209 void *res[] = { &propsize }; 205 rev = of_get_property(root, "board-revision-int", &propsize);
210 206 if (propsize != 4)
211 if (olpc_ofw("getprop", args, res) || propsize != 4) { 207 return 0;
212 printk(KERN_ERR "ofw: getprop call failed!\n"); 208
213 return cpu_to_be32(0); 209 return be32_to_cpu(*rev);
214 }
215 return be32_to_cpu(rev);
216} 210}
217 211
218static bool __init platform_detect(void) 212static bool __init platform_detect(void)
219{ 213{
220 if (!check_ofw_architecture()) 214 struct device_node *root = of_find_node_by_path("/");
215 bool success;
216
217 if (!root)
221 return false; 218 return false;
222 olpc_platform_info.flags |= OLPC_F_PRESENT; 219
223 olpc_platform_info.boardrev = get_board_revision(); 220 success = check_ofw_architecture(root);
224 return true; 221 if (success) {
222 olpc_platform_info.boardrev = get_board_revision(root);
223 olpc_platform_info.flags |= OLPC_F_PRESENT;
224 }
225
226 of_node_put(root);
227 return success;
225} 228}
226 229
227static int __init add_xo1_platform_devices(void) 230static int __init add_xo1_platform_devices(void)
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
index 044bda5b3174..d39f63d017d2 100644
--- a/arch/x86/platform/olpc/olpc_dt.c
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -19,7 +19,9 @@
19#include <linux/kernel.h> 19#include <linux/kernel.h>
20#include <linux/bootmem.h> 20#include <linux/bootmem.h>
21#include <linux/of.h> 21#include <linux/of.h>
22#include <linux/of_platform.h>
22#include <linux/of_pdt.h> 23#include <linux/of_pdt.h>
24#include <asm/olpc.h>
23#include <asm/olpc_ofw.h> 25#include <asm/olpc_ofw.h>
24 26
25static phandle __init olpc_dt_getsibling(phandle node) 27static phandle __init olpc_dt_getsibling(phandle node)
@@ -180,3 +182,20 @@ void __init olpc_dt_build_devicetree(void)
180 pr_info("PROM DT: Built device tree with %u bytes of memory.\n", 182 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
181 prom_early_allocated); 183 prom_early_allocated);
182} 184}
185
186/* A list of DT node/bus matches that we want to expose as platform devices */
187static struct of_device_id __initdata of_ids[] = {
188 { .compatible = "olpc,xo1-battery" },
189 { .compatible = "olpc,xo1-dcon" },
190 { .compatible = "olpc,xo1-rtc" },
191 {},
192};
193
194static int __init olpc_create_platform_devices(void)
195{
196 if (machine_is_olpc())
197 return of_platform_bus_probe(NULL, of_ids, NULL);
198 else
199 return 0;
200}
201device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
index c58e0ea39ef5..68e467f69fec 100644
--- a/arch/x86/platform/uv/tlb_uv.c
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * SGI UltraViolet TLB flush routines. 2 * SGI UltraViolet TLB flush routines.
3 * 3 *
4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI. 4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
5 * 5 *
6 * This code is released under the GNU General Public License version 2 or 6 * This code is released under the GNU General Public License version 2 or
7 * later. 7 * later.
@@ -35,6 +35,7 @@ static int timeout_base_ns[] = {
35 5242880, 35 5242880,
36 167772160 36 167772160
37}; 37};
38
38static int timeout_us; 39static int timeout_us;
39static int nobau; 40static int nobau;
40static int baudisabled; 41static int baudisabled;
@@ -42,20 +43,70 @@ static spinlock_t disable_lock;
42static cycles_t congested_cycles; 43static cycles_t congested_cycles;
43 44
44/* tunables: */ 45/* tunables: */
45static int max_bau_concurrent = MAX_BAU_CONCURRENT; 46static int max_concurr = MAX_BAU_CONCURRENT;
46static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; 47static int max_concurr_const = MAX_BAU_CONCURRENT;
47static int plugged_delay = PLUGGED_DELAY; 48static int plugged_delay = PLUGGED_DELAY;
48static int plugsb4reset = PLUGSB4RESET; 49static int plugsb4reset = PLUGSB4RESET;
49static int timeoutsb4reset = TIMEOUTSB4RESET; 50static int timeoutsb4reset = TIMEOUTSB4RESET;
50static int ipi_reset_limit = IPI_RESET_LIMIT; 51static int ipi_reset_limit = IPI_RESET_LIMIT;
51static int complete_threshold = COMPLETE_THRESHOLD; 52static int complete_threshold = COMPLETE_THRESHOLD;
52static int congested_response_us = CONGESTED_RESPONSE_US; 53static int congested_respns_us = CONGESTED_RESPONSE_US;
53static int congested_reps = CONGESTED_REPS; 54static int congested_reps = CONGESTED_REPS;
54static int congested_period = CONGESTED_PERIOD; 55static int congested_period = CONGESTED_PERIOD;
56
57static struct tunables tunables[] = {
58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
59 {&plugged_delay, PLUGGED_DELAY},
60 {&plugsb4reset, PLUGSB4RESET},
61 {&timeoutsb4reset, TIMEOUTSB4RESET},
62 {&ipi_reset_limit, IPI_RESET_LIMIT},
63 {&complete_threshold, COMPLETE_THRESHOLD},
64 {&congested_respns_us, CONGESTED_RESPONSE_US},
65 {&congested_reps, CONGESTED_REPS},
66 {&congested_period, CONGESTED_PERIOD}
67};
68
55static struct dentry *tunables_dir; 69static struct dentry *tunables_dir;
56static struct dentry *tunables_file; 70static struct dentry *tunables_file;
57 71
58static int __init setup_nobau(char *arg) 72/* these correspond to the statistics printed by ptc_seq_show() */
73static char *stat_description[] = {
74 "sent: number of shootdown messages sent",
75 "stime: time spent sending messages",
76 "numuvhubs: number of hubs targeted with shootdown",
77 "numuvhubs16: number times 16 or more hubs targeted",
78 "numuvhubs8: number times 8 or more hubs targeted",
79 "numuvhubs4: number times 4 or more hubs targeted",
80 "numuvhubs2: number times 2 or more hubs targeted",
81 "numuvhubs1: number times 1 hub targeted",
82 "numcpus: number of cpus targeted with shootdown",
83 "dto: number of destination timeouts",
84 "retries: destination timeout retries sent",
85 "rok: : destination timeouts successfully retried",
86 "resetp: ipi-style resource resets for plugs",
87 "resett: ipi-style resource resets for timeouts",
88 "giveup: fall-backs to ipi-style shootdowns",
89 "sto: number of source timeouts",
90 "bz: number of stay-busy's",
91 "throt: number times spun in throttle",
92 "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
93 "recv: shootdown messages received",
94 "rtime: time spent processing messages",
95 "all: shootdown all-tlb messages",
96 "one: shootdown one-tlb messages",
97 "mult: interrupts that found multiple messages",
98 "none: interrupts that found no messages",
99 "retry: number of retry messages processed",
100 "canc: number messages canceled by retries",
101 "nocan: number retries that found nothing to cancel",
102 "reset: number of ipi-style reset requests processed",
103 "rcan: number messages canceled by reset requests",
104 "disable: number times use of the BAU was disabled",
105 "enable: number times use of the BAU was re-enabled"
106};
107
108static int __init
109setup_nobau(char *arg)
59{ 110{
60 nobau = 1; 111 nobau = 1;
61 return 0; 112 return 0;
@@ -63,7 +114,7 @@ static int __init setup_nobau(char *arg)
63early_param("nobau", setup_nobau); 114early_param("nobau", setup_nobau);
64 115
65/* base pnode in this partition */ 116/* base pnode in this partition */
66static int uv_partition_base_pnode __read_mostly; 117static int uv_base_pnode __read_mostly;
67/* position of pnode (which is nasid>>1): */ 118/* position of pnode (which is nasid>>1): */
68static int uv_nshift __read_mostly; 119static int uv_nshift __read_mostly;
69static unsigned long uv_mmask __read_mostly; 120static unsigned long uv_mmask __read_mostly;
@@ -109,60 +160,52 @@ static int __init uvhub_to_first_apicid(int uvhub)
109 * clear of the Timeout bit (as well) will free the resource. No reply will 160 * clear of the Timeout bit (as well) will free the resource. No reply will
110 * be sent (the hardware will only do one reply per message). 161 * be sent (the hardware will only do one reply per message).
111 */ 162 */
112static inline void uv_reply_to_message(struct msg_desc *mdp, 163static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
113 struct bau_control *bcp)
114{ 164{
115 unsigned long dw; 165 unsigned long dw;
116 struct bau_payload_queue_entry *msg; 166 struct bau_pq_entry *msg;
117 167
118 msg = mdp->msg; 168 msg = mdp->msg;
119 if (!msg->canceled) { 169 if (!msg->canceled) {
120 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) | 170 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
121 msg->sw_ack_vector; 171 write_mmr_sw_ack(dw);
122 uv_write_local_mmr(
123 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
124 } 172 }
125 msg->replied_to = 1; 173 msg->replied_to = 1;
126 msg->sw_ack_vector = 0; 174 msg->swack_vec = 0;
127} 175}
128 176
129/* 177/*
130 * Process the receipt of a RETRY message 178 * Process the receipt of a RETRY message
131 */ 179 */
132static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, 180static void bau_process_retry_msg(struct msg_desc *mdp,
133 struct bau_control *bcp) 181 struct bau_control *bcp)
134{ 182{
135 int i; 183 int i;
136 int cancel_count = 0; 184 int cancel_count = 0;
137 int slot2;
138 unsigned long msg_res; 185 unsigned long msg_res;
139 unsigned long mmr = 0; 186 unsigned long mmr = 0;
140 struct bau_payload_queue_entry *msg; 187 struct bau_pq_entry *msg = mdp->msg;
141 struct bau_payload_queue_entry *msg2; 188 struct bau_pq_entry *msg2;
142 struct ptc_stats *stat; 189 struct ptc_stats *stat = bcp->statp;
143 190
144 msg = mdp->msg;
145 stat = bcp->statp;
146 stat->d_retries++; 191 stat->d_retries++;
147 /* 192 /*
148 * cancel any message from msg+1 to the retry itself 193 * cancel any message from msg+1 to the retry itself
149 */ 194 */
150 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) { 195 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
151 if (msg2 > mdp->va_queue_last) 196 if (msg2 > mdp->queue_last)
152 msg2 = mdp->va_queue_first; 197 msg2 = mdp->queue_first;
153 if (msg2 == msg) 198 if (msg2 == msg)
154 break; 199 break;
155 200
156 /* same conditions for cancellation as uv_do_reset */ 201 /* same conditions for cancellation as do_reset */
157 if ((msg2->replied_to == 0) && (msg2->canceled == 0) && 202 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
158 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector & 203 (msg2->swack_vec) && ((msg2->swack_vec &
159 msg->sw_ack_vector) == 0) && 204 msg->swack_vec) == 0) &&
160 (msg2->sending_cpu == msg->sending_cpu) && 205 (msg2->sending_cpu == msg->sending_cpu) &&
161 (msg2->msg_type != MSG_NOOP)) { 206 (msg2->msg_type != MSG_NOOP)) {
162 slot2 = msg2 - mdp->va_queue_first; 207 mmr = read_mmr_sw_ack();
163 mmr = uv_read_local_mmr 208 msg_res = msg2->swack_vec;
164 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
165 msg_res = msg2->sw_ack_vector;
166 /* 209 /*
167 * This is a message retry; clear the resources held 210 * This is a message retry; clear the resources held
168 * by the previous message only if they timed out. 211 * by the previous message only if they timed out.
@@ -170,6 +213,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
170 * situation to report. 213 * situation to report.
171 */ 214 */
172 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { 215 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
216 unsigned long mr;
173 /* 217 /*
174 * is the resource timed out? 218 * is the resource timed out?
175 * make everyone ignore the cancelled message. 219 * make everyone ignore the cancelled message.
@@ -177,10 +221,8 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
177 msg2->canceled = 1; 221 msg2->canceled = 1;
178 stat->d_canceled++; 222 stat->d_canceled++;
179 cancel_count++; 223 cancel_count++;
180 uv_write_local_mmr( 224 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
181 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, 225 write_mmr_sw_ack(mr);
182 (msg_res << UV_SW_ACK_NPENDING) |
183 msg_res);
184 } 226 }
185 } 227 }
186 } 228 }
@@ -192,20 +234,19 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
192 * Do all the things a cpu should do for a TLB shootdown message. 234 * Do all the things a cpu should do for a TLB shootdown message.
193 * Other cpu's may come here at the same time for this message. 235 * Other cpu's may come here at the same time for this message.
194 */ 236 */
195static void uv_bau_process_message(struct msg_desc *mdp, 237static void bau_process_message(struct msg_desc *mdp,
196 struct bau_control *bcp) 238 struct bau_control *bcp)
197{ 239{
198 int msg_ack_count;
199 short socket_ack_count = 0; 240 short socket_ack_count = 0;
200 struct ptc_stats *stat; 241 short *sp;
201 struct bau_payload_queue_entry *msg; 242 struct atomic_short *asp;
243 struct ptc_stats *stat = bcp->statp;
244 struct bau_pq_entry *msg = mdp->msg;
202 struct bau_control *smaster = bcp->socket_master; 245 struct bau_control *smaster = bcp->socket_master;
203 246
204 /* 247 /*
205 * This must be a normal message, or retry of a normal message 248 * This must be a normal message, or retry of a normal message
206 */ 249 */
207 msg = mdp->msg;
208 stat = bcp->statp;
209 if (msg->address == TLB_FLUSH_ALL) { 250 if (msg->address == TLB_FLUSH_ALL) {
210 local_flush_tlb(); 251 local_flush_tlb();
211 stat->d_alltlb++; 252 stat->d_alltlb++;
@@ -222,30 +263,32 @@ static void uv_bau_process_message(struct msg_desc *mdp,
222 * cpu number. 263 * cpu number.
223 */ 264 */
224 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master) 265 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
225 uv_bau_process_retry_msg(mdp, bcp); 266 bau_process_retry_msg(mdp, bcp);
226 267
227 /* 268 /*
228 * This is a sw_ack message, so we have to reply to it. 269 * This is a swack message, so we have to reply to it.
229 * Count each responding cpu on the socket. This avoids 270 * Count each responding cpu on the socket. This avoids
230 * pinging the count's cache line back and forth between 271 * pinging the count's cache line back and forth between
231 * the sockets. 272 * the sockets.
232 */ 273 */
233 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *) 274 sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
234 &smaster->socket_acknowledge_count[mdp->msg_slot]); 275 asp = (struct atomic_short *)sp;
276 socket_ack_count = atom_asr(1, asp);
235 if (socket_ack_count == bcp->cpus_in_socket) { 277 if (socket_ack_count == bcp->cpus_in_socket) {
278 int msg_ack_count;
236 /* 279 /*
237 * Both sockets dump their completed count total into 280 * Both sockets dump their completed count total into
238 * the message's count. 281 * the message's count.
239 */ 282 */
240 smaster->socket_acknowledge_count[mdp->msg_slot] = 0; 283 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
241 msg_ack_count = atomic_add_short_return(socket_ack_count, 284 asp = (struct atomic_short *)&msg->acknowledge_count;
242 (struct atomic_short *)&msg->acknowledge_count); 285 msg_ack_count = atom_asr(socket_ack_count, asp);
243 286
244 if (msg_ack_count == bcp->cpus_in_uvhub) { 287 if (msg_ack_count == bcp->cpus_in_uvhub) {
245 /* 288 /*
246 * All cpus in uvhub saw it; reply 289 * All cpus in uvhub saw it; reply
247 */ 290 */
248 uv_reply_to_message(mdp, bcp); 291 reply_to_message(mdp, bcp);
249 } 292 }
250 } 293 }
251 294
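
bau_process_message() counts acknowledgements per socket first, and only when every cpu on the socket has responded does it fold the socket total into the per-message count, so the message's cache line is not bounced on every cpu. A rough userspace sketch of that two-level counting, using C11 atomics in place of atom_asr(); the struct names and cpu counts are illustrative, not the kernel's:

#include <stdatomic.h>
#include <stdio.h>

struct msg { atomic_short acknowledge_count; };
struct socket_master { atomic_short socket_ack_count; };

/* Called on each cpu that finished the shootdown for this message. */
static void ack_message(struct msg *m, struct socket_master *sm,
			int cpus_in_socket, int cpus_in_hub)
{
	short sock = atomic_fetch_add(&sm->socket_ack_count, 1) + 1;
	if (sock == cpus_in_socket) {
		/* whole socket done: dump its total into the message */
		atomic_store(&sm->socket_ack_count, 0);
		short hub = atomic_fetch_add(&m->acknowledge_count, sock) + sock;
		if (hub == cpus_in_hub)
			printf("all %d cpus acked; reply to sender\n", hub);
	}
}

int main(void)
{
	struct msg m = { 0 };
	struct socket_master sm = { 0 };
	for (int cpu = 0; cpu < 4; cpu++)
		ack_message(&m, &sm, 4, 4);   /* one 4-cpu socket on a 4-cpu hub */
	return 0;
}
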
@@ -268,62 +311,51 @@ static int uvhub_to_first_cpu(int uvhub)
268 * Last resort when we get a large number of destination timeouts is 311 * Last resort when we get a large number of destination timeouts is
269 * to clear resources held by a given cpu. 312 * to clear resources held by a given cpu.
270 * Do this with IPI so that all messages in the BAU message queue 313 * Do this with IPI so that all messages in the BAU message queue
271 * can be identified by their nonzero sw_ack_vector field. 314 * can be identified by their nonzero swack_vec field.
272 * 315 *
273 * This is entered for a single cpu on the uvhub. 316 * This is entered for a single cpu on the uvhub.
274 * The sender wants this uvhub to free a specific message's 317 * The sender wants this uvhub to free a specific message's
275 * sw_ack resources. 318 * swack resources.
276 */ 319 */
277static void 320static void do_reset(void *ptr)
278uv_do_reset(void *ptr)
279{ 321{
280 int i; 322 int i;
281 int slot; 323 struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
282 int count = 0; 324 struct reset_args *rap = (struct reset_args *)ptr;
283 unsigned long mmr; 325 struct bau_pq_entry *msg;
284 unsigned long msg_res; 326 struct ptc_stats *stat = bcp->statp;
285 struct bau_control *bcp;
286 struct reset_args *rap;
287 struct bau_payload_queue_entry *msg;
288 struct ptc_stats *stat;
289 327
290 bcp = &per_cpu(bau_control, smp_processor_id());
291 rap = (struct reset_args *)ptr;
292 stat = bcp->statp;
293 stat->d_resets++; 328 stat->d_resets++;
294
295 /* 329 /*
296 * We're looking for the given sender, and 330 * We're looking for the given sender, and
297 * will free its sw_ack resource. 331 * will free its swack resource.
298 * If all cpu's finally responded after the timeout, its 332 * If all cpu's finally responded after the timeout, its
299 * message 'replied_to' was set. 333 * message 'replied_to' was set.
300 */ 334 */
301 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) { 335 for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
302 /* uv_do_reset: same conditions for cancellation as 336 unsigned long msg_res;
303 uv_bau_process_retry_msg() */ 337 /* do_reset: same conditions for cancellation as
338 bau_process_retry_msg() */
304 if ((msg->replied_to == 0) && 339 if ((msg->replied_to == 0) &&
305 (msg->canceled == 0) && 340 (msg->canceled == 0) &&
306 (msg->sending_cpu == rap->sender) && 341 (msg->sending_cpu == rap->sender) &&
307 (msg->sw_ack_vector) && 342 (msg->swack_vec) &&
308 (msg->msg_type != MSG_NOOP)) { 343 (msg->msg_type != MSG_NOOP)) {
344 unsigned long mmr;
345 unsigned long mr;
309 /* 346 /*
310 * make everyone else ignore this message 347 * make everyone else ignore this message
311 */ 348 */
312 msg->canceled = 1; 349 msg->canceled = 1;
313 slot = msg - bcp->va_queue_first;
314 count++;
315 /* 350 /*
316 * only reset the resource if it is still pending 351 * only reset the resource if it is still pending
317 */ 352 */
318 mmr = uv_read_local_mmr 353 mmr = read_mmr_sw_ack();
319 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); 354 msg_res = msg->swack_vec;
320 msg_res = msg->sw_ack_vector; 355 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
321 if (mmr & msg_res) { 356 if (mmr & msg_res) {
322 stat->d_rcanceled++; 357 stat->d_rcanceled++;
323 uv_write_local_mmr( 358 write_mmr_sw_ack(mr);
324 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
325 (msg_res << UV_SW_ACK_NPENDING) |
326 msg_res);
327 } 359 }
328 } 360 }
329 } 361 }
@@ -334,39 +366,38 @@ uv_do_reset(void *ptr)
334 * Use IPI to get all target uvhubs to release resources held by 366 * Use IPI to get all target uvhubs to release resources held by
335 * a given sending cpu number. 367 * a given sending cpu number.
336 */ 368 */
337static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution, 369static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender)
338 int sender)
339{ 370{
340 int uvhub; 371 int uvhub;
341 int cpu; 372 int maskbits;
342 cpumask_t mask; 373 cpumask_t mask;
343 struct reset_args reset_args; 374 struct reset_args reset_args;
344 375
345 reset_args.sender = sender; 376 reset_args.sender = sender;
346
347 cpus_clear(mask); 377 cpus_clear(mask);
348 /* find a single cpu for each uvhub in this distribution mask */ 378 /* find a single cpu for each uvhub in this distribution mask */
349 for (uvhub = 0; 379 maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE;
350 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE; 380 for (uvhub = 0; uvhub < maskbits; uvhub++) {
351 uvhub++) { 381 int cpu;
352 if (!bau_uvhub_isset(uvhub, distribution)) 382 if (!bau_uvhub_isset(uvhub, distribution))
353 continue; 383 continue;
354 /* find a cpu for this uvhub */ 384 /* find a cpu for this uvhub */
355 cpu = uvhub_to_first_cpu(uvhub); 385 cpu = uvhub_to_first_cpu(uvhub);
356 cpu_set(cpu, mask); 386 cpu_set(cpu, mask);
357 } 387 }
358 /* IPI all cpus; Preemption is already disabled */ 388
359 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1); 389 /* IPI all cpus; preemption is already disabled */
390 smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1);
360 return; 391 return;
361} 392}
362 393
363static inline unsigned long 394static inline unsigned long cycles_2_us(unsigned long long cyc)
364cycles_2_us(unsigned long long cyc)
365{ 395{
366 unsigned long long ns; 396 unsigned long long ns;
367 unsigned long us; 397 unsigned long us;
368 ns = (cyc * per_cpu(cyc2ns, smp_processor_id())) 398 int cpu = smp_processor_id();
369 >> CYC2NS_SCALE_FACTOR; 399
400 ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
370 us = ns / 1000; 401 us = ns / 1000;
371 return us; 402 return us;
372} 403}
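
cycles_2_us() converts TSC cycles to microseconds with the per-cpu cyc2ns fixed-point scale: ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR, then us = ns / 1000. A standalone sketch of the arithmetic, assuming CYC2NS_SCALE_FACTOR is 10 and a hypothetical ~2.5 GHz cpu (about 0.4 ns per cycle, i.e. 410 in 2^10 fixed point):

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10   /* assumed; the kernel's fixed-point shift */

static unsigned long cycles_2_us(unsigned long long cyc, unsigned long cyc2ns)
{
	unsigned long long ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR;
	return ns / 1000;
}

int main(void)
{
	unsigned long cyc2ns = 410;   /* ~0.4 ns/cycle for a ~2.5 GHz clock */
	printf("%lu us\n", cycles_2_us(2500000ULL, cyc2ns));   /* ~1000 us */
	return 0;
}
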
@@ -376,56 +407,56 @@ cycles_2_us(unsigned long long cyc)
376 * leaves uvhub_quiesce set so that no new broadcasts are started by 407 * leaves uvhub_quiesce set so that no new broadcasts are started by
377 * bau_flush_send_and_wait() 408 * bau_flush_send_and_wait()
378 */ 409 */
379static inline void 410static inline void quiesce_local_uvhub(struct bau_control *hmaster)
380quiesce_local_uvhub(struct bau_control *hmaster)
381{ 411{
382 atomic_add_short_return(1, (struct atomic_short *) 412 atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
383 &hmaster->uvhub_quiesce);
384} 413}
385 414
386/* 415/*
387 * mark this quiet-requestor as done 416 * mark this quiet-requestor as done
388 */ 417 */
389static inline void 418static inline void end_uvhub_quiesce(struct bau_control *hmaster)
390end_uvhub_quiesce(struct bau_control *hmaster)
391{ 419{
392 atomic_add_short_return(-1, (struct atomic_short *) 420 atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
393 &hmaster->uvhub_quiesce); 421}
422
423static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
424{
425 unsigned long descriptor_status;
426
427 descriptor_status = uv_read_local_mmr(mmr_offset);
428 descriptor_status >>= right_shift;
429 descriptor_status &= UV_ACT_STATUS_MASK;
430 return descriptor_status;
394} 431}
395 432
396/* 433/*
397 * Wait for completion of a broadcast software ack message 434 * Wait for completion of a broadcast software ack message
398 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP 435 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
399 */ 436 */
400static int uv_wait_completion(struct bau_desc *bau_desc, 437static int uv1_wait_completion(struct bau_desc *bau_desc,
401 unsigned long mmr_offset, int right_shift, int this_cpu, 438 unsigned long mmr_offset, int right_shift,
402 struct bau_control *bcp, struct bau_control *smaster, long try) 439 struct bau_control *bcp, long try)
403{ 440{
404 unsigned long descriptor_status; 441 unsigned long descriptor_status;
405 cycles_t ttime; 442 cycles_t ttm;
406 struct ptc_stats *stat = bcp->statp; 443 struct ptc_stats *stat = bcp->statp;
407 struct bau_control *hmaster;
408
409 hmaster = bcp->uvhub_master;
410 444
445 descriptor_status = uv1_read_status(mmr_offset, right_shift);
411 /* spin on the status MMR, waiting for it to go idle */ 446 /* spin on the status MMR, waiting for it to go idle */
412 while ((descriptor_status = (((unsigned long) 447 while ((descriptor_status != DS_IDLE)) {
413 uv_read_local_mmr(mmr_offset) >>
414 right_shift) & UV_ACT_STATUS_MASK)) !=
415 DESC_STATUS_IDLE) {
416 /* 448 /*
417 * Our software ack messages may be blocked because there are 449 * Our software ack messages may be blocked because
418 * no swack resources available. As long as none of them 450 * there are no swack resources available. As long
419 * has timed out hardware will NACK our message and its 451 * as none of them has timed out hardware will NACK
420 * state will stay IDLE. 452 * our message and its state will stay IDLE.
421 */ 453 */
422 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) { 454 if (descriptor_status == DS_SOURCE_TIMEOUT) {
423 stat->s_stimeout++; 455 stat->s_stimeout++;
424 return FLUSH_GIVEUP; 456 return FLUSH_GIVEUP;
425 } else if (descriptor_status == 457 } else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
426 DESC_STATUS_DESTINATION_TIMEOUT) {
427 stat->s_dtimeout++; 458 stat->s_dtimeout++;
428 ttime = get_cycles(); 459 ttm = get_cycles();
429 460
430 /* 461 /*
431 * Our retries may be blocked by all destination 462 * Our retries may be blocked by all destination
@@ -433,8 +464,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
433 * pending. In that case hardware returns the 464 * pending. In that case hardware returns the
434 * ERROR that looks like a destination timeout. 465 * ERROR that looks like a destination timeout.
435 */ 466 */
436 if (cycles_2_us(ttime - bcp->send_message) < 467 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
437 timeout_us) {
438 bcp->conseccompletes = 0; 468 bcp->conseccompletes = 0;
439 return FLUSH_RETRY_PLUGGED; 469 return FLUSH_RETRY_PLUGGED;
440 } 470 }
@@ -447,80 +477,160 @@ static int uv_wait_completion(struct bau_desc *bau_desc,
447 */ 477 */
448 cpu_relax(); 478 cpu_relax();
449 } 479 }
480 descriptor_status = uv1_read_status(mmr_offset, right_shift);
450 } 481 }
451 bcp->conseccompletes++; 482 bcp->conseccompletes++;
452 return FLUSH_COMPLETE; 483 return FLUSH_COMPLETE;
453} 484}
454 485
455static inline cycles_t 486/*
456sec_2_cycles(unsigned long sec) 487 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
488 */
489static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
457{ 490{
458 unsigned long ns; 491 unsigned long descriptor_status;
459 cycles_t cyc; 492 unsigned long descriptor_status2;
460 493
461 ns = sec * 1000000000; 494 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
462 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); 495 descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
463 return cyc; 496 descriptor_status = (descriptor_status << 1) | descriptor_status2;
497 return descriptor_status;
498}
499
500static int uv2_wait_completion(struct bau_desc *bau_desc,
501 unsigned long mmr_offset, int right_shift,
502 struct bau_control *bcp, long try)
503{
504 unsigned long descriptor_stat;
505 cycles_t ttm;
506 int cpu = bcp->uvhub_cpu;
507 struct ptc_stats *stat = bcp->statp;
508
509 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
510
511 /* spin on the status MMR, waiting for it to go idle */
512 while (descriptor_stat != UV2H_DESC_IDLE) {
513 /*
514 * Our software ack messages may be blocked because
515 * there are no swack resources available. As long
516 * as none of them has timed out hardware will NACK
517 * our message and its state will stay IDLE.
518 */
519 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
520 (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
521 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
522 stat->s_stimeout++;
523 return FLUSH_GIVEUP;
524 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
525 stat->s_dtimeout++;
526 ttm = get_cycles();
527 /*
528 * Our retries may be blocked by all destination
529 * swack resources being consumed, and a timeout
530 * pending. In that case hardware returns the
531 * ERROR that looks like a destination timeout.
532 */
533 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
534 bcp->conseccompletes = 0;
535 return FLUSH_RETRY_PLUGGED;
536 }
537 bcp->conseccompletes = 0;
538 return FLUSH_RETRY_TIMEOUT;
539 } else {
540 /*
541 * descriptor_stat is still BUSY
542 */
543 cpu_relax();
544 }
545 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
546 }
547 bcp->conseccompletes++;
548 return FLUSH_COMPLETE;
464} 549}
465 550
466/* 551/*
467 * conditionally add 1 to *v, unless *v is >= u 552 * There are 2 status registers; each an array[32] of 2 bits. Set up for
468 * return 0 if we cannot add 1 to *v because it is >= u 553 * which register to read and position in that register based on cpu in
469 * return 1 if we can add 1 to *v because it is < u 554 * current hub.
470 * the add is atomic
471 *
472 * This is close to atomic_add_unless(), but this allows the 'u' value
473 * to be lowered below the current 'v'. atomic_add_unless can only stop
474 * on equal.
475 */ 555 */
476static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) 556static int wait_completion(struct bau_desc *bau_desc,
557 struct bau_control *bcp, long try)
477{ 558{
478 spin_lock(lock); 559 int right_shift;
479 if (atomic_read(v) >= u) { 560 unsigned long mmr_offset;
480 spin_unlock(lock); 561 int cpu = bcp->uvhub_cpu;
481 return 0; 562
563 if (cpu < UV_CPUS_PER_AS) {
564 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
565 right_shift = cpu * UV_ACT_STATUS_SIZE;
566 } else {
567 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
568 right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
482 } 569 }
483 atomic_inc(v); 570
484 spin_unlock(lock); 571 if (is_uv1_hub())
485 return 1; 572 return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
573 bcp, try);
574 else
575 return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
576 bcp, try);
577}
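
wait_completion() picks which ACTIVATION_STATUS register to poll, and the bit position within it, from the cpu's index on the hub: the first UV_CPUS_PER_AS cpus use STATUS_0, the rest use STATUS_1, each cpu owning a 2-bit field. A small sketch of that selection, assuming UV_CPUS_PER_AS of 32 and UV_ACT_STATUS_SIZE of 2; the MMR offsets shown are placeholders, not the real register addresses:

#include <stdio.h>

#define UV_CPUS_PER_AS     32   /* assumed: cpus covered per status register */
#define UV_ACT_STATUS_SIZE  2   /* assumed: status bits per descriptor */

#define STATUS_0_OFFSET 0x320030UL   /* placeholder offsets for illustration */
#define STATUS_1_OFFSET 0x320040UL

static void status_location(int uvhub_cpu, unsigned long *offset, int *shift)
{
	if (uvhub_cpu < UV_CPUS_PER_AS) {
		*offset = STATUS_0_OFFSET;
		*shift = uvhub_cpu * UV_ACT_STATUS_SIZE;
	} else {
		*offset = STATUS_1_OFFSET;
		*shift = (uvhub_cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE;
	}
}

int main(void)
{
	unsigned long off;
	int shift;

	status_location(37, &off, &shift);
	printf("cpu 37: register 0x%lx, shift %d\n", off, shift);   /* STATUS_1, shift 10 */
	return 0;
}
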
578
579static inline cycles_t sec_2_cycles(unsigned long sec)
580{
581 unsigned long ns;
582 cycles_t cyc;
583
584 ns = sec * 1000000000;
585 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
586 return cyc;
486} 587}
487 588
488/* 589/*
489 * Our retries are blocked by all destination swack resources being 590 * Our retries are blocked by all destination sw ack resources being
490 * in use, and a timeout is pending. In that case hardware immediately 591 * in use, and a timeout is pending. In that case hardware immediately
491 * returns the ERROR that looks like a destination timeout. 592 * returns the ERROR that looks like a destination timeout.
492 */ 593 */
493static void 594static void destination_plugged(struct bau_desc *bau_desc,
494destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, 595 struct bau_control *bcp,
495 struct bau_control *hmaster, struct ptc_stats *stat) 596 struct bau_control *hmaster, struct ptc_stats *stat)
496{ 597{
497 udelay(bcp->plugged_delay); 598 udelay(bcp->plugged_delay);
498 bcp->plugged_tries++; 599 bcp->plugged_tries++;
600
499 if (bcp->plugged_tries >= bcp->plugsb4reset) { 601 if (bcp->plugged_tries >= bcp->plugsb4reset) {
500 bcp->plugged_tries = 0; 602 bcp->plugged_tries = 0;
603
501 quiesce_local_uvhub(hmaster); 604 quiesce_local_uvhub(hmaster);
605
502 spin_lock(&hmaster->queue_lock); 606 spin_lock(&hmaster->queue_lock);
503 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); 607 reset_with_ipi(&bau_desc->distribution, bcp->cpu);
504 spin_unlock(&hmaster->queue_lock); 608 spin_unlock(&hmaster->queue_lock);
609
505 end_uvhub_quiesce(hmaster); 610 end_uvhub_quiesce(hmaster);
611
506 bcp->ipi_attempts++; 612 bcp->ipi_attempts++;
507 stat->s_resets_plug++; 613 stat->s_resets_plug++;
508 } 614 }
509} 615}
510 616
511static void 617static void destination_timeout(struct bau_desc *bau_desc,
512destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, 618 struct bau_control *bcp, struct bau_control *hmaster,
513 struct bau_control *hmaster, struct ptc_stats *stat) 619 struct ptc_stats *stat)
514{ 620{
515 hmaster->max_bau_concurrent = 1; 621 hmaster->max_concurr = 1;
516 bcp->timeout_tries++; 622 bcp->timeout_tries++;
517 if (bcp->timeout_tries >= bcp->timeoutsb4reset) { 623 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
518 bcp->timeout_tries = 0; 624 bcp->timeout_tries = 0;
625
519 quiesce_local_uvhub(hmaster); 626 quiesce_local_uvhub(hmaster);
627
520 spin_lock(&hmaster->queue_lock); 628 spin_lock(&hmaster->queue_lock);
521 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); 629 reset_with_ipi(&bau_desc->distribution, bcp->cpu);
522 spin_unlock(&hmaster->queue_lock); 630 spin_unlock(&hmaster->queue_lock);
631
523 end_uvhub_quiesce(hmaster); 632 end_uvhub_quiesce(hmaster);
633
524 bcp->ipi_attempts++; 634 bcp->ipi_attempts++;
525 stat->s_resets_timeout++; 635 stat->s_resets_timeout++;
526 } 636 }
@@ -530,34 +640,104 @@ destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
530 * Completions are taking a very long time due to a congested numalink 640 * Completions are taking a very long time due to a congested numalink
531 * network. 641 * network.
532 */ 642 */
533static void 643static void disable_for_congestion(struct bau_control *bcp,
534disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) 644 struct ptc_stats *stat)
535{ 645{
536 int tcpu;
537 struct bau_control *tbcp;
538
539 /* let only one cpu do this disabling */ 646 /* let only one cpu do this disabling */
540 spin_lock(&disable_lock); 647 spin_lock(&disable_lock);
648
541 if (!baudisabled && bcp->period_requests && 649 if (!baudisabled && bcp->period_requests &&
542 ((bcp->period_time / bcp->period_requests) > congested_cycles)) { 650 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
651 int tcpu;
652 struct bau_control *tbcp;
543 /* it becomes this cpu's job to turn on the use of the 653 /* it becomes this cpu's job to turn on the use of the
544 BAU again */ 654 BAU again */
545 baudisabled = 1; 655 baudisabled = 1;
546 bcp->set_bau_off = 1; 656 bcp->set_bau_off = 1;
547 bcp->set_bau_on_time = get_cycles() + 657 bcp->set_bau_on_time = get_cycles();
548 sec_2_cycles(bcp->congested_period); 658 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
549 stat->s_bau_disabled++; 659 stat->s_bau_disabled++;
550 for_each_present_cpu(tcpu) { 660 for_each_present_cpu(tcpu) {
551 tbcp = &per_cpu(bau_control, tcpu); 661 tbcp = &per_cpu(bau_control, tcpu);
552 tbcp->baudisabled = 1; 662 tbcp->baudisabled = 1;
553 } 663 }
554 } 664 }
665
555 spin_unlock(&disable_lock); 666 spin_unlock(&disable_lock);
556} 667}
557 668
558/** 669static void count_max_concurr(int stat, struct bau_control *bcp,
559 * uv_flush_send_and_wait 670 struct bau_control *hmaster)
560 * 671{
672 bcp->plugged_tries = 0;
673 bcp->timeout_tries = 0;
674 if (stat != FLUSH_COMPLETE)
675 return;
676 if (bcp->conseccompletes <= bcp->complete_threshold)
677 return;
678 if (hmaster->max_concurr >= hmaster->max_concurr_const)
679 return;
680 hmaster->max_concurr++;
681}
682
683static void record_send_stats(cycles_t time1, cycles_t time2,
684 struct bau_control *bcp, struct ptc_stats *stat,
685 int completion_status, int try)
686{
687 cycles_t elapsed;
688
689 if (time2 > time1) {
690 elapsed = time2 - time1;
691 stat->s_time += elapsed;
692
693 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
694 bcp->period_requests++;
695 bcp->period_time += elapsed;
696 if ((elapsed > congested_cycles) &&
697 (bcp->period_requests > bcp->cong_reps))
698 disable_for_congestion(bcp, stat);
699 }
700 } else
701 stat->s_requestor--;
702
703 if (completion_status == FLUSH_COMPLETE && try > 1)
704 stat->s_retriesok++;
705 else if (completion_status == FLUSH_GIVEUP)
706 stat->s_giveup++;
707}
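
record_send_stats() accumulates per-period request time, and disable_for_congestion() trips only when the average round trip over the period exceeds congested_cycles after more than cong_reps requests. A condensed sketch of that decision on plain variables; the thresholds are illustrative, not the kernel defaults:

#include <stdio.h>

/* Return 1 if the running average says the fabric is congested. */
static int should_disable(unsigned long period_time, int period_requests,
			  unsigned long congested_cycles, int cong_reps)
{
	if (period_requests <= cong_reps)
		return 0;
	return (period_time / period_requests) > congested_cycles;
}

int main(void)
{
	/* 150 requests averaging 12000 cycles against a 10000-cycle threshold */
	printf("%s\n", should_disable(150 * 12000UL, 150, 10000, 100)
		       ? "disable BAU" : "keep BAU");
	return 0;
}
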
708
709/*
710 * Because of a uv1 hardware bug only a limited number of concurrent
711 * requests can be made.
712 */
713static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
714{
715 spinlock_t *lock = &hmaster->uvhub_lock;
716 atomic_t *v;
717
718 v = &hmaster->active_descriptor_count;
719 if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
720 stat->s_throttles++;
721 do {
722 cpu_relax();
723 } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
724 }
725}
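
uv1_throttle() spins until it can raise active_descriptor_count without exceeding max_concurr; unlike atomic_add_unless(), the limit may be lowered below the current count at any time. A userspace analogue of that primitive using a mutex in place of the kernel's spinlock and atomic_t; the names here are illustrative:

#include <pthread.h>
#include <sched.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int active_count;

/* Add 1 to *v only if it is currently below the limit; return 1 on success. */
static int inc_unless_ge(int *v, int limit)
{
	int ok = 0;

	pthread_mutex_lock(&lock);
	if (*v < limit) {
		(*v)++;
		ok = 1;
	}
	pthread_mutex_unlock(&lock);
	return ok;
}

static void throttle(int max_concurr)
{
	while (!inc_unless_ge(&active_count, max_concurr))
		sched_yield();   /* the kernel code uses cpu_relax() here */
}

int main(void)
{
	throttle(2);
	throttle(2);
	printf("active descriptors: %d\n", active_count);   /* 2 */
	return 0;
}
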
726
727/*
728 * Handle the completion status of a message send.
729 */
730static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
731 struct bau_control *bcp, struct bau_control *hmaster,
732 struct ptc_stats *stat)
733{
734 if (completion_status == FLUSH_RETRY_PLUGGED)
735 destination_plugged(bau_desc, bcp, hmaster, stat);
736 else if (completion_status == FLUSH_RETRY_TIMEOUT)
737 destination_timeout(bau_desc, bcp, hmaster, stat);
738}
739
740/*
561 * Send a broadcast and wait for it to complete. 741 * Send a broadcast and wait for it to complete.
562 * 742 *
563 * The flush_mask contains the cpus the broadcast is to be sent to including 743 * The flush_mask contains the cpus the broadcast is to be sent to including
@@ -568,44 +748,23 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
568 * returned to the kernel. 748 * returned to the kernel.
569 */ 749 */
570int uv_flush_send_and_wait(struct bau_desc *bau_desc, 750int uv_flush_send_and_wait(struct bau_desc *bau_desc,
571 struct cpumask *flush_mask, struct bau_control *bcp) 751 struct cpumask *flush_mask, struct bau_control *bcp)
572{ 752{
573 int right_shift;
574 int completion_status = 0;
575 int seq_number = 0; 753 int seq_number = 0;
754 int completion_stat = 0;
576 long try = 0; 755 long try = 0;
577 int cpu = bcp->uvhub_cpu;
578 int this_cpu = bcp->cpu;
579 unsigned long mmr_offset;
580 unsigned long index; 756 unsigned long index;
581 cycles_t time1; 757 cycles_t time1;
582 cycles_t time2; 758 cycles_t time2;
583 cycles_t elapsed;
584 struct ptc_stats *stat = bcp->statp; 759 struct ptc_stats *stat = bcp->statp;
585 struct bau_control *smaster = bcp->socket_master;
586 struct bau_control *hmaster = bcp->uvhub_master; 760 struct bau_control *hmaster = bcp->uvhub_master;
587 761
588 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, 762 if (is_uv1_hub())
589 &hmaster->active_descriptor_count, 763 uv1_throttle(hmaster, stat);
590 hmaster->max_bau_concurrent)) { 764
591 stat->s_throttles++;
592 do {
593 cpu_relax();
594 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
595 &hmaster->active_descriptor_count,
596 hmaster->max_bau_concurrent));
597 }
598 while (hmaster->uvhub_quiesce) 765 while (hmaster->uvhub_quiesce)
599 cpu_relax(); 766 cpu_relax();
600 767
601 if (cpu < UV_CPUS_PER_ACT_STATUS) {
602 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
603 right_shift = cpu * UV_ACT_STATUS_SIZE;
604 } else {
605 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
606 right_shift =
607 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
608 }
609 time1 = get_cycles(); 768 time1 = get_cycles();
610 do { 769 do {
611 if (try == 0) { 770 if (try == 0) {
@@ -615,64 +774,134 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
615 bau_desc->header.msg_type = MSG_RETRY; 774 bau_desc->header.msg_type = MSG_RETRY;
616 stat->s_retry_messages++; 775 stat->s_retry_messages++;
617 } 776 }
777
618 bau_desc->header.sequence = seq_number; 778 bau_desc->header.sequence = seq_number;
619 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | 779 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
620 bcp->uvhub_cpu;
621 bcp->send_message = get_cycles(); 780 bcp->send_message = get_cycles();
622 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); 781
782 write_mmr_activation(index);
783
623 try++; 784 try++;
624 completion_status = uv_wait_completion(bau_desc, mmr_offset, 785 completion_stat = wait_completion(bau_desc, bcp, try);
625 right_shift, this_cpu, bcp, smaster, try); 786
787 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
626 788
627 if (completion_status == FLUSH_RETRY_PLUGGED) {
628 destination_plugged(bau_desc, bcp, hmaster, stat);
629 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
630 destination_timeout(bau_desc, bcp, hmaster, stat);
631 }
632 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { 789 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
633 bcp->ipi_attempts = 0; 790 bcp->ipi_attempts = 0;
634 completion_status = FLUSH_GIVEUP; 791 completion_stat = FLUSH_GIVEUP;
635 break; 792 break;
636 } 793 }
637 cpu_relax(); 794 cpu_relax();
638 } while ((completion_status == FLUSH_RETRY_PLUGGED) || 795 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
639 (completion_status == FLUSH_RETRY_TIMEOUT)); 796 (completion_stat == FLUSH_RETRY_TIMEOUT));
797
640 time2 = get_cycles(); 798 time2 = get_cycles();
641 bcp->plugged_tries = 0; 799
642 bcp->timeout_tries = 0; 800 count_max_concurr(completion_stat, bcp, hmaster);
643 if ((completion_status == FLUSH_COMPLETE) && 801
644 (bcp->conseccompletes > bcp->complete_threshold) &&
645 (hmaster->max_bau_concurrent <
646 hmaster->max_bau_concurrent_constant))
647 hmaster->max_bau_concurrent++;
648 while (hmaster->uvhub_quiesce) 802 while (hmaster->uvhub_quiesce)
649 cpu_relax(); 803 cpu_relax();
804
650 atomic_dec(&hmaster->active_descriptor_count); 805 atomic_dec(&hmaster->active_descriptor_count);
651 if (time2 > time1) { 806
652 elapsed = time2 - time1; 807 record_send_stats(time1, time2, bcp, stat, completion_stat, try);
653 stat->s_time += elapsed; 808
654 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { 809 if (completion_stat == FLUSH_GIVEUP)
655 bcp->period_requests++; 810 return 1;
656 bcp->period_time += elapsed; 811 return 0;
657 if ((elapsed > congested_cycles) && 812}
658 (bcp->period_requests > bcp->congested_reps)) { 813
659 disable_for_congestion(bcp, stat); 814/*
815 * The BAU is disabled. When the disabled time period has expired, the cpu
816 * that disabled it must re-enable it.
817 * Return 0 if it is re-enabled for all cpus.
818 */
819static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
820{
821 int tcpu;
822 struct bau_control *tbcp;
823
824 if (bcp->set_bau_off) {
825 if (get_cycles() >= bcp->set_bau_on_time) {
826 stat->s_bau_reenabled++;
827 baudisabled = 0;
828 for_each_present_cpu(tcpu) {
829 tbcp = &per_cpu(bau_control, tcpu);
830 tbcp->baudisabled = 0;
831 tbcp->period_requests = 0;
832 tbcp->period_time = 0;
660 } 833 }
834 return 0;
661 } 835 }
836 }
837 return -1;
838}
839
840static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
841 int remotes, struct bau_desc *bau_desc)
842{
843 stat->s_requestor++;
844 stat->s_ntargcpu += remotes + locals;
845 stat->s_ntargremotes += remotes;
846 stat->s_ntarglocals += locals;
847
848 /* uvhub statistics */
849 hubs = bau_uvhub_weight(&bau_desc->distribution);
850 if (locals) {
851 stat->s_ntarglocaluvhub++;
852 stat->s_ntargremoteuvhub += (hubs - 1);
662 } else 853 } else
663 stat->s_requestor--; 854 stat->s_ntargremoteuvhub += hubs;
664 if (completion_status == FLUSH_COMPLETE && try > 1) 855
665 stat->s_retriesok++; 856 stat->s_ntarguvhub += hubs;
666 else if (completion_status == FLUSH_GIVEUP) { 857
667 stat->s_giveup++; 858 if (hubs >= 16)
668 return 1; 859 stat->s_ntarguvhub16++;
860 else if (hubs >= 8)
861 stat->s_ntarguvhub8++;
862 else if (hubs >= 4)
863 stat->s_ntarguvhub4++;
864 else if (hubs >= 2)
865 stat->s_ntarguvhub2++;
866 else
867 stat->s_ntarguvhub1++;
868}
869
870/*
871 * Translate a cpu mask to the uvhub distribution mask in the BAU
872 * activation descriptor.
873 */
874static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
875 struct bau_desc *bau_desc, int *localsp, int *remotesp)
876{
877 int cpu;
878 int pnode;
879 int cnt = 0;
880 struct hub_and_pnode *hpp;
881
882 for_each_cpu(cpu, flush_mask) {
883 /*
884 * The distribution vector is a bit map of pnodes, relative
885 * to the partition base pnode (and the partition base nasid
886 * in the header).
887 * Translate cpu to pnode and hub using a local memory array.
888 */
889 hpp = &bcp->socket_master->thp[cpu];
890 pnode = hpp->pnode - bcp->partition_base_pnode;
891 bau_uvhub_set(pnode, &bau_desc->distribution);
892 cnt++;
893 if (hpp->uvhub == bcp->uvhub)
894 (*localsp)++;
895 else
896 (*remotesp)++;
669 } 897 }
898 if (!cnt)
899 return 1;
670 return 0; 900 return 0;
671} 901}
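
set_distrib_bits() converts the cpu mask into the descriptor's hub distribution bitmap: each target cpu is translated to its pnode, the partition base pnode is subtracted, and that relative bit is set. A condensed sketch with a plain unsigned-long bitmap and made-up cpu-to-pnode data:

#include <stdio.h>

#define BITS_PER_LONG (8 * sizeof(unsigned long))

struct hub_and_pnode { short uvhub; short pnode; };

/* Set one bit per target hub, relative to the partition base pnode.
 * Returns the number of cpus that were mapped in. */
static int set_distrib_bits(const int *cpus, int ncpus,
			    const struct hub_and_pnode *thp,
			    int base_pnode, unsigned long *distribution)
{
	int cnt = 0;

	for (int i = 0; i < ncpus; i++) {
		int rel = thp[cpus[i]].pnode - base_pnode;
		if (rel >= 0 && rel < (int)BITS_PER_LONG)
			*distribution |= 1UL << rel;
		cnt++;
	}
	return cnt;
}

int main(void)
{
	struct hub_and_pnode thp[4] = { {0, 4}, {0, 4}, {1, 6}, {1, 6} };  /* made up */
	int cpus[] = { 1, 2, 3 };
	unsigned long dist = 0;

	set_distrib_bits(cpus, 3, thp, 4, &dist);
	printf("distribution bitmap: 0x%lx\n", dist);   /* bits 0 and 2 set */
	return 0;
}
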
672 902
673/** 903/*
674 * uv_flush_tlb_others - globally purge translation cache of a virtual 904 * globally purge translation cache of a virtual address or all TLB's
675 * address or all TLB's
676 * @cpumask: mask of all cpu's in which the address is to be removed 905 * @cpumask: mask of all cpu's in which the address is to be removed
677 * @mm: mm_struct containing virtual address range 906 * @mm: mm_struct containing virtual address range
678 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu) 907 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
@@ -696,20 +925,16 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc,
696 * done. The returned pointer is valid till preemption is re-enabled. 925 * done. The returned pointer is valid till preemption is re-enabled.
697 */ 926 */
698const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 927const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
699 struct mm_struct *mm, 928 struct mm_struct *mm, unsigned long va,
700 unsigned long va, unsigned int cpu) 929 unsigned int cpu)
701{ 930{
702 int locals = 0; 931 int locals = 0;
703 int remotes = 0; 932 int remotes = 0;
704 int hubs = 0; 933 int hubs = 0;
705 int tcpu;
706 int tpnode;
707 struct bau_desc *bau_desc; 934 struct bau_desc *bau_desc;
708 struct cpumask *flush_mask; 935 struct cpumask *flush_mask;
709 struct ptc_stats *stat; 936 struct ptc_stats *stat;
710 struct bau_control *bcp; 937 struct bau_control *bcp;
711 struct bau_control *tbcp;
712 struct hub_and_pnode *hpp;
713 938
714 /* kernel was booted 'nobau' */ 939 /* kernel was booted 'nobau' */
715 if (nobau) 940 if (nobau)
@@ -720,20 +945,8 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
720 945
721 /* bau was disabled due to slow response */ 946 /* bau was disabled due to slow response */
722 if (bcp->baudisabled) { 947 if (bcp->baudisabled) {
723 /* the cpu that disabled it must re-enable it */ 948 if (check_enable(bcp, stat))
724 if (bcp->set_bau_off) { 949 return cpumask;
725 if (get_cycles() >= bcp->set_bau_on_time) {
726 stat->s_bau_reenabled++;
727 baudisabled = 0;
728 for_each_present_cpu(tcpu) {
729 tbcp = &per_cpu(bau_control, tcpu);
730 tbcp->baudisabled = 0;
731 tbcp->period_requests = 0;
732 tbcp->period_time = 0;
733 }
734 }
735 }
736 return cpumask;
737 } 950 }
738 951
739 /* 952 /*
@@ -744,59 +957,20 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
744 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); 957 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
745 /* don't actually do a shootdown of the local cpu */ 958 /* don't actually do a shootdown of the local cpu */
746 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); 959 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
960
747 if (cpu_isset(cpu, *cpumask)) 961 if (cpu_isset(cpu, *cpumask))
748 stat->s_ntargself++; 962 stat->s_ntargself++;
749 963
750 bau_desc = bcp->descriptor_base; 964 bau_desc = bcp->descriptor_base;
751 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; 965 bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
752 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); 966 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
753 967 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
754 for_each_cpu(tcpu, flush_mask) {
755 /*
756 * The distribution vector is a bit map of pnodes, relative
757 * to the partition base pnode (and the partition base nasid
758 * in the header).
759 * Translate cpu to pnode and hub using an array stored
760 * in local memory.
761 */
762 hpp = &bcp->socket_master->target_hub_and_pnode[tcpu];
763 tpnode = hpp->pnode - bcp->partition_base_pnode;
764 bau_uvhub_set(tpnode, &bau_desc->distribution);
765 if (hpp->uvhub == bcp->uvhub)
766 locals++;
767 else
768 remotes++;
769 }
770 if ((locals + remotes) == 0)
771 return NULL; 968 return NULL;
772 stat->s_requestor++;
773 stat->s_ntargcpu += remotes + locals;
774 stat->s_ntargremotes += remotes;
775 stat->s_ntarglocals += locals;
776 remotes = bau_uvhub_weight(&bau_desc->distribution);
777 969
778 /* uvhub statistics */ 970 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
779 hubs = bau_uvhub_weight(&bau_desc->distribution);
780 if (locals) {
781 stat->s_ntarglocaluvhub++;
782 stat->s_ntargremoteuvhub += (hubs - 1);
783 } else
784 stat->s_ntargremoteuvhub += hubs;
785 stat->s_ntarguvhub += hubs;
786 if (hubs >= 16)
787 stat->s_ntarguvhub16++;
788 else if (hubs >= 8)
789 stat->s_ntarguvhub8++;
790 else if (hubs >= 4)
791 stat->s_ntarguvhub4++;
792 else if (hubs >= 2)
793 stat->s_ntarguvhub2++;
794 else
795 stat->s_ntarguvhub1++;
796 971
797 bau_desc->payload.address = va; 972 bau_desc->payload.address = va;
798 bau_desc->payload.sending_cpu = cpu; 973 bau_desc->payload.sending_cpu = cpu;
799
800 /* 974 /*
801 * uv_flush_send_and_wait returns 0 if all cpu's were messaged, 975 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
802 * or 1 if it gave up and the original cpumask should be returned. 976 * or 1 if it gave up and the original cpumask should be returned.
@@ -825,26 +999,31 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
825{ 999{
826 int count = 0; 1000 int count = 0;
827 cycles_t time_start; 1001 cycles_t time_start;
828 struct bau_payload_queue_entry *msg; 1002 struct bau_pq_entry *msg;
829 struct bau_control *bcp; 1003 struct bau_control *bcp;
830 struct ptc_stats *stat; 1004 struct ptc_stats *stat;
831 struct msg_desc msgdesc; 1005 struct msg_desc msgdesc;
832 1006
833 time_start = get_cycles(); 1007 time_start = get_cycles();
1008
834 bcp = &per_cpu(bau_control, smp_processor_id()); 1009 bcp = &per_cpu(bau_control, smp_processor_id());
835 stat = bcp->statp; 1010 stat = bcp->statp;
836 msgdesc.va_queue_first = bcp->va_queue_first; 1011
837 msgdesc.va_queue_last = bcp->va_queue_last; 1012 msgdesc.queue_first = bcp->queue_first;
1013 msgdesc.queue_last = bcp->queue_last;
1014
838 msg = bcp->bau_msg_head; 1015 msg = bcp->bau_msg_head;
839 while (msg->sw_ack_vector) { 1016 while (msg->swack_vec) {
840 count++; 1017 count++;
841 msgdesc.msg_slot = msg - msgdesc.va_queue_first; 1018
842 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1; 1019 msgdesc.msg_slot = msg - msgdesc.queue_first;
1020 msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
843 msgdesc.msg = msg; 1021 msgdesc.msg = msg;
844 uv_bau_process_message(&msgdesc, bcp); 1022 bau_process_message(&msgdesc, bcp);
1023
845 msg++; 1024 msg++;
846 if (msg > msgdesc.va_queue_last) 1025 if (msg > msgdesc.queue_last)
847 msg = msgdesc.va_queue_first; 1026 msg = msgdesc.queue_first;
848 bcp->bau_msg_head = msg; 1027 bcp->bau_msg_head = msg;
849 } 1028 }
850 stat->d_time += (get_cycles() - time_start); 1029 stat->d_time += (get_cycles() - time_start);
@@ -852,18 +1031,17 @@ void uv_bau_message_interrupt(struct pt_regs *regs)
852 stat->d_nomsg++; 1031 stat->d_nomsg++;
853 else if (count > 1) 1032 else if (count > 1)
854 stat->d_multmsg++; 1033 stat->d_multmsg++;
1034
855 ack_APIC_irq(); 1035 ack_APIC_irq();
856} 1036}
857 1037
858/* 1038/*
859 * uv_enable_timeouts 1039 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
860 *
861 * Each target uvhub (i.e. a uvhub that has no cpu's) needs to have
862 * shootdown message timeouts enabled. The timeout does not cause 1040 * shootdown message timeouts enabled. The timeout does not cause
863 * an interrupt, but causes an error message to be returned to 1041 * an interrupt, but causes an error message to be returned to
864 * the sender. 1042 * the sender.
865 */ 1043 */
866static void __init uv_enable_timeouts(void) 1044static void __init enable_timeouts(void)
867{ 1045{
868 int uvhub; 1046 int uvhub;
869 int nuvhubs; 1047 int nuvhubs;
@@ -877,47 +1055,44 @@ static void __init uv_enable_timeouts(void)
877 continue; 1055 continue;
878 1056
879 pnode = uv_blade_to_pnode(uvhub); 1057 pnode = uv_blade_to_pnode(uvhub);
880 mmr_image = 1058 mmr_image = read_mmr_misc_control(pnode);
881 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
882 /* 1059 /*
883 * Set the timeout period and then lock it in, in three 1060 * Set the timeout period and then lock it in, in three
884 * steps; captures and locks in the period. 1061 * steps; captures and locks in the period.
885 * 1062 *
886 * To program the period, the SOFT_ACK_MODE must be off. 1063 * To program the period, the SOFT_ACK_MODE must be off.
887 */ 1064 */
888 mmr_image &= ~((unsigned long)1 << 1065 mmr_image &= ~(1L << SOFTACK_MSHIFT);
889 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); 1066 write_mmr_misc_control(pnode, mmr_image);
890 uv_write_global_mmr64
891 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892 /* 1067 /*
893 * Set the 4-bit period. 1068 * Set the 4-bit period.
894 */ 1069 */
895 mmr_image &= ~((unsigned long)0xf << 1070 mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
896 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT); 1071 mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
897 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD << 1072 write_mmr_misc_control(pnode, mmr_image);
898 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
899 uv_write_global_mmr64
900 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901 /* 1073 /*
1074 * UV1:
902 * Subsequent reversals of the timebase bit (3) cause an 1075 * Subsequent reversals of the timebase bit (3) cause an
903 * immediate timeout of one or all INTD resources as 1076 * immediate timeout of one or all INTD resources as
904 * indicated in bits 2:0 (7 causes all of them to timeout). 1077 * indicated in bits 2:0 (7 causes all of them to timeout).
905 */ 1078 */
906 mmr_image |= ((unsigned long)1 << 1079 mmr_image |= (1L << SOFTACK_MSHIFT);
907 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT); 1080 if (is_uv2_hub()) {
908 uv_write_global_mmr64 1081 mmr_image |= (1L << UV2_LEG_SHFT);
909 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image); 1082 mmr_image |= (1L << UV2_EXT_SHFT);
1083 }
1084 write_mmr_misc_control(pnode, mmr_image);
910 } 1085 }
911} 1086}
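
enable_timeouts() programs the soft-ack timeout in three register writes: clear the SOFT_ACK_MODE bit so the period field can be changed, write the 4-bit period, then set the mode bit again (plus the UV2 legacy and extended bits on UV2 hubs). A sketch of the mask arithmetic on a plain variable; the shift positions and period value are illustrative placeholders, not the real uv_mmrs.h constants, and the real code writes the MMR after each step rather than once at the end:

#include <stdio.h>

#define SOFTACK_MSHIFT 15              /* placeholder: ENABLE_INTD_SOFT_ACK_MODE */
#define SOFTACK_PSHIFT 16              /* placeholder: INTD_SOFT_ACK_TIMEOUT_PERIOD */
#define SOFTACK_TIMEOUT_PERIOD 0xaUL   /* assumed 4-bit period value */

static unsigned long program_softack(unsigned long mmr_image)
{
	/* step 1: mode off so the period field can be written */
	mmr_image &= ~(1UL << SOFTACK_MSHIFT);
	/* step 2: replace the 4-bit timeout period */
	mmr_image &= ~(0xfUL << SOFTACK_PSHIFT);
	mmr_image |= SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT;
	/* step 3: mode back on, locking the period in */
	mmr_image |= 1UL << SOFTACK_MSHIFT;
	return mmr_image;
}

int main(void)
{
	printf("misc_control becomes 0x%lx\n", program_softack(0));
	return 0;
}
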
912 1087
913static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset) 1088static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
914{ 1089{
915 if (*offset < num_possible_cpus()) 1090 if (*offset < num_possible_cpus())
916 return offset; 1091 return offset;
917 return NULL; 1092 return NULL;
918} 1093}
919 1094
920static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset) 1095static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
921{ 1096{
922 (*offset)++; 1097 (*offset)++;
923 if (*offset < num_possible_cpus()) 1098 if (*offset < num_possible_cpus())
@@ -925,12 +1100,11 @@ static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
925 return NULL; 1100 return NULL;
926} 1101}
927 1102
928static void uv_ptc_seq_stop(struct seq_file *file, void *data) 1103static void ptc_seq_stop(struct seq_file *file, void *data)
929{ 1104{
930} 1105}
931 1106
932static inline unsigned long long 1107static inline unsigned long long usec_2_cycles(unsigned long microsec)
933microsec_2_cycles(unsigned long microsec)
934{ 1108{
935 unsigned long ns; 1109 unsigned long ns;
936 unsigned long long cyc; 1110 unsigned long long cyc;
@@ -941,29 +1115,27 @@ microsec_2_cycles(unsigned long microsec)
941} 1115}
942 1116
943/* 1117/*
944 * Display the statistics thru /proc. 1118 * Display the statistics thru /proc/sgi_uv/ptc_statistics
945 * 'data' points to the cpu number 1119 * 'data' points to the cpu number
1120 * Note: see the descriptions in stat_description[].
946 */ 1121 */
947static int uv_ptc_seq_show(struct seq_file *file, void *data) 1122static int ptc_seq_show(struct seq_file *file, void *data)
948{ 1123{
949 struct ptc_stats *stat; 1124 struct ptc_stats *stat;
950 int cpu; 1125 int cpu;
951 1126
952 cpu = *(loff_t *)data; 1127 cpu = *(loff_t *)data;
953
954 if (!cpu) { 1128 if (!cpu) {
955 seq_printf(file, 1129 seq_printf(file,
956 "# cpu sent stime self locals remotes ncpus localhub "); 1130 "# cpu sent stime self locals remotes ncpus localhub ");
957 seq_printf(file, 1131 seq_printf(file,
958 "remotehub numuvhubs numuvhubs16 numuvhubs8 "); 1132 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
959 seq_printf(file, 1133 seq_printf(file,
960 "numuvhubs4 numuvhubs2 numuvhubs1 dto "); 1134 "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
961 seq_printf(file,
962 "retries rok resetp resett giveup sto bz throt ");
963 seq_printf(file, 1135 seq_printf(file,
964 "sw_ack recv rtime all "); 1136 "resetp resett giveup sto bz throt swack recv rtime ");
965 seq_printf(file, 1137 seq_printf(file,
966 "one mult none retry canc nocan reset rcan "); 1138 "all one mult none retry canc nocan reset rcan ");
967 seq_printf(file, 1139 seq_printf(file,
968 "disable enable\n"); 1140 "disable enable\n");
969 } 1141 }
@@ -990,8 +1162,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
990 /* destination side statistics */ 1162 /* destination side statistics */
991 seq_printf(file, 1163 seq_printf(file,
992 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", 1164 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
993 uv_read_global_mmr64(uv_cpu_to_pnode(cpu), 1165 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
994 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
995 stat->d_requestee, cycles_2_us(stat->d_time), 1166 stat->d_requestee, cycles_2_us(stat->d_time),
996 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg, 1167 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
997 stat->d_nomsg, stat->d_retries, stat->d_canceled, 1168 stat->d_nomsg, stat->d_retries, stat->d_canceled,
@@ -1000,7 +1171,6 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
1000 seq_printf(file, "%ld %ld\n", 1171 seq_printf(file, "%ld %ld\n",
1001 stat->s_bau_disabled, stat->s_bau_reenabled); 1172 stat->s_bau_disabled, stat->s_bau_reenabled);
1002 } 1173 }
1003
1004 return 0; 1174 return 0;
1005} 1175}
1006 1176
@@ -1008,18 +1178,18 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data)
1008 * Display the tunables thru debugfs 1178 * Display the tunables thru debugfs
1009 */ 1179 */
1010static ssize_t tunables_read(struct file *file, char __user *userbuf, 1180static ssize_t tunables_read(struct file *file, char __user *userbuf,
1011 size_t count, loff_t *ppos) 1181 size_t count, loff_t *ppos)
1012{ 1182{
1013 char *buf; 1183 char *buf;
1014 int ret; 1184 int ret;
1015 1185
1016 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", 1186 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1017 "max_bau_concurrent plugged_delay plugsb4reset", 1187 "max_concur plugged_delay plugsb4reset",
1018 "timeoutsb4reset ipi_reset_limit complete_threshold", 1188 "timeoutsb4reset ipi_reset_limit complete_threshold",
1019 "congested_response_us congested_reps congested_period", 1189 "congested_response_us congested_reps congested_period",
1020 max_bau_concurrent, plugged_delay, plugsb4reset, 1190 max_concurr, plugged_delay, plugsb4reset,
1021 timeoutsb4reset, ipi_reset_limit, complete_threshold, 1191 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1022 congested_response_us, congested_reps, congested_period); 1192 congested_respns_us, congested_reps, congested_period);
1023 1193
1024 if (!buf) 1194 if (!buf)
1025 return -ENOMEM; 1195 return -ENOMEM;
@@ -1030,13 +1200,16 @@ static ssize_t tunables_read(struct file *file, char __user *userbuf,
1030} 1200}
1031 1201
1032/* 1202/*
1033 * -1: resetf the statistics 1203 * handle a write to /proc/sgi_uv/ptc_statistics
1204 * -1: reset the statistics
1034 * 0: display meaning of the statistics 1205 * 0: display meaning of the statistics
1035 */ 1206 */
1036static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, 1207static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1037 size_t count, loff_t *data) 1208 size_t count, loff_t *data)
1038{ 1209{
1039 int cpu; 1210 int cpu;
1211 int i;
1212 int elements;
1040 long input_arg; 1213 long input_arg;
1041 char optstr[64]; 1214 char optstr[64];
1042 struct ptc_stats *stat; 1215 struct ptc_stats *stat;
@@ -1046,79 +1219,18 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1046 if (copy_from_user(optstr, user, count)) 1219 if (copy_from_user(optstr, user, count))
1047 return -EFAULT; 1220 return -EFAULT;
1048 optstr[count - 1] = '\0'; 1221 optstr[count - 1] = '\0';
1222
1049 if (strict_strtol(optstr, 10, &input_arg) < 0) { 1223 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1050 printk(KERN_DEBUG "%s is invalid\n", optstr); 1224 printk(KERN_DEBUG "%s is invalid\n", optstr);
1051 return -EINVAL; 1225 return -EINVAL;
1052 } 1226 }
1053 1227
1054 if (input_arg == 0) { 1228 if (input_arg == 0) {
1229 elements = sizeof(stat_description)/sizeof(*stat_description);
1055 printk(KERN_DEBUG "# cpu: cpu number\n"); 1230 printk(KERN_DEBUG "# cpu: cpu number\n");
1056 printk(KERN_DEBUG "Sender statistics:\n"); 1231 printk(KERN_DEBUG "Sender statistics:\n");
1057 printk(KERN_DEBUG 1232 for (i = 0; i < elements; i++)
1058 "sent: number of shootdown messages sent\n"); 1233 printk(KERN_DEBUG "%s\n", stat_description[i]);
1059 printk(KERN_DEBUG
1060 "stime: time spent sending messages\n");
1061 printk(KERN_DEBUG
1062 "numuvhubs: number of hubs targeted with shootdown\n");
1063 printk(KERN_DEBUG
1064 "numuvhubs16: number times 16 or more hubs targeted\n");
1065 printk(KERN_DEBUG
1066 "numuvhubs8: number times 8 or more hubs targeted\n");
1067 printk(KERN_DEBUG
1068 "numuvhubs4: number times 4 or more hubs targeted\n");
1069 printk(KERN_DEBUG
1070 "numuvhubs2: number times 2 or more hubs targeted\n");
1071 printk(KERN_DEBUG
1072 "numuvhubs1: number times 1 hub targeted\n");
1073 printk(KERN_DEBUG
1074 "numcpus: number of cpus targeted with shootdown\n");
1075 printk(KERN_DEBUG
1076 "dto: number of destination timeouts\n");
1077 printk(KERN_DEBUG
1078 "retries: destination timeout retries sent\n");
1079 printk(KERN_DEBUG
1080 "rok: : destination timeouts successfully retried\n");
1081 printk(KERN_DEBUG
1082 "resetp: ipi-style resource resets for plugs\n");
1083 printk(KERN_DEBUG
1084 "resett: ipi-style resource resets for timeouts\n");
1085 printk(KERN_DEBUG
1086 "giveup: fall-backs to ipi-style shootdowns\n");
1087 printk(KERN_DEBUG
1088 "sto: number of source timeouts\n");
1089 printk(KERN_DEBUG
1090 "bz: number of stay-busy's\n");
1091 printk(KERN_DEBUG
1092 "throt: number times spun in throttle\n");
1093 printk(KERN_DEBUG "Destination side statistics:\n");
1094 printk(KERN_DEBUG
1095 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1096 printk(KERN_DEBUG
1097 "recv: shootdown messages received\n");
1098 printk(KERN_DEBUG
1099 "rtime: time spent processing messages\n");
1100 printk(KERN_DEBUG
1101 "all: shootdown all-tlb messages\n");
1102 printk(KERN_DEBUG
1103 "one: shootdown one-tlb messages\n");
1104 printk(KERN_DEBUG
1105 "mult: interrupts that found multiple messages\n");
1106 printk(KERN_DEBUG
1107 "none: interrupts that found no messages\n");
1108 printk(KERN_DEBUG
1109 "retry: number of retry messages processed\n");
1110 printk(KERN_DEBUG
1111 "canc: number messages canceled by retries\n");
1112 printk(KERN_DEBUG
1113 "nocan: number retries that found nothing to cancel\n");
1114 printk(KERN_DEBUG
1115 "reset: number of ipi-style reset requests processed\n");
1116 printk(KERN_DEBUG
1117 "rcan: number messages canceled by reset requests\n");
1118 printk(KERN_DEBUG
1119 "disable: number times use of the BAU was disabled\n");
1120 printk(KERN_DEBUG
1121 "enable: number times use of the BAU was re-enabled\n");
1122 } else if (input_arg == -1) { 1234 } else if (input_arg == -1) {
1123 for_each_present_cpu(cpu) { 1235 for_each_present_cpu(cpu) {
1124 stat = &per_cpu(ptcstats, cpu); 1236 stat = &per_cpu(ptcstats, cpu);
@@ -1145,27 +1257,18 @@ static int local_atoi(const char *name)
1145} 1257}
1146 1258
1147/* 1259/*
1148 * set the tunables 1260 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
1149 * 0 values reset them to defaults 1261 * Zero values reset them to defaults.
1150 */ 1262 */
1151static ssize_t tunables_write(struct file *file, const char __user *user, 1263static int parse_tunables_write(struct bau_control *bcp, char *instr,
1152 size_t count, loff_t *data) 1264 int count)
1153{ 1265{
1154 int cpu;
1155 int cnt = 0;
1156 int val;
1157 char *p; 1266 char *p;
1158 char *q; 1267 char *q;
1159 char instr[64]; 1268 int cnt = 0;
1160 struct bau_control *bcp; 1269 int val;
1161 1270 int e = sizeof(tunables) / sizeof(*tunables);
1162 if (count == 0 || count > sizeof(instr)-1)
1163 return -EINVAL;
1164 if (copy_from_user(instr, user, count))
1165 return -EFAULT;
1166 1271
1167 instr[count] = '\0';
1168 /* count the fields */
1169 p = instr + strspn(instr, WHITESPACE); 1272 p = instr + strspn(instr, WHITESPACE);
1170 q = p; 1273 q = p;
1171 for (; *p; p = q + strspn(q, WHITESPACE)) { 1274 for (; *p; p = q + strspn(q, WHITESPACE)) {
@@ -1174,8 +1277,8 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1174 if (q == p) 1277 if (q == p)
1175 break; 1278 break;
1176 } 1279 }
1177 if (cnt != 9) { 1280 if (cnt != e) {
1178 printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); 1281 printk(KERN_INFO "bau tunable error: should be %d values\n", e);
1179 return -EINVAL; 1282 return -EINVAL;
1180 } 1283 }
1181 1284
@@ -1187,97 +1290,80 @@ static ssize_t tunables_write(struct file *file, const char __user *user,
1187 switch (cnt) { 1290 switch (cnt) {
1188 case 0: 1291 case 0:
1189 if (val == 0) { 1292 if (val == 0) {
1190 max_bau_concurrent = MAX_BAU_CONCURRENT; 1293 max_concurr = MAX_BAU_CONCURRENT;
1191 max_bau_concurrent_constant = 1294 max_concurr_const = MAX_BAU_CONCURRENT;
1192 MAX_BAU_CONCURRENT;
1193 continue; 1295 continue;
1194 } 1296 }
1195 bcp = &per_cpu(bau_control, smp_processor_id());
1196 if (val < 1 || val > bcp->cpus_in_uvhub) { 1297 if (val < 1 || val > bcp->cpus_in_uvhub) {
1197 printk(KERN_DEBUG 1298 printk(KERN_DEBUG
1198 "Error: BAU max concurrent %d is invalid\n", 1299 "Error: BAU max concurrent %d is invalid\n",
1199 val); 1300 val);
1200 return -EINVAL; 1301 return -EINVAL;
1201 } 1302 }
1202 max_bau_concurrent = val; 1303 max_concurr = val;
1203 max_bau_concurrent_constant = val; 1304 max_concurr_const = val;
1204 continue;
1205 case 1:
1206 if (val == 0)
1207 plugged_delay = PLUGGED_DELAY;
1208 else
1209 plugged_delay = val;
1210 continue;
1211 case 2:
1212 if (val == 0)
1213 plugsb4reset = PLUGSB4RESET;
1214 else
1215 plugsb4reset = val;
1216 continue;
1217 case 3:
1218 if (val == 0)
1219 timeoutsb4reset = TIMEOUTSB4RESET;
1220 else
1221 timeoutsb4reset = val;
1222 continue;
1223 case 4:
1224 if (val == 0)
1225 ipi_reset_limit = IPI_RESET_LIMIT;
1226 else
1227 ipi_reset_limit = val;
1228 continue;
1229 case 5:
1230 if (val == 0)
1231 complete_threshold = COMPLETE_THRESHOLD;
1232 else
1233 complete_threshold = val;
1234 continue;
1235 case 6:
1236 if (val == 0)
1237 congested_response_us = CONGESTED_RESPONSE_US;
1238 else
1239 congested_response_us = val;
1240 continue;
1241 case 7:
1242 if (val == 0)
1243 congested_reps = CONGESTED_REPS;
1244 else
1245 congested_reps = val;
1246 continue; 1305 continue;
1247 case 8: 1306 default:
1248 if (val == 0) 1307 if (val == 0)
1249 congested_period = CONGESTED_PERIOD; 1308 *tunables[cnt].tunp = tunables[cnt].deflt;
1250 else 1309 else
1251 congested_period = val; 1310 *tunables[cnt].tunp = val;
1252 continue; 1311 continue;
1253 } 1312 }
1254 if (q == p) 1313 if (q == p)
1255 break; 1314 break;
1256 } 1315 }
1316 return 0;
1317}
1318
1319/*
1320 * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
1321 */
1322static ssize_t tunables_write(struct file *file, const char __user *user,
1323 size_t count, loff_t *data)
1324{
1325 int cpu;
1326 int ret;
1327 char instr[100];
1328 struct bau_control *bcp;
1329
1330 if (count == 0 || count > sizeof(instr)-1)
1331 return -EINVAL;
1332 if (copy_from_user(instr, user, count))
1333 return -EFAULT;
1334
1335 instr[count] = '\0';
1336
1337 bcp = &per_cpu(bau_control, smp_processor_id());
1338
1339 ret = parse_tunables_write(bcp, instr, count);
1340 if (ret)
1341 return ret;
1342
1257 for_each_present_cpu(cpu) { 1343 for_each_present_cpu(cpu) {
1258 bcp = &per_cpu(bau_control, cpu); 1344 bcp = &per_cpu(bau_control, cpu);
1259 bcp->max_bau_concurrent = max_bau_concurrent; 1345 bcp->max_concurr = max_concurr;
1260 bcp->max_bau_concurrent_constant = max_bau_concurrent; 1346 bcp->max_concurr_const = max_concurr;
1261 bcp->plugged_delay = plugged_delay; 1347 bcp->plugged_delay = plugged_delay;
1262 bcp->plugsb4reset = plugsb4reset; 1348 bcp->plugsb4reset = plugsb4reset;
1263 bcp->timeoutsb4reset = timeoutsb4reset; 1349 bcp->timeoutsb4reset = timeoutsb4reset;
1264 bcp->ipi_reset_limit = ipi_reset_limit; 1350 bcp->ipi_reset_limit = ipi_reset_limit;
1265 bcp->complete_threshold = complete_threshold; 1351 bcp->complete_threshold = complete_threshold;
1266 bcp->congested_response_us = congested_response_us; 1352 bcp->cong_response_us = congested_respns_us;
1267 bcp->congested_reps = congested_reps; 1353 bcp->cong_reps = congested_reps;
1268 bcp->congested_period = congested_period; 1354 bcp->cong_period = congested_period;
1269 } 1355 }
1270 return count; 1356 return count;
1271} 1357}
1272 1358
1273static const struct seq_operations uv_ptc_seq_ops = { 1359static const struct seq_operations uv_ptc_seq_ops = {
1274 .start = uv_ptc_seq_start, 1360 .start = ptc_seq_start,
1275 .next = uv_ptc_seq_next, 1361 .next = ptc_seq_next,
1276 .stop = uv_ptc_seq_stop, 1362 .stop = ptc_seq_stop,
1277 .show = uv_ptc_seq_show 1363 .show = ptc_seq_show
1278}; 1364};
1279 1365
1280static int uv_ptc_proc_open(struct inode *inode, struct file *file) 1366static int ptc_proc_open(struct inode *inode, struct file *file)
1281{ 1367{
1282 return seq_open(file, &uv_ptc_seq_ops); 1368 return seq_open(file, &uv_ptc_seq_ops);
1283} 1369}
@@ -1288,9 +1374,9 @@ static int tunables_open(struct inode *inode, struct file *file)
1288} 1374}
1289 1375
1290static const struct file_operations proc_uv_ptc_operations = { 1376static const struct file_operations proc_uv_ptc_operations = {
1291 .open = uv_ptc_proc_open, 1377 .open = ptc_proc_open,
1292 .read = seq_read, 1378 .read = seq_read,
1293 .write = uv_ptc_proc_write, 1379 .write = ptc_proc_write,
1294 .llseek = seq_lseek, 1380 .llseek = seq_lseek,
1295 .release = seq_release, 1381 .release = seq_release,
1296}; 1382};
@@ -1324,7 +1410,7 @@ static int __init uv_ptc_init(void)
1324 return -EINVAL; 1410 return -EINVAL;
1325 } 1411 }
1326 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, 1412 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1327 tunables_dir, NULL, &tunables_fops); 1413 tunables_dir, NULL, &tunables_fops);
1328 if (!tunables_file) { 1414 if (!tunables_file) {
1329 printk(KERN_ERR "unable to create debugfs file %s\n", 1415 printk(KERN_ERR "unable to create debugfs file %s\n",
1330 UV_BAU_TUNABLES_FILE); 1416 UV_BAU_TUNABLES_FILE);
@@ -1336,24 +1422,24 @@ static int __init uv_ptc_init(void)
1336/* 1422/*
1337 * Initialize the sending side's sending buffers. 1423 * Initialize the sending side's sending buffers.
1338 */ 1424 */
1339static void 1425static void activation_descriptor_init(int node, int pnode, int base_pnode)
1340uv_activation_descriptor_init(int node, int pnode, int base_pnode)
1341{ 1426{
1342 int i; 1427 int i;
1343 int cpu; 1428 int cpu;
1344 unsigned long pa; 1429 unsigned long pa;
1345 unsigned long m; 1430 unsigned long m;
1346 unsigned long n; 1431 unsigned long n;
1432 size_t dsize;
1347 struct bau_desc *bau_desc; 1433 struct bau_desc *bau_desc;
1348 struct bau_desc *bd2; 1434 struct bau_desc *bd2;
1349 struct bau_control *bcp; 1435 struct bau_control *bcp;
1350 1436
1351 /* 1437 /*
1352 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR) 1438 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
1353 * per cpu; and one per cpu on the uvhub (UV_ADP_SIZE) 1439 * per cpu; and one per cpu on the uvhub (ADP_SZ)
1354 */ 1440 */
1355 bau_desc = kmalloc_node(sizeof(struct bau_desc) * UV_ADP_SIZE 1441 dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
1356 * UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node); 1442 bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1357 BUG_ON(!bau_desc); 1443 BUG_ON(!bau_desc);
1358 1444
1359 pa = uv_gpa(bau_desc); /* need the real nasid*/ 1445 pa = uv_gpa(bau_desc); /* need the real nasid*/
@@ -1361,27 +1447,25 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
1361 m = pa & uv_mmask; 1447 m = pa & uv_mmask;
1362 1448
1363 /* the 14-bit pnode */ 1449 /* the 14-bit pnode */
1364 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, 1450 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
1365 (n << UV_DESC_BASE_PNODE_SHIFT | m));
1366 /* 1451 /*
1367 * Initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each 1452 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
1368 * cpu even though we only use the first one; one descriptor can 1453 * cpu even though we only use the first one; one descriptor can
1369 * describe a broadcast to 256 uv hubs. 1454 * describe a broadcast to 256 uv hubs.
1370 */ 1455 */
1371 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR); 1456 for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1372 i++, bd2++) {
1373 memset(bd2, 0, sizeof(struct bau_desc)); 1457 memset(bd2, 0, sizeof(struct bau_desc));
1374 bd2->header.sw_ack_flag = 1; 1458 bd2->header.swack_flag = 1;
1375 /* 1459 /*
1376 * The base_dest_nasid set in the message header is the nasid 1460 * The base_dest_nasid set in the message header is the nasid
1377 * of the first uvhub in the partition. The bit map will 1461 * of the first uvhub in the partition. The bit map will
1378 * indicate destination pnode numbers relative to that base. 1462 * indicate destination pnode numbers relative to that base.
1379 * They may not be consecutive if nasid striding is being used. 1463 * They may not be consecutive if nasid striding is being used.
1380 */ 1464 */
1381 bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode); 1465 bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
1382 bd2->header.dest_subnodeid = UV_LB_SUBNODEID; 1466 bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
1383 bd2->header.command = UV_NET_ENDPOINT_INTD; 1467 bd2->header.command = UV_NET_ENDPOINT_INTD;
1384 bd2->header.int_both = 1; 1468 bd2->header.int_both = 1;
1385 /* 1469 /*
1386 * all others need to be set to zero: 1470 * all others need to be set to zero:
1387 * fairness chaining multilevel count replied_to 1471 * fairness chaining multilevel count replied_to
@@ -1401,57 +1485,55 @@ uv_activation_descriptor_init(int node, int pnode, int base_pnode)
1401 * - node is first node (kernel memory notion) on the uvhub 1485 * - node is first node (kernel memory notion) on the uvhub
1402 * - pnode is the uvhub's physical identifier 1486 * - pnode is the uvhub's physical identifier
1403 */ 1487 */
1404static void 1488static void pq_init(int node, int pnode)
1405uv_payload_queue_init(int node, int pnode)
1406{ 1489{
1407 int pn;
1408 int cpu; 1490 int cpu;
1491 size_t plsize;
1409 char *cp; 1492 char *cp;
1410 unsigned long pa; 1493 void *vp;
1411 struct bau_payload_queue_entry *pqp; 1494 unsigned long pn;
1412 struct bau_payload_queue_entry *pqp_malloc; 1495 unsigned long first;
1496 unsigned long pn_first;
1497 unsigned long last;
1498 struct bau_pq_entry *pqp;
1413 struct bau_control *bcp; 1499 struct bau_control *bcp;
1414 1500
1415 pqp = kmalloc_node((DEST_Q_SIZE + 1) 1501 plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
1416 * sizeof(struct bau_payload_queue_entry), 1502 vp = kmalloc_node(plsize, GFP_KERNEL, node);
1417 GFP_KERNEL, node); 1503 pqp = (struct bau_pq_entry *)vp;
1418 BUG_ON(!pqp); 1504 BUG_ON(!pqp);
1419 pqp_malloc = pqp;
1420 1505
1421 cp = (char *)pqp + 31; 1506 cp = (char *)pqp + 31;
1422 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5); 1507 pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
1423 1508
1424 for_each_present_cpu(cpu) { 1509 for_each_present_cpu(cpu) {
1425 if (pnode != uv_cpu_to_pnode(cpu)) 1510 if (pnode != uv_cpu_to_pnode(cpu))
1426 continue; 1511 continue;
1427 /* for every cpu on this pnode: */ 1512 /* for every cpu on this pnode: */
1428 bcp = &per_cpu(bau_control, cpu); 1513 bcp = &per_cpu(bau_control, cpu);
1429 bcp->va_queue_first = pqp; 1514 bcp->queue_first = pqp;
1430 bcp->bau_msg_head = pqp; 1515 bcp->bau_msg_head = pqp;
1431 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1); 1516 bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
1432 } 1517 }
1433 /* 1518 /*
1434 * need the pnode of where the memory was really allocated 1519 * need the pnode of where the memory was really allocated
1435 */ 1520 */
1436 pa = uv_gpa(pqp); 1521 pn = uv_gpa(pqp) >> uv_nshift;
1437 pn = pa >> uv_nshift; 1522 first = uv_physnodeaddr(pqp);
1438 uv_write_global_mmr64(pnode, 1523 pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
1439 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, 1524 last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
1440 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | 1525 write_mmr_payload_first(pnode, pn_first);
1441 uv_physnodeaddr(pqp)); 1526 write_mmr_payload_tail(pnode, first);
1442 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, 1527 write_mmr_payload_last(pnode, last);
1443 uv_physnodeaddr(pqp)); 1528
1444 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1445 (unsigned long)
1446 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1447 /* in effect, all msg_type's are set to MSG_NOOP */ 1529 /* in effect, all msg_type's are set to MSG_NOOP */
1448 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE); 1530 memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
1449} 1531}
1450 1532
1451/* 1533/*
1452 * Initialization of each UV hub's structures 1534 * Initialization of each UV hub's structures
1453 */ 1535 */
1454static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode) 1536static void __init init_uvhub(int uvhub, int vector, int base_pnode)
1455{ 1537{
1456 int node; 1538 int node;
1457 int pnode; 1539 int pnode;
@@ -1459,24 +1541,24 @@ static void __init uv_init_uvhub(int uvhub, int vector, int base_pnode)
1459 1541
1460 node = uvhub_to_first_node(uvhub); 1542 node = uvhub_to_first_node(uvhub);
1461 pnode = uv_blade_to_pnode(uvhub); 1543 pnode = uv_blade_to_pnode(uvhub);
1462 uv_activation_descriptor_init(node, pnode, base_pnode); 1544
1463 uv_payload_queue_init(node, pnode); 1545 activation_descriptor_init(node, pnode, base_pnode);
1546
1547 pq_init(node, pnode);
1464 /* 1548 /*
1465 * The below initialization can't be in firmware because the 1549 * The below initialization can't be in firmware because the
1466 * messaging IRQ will be determined by the OS. 1550 * messaging IRQ will be determined by the OS.
1467 */ 1551 */
1468 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits; 1552 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1469 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, 1553 write_mmr_data_config(pnode, ((apicid << 32) | vector));
1470 ((apicid << 32) | vector));
1471} 1554}
1472 1555
1473/* 1556/*
1474 * We will set BAU_MISC_CONTROL with a timeout period. 1557 * We will set BAU_MISC_CONTROL with a timeout period.
1475 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. 1558 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1476 * So the destination timeout period has be be calculated from them. 1559 * So the destination timeout period has to be calculated from them.
1477 */ 1560 */
1478static int 1561static int calculate_destination_timeout(void)
1479calculate_destination_timeout(void)
1480{ 1562{
1481 unsigned long mmr_image; 1563 unsigned long mmr_image;
1482 int mult1; 1564 int mult1;
@@ -1486,73 +1568,92 @@ calculate_destination_timeout(void)
1486 int ret; 1568 int ret;
1487 unsigned long ts_ns; 1569 unsigned long ts_ns;
1488 1570
1489 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; 1571 if (is_uv1_hub()) {
1490 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); 1572 mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1491 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; 1573 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1492 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); 1574 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1493 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; 1575 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1494 base = timeout_base_ns[index]; 1576 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1495 ts_ns = base * mult1 * mult2; 1577 base = timeout_base_ns[index];
1496 ret = ts_ns / 1000; 1578 ts_ns = base * mult1 * mult2;
1579 ret = ts_ns / 1000;
1580 } else {
1581 /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */
1582 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1583 mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
1584 if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1585 mult1 = 80;
1586 else
1587 mult1 = 10;
1588 base = mmr_image & UV2_ACK_MASK;
1589 ret = mult1 * base;
1590 }
1497 return ret; 1591 return ret;
1498} 1592}
1499 1593
1594static void __init init_per_cpu_tunables(void)
1595{
1596 int cpu;
1597 struct bau_control *bcp;
1598
1599 for_each_present_cpu(cpu) {
1600 bcp = &per_cpu(bau_control, cpu);
1601 bcp->baudisabled = 0;
1602 bcp->statp = &per_cpu(ptcstats, cpu);
1603 /* time interval to catch a hardware stay-busy bug */
1604 bcp->timeout_interval = usec_2_cycles(2*timeout_us);
1605 bcp->max_concurr = max_concurr;
1606 bcp->max_concurr_const = max_concurr;
1607 bcp->plugged_delay = plugged_delay;
1608 bcp->plugsb4reset = plugsb4reset;
1609 bcp->timeoutsb4reset = timeoutsb4reset;
1610 bcp->ipi_reset_limit = ipi_reset_limit;
1611 bcp->complete_threshold = complete_threshold;
1612 bcp->cong_response_us = congested_respns_us;
1613 bcp->cong_reps = congested_reps;
1614 bcp->cong_period = congested_period;
1615 }
1616}
1617
1500/* 1618/*
1501 * initialize the bau_control structure for each cpu 1619 * Scan all cpus to collect blade and socket summaries.
1502 */ 1620 */
1503static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode) 1621static int __init get_cpu_topology(int base_pnode,
1622 struct uvhub_desc *uvhub_descs,
1623 unsigned char *uvhub_mask)
1504{ 1624{
1505 int i;
1506 int cpu; 1625 int cpu;
1507 int tcpu;
1508 int pnode; 1626 int pnode;
1509 int uvhub; 1627 int uvhub;
1510 int have_hmaster; 1628 int socket;
1511 short socket = 0;
1512 unsigned short socket_mask;
1513 unsigned char *uvhub_mask;
1514 struct bau_control *bcp; 1629 struct bau_control *bcp;
1515 struct uvhub_desc *bdp; 1630 struct uvhub_desc *bdp;
1516 struct socket_desc *sdp; 1631 struct socket_desc *sdp;
1517 struct bau_control *hmaster = NULL;
1518 struct bau_control *smaster = NULL;
1519 struct socket_desc {
1520 short num_cpus;
1521 short cpu_number[MAX_CPUS_PER_SOCKET];
1522 };
1523 struct uvhub_desc {
1524 unsigned short socket_mask;
1525 short num_cpus;
1526 short uvhub;
1527 short pnode;
1528 struct socket_desc socket[2];
1529 };
1530 struct uvhub_desc *uvhub_descs;
1531
1532 timeout_us = calculate_destination_timeout();
1533 1632
1534 uvhub_descs = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1535 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1536 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1537 for_each_present_cpu(cpu) { 1633 for_each_present_cpu(cpu) {
1538 bcp = &per_cpu(bau_control, cpu); 1634 bcp = &per_cpu(bau_control, cpu);
1635
1539 memset(bcp, 0, sizeof(struct bau_control)); 1636 memset(bcp, 0, sizeof(struct bau_control));
1637
1540 pnode = uv_cpu_hub_info(cpu)->pnode; 1638 pnode = uv_cpu_hub_info(cpu)->pnode;
1541 if ((pnode - base_part_pnode) >= UV_DISTRIBUTION_SIZE) { 1639 if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
1542 printk(KERN_EMERG 1640 printk(KERN_EMERG
1543 "cpu %d pnode %d-%d beyond %d; BAU disabled\n", 1641 "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
1544 cpu, pnode, base_part_pnode, 1642 cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
1545 UV_DISTRIBUTION_SIZE);
1546 return 1; 1643 return 1;
1547 } 1644 }
1645
1548 bcp->osnode = cpu_to_node(cpu); 1646 bcp->osnode = cpu_to_node(cpu);
1549 bcp->partition_base_pnode = uv_partition_base_pnode; 1647 bcp->partition_base_pnode = base_pnode;
1648
1550 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; 1649 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1551 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8)); 1650 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1552 bdp = &uvhub_descs[uvhub]; 1651 bdp = &uvhub_descs[uvhub];
1652
1553 bdp->num_cpus++; 1653 bdp->num_cpus++;
1554 bdp->uvhub = uvhub; 1654 bdp->uvhub = uvhub;
1555 bdp->pnode = pnode; 1655 bdp->pnode = pnode;
1656
1556 /* kludge: 'assuming' one node per socket, and assuming that 1657 /* kludge: 'assuming' one node per socket, and assuming that
1557 disabling a socket just leaves a gap in node numbers */ 1658 disabling a socket just leaves a gap in node numbers */
1558 socket = bcp->osnode & 1; 1659 socket = bcp->osnode & 1;
@@ -1561,84 +1662,129 @@ static int __init uv_init_per_cpu(int nuvhubs, int base_part_pnode)
1561 sdp->cpu_number[sdp->num_cpus] = cpu; 1662 sdp->cpu_number[sdp->num_cpus] = cpu;
1562 sdp->num_cpus++; 1663 sdp->num_cpus++;
1563 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) { 1664 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1564 printk(KERN_EMERG "%d cpus per socket invalid\n", sdp->num_cpus); 1665 printk(KERN_EMERG "%d cpus per socket invalid\n",
1666 sdp->num_cpus);
1565 return 1; 1667 return 1;
1566 } 1668 }
1567 } 1669 }
1670 return 0;
1671}
1672
1673/*
1674 * Each socket is to get a local array of pnodes/hubs.
1675 */
1676static void make_per_cpu_thp(struct bau_control *smaster)
1677{
1678 int cpu;
1679 size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
1680
1681 smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
1682 memset(smaster->thp, 0, hpsz);
1683 for_each_present_cpu(cpu) {
1684 smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
1685 smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1686 }
1687}
1688
1689/*
1690 * Initialize all the per_cpu information for the cpu's on a given socket,
1691 * given what has been gathered into the socket_desc struct.
1692 * And reports the chosen hub and socket masters back to the caller.
1693 */
1694static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1695 struct bau_control **smasterp,
1696 struct bau_control **hmasterp)
1697{
1698 int i;
1699 int cpu;
1700 struct bau_control *bcp;
1701
1702 for (i = 0; i < sdp->num_cpus; i++) {
1703 cpu = sdp->cpu_number[i];
1704 bcp = &per_cpu(bau_control, cpu);
1705 bcp->cpu = cpu;
1706 if (i == 0) {
1707 *smasterp = bcp;
1708 if (!(*hmasterp))
1709 *hmasterp = bcp;
1710 }
1711 bcp->cpus_in_uvhub = bdp->num_cpus;
1712 bcp->cpus_in_socket = sdp->num_cpus;
1713 bcp->socket_master = *smasterp;
1714 bcp->uvhub = bdp->uvhub;
1715 bcp->uvhub_master = *hmasterp;
1716 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1717 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1718 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1719 bcp->uvhub_cpu);
1720 return 1;
1721 }
1722 }
1723 return 0;
1724}
1725
1726/*
1727 * Summarize the blade and socket topology into the per_cpu structures.
1728 */
1729static int __init summarize_uvhub_sockets(int nuvhubs,
1730 struct uvhub_desc *uvhub_descs,
1731 unsigned char *uvhub_mask)
1732{
1733 int socket;
1734 int uvhub;
1735 unsigned short socket_mask;
1736
1568 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1737 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1738 struct uvhub_desc *bdp;
1739 struct bau_control *smaster = NULL;
1740 struct bau_control *hmaster = NULL;
1741
1569 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8)))) 1742 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1570 continue; 1743 continue;
1571 have_hmaster = 0; 1744
1572 bdp = &uvhub_descs[uvhub]; 1745 bdp = &uvhub_descs[uvhub];
1573 socket_mask = bdp->socket_mask; 1746 socket_mask = bdp->socket_mask;
1574 socket = 0; 1747 socket = 0;
1575 while (socket_mask) { 1748 while (socket_mask) {
1576 if (!(socket_mask & 1)) 1749 struct socket_desc *sdp;
1577 goto nextsocket; 1750 if ((socket_mask & 1)) {
1578 sdp = &bdp->socket[socket]; 1751 sdp = &bdp->socket[socket];
1579 for (i = 0; i < sdp->num_cpus; i++) { 1752 if (scan_sock(sdp, bdp, &smaster, &hmaster))
1580 cpu = sdp->cpu_number[i];
1581 bcp = &per_cpu(bau_control, cpu);
1582 bcp->cpu = cpu;
1583 if (i == 0) {
1584 smaster = bcp;
1585 if (!have_hmaster) {
1586 have_hmaster++;
1587 hmaster = bcp;
1588 }
1589 }
1590 bcp->cpus_in_uvhub = bdp->num_cpus;
1591 bcp->cpus_in_socket = sdp->num_cpus;
1592 bcp->socket_master = smaster;
1593 bcp->uvhub = bdp->uvhub;
1594 bcp->uvhub_master = hmaster;
1595 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1596 blade_processor_id;
1597 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1598 printk(KERN_EMERG
1599 "%d cpus per uvhub invalid\n",
1600 bcp->uvhub_cpu);
1601 return 1; 1753 return 1;
1602 }
1603 } 1754 }
1604nextsocket:
1605 socket++; 1755 socket++;
1606 socket_mask = (socket_mask >> 1); 1756 socket_mask = (socket_mask >> 1);
1607 /* each socket gets a local array of pnodes/hubs */ 1757 make_per_cpu_thp(smaster);
1608 bcp = smaster;
1609 bcp->target_hub_and_pnode = kmalloc_node(
1610 sizeof(struct hub_and_pnode) *
1611 num_possible_cpus(), GFP_KERNEL, bcp->osnode);
1612 memset(bcp->target_hub_and_pnode, 0,
1613 sizeof(struct hub_and_pnode) *
1614 num_possible_cpus());
1615 for_each_present_cpu(tcpu) {
1616 bcp->target_hub_and_pnode[tcpu].pnode =
1617 uv_cpu_hub_info(tcpu)->pnode;
1618 bcp->target_hub_and_pnode[tcpu].uvhub =
1619 uv_cpu_hub_info(tcpu)->numa_blade_id;
1620 }
1621 } 1758 }
1622 } 1759 }
1760 return 0;
1761}
1762
1763/*
1764 * initialize the bau_control structure for each cpu
1765 */
1766static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
1767{
1768 unsigned char *uvhub_mask;
1769 void *vp;
1770 struct uvhub_desc *uvhub_descs;
1771
1772 timeout_us = calculate_destination_timeout();
1773
1774 vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1775 uvhub_descs = (struct uvhub_desc *)vp;
1776 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1777 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1778
1779 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
1780 return 1;
1781
1782 if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
1783 return 1;
1784
1623 kfree(uvhub_descs); 1785 kfree(uvhub_descs);
1624 kfree(uvhub_mask); 1786 kfree(uvhub_mask);
1625 for_each_present_cpu(cpu) { 1787 init_per_cpu_tunables();
1626 bcp = &per_cpu(bau_control, cpu);
1627 bcp->baudisabled = 0;
1628 bcp->statp = &per_cpu(ptcstats, cpu);
1629 /* time interval to catch a hardware stay-busy bug */
1630 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1631 bcp->max_bau_concurrent = max_bau_concurrent;
1632 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1633 bcp->plugged_delay = plugged_delay;
1634 bcp->plugsb4reset = plugsb4reset;
1635 bcp->timeoutsb4reset = timeoutsb4reset;
1636 bcp->ipi_reset_limit = ipi_reset_limit;
1637 bcp->complete_threshold = complete_threshold;
1638 bcp->congested_response_us = congested_response_us;
1639 bcp->congested_reps = congested_reps;
1640 bcp->congested_period = congested_period;
1641 }
1642 return 0; 1788 return 0;
1643} 1789}
1644 1790
@@ -1651,8 +1797,9 @@ static int __init uv_bau_init(void)
1651 int pnode; 1797 int pnode;
1652 int nuvhubs; 1798 int nuvhubs;
1653 int cur_cpu; 1799 int cur_cpu;
1800 int cpus;
1654 int vector; 1801 int vector;
1655 unsigned long mmr; 1802 cpumask_var_t *mask;
1656 1803
1657 if (!is_uv_system()) 1804 if (!is_uv_system())
1658 return 0; 1805 return 0;
@@ -1660,24 +1807,25 @@ static int __init uv_bau_init(void)
1660 if (nobau) 1807 if (nobau)
1661 return 0; 1808 return 0;
1662 1809
1663 for_each_possible_cpu(cur_cpu) 1810 for_each_possible_cpu(cur_cpu) {
1664 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), 1811 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
1665 GFP_KERNEL, cpu_to_node(cur_cpu)); 1812 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
1813 }
1666 1814
1667 uv_nshift = uv_hub_info->m_val; 1815 uv_nshift = uv_hub_info->m_val;
1668 uv_mmask = (1UL << uv_hub_info->m_val) - 1; 1816 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1669 nuvhubs = uv_num_possible_blades(); 1817 nuvhubs = uv_num_possible_blades();
1670 spin_lock_init(&disable_lock); 1818 spin_lock_init(&disable_lock);
1671 congested_cycles = microsec_2_cycles(congested_response_us); 1819 congested_cycles = usec_2_cycles(congested_respns_us);
1672 1820
1673 uv_partition_base_pnode = 0x7fffffff; 1821 uv_base_pnode = 0x7fffffff;
1674 for (uvhub = 0; uvhub < nuvhubs; uvhub++) { 1822 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1675 if (uv_blade_nr_possible_cpus(uvhub) && 1823 cpus = uv_blade_nr_possible_cpus(uvhub);
1676 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode)) 1824 if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
1677 uv_partition_base_pnode = uv_blade_to_pnode(uvhub); 1825 uv_base_pnode = uv_blade_to_pnode(uvhub);
1678 } 1826 }
1679 1827
1680 if (uv_init_per_cpu(nuvhubs, uv_partition_base_pnode)) { 1828 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
1681 nobau = 1; 1829 nobau = 1;
1682 return 0; 1830 return 0;
1683 } 1831 }
@@ -1685,21 +1833,21 @@ static int __init uv_bau_init(void)
1685 vector = UV_BAU_MESSAGE; 1833 vector = UV_BAU_MESSAGE;
1686 for_each_possible_blade(uvhub) 1834 for_each_possible_blade(uvhub)
1687 if (uv_blade_nr_possible_cpus(uvhub)) 1835 if (uv_blade_nr_possible_cpus(uvhub))
1688 uv_init_uvhub(uvhub, vector, uv_partition_base_pnode); 1836 init_uvhub(uvhub, vector, uv_base_pnode);
1689 1837
1690 uv_enable_timeouts(); 1838 enable_timeouts();
1691 alloc_intr_gate(vector, uv_bau_message_intr1); 1839 alloc_intr_gate(vector, uv_bau_message_intr1);
1692 1840
1693 for_each_possible_blade(uvhub) { 1841 for_each_possible_blade(uvhub) {
1694 if (uv_blade_nr_possible_cpus(uvhub)) { 1842 if (uv_blade_nr_possible_cpus(uvhub)) {
1843 unsigned long val;
1844 unsigned long mmr;
1695 pnode = uv_blade_to_pnode(uvhub); 1845 pnode = uv_blade_to_pnode(uvhub);
1696 /* INIT the bau */ 1846 /* INIT the bau */
1697 uv_write_global_mmr64(pnode, 1847 val = 1L << 63;
1698 UVH_LB_BAU_SB_ACTIVATION_CONTROL, 1848 write_gmmr_activation(pnode, val);
1699 ((unsigned long)1 << 63));
1700 mmr = 1; /* should be 1 to broadcast to both sockets */ 1849 mmr = 1; /* should be 1 to broadcast to both sockets */
1701 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, 1850 write_mmr_data_broadcast(pnode, mmr);
1702 mmr);
1703 } 1851 }
1704 } 1852 }
1705 1853
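Editor's note on the tlb_uv.c hunks above: parse_tunables_write() replaces the nine-case switch with a walk over a tunables[] table, and derives the expected field count from sizeof(tunables)/sizeof(*tunables) instead of hard-coding 9. A minimal sketch of that table-driven reset-to-default pattern follows; the field names tunp/deflt match the diff, while the table contents and defaults shown here are illustrative only, not the kernel's.

	struct tunable {
		int *tunp;	/* pointer to the live tunable value */
		int deflt;	/* default restored when 0 is written */
	};

	static int plugged_delay;	/* stand-ins for the real globals */
	static int plugsb4reset;

	static struct tunable tunables[] = {
		{ &plugged_delay, 10 },	/* illustrative defaults */
		{ &plugsb4reset,   2 },
	};

	static void set_tunable(int idx, int val)
	{
		/* a written value of 0 means "reset to the compiled-in default" */
		*tunables[idx].tunp = val ? val : tunables[idx].deflt;
	}
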
diff --git a/arch/x86/platform/uv/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 9daf5d1af9f1..9f29a01ee1b3 100644
--- a/arch/x86/platform/uv/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
40 .rating = 400, 40 .rating = 400,
41 .read = uv_read_rtc, 41 .read = uv_read_rtc,
42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, 42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
43 .shift = 10,
44 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
45}; 44};
46 45
@@ -100,8 +99,12 @@ static void uv_rtc_send_IPI(int cpu)
100/* Check for an RTC interrupt pending */ 99/* Check for an RTC interrupt pending */
101static int uv_intr_pending(int pnode) 100static int uv_intr_pending(int pnode)
102{ 101{
103 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & 102 if (is_uv1_hub())
104 UVH_EVENT_OCCURRED0_RTC1_MASK; 103 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
104 UV1H_EVENT_OCCURRED0_RTC1_MASK;
105 else
106 return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
107 UV2H_EVENT_OCCURRED2_RTC_1_MASK;
105} 108}
106 109
107/* Setup interrupt and return non-zero if early expiration occurred. */ 110/* Setup interrupt and return non-zero if early expiration occurred. */
@@ -115,8 +118,12 @@ static int uv_setup_intr(int cpu, u64 expires)
115 UVH_RTC1_INT_CONFIG_M_MASK); 118 UVH_RTC1_INT_CONFIG_M_MASK);
116 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); 119 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
117 120
118 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 121 if (is_uv1_hub())
119 UVH_EVENT_OCCURRED0_RTC1_MASK); 122 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
123 UV1H_EVENT_OCCURRED0_RTC1_MASK);
124 else
125 uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
126 UV2H_EVENT_OCCURRED2_RTC_1_MASK);
120 127
121 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 128 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
122 ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 129 ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
@@ -372,14 +379,11 @@ static __init int uv_rtc_setup_clock(void)
372 if (!is_uv_system()) 379 if (!is_uv_system())
373 return -ENODEV; 380 return -ENODEV;
374 381
375 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
376 clocksource_uv.shift);
377
378 /* If single blade, prefer tsc */ 382 /* If single blade, prefer tsc */
379 if (uv_num_possible_blades() == 1) 383 if (uv_num_possible_blades() == 1)
380 clocksource_uv.rating = 250; 384 clocksource_uv.rating = 250;
381 385
382 rc = clocksource_register(&clocksource_uv); 386 rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
383 if (rc) 387 if (rc)
384 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); 388 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
385 else 389 else
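Editor's note on the uv_time.c hunk: the hand-rolled clocksource_hz2mult()/shift setup is dropped in favour of clocksource_register_hz(), which derives the scaling factors from sn_rtc_cycles_per_second. For reference, the arithmetic those factors feed is the usual cycles-to-nanoseconds scaling; this standalone sketch only illustrates that conversion, not the kernel API.

	#include <stdint.h>

	/* a clocksource turns a cycle delta into nanoseconds as
	 * ns = (delta * mult) >> shift; mult and shift are now chosen
	 * internally by clocksource_register_hz() from the clock rate */
	static uint64_t cycles_to_ns(uint64_t delta, uint32_t mult, uint32_t shift)
	{
		return (delta * mult) >> shift;
	}
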
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index b6552b189bcd..bef0bc962400 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
11 11
12 12
13# files to link into the vdso 13# files to link into the vdso
14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
15 15
16# files to link into kernel 16# files to link into kernel
17obj-$(VDSO64-y) += vma.o vdso.o 17obj-$(VDSO64-y) += vma.o vdso.o
@@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
37$(obj)/%.so: $(obj)/%.so.dbg FORCE 37$(obj)/%.so: $(obj)/%.so.dbg FORCE
38 $(call if_changed,objcopy) 38 $(call if_changed,objcopy)
39 39
40#
41# Don't omit frame pointers for ease of userspace debugging, but do
42# optimize sibling calls.
43#
40CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ 44CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
41 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) 45 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
46 -fno-omit-frame-pointer -foptimize-sibling-calls
42 47
43$(vobjs): KBUILD_CFLAGS += $(CFL) 48$(vobjs): KBUILD_CFLAGS += $(CFL)
44 49
50#
51# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
52#
53CFLAGS_REMOVE_vdso-note.o = -pg
54CFLAGS_REMOVE_vclock_gettime.o = -pg
55CFLAGS_REMOVE_vgetcpu.o = -pg
56CFLAGS_REMOVE_vvar.o = -pg
57
45targets += vdso-syms.lds 58targets += vdso-syms.lds
46obj-$(VDSO64-y) += vdso-syms.lds 59obj-$(VDSO64-y) += vdso-syms.lds
47 60
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c5..a724905fdae7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,7 +2,7 @@
2 * Copyright 2006 Andi Kleen, SUSE Labs. 2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2 3 * Subject to the GNU Public License, v.2
4 * 4 *
5 * Fast user context implementation of clock_gettime and gettimeofday. 5 * Fast user context implementation of clock_gettime, gettimeofday, and time.
6 * 6 *
7 * The code should have no internal unresolved relocations. 7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 8 * Check with readelf after changing.
@@ -22,9 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/io.h> 24#include <asm/io.h>
25#include "vextern.h"
26 25
27#define gtod vdso_vsyscall_gtod_data 26#define gtod (&VVAR(vsyscall_gtod_data))
28 27
29notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 28notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
30{ 29{
@@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
56 return 0; 55 return 0;
57} 56}
58 57
59/* Copy of the version in kernel/time.c which we cannot directly access */
60notrace static void
61vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
62{
63 while (nsec >= NSEC_PER_SEC) {
64 nsec -= NSEC_PER_SEC;
65 ++sec;
66 }
67 while (nsec < 0) {
68 nsec += NSEC_PER_SEC;
69 --sec;
70 }
71 ts->tv_sec = sec;
72 ts->tv_nsec = nsec;
73}
74
75notrace static noinline int do_monotonic(struct timespec *ts) 58notrace static noinline int do_monotonic(struct timespec *ts)
76{ 59{
77 unsigned long seq, ns, secs; 60 unsigned long seq, ns, secs;
@@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
82 secs += gtod->wall_to_monotonic.tv_sec; 65 secs += gtod->wall_to_monotonic.tv_sec;
83 ns += gtod->wall_to_monotonic.tv_nsec; 66 ns += gtod->wall_to_monotonic.tv_nsec;
84 } while (unlikely(read_seqretry(&gtod->lock, seq))); 67 } while (unlikely(read_seqretry(&gtod->lock, seq)));
85 vset_normalized_timespec(ts, secs, ns); 68
69 /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
70 * are all guaranteed to be nonnegative.
71 */
72 while (ns >= NSEC_PER_SEC) {
73 ns -= NSEC_PER_SEC;
74 ++secs;
75 }
76 ts->tv_sec = secs;
77 ts->tv_nsec = ns;
78
86 return 0; 79 return 0;
87} 80}
88 81
@@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
107 secs += gtod->wall_to_monotonic.tv_sec; 100 secs += gtod->wall_to_monotonic.tv_sec;
108 ns += gtod->wall_to_monotonic.tv_nsec; 101 ns += gtod->wall_to_monotonic.tv_nsec;
109 } while (unlikely(read_seqretry(&gtod->lock, seq))); 102 } while (unlikely(read_seqretry(&gtod->lock, seq)));
110 vset_normalized_timespec(ts, secs, ns); 103
104 /* wall_time_nsec and wall_to_monotonic.tv_nsec are
105 * guaranteed to be between 0 and NSEC_PER_SEC.
106 */
107 if (ns >= NSEC_PER_SEC) {
108 ns -= NSEC_PER_SEC;
109 ++secs;
110 }
111 ts->tv_sec = secs;
112 ts->tv_nsec = ns;
113
111 return 0; 114 return 0;
112} 115}
113 116
@@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
157} 160}
158int gettimeofday(struct timeval *, struct timezone *) 161int gettimeofday(struct timeval *, struct timezone *)
159 __attribute__((weak, alias("__vdso_gettimeofday"))); 162 __attribute__((weak, alias("__vdso_gettimeofday")));
163
164/* This will break when the xtime seconds get inaccurate, but that is
165 * unlikely */
166
167static __always_inline long time_syscall(long *t)
168{
169 long secs;
170 asm volatile("syscall"
171 : "=a" (secs)
172 : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
173 return secs;
174}
175
176notrace time_t __vdso_time(time_t *t)
177{
178 time_t result;
179
180 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
181 return time_syscall(t);
182
183 /* This is atomic on x86_64 so we don't need any locks. */
184 result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
185
186 if (t)
187 *t = result;
188 return result;
189}
190int time(time_t *t)
191 __attribute__((weak, alias("__vdso_time")));
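Editor's note on the vclock_gettime.c hunks: the copied vset_normalized_timespec() is replaced by an inline carry loop, which is valid because every nanosecond contribution in these paths is known to be nonnegative. A small sketch of that normalization, under the same nonnegative-inputs assumption:

	#include <time.h>

	#define NSEC_PER_SEC 1000000000L

	/* carry whole seconds out of ns; no ns < 0 case is needed because
	 * the callers only ever add nonnegative nanosecond values */
	static void normalize(struct timespec *ts, long secs, long ns)
	{
		while (ns >= NSEC_PER_SEC) {
			ns -= NSEC_PER_SEC;
			++secs;
		}
		ts->tv_sec = secs;
		ts->tv_nsec = ns;
	}
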
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b4de7f..b96b2677cad8 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
23 __vdso_gettimeofday; 23 __vdso_gettimeofday;
24 getcpu; 24 getcpu;
25 __vdso_getcpu; 25 __vdso_getcpu;
26 time;
27 __vdso_time;
26 local: *; 28 local: *;
27 }; 29 };
28} 30}
29 31
30VDSO64_PRELINK = VDSO_PRELINK; 32VDSO64_PRELINK = VDSO_PRELINK;
31
32/*
33 * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
34 */
35#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
36#include "vextern.h"
37#undef VEXTERN
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2ae3e8..000000000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section and
11 put into vextern.h and be referenced as a pointer with vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b20026b..5463ad558573 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
11#include <linux/time.h> 11#include <linux/time.h>
12#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h"
15 14
16notrace long 15notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
18{ 17{
19 unsigned int p; 18 unsigned int p;
20 19
21 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { 20 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
22 /* Load per CPU data from RDTSCP */ 21 /* Load per CPU data from RDTSCP */
23 native_read_tscp(&p); 22 native_read_tscp(&p);
24 } else { 23 } else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f108bb..7abd2be0f9b9 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,9 +15,6 @@
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17 17
18#include "vextern.h" /* Just for VMAGIC. */
19#undef VEXTERN
20
21unsigned int __read_mostly vdso_enabled = 1; 18unsigned int __read_mostly vdso_enabled = 1;
22 19
23extern char vdso_start[], vdso_end[]; 20extern char vdso_start[], vdso_end[];
@@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid;
26static struct page **vdso_pages; 23static struct page **vdso_pages;
27static unsigned vdso_size; 24static unsigned vdso_size;
28 25
29static inline void *var_ref(void *p, char *name)
30{
31 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0;
34 }
35 return p;
36}
37
38static int __init init_vdso_vars(void) 26static int __init init_vdso_vars(void)
39{ 27{
40 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; 28 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
41 int i; 29 int i;
42 char *vbase;
43 30
44 vdso_size = npages << PAGE_SHIFT; 31 vdso_size = npages << PAGE_SHIFT;
45 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 32 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
@@ -54,20 +41,6 @@ static int __init init_vdso_vars(void)
54 copy_page(page_address(p), vdso_start + i*PAGE_SIZE); 41 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
55 } 42 }
56 43
57 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
58 if (!vbase)
59 goto oom;
60
61 if (memcmp(vbase, "\177ELF", 4)) {
62 printk("VDSO: I'm broken; not ELF\n");
63 vdso_enabled = 0;
64 }
65
66#define VEXTERN(x) \
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h"
69#undef VEXTERN
70 vunmap(vbase);
71 return 0; 44 return 0;
72 45
73 oom: 46 oom:
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703684f9..000000000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
1/* Define pointer to external vDSO variables.
2 These are part of the vDSO. The kernel fills in the real addresses
3 at boot time. This is done because when the vdso is linked the
4 kernel isn't yet and we don't know the final addresses. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
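Editor's note on the deleted vextern.h/vvar.c (and the var_ref() removal in vma.c above): these retire the boot-time pointer-patching scheme in which vDSO-visible variables were pointers pre-filled with VMAGIC that the kernel rewrote at init; they are now reached through VVAR() instead. A rough sketch of the retired fixup idea, with illustrative names only:

	#define VMAGIC 0xfeedbabeabcdefabUL

	/* placeholder baked into the vDSO image at link time */
	static void *vdso_example_var = (void *)VMAGIC;

	/* boot-time patch: refuse to overwrite a slot that does not
	 * still hold the magic value */
	static int fixup_slot(void **slot, void *real_addr)
	{
		if (*slot != (void *)VMAGIC)
			return -1;
		*slot = real_addr;
		return 0;
	}
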
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index e3c6a06cf725..dd7b88f2ec7a 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -235,7 +235,7 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
235 *dx &= maskedx; 235 *dx &= maskedx;
236} 236}
237 237
238static __init void xen_init_cpuid_mask(void) 238static void __init xen_init_cpuid_mask(void)
239{ 239{
240 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
241 unsigned int xsave_mask; 241 unsigned int xsave_mask;
@@ -400,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
400/* 400/*
401 * load_gdt for early boot, when the gdt is only mapped once 401 * load_gdt for early boot, when the gdt is only mapped once
402 */ 402 */
403static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) 403static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
404{ 404{
405 unsigned long va = dtr->address; 405 unsigned long va = dtr->address;
406 unsigned int size = dtr->size + 1; 406 unsigned int size = dtr->size + 1;
@@ -662,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
662 * Version of write_gdt_entry for use at early boot-time needed to 662 * Version of write_gdt_entry for use at early boot-time needed to
663 * update an entry as simply as possible. 663 * update an entry as simply as possible.
664 */ 664 */
665static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, 665static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
666 const void *desc, int type) 666 const void *desc, int type)
667{ 667{
668 switch (type) { 668 switch (type) {
@@ -933,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
933 return ret; 933 return ret;
934} 934}
935 935
936static const struct pv_info xen_info __initdata = { 936static const struct pv_info xen_info __initconst = {
937 .paravirt_enabled = 1, 937 .paravirt_enabled = 1,
938 .shared_kernel_pmd = 0, 938 .shared_kernel_pmd = 0,
939 939
940 .name = "Xen", 940 .name = "Xen",
941}; 941};
942 942
943static const struct pv_init_ops xen_init_ops __initdata = { 943static const struct pv_init_ops xen_init_ops __initconst = {
944 .patch = xen_patch, 944 .patch = xen_patch,
945}; 945};
946 946
947static const struct pv_cpu_ops xen_cpu_ops __initdata = { 947static const struct pv_cpu_ops xen_cpu_ops __initconst = {
948 .cpuid = xen_cpuid, 948 .cpuid = xen_cpuid,
949 949
950 .set_debugreg = xen_set_debugreg, 950 .set_debugreg = xen_set_debugreg,
@@ -1004,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1004 .end_context_switch = xen_end_context_switch, 1004 .end_context_switch = xen_end_context_switch,
1005}; 1005};
1006 1006
1007static const struct pv_apic_ops xen_apic_ops __initdata = { 1007static const struct pv_apic_ops xen_apic_ops __initconst = {
1008#ifdef CONFIG_X86_LOCAL_APIC 1008#ifdef CONFIG_X86_LOCAL_APIC
1009 .startup_ipi_hook = paravirt_nop, 1009 .startup_ipi_hook = paravirt_nop,
1010#endif 1010#endif
@@ -1055,7 +1055,7 @@ int xen_panic_handler_init(void)
1055 return 0; 1055 return 0;
1056} 1056}
1057 1057
1058static const struct machine_ops __initdata xen_machine_ops = { 1058static const struct machine_ops xen_machine_ops __initconst = {
1059 .restart = xen_restart, 1059 .restart = xen_restart,
1060 .halt = xen_machine_halt, 1060 .halt = xen_machine_halt,
1061 .power_off = xen_machine_halt, 1061 .power_off = xen_machine_halt,
@@ -1332,7 +1332,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1332 return NOTIFY_OK; 1332 return NOTIFY_OK;
1333} 1333}
1334 1334
1335static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { 1335static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1336 .notifier_call = xen_hvm_cpu_notify, 1336 .notifier_call = xen_hvm_cpu_notify,
1337}; 1337};
1338 1338
@@ -1381,7 +1381,7 @@ bool xen_hvm_need_lapic(void)
1381} 1381}
1382EXPORT_SYMBOL_GPL(xen_hvm_need_lapic); 1382EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
1383 1383
1384const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { 1384const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
1385 .name = "Xen HVM", 1385 .name = "Xen HVM",
1386 .detect = xen_hvm_platform, 1386 .detect = xen_hvm_platform,
1387 .init_platform = xen_hvm_guest_init, 1387 .init_platform = xen_hvm_guest_init,
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 6a6fe8939645..8bbb465b6f0a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
113 xen_safe_halt(); 113 xen_safe_halt();
114} 114}
115 115
116static const struct pv_irq_ops xen_irq_ops __initdata = { 116static const struct pv_irq_ops xen_irq_ops __initconst = {
117 .save_fl = PV_CALLEE_SAVE(xen_save_fl), 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
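Editor's note on the enlighten.c and irq.c hunks: the const pv_*_ops tables move from __initdata to __initconst so that read-only init data is annotated with the matching read-only init section. The definitions below are illustrative approximations only; the real macros live in include/linux/init.h.

	/* roughly what the annotations expand to */
	#define __initdata  __attribute__((__section__(".init.data")))
	#define __initconst __attribute__((__section__(".init.rodata")))

	struct ops { void (*fn)(void); };

	static void noop(void) { }

	/* const object + __initconst: constness and section now agree */
	static const struct ops example_ops __initconst = { .fn = noop };
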
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 0684f3c74d53..dc708dcc62f1 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -75,67 +75,12 @@
75#include "mmu.h" 75#include "mmu.h"
76#include "debugfs.h" 76#include "debugfs.h"
77 77
78#define MMU_UPDATE_HISTO 30
79
80/* 78/*
81 * Protects atomic reservation decrease/increase against concurrent increases. 79 * Protects atomic reservation decrease/increase against concurrent increases.
82 * Also protects non-atomic updates of current_pages and balloon lists. 80 * Also protects non-atomic updates of current_pages and balloon lists.
83 */ 81 */
84DEFINE_SPINLOCK(xen_reservation_lock); 82DEFINE_SPINLOCK(xen_reservation_lock);
85 83
86#ifdef CONFIG_XEN_DEBUG_FS
87
88static struct {
89 u32 pgd_update;
90 u32 pgd_update_pinned;
91 u32 pgd_update_batched;
92
93 u32 pud_update;
94 u32 pud_update_pinned;
95 u32 pud_update_batched;
96
97 u32 pmd_update;
98 u32 pmd_update_pinned;
99 u32 pmd_update_batched;
100
101 u32 pte_update;
102 u32 pte_update_pinned;
103 u32 pte_update_batched;
104
105 u32 mmu_update;
106 u32 mmu_update_extended;
107 u32 mmu_update_histo[MMU_UPDATE_HISTO];
108
109 u32 prot_commit;
110 u32 prot_commit_batched;
111
112 u32 set_pte_at;
113 u32 set_pte_at_batched;
114 u32 set_pte_at_pinned;
115 u32 set_pte_at_current;
116 u32 set_pte_at_kernel;
117} mmu_stats;
118
119static u8 zero_stats;
120
121static inline void check_zero(void)
122{
123 if (unlikely(zero_stats)) {
124 memset(&mmu_stats, 0, sizeof(mmu_stats));
125 zero_stats = 0;
126 }
127}
128
129#define ADD_STATS(elem, val) \
130 do { check_zero(); mmu_stats.elem += (val); } while(0)
131
132#else /* !CONFIG_XEN_DEBUG_FS */
133
134#define ADD_STATS(elem, val) do { (void)(val); } while(0)
135
136#endif /* CONFIG_XEN_DEBUG_FS */
137
138
139/* 84/*
140 * Identity map, in addition to plain kernel map. This needs to be 85 * Identity map, in addition to plain kernel map. This needs to be
141 * large enough to allocate page table pages to allocate the rest. 86 * large enough to allocate page table pages to allocate the rest.
@@ -243,11 +188,6 @@ static bool xen_page_pinned(void *ptr)
243 return PagePinned(page); 188 return PagePinned(page);
244} 189}
245 190
246static bool xen_iomap_pte(pte_t pte)
247{
248 return pte_flags(pte) & _PAGE_IOMAP;
249}
250
251void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid) 191void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
252{ 192{
253 struct multicall_space mcs; 193 struct multicall_space mcs;
@@ -257,7 +197,7 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
257 u = mcs.args; 197 u = mcs.args;
258 198
259 /* ptep might be kmapped when using 32-bit HIGHPTE */ 199 /* ptep might be kmapped when using 32-bit HIGHPTE */
260 u->ptr = arbitrary_virt_to_machine(ptep).maddr; 200 u->ptr = virt_to_machine(ptep).maddr;
261 u->val = pte_val_ma(pteval); 201 u->val = pte_val_ma(pteval);
262 202
263 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid); 203 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
@@ -266,11 +206,6 @@ void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
266} 206}
267EXPORT_SYMBOL_GPL(xen_set_domain_pte); 207EXPORT_SYMBOL_GPL(xen_set_domain_pte);
268 208
269static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
270{
271 xen_set_domain_pte(ptep, pteval, DOMID_IO);
272}
273
274static void xen_extend_mmu_update(const struct mmu_update *update) 209static void xen_extend_mmu_update(const struct mmu_update *update)
275{ 210{
276 struct multicall_space mcs; 211 struct multicall_space mcs;
@@ -279,27 +214,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
279 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 214 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
280 215
281 if (mcs.mc != NULL) { 216 if (mcs.mc != NULL) {
282 ADD_STATS(mmu_update_extended, 1);
283 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
284
285 mcs.mc->args[1]++; 217 mcs.mc->args[1]++;
286
287 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
288 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
289 else
290 ADD_STATS(mmu_update_histo[0], 1);
291 } else { 218 } else {
292 ADD_STATS(mmu_update, 1);
293 mcs = __xen_mc_entry(sizeof(*u)); 219 mcs = __xen_mc_entry(sizeof(*u));
294 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 220 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
295 ADD_STATS(mmu_update_histo[1], 1);
296 } 221 }
297 222
298 u = mcs.args; 223 u = mcs.args;
299 *u = *update; 224 *u = *update;
300} 225}
301 226
302void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 227static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
303{ 228{
304 struct mmu_update u; 229 struct mmu_update u;
305 230
@@ -312,17 +237,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
312 u.val = pmd_val_ma(val); 237 u.val = pmd_val_ma(val);
313 xen_extend_mmu_update(&u); 238 xen_extend_mmu_update(&u);
314 239
315 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
316
317 xen_mc_issue(PARAVIRT_LAZY_MMU); 240 xen_mc_issue(PARAVIRT_LAZY_MMU);
318 241
319 preempt_enable(); 242 preempt_enable();
320} 243}
321 244
322void xen_set_pmd(pmd_t *ptr, pmd_t val) 245static void xen_set_pmd(pmd_t *ptr, pmd_t val)
323{ 246{
324 ADD_STATS(pmd_update, 1);
325
326 /* If page is not pinned, we can just update the entry 247 /* If page is not pinned, we can just update the entry
327 directly */ 248 directly */
328 if (!xen_page_pinned(ptr)) { 249 if (!xen_page_pinned(ptr)) {
@@ -330,8 +251,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
330 return; 251 return;
331 } 252 }
332 253
333 ADD_STATS(pmd_update_pinned, 1);
334
335 xen_set_pmd_hyper(ptr, val); 254 xen_set_pmd_hyper(ptr, val);
336} 255}
337 256
@@ -344,35 +263,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
344 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 263 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
345} 264}
346 265
347void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 266static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
348 pte_t *ptep, pte_t pteval)
349{ 267{
350 if (xen_iomap_pte(pteval)) { 268 struct mmu_update u;
351 xen_set_iomap_pte(ptep, pteval);
352 goto out;
353 }
354 269
355 ADD_STATS(set_pte_at, 1); 270 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
356// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); 271 return false;
357 ADD_STATS(set_pte_at_current, mm == current->mm);
358 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
359 272
360 if (mm == current->mm || mm == &init_mm) { 273 xen_mc_batch();
361 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) {
362 struct multicall_space mcs;
363 mcs = xen_mc_entry(0);
364 274
365 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 275 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
366 ADD_STATS(set_pte_at_batched, 1); 276 u.val = pte_val_ma(pteval);
367 xen_mc_issue(PARAVIRT_LAZY_MMU); 277 xen_extend_mmu_update(&u);
368 goto out; 278
369 } else 279 xen_mc_issue(PARAVIRT_LAZY_MMU);
370 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
371 goto out;
372 }
373 xen_set_pte(ptep, pteval);
374 280
375out: return; 281 return true;
282}
283
284static void xen_set_pte(pte_t *ptep, pte_t pteval)
285{
286 if (!xen_batched_set_pte(ptep, pteval))
287 native_set_pte(ptep, pteval);
288}
289
290static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
291 pte_t *ptep, pte_t pteval)
292{
293 xen_set_pte(ptep, pteval);
376} 294}
377 295
378pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 296pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -389,13 +307,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
389 307
390 xen_mc_batch(); 308 xen_mc_batch();
391 309
392 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 310 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
393 u.val = pte_val_ma(pte); 311 u.val = pte_val_ma(pte);
394 xen_extend_mmu_update(&u); 312 xen_extend_mmu_update(&u);
395 313
396 ADD_STATS(prot_commit, 1);
397 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
398
399 xen_mc_issue(PARAVIRT_LAZY_MMU); 314 xen_mc_issue(PARAVIRT_LAZY_MMU);
400} 315}
401 316
@@ -463,7 +378,7 @@ static pteval_t iomap_pte(pteval_t val)
463 return val; 378 return val;
464} 379}
465 380
466pteval_t xen_pte_val(pte_t pte) 381static pteval_t xen_pte_val(pte_t pte)
467{ 382{
468 pteval_t pteval = pte.pte; 383 pteval_t pteval = pte.pte;
469 384
@@ -480,7 +395,7 @@ pteval_t xen_pte_val(pte_t pte)
480} 395}
481PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 396PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
482 397
483pgdval_t xen_pgd_val(pgd_t pgd) 398static pgdval_t xen_pgd_val(pgd_t pgd)
484{ 399{
485 return pte_mfn_to_pfn(pgd.pgd); 400 return pte_mfn_to_pfn(pgd.pgd);
486} 401}
@@ -511,7 +426,7 @@ void xen_set_pat(u64 pat)
511 WARN_ON(pat != 0x0007010600070106ull); 426 WARN_ON(pat != 0x0007010600070106ull);
512} 427}
513 428
514pte_t xen_make_pte(pteval_t pte) 429static pte_t xen_make_pte(pteval_t pte)
515{ 430{
516 phys_addr_t addr = (pte & PTE_PFN_MASK); 431 phys_addr_t addr = (pte & PTE_PFN_MASK);
517 432
@@ -581,20 +496,20 @@ pte_t xen_make_pte_debug(pteval_t pte)
581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug); 496PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
582#endif 497#endif
583 498
584pgd_t xen_make_pgd(pgdval_t pgd) 499static pgd_t xen_make_pgd(pgdval_t pgd)
585{ 500{
586 pgd = pte_pfn_to_mfn(pgd); 501 pgd = pte_pfn_to_mfn(pgd);
587 return native_make_pgd(pgd); 502 return native_make_pgd(pgd);
588} 503}
589PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 504PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
590 505
591pmdval_t xen_pmd_val(pmd_t pmd) 506static pmdval_t xen_pmd_val(pmd_t pmd)
592{ 507{
593 return pte_mfn_to_pfn(pmd.pmd); 508 return pte_mfn_to_pfn(pmd.pmd);
594} 509}
595PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 510PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
596 511
597void xen_set_pud_hyper(pud_t *ptr, pud_t val) 512static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
598{ 513{
599 struct mmu_update u; 514 struct mmu_update u;
600 515
@@ -607,17 +522,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
607 u.val = pud_val_ma(val); 522 u.val = pud_val_ma(val);
608 xen_extend_mmu_update(&u); 523 xen_extend_mmu_update(&u);
609 524
610 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
611
612 xen_mc_issue(PARAVIRT_LAZY_MMU); 525 xen_mc_issue(PARAVIRT_LAZY_MMU);
613 526
614 preempt_enable(); 527 preempt_enable();
615} 528}
616 529
617void xen_set_pud(pud_t *ptr, pud_t val) 530static void xen_set_pud(pud_t *ptr, pud_t val)
618{ 531{
619 ADD_STATS(pud_update, 1);
620
621 /* If page is not pinned, we can just update the entry 532 /* If page is not pinned, we can just update the entry
622 directly */ 533 directly */
623 if (!xen_page_pinned(ptr)) { 534 if (!xen_page_pinned(ptr)) {
@@ -625,56 +536,28 @@ void xen_set_pud(pud_t *ptr, pud_t val)
625 return; 536 return;
626 } 537 }
627 538
628 ADD_STATS(pud_update_pinned, 1);
629
630 xen_set_pud_hyper(ptr, val); 539 xen_set_pud_hyper(ptr, val);
631} 540}
632 541
633void xen_set_pte(pte_t *ptep, pte_t pte)
634{
635 if (xen_iomap_pte(pte)) {
636 xen_set_iomap_pte(ptep, pte);
637 return;
638 }
639
640 ADD_STATS(pte_update, 1);
641// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
642 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
643
644#ifdef CONFIG_X86_PAE 542#ifdef CONFIG_X86_PAE
645 ptep->pte_high = pte.pte_high; 543static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
646 smp_wmb();
647 ptep->pte_low = pte.pte_low;
648#else
649 *ptep = pte;
650#endif
651}
652
653#ifdef CONFIG_X86_PAE
654void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
655{ 544{
656 if (xen_iomap_pte(pte)) {
657 xen_set_iomap_pte(ptep, pte);
658 return;
659 }
660
661 set_64bit((u64 *)ptep, native_pte_val(pte)); 545 set_64bit((u64 *)ptep, native_pte_val(pte));
662} 546}
663 547
664void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 548static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
665{ 549{
666 ptep->pte_low = 0; 550 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
667 smp_wmb(); /* make sure low gets written first */ 551 native_pte_clear(mm, addr, ptep);
668 ptep->pte_high = 0;
669} 552}
670 553
671void xen_pmd_clear(pmd_t *pmdp) 554static void xen_pmd_clear(pmd_t *pmdp)
672{ 555{
673 set_pmd(pmdp, __pmd(0)); 556 set_pmd(pmdp, __pmd(0));
674} 557}
675#endif /* CONFIG_X86_PAE */ 558#endif /* CONFIG_X86_PAE */
676 559
677pmd_t xen_make_pmd(pmdval_t pmd) 560static pmd_t xen_make_pmd(pmdval_t pmd)
678{ 561{
679 pmd = pte_pfn_to_mfn(pmd); 562 pmd = pte_pfn_to_mfn(pmd);
680 return native_make_pmd(pmd); 563 return native_make_pmd(pmd);
@@ -682,13 +565,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
682PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 565PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
683 566
684#if PAGETABLE_LEVELS == 4 567#if PAGETABLE_LEVELS == 4
685pudval_t xen_pud_val(pud_t pud) 568static pudval_t xen_pud_val(pud_t pud)
686{ 569{
687 return pte_mfn_to_pfn(pud.pud); 570 return pte_mfn_to_pfn(pud.pud);
688} 571}
689PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 572PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
690 573
691pud_t xen_make_pud(pudval_t pud) 574static pud_t xen_make_pud(pudval_t pud)
692{ 575{
693 pud = pte_pfn_to_mfn(pud); 576 pud = pte_pfn_to_mfn(pud);
694 577
@@ -696,7 +579,7 @@ pud_t xen_make_pud(pudval_t pud)
696} 579}
697PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 580PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
698 581
699pgd_t *xen_get_user_pgd(pgd_t *pgd) 582static pgd_t *xen_get_user_pgd(pgd_t *pgd)
700{ 583{
701 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 584 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
702 unsigned offset = pgd - pgd_page; 585 unsigned offset = pgd - pgd_page;
@@ -728,7 +611,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
728 * 2. It is always pinned 611 * 2. It is always pinned
729 * 3. It has no user pagetable attached to it 612 * 3. It has no user pagetable attached to it
730 */ 613 */
731void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 614static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
732{ 615{
733 preempt_disable(); 616 preempt_disable();
734 617
@@ -741,12 +624,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
741 preempt_enable(); 624 preempt_enable();
742} 625}
743 626
744void xen_set_pgd(pgd_t *ptr, pgd_t val) 627static void xen_set_pgd(pgd_t *ptr, pgd_t val)
745{ 628{
746 pgd_t *user_ptr = xen_get_user_pgd(ptr); 629 pgd_t *user_ptr = xen_get_user_pgd(ptr);
747 630
748 ADD_STATS(pgd_update, 1);
749
750 /* If page is not pinned, we can just update the entry 631 /* If page is not pinned, we can just update the entry
751 directly */ 632 directly */
752 if (!xen_page_pinned(ptr)) { 633 if (!xen_page_pinned(ptr)) {
@@ -758,9 +639,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
758 return; 639 return;
759 } 640 }
760 641
761 ADD_STATS(pgd_update_pinned, 1);
762 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
763
764 /* If it's pinned, then we can at least batch the kernel and 642 /* If it's pinned, then we can at least batch the kernel and
765 user updates together. */ 643 user updates together. */
766 xen_mc_batch(); 644 xen_mc_batch();
@@ -1054,7 +932,7 @@ void xen_mm_pin_all(void)
1054 * that's before we have page structures to store the bits. So do all 932 * that's before we have page structures to store the bits. So do all
1055 * the book-keeping now. 933 * the book-keeping now.
1056 */ 934 */
1057static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, 935static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
1058 enum pt_level level) 936 enum pt_level level)
1059{ 937{
1060 SetPagePinned(page); 938 SetPagePinned(page);
@@ -1162,14 +1040,14 @@ void xen_mm_unpin_all(void)
1162 spin_unlock(&pgd_lock); 1040 spin_unlock(&pgd_lock);
1163} 1041}
1164 1042
1165void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1043static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1166{ 1044{
1167 spin_lock(&next->page_table_lock); 1045 spin_lock(&next->page_table_lock);
1168 xen_pgd_pin(next); 1046 xen_pgd_pin(next);
1169 spin_unlock(&next->page_table_lock); 1047 spin_unlock(&next->page_table_lock);
1170} 1048}
1171 1049
1172void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1050static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1173{ 1051{
1174 spin_lock(&mm->page_table_lock); 1052 spin_lock(&mm->page_table_lock);
1175 xen_pgd_pin(mm); 1053 xen_pgd_pin(mm);
@@ -1187,7 +1065,7 @@ static void drop_other_mm_ref(void *info)
1187 1065
1188 active_mm = percpu_read(cpu_tlbstate.active_mm); 1066 active_mm = percpu_read(cpu_tlbstate.active_mm);
1189 1067
1190 if (active_mm == mm) 1068 if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1191 leave_mm(smp_processor_id()); 1069 leave_mm(smp_processor_id());
1192 1070
1193 /* If this cpu still has a stale cr3 reference, then make sure 1071 /* If this cpu still has a stale cr3 reference, then make sure
@@ -1256,7 +1134,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1256 * pagetable because of lazy tlb flushing. This means we need to 1134
1257 * switch all CPUs off this pagetable before we can unpin it. 1135 * switch all CPUs off this pagetable before we can unpin it.
1258 */ 1136 */
1259void xen_exit_mmap(struct mm_struct *mm) 1137static void xen_exit_mmap(struct mm_struct *mm)
1260{ 1138{
1261 get_cpu(); /* make sure we don't move around */ 1139 get_cpu(); /* make sure we don't move around */
1262 xen_drop_mm_ref(mm); 1140 xen_drop_mm_ref(mm);
@@ -1271,7 +1149,7 @@ void xen_exit_mmap(struct mm_struct *mm)
1271 spin_unlock(&mm->page_table_lock); 1149 spin_unlock(&mm->page_table_lock);
1272} 1150}
1273 1151
1274static __init void xen_pagetable_setup_start(pgd_t *base) 1152static void __init xen_pagetable_setup_start(pgd_t *base)
1275{ 1153{
1276} 1154}
1277 1155
@@ -1291,7 +1169,7 @@ static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1291 1169
1292static void xen_post_allocator_init(void); 1170static void xen_post_allocator_init(void);
1293 1171
1294static __init void xen_pagetable_setup_done(pgd_t *base) 1172static void __init xen_pagetable_setup_done(pgd_t *base)
1295{ 1173{
1296 xen_setup_shared_info(); 1174 xen_setup_shared_info();
1297 xen_post_allocator_init(); 1175 xen_post_allocator_init();
@@ -1488,7 +1366,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1488} 1366}
1489 1367
1490#ifdef CONFIG_X86_32 1368#ifdef CONFIG_X86_32
1491static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1369static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1492{ 1370{
1493 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1371 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1494 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1372 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
@@ -1498,7 +1376,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1498 return pte; 1376 return pte;
1499} 1377}
1500#else /* CONFIG_X86_64 */ 1378#else /* CONFIG_X86_64 */
1501static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1379static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1502{ 1380{
1503 unsigned long pfn = pte_pfn(pte); 1381 unsigned long pfn = pte_pfn(pte);
1504 1382
@@ -1519,7 +1397,7 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1519 1397
1520/* Init-time set_pte while constructing initial pagetables, which 1398/* Init-time set_pte while constructing initial pagetables, which
1521 doesn't allow RO pagetable pages to be remapped RW */ 1399 doesn't allow RO pagetable pages to be remapped RW */
1522static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) 1400static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1523{ 1401{
1524 pte = mask_rw_pte(ptep, pte); 1402 pte = mask_rw_pte(ptep, pte);
1525 1403
@@ -1537,7 +1415,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1537 1415
1538/* Early in boot, while setting up the initial pagetable, assume 1416/* Early in boot, while setting up the initial pagetable, assume
1539 everything is pinned. */ 1417 everything is pinned. */
1540static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1418static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1541{ 1419{
1542#ifdef CONFIG_FLATMEM 1420#ifdef CONFIG_FLATMEM
1543 BUG_ON(mem_map); /* should only be used early */ 1421 BUG_ON(mem_map); /* should only be used early */
@@ -1547,7 +1425,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1547} 1425}
1548 1426
1549/* Used for pmd and pud */ 1427/* Used for pmd and pud */
1550static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1428static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1551{ 1429{
1552#ifdef CONFIG_FLATMEM 1430#ifdef CONFIG_FLATMEM
1553 BUG_ON(mem_map); /* should only be used early */ 1431 BUG_ON(mem_map); /* should only be used early */
@@ -1557,13 +1435,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1557 1435
1558/* Early release_pte assumes that all pts are pinned, since there's 1436/* Early release_pte assumes that all pts are pinned, since there's
1559 only init_mm and anything attached to that is pinned. */ 1437 only init_mm and anything attached to that is pinned. */
1560static __init void xen_release_pte_init(unsigned long pfn) 1438static void __init xen_release_pte_init(unsigned long pfn)
1561{ 1439{
1562 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1440 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1563 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1441 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1564} 1442}
1565 1443
1566static __init void xen_release_pmd_init(unsigned long pfn) 1444static void __init xen_release_pmd_init(unsigned long pfn)
1567{ 1445{
1568 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1569} 1447}
@@ -1689,7 +1567,7 @@ static void set_page_prot(void *addr, pgprot_t prot)
1689 BUG(); 1567 BUG();
1690} 1568}
1691 1569
1692static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1570static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1693{ 1571{
1694 unsigned pmdidx, pteidx; 1572 unsigned pmdidx, pteidx;
1695 unsigned ident_pte; 1573 unsigned ident_pte;
@@ -1772,7 +1650,7 @@ static void convert_pfn_mfn(void *v)
1772 * of the physical mapping once some sort of allocator has been set 1650 * of the physical mapping once some sort of allocator has been set
1773 * up. 1651 * up.
1774 */ 1652 */
1775__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1653pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1776 unsigned long max_pfn) 1654 unsigned long max_pfn)
1777{ 1655{
1778 pud_t *l3; 1656 pud_t *l3;
@@ -1843,7 +1721,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1843static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD); 1721static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1844static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD); 1722static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1845 1723
1846static __init void xen_write_cr3_init(unsigned long cr3) 1724static void __init xen_write_cr3_init(unsigned long cr3)
1847{ 1725{
1848 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir)); 1726 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1849 1727
@@ -1880,7 +1758,7 @@ static __init void xen_write_cr3_init(unsigned long cr3)
1880 pv_mmu_ops.write_cr3 = &xen_write_cr3; 1758 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1881} 1759}
1882 1760
1883__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1761pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1884 unsigned long max_pfn) 1762 unsigned long max_pfn)
1885{ 1763{
1886 pmd_t *kernel_pmd; 1764 pmd_t *kernel_pmd;
@@ -1986,7 +1864,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1986#endif 1864#endif
1987} 1865}
1988 1866
1989__init void xen_ident_map_ISA(void) 1867void __init xen_ident_map_ISA(void)
1990{ 1868{
1991 unsigned long pa; 1869 unsigned long pa;
1992 1870
@@ -2009,7 +1887,7 @@ __init void xen_ident_map_ISA(void)
2009 xen_flush_tlb(); 1887 xen_flush_tlb();
2010} 1888}
2011 1889
2012static __init void xen_post_allocator_init(void) 1890static void __init xen_post_allocator_init(void)
2013{ 1891{
2014#ifdef CONFIG_XEN_DEBUG 1892#ifdef CONFIG_XEN_DEBUG
2015 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug); 1893 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
@@ -2046,7 +1924,7 @@ static void xen_leave_lazy_mmu(void)
2046 preempt_enable(); 1924 preempt_enable();
2047} 1925}
2048 1926
2049static const struct pv_mmu_ops xen_mmu_ops __initdata = { 1927static const struct pv_mmu_ops xen_mmu_ops __initconst = {
2050 .read_cr2 = xen_read_cr2, 1928 .read_cr2 = xen_read_cr2,
2051 .write_cr2 = xen_write_cr2, 1929 .write_cr2 = xen_write_cr2,
2052 1930
@@ -2371,7 +2249,7 @@ static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2371 struct remap_data *rmd = data; 2249 struct remap_data *rmd = data;
2372 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot)); 2250 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2373 2251
2374 rmd->mmu_update->ptr = arbitrary_virt_to_machine(ptep).maddr; 2252 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2375 rmd->mmu_update->val = pte_val_ma(pte); 2253 rmd->mmu_update->val = pte_val_ma(pte);
2376 rmd->mmu_update++; 2254 rmd->mmu_update++;
2377 2255
@@ -2425,7 +2303,6 @@ out:
2425EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range); 2303EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2426 2304
2427#ifdef CONFIG_XEN_DEBUG_FS 2305#ifdef CONFIG_XEN_DEBUG_FS
2428
2429static int p2m_dump_open(struct inode *inode, struct file *filp) 2306static int p2m_dump_open(struct inode *inode, struct file *filp)
2430{ 2307{
2431 return single_open(filp, p2m_dump_show, NULL); 2308 return single_open(filp, p2m_dump_show, NULL);
@@ -2437,65 +2314,4 @@ static const struct file_operations p2m_dump_fops = {
2437 .llseek = seq_lseek, 2314 .llseek = seq_lseek,
2438 .release = single_release, 2315 .release = single_release,
2439}; 2316};
2440 2317#endif /* CONFIG_XEN_DEBUG_FS */
2441static struct dentry *d_mmu_debug;
2442
2443static int __init xen_mmu_debugfs(void)
2444{
2445 struct dentry *d_xen = xen_init_debugfs();
2446
2447 if (d_xen == NULL)
2448 return -ENOMEM;
2449
2450 d_mmu_debug = debugfs_create_dir("mmu", d_xen);
2451
2452 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats);
2453
2454 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2455 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2456 &mmu_stats.pgd_update_pinned);
2457 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2458 &mmu_stats.pgd_update_pinned);
2459
2460 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2461 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2462 &mmu_stats.pud_update_pinned);
2463 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2464 &mmu_stats.pud_update_pinned);
2465
2466 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2467 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2468 &mmu_stats.pmd_update_pinned);
2469 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2470 &mmu_stats.pmd_update_pinned);
2471
2472 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2473// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2474// &mmu_stats.pte_update_pinned);
2475 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2476 &mmu_stats.pte_update_pinned);
2477
2478 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2479 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2480 &mmu_stats.mmu_update_extended);
2481 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2482 mmu_stats.mmu_update_histo, 20);
2483
2484 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2485 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2486 &mmu_stats.set_pte_at_batched);
2487 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2488 &mmu_stats.set_pte_at_current);
2489 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2490 &mmu_stats.set_pte_at_kernel);
2491
2492 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2493 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2494 &mmu_stats.prot_commit_batched);
2495
2496 debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
2497 return 0;
2498}
2499fs_initcall(xen_mmu_debugfs);
2500
2501#endif /* CONFIG_XEN_DEBUG_FS */
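
The new set_pte paths above only batch when the CPU is inside a lazy-MMU section; otherwise xen_batched_set_pte() returns false and the plain native write is used. A minimal sketch (not from the patch) of a caller that actually reaches the batched path; the helper name remap_ptes and its arguments are illustrative:

	#include <linux/mm.h>

	/*
	 * Sketch only: every set_pte_at() issued inside the lazy-MMU region
	 * dispatches to xen_set_pte_at() -> xen_batched_set_pte(), which
	 * queues an mmu_update through the multicall buffer instead of
	 * trapping once per PTE.  The queued updates are flushed when the
	 * lazy region is left.
	 */
	static void remap_ptes(struct mm_struct *mm, unsigned long addr,
			       pte_t *ptep, const pte_t *newvals, int count)
	{
		int i;

		arch_enter_lazy_mmu_mode();
		for (i = 0; i < count; i++)
			set_pte_at(mm, addr + i * PAGE_SIZE, ptep + i, newvals[i]);
		arch_leave_lazy_mmu_mode();	/* batched hypercalls go out here */
	}
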
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index 537bb9aab777..73809bb951b4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -15,43 +15,6 @@ bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15 15
16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
17 17
18
19void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
20void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
21void xen_exit_mmap(struct mm_struct *mm);
22
23pteval_t xen_pte_val(pte_t);
24pmdval_t xen_pmd_val(pmd_t);
25pgdval_t xen_pgd_val(pgd_t);
26
27pte_t xen_make_pte(pteval_t);
28pmd_t xen_make_pmd(pmdval_t);
29pgd_t xen_make_pgd(pgdval_t);
30
31void xen_set_pte(pte_t *ptep, pte_t pteval);
32void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
33 pte_t *ptep, pte_t pteval);
34
35#ifdef CONFIG_X86_PAE
36void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
37void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
38void xen_pmd_clear(pmd_t *pmdp);
39#endif /* CONFIG_X86_PAE */
40
41void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
42void xen_set_pud(pud_t *ptr, pud_t val);
43void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
44void xen_set_pud_hyper(pud_t *ptr, pud_t val);
45
46#if PAGETABLE_LEVELS == 4
47pudval_t xen_pud_val(pud_t pud);
48pud_t xen_make_pud(pudval_t pudval);
49void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
50void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
51#endif
52
53pgd_t *xen_get_user_pgd(pgd_t *pgd);
54
55pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 18pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
56void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 19void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
57 pte_t *ptep, pte_t pte); 20 pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 141eb0de8b06..58efeb9d5440 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -522,11 +522,20 @@ static bool __init __early_alloc_p2m(unsigned long pfn)
522 /* Boundary cross-over for the edges: */ 522 /* Boundary cross-over for the edges: */
523 if (idx) { 523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE); 524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525 unsigned long *mid_mfn_p;
525 526
526 p2m_init(p2m); 527 p2m_init(p2m);
527 528
528 p2m_top[topidx][mididx] = p2m; 529 p2m_top[topidx][mididx] = p2m;
529 530
 531 /* For save/restore we need the MFN of the P2M saved */
532
533 mid_mfn_p = p2m_top_mfn_p[topidx];
534 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
535 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
536 topidx, mididx);
537 mid_mfn_p[mididx] = virt_to_mfn(p2m);
538
530 } 539 }
531 return idx != 0; 540 return idx != 0;
532} 541}
@@ -549,12 +558,29 @@ unsigned long __init set_phys_range_identity(unsigned long pfn_s,
549 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE) 558 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
550 { 559 {
551 unsigned topidx = p2m_top_index(pfn); 560 unsigned topidx = p2m_top_index(pfn);
552 if (p2m_top[topidx] == p2m_mid_missing) { 561 unsigned long *mid_mfn_p;
553 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE); 562 unsigned long **mid;
563
564 mid = p2m_top[topidx];
565 mid_mfn_p = p2m_top_mfn_p[topidx];
566 if (mid == p2m_mid_missing) {
567 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
554 568
555 p2m_mid_init(mid); 569 p2m_mid_init(mid);
556 570
557 p2m_top[topidx] = mid; 571 p2m_top[topidx] = mid;
572
573 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
574 }
575 /* And the save/restore P2M tables.. */
576 if (mid_mfn_p == p2m_mid_missing_mfn) {
577 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
578 p2m_mid_mfn_init(mid_mfn_p);
579
580 p2m_top_mfn_p[topidx] = mid_mfn_p;
581 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
 582 /* Note: we don't set mid_mfn_p[mididx] here,
583 * look in __early_alloc_p2m */
558 } 584 }
559 } 585 }
560 586
@@ -650,7 +676,7 @@ static unsigned long mfn_hash(unsigned long mfn)
650} 676}
651 677
652/* Add an MFN override for a particular page */ 678/* Add an MFN override for a particular page */
653int m2p_add_override(unsigned long mfn, struct page *page) 679int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
654{ 680{
655 unsigned long flags; 681 unsigned long flags;
656 unsigned long pfn; 682 unsigned long pfn;
@@ -662,7 +688,6 @@ int m2p_add_override(unsigned long mfn, struct page *page)
662 if (!PageHighMem(page)) { 688 if (!PageHighMem(page)) {
663 address = (unsigned long)__va(pfn << PAGE_SHIFT); 689 address = (unsigned long)__va(pfn << PAGE_SHIFT);
664 ptep = lookup_address(address, &level); 690 ptep = lookup_address(address, &level);
665
666 if (WARN(ptep == NULL || level != PG_LEVEL_4K, 691 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
667 "m2p_add_override: pfn %lx not mapped", pfn)) 692 "m2p_add_override: pfn %lx not mapped", pfn))
668 return -EINVAL; 693 return -EINVAL;
@@ -674,18 +699,17 @@ int m2p_add_override(unsigned long mfn, struct page *page)
674 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn)))) 699 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
675 return -ENOMEM; 700 return -ENOMEM;
676 701
677 if (!PageHighMem(page)) 702 if (clear_pte && !PageHighMem(page))
678 /* Just zap old mapping for now */ 703 /* Just zap old mapping for now */
679 pte_clear(&init_mm, address, ptep); 704 pte_clear(&init_mm, address, ptep);
680
681 spin_lock_irqsave(&m2p_override_lock, flags); 705 spin_lock_irqsave(&m2p_override_lock, flags);
682 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]); 706 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
683 spin_unlock_irqrestore(&m2p_override_lock, flags); 707 spin_unlock_irqrestore(&m2p_override_lock, flags);
684 708
685 return 0; 709 return 0;
686} 710}
687 711EXPORT_SYMBOL_GPL(m2p_add_override);
688int m2p_remove_override(struct page *page) 712int m2p_remove_override(struct page *page, bool clear_pte)
689{ 713{
690 unsigned long flags; 714 unsigned long flags;
691 unsigned long mfn; 715 unsigned long mfn;
@@ -713,7 +737,7 @@ int m2p_remove_override(struct page *page)
713 spin_unlock_irqrestore(&m2p_override_lock, flags); 737 spin_unlock_irqrestore(&m2p_override_lock, flags);
714 set_phys_to_machine(pfn, page->index); 738 set_phys_to_machine(pfn, page->index);
715 739
716 if (!PageHighMem(page)) 740 if (clear_pte && !PageHighMem(page))
717 set_pte_at(&init_mm, address, ptep, 741 set_pte_at(&init_mm, address, ptep,
718 pfn_pte(pfn, PAGE_KERNEL)); 742 pfn_pte(pfn, PAGE_KERNEL));
719 /* No tlb flush necessary because the caller already 743 /* No tlb flush necessary because the caller already
@@ -721,6 +745,7 @@ int m2p_remove_override(struct page *page)
721 745
722 return 0; 746 return 0;
723} 747}
748EXPORT_SYMBOL_GPL(m2p_remove_override);
724 749
725struct page *m2p_find_override(unsigned long mfn) 750struct page *m2p_find_override(unsigned long mfn)
726{ 751{
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index bfd0632fe65e..b480d4207a4c 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -36,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
36 36
37 /* If running as PV guest, either iommu=soft, or swiotlb=force will 37 /* If running as PV guest, either iommu=soft, or swiotlb=force will
38 * activate this IOMMU. If running as PV privileged, activate it 38 * activate this IOMMU. If running as PV privileged, activate it
39 * irregardlesss. 39 * irregardless.
40 */ 40 */
41 if ((xen_initial_domain() || swiotlb || swiotlb_force) && 41 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
42 (xen_pv_domain())) 42 (xen_pv_domain()))
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 90bac0aac3a5..be1a464f6d66 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -50,7 +50,7 @@ phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
50 */ 50 */
51#define EXTRA_MEM_RATIO (10) 51#define EXTRA_MEM_RATIO (10)
52 52
53static __init void xen_add_extra_mem(unsigned long pages) 53static void __init xen_add_extra_mem(unsigned long pages)
54{ 54{
55 unsigned long pfn; 55 unsigned long pfn;
56 56
@@ -166,7 +166,7 @@ static unsigned long __init xen_set_identity(const struct e820entry *list,
166 if (last > end) 166 if (last > end)
167 continue; 167 continue;
168 168
169 if (entry->type == E820_RAM) { 169 if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
170 if (start > start_pci) 170 if (start > start_pci)
171 identity += set_phys_range_identity( 171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start)); 172 PFN_UP(start_pci), PFN_DOWN(start));
@@ -227,7 +227,11 @@ char * __init xen_memory_setup(void)
227 227
228 memcpy(map_raw, map, sizeof(map)); 228 memcpy(map_raw, map, sizeof(map));
229 e820.nr_map = 0; 229 e820.nr_map = 0;
230#ifdef CONFIG_X86_32
231 xen_extra_mem_start = mem_end;
232#else
230 xen_extra_mem_start = max((1ULL << 32), mem_end); 233 xen_extra_mem_start = max((1ULL << 32), mem_end);
234#endif
231 for (i = 0; i < memmap.nr_entries; i++) { 235 for (i = 0; i < memmap.nr_entries; i++) {
232 unsigned long long end; 236 unsigned long long end;
233 237
@@ -336,7 +340,7 @@ static void __init fiddle_vdso(void)
336#endif 340#endif
337} 341}
338 342
339static __cpuinit int register_callback(unsigned type, const void *func) 343static int __cpuinit register_callback(unsigned type, const void *func)
340{ 344{
341 struct callback_register callback = { 345 struct callback_register callback = {
342 .type = type, 346 .type = type,
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 30612441ed99..41038c01de40 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -46,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
47 47
48/* 48/*
49 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
50 * all the work is done automatically when
51 * we return from the interrupt.
52 */ 50 */
53static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
54{ 52{
55 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
56 55
57 return IRQ_HANDLED; 56 return IRQ_HANDLED;
58} 57}
59 58
60static __cpuinit void cpu_bringup(void) 59static void __cpuinit cpu_bringup(void)
61{ 60{
62 int cpu = smp_processor_id(); 61 int cpu = smp_processor_id();
63 62
@@ -85,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
85 wmb(); /* make sure everything is out */ 84 wmb(); /* make sure everything is out */
86} 85}
87 86
88static __cpuinit void cpu_bringup_and_idle(void) 87static void __cpuinit cpu_bringup_and_idle(void)
89{ 88{
90 cpu_bringup(); 89 cpu_bringup();
91 cpu_idle(); 90 cpu_idle();
@@ -242,7 +241,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
242 } 241 }
243} 242}
244 243
245static __cpuinit int 244static int __cpuinit
246cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 245cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
247{ 246{
248 struct vcpu_guest_context *ctxt; 247 struct vcpu_guest_context *ctxt;
@@ -486,7 +485,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
486 return IRQ_HANDLED; 485 return IRQ_HANDLED;
487} 486}
488 487
489static const struct smp_ops xen_smp_ops __initdata = { 488static const struct smp_ops xen_smp_ops __initconst = {
490 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 489 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
491 .smp_prepare_cpus = xen_smp_prepare_cpus, 490 .smp_prepare_cpus = xen_smp_prepare_cpus,
492 .smp_cpus_done = xen_smp_cpus_done, 491 .smp_cpus_done = xen_smp_cpus_done,
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 2e2d370a47b1..5158c505bef9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
26 26
27#include "xen-ops.h" 27#include "xen-ops.h"
28 28
29#define XEN_SHIFT 22
30
31/* Xen may fire a timer up to this many ns early */ 29/* Xen may fire a timer up to this many ns early */
32#define TIMER_SLOP 100000 30#define TIMER_SLOP 100000
33#define NS_PER_TICK (1000000000LL / HZ) 31#define NS_PER_TICK (1000000000LL / HZ)
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
211 .rating = 400, 209 .rating = 400,
212 .read = xen_clocksource_get_cycles, 210 .read = xen_clocksource_get_cycles,
213 .mask = ~0, 211 .mask = ~0,
214 .mult = 1<<XEN_SHIFT, /* time directly in nanoseconds */
215 .shift = XEN_SHIFT,
216 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 212 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
217}; 213};
218 214
@@ -439,16 +435,16 @@ void xen_timer_resume(void)
439 } 435 }
440} 436}
441 437
442static const struct pv_time_ops xen_time_ops __initdata = { 438static const struct pv_time_ops xen_time_ops __initconst = {
443 .sched_clock = xen_clocksource_read, 439 .sched_clock = xen_clocksource_read,
444}; 440};
445 441
446static __init void xen_time_init(void) 442static void __init xen_time_init(void)
447{ 443{
448 int cpu = smp_processor_id(); 444 int cpu = smp_processor_id();
449 struct timespec tp; 445 struct timespec tp;
450 446
451 clocksource_register(&xen_clocksource); 447 clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
452 448
453 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) { 449 if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
454 /* Successfully turned off 100Hz tick, so we have the 450 /* Successfully turned off 100Hz tick, so we have the
@@ -468,7 +464,7 @@ static __init void xen_time_init(void)
468 xen_setup_cpu_clockevents(); 464 xen_setup_cpu_clockevents();
469} 465}
470 466
471__init void xen_init_time_ops(void) 467void __init xen_init_time_ops(void)
472{ 468{
473 pv_time_ops = xen_time_ops; 469 pv_time_ops = xen_time_ops;
474 470
@@ -490,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
490 xen_setup_cpu_clockevents(); 486 xen_setup_cpu_clockevents();
491} 487}
492 488
493__init void xen_hvm_init_time_ops(void) 489void __init xen_hvm_init_time_ops(void)
494{ 490{
495 /* vector callback is needed otherwise we cannot receive interrupts 491 /* vector callback is needed otherwise we cannot receive interrupts
496 * on cpu > 0 and at this point we don't know how many cpus are 492 * on cpu > 0 and at this point we don't know how many cpus are
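
With clocksource_register_hz() above, the time core derives the mult/shift pair from the stated frequency instead of the driver hard-coding it via XEN_SHIFT. The Xen system time counter already ticks in nanoseconds, so the frequency passed is NSEC_PER_SEC and the chosen factors satisfy mult / 2^shift == 1, which is what the removed mult = 1 << 22, shift = 22 pair encoded by hand. A small standalone sketch of that arithmetic (plain C, not kernel code; in the kernel the helper clocks_calc_mult_shift() does this with rounding and range checks):

	#include <stdint.h>
	#include <stdio.h>

	/* ns = (cycles * mult) >> shift  =>  mult = (NSEC_PER_SEC << shift) / counter_hz */
	static uint32_t mult_for(unsigned int shift, uint64_t counter_hz)
	{
		return (uint32_t)((1000000000ULL << shift) / counter_hz);
	}

	int main(void)
	{
		/* counter already counts nanoseconds: counter_hz == NSEC_PER_SEC */
		printf("mult=%u for shift=22\n", mult_for(22, 1000000000ULL));
		/* prints 4194304, i.e. 1 << 22, matching the removed constants */
		return 0;
	}
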
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 3112f55638c4..97dfdc8757b3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -74,7 +74,7 @@ static inline void xen_hvm_smp_init(void) {}
74 74
75#ifdef CONFIG_PARAVIRT_SPINLOCKS 75#ifdef CONFIG_PARAVIRT_SPINLOCKS
76void __init xen_init_spinlocks(void); 76void __init xen_init_spinlocks(void);
77__cpuinit void xen_init_lock_cpu(int cpu); 77void __cpuinit xen_init_lock_cpu(int cpu);
78void xen_uninit_lock_cpu(int cpu); 78void xen_uninit_lock_cpu(int cpu);
79#else 79#else
80static inline void xen_init_spinlocks(void) 80static inline void xen_init_spinlocks(void)