Diffstat (limited to 'arch/x86')
-rw-r--r--arch/x86/Kbuild2
-rw-r--r--arch/x86/Kconfig330
-rw-r--r--arch/x86/Kconfig.cpu30
-rw-r--r--arch/x86/Kconfig.debug49
-rw-r--r--arch/x86/Makefile8
-rw-r--r--arch/x86/Makefile_32.cpu15
-rw-r--r--arch/x86/boot/compressed/Makefile5
-rw-r--r--arch/x86/boot/compressed/head_64.S2
-rw-r--r--arch/x86/boot/compressed/misc.c35
-rw-r--r--arch/x86/boot/compressed/mkpiggy.c9
-rw-r--r--arch/x86/boot/memory.c2
-rw-r--r--arch/x86/configs/i386_defconfig1
-rw-r--r--arch/x86/configs/x86_64_defconfig1
-rw-r--r--arch/x86/crypto/Makefile4
-rw-r--r--arch/x86/crypto/aesni-intel_asm.S1835
-rw-r--r--arch/x86/crypto/aesni-intel_glue.c557
-rw-r--r--arch/x86/crypto/fpu.c10
-rw-r--r--arch/x86/crypto/ghash-clmulni-intel_glue.c1
-rw-r--r--arch/x86/ia32/ia32_aout.c1
-rw-r--r--arch/x86/ia32/ia32entry.S35
-rw-r--r--arch/x86/ia32/sys_ia32.c1
-rw-r--r--arch/x86/include/asm/acpi.h30
-rw-r--r--arch/x86/include/asm/alternative-asm.h9
-rw-r--r--arch/x86/include/asm/alternative.h10
-rw-r--r--arch/x86/include/asm/amd_iommu.h6
-rw-r--r--arch/x86/include/asm/amd_iommu_proto.h15
-rw-r--r--arch/x86/include/asm/amd_iommu_types.h51
-rw-r--r--arch/x86/include/asm/amd_nb.h64
-rw-r--r--arch/x86/include/asm/apb_timer.h3
-rw-r--r--arch/x86/include/asm/apic.h91
-rw-r--r--arch/x86/include/asm/apicdef.h15
-rw-r--r--arch/x86/include/asm/bios_ebda.h28
-rw-r--r--arch/x86/include/asm/bitops.h6
-rw-r--r--arch/x86/include/asm/boot.h6
-rw-r--r--arch/x86/include/asm/bootparam.h2
-rw-r--r--arch/x86/include/asm/cacheflush.h44
-rw-r--r--arch/x86/include/asm/calgary.h4
-rw-r--r--arch/x86/include/asm/calling.h52
-rw-r--r--arch/x86/include/asm/ce4100.h6
-rw-r--r--arch/x86/include/asm/cpu.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h32
-rw-r--r--arch/x86/include/asm/debugreg.h2
-rw-r--r--arch/x86/include/asm/desc.h152
-rw-r--r--arch/x86/include/asm/dma.h19
-rw-r--r--arch/x86/include/asm/dwarf2.h20
-rw-r--r--arch/x86/include/asm/e820.h25
-rw-r--r--arch/x86/include/asm/efi.h3
-rw-r--r--arch/x86/include/asm/entry_arch.h28
-rw-r--r--arch/x86/include/asm/fixmap.h19
-rw-r--r--arch/x86/include/asm/frame.h6
-rw-r--r--arch/x86/include/asm/ftrace.h7
-rw-r--r--arch/x86/include/asm/futex.h22
-rw-r--r--arch/x86/include/asm/gart.h42
-rw-r--r--arch/x86/include/asm/gpio.h5
-rw-r--r--arch/x86/include/asm/hardirq.h2
-rw-r--r--arch/x86/include/asm/highmem.h11
-rw-r--r--arch/x86/include/asm/hpet.h10
-rw-r--r--arch/x86/include/asm/hw_irq.h43
-rw-r--r--arch/x86/include/asm/hypervisor.h12
-rw-r--r--arch/x86/include/asm/i387.h205
-rw-r--r--arch/x86/include/asm/i8253.h2
-rw-r--r--arch/x86/include/asm/i8259.h2
-rw-r--r--arch/x86/include/asm/idle.h2
-rw-r--r--arch/x86/include/asm/init.h6
-rw-r--r--arch/x86/include/asm/io.h39
-rw-r--r--arch/x86/include/asm/io_apic.h70
-rw-r--r--arch/x86/include/asm/iomap.h4
-rw-r--r--arch/x86/include/asm/iommu_table.h100
-rw-r--r--arch/x86/include/asm/ipi.h8
-rw-r--r--arch/x86/include/asm/irq.h16
-rw-r--r--arch/x86/include/asm/irq_controller.h12
-rw-r--r--arch/x86/include/asm/irq_remapping.h35
-rw-r--r--arch/x86/include/asm/irq_vectors.h53
-rw-r--r--arch/x86/include/asm/irqflags.h32
-rw-r--r--arch/x86/include/asm/jump_label.h42
-rw-r--r--arch/x86/include/asm/k8.h36
-rw-r--r--arch/x86/include/asm/kdebug.h2
-rw-r--r--arch/x86/include/asm/kgdb.h1
-rw-r--r--arch/x86/include/asm/kvm_emulate.h241
-rw-r--r--arch/x86/include/asm/kvm_host.h210
-rw-r--r--arch/x86/include/asm/kvm_para.h30
-rw-r--r--arch/x86/include/asm/linkage.h5
-rw-r--r--arch/x86/include/asm/mach_traps.h12
-rw-r--r--arch/x86/include/asm/mce.h5
-rw-r--r--arch/x86/include/asm/memblock.h23
-rw-r--r--arch/x86/include/asm/microcode.h6
-rw-r--r--arch/x86/include/asm/mmu.h6
-rw-r--r--arch/x86/include/asm/mmu_context.h5
-rw-r--r--arch/x86/include/asm/mmzone_32.h33
-rw-r--r--arch/x86/include/asm/mmzone_64.h26
-rw-r--r--arch/x86/include/asm/module.h9
-rw-r--r--arch/x86/include/asm/mpspec.h34
-rw-r--r--arch/x86/include/asm/mpspec_def.h7
-rw-r--r--arch/x86/include/asm/mrst-vrtc.h9
-rw-r--r--arch/x86/include/asm/mrst.h24
-rw-r--r--arch/x86/include/asm/msr-index.h34
-rw-r--r--arch/x86/include/asm/mwait.h15
-rw-r--r--arch/x86/include/asm/nmi.h72
-rw-r--r--arch/x86/include/asm/nops.h148
-rw-r--r--arch/x86/include/asm/numa.h80
-rw-r--r--arch/x86/include/asm/numa_32.h3
-rw-r--r--arch/x86/include/asm/numa_64.h46
-rw-r--r--arch/x86/include/asm/numaq.h7
-rw-r--r--arch/x86/include/asm/olpc.h14
-rw-r--r--arch/x86/include/asm/olpc_ofw.h14
-rw-r--r--arch/x86/include/asm/page_32_types.h4
-rw-r--r--arch/x86/include/asm/page_types.h11
-rw-r--r--arch/x86/include/asm/paravirt.h47
-rw-r--r--arch/x86/include/asm/paravirt_types.h7
-rw-r--r--arch/x86/include/asm/pci.h36
-rw-r--r--arch/x86/include/asm/pci_x86.h1
-rw-r--r--arch/x86/include/asm/percpu.h256
-rw-r--r--arch/x86/include/asm/perf_event.h21
-rw-r--r--arch/x86/include/asm/perf_event_p4.h123
-rw-r--r--arch/x86/include/asm/pgalloc.h2
-rw-r--r--arch/x86/include/asm/pgtable-2level.h9
-rw-r--r--arch/x86/include/asm/pgtable-3level.h34
-rw-r--r--arch/x86/include/asm/pgtable.h147
-rw-r--r--arch/x86/include/asm/pgtable_32.h16
-rw-r--r--arch/x86/include/asm/pgtable_64.h32
-rw-r--r--arch/x86/include/asm/pgtable_types.h4
-rw-r--r--arch/x86/include/asm/probe_roms.h8
-rw-r--r--arch/x86/include/asm/processor-flags.h3
-rw-r--r--arch/x86/include/asm/processor.h45
-rw-r--r--arch/x86/include/asm/prom.h69
-rw-r--r--arch/x86/include/asm/ptrace-abi.h2
-rw-r--r--arch/x86/include/asm/ptrace.h22
-rw-r--r--arch/x86/include/asm/pvclock.h44
-rw-r--r--arch/x86/include/asm/reboot.h5
-rw-r--r--arch/x86/include/asm/rwsem.h80
-rw-r--r--arch/x86/include/asm/segment.h44
-rw-r--r--arch/x86/include/asm/setup.h15
-rw-r--r--arch/x86/include/asm/smp.h34
-rw-r--r--arch/x86/include/asm/smpboot_hooks.h3
-rw-r--r--arch/x86/include/asm/srat.h39
-rw-r--r--arch/x86/include/asm/stacktrace.h34
-rw-r--r--arch/x86/include/asm/suspend_32.h2
-rw-r--r--arch/x86/include/asm/suspend_64.h5
-rw-r--r--arch/x86/include/asm/svm.h57
-rw-r--r--arch/x86/include/asm/swiotlb.h13
-rw-r--r--arch/x86/include/asm/system.h87
-rw-r--r--arch/x86/include/asm/system_64.h22
-rw-r--r--arch/x86/include/asm/thread_info.h10
-rw-r--r--arch/x86/include/asm/timer.h6
-rw-r--r--arch/x86/include/asm/tlbflush.h2
-rw-r--r--arch/x86/include/asm/topology.h27
-rw-r--r--arch/x86/include/asm/trampoline.h36
-rw-r--r--arch/x86/include/asm/traps.h1
-rw-r--r--arch/x86/include/asm/tsc.h6
-rw-r--r--arch/x86/include/asm/types.h16
-rw-r--r--arch/x86/include/asm/uaccess.h3
-rw-r--r--arch/x86/include/asm/uaccess_32.h1
-rw-r--r--arch/x86/include/asm/uaccess_64.h1
-rw-r--r--arch/x86/include/asm/unistd_32.h10
-rw-r--r--arch/x86/include/asm/unistd_64.h16
-rw-r--r--arch/x86/include/asm/uv/uv_bau.h596
-rw-r--r--arch/x86/include/asm/uv/uv_hub.h92
-rw-r--r--arch/x86/include/asm/uv/uv_mmrs.h1140
-rw-r--r--arch/x86/include/asm/vdso.h14
-rw-r--r--arch/x86/include/asm/vgtod.h2
-rw-r--r--arch/x86/include/asm/vmi.h269
-rw-r--r--arch/x86/include/asm/vmi_time.h98
-rw-r--r--arch/x86/include/asm/vmx.h15
-rw-r--r--arch/x86/include/asm/vsyscall.h12
-rw-r--r--arch/x86/include/asm/vvar.h52
-rw-r--r--arch/x86/include/asm/x2apic.h62
-rw-r--r--arch/x86/include/asm/x86_init.h23
-rw-r--r--arch/x86/include/asm/xen/hypercall.h39
-rw-r--r--arch/x86/include/asm/xen/hypervisor.h35
-rw-r--r--arch/x86/include/asm/xen/interface.h8
-rw-r--r--arch/x86/include/asm/xen/interface_32.h5
-rw-r--r--arch/x86/include/asm/xen/interface_64.h13
-rw-r--r--arch/x86/include/asm/xen/page.h67
-rw-r--r--arch/x86/include/asm/xen/pci.h81
-rw-r--r--arch/x86/kernel/Makefile44
-rw-r--r--arch/x86/kernel/acpi/boot.c150
-rw-r--r--arch/x86/kernel/acpi/cstate.c11
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.S35
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.h11
-rw-r--r--arch/x86/kernel/acpi/realmode/wakeup.lds.S28
-rw-r--r--arch/x86/kernel/acpi/sleep.c86
-rw-r--r--arch/x86/kernel/acpi/sleep.h5
-rw-r--r--arch/x86/kernel/acpi/wakeup_rm.S12
-rw-r--r--arch/x86/kernel/alternative.c242
-rw-r--r--arch/x86/kernel/amd_gart_64.c (renamed from arch/x86/kernel/pci-gart_64.c)84
-rw-r--r--arch/x86/kernel/amd_iommu.c581
-rw-r--r--arch/x86/kernel/amd_iommu_init.c221
-rw-r--r--arch/x86/kernel/amd_nb.c255
-rw-r--r--arch/x86/kernel/apb_timer.c133
-rw-r--r--arch/x86/kernel/aperture_64.c150
-rw-r--r--arch/x86/kernel/apic/Makefile22
-rw-r--r--arch/x86/kernel/apic/apic.c554
-rw-r--r--arch/x86/kernel/apic/apic_flat_64.c30
-rw-r--r--arch/x86/kernel/apic/apic_noop.c17
-rw-r--r--arch/x86/kernel/apic/bigsmp_32.c45
-rw-r--r--arch/x86/kernel/apic/es7000_32.c46
-rw-r--r--arch/x86/kernel/apic/hw_nmi.c48
-rw-r--r--arch/x86/kernel/apic/io_apic.c1639
-rw-r--r--arch/x86/kernel/apic/ipi.c12
-rw-r--r--arch/x86/kernel/apic/nmi.c567
-rw-r--r--arch/x86/kernel/apic/numaq_32.c62
-rw-r--r--arch/x86/kernel/apic/probe_32.c120
-rw-r--r--arch/x86/kernel/apic/probe_64.c69
-rw-r--r--arch/x86/kernel/apic/summit_32.c50
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c224
-rw-r--r--arch/x86/kernel/apic/x2apic_phys.c117
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c219
-rw-r--r--arch/x86/kernel/apm_32.c35
-rw-r--r--arch/x86/kernel/asm-offsets.c65
-rw-r--r--arch/x86/kernel/asm-offsets_32.c71
-rw-r--r--arch/x86/kernel/asm-offsets_64.c90
-rw-r--r--arch/x86/kernel/check.c24
-rw-r--r--arch/x86/kernel/cpu/Makefile1
-rw-r--r--arch/x86/kernel/cpu/amd.c167
-rw-r--r--arch/x86/kernel/cpu/bugs.c1
-rw-r--r--arch/x86/kernel/cpu/common.c66
-rw-r--r--arch/x86/kernel/cpu/cpu.h1
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Kconfig266
-rw-r--r--arch/x86/kernel/cpu/cpufreq/Makefile21
-rw-r--r--arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c775
-rw-r--r--arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c446
-rw-r--r--arch/x86/kernel/cpu/cpufreq/e_powersaver.c367
-rw-r--r--arch/x86/kernel/cpu/cpufreq/elanfreq.c309
-rw-r--r--arch/x86/kernel/cpu/cpufreq/gx-suspmod.c517
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.c1029
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longhaul.h353
-rw-r--r--arch/x86/kernel/cpu/cpufreq/longrun.c327
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.c51
-rw-r--r--arch/x86/kernel/cpu/cpufreq/mperf.h9
-rw-r--r--arch/x86/kernel/cpu/cpufreq/p4-clockmod.c331
-rw-r--r--arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c626
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k6.c261
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.c752
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k7.h43
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.c1601
-rw-r--r--arch/x86/kernel/cpu/cpufreq/powernow-k8.h224
-rw-r--r--arch/x86/kernel/cpu/cpufreq/sc520_freq.c194
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c636
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-ich.c452
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.c481
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-lib.h49
-rw-r--r--arch/x86/kernel/cpu/cpufreq/speedstep-smi.c467
-rw-r--r--arch/x86/kernel/cpu/intel.c40
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c261
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-apei.c42
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-inject.c7
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c1
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c71
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c122
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c2
-rw-r--r--arch/x86/kernel/cpu/mcheck/therm_throt.c65
-rw-r--r--arch/x86/kernel/cpu/mtrr/cleanup.c2
-rw-r--r--arch/x86/kernel/cpu/mtrr/generic.c130
-rw-r--r--arch/x86/kernel/cpu/mtrr/main.c40
-rw-r--r--arch/x86/kernel/cpu/perf_event.c615
-rw-r--r--arch/x86/kernel/cpu/perf_event_amd.c229
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel.c537
-rw-r--r--arch/x86/kernel/cpu/perf_event_intel_ds.c312
-rw-r--r--arch/x86/kernel/cpu/perf_event_p4.c368
-rw-r--r--arch/x86/kernel/cpu/perf_event_p6.c4
-rw-r--r--arch/x86/kernel/cpu/perfctr-watchdog.c649
-rw-r--r--arch/x86/kernel/cpu/scattered.c6
-rw-r--r--arch/x86/kernel/cpu/vmware.c2
-rw-r--r--arch/x86/kernel/cpuid.c1
-rw-r--r--arch/x86/kernel/crash_dump_32.c5
-rw-r--r--arch/x86/kernel/crash_dump_64.c6
-rw-r--r--arch/x86/kernel/devicetree.c452
-rw-r--r--arch/x86/kernel/dumpstack.c63
-rw-r--r--arch/x86/kernel/dumpstack_32.c22
-rw-r--r--arch/x86/kernel/dumpstack_64.c26
-rw-r--r--arch/x86/kernel/e820.c211
-rw-r--r--arch/x86/kernel/early-quirks.c23
-rw-r--r--arch/x86/kernel/early_printk.c12
-rw-r--r--arch/x86/kernel/entry_32.S325
-rw-r--r--arch/x86/kernel/entry_64.S186
-rw-r--r--arch/x86/kernel/ftrace.c101
-rw-r--r--arch/x86/kernel/head.c3
-rw-r--r--arch/x86/kernel/head32.c20
-rw-r--r--arch/x86/kernel/head64.c10
-rw-r--r--arch/x86/kernel/head_32.S188
-rw-r--r--arch/x86/kernel/head_64.S3
-rw-r--r--arch/x86/kernel/hpet.c147
-rw-r--r--arch/x86/kernel/hw_breakpoint.c16
-rw-r--r--arch/x86/kernel/i387.c61
-rw-r--r--arch/x86/kernel/i8237.c30
-rw-r--r--arch/x86/kernel/i8253.c86
-rw-r--r--arch/x86/kernel/i8259.c98
-rw-r--r--arch/x86/kernel/ioport.c20
-rw-r--r--arch/x86/kernel/irq.c102
-rw-r--r--arch/x86/kernel/irq_32.c38
-rw-r--r--arch/x86/kernel/irq_work.c30
-rw-r--r--arch/x86/kernel/irqinit.c113
-rw-r--r--arch/x86/kernel/jump_label.c51
-rw-r--r--arch/x86/kernel/k8.c137
-rw-r--r--arch/x86/kernel/kdebugfs.c1
-rw-r--r--arch/x86/kernel/kgdb.c48
-rw-r--r--arch/x86/kernel/kprobes.c154
-rw-r--r--arch/x86/kernel/kvm.c317
-rw-r--r--arch/x86/kernel/kvmclock.c25
-rw-r--r--arch/x86/kernel/machine_kexec_64.c4
-rw-r--r--arch/x86/kernel/mca_32.c2
-rw-r--r--arch/x86/kernel/microcode_amd.c208
-rw-r--r--arch/x86/kernel/microcode_core.c44
-rw-r--r--arch/x86/kernel/microcode_intel.c18
-rw-r--r--arch/x86/kernel/mmconf-fam10h_64.c71
-rw-r--r--arch/x86/kernel/module.c21
-rw-r--r--arch/x86/kernel/mpparse.c139
-rw-r--r--arch/x86/kernel/mrst.c311
-rw-r--r--arch/x86/kernel/msr.c1
-rw-r--r--arch/x86/kernel/paravirt.c4
-rw-r--r--arch/x86/kernel/pci-calgary_64.c22
-rw-r--r--arch/x86/kernel/pci-dma.c110
-rw-r--r--arch/x86/kernel/pci-iommu_table.c79
-rw-r--r--arch/x86/kernel/pci-swiotlb.c44
-rw-r--r--arch/x86/kernel/pmtimer_64.c69
-rw-r--r--arch/x86/kernel/probe_roms.c (renamed from arch/x86/kernel/probe_roms_32.c)101
-rw-r--r--arch/x86/kernel/process.c105
-rw-r--r--arch/x86/kernel/process_32.c5
-rw-r--r--arch/x86/kernel/process_64.c15
-rw-r--r--arch/x86/kernel/ptrace.c57
-rw-r--r--arch/x86/kernel/pvclock.c46
-rw-r--r--arch/x86/kernel/quirks.c2
-rw-r--r--arch/x86/kernel/reboot.c204
-rw-r--r--arch/x86/kernel/reboot_32.S135
-rw-r--r--arch/x86/kernel/reboot_fixups_32.c16
-rw-r--r--arch/x86/kernel/resource.c48
-rw-r--r--arch/x86/kernel/rtc.c5
-rw-r--r--arch/x86/kernel/setup.c327
-rw-r--r--arch/x86/kernel/setup_percpu.c19
-rw-r--r--arch/x86/kernel/signal.c14
-rw-r--r--arch/x86/kernel/smp.c24
-rw-r--r--arch/x86/kernel/smpboot.c325
-rw-r--r--arch/x86/kernel/stacktrace.c17
-rw-r--r--arch/x86/kernel/step.c2
-rw-r--r--arch/x86/kernel/sys_i386_32.c4
-rw-r--r--arch/x86/kernel/syscall_table_32.S12
-rw-r--r--arch/x86/kernel/tboot.c3
-rw-r--r--arch/x86/kernel/test_nx.c2
-rw-r--r--arch/x86/kernel/time.c20
-rw-r--r--arch/x86/kernel/tlb_uv.c1655
-rw-r--r--arch/x86/kernel/topology.c2
-rw-r--r--arch/x86/kernel/trampoline.c62
-rw-r--r--arch/x86/kernel/trampoline_32.S15
-rw-r--r--arch/x86/kernel/trampoline_64.S30
-rw-r--r--arch/x86/kernel/traps.c167
-rw-r--r--arch/x86/kernel/tsc.c183
-rw-r--r--arch/x86/kernel/verify_cpu.S (renamed from arch/x86/kernel/verify_cpu_64.S)49
-rw-r--r--arch/x86/kernel/vm86_32.c11
-rw-r--r--arch/x86/kernel/vmi_32.c893
-rw-r--r--arch/x86/kernel/vmiclock_32.c317
-rw-r--r--arch/x86/kernel/vmlinux.lds.S95
-rw-r--r--arch/x86/kernel/vread_tsc_64.c36
-rw-r--r--arch/x86/kernel/vsyscall_64.c48
-rw-r--r--arch/x86/kernel/x8664_ksyms_64.c1
-rw-r--r--arch/x86/kernel/x86_init.c14
-rw-r--r--arch/x86/kernel/xsave.c5
-rw-r--r--arch/x86/kvm/Kconfig8
-rw-r--r--arch/x86/kvm/Makefile3
-rw-r--r--arch/x86/kvm/emulate.c3945
-rw-r--r--arch/x86/kvm/i8254.c11
-rw-r--r--arch/x86/kvm/i8254.h2
-rw-r--r--arch/x86/kvm/i8259.c48
-rw-r--r--arch/x86/kvm/irq.c9
-rw-r--r--arch/x86/kvm/irq.h4
-rw-r--r--arch/x86/kvm/kvm_cache_regs.h31
-rw-r--r--arch/x86/kvm/lapic.c30
-rw-r--r--arch/x86/kvm/lapic.h1
-rw-r--r--arch/x86/kvm/mmu.c1371
-rw-r--r--arch/x86/kvm/mmu.h9
-rw-r--r--arch/x86/kvm/mmu_audit.c304
-rw-r--r--arch/x86/kvm/mmutrace.h19
-rw-r--r--arch/x86/kvm/paging_tmpl.h410
-rw-r--r--arch/x86/kvm/svm.c1644
-rw-r--r--arch/x86/kvm/timer.c4
-rw-r--r--arch/x86/kvm/trace.h25
-rw-r--r--arch/x86/kvm/vmx.c724
-rw-r--r--arch/x86/kvm/x86.c1860
-rw-r--r--arch/x86/kvm/x86.h13
-rw-r--r--arch/x86/lguest/Kconfig1
-rw-r--r--arch/x86/lguest/boot.c49
-rw-r--r--arch/x86/lib/Makefile1
-rw-r--r--arch/x86/lib/atomic64_386_32.S6
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S6
-rw-r--r--arch/x86/lib/checksum_32.S63
-rw-r--r--arch/x86/lib/clear_page_64.S33
-rw-r--r--arch/x86/lib/cmpxchg16b_emu.S65
-rw-r--r--arch/x86/lib/copy_user_64.S71
-rw-r--r--arch/x86/lib/csum-copy_64.S242
-rw-r--r--arch/x86/lib/csum-partial_64.c2
-rw-r--r--arch/x86/lib/delay.c2
-rw-r--r--arch/x86/lib/memcpy_32.c199
-rw-r--r--arch/x86/lib/memcpy_64.S203
-rw-r--r--arch/x86/lib/memmove_64.S224
-rw-r--r--arch/x86/lib/memmove_64.c21
-rw-r--r--arch/x86/lib/memset_64.S54
-rw-r--r--arch/x86/lib/rwsem_64.S56
-rw-r--r--arch/x86/lib/semaphore_32.S38
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S27
-rw-r--r--arch/x86/mm/Makefile7
-rw-r--r--arch/x86/mm/amdtopology.c (renamed from arch/x86/mm/k8topology_64.c)122
-rw-r--r--arch/x86/mm/fault.c148
-rw-r--r--arch/x86/mm/gup.c28
-rw-r--r--arch/x86/mm/highmem_32.c76
-rw-r--r--arch/x86/mm/hugetlbpage.c6
-rw-r--r--arch/x86/mm/init.c85
-rw-r--r--arch/x86/mm/init_32.c202
-rw-r--r--arch/x86/mm/init_64.c239
-rw-r--r--arch/x86/mm/iomap_32.c43
-rw-r--r--arch/x86/mm/ioremap.c19
-rw-r--r--arch/x86/mm/kmemcheck/error.c2
-rw-r--r--arch/x86/mm/kmemcheck/kmemcheck.c2
-rw-r--r--arch/x86/mm/kmemcheck/opcode.c2
-rw-r--r--arch/x86/mm/memblock.c348
-rw-r--r--arch/x86/mm/memtest.c7
-rw-r--r--arch/x86/mm/numa.c773
-rw-r--r--arch/x86/mm/numa_32.c400
-rw-r--r--arch/x86/mm/numa_64.c882
-rw-r--r--arch/x86/mm/numa_emulation.c492
-rw-r--r--arch/x86/mm/numa_internal.h39
-rw-r--r--arch/x86/mm/pageattr.c45
-rw-r--r--arch/x86/mm/pf_in.c14
-rw-r--r--arch/x86/mm/pgtable.c104
-rw-r--r--arch/x86/mm/setup_nx.c2
-rw-r--r--arch/x86/mm/srat.c184
-rw-r--r--arch/x86/mm/srat_32.c285
-rw-r--r--arch/x86/mm/srat_64.c564
-rw-r--r--arch/x86/mm/tlb.c63
-rw-r--r--arch/x86/net/Makefile4
-rw-r--r--arch/x86/net/bpf_jit.S140
-rw-r--r--arch/x86/net/bpf_jit_comp.c654
-rw-r--r--arch/x86/oprofile/backtrace.c83
-rw-r--r--arch/x86/oprofile/nmi_int.c84
-rw-r--r--arch/x86/oprofile/nmi_timer_int.c5
-rw-r--r--arch/x86/oprofile/op_counter.h1
-rw-r--r--arch/x86/oprofile/op_model_amd.c356
-rw-r--r--arch/x86/oprofile/op_model_p4.c4
-rw-r--r--arch/x86/oprofile/op_model_ppro.c8
-rw-r--r--arch/x86/pci/Makefile2
-rw-r--r--arch/x86/pci/acpi.c103
-rw-r--r--arch/x86/pci/amd_bus.c35
-rw-r--r--arch/x86/pci/broadcom_bus.c11
-rw-r--r--arch/x86/pci/ce4100.c316
-rw-r--r--arch/x86/pci/common.c58
-rw-r--r--arch/x86/pci/direct.c17
-rw-r--r--arch/x86/pci/i386.c9
-rw-r--r--arch/x86/pci/irq.c21
-rw-r--r--arch/x86/pci/mmconfig-shared.c14
-rw-r--r--arch/x86/pci/olpc.c2
-rw-r--r--arch/x86/pci/pcbios.c23
-rw-r--r--arch/x86/pci/xen.c571
-rw-r--r--arch/x86/platform/Makefile10
-rw-r--r--arch/x86/platform/ce4100/Makefile1
-rw-r--r--arch/x86/platform/ce4100/ce4100.c146
-rw-r--r--arch/x86/platform/ce4100/falconfalls.dts430
-rw-r--r--arch/x86/platform/efi/Makefile1
-rw-r--r--arch/x86/platform/efi/efi.c (renamed from arch/x86/kernel/efi.c)152
-rw-r--r--arch/x86/platform/efi/efi_32.c (renamed from arch/x86/kernel/efi_32.c)0
-rw-r--r--arch/x86/platform/efi/efi_64.c (renamed from arch/x86/kernel/efi_64.c)37
-rw-r--r--arch/x86/platform/efi/efi_stub_32.S (renamed from arch/x86/kernel/efi_stub_32.S)0
-rw-r--r--arch/x86/platform/efi/efi_stub_64.S (renamed from arch/x86/kernel/efi_stub_64.S)0
-rw-r--r--arch/x86/platform/iris/Makefile1
-rw-r--r--arch/x86/platform/iris/iris.c91
-rw-r--r--arch/x86/platform/mrst/Makefile3
-rw-r--r--arch/x86/platform/mrst/early_printk_mrst.c319
-rw-r--r--arch/x86/platform/mrst/mrst.c811
-rw-r--r--arch/x86/platform/mrst/vrtc.c159
-rw-r--r--arch/x86/platform/olpc/Makefile2
-rw-r--r--arch/x86/platform/olpc/olpc-xo1.c146
-rw-r--r--arch/x86/platform/olpc/olpc.c (renamed from arch/x86/kernel/olpc.c)108
-rw-r--r--arch/x86/platform/olpc/olpc_dt.c201
-rw-r--r--arch/x86/platform/olpc/olpc_ofw.c (renamed from arch/x86/kernel/olpc_ofw.c)11
-rw-r--r--arch/x86/platform/scx200/Makefile2
-rw-r--r--arch/x86/platform/scx200/scx200_32.c (renamed from arch/x86/kernel/scx200_32.c)0
-rw-r--r--arch/x86/platform/sfi/Makefile1
-rw-r--r--arch/x86/platform/sfi/sfi.c (renamed from arch/x86/kernel/sfi.c)19
-rw-r--r--arch/x86/platform/uv/Makefile1
-rw-r--r--arch/x86/platform/uv/bios_uv.c (renamed from arch/x86/kernel/bios_uv.c)0
-rw-r--r--arch/x86/platform/uv/tlb_uv.c1857
-rw-r--r--arch/x86/platform/uv/uv_irq.c (renamed from arch/x86/kernel/uv_irq.c)57
-rw-r--r--arch/x86/platform/uv/uv_sysfs.c (renamed from arch/x86/kernel/uv_sysfs.c)0
-rw-r--r--arch/x86/platform/uv/uv_time.c (renamed from arch/x86/kernel/uv_time.c)26
-rw-r--r--arch/x86/platform/visws/Makefile1
-rw-r--r--arch/x86/platform/visws/visws_quirks.c (renamed from arch/x86/kernel/visws_quirks.c)154
-rw-r--r--arch/x86/vdso/Makefile21
-rw-r--r--arch/x86/vdso/vclock_gettime.c74
-rw-r--r--arch/x86/vdso/vdso.lds.S9
-rw-r--r--arch/x86/vdso/vdso32-setup.c15
-rw-r--r--arch/x86/vdso/vextern.h16
-rw-r--r--arch/x86/vdso/vgetcpu.c3
-rw-r--r--arch/x86/vdso/vma.c27
-rw-r--r--arch/x86/vdso/vvar.c12
-rw-r--r--arch/x86/xen/Kconfig32
-rw-r--r--arch/x86/xen/Makefile3
-rw-r--r--arch/x86/xen/debugfs.c1
-rw-r--r--arch/x86/xen/enlighten.c152
-rw-r--r--arch/x86/xen/irq.c4
-rw-r--r--arch/x86/xen/mmu.c870
-rw-r--r--arch/x86/xen/mmu.h38
-rw-r--r--arch/x86/xen/multicalls.c12
-rw-r--r--arch/x86/xen/multicalls.h2
-rw-r--r--arch/x86/xen/p2m.c859
-rw-r--r--arch/x86/xen/pci-swiotlb-xen.c11
-rw-r--r--arch/x86/xen/platform-pci-unplug.c2
-rw-r--r--arch/x86/xen/setup.c241
-rw-r--r--arch/x86/xen/smp.c90
-rw-r--r--arch/x86/xen/spinlock.c10
-rw-r--r--arch/x86/xen/suspend.c9
-rw-r--r--arch/x86/xen/time.c28
-rw-r--r--arch/x86/xen/xen-head.S4
-rw-r--r--arch/x86/xen/xen-ops.h9
511 files changed, 34874 insertions, 31621 deletions
diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild
index ad8ec356fb36..0e9dec6cadd1 100644
--- a/arch/x86/Kbuild
+++ b/arch/x86/Kbuild
@@ -14,3 +14,5 @@ obj-y += crypto/
 obj-y += vdso/
 obj-$(CONFIG_IA32_EMULATION) += ia32/
 
+obj-y += platform/
+obj-y += net/
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 5181ed3a211a..9f5e14388e17 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1,6 +1,3 @@
-# x86 configuration
-mainmenu "Linux Kernel Configuration for x86"
-
 # Select 32 or 64 bit
 config 64BIT
 	bool "64-bit kernel" if ARCH = "x86"
@@ -11,6 +8,7 @@ config 64BIT
 
 config X86_32
 	def_bool !64BIT
+	select CLKSRC_I8253
 
 config X86_64
 	def_bool 64BIT
@@ -19,20 +17,21 @@ config X86_64
 config X86
 	def_bool y
 	select HAVE_AOUT if X86_32
-	select HAVE_READQ
-	select HAVE_WRITEQ
 	select HAVE_UNSTABLE_SCHED_CLOCK
 	select HAVE_IDE
 	select HAVE_OPROFILE
-	select HAVE_PERF_EVENTS if (!M386 && !M486)
+	select HAVE_PERF_EVENTS
+	select HAVE_IRQ_WORK
 	select HAVE_IOREMAP_PROT
 	select HAVE_KPROBES
+	select HAVE_MEMBLOCK
 	select ARCH_WANT_OPTIONAL_GPIOLIB
 	select ARCH_WANT_FRAME_POINTERS
 	select HAVE_DMA_ATTRS
 	select HAVE_KRETPROBES
 	select HAVE_OPTPROBES
 	select HAVE_FTRACE_MCOUNT_RECORD
+	select HAVE_C_RECORDMCOUNT
 	select HAVE_DYNAMIC_FTRACE
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
@@ -51,6 +50,7 @@ config X86
 	select HAVE_KERNEL_GZIP
 	select HAVE_KERNEL_BZIP2
 	select HAVE_KERNEL_LZMA
+	select HAVE_KERNEL_XZ
 	select HAVE_KERNEL_LZO
 	select HAVE_HW_BREAKPOINT
 	select HAVE_MIXED_BREAKPOINTS_REGS
@@ -59,6 +59,17 @@ config X86
 	select ANON_INODES
 	select HAVE_ARCH_KMEMCHECK
 	select HAVE_USER_RETURN_NOTIFIER
+	select HAVE_ARCH_JUMP_LABEL
+	select HAVE_TEXT_POKE_SMP
+	select HAVE_GENERIC_HARDIRQS
+	select HAVE_SPARSE_IRQ
+	select GENERIC_FIND_FIRST_BIT
+	select GENERIC_IRQ_PROBE
+	select GENERIC_PENDING_IRQ if SMP
+	select GENERIC_IRQ_SHOW
+	select IRQ_FORCED_THREADING
+	select USE_GENERIC_SMP_HELPERS if SMP
+	select HAVE_BPF_JIT if (X86_64 && NET)
 
 config INSTRUCTION_DECODER
 	def_bool (KPROBES || PERF_EVENTS)
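Note: HAVE_BPF_JIT only advertises that the architecture can provide a BPF JIT; the JIT itself is built via the separate BPF_JIT option and stays disabled until switched on at run time. A minimal sketch, assuming a kernel built from this tree with CONFIG_BPF_JIT=y:

  # enable the x86-64 socket-filter JIT at run time
  $ sysctl -w net.core.bpf_jit_enable=1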
@@ -99,7 +110,14 @@ config MMU
 	def_bool y
 
 config ZONE_DMA
-	def_bool y
+	bool "DMA memory allocation support" if EXPERT
+	default y
+	help
+	  DMA memory allocation support allows devices with less than 32-bit
+	  addressing to allocate within the first 16MB of address space.
+	  Disable if no such devices will be used.
+
+	  If unsure, say Y.
 
 config SBUS
 	bool
@@ -111,7 +129,7 @@ config NEED_SG_DMA_LENGTH
 	def_bool y
 
 config GENERIC_ISA_DMA
-	def_bool y
+	def_bool ISA_DMA_API
 
 config GENERIC_IOMAP
 	def_bool y
@@ -131,7 +149,7 @@ config GENERIC_GPIO
 	bool
 
 config ARCH_MAY_HAVE_PC_FDC
-	def_bool y
+	def_bool ISA_DMA_API
 
 config RWSEM_GENERIC_SPINLOCK
 	def_bool !X86_XADD
@@ -193,31 +211,10 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
 config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 	def_bool y
 
-config HAVE_EARLY_RES
-	def_bool y
-
 config HAVE_INTEL_TXT
 	def_bool y
 	depends on EXPERIMENTAL && DMAR && ACPI
 
-# Use the generic interrupt handling code in kernel/irq/:
-config GENERIC_HARDIRQS
-	def_bool y
-
-config GENERIC_HARDIRQS_NO__DO_IRQ
-	def_bool y
-
-config GENERIC_IRQ_PROBE
-	def_bool y
-
-config GENERIC_PENDING_IRQ
-	def_bool y
-	depends on GENERIC_HARDIRQS && SMP
-
-config USE_GENERIC_SMP_HELPERS
-	def_bool y
-	depends on SMP
-
 config X86_32_SMP
 	def_bool y
 	depends on X86_32 && SMP
@@ -230,10 +227,6 @@ config X86_HT
 	def_bool y
 	depends on SMP
 
-config X86_TRAMPOLINE
-	def_bool y
-	depends on SMP || (64BIT && ACPI_SLEEP)
-
 config X86_32_LAZY_GS
 	def_bool y
 	depends on X86_32 && !CC_STACKPROTECTOR
@@ -296,23 +289,6 @@ config X86_X2APIC
 
 	  If you don't know what to do here, say N.
 
-config SPARSE_IRQ
-	bool "Support sparse irq numbering"
-	depends on PCI_MSI || HT_IRQ
-	---help---
-	  This enables support for sparse irqs. This is useful for distro
-	  kernels that want to define a high CONFIG_NR_CPUS value but still
-	  want to have low kernel memory footprint on smaller machines.
-
-	  ( Sparse IRQs can also be beneficial on NUMA boxes, as they spread
-	  out the irq_desc[] array in a more NUMA-friendly way. )
-
-	  If you don't know what to do here, say N.
-
-config NUMA_IRQ_DESC
-	def_bool y
-	depends on SPARSE_IRQ && NUMA
-
 config X86_MPPARSE
 	bool "Enable MPS table" if ACPI
 	default y
@@ -372,6 +348,7 @@ endif
 
 config X86_VSMP
 	bool "ScaleMP vSMP"
+	select PARAVIRT_GUEST
 	select PARAVIRT
 	depends on X86_64 && PCI
 	depends on X86_EXTENDED_PLATFORM
@@ -393,16 +370,19 @@ config X86_UV
 # Following is an alphabetically sorted list of 32 bit extended platforms
 # Please maintain the alphabetic order if and when there are additions
 
-config X86_ELAN
-	bool "AMD Elan"
+config X86_INTEL_CE
+	bool "CE4100 TV platform"
+	depends on PCI
+	depends on PCI_GODIRECT
 	depends on X86_32
 	depends on X86_EXTENDED_PLATFORM
+	select X86_REBOOTFIXUPS
+	select OF
+	select OF_EARLY_FLATTREE
 	---help---
-	  Select this for an AMD Elan processor.
-
-	  Do not use this option for K6/Athlon/Opteron processors!
-
-	  If unsure, choose "PC-compatible" instead.
+	  Select for the Intel CE media processor (CE4100) SOC.
+	  This option compiles in support for the CE4100 SOC for settop
+	  boxes and media devices.
 
 config X86_MRST
 	bool "Moorestown MID platform"
@@ -412,6 +392,10 @@ config X86_MRST
 	depends on X86_EXTENDED_PLATFORM
 	depends on X86_IO_APIC
 	select APB_TIMER
+	select I2C
+	select SPI
+	select INTEL_SCU_IPC
+	select X86_PLATFORM_DEVICES
 	---help---
 	  Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
 	  Internet Device(MID) platform. Moorestown consists of two chips:
@@ -493,6 +477,19 @@ config X86_ES7000
 	  Support for Unisys ES7000 systems. Say 'Y' here if this kernel is
 	  supposed to run on an IA32-based Unisys ES7000 system.
 
+config X86_32_IRIS
+	tristate "Eurobraille/Iris poweroff module"
+	depends on X86_32
+	---help---
+	  The Iris machines from EuroBraille do not have APM or ACPI support
+	  to shut themselves down properly. A special I/O sequence is
+	  needed to do so, which is what this module does at
+	  kernel shutdown.
+
+	  This is only for Iris machines from EuroBraille.
+
+	  If unused, say N.
+
 config SCHED_OMIT_FRAME_POINTER
 	def_bool y
 	prompt "Single-depth WCHAN output"
@@ -517,25 +514,6 @@ if PARAVIRT_GUEST
 
 source "arch/x86/xen/Kconfig"
 
-config VMI
-	bool "VMI Guest support (DEPRECATED)"
-	select PARAVIRT
-	depends on X86_32
-	---help---
-	  VMI provides a paravirtualized interface to the VMware ESX server
-	  (it could be used by other hypervisors in theory too, but is not
-	  at the moment), by linking the kernel to a GPL-ed ROM module
-	  provided by the hypervisor.
-
-	  As of September 2009, VMware has started a phased retirement
-	  of this feature from VMware's products. Please see
-	  feature-removal-schedule.txt for details. If you are
-	  planning to enable this option, please note that you cannot
-	  live migrate a VMI enabled VM to a future VMware product,
-	  which doesn't support VMI. So if you expect your kernel to
-	  seamlessly migrate to newer VMware products, keep this
-	  disabled.
-
 config KVM_CLOCK
 	bool "KVM paravirtualized clock"
 	select PARAVIRT
@@ -590,16 +568,7 @@ config PARAVIRT_DEBUG
 	  a paravirt_op is missing when it is called.
 
 config NO_BOOTMEM
-	default y
-	bool "Disable Bootmem code"
-	---help---
-	  Use early_res directly instead of bootmem before slab is ready.
-	  - allocator (buddy) [generic]
-	  - early allocator (bootmem) [generic]
-	  - very early allocator (reserve_early*()) [x86]
-	  - very very early allocator (early brk model) [x86]
-	  So reduce one layer between early allocator to final allocator
-
+	def_bool y
 
 config MEMTEST
 	bool "Memtest"
@@ -655,11 +624,11 @@ config APB_TIMER
 	  as it is off-chip. APB timers are always running regardless of CPU
 	  C states, they are used as per CPU clockevent device when possible.
 
-# Mark as embedded because too many people got it wrong.
+# Mark as expert because too many people got it wrong.
 # The code disables itself when not needed.
 config DMI
 	default y
-	bool "Enable DMI scanning" if EMBEDDED
+	bool "Enable DMI scanning" if EXPERT
 	---help---
 	  Enabled scanning of DMI to identify machine quirks. Say Y
 	  here unless you have verified that your setup is not
@@ -667,10 +636,10 @@ config DMI
 	  BIOS code.
 
 config GART_IOMMU
-	bool "GART IOMMU support" if EMBEDDED
+	bool "GART IOMMU support" if EXPERT
 	default y
 	select SWIOTLB
-	depends on X86_64 && PCI && K8_NB
+	depends on X86_64 && PCI && AMD_NB
 	---help---
 	  Support for full DMA access of devices with 32bit memory access only
 	  on systems with more than 3GB. This is usually needed for USB,
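Note: independent of this Kconfig choice, use of the GART IOMMU can still be steered from the kernel command line (see Documentation/x86/x86_64/boot-options.txt; the exact spellings below are illustrative):

  # fall back to software bounce buffering (swiotlb) instead of the GART
  iommu=soft
  # disable IOMMU usage altogether
  iommu=off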
@@ -715,6 +684,7 @@ config AMD_IOMMU
 	bool "AMD IOMMU support"
 	select SWIOTLB
 	select PCI_MSI
+	select PCI_IOV
 	depends on X86_64 && PCI && ACPI
 	---help---
 	  With this option you can enable support for AMD IOMMU hardware in
@@ -795,6 +765,17 @@ config SCHED_MC
 	  making when dealing with multi-core CPU chips at a cost of slightly
 	  increased overhead in some places. If unsure say N here.
 
+config IRQ_TIME_ACCOUNTING
+	bool "Fine granularity task level IRQ time accounting"
+	default n
+	---help---
+	  Select this option to enable fine granularity task irq time
+	  accounting. This is done by reading a timestamp on each
+	  transitions between softirq and hardirq state, so there can be a
+	  small performance impact.
+
+	  If in doubt, say N here.
+
 source "kernel/Kconfig.preempt"
 
 config X86_UP_APIC
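Note: a rough way to observe what IRQ_TIME_ACCOUNTING changes is the "cpu" line of /proc/stat, whose irq and softirq fields (the 6th and 7th values) are fed from these per-transition timestamps instead of tick sampling when the option is enabled:

  $ grep '^cpu ' /proc/stat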
@@ -828,7 +809,7 @@ config X86_LOCAL_APIC
 
 config X86_IO_APIC
 	def_bool y
-	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_APIC
+	depends on X86_64 || SMP || X86_32_NON_STANDARD || X86_UP_IOAPIC
 
 config X86_VISWS_APIC
 	def_bool y
@@ -906,7 +887,7 @@ config X86_THERMAL_VECTOR
 	depends on X86_MCE_INTEL
 
 config VM86
-	bool "Enable VM86 support" if EMBEDDED
+	bool "Enable VM86 support" if EXPERT
 	default y
 	depends on X86_32
 	---help---
@@ -933,6 +914,7 @@ config TOSHIBA
 
 config I8K
 	tristate "Dell laptop support"
+	select HWMON
 	---help---
 	  This adds a driver to safely access the System Management Mode
 	  of the CPU on the Dell Inspiron 8000. The System Management Mode
@@ -1090,7 +1072,7 @@ endchoice
 
 choice
 	depends on EXPERIMENTAL
-	prompt "Memory split" if EMBEDDED
+	prompt "Memory split" if EXPERT
 	default VMSPLIT_3G
 	depends on X86_32
 	---help---
@@ -1148,8 +1130,11 @@ config X86_PAE
 config ARCH_PHYS_ADDR_T_64BIT
 	def_bool X86_64 || X86_PAE
 
+config ARCH_DMA_ADDR_T_64BIT
+	def_bool X86_64 || HIGHMEM64G
+
 config DIRECT_GBPAGES
-	bool "Enable 1GB pages for kernel pagetables" if EMBEDDED
+	bool "Enable 1GB pages for kernel pagetables" if EXPERT
 	default y
 	depends on X86_64
 	---help---
@@ -1182,16 +1167,16 @@ config NUMA
 comment "NUMA (Summit) requires SMP, 64GB highmem support, ACPI"
 	depends on X86_32 && X86_SUMMIT && (!HIGHMEM64G || !ACPI)
 
-config K8_NUMA
+config AMD_NUMA
 	def_bool y
 	prompt "Old style AMD Opteron NUMA detection"
 	depends on X86_64 && NUMA && PCI
 	---help---
-	  Enable K8 NUMA node topology detection. You should say Y here if
-	  you have a multi processor AMD K8 system. This uses an old
-	  method to read the NUMA configuration directly from the builtin
-	  Northbridge of Opteron. It is recommended to use X86_64_ACPI_NUMA
-	  instead, which also takes priority if both are compiled in.
+	  Enable AMD NUMA node topology detection. You should say Y here if
+	  you have a multi processor AMD system. This uses an old method to
+	  read the NUMA configuration directly from the builtin Northbridge
+	  of Opteron. It is recommended to use X86_64_ACPI_NUMA instead,
+	  which also takes priority if both are compiled in.
 
 config X86_64_ACPI_NUMA
 	def_bool y
@@ -1212,7 +1197,7 @@ config NODES_SPAN_OTHER_NODES
 
 config NUMA_EMU
 	bool "NUMA emulation"
-	depends on X86_64 && NUMA
+	depends on NUMA
 	---help---
 	  Enable NUMA emulation. A flat machine will be split
 	  into virtual nodes when booted with "numa=fake=N", where N is the
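Note: the numa=fake= parameter mentioned in the help text is passed on the kernel command line; the node count below is arbitrary:

  # split a flat machine into 4 emulated NUMA nodes at boot
  numa=fake=4
  # after boot, the emulated nodes appear under sysfs
  $ ls /sys/devices/system/node/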
@@ -1234,6 +1219,10 @@ config HAVE_ARCH_BOOTMEM
 	def_bool y
 	depends on X86_32 && NUMA
 
+config HAVE_ARCH_ALLOC_REMAP
+	def_bool y
+	depends on X86_32 && NUMA
+
 config ARCH_HAVE_MEMORY_PRESENT
 	def_bool y
 	depends on X86_32 && DISCONTIGMEM
@@ -1242,13 +1231,9 @@ config NEED_NODE_MEMMAP_SIZE
 	def_bool y
 	depends on X86_32 && (DISCONTIGMEM || SPARSEMEM)
 
-config HAVE_ARCH_ALLOC_REMAP
-	def_bool y
-	depends on X86_32 && NUMA
-
 config ARCH_FLATMEM_ENABLE
 	def_bool y
-	depends on X86_32 && ARCH_SELECT_MEMORY_MODEL && !NUMA
+	depends on X86_32 && !NUMA
 
 config ARCH_DISCONTIGMEM_ENABLE
 	def_bool y
@@ -1258,20 +1243,16 @@ config ARCH_DISCONTIGMEM_DEFAULT
 	def_bool y
 	depends on NUMA && X86_32
 
-config ARCH_PROC_KCORE_TEXT
-	def_bool y
-	depends on X86_64 && PROC_KCORE
-
-config ARCH_SPARSEMEM_DEFAULT
-	def_bool y
-	depends on X86_64
-
 config ARCH_SPARSEMEM_ENABLE
 	def_bool y
 	depends on X86_64 || NUMA || (EXPERIMENTAL && X86_32) || X86_32_NON_STANDARD
 	select SPARSEMEM_STATIC if X86_32
 	select SPARSEMEM_VMEMMAP_ENABLE if X86_64
 
+config ARCH_SPARSEMEM_DEFAULT
+	def_bool y
+	depends on X86_64
+
 config ARCH_SELECT_MEMORY_MODEL
 	def_bool y
 	depends on ARCH_SPARSEMEM_ENABLE
@@ -1280,6 +1261,10 @@ config ARCH_MEMORY_PROBE
 	def_bool X86_64
 	depends on MEMORY_HOTPLUG
 
+config ARCH_PROC_KCORE_TEXT
+	def_bool y
+	depends on X86_64 && PROC_KCORE
+
 config ILLEGAL_POINTER_VALUE
 	hex
 	default 0 if X86_32
@@ -1326,25 +1311,34 @@ config X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK
 	  Set whether the default state of memory_corruption_check is
 	  on or off.
 
-config X86_RESERVE_LOW_64K
-	bool "Reserve low 64K of RAM on AMI/Phoenix BIOSen"
-	default y
+config X86_RESERVE_LOW
+	int "Amount of low memory, in kilobytes, to reserve for the BIOS"
+	default 64
+	range 4 640
 	---help---
-	  Reserve the first 64K of physical RAM on BIOSes that are known
-	  to potentially corrupt that memory range. A numbers of BIOSes are
-	  known to utilize this area during suspend/resume, so it must not
-	  be used by the kernel.
+	  Specify the amount of low memory to reserve for the BIOS.
+
+	  The first page contains BIOS data structures that the kernel
+	  must not use, so that page must always be reserved.
 
-	  Set this to N if you are absolutely sure that you trust the BIOS
-	  to get all its memory reservations and usages right.
+	  By default we reserve the first 64K of physical RAM, as a
+	  number of BIOSes are known to corrupt that memory range
+	  during events such as suspend/resume or monitor cable
+	  insertion, so it must not be used by the kernel.
 
-	  If you have doubts about the BIOS (e.g. suspend/resume does not
-	  work or there's kernel crashes after certain hardware hotplug
-	  events) and it's not AMI or Phoenix, then you might want to enable
-	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check typical
-	  corruption patterns.
+	  You can set this to 4 if you are absolutely sure that you
+	  trust the BIOS to get all its memory reservations and usages
+	  right. If you know your BIOS have problems beyond the
+	  default 64K area, you can set this to 640 to avoid using the
+	  entire low memory range.
 
-	  Say Y if unsure.
+	  If you have doubts about the BIOS (e.g. suspend/resume does
+	  not work or there's kernel crashes after certain hardware
+	  hotplug events) then you might want to enable
+	  X86_CHECK_BIOS_CORRUPTION=y to allow the kernel to check
+	  typical corruption patterns.
+
+	  Leave this to the default value of 64 if you are unsure.
 
 config MATH_EMULATION
 	bool
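Note: since X86_RESERVE_LOW is now an integer option, the reservation size in kilobytes lands directly in the build configuration; with the default above the following is expected to print CONFIG_X86_RESERVE_LOW=64 (assuming the usual .config in the build tree):

  $ grep CONFIG_X86_RESERVE_LOW .config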
@@ -1374,7 +1368,7 @@ config MATH_EMULATION
 
 config MTRR
 	def_bool y
-	prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
+	prompt "MTRR (Memory Type Range Register) support" if EXPERT
 	---help---
 	  On Intel P6 family processors (Pentium Pro, Pentium II and later)
 	  the Memory Type Range Registers (MTRRs) may be used to control
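Note: with CONFIG_MTRR=y the ranges currently programmed can be inspected at run time through procfs:

  $ cat /proc/mtrr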
@@ -1440,7 +1434,7 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
 
 config X86_PAT
 	def_bool y
-	prompt "x86 PAT support" if EMBEDDED
+	prompt "x86 PAT support" if EXPERT
 	depends on MTRR
 	---help---
 	  Use PAT attributes to setup page level cache control.
@@ -1544,7 +1538,7 @@ config KEXEC_JUMP
 	  code in physical address mode via KEXEC
 
 config PHYSICAL_START
-	hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP)
+	hex "Physical address where the kernel is loaded" if (EXPERT || CRASH_DUMP)
 	default "0x1000000"
 	---help---
 	  This gives the physical address where the kernel is loaded.
@@ -1705,12 +1699,8 @@ config ARCH_ENABLE_MEMORY_HOTREMOVE
 	def_bool y
 	depends on MEMORY_HOTPLUG
 
-config HAVE_ARCH_EARLY_PFN_TO_NID
-	def_bool X86_64
-	depends on NUMA
-
 config USE_PERCPU_NUMA_NODE_ID
-	def_bool X86_64
+	def_bool y
 	depends on NUMA
 
 menu "Power management and ACPI options"
@@ -1850,7 +1840,7 @@ config APM_ALLOW_INTS
 
 endif # APM
 
-source "arch/x86/kernel/cpu/cpufreq/Kconfig"
+source "drivers/cpufreq/Kconfig"
 
 source "drivers/cpuidle/Kconfig"
 
@@ -1900,7 +1890,7 @@ config PCI_GODIRECT
 	bool "Direct"
 
 config PCI_GOOLPC
-	bool "OLPC"
+	bool "OLPC XO-1"
 	depends on OLPC
 
 config PCI_GOANY
@@ -1925,6 +1915,11 @@ config PCI_OLPC
 	def_bool y
 	depends on PCI && OLPC && (PCI_GOOLPC || PCI_GOANY)
 
+config PCI_XEN
+	def_bool y
+	depends on PCI && XEN
+	select SWIOTLB_XEN
+
 config PCI_DOMAINS
 	def_bool y
 	depends on PCI
@@ -1934,13 +1929,19 @@ config PCI_MMCONFIG
 	depends on X86_64 && PCI && ACPI
 
 config PCI_CNB20LE_QUIRK
-	bool "Read CNB20LE Host Bridge Windows"
-	depends on PCI
+	bool "Read CNB20LE Host Bridge Windows" if EXPERT
+	default n
+	depends on PCI && EXPERIMENTAL
 	help
 	  Read the PCI windows out of the CNB20LE host bridge. This allows
 	  PCI hotplug to work on systems with the CNB20LE chipset which do
 	  not have ACPI.
 
+	  There's no public spec for this chipset, and this functionality
+	  is known to be incomplete.
+
+	  You should say N unless you know you need this.
+
 config DMAR
 	bool "Support for DMA Remapping Devices (EXPERIMENTAL)"
 	depends on PCI_MSI && ACPI && EXPERIMENTAL
@@ -1994,9 +1995,13 @@ source "drivers/pci/pcie/Kconfig"
 
 source "drivers/pci/Kconfig"
 
-# x86_64 have no ISA slots, but do have ISA-style DMA.
+# x86_64 have no ISA slots, but can have ISA-style DMA.
 config ISA_DMA_API
-	def_bool y
+	bool "ISA-style DMA support" if (X86_64 && EXPERT)
+	default y
+	help
+	  Enables ISA-style DMA support for devices requiring such controllers.
+	  If unsure, say Y.
 
 if X86_32
 
@@ -2060,23 +2065,23 @@ config SCx200HR_TIMER
 
 config OLPC
 	bool "One Laptop Per Child support"
+	depends on !X86_PAE
 	select GPIOLIB
+	select OF
+	select OF_PROMTREE
 	---help---
 	  Add support for detecting the unique features of the OLPC
 	  XO hardware.
 
-config OLPC_OPENFIRMWARE
-	bool "Support for OLPC's Open Firmware"
-	depends on !X86_64 && !X86_PAE
-	default y if OLPC
-	help
-	  This option adds support for the implementation of Open Firmware
-	  that is used on the OLPC XO-1 Children's Machine.
-	  If unsure, say N here.
+config OLPC_XO1
+	tristate "OLPC XO-1 support"
+	depends on OLPC && MFD_CS5535
+	---help---
+	  Add support for non-essential features of the OLPC XO-1 laptop.
 
 endif # X86_32
 
-config K8_NB
+config AMD_NB
 	def_bool y
 	depends on CPU_SUP_AMD && PCI
 
@@ -2084,6 +2089,16 @@ source "drivers/pcmcia/Kconfig"
 
 source "drivers/pci/hotplug/Kconfig"
 
+config RAPIDIO
+	bool "RapidIO support"
+	depends on PCI
+	default n
+	help
+	  If you say Y here, the kernel will include drivers and
+	  infrastructure code to support RapidIO interconnect devices.
+
+source "drivers/rapidio/Kconfig"
+
 endmenu
 
 
@@ -2118,6 +2133,11 @@ config SYSVIPC_COMPAT
 	def_bool y
 	depends on COMPAT && SYSVIPC
 
+config KEYS_COMPAT
+	bool
+	depends on COMPAT && KEYS
+	default y
+
 endmenu
 
 
@@ -2125,6 +2145,10 @@ config HAVE_ATOMIC_IOMAP
 	def_bool y
 	depends on X86_32
 
+config HAVE_TEXT_POKE_SMP
+	bool
+	select STOP_MACHINE if SMP
+
 source "net/Kconfig"
 
 source "drivers/Kconfig"
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2ac9069890cd..6a7cfdf8ff69 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -1,6 +1,4 @@
 # Put here option for CPU selection and depending optimization
-if !X86_ELAN
-
 choice
 	prompt "Processor family"
 	default M686 if X86_32
@@ -203,6 +201,14 @@ config MWINCHIP3D
 	  stores for this CPU, which can increase performance of some
 	  operations.
 
+config MELAN
+	bool "AMD Elan"
+	depends on X86_32
+	---help---
+	  Select this for an AMD Elan processor.
+
+	  Do not use this option for K6/Athlon/Opteron processors!
+
 config MGEODEGX1
 	bool "GeodeGX1"
 	depends on X86_32
@@ -292,13 +298,6 @@ config X86_GENERIC
 	  This is really intended for distributors who need more
 	  generic optimizations.
 
-endif
-
-config X86_CPU
-	def_bool y
-	select GENERIC_FIND_FIRST_BIT
-	select GENERIC_FIND_NEXT_BIT
-
 #
 # Define implied options from the CPU selection here
 config X86_INTERNODE_CACHE_SHIFT
@@ -310,11 +309,14 @@ config X86_INTERNODE_CACHE_SHIFT
 config X86_CMPXCHG
 	def_bool X86_64 || (X86_32 && !M386)
 
+config CMPXCHG_LOCAL
+	def_bool X86_64 || (X86_32 && !M386)
+
 config X86_L1_CACHE_SHIFT
 	int
 	default "7" if MPENTIUM4 || MPSC
 	default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
-	default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
+	default "4" if MELAN || M486 || M386 || MGEODEGX1
 	default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
 
 config X86_XADD
@@ -328,7 +330,7 @@ config X86_PPRO_FENCE
 	  Old PentiumPro multiprocessor systems had errata that could cause
 	  memory operations to violate the x86 ordering standard in rare cases.
 	  Enabling this option will attempt to work around some (but not all)
-	  occurances of this problem, at the cost of much heavier spinlock and
+	  occurrences of this problem, at the cost of much heavier spinlock and
 	  memory barrier operations.
 
 	  If unsure, say n here. Even distro kernels should think twice before
@@ -360,7 +362,7 @@ config X86_POPAD_OK
 
 config X86_ALIGNMENT_16
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || X86_ELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486 || MVIAC3_2 || MGEODEGX1
 
 config X86_INTEL_USERCOPY
 	def_bool y
@@ -368,7 +370,7 @@ config X86_INTEL_USERCOPY
 
 config X86_USE_PPRO_CHECKSUM
 	def_bool y
-	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
+	depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM
 
 config X86_USE_3DNOW
 	def_bool y
@@ -421,7 +423,7 @@ config X86_DEBUGCTLMSR
 	depends on !(MK6 || MWINCHIPC6 || MWINCHIP3D || MCYRIXIII || M586MMX || M586TSC || M586 || M486 || M386) && !UML
 
 menuconfig PROCESSOR_SELECT
-	bool "Supported processor vendors" if EMBEDDED
+	bool "Supported processor vendors" if EXPERT
 	---help---
 	  This lets you choose what x86 vendor support code your kernel
 	  will include.
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 75085080b63e..c0f8a5c88910 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -31,7 +31,7 @@ config X86_VERBOSE_BOOTUP
31 see errors. Disable this if you want silent bootup. 31 see errors. Disable this if you want silent bootup.
32 32
33config EARLY_PRINTK 33config EARLY_PRINTK
34 bool "Early printk" if EMBEDDED 34 bool "Early printk" if EXPERT
35 default y 35 default y
36 ---help--- 36 ---help---
37 Write kernel log output directly into the VGA buffer or to a serial 37 Write kernel log output directly into the VGA buffer or to a serial
@@ -43,6 +43,10 @@ config EARLY_PRINTK
 43 with klogd/syslogd or the X server. You should normally say N here, 43 with klogd/syslogd or the X server. You should normally say N here,
44 unless you want to debug such a crash. 44 unless you want to debug such a crash.
45 45
46config EARLY_PRINTK_MRST
47 bool "Early printk for MRST platform support"
48 depends on EARLY_PRINTK && X86_MRST
49
46config EARLY_PRINTK_DBGP 50config EARLY_PRINTK_DBGP
47 bool "Early printk via EHCI debug port" 51 bool "Early printk via EHCI debug port"
48 depends on EARLY_PRINTK && PCI 52 depends on EARLY_PRINTK && PCI
@@ -62,26 +66,6 @@ config DEBUG_STACKOVERFLOW
62 This option will cause messages to be printed if free stack space 66 This option will cause messages to be printed if free stack space
63 drops below a certain limit. 67 drops below a certain limit.
64 68
65config DEBUG_STACK_USAGE
66 bool "Stack utilization instrumentation"
67 depends on DEBUG_KERNEL
68 ---help---
69 Enables the display of the minimum amount of free stack which each
70 task has ever had available in the sysrq-T and sysrq-P debug output.
71
72 This option will slow down process creation somewhat.
73
74config DEBUG_PER_CPU_MAPS
75 bool "Debug access to per_cpu maps"
76 depends on DEBUG_KERNEL
77 depends on SMP
78 ---help---
79 Say Y to verify that the per_cpu map being accessed has
80 been setup. Adds a fair amount of code to kernel memory
81 and decreases performance.
82
83 Say N if unsure.
84
85config X86_PTDUMP 69config X86_PTDUMP
86 bool "Export kernel pagetable layout to userspace via debugfs" 70 bool "Export kernel pagetable layout to userspace via debugfs"
87 depends on DEBUG_KERNEL 71 depends on DEBUG_KERNEL
@@ -113,6 +97,17 @@ config DEBUG_RODATA_TEST
113 feature as well as for the change_page_attr() infrastructure. 97 feature as well as for the change_page_attr() infrastructure.
114 If in doubt, say "N" 98 If in doubt, say "N"
115 99
100config DEBUG_SET_MODULE_RONX
101 bool "Set loadable kernel module data as NX and text as RO"
102 depends on MODULES
103 ---help---
104 This option helps catch unintended modifications to loadable
105 kernel modules' text and read-only data. It also prevents execution
106 of module data. Such protection may interfere with run-time code
107 patching and dynamic kernel tracing, but it may also protect
108 against certain classes of kernel exploits.
109 If in doubt, say "N".
110
116config DEBUG_NX_TEST 111config DEBUG_NX_TEST
117 tristate "Testcase for the NX non-executable stack feature" 112 tristate "Testcase for the NX non-executable stack feature"
118 depends on DEBUG_KERNEL && m 113 depends on DEBUG_KERNEL && m
@@ -121,19 +116,9 @@ config DEBUG_NX_TEST
121 and the software setup of this feature. 116 and the software setup of this feature.
122 If in doubt, say "N" 117 If in doubt, say "N"
123 118
124config 4KSTACKS
125 bool "Use 4Kb for kernel stacks instead of 8Kb"
126 depends on X86_32
127 ---help---
128 If you say Y here the kernel will use a 4Kb stacksize for the
129 kernel stack attached to each process/thread. This facilitates
130 running more threads on a system and also reduces the pressure
131 on the VM subsystem for higher order allocations. This option
132 will also use IRQ stacks to compensate for the reduced stackspace.
133
134config DOUBLEFAULT 119config DOUBLEFAULT
135 default y 120 default y
136 bool "Enable doublefault exception handler" if EMBEDDED 121 bool "Enable doublefault exception handler" if EXPERT
137 depends on X86_32 122 depends on X86_32
138 ---help--- 123 ---help---
139 This option allows trapping of rare doublefault exceptions that 124 This option allows trapping of rare doublefault exceptions that
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index e8c8881351b3..b02e509072a7 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -96,8 +96,12 @@ cfi := $(call as-instr,.cfi_startproc\n.cfi_rel_offset $(sp-y)$(comma)0\n.cfi_en
96# is .cfi_signal_frame supported too? 96# is .cfi_signal_frame supported too?
97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1) 97cfi-sigframe := $(call as-instr,.cfi_startproc\n.cfi_signal_frame\n.cfi_endproc,-DCONFIG_AS_CFI_SIGNAL_FRAME=1)
98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1) 98cfi-sections := $(call as-instr,.cfi_sections .debug_frame,-DCONFIG_AS_CFI_SECTIONS=1)
99KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 99
100KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) 100# does binutils support specific instructions?
101asinstr := $(call as-instr,fxsaveq (%rax),-DCONFIG_AS_FXSAVEQ=1)
102
103KBUILD_AFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
104KBUILD_CFLAGS += $(cfi) $(cfi-sigframe) $(cfi-sections) $(asinstr)
101 105
102LDFLAGS := -m elf_$(UTS_MACHINE) 106LDFLAGS := -m elf_$(UTS_MACHINE)
103 107
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 1255d953c65d..86cee7b749e1 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -37,7 +37,7 @@ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=
37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) 37 $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic))
38 38
39# AMD Elan support 39# AMD Elan support
40cflags-$(CONFIG_X86_ELAN) += -march=i486 40cflags-$(CONFIG_MELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
@@ -51,7 +51,18 @@ cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph 51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
52# tracer assumptions. For i686, generic, core2 this is set by the 52# tracer assumptions. For i686, generic, core2 this is set by the
53# compiler anyway 53# compiler anyway
54cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args) 54ifeq ($(CONFIG_FUNCTION_GRAPH_TRACER), y)
55ADD_ACCUMULATE_OUTGOING_ARGS := y
56endif
57
58# Work around a bug in the first gcc implementations of asm goto that
59# causes gcc to mess up the push and pop of the stack in some
60# uses of asm goto.
61ifeq ($(CONFIG_JUMP_LABEL), y)
62ADD_ACCUMULATE_OUTGOING_ARGS := y
63endif
64
65cflags-$(ADD_ACCUMULATE_OUTGOING_ARGS) += $(call cc-option,-maccumulate-outgoing-args)
55 66
56# Bug fix for binutils: this option is required in order to keep 67# Bug fix for binutils: this option is required in order to keep
57# binutils from generating NOPL instructions against our will. 68# binutils from generating NOPL instructions against our will.
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 0c229551eead..09664efb9cee 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,7 +4,7 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.xz vmlinux.bin.lzo head_$(BITS).o misc.o string.o cmdline.o early_serial_console.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
@@ -49,12 +49,15 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
49 $(call if_changed,bzip2) 49 $(call if_changed,bzip2)
50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE 50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
51 $(call if_changed,lzma) 51 $(call if_changed,lzma)
52$(obj)/vmlinux.bin.xz: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,xzkern)
52$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE 54$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,lzo) 55 $(call if_changed,lzo)
54 56
55suffix-$(CONFIG_KERNEL_GZIP) := gz 57suffix-$(CONFIG_KERNEL_GZIP) := gz
56suffix-$(CONFIG_KERNEL_BZIP2) := bz2 58suffix-$(CONFIG_KERNEL_BZIP2) := bz2
57suffix-$(CONFIG_KERNEL_LZMA) := lzma 59suffix-$(CONFIG_KERNEL_LZMA) := lzma
60suffix-$(CONFIG_KERNEL_XZ) := xz
58suffix-$(CONFIG_KERNEL_LZO) := lzo 61suffix-$(CONFIG_KERNEL_LZO) := lzo
59 62
60quiet_cmd_mkpiggy = MKPIGGY $@ 63quiet_cmd_mkpiggy = MKPIGGY $@
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 52f85a196fa0..35af09d13dc1 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -182,7 +182,7 @@ no_longmode:
182 hlt 182 hlt
183 jmp 1b 183 jmp 1b
184 184
185#include "../../kernel/verify_cpu_64.S" 185#include "../../kernel/verify_cpu.S"
186 186
187 /* 187 /*
188 * Be careful here startup_64 needs to be at a predictable 188 * Be careful here startup_64 needs to be at a predictable
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 8f7bef8e9fff..3a19d04cebeb 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -139,6 +139,10 @@ static int lines, cols;
139#include "../../../../lib/decompress_unlzma.c" 139#include "../../../../lib/decompress_unlzma.c"
140#endif 140#endif
141 141
142#ifdef CONFIG_KERNEL_XZ
143#include "../../../../lib/decompress_unxz.c"
144#endif
145
142#ifdef CONFIG_KERNEL_LZO 146#ifdef CONFIG_KERNEL_LZO
143#include "../../../../lib/decompress_unlzo.c" 147#include "../../../../lib/decompress_unlzo.c"
144#endif 148#endif
@@ -229,18 +233,35 @@ void *memset(void *s, int c, size_t n)
229 ss[i] = c; 233 ss[i] = c;
230 return s; 234 return s;
231} 235}
232 236#ifdef CONFIG_X86_32
233void *memcpy(void *dest, const void *src, size_t n) 237void *memcpy(void *dest, const void *src, size_t n)
234{ 238{
235 int i; 239 int d0, d1, d2;
236 const char *s = src; 240 asm volatile(
237 char *d = dest; 241 "rep ; movsl\n\t"
242 "movl %4,%%ecx\n\t"
243 "rep ; movsb\n\t"
244 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
245 : "0" (n >> 2), "g" (n & 3), "1" (dest), "2" (src)
246 : "memory");
238 247
239 for (i = 0; i < n; i++)
240 d[i] = s[i];
241 return dest; 248 return dest;
242} 249}
250#else
251void *memcpy(void *dest, const void *src, size_t n)
252{
253 long d0, d1, d2;
254 asm volatile(
255 "rep ; movsq\n\t"
256 "movq %4,%%rcx\n\t"
257 "rep ; movsb\n\t"
258 : "=&c" (d0), "=&D" (d1), "=&S" (d2)
259 : "0" (n >> 3), "g" (n & 7), "1" (dest), "2" (src)
260 : "memory");
243 261
262 return dest;
263}
264#endif
244 265
245static void error(char *x) 266static void error(char *x)
246{ 267{
@@ -338,7 +359,7 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap,
338 if (heap > 0x3fffffffffffUL) 359 if (heap > 0x3fffffffffffUL)
339 error("Destination address too large"); 360 error("Destination address too large");
340#else 361#else
341 if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) 362 if (heap > ((-__PAGE_OFFSET-(128<<20)-1) & 0x7fffffff))
342 error("Destination address too large"); 363 error("Destination address too large");
343#endif 364#endif
344#ifndef CONFIG_RELOCATABLE 365#ifndef CONFIG_RELOCATABLE
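
The new memcpy above replaces the old byte-at-a-time loop with string moves: the bulk of the buffer is copied one machine word at a time (n>>2 dwords via rep movsl on 32-bit, n>>3 qwords via rep movsq on 64-bit) and the leftover n&3 or n&7 bytes are finished with rep movsb. A minimal stand-alone C sketch of the same word-then-tail split, for illustration only (this is not the boot stub's code):

	#include <stddef.h>
	#include <stdio.h>
	#include <string.h>

	/* Copy whole machine words first, then the remaining tail bytes,
	 * mirroring the "rep movsq; rep movsb" (or movsl/movsb) pair above. */
	static void *memcpy_words(void *dest, const void *src, size_t n)
	{
		unsigned long *d = dest;
		const unsigned long *s = src;
		size_t words = n / sizeof(unsigned long);	/* n >> 3 on 64-bit */
		size_t tail  = n % sizeof(unsigned long);	/* n &  7 on 64-bit */
		unsigned char *dc;
		const unsigned char *sc;

		while (words--)
			*d++ = *s++;
		dc = (unsigned char *)d;
		sc = (const unsigned char *)s;
		while (tail--)
			*dc++ = *sc++;
		return dest;
	}

	int main(void)
	{
		unsigned long a[4], b[4];

		memcpy(a, "word-then-tail copy demo", 25);
		memcpy_words(b, a, 25);
		printf("%s\n", (char *)b);
		return 0;
	}
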
diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c
index 5c228129d175..46a823882437 100644
--- a/arch/x86/boot/compressed/mkpiggy.c
+++ b/arch/x86/boot/compressed/mkpiggy.c
@@ -62,7 +62,12 @@ int main(int argc, char *argv[])
62 if (fseek(f, -4L, SEEK_END)) { 62 if (fseek(f, -4L, SEEK_END)) {
63 perror(argv[1]); 63 perror(argv[1]);
64 } 64 }
65 fread(&olen, sizeof olen, 1, f); 65
66 if (fread(&olen, sizeof(olen), 1, f) != 1) {
67 perror(argv[1]);
68 return 1;
69 }
70
66 ilen = ftell(f); 71 ilen = ftell(f);
67 olen = getle32(&olen); 72 olen = getle32(&olen);
68 fclose(f); 73 fclose(f);
@@ -74,7 +79,7 @@ int main(int argc, char *argv[])
74 79
75 offs = (olen > ilen) ? olen - ilen : 0; 80 offs = (olen > ilen) ? olen - ilen : 0;
76 offs += olen >> 12; /* Add 8 bytes for each 32K block */ 81 offs += olen >> 12; /* Add 8 bytes for each 32K block */
77 offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ 82 offs += 64*1024 + 128; /* Add 64K + 128 bytes slack */
78 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ 83 offs = (offs+4095) & ~4095; /* Round to a 4K boundary */
79 84
80 printf(".section \".rodata..compressed\",\"a\",@progbits\n"); 85 printf(".section \".rodata..compressed\",\"a\",@progbits\n");
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index cae3feb1035e..db75d07c3645 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -91,7 +91,7 @@ static int detect_memory_e801(void)
91 if (oreg.ax > 15*1024) { 91 if (oreg.ax > 15*1024) {
92 return -1; /* Bogus! */ 92 return -1; /* Bogus! */
93 } else if (oreg.ax == 15*1024) { 93 } else if (oreg.ax == 15*1024) {
94 boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; 94 boot_params.alt_mem_k = (oreg.bx << 6) + oreg.ax;
95 } else { 95 } else {
96 /* 96 /*
97 * This ignores memory above 16MB if we have a memory 97 * This ignores memory above 16MB if we have a memory
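
The one-line fix above matters because INT 0x15, AX=0xE801 conventionally reports memory in two pieces: AX holds the KiB between 1 MiB and 16 MiB (capped at 15*1024) and BX holds the number of 64 KiB blocks above 16 MiB, with CX/DX at best duplicating AX/BX. Shifting BX left by 6 converts 64 KiB blocks to KiB. A tiny C illustration with made-up register values for a 256 MiB machine:

	#include <stdio.h>

	int main(void)
	{
		unsigned short ax = 15 * 1024;	/* 15 MiB between 1 MiB and 16 MiB, in KiB */
		unsigned short bx = 240 * 16;	/* 240 MiB above 16 MiB, in 64 KiB blocks  */
		unsigned long alt_mem_k;

		alt_mem_k = ((unsigned long)bx << 6) + ax;	/* 64K blocks -> KiB, plus the low range */
		printf("alt_mem_k = %lu KiB\n", alt_mem_k);	/* 245760 + 15360 = 261120 KiB */
		return 0;
	}
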
diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig
index 6f9872658dd2..2bf18059fbea 100644
--- a/arch/x86/configs/i386_defconfig
+++ b/arch/x86/configs/i386_defconfig
@@ -10,7 +10,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
10CONFIG_AUDIT=y 10CONFIG_AUDIT=y
11CONFIG_LOG_BUF_SHIFT=18 11CONFIG_LOG_BUF_SHIFT=18
12CONFIG_CGROUPS=y 12CONFIG_CGROUPS=y
13CONFIG_CGROUP_NS=y
14CONFIG_CGROUP_FREEZER=y 13CONFIG_CGROUP_FREEZER=y
15CONFIG_CPUSETS=y 14CONFIG_CPUSETS=y
16CONFIG_CGROUP_CPUACCT=y 15CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index ee01a9d5d4f0..22a0dc8e51dd 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -11,7 +11,6 @@ CONFIG_TASK_IO_ACCOUNTING=y
11CONFIG_AUDIT=y 11CONFIG_AUDIT=y
12CONFIG_LOG_BUF_SHIFT=18 12CONFIG_LOG_BUF_SHIFT=18
13CONFIG_CGROUPS=y 13CONFIG_CGROUPS=y
14CONFIG_CGROUP_NS=y
15CONFIG_CGROUP_FREEZER=y 14CONFIG_CGROUP_FREEZER=y
16CONFIG_CPUSETS=y 15CONFIG_CPUSETS=y
17CONFIG_CGROUP_CPUACCT=y 16CONFIG_CGROUP_CPUACCT=y
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index 1a58ad89fdf7..c04f1b7a9139 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -2,8 +2,6 @@
2# Arch-specific CryptoAPI modules. 2# Arch-specific CryptoAPI modules.
3# 3#
4 4
5obj-$(CONFIG_CRYPTO_FPU) += fpu.o
6
7obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o 5obj-$(CONFIG_CRYPTO_AES_586) += aes-i586.o
8obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o 6obj-$(CONFIG_CRYPTO_TWOFISH_586) += twofish-i586.o
9obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o 7obj-$(CONFIG_CRYPTO_SALSA20_586) += salsa20-i586.o
@@ -24,6 +22,6 @@ aes-x86_64-y := aes-x86_64-asm_64.o aes_glue.o
24twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o 22twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 23salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
26 24
27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 25aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o fpu.o
28 26
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o 27ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index ff16756a51c1..be6d9e365a80 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -9,6 +9,20 @@
9 * Vinodh Gopal <vinodh.gopal@intel.com> 9 * Vinodh Gopal <vinodh.gopal@intel.com>
10 * Kahraman Akdemir 10 * Kahraman Akdemir
11 * 11 *
12 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
13 * interface for 64-bit kernels.
14 * Authors: Erdinc Ozturk (erdinc.ozturk@intel.com)
15 * Aidan O'Mahony (aidan.o.mahony@intel.com)
16 * Adrian Hoban <adrian.hoban@intel.com>
17 * James Guilford (james.guilford@intel.com)
18 * Gabriele Paoloni <gabriele.paoloni@intel.com>
19 * Tadeusz Struk (tadeusz.struk@intel.com)
20 * Wajdi Feghali (wajdi.k.feghali@intel.com)
21 * Copyright (c) 2010, Intel Corporation.
22 *
23 * Ported x86_64 version to x86:
24 * Author: Mathias Krause <minipli@googlemail.com>
25 *
12 * This program is free software; you can redistribute it and/or modify 26 * This program is free software; you can redistribute it and/or modify
13 * it under the terms of the GNU General Public License as published by 27 * it under the terms of the GNU General Public License as published by
14 * the Free Software Foundation; either version 2 of the License, or 28 * the Free Software Foundation; either version 2 of the License, or
@@ -18,8 +32,62 @@
18#include <linux/linkage.h> 32#include <linux/linkage.h>
19#include <asm/inst.h> 33#include <asm/inst.h>
20 34
35#ifdef __x86_64__
36.data
37POLY: .octa 0xC2000000000000000000000000000001
38TWOONE: .octa 0x00000001000000000000000000000001
39
40# order of these constants should not change.
41# more specifically, ALL_F should follow SHIFT_MASK,
42# and ZERO should follow ALL_F
43
44SHUF_MASK: .octa 0x000102030405060708090A0B0C0D0E0F
45MASK1: .octa 0x0000000000000000ffffffffffffffff
46MASK2: .octa 0xffffffffffffffff0000000000000000
47SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
48ALL_F: .octa 0xffffffffffffffffffffffffffffffff
49ZERO: .octa 0x00000000000000000000000000000000
50ONE: .octa 0x00000000000000000000000000000001
51F_MIN_MASK: .octa 0xf1f2f3f4f5f6f7f8f9fafbfcfdfeff0
52dec: .octa 0x1
53enc: .octa 0x2
54
55
21.text 56.text
22 57
58
59#define STACK_OFFSET 8*3
60#define HashKey 16*0 // store HashKey <<1 mod poly here
61#define HashKey_2 16*1 // store HashKey^2 <<1 mod poly here
62#define HashKey_3 16*2 // store HashKey^3 <<1 mod poly here
63#define HashKey_4 16*3 // store HashKey^4 <<1 mod poly here
64#define HashKey_k 16*4 // store XOR of High 64 bits and Low 64
65 // bits of HashKey <<1 mod poly here
66 //(for Karatsuba purposes)
67#define HashKey_2_k 16*5 // store XOR of High 64 bits and Low 64
68 // bits of HashKey^2 <<1 mod poly here
69 // (for Karatsuba purposes)
70#define HashKey_3_k 16*6 // store XOR of High 64 bits and Low 64
71 // bits of HashKey^3 <<1 mod poly here
72 // (for Karatsuba purposes)
73#define HashKey_4_k 16*7 // store XOR of High 64 bits and Low 64
74 // bits of HashKey^4 <<1 mod poly here
75 // (for Karatsuba purposes)
76#define VARIABLE_OFFSET 16*8
77
78#define arg1 rdi
79#define arg2 rsi
80#define arg3 rdx
81#define arg4 rcx
82#define arg5 r8
83#define arg6 r9
84#define arg7 STACK_OFFSET+8(%r14)
85#define arg8 STACK_OFFSET+16(%r14)
86#define arg9 STACK_OFFSET+24(%r14)
87#define arg10 STACK_OFFSET+32(%r14)
88#endif
89
90
23#define STATE1 %xmm0 91#define STATE1 %xmm0
24#define STATE2 %xmm4 92#define STATE2 %xmm4
25#define STATE3 %xmm5 93#define STATE3 %xmm5
@@ -32,12 +100,16 @@
32#define IN IN1 100#define IN IN1
33#define KEY %xmm2 101#define KEY %xmm2
34#define IV %xmm3 102#define IV %xmm3
103
35#define BSWAP_MASK %xmm10 104#define BSWAP_MASK %xmm10
36#define CTR %xmm11 105#define CTR %xmm11
37#define INC %xmm12 106#define INC %xmm12
38 107
108#ifdef __x86_64__
109#define AREG %rax
39#define KEYP %rdi 110#define KEYP %rdi
40#define OUTP %rsi 111#define OUTP %rsi
112#define UKEYP OUTP
41#define INP %rdx 113#define INP %rdx
42#define LEN %rcx 114#define LEN %rcx
43#define IVP %r8 115#define IVP %r8
@@ -46,6 +118,1591 @@
46#define TKEYP T1 118#define TKEYP T1
47#define T2 %r11 119#define T2 %r11
48#define TCTR_LOW T2 120#define TCTR_LOW T2
121#else
122#define AREG %eax
123#define KEYP %edi
124#define OUTP AREG
125#define UKEYP OUTP
126#define INP %edx
127#define LEN %esi
128#define IVP %ebp
129#define KLEN %ebx
130#define T1 %ecx
131#define TKEYP T1
132#endif
133
134
135#ifdef __x86_64__
136/* GHASH_MUL MACRO to implement: Data*HashKey mod (128,127,126,121,0)
137*
138*
139* Input: A and B (128-bits each, bit-reflected)
140* Output: C = A*B*x mod poly, (i.e. >>1 )
141* To compute GH = GH*HashKey mod poly, give HK = HashKey<<1 mod poly as input
142* GH = GH * HK * x mod poly which is equivalent to GH*HashKey mod poly.
143*
144*/
145.macro GHASH_MUL GH HK TMP1 TMP2 TMP3 TMP4 TMP5
146 movdqa \GH, \TMP1
147 pshufd $78, \GH, \TMP2
148 pshufd $78, \HK, \TMP3
149 pxor \GH, \TMP2 # TMP2 = a1+a0
150 pxor \HK, \TMP3 # TMP3 = b1+b0
151 PCLMULQDQ 0x11, \HK, \TMP1 # TMP1 = a1*b1
152 PCLMULQDQ 0x00, \HK, \GH # GH = a0*b0
153 PCLMULQDQ 0x00, \TMP3, \TMP2 # TMP2 = (a0+a1)*(b1+b0)
154 pxor \GH, \TMP2
155 pxor \TMP1, \TMP2 # TMP2 = a1*b0 + a0*b1 (the middle term)
156 movdqa \TMP2, \TMP3
157 pslldq $8, \TMP3 # left shift TMP3 2 DWs
158 psrldq $8, \TMP2 # right shift TMP2 2 DWs
159 pxor \TMP3, \GH
160 pxor \TMP2, \TMP1 # TMP1:GH holds the result of GH*HK
161
162 # first phase of the reduction
163
164 movdqa \GH, \TMP2
165 movdqa \GH, \TMP3
166 movdqa \GH, \TMP4 # copy GH into TMP2,TMP3 and TMP4
167 # in order to perform
168 # independent shifts
169 pslld $31, \TMP2 # packed left shift <<31
170 pslld $30, \TMP3 # packed left shift <<30
171 pslld $25, \TMP4 # packed left shift <<25
172 pxor \TMP3, \TMP2 # xor the shifted versions
173 pxor \TMP4, \TMP2
174 movdqa \TMP2, \TMP5
175 psrldq $4, \TMP5 # right shift TMP5 1 DW
176 pslldq $12, \TMP2 # left shift TMP2 3 DWs
177 pxor \TMP2, \GH
178
179 # second phase of the reduction
180
181 movdqa \GH,\TMP2 # copy GH into TMP2,TMP3 and TMP4
182 # in order to perform
183 # independent shifts
184 movdqa \GH,\TMP3
185 movdqa \GH,\TMP4
186 psrld $1,\TMP2 # packed right shift >>1
187 psrld $2,\TMP3 # packed right shift >>2
188 psrld $7,\TMP4 # packed right shift >>7
189 pxor \TMP3,\TMP2 # xor the shifted versions
190 pxor \TMP4,\TMP2
191 pxor \TMP5, \TMP2
192 pxor \TMP2, \GH
193 pxor \TMP1, \GH # result is in GH
194.endm
195
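
GHASH_MUL above forms the full 256-bit carry-less product with only three PCLMULQDQ operations instead of four by using Karatsuba: a1*b1, a0*b0 and (a1+a0)*(b1+b0), from which the middle term a1*b0 + a0*b1 is recovered by XOR before the result is folded back modulo x^128 + x^127 + x^126 + x^121 + 1. The following is a self-contained C sketch of just the Karatsuba step, with a naive bit loop standing in for PCLMULQDQ; it is an illustration, not the kernel code:

	#include <stdint.h>
	#include <stdio.h>

	/* Naive 64x64 -> 128-bit carry-less multiply; one PCLMULQDQ does this in hardware. */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *hi, uint64_t *lo)
	{
		uint64_t h = 0, l = 0;
		int i;

		for (i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				if (i)
					h ^= a >> (64 - i);
			}
		}
		*hi = h;
		*lo = l;
	}

	/* 128x128 -> 256-bit carry-less product via Karatsuba (three multiplies).
	 * The product is returned in r[3]:r[2]:r[1]:r[0], most significant first. */
	static void clmul128(uint64_t a1, uint64_t a0, uint64_t b1, uint64_t b0, uint64_t r[4])
	{
		uint64_t hh_hi, hh_lo, ll_hi, ll_lo, mm_hi, mm_lo;

		clmul64(a1, b1, &hh_hi, &hh_lo);		/* a1*b1, as PCLMULQDQ 0x11 */
		clmul64(a0, b0, &ll_hi, &ll_lo);		/* a0*b0, as PCLMULQDQ 0x00 */
		clmul64(a1 ^ a0, b1 ^ b0, &mm_hi, &mm_lo);	/* (a1+a0)*(b1+b0)          */

		mm_hi ^= hh_hi ^ ll_hi;				/* middle term = a1*b0 ^ a0*b1 */
		mm_lo ^= hh_lo ^ ll_lo;

		r[0] = ll_lo;
		r[1] = ll_hi ^ mm_lo;
		r[2] = hh_lo ^ mm_hi;
		r[3] = hh_hi;
	}

	int main(void)
	{
		uint64_t r[4];

		clmul128(0x0123456789abcdefULL, 0xfedcba9876543210ULL, 0x1ULL, 0x2ULL, r);
		printf("%016llx %016llx %016llx %016llx\n",
		       (unsigned long long)r[3], (unsigned long long)r[2],
		       (unsigned long long)r[1], (unsigned long long)r[0]);
		return 0;
	}
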
196/*
197* if a = number of total plaintext bytes
198* b = floor(a/16)
199* num_initial_blocks = b mod 4
200* encrypt the initial num_initial_blocks blocks and apply ghash on
201* the ciphertext
202* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
203* are clobbered
204* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
205*/
206
207
208.macro INITIAL_BLOCKS_DEC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
209XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
210 mov arg7, %r10 # %r10 = AAD
211 mov arg8, %r12 # %r12 = aadLen
212 mov %r12, %r11
213 pxor %xmm\i, %xmm\i
214_get_AAD_loop\num_initial_blocks\operation:
215 movd (%r10), \TMP1
216 pslldq $12, \TMP1
217 psrldq $4, %xmm\i
218 pxor \TMP1, %xmm\i
219 add $4, %r10
220 sub $4, %r12
221 jne _get_AAD_loop\num_initial_blocks\operation
222 cmp $16, %r11
223 je _get_AAD_loop2_done\num_initial_blocks\operation
224 mov $16, %r12
225_get_AAD_loop2\num_initial_blocks\operation:
226 psrldq $4, %xmm\i
227 sub $4, %r12
228 cmp %r11, %r12
229 jne _get_AAD_loop2\num_initial_blocks\operation
230_get_AAD_loop2_done\num_initial_blocks\operation:
231 movdqa SHUF_MASK(%rip), %xmm14
232 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
233
234 xor %r11, %r11 # initialise the data pointer offset as zero
235
236 # start AES for num_initial_blocks blocks
237
238 mov %arg5, %rax # %rax = *Y0
239 movdqu (%rax), \XMM0 # XMM0 = Y0
240 movdqa SHUF_MASK(%rip), %xmm14
241 PSHUFB_XMM %xmm14, \XMM0
242
243.if (\i == 5) || (\i == 6) || (\i == 7)
244.irpc index, \i_seq
245 paddd ONE(%rip), \XMM0 # INCR Y0
246 movdqa \XMM0, %xmm\index
247 movdqa SHUF_MASK(%rip), %xmm14
248 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
249
250.endr
251.irpc index, \i_seq
252 pxor 16*0(%arg1), %xmm\index
253.endr
254.irpc index, \i_seq
255 movaps 0x10(%rdi), \TMP1
256 AESENC \TMP1, %xmm\index # Round 1
257.endr
258.irpc index, \i_seq
259 movaps 0x20(%arg1), \TMP1
260 AESENC \TMP1, %xmm\index # Round 2
261.endr
262.irpc index, \i_seq
263 movaps 0x30(%arg1), \TMP1
264 AESENC \TMP1, %xmm\index # Round 3
265.endr
266.irpc index, \i_seq
267 movaps 0x40(%arg1), \TMP1
268 AESENC \TMP1, %xmm\index # Round 4
269.endr
270.irpc index, \i_seq
271 movaps 0x50(%arg1), \TMP1
272 AESENC \TMP1, %xmm\index # Round 5
273.endr
274.irpc index, \i_seq
275 movaps 0x60(%arg1), \TMP1
276 AESENC \TMP1, %xmm\index # Round 6
277.endr
278.irpc index, \i_seq
279 movaps 0x70(%arg1), \TMP1
280 AESENC \TMP1, %xmm\index # Round 7
281.endr
282.irpc index, \i_seq
283 movaps 0x80(%arg1), \TMP1
284 AESENC \TMP1, %xmm\index # Round 8
285.endr
286.irpc index, \i_seq
287 movaps 0x90(%arg1), \TMP1
288 AESENC \TMP1, %xmm\index # Round 9
289.endr
290.irpc index, \i_seq
291 movaps 0xa0(%arg1), \TMP1
292 AESENCLAST \TMP1, %xmm\index # Round 10
293.endr
294.irpc index, \i_seq
295 movdqu (%arg3 , %r11, 1), \TMP1
296 pxor \TMP1, %xmm\index
297 movdqu %xmm\index, (%arg2 , %r11, 1)
298 # write back plaintext/ciphertext for num_initial_blocks
299 add $16, %r11
300
301 movdqa \TMP1, %xmm\index
302 movdqa SHUF_MASK(%rip), %xmm14
303 PSHUFB_XMM %xmm14, %xmm\index
304
305 # prepare plaintext/ciphertext for GHASH computation
306.endr
307.endif
308 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
309 # apply GHASH on num_initial_blocks blocks
310
311.if \i == 5
312 pxor %xmm5, %xmm6
313 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
314 pxor %xmm6, %xmm7
315 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
316 pxor %xmm7, %xmm8
317 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
318.elseif \i == 6
319 pxor %xmm6, %xmm7
320 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
321 pxor %xmm7, %xmm8
322 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
323.elseif \i == 7
324 pxor %xmm7, %xmm8
325 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
326.endif
327 cmp $64, %r13
328 jl _initial_blocks_done\num_initial_blocks\operation
329 # no need for precomputed values
330/*
331*
332* Precomputations for HashKey parallel with encryption of first 4 blocks.
333* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
334*/
335 paddd ONE(%rip), \XMM0 # INCR Y0
336 movdqa \XMM0, \XMM1
337 movdqa SHUF_MASK(%rip), %xmm14
338 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
339
340 paddd ONE(%rip), \XMM0 # INCR Y0
341 movdqa \XMM0, \XMM2
342 movdqa SHUF_MASK(%rip), %xmm14
343 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
344
345 paddd ONE(%rip), \XMM0 # INCR Y0
346 movdqa \XMM0, \XMM3
347 movdqa SHUF_MASK(%rip), %xmm14
348 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
349
350 paddd ONE(%rip), \XMM0 # INCR Y0
351 movdqa \XMM0, \XMM4
352 movdqa SHUF_MASK(%rip), %xmm14
353 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
354
355 pxor 16*0(%arg1), \XMM1
356 pxor 16*0(%arg1), \XMM2
357 pxor 16*0(%arg1), \XMM3
358 pxor 16*0(%arg1), \XMM4
359 movdqa \TMP3, \TMP5
360 pshufd $78, \TMP3, \TMP1
361 pxor \TMP3, \TMP1
362 movdqa \TMP1, HashKey_k(%rsp)
363 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
364# TMP5 = HashKey^2<<1 (mod poly)
365 movdqa \TMP5, HashKey_2(%rsp)
366# HashKey_2 = HashKey^2<<1 (mod poly)
367 pshufd $78, \TMP5, \TMP1
368 pxor \TMP5, \TMP1
369 movdqa \TMP1, HashKey_2_k(%rsp)
370.irpc index, 1234 # do 4 rounds
371 movaps 0x10*\index(%arg1), \TMP1
372 AESENC \TMP1, \XMM1
373 AESENC \TMP1, \XMM2
374 AESENC \TMP1, \XMM3
375 AESENC \TMP1, \XMM4
376.endr
377 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
378# TMP5 = HashKey^3<<1 (mod poly)
379 movdqa \TMP5, HashKey_3(%rsp)
380 pshufd $78, \TMP5, \TMP1
381 pxor \TMP5, \TMP1
382 movdqa \TMP1, HashKey_3_k(%rsp)
383.irpc index, 56789 # do next 5 rounds
384 movaps 0x10*\index(%arg1), \TMP1
385 AESENC \TMP1, \XMM1
386 AESENC \TMP1, \XMM2
387 AESENC \TMP1, \XMM3
388 AESENC \TMP1, \XMM4
389.endr
390 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
391# TMP5 = HashKey^4<<1 (mod poly)
392 movdqa \TMP5, HashKey_4(%rsp)
393 pshufd $78, \TMP5, \TMP1
394 pxor \TMP5, \TMP1
395 movdqa \TMP1, HashKey_4_k(%rsp)
396 movaps 0xa0(%arg1), \TMP2
397 AESENCLAST \TMP2, \XMM1
398 AESENCLAST \TMP2, \XMM2
399 AESENCLAST \TMP2, \XMM3
400 AESENCLAST \TMP2, \XMM4
401 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
402 pxor \TMP1, \XMM1
403 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
404 movdqa \TMP1, \XMM1
405 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
406 pxor \TMP1, \XMM2
407 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
408 movdqa \TMP1, \XMM2
409 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
410 pxor \TMP1, \XMM3
411 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
412 movdqa \TMP1, \XMM3
413 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
414 pxor \TMP1, \XMM4
415 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
416 movdqa \TMP1, \XMM4
417 add $64, %r11
418 movdqa SHUF_MASK(%rip), %xmm14
419 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
420 pxor \XMMDst, \XMM1
421# combine GHASHed value with the corresponding ciphertext
422 movdqa SHUF_MASK(%rip), %xmm14
423 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
424 movdqa SHUF_MASK(%rip), %xmm14
425 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
426 movdqa SHUF_MASK(%rip), %xmm14
427 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
428
429_initial_blocks_done\num_initial_blocks\operation:
430
431.endm
432
433
434/*
435* if a = number of total plaintext bytes
436* b = floor(a/16)
437* num_initial_blocks = b mod 4
438* encrypt the initial num_initial_blocks blocks and apply ghash on
439* the ciphertext
440* %r10, %r11, %r12, %rax, %xmm5, %xmm6, %xmm7, %xmm8, %xmm9 registers
441* are clobbered
442* arg1, %arg2, %arg3, %r14 are used as pointers only, not modified
443*/
444
445
446.macro INITIAL_BLOCKS_ENC num_initial_blocks TMP1 TMP2 TMP3 TMP4 TMP5 XMM0 XMM1 \
447XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
448 mov arg7, %r10 # %r10 = AAD
449 mov arg8, %r12 # %r12 = aadLen
450 mov %r12, %r11
451 pxor %xmm\i, %xmm\i
452_get_AAD_loop\num_initial_blocks\operation:
453 movd (%r10), \TMP1
454 pslldq $12, \TMP1
455 psrldq $4, %xmm\i
456 pxor \TMP1, %xmm\i
457 add $4, %r10
458 sub $4, %r12
459 jne _get_AAD_loop\num_initial_blocks\operation
460 cmp $16, %r11
461 je _get_AAD_loop2_done\num_initial_blocks\operation
462 mov $16, %r12
463_get_AAD_loop2\num_initial_blocks\operation:
464 psrldq $4, %xmm\i
465 sub $4, %r12
466 cmp %r11, %r12
467 jne _get_AAD_loop2\num_initial_blocks\operation
468_get_AAD_loop2_done\num_initial_blocks\operation:
469 movdqa SHUF_MASK(%rip), %xmm14
470 PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
471
472 xor %r11, %r11 # initialise the data pointer offset as zero
473
474 # start AES for num_initial_blocks blocks
475
476 mov %arg5, %rax # %rax = *Y0
477 movdqu (%rax), \XMM0 # XMM0 = Y0
478 movdqa SHUF_MASK(%rip), %xmm14
479 PSHUFB_XMM %xmm14, \XMM0
480
481.if (\i == 5) || (\i == 6) || (\i == 7)
482.irpc index, \i_seq
483 paddd ONE(%rip), \XMM0 # INCR Y0
484 movdqa \XMM0, %xmm\index
485 movdqa SHUF_MASK(%rip), %xmm14
486 PSHUFB_XMM %xmm14, %xmm\index # perform a 16 byte swap
487
488.endr
489.irpc index, \i_seq
490 pxor 16*0(%arg1), %xmm\index
491.endr
492.irpc index, \i_seq
493 movaps 0x10(%rdi), \TMP1
494 AESENC \TMP1, %xmm\index # Round 1
495.endr
496.irpc index, \i_seq
497 movaps 0x20(%arg1), \TMP1
498 AESENC \TMP1, %xmm\index # Round 2
499.endr
500.irpc index, \i_seq
501 movaps 0x30(%arg1), \TMP1
502 AESENC \TMP1, %xmm\index # Round 3
503.endr
504.irpc index, \i_seq
505 movaps 0x40(%arg1), \TMP1
506 AESENC \TMP1, %xmm\index # Round 4
507.endr
508.irpc index, \i_seq
509 movaps 0x50(%arg1), \TMP1
510 AESENC \TMP1, %xmm\index # Round 5
511.endr
512.irpc index, \i_seq
513 movaps 0x60(%arg1), \TMP1
514 AESENC \TMP1, %xmm\index # Round 6
515.endr
516.irpc index, \i_seq
517 movaps 0x70(%arg1), \TMP1
518 AESENC \TMP1, %xmm\index # Round 7
519.endr
520.irpc index, \i_seq
521 movaps 0x80(%arg1), \TMP1
522 AESENC \TMP1, %xmm\index # Round 8
523.endr
524.irpc index, \i_seq
525 movaps 0x90(%arg1), \TMP1
526 AESENC \TMP1, %xmm\index # Round 9
527.endr
528.irpc index, \i_seq
529 movaps 0xa0(%arg1), \TMP1
530 AESENCLAST \TMP1, %xmm\index # Round 10
531.endr
532.irpc index, \i_seq
533 movdqu (%arg3 , %r11, 1), \TMP1
534 pxor \TMP1, %xmm\index
535 movdqu %xmm\index, (%arg2 , %r11, 1)
536 # write back plaintext/ciphertext for num_initial_blocks
537 add $16, %r11
538
539 movdqa SHUF_MASK(%rip), %xmm14
540 PSHUFB_XMM %xmm14, %xmm\index
541
542 # prepare plaintext/ciphertext for GHASH computation
543.endr
544.endif
545 GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
546 # apply GHASH on num_initial_blocks blocks
547
548.if \i == 5
549 pxor %xmm5, %xmm6
550 GHASH_MUL %xmm6, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
551 pxor %xmm6, %xmm7
552 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
553 pxor %xmm7, %xmm8
554 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
555.elseif \i == 6
556 pxor %xmm6, %xmm7
557 GHASH_MUL %xmm7, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
558 pxor %xmm7, %xmm8
559 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
560.elseif \i == 7
561 pxor %xmm7, %xmm8
562 GHASH_MUL %xmm8, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
563.endif
564 cmp $64, %r13
565 jl _initial_blocks_done\num_initial_blocks\operation
566 # no need for precomputed values
567/*
568*
569* Precomputations for HashKey parallel with encryption of first 4 blocks.
570* HashKey_i_k holds XORed values of the low and high parts of the HashKey_i
571*/
572 paddd ONE(%rip), \XMM0 # INCR Y0
573 movdqa \XMM0, \XMM1
574 movdqa SHUF_MASK(%rip), %xmm14
575 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
576
577 paddd ONE(%rip), \XMM0 # INCR Y0
578 movdqa \XMM0, \XMM2
579 movdqa SHUF_MASK(%rip), %xmm14
580 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
581
582 paddd ONE(%rip), \XMM0 # INCR Y0
583 movdqa \XMM0, \XMM3
584 movdqa SHUF_MASK(%rip), %xmm14
585 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
586
587 paddd ONE(%rip), \XMM0 # INCR Y0
588 movdqa \XMM0, \XMM4
589 movdqa SHUF_MASK(%rip), %xmm14
590 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
591
592 pxor 16*0(%arg1), \XMM1
593 pxor 16*0(%arg1), \XMM2
594 pxor 16*0(%arg1), \XMM3
595 pxor 16*0(%arg1), \XMM4
596 movdqa \TMP3, \TMP5
597 pshufd $78, \TMP3, \TMP1
598 pxor \TMP3, \TMP1
599 movdqa \TMP1, HashKey_k(%rsp)
600 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
601# TMP5 = HashKey^2<<1 (mod poly)
602 movdqa \TMP5, HashKey_2(%rsp)
603# HashKey_2 = HashKey^2<<1 (mod poly)
604 pshufd $78, \TMP5, \TMP1
605 pxor \TMP5, \TMP1
606 movdqa \TMP1, HashKey_2_k(%rsp)
607.irpc index, 1234 # do 4 rounds
608 movaps 0x10*\index(%arg1), \TMP1
609 AESENC \TMP1, \XMM1
610 AESENC \TMP1, \XMM2
611 AESENC \TMP1, \XMM3
612 AESENC \TMP1, \XMM4
613.endr
614 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
615# TMP5 = HashKey^3<<1 (mod poly)
616 movdqa \TMP5, HashKey_3(%rsp)
617 pshufd $78, \TMP5, \TMP1
618 pxor \TMP5, \TMP1
619 movdqa \TMP1, HashKey_3_k(%rsp)
620.irpc index, 56789 # do next 5 rounds
621 movaps 0x10*\index(%arg1), \TMP1
622 AESENC \TMP1, \XMM1
623 AESENC \TMP1, \XMM2
624 AESENC \TMP1, \XMM3
625 AESENC \TMP1, \XMM4
626.endr
627 GHASH_MUL \TMP5, \TMP3, \TMP1, \TMP2, \TMP4, \TMP6, \TMP7
628# TMP5 = HashKey^4<<1 (mod poly)
629 movdqa \TMP5, HashKey_4(%rsp)
630 pshufd $78, \TMP5, \TMP1
631 pxor \TMP5, \TMP1
632 movdqa \TMP1, HashKey_4_k(%rsp)
633 movaps 0xa0(%arg1), \TMP2
634 AESENCLAST \TMP2, \XMM1
635 AESENCLAST \TMP2, \XMM2
636 AESENCLAST \TMP2, \XMM3
637 AESENCLAST \TMP2, \XMM4
638 movdqu 16*0(%arg3 , %r11 , 1), \TMP1
639 pxor \TMP1, \XMM1
640 movdqu 16*1(%arg3 , %r11 , 1), \TMP1
641 pxor \TMP1, \XMM2
642 movdqu 16*2(%arg3 , %r11 , 1), \TMP1
643 pxor \TMP1, \XMM3
644 movdqu 16*3(%arg3 , %r11 , 1), \TMP1
645 pxor \TMP1, \XMM4
646 movdqu \XMM1, 16*0(%arg2 , %r11 , 1)
647 movdqu \XMM2, 16*1(%arg2 , %r11 , 1)
648 movdqu \XMM3, 16*2(%arg2 , %r11 , 1)
649 movdqu \XMM4, 16*3(%arg2 , %r11 , 1)
650
651 add $64, %r11
652 movdqa SHUF_MASK(%rip), %xmm14
653 PSHUFB_XMM %xmm14, \XMM1 # perform a 16 byte swap
654 pxor \XMMDst, \XMM1
655# combine GHASHed value with the corresponding ciphertext
656 movdqa SHUF_MASK(%rip), %xmm14
657 PSHUFB_XMM %xmm14, \XMM2 # perform a 16 byte swap
658 movdqa SHUF_MASK(%rip), %xmm14
659 PSHUFB_XMM %xmm14, \XMM3 # perform a 16 byte swap
660 movdqa SHUF_MASK(%rip), %xmm14
661 PSHUFB_XMM %xmm14, \XMM4 # perform a 16 byte swap
662
663_initial_blocks_done\num_initial_blocks\operation:
664
665.endm
666
667/*
668* encrypt 4 blocks at a time
669* ghash the 4 previously encrypted ciphertext blocks
670* arg1, %arg2, %arg3 are used as pointers only, not modified
671* %r11 is the data offset value
672*/
673.macro GHASH_4_ENCRYPT_4_PARALLEL_ENC TMP1 TMP2 TMP3 TMP4 TMP5 \
674TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
675
676 movdqa \XMM1, \XMM5
677 movdqa \XMM2, \XMM6
678 movdqa \XMM3, \XMM7
679 movdqa \XMM4, \XMM8
680
681 movdqa SHUF_MASK(%rip), %xmm15
682 # multiply TMP5 * HashKey using karatsuba
683
684 movdqa \XMM5, \TMP4
685 pshufd $78, \XMM5, \TMP6
686 pxor \XMM5, \TMP6
687 paddd ONE(%rip), \XMM0 # INCR CNT
688 movdqa HashKey_4(%rsp), \TMP5
689 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
690 movdqa \XMM0, \XMM1
691 paddd ONE(%rip), \XMM0 # INCR CNT
692 movdqa \XMM0, \XMM2
693 paddd ONE(%rip), \XMM0 # INCR CNT
694 movdqa \XMM0, \XMM3
695 paddd ONE(%rip), \XMM0 # INCR CNT
696 movdqa \XMM0, \XMM4
697 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
698 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
699 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
700 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
701 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
702
703 pxor (%arg1), \XMM1
704 pxor (%arg1), \XMM2
705 pxor (%arg1), \XMM3
706 pxor (%arg1), \XMM4
707 movdqa HashKey_4_k(%rsp), \TMP5
708 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
709 movaps 0x10(%arg1), \TMP1
710 AESENC \TMP1, \XMM1 # Round 1
711 AESENC \TMP1, \XMM2
712 AESENC \TMP1, \XMM3
713 AESENC \TMP1, \XMM4
714 movaps 0x20(%arg1), \TMP1
715 AESENC \TMP1, \XMM1 # Round 2
716 AESENC \TMP1, \XMM2
717 AESENC \TMP1, \XMM3
718 AESENC \TMP1, \XMM4
719 movdqa \XMM6, \TMP1
720 pshufd $78, \XMM6, \TMP2
721 pxor \XMM6, \TMP2
722 movdqa HashKey_3(%rsp), \TMP5
723 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
724 movaps 0x30(%arg1), \TMP3
725 AESENC \TMP3, \XMM1 # Round 3
726 AESENC \TMP3, \XMM2
727 AESENC \TMP3, \XMM3
728 AESENC \TMP3, \XMM4
729 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
730 movaps 0x40(%arg1), \TMP3
731 AESENC \TMP3, \XMM1 # Round 4
732 AESENC \TMP3, \XMM2
733 AESENC \TMP3, \XMM3
734 AESENC \TMP3, \XMM4
735 movdqa HashKey_3_k(%rsp), \TMP5
736 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
737 movaps 0x50(%arg1), \TMP3
738 AESENC \TMP3, \XMM1 # Round 5
739 AESENC \TMP3, \XMM2
740 AESENC \TMP3, \XMM3
741 AESENC \TMP3, \XMM4
742 pxor \TMP1, \TMP4
743# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
744 pxor \XMM6, \XMM5
745 pxor \TMP2, \TMP6
746 movdqa \XMM7, \TMP1
747 pshufd $78, \XMM7, \TMP2
748 pxor \XMM7, \TMP2
749 movdqa HashKey_2(%rsp ), \TMP5
750
751 # Multiply TMP5 * HashKey using karatsuba
752
753 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
754 movaps 0x60(%arg1), \TMP3
755 AESENC \TMP3, \XMM1 # Round 6
756 AESENC \TMP3, \XMM2
757 AESENC \TMP3, \XMM3
758 AESENC \TMP3, \XMM4
759 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
760 movaps 0x70(%arg1), \TMP3
761 AESENC \TMP3, \XMM1 # Round 7
762 AESENC \TMP3, \XMM2
763 AESENC \TMP3, \XMM3
764 AESENC \TMP3, \XMM4
765 movdqa HashKey_2_k(%rsp), \TMP5
766 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
767 movaps 0x80(%arg1), \TMP3
768 AESENC \TMP3, \XMM1 # Round 8
769 AESENC \TMP3, \XMM2
770 AESENC \TMP3, \XMM3
771 AESENC \TMP3, \XMM4
772 pxor \TMP1, \TMP4
773# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
774 pxor \XMM7, \XMM5
775 pxor \TMP2, \TMP6
776
777 # Multiply XMM8 * HashKey
778 # XMM8 and TMP5 hold the values for the two operands
779
780 movdqa \XMM8, \TMP1
781 pshufd $78, \XMM8, \TMP2
782 pxor \XMM8, \TMP2
783 movdqa HashKey(%rsp), \TMP5
784 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
785 movaps 0x90(%arg1), \TMP3
786 AESENC \TMP3, \XMM1 # Round 9
787 AESENC \TMP3, \XMM2
788 AESENC \TMP3, \XMM3
789 AESENC \TMP3, \XMM4
790 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
791 movaps 0xa0(%arg1), \TMP3
792 AESENCLAST \TMP3, \XMM1 # Round 10
793 AESENCLAST \TMP3, \XMM2
794 AESENCLAST \TMP3, \XMM3
795 AESENCLAST \TMP3, \XMM4
796 movdqa HashKey_k(%rsp), \TMP5
797 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
798 movdqu (%arg3,%r11,1), \TMP3
799 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
800 movdqu 16(%arg3,%r11,1), \TMP3
801 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
802 movdqu 32(%arg3,%r11,1), \TMP3
803 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
804 movdqu 48(%arg3,%r11,1), \TMP3
805 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
806 movdqu \XMM1, (%arg2,%r11,1) # Write to the ciphertext buffer
807 movdqu \XMM2, 16(%arg2,%r11,1) # Write to the ciphertext buffer
808 movdqu \XMM3, 32(%arg2,%r11,1) # Write to the ciphertext buffer
809 movdqu \XMM4, 48(%arg2,%r11,1) # Write to the ciphertext buffer
810 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
811 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
812 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
813 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
814
815 pxor \TMP4, \TMP1
816 pxor \XMM8, \XMM5
817 pxor \TMP6, \TMP2
818 pxor \TMP1, \TMP2
819 pxor \XMM5, \TMP2
820 movdqa \TMP2, \TMP3
821 pslldq $8, \TMP3 # left shift TMP3 2 DWs
822 psrldq $8, \TMP2 # right shift TMP2 2 DWs
823 pxor \TMP3, \XMM5
824 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
825
826 # first phase of reduction
827
828 movdqa \XMM5, \TMP2
829 movdqa \XMM5, \TMP3
830 movdqa \XMM5, \TMP4
831# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
832 pslld $31, \TMP2 # packed left shift << 31
833 pslld $30, \TMP3 # packed left shift << 30
834 pslld $25, \TMP4 # packed left shift << 25
835 pxor \TMP3, \TMP2 # xor the shifted versions
836 pxor \TMP4, \TMP2
837 movdqa \TMP2, \TMP5
838 psrldq $4, \TMP5 # right shift T5 1 DW
839 pslldq $12, \TMP2 # left shift T2 3 DWs
840 pxor \TMP2, \XMM5
841
842 # second phase of reduction
843
844 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
845 movdqa \XMM5,\TMP3
846 movdqa \XMM5,\TMP4
847 psrld $1, \TMP2 # packed right shift >>1
848 psrld $2, \TMP3 # packed right shift >>2
849 psrld $7, \TMP4 # packed right shift >>7
850 pxor \TMP3,\TMP2 # xor the shifted versions
851 pxor \TMP4,\TMP2
852 pxor \TMP5, \TMP2
853 pxor \TMP2, \XMM5
854 pxor \TMP1, \XMM5 # result is in XMM5
855
856 pxor \XMM5, \XMM1
857.endm
858
859/*
860* decrypt 4 blocks at a time
861* ghash the 4 previously decrypted ciphertext blocks
862* arg1, %arg2, %arg3 are used as pointers only, not modified
863* %r11 is the data offset value
864*/
865.macro GHASH_4_ENCRYPT_4_PARALLEL_DEC TMP1 TMP2 TMP3 TMP4 TMP5 \
866TMP6 XMM0 XMM1 XMM2 XMM3 XMM4 XMM5 XMM6 XMM7 XMM8 operation
867
868 movdqa \XMM1, \XMM5
869 movdqa \XMM2, \XMM6
870 movdqa \XMM3, \XMM7
871 movdqa \XMM4, \XMM8
872
873 movdqa SHUF_MASK(%rip), %xmm15
874 # multiply TMP5 * HashKey using karatsuba
875
876 movdqa \XMM5, \TMP4
877 pshufd $78, \XMM5, \TMP6
878 pxor \XMM5, \TMP6
879 paddd ONE(%rip), \XMM0 # INCR CNT
880 movdqa HashKey_4(%rsp), \TMP5
881 PCLMULQDQ 0x11, \TMP5, \TMP4 # TMP4 = a1*b1
882 movdqa \XMM0, \XMM1
883 paddd ONE(%rip), \XMM0 # INCR CNT
884 movdqa \XMM0, \XMM2
885 paddd ONE(%rip), \XMM0 # INCR CNT
886 movdqa \XMM0, \XMM3
887 paddd ONE(%rip), \XMM0 # INCR CNT
888 movdqa \XMM0, \XMM4
889 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
890 PCLMULQDQ 0x00, \TMP5, \XMM5 # XMM5 = a0*b0
891 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
892 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
893 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
894
895 pxor (%arg1), \XMM1
896 pxor (%arg1), \XMM2
897 pxor (%arg1), \XMM3
898 pxor (%arg1), \XMM4
899 movdqa HashKey_4_k(%rsp), \TMP5
900 PCLMULQDQ 0x00, \TMP5, \TMP6 # TMP6 = (a1+a0)*(b1+b0)
901 movaps 0x10(%arg1), \TMP1
902 AESENC \TMP1, \XMM1 # Round 1
903 AESENC \TMP1, \XMM2
904 AESENC \TMP1, \XMM3
905 AESENC \TMP1, \XMM4
906 movaps 0x20(%arg1), \TMP1
907 AESENC \TMP1, \XMM1 # Round 2
908 AESENC \TMP1, \XMM2
909 AESENC \TMP1, \XMM3
910 AESENC \TMP1, \XMM4
911 movdqa \XMM6, \TMP1
912 pshufd $78, \XMM6, \TMP2
913 pxor \XMM6, \TMP2
914 movdqa HashKey_3(%rsp), \TMP5
915 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1 * b1
916 movaps 0x30(%arg1), \TMP3
917 AESENC \TMP3, \XMM1 # Round 3
918 AESENC \TMP3, \XMM2
919 AESENC \TMP3, \XMM3
920 AESENC \TMP3, \XMM4
921 PCLMULQDQ 0x00, \TMP5, \XMM6 # XMM6 = a0*b0
922 movaps 0x40(%arg1), \TMP3
923 AESENC \TMP3, \XMM1 # Round 4
924 AESENC \TMP3, \XMM2
925 AESENC \TMP3, \XMM3
926 AESENC \TMP3, \XMM4
927 movdqa HashKey_3_k(%rsp), \TMP5
928 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
929 movaps 0x50(%arg1), \TMP3
930 AESENC \TMP3, \XMM1 # Round 5
931 AESENC \TMP3, \XMM2
932 AESENC \TMP3, \XMM3
933 AESENC \TMP3, \XMM4
934 pxor \TMP1, \TMP4
935# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
936 pxor \XMM6, \XMM5
937 pxor \TMP2, \TMP6
938 movdqa \XMM7, \TMP1
939 pshufd $78, \XMM7, \TMP2
940 pxor \XMM7, \TMP2
941 movdqa HashKey_2(%rsp ), \TMP5
942
943 # Multiply TMP5 * HashKey using karatsuba
944
945 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
946 movaps 0x60(%arg1), \TMP3
947 AESENC \TMP3, \XMM1 # Round 6
948 AESENC \TMP3, \XMM2
949 AESENC \TMP3, \XMM3
950 AESENC \TMP3, \XMM4
951 PCLMULQDQ 0x00, \TMP5, \XMM7 # XMM7 = a0*b0
952 movaps 0x70(%arg1), \TMP3
953 AESENC \TMP3, \XMM1 # Round 7
954 AESENC \TMP3, \XMM2
955 AESENC \TMP3, \XMM3
956 AESENC \TMP3, \XMM4
957 movdqa HashKey_2_k(%rsp), \TMP5
958 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
959 movaps 0x80(%arg1), \TMP3
960 AESENC \TMP3, \XMM1 # Round 8
961 AESENC \TMP3, \XMM2
962 AESENC \TMP3, \XMM3
963 AESENC \TMP3, \XMM4
964 pxor \TMP1, \TMP4
965# accumulate the results in TMP4:XMM5, TMP6 holds the middle part
966 pxor \XMM7, \XMM5
967 pxor \TMP2, \TMP6
968
969 # Multiply XMM8 * HashKey
970 # XMM8 and TMP5 hold the values for the two operands
971
972 movdqa \XMM8, \TMP1
973 pshufd $78, \XMM8, \TMP2
974 pxor \XMM8, \TMP2
975 movdqa HashKey(%rsp), \TMP5
976 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
977 movaps 0x90(%arg1), \TMP3
978 AESENC \TMP3, \XMM1 # Round 9
979 AESENC \TMP3, \XMM2
980 AESENC \TMP3, \XMM3
981 AESENC \TMP3, \XMM4
982 PCLMULQDQ 0x00, \TMP5, \XMM8 # XMM8 = a0*b0
983 movaps 0xa0(%arg1), \TMP3
984 AESENCLAST \TMP3, \XMM1 # Round 10
985 AESENCLAST \TMP3, \XMM2
986 AESENCLAST \TMP3, \XMM3
987 AESENCLAST \TMP3, \XMM4
988 movdqa HashKey_k(%rsp), \TMP5
989 PCLMULQDQ 0x00, \TMP5, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
990 movdqu (%arg3,%r11,1), \TMP3
991 pxor \TMP3, \XMM1 # Ciphertext/Plaintext XOR EK
992 movdqu \XMM1, (%arg2,%r11,1) # Write to plaintext buffer
993 movdqa \TMP3, \XMM1
994 movdqu 16(%arg3,%r11,1), \TMP3
995 pxor \TMP3, \XMM2 # Ciphertext/Plaintext XOR EK
996 movdqu \XMM2, 16(%arg2,%r11,1) # Write to plaintext buffer
997 movdqa \TMP3, \XMM2
998 movdqu 32(%arg3,%r11,1), \TMP3
999 pxor \TMP3, \XMM3 # Ciphertext/Plaintext XOR EK
1000 movdqu \XMM3, 32(%arg2,%r11,1) # Write to plaintext buffer
1001 movdqa \TMP3, \XMM3
1002 movdqu 48(%arg3,%r11,1), \TMP3
1003 pxor \TMP3, \XMM4 # Ciphertext/Plaintext XOR EK
1004 movdqu \XMM4, 48(%arg2,%r11,1) # Write to plaintext buffer
1005 movdqa \TMP3, \XMM4
1006 PSHUFB_XMM %xmm15, \XMM1 # perform a 16 byte swap
1007 PSHUFB_XMM %xmm15, \XMM2 # perform a 16 byte swap
1008 PSHUFB_XMM %xmm15, \XMM3 # perform a 16 byte swap
1009 PSHUFB_XMM %xmm15, \XMM4 # perform a 16 byte swap
1010
1011 pxor \TMP4, \TMP1
1012 pxor \XMM8, \XMM5
1013 pxor \TMP6, \TMP2
1014 pxor \TMP1, \TMP2
1015 pxor \XMM5, \TMP2
1016 movdqa \TMP2, \TMP3
1017 pslldq $8, \TMP3 # left shift TMP3 2 DWs
1018 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1019 pxor \TMP3, \XMM5
1020 pxor \TMP2, \TMP1 # accumulate the results in TMP1:XMM5
1021
1022 # first phase of reduction
1023
1024 movdqa \XMM5, \TMP2
1025 movdqa \XMM5, \TMP3
1026 movdqa \XMM5, \TMP4
1027# move XMM5 into TMP2, TMP3, TMP4 in order to perform shifts independently
1028 pslld $31, \TMP2 # packed left shift << 31
1029 pslld $30, \TMP3 # packed left shift << 30
1030 pslld $25, \TMP4 # packed left shift << 25
1031 pxor \TMP3, \TMP2 # xor the shifted versions
1032 pxor \TMP4, \TMP2
1033 movdqa \TMP2, \TMP5
1034 psrldq $4, \TMP5 # right shift T5 1 DW
1035 pslldq $12, \TMP2 # left shift T2 3 DWs
1036 pxor \TMP2, \XMM5
1037
1038 # second phase of reduction
1039
1040 movdqa \XMM5,\TMP2 # make 3 copies of XMM5 into TMP2, TMP3, TMP4
1041 movdqa \XMM5,\TMP3
1042 movdqa \XMM5,\TMP4
1043 psrld $1, \TMP2 # packed right shift >>1
1044 psrld $2, \TMP3 # packed right shift >>2
1045 psrld $7, \TMP4 # packed right shift >>7
1046 pxor \TMP3,\TMP2 # xor the shifted versions
1047 pxor \TMP4,\TMP2
1048 pxor \TMP5, \TMP2
1049 pxor \TMP2, \XMM5
1050 pxor \TMP1, \XMM5 # result is in XMM5
1051
1052 pxor \XMM5, \XMM1
1053.endm
1054
1055/* GHASH the last 4 ciphertext blocks. */
1056.macro GHASH_LAST_4 TMP1 TMP2 TMP3 TMP4 TMP5 TMP6 \
1057TMP7 XMM1 XMM2 XMM3 XMM4 XMMDst
1058
1059 # Multiply TMP6 * HashKey (using Karatsuba)
1060
1061 movdqa \XMM1, \TMP6
1062 pshufd $78, \XMM1, \TMP2
1063 pxor \XMM1, \TMP2
1064 movdqa HashKey_4(%rsp), \TMP5
1065 PCLMULQDQ 0x11, \TMP5, \TMP6 # TMP6 = a1*b1
1066 PCLMULQDQ 0x00, \TMP5, \XMM1 # XMM1 = a0*b0
1067 movdqa HashKey_4_k(%rsp), \TMP4
1068 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1069 movdqa \XMM1, \XMMDst
1070 movdqa \TMP2, \XMM1 # result in TMP6, XMMDst, XMM1
1071
1072 # Multiply TMP1 * HashKey (using Karatsuba)
1073
1074 movdqa \XMM2, \TMP1
1075 pshufd $78, \XMM2, \TMP2
1076 pxor \XMM2, \TMP2
1077 movdqa HashKey_3(%rsp), \TMP5
1078 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1079 PCLMULQDQ 0x00, \TMP5, \XMM2 # XMM2 = a0*b0
1080 movdqa HashKey_3_k(%rsp), \TMP4
1081 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1082 pxor \TMP1, \TMP6
1083 pxor \XMM2, \XMMDst
1084 pxor \TMP2, \XMM1
1085# results accumulated in TMP6, XMMDst, XMM1
1086
1087 # Multiply TMP1 * HashKey (using Karatsuba)
1088
1089 movdqa \XMM3, \TMP1
1090 pshufd $78, \XMM3, \TMP2
1091 pxor \XMM3, \TMP2
1092 movdqa HashKey_2(%rsp), \TMP5
1093 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1094 PCLMULQDQ 0x00, \TMP5, \XMM3 # XMM3 = a0*b0
1095 movdqa HashKey_2_k(%rsp), \TMP4
1096 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1097 pxor \TMP1, \TMP6
1098 pxor \XMM3, \XMMDst
1099 pxor \TMP2, \XMM1 # results accumulated in TMP6, XMMDst, XMM1
1100
1101 # Multiply TMP1 * HashKey (using Karatsuba)
1102 movdqa \XMM4, \TMP1
1103 pshufd $78, \XMM4, \TMP2
1104 pxor \XMM4, \TMP2
1105 movdqa HashKey(%rsp), \TMP5
1106 PCLMULQDQ 0x11, \TMP5, \TMP1 # TMP1 = a1*b1
1107 PCLMULQDQ 0x00, \TMP5, \XMM4 # XMM4 = a0*b0
1108 movdqa HashKey_k(%rsp), \TMP4
1109 PCLMULQDQ 0x00, \TMP4, \TMP2 # TMP2 = (a1+a0)*(b1+b0)
1110 pxor \TMP1, \TMP6
1111 pxor \XMM4, \XMMDst
1112 pxor \XMM1, \TMP2
1113 pxor \TMP6, \TMP2
1114 pxor \XMMDst, \TMP2
1115 # middle section of the temp results combined as in karatsuba algorithm
1116 movdqa \TMP2, \TMP4
1117 pslldq $8, \TMP4 # left shift TMP4 2 DWs
1118 psrldq $8, \TMP2 # right shift TMP2 2 DWs
1119 pxor \TMP4, \XMMDst
1120 pxor \TMP2, \TMP6
1121# TMP6:XMMDst holds the result of the accumulated carry-less multiplications
1122 # first phase of the reduction
1123 movdqa \XMMDst, \TMP2
1124 movdqa \XMMDst, \TMP3
1125 movdqa \XMMDst, \TMP4
1126# move XMMDst into TMP2, TMP3, TMP4 in order to perform 3 shifts independently
1127 pslld $31, \TMP2 # packed left shifting << 31
1128 pslld $30, \TMP3 # packed left shifting << 30
1129 pslld $25, \TMP4 # packed left shifting << 25
1130 pxor \TMP3, \TMP2 # xor the shifted versions
1131 pxor \TMP4, \TMP2
1132 movdqa \TMP2, \TMP7
1133 psrldq $4, \TMP7 # right shift TMP7 1 DW
1134 pslldq $12, \TMP2 # left shift TMP2 3 DWs
1135 pxor \TMP2, \XMMDst
1136
1137 # second phase of the reduction
1138 movdqa \XMMDst, \TMP2
1139 # make 3 copies of XMMDst for doing 3 shift operations
1140 movdqa \XMMDst, \TMP3
1141 movdqa \XMMDst, \TMP4
1142 psrld $1, \TMP2 # packed right shift >> 1
1143 psrld $2, \TMP3 # packed right shift >> 2
1144 psrld $7, \TMP4 # packed right shift >> 7
1145 pxor \TMP3, \TMP2 # xor the shifted versions
1146 pxor \TMP4, \TMP2
1147 pxor \TMP7, \TMP2
1148 pxor \TMP2, \XMMDst
1149 pxor \TMP6, \XMMDst # reduced result is in XMMDst
1150.endm
1151
1152/* Encryption of a single block done*/
1153.macro ENCRYPT_SINGLE_BLOCK XMM0 TMP1
1154
1155 pxor (%arg1), \XMM0
1156 movaps 16(%arg1), \TMP1
1157 AESENC \TMP1, \XMM0
1158 movaps 32(%arg1), \TMP1
1159 AESENC \TMP1, \XMM0
1160 movaps 48(%arg1), \TMP1
1161 AESENC \TMP1, \XMM0
1162 movaps 64(%arg1), \TMP1
1163 AESENC \TMP1, \XMM0
1164 movaps 80(%arg1), \TMP1
1165 AESENC \TMP1, \XMM0
1166 movaps 96(%arg1), \TMP1
1167 AESENC \TMP1, \XMM0
1168 movaps 112(%arg1), \TMP1
1169 AESENC \TMP1, \XMM0
1170 movaps 128(%arg1), \TMP1
1171 AESENC \TMP1, \XMM0
1172 movaps 144(%arg1), \TMP1
1173 AESENC \TMP1, \XMM0
1174 movaps 160(%arg1), \TMP1
1175 AESENCLAST \TMP1, \XMM0
1176.endm
1177
1178
1179/*****************************************************************************
1180* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1181* u8 *out, // Plaintext output. Encrypt in-place is allowed.
1182* const u8 *in, // Ciphertext input
1183* u64 plaintext_len, // Length of data in bytes for decryption.
1184* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1185* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1186* // concatenated with 0x00000001. 16-byte aligned pointer.
1187* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1188* const u8 *aad, // Additional Authentication Data (AAD)
1189* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1190* u8 *auth_tag, // Authenticated Tag output. The driver will compare this to the
1191* // given authentication tag and only return the plaintext if they match.
1192* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16
1193* // (most likely), 12 or 8.
1194*
1195* Assumptions:
1196*
1197* keys:
1198* keys are pre-expanded and aligned to 16 bytes. we are using the first
1199* set of 11 keys in the data structure void *aes_ctx
1200*
1201* iv:
1202* 0 1 2 3
1203* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1204* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1205* | Salt (From the SA) |
1206* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1207* | Initialization Vector |
1208* | (This is the sequence number from IPSec header) |
1209* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1210* | 0x1 |
1211* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1212*
1213*
1214*
1215* AAD:
1216* AAD padded to 128 bits with 0
1217* for example, assume AAD is a u32 vector
1218*
1219* if AAD is 8 bytes:
1220* AAD[3] = {A0, A1};
1221* padded AAD in xmm register = {A1 A0 0 0}
1222*
1223* 0 1 2 3
1224* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1225* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1226* | SPI (A1) |
1227* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1228* | 32-bit Sequence Number (A0) |
1229* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1230* | 0x0 |
1231* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1232*
1233* AAD Format with 32-bit Sequence Number
1234*
1235* if AAD is 12 bytes:
1236* AAD[3] = {A0, A1, A2};
1237* padded AAD in xmm register = {A2 A1 A0 0}
1238*
1239* 0 1 2 3
1240* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1241* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1242* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1243* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1244* | SPI (A2) |
1245* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1246* | 64-bit Extended Sequence Number {A1,A0} |
1247* | |
1248* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1249* | 0x0 |
1250* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1251*
1252* AAD Format with 64-bit Extended Sequence Number
1253*
1254* aadLen:
1255* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1256* The code also supports 16; any other size will fail.
1257*
1258* TLen:
1259* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1260* For other sizes, the code will fail.
1261*
1262* poly = x^128 + x^127 + x^126 + x^121 + 1
1263*
1264*****************************************************************************/
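For orientation before the code: a caller has to assemble the pre-counter block itself and, for decryption, compare the computed tag itself. The following is a minimal C sketch of that flow, mirroring what the glue code further down does; all buffer names (salt, esp_iv, received_tag, ...) are illustrative placeholders, and in the real driver the call runs between kernel_fpu_begin() and kernel_fpu_end().

/*
 * Caller-side sketch for aesni_gcm_dec(): build j0 = salt || ESP IV ||
 * 0x00000001, decrypt, then verify the tag.  Only aesni_gcm_dec() is
 * taken from this file; everything else is a placeholder.
 */
static int gcm_dec_sketch(void *aes_ctx, u8 *hash_subkey,
			  const u8 salt[4], const u8 esp_iv[8],
			  const u8 *aad, u64 aad_len,
			  const u8 *ciphertext, u64 len,
			  const u8 received_tag[16], u8 *plaintext)
{
	u8 iv[16] __aligned(16);
	u8 computed_tag[16];

	memcpy(iv, salt, 4);			/* 4-byte salt from the SA        */
	memcpy(iv + 4, esp_iv, 8);		/* 8-byte IV from the ESP payload */
	*(__be32 *)(iv + 12) = cpu_to_be32(1);	/* trailing 0x00000001            */

	aesni_gcm_dec(aes_ctx, plaintext, ciphertext, len, iv,
		      hash_subkey, aad, aad_len, computed_tag, 16);

	/* the routine only emits the computed tag; the caller compares */
	return memcmp(computed_tag, received_tag, 16) ? -EBADMSG : 0;
}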
1265
1266ENTRY(aesni_gcm_dec)
1267 push %r12
1268 push %r13
1269 push %r14
1270 mov %rsp, %r14
1271/*
1272* states of %xmm registers %xmm6:%xmm15 not saved
1273* all %xmm registers are clobbered
1274*/
1275 sub $VARIABLE_OFFSET, %rsp
1276 and $~63, %rsp # align rsp to 64 bytes
1277 mov %arg6, %r12
1278 movdqu (%r12), %xmm13 # %xmm13 = HashKey
1279 movdqa SHUF_MASK(%rip), %xmm2
1280 PSHUFB_XMM %xmm2, %xmm13
1281
1282
1283# Precompute HashKey<<1 (mod poly) from the hash key (required for GHASH)
1284
1285 movdqa %xmm13, %xmm2
1286 psllq $1, %xmm13
1287 psrlq $63, %xmm2
1288 movdqa %xmm2, %xmm1
1289 pslldq $8, %xmm2
1290 psrldq $8, %xmm1
1291 por %xmm2, %xmm13
1292
1293 # Reduction
1294
1295 pshufd $0x24, %xmm1, %xmm2
1296 pcmpeqd TWOONE(%rip), %xmm2
1297 pand POLY(%rip), %xmm2
1298 pxor %xmm2, %xmm13 # %xmm13 holds the HashKey<<1 (mod poly)
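The shift/compare/AND/XOR sequence above is carry-less doubling in GF(2^128): H is shifted left by one bit and, if a bit falls off the top, the result is reduced with the polynomial listed in the header comment. A scalar sketch of the same step, assuming the element is kept as two 64-bit halves and ignoring the byte-order shuffling the SSE code has to do:

/*
 * Double a GF(2^128) element modulo x^128 + x^127 + x^126 + x^121 + 1.
 * Conceptual equivalent of the psllq/psrlq/pcmpeqd/pand/pxor block
 * above; the real code keeps the value in %xmm13.
 */
static void gf128_mul_by_x(u64 *hi, u64 *lo)
{
	u64 carry = *hi >> 63;			/* coefficient carried out at x^128 */

	*hi = (*hi << 1) | (*lo >> 63);
	*lo <<= 1;
	if (carry) {
		*hi ^= 0xC200000000000000ULL;	/* x^127 + x^126 + x^121 */
		*lo ^= 1;			/* + 1                   */
	}
}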
1299
1300
1301 # Decrypt first few blocks
1302
1303 movdqa %xmm13, HashKey(%rsp) # store HashKey<<1 (mod poly)
1304 mov %arg4, %r13 # save the number of bytes of plaintext/ciphertext
1305 and $-16, %r13 # %r13 = %r13 - (%r13 mod 16)
1306 mov %r13, %r12
1307 and $(3<<4), %r12
1308 jz _initial_num_blocks_is_0_decrypt
1309 cmp $(2<<4), %r12
1310 jb _initial_num_blocks_is_1_decrypt
1311 je _initial_num_blocks_is_2_decrypt
1312_initial_num_blocks_is_3_decrypt:
1313 INITIAL_BLOCKS_DEC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1314%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, dec
1315 sub $48, %r13
1316 jmp _initial_blocks_decrypted
1317_initial_num_blocks_is_2_decrypt:
1318 INITIAL_BLOCKS_DEC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1319%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, dec
1320 sub $32, %r13
1321 jmp _initial_blocks_decrypted
1322_initial_num_blocks_is_1_decrypt:
1323 INITIAL_BLOCKS_DEC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1324%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, dec
1325 sub $16, %r13
1326 jmp _initial_blocks_decrypted
1327_initial_num_blocks_is_0_decrypt:
1328 INITIAL_BLOCKS_DEC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1329%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, dec
1330_initial_blocks_decrypted:
1331 cmp $0, %r13
1332 je _zero_cipher_left_decrypt
1333 sub $64, %r13
1334 je _four_cipher_left_decrypt
1335_decrypt_by_4:
1336 GHASH_4_ENCRYPT_4_PARALLEL_DEC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1337%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, dec
1338 add $64, %r11
1339 sub $64, %r13
1340 jne _decrypt_by_4
1341_four_cipher_left_decrypt:
1342 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1343%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1344_zero_cipher_left_decrypt:
1345 mov %arg4, %r13
1346 and $15, %r13 # %r13 = arg4 (mod 16)
1347 je _multiple_of_16_bytes_decrypt
1348
1349 # Handle the last <16 byte block separately
1350
1351 paddd ONE(%rip), %xmm0 # increment CNT to get Yn
1352 movdqa SHUF_MASK(%rip), %xmm10
1353 PSHUFB_XMM %xmm10, %xmm0
1354
1355 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
1356 sub $16, %r11
1357 add %r13, %r11
1358 movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
1359 lea SHIFT_MASK+16(%rip), %r12
1360 sub %r13, %r12
1361# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
1362# (%r13 is the number of bytes in plaintext mod 16)
1363 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1364	PSHUFB_XMM %xmm2, %xmm1		# right shift 16-%r13 bytes
1365
1366 movdqa %xmm1, %xmm2
1367 pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
1368 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1369 # get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
1370 pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
1371 pand %xmm1, %xmm2
1372 movdqa SHUF_MASK(%rip), %xmm10
1373 PSHUFB_XMM %xmm10 ,%xmm2
1374
1375 pxor %xmm2, %xmm8
1376 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1377 # GHASH computation for the last <16 byte block
1378 sub %r13, %r11
1379 add $16, %r11
1380
1381 # output %r13 bytes
1382 MOVQ_R64_XMM %xmm0, %rax
1383 cmp $8, %r13
1384 jle _less_than_8_bytes_left_decrypt
1385 mov %rax, (%arg2 , %r11, 1)
1386 add $8, %r11
1387 psrldq $8, %xmm0
1388 MOVQ_R64_XMM %xmm0, %rax
1389 sub $8, %r13
1390_less_than_8_bytes_left_decrypt:
1391 mov %al, (%arg2, %r11, 1)
1392 add $1, %r11
1393 shr $8, %rax
1394 sub $1, %r13
1395 jne _less_than_8_bytes_left_decrypt
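Everything from _zero_cipher_left_decrypt down to this point handles a trailing partial block: the last len mod 16 ciphertext bytes are XORed with the keystream block E(K, Yn), the unused tail of the block is masked to zero before it enters GHASH, and the resulting plaintext bytes are stored eight-then-one byte at a time. A plain C sketch of the same idea (buffer names are illustrative):

/* Partial-block sketch: rem = len % 16 trailing bytes.  keystream[]
 * stands for E(K, Yn); ghash_in[] is the zero-padded block fed to
 * GHASH, matching the pand masking above. */
static void gcm_dec_partial_block(u8 *out, const u8 *in, unsigned long len,
				  const u8 keystream[16], u8 ghash_in[16])
{
	unsigned long i, rem = len % 16, full = len - rem;

	memset(ghash_in, 0, 16);
	for (i = 0; i < rem; i++) {
		ghash_in[i] = in[full + i];		     /* ciphertext feeds GHASH */
		out[full + i] = in[full + i] ^ keystream[i]; /* decrypt                */
	}
}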
1396_multiple_of_16_bytes_decrypt:
1397	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1398 shl $3, %r12 # convert into number of bits
1399 movd %r12d, %xmm15 # len(A) in %xmm15
1400	shl	$3, %arg4		# len(C) in bits (*8)
1401 MOVQ_R64_XMM %arg4, %xmm1
1402 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1403 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1404 pxor %xmm15, %xmm8
1405 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1406 # final GHASH computation
1407 movdqa SHUF_MASK(%rip), %xmm10
1408 PSHUFB_XMM %xmm10, %xmm8
1409
1410 mov %arg5, %rax # %rax = *Y0
1411 movdqu (%rax), %xmm0 # %xmm0 = Y0
1412 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Y0)
1413 pxor %xmm8, %xmm0
1414_return_T_decrypt:
1415 mov arg9, %r10 # %r10 = authTag
1416 mov arg10, %r11 # %r11 = auth_tag_len
1417 cmp $16, %r11
1418 je _T_16_decrypt
1419 cmp $12, %r11
1420 je _T_12_decrypt
1421_T_8_decrypt:
1422 MOVQ_R64_XMM %xmm0, %rax
1423 mov %rax, (%r10)
1424 jmp _return_T_done_decrypt
1425_T_12_decrypt:
1426 MOVQ_R64_XMM %xmm0, %rax
1427 mov %rax, (%r10)
1428 psrldq $8, %xmm0
1429 movd %xmm0, %eax
1430 mov %eax, 8(%r10)
1431 jmp _return_T_done_decrypt
1432_T_16_decrypt:
1433 movdqu %xmm0, (%r10)
1434_return_T_done_decrypt:
1435 mov %r14, %rsp
1436 pop %r14
1437 pop %r13
1438 pop %r12
1439 ret
1440
1441
1442/*****************************************************************************
1443* void aesni_gcm_enc(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
1444* u8 *out, // Ciphertext output. Encrypt in-place is allowed.
1445* const u8 *in, // Plaintext input
1446* u64 plaintext_len, // Length of data in bytes for encryption.
1447* u8 *iv, // Pre-counter block j0: 4 byte salt (from Security Association)
1448* // concatenated with 8 byte Initialisation Vector (from IPSec ESP Payload)
1449* // concatenated with 0x00000001. 16-byte aligned pointer.
1450* u8 *hash_subkey, // H, the Hash sub key input. Data starts on a 16-byte boundary.
1451* const u8 *aad, // Additional Authentication Data (AAD)
1452* u64 aad_len, // Length of AAD in bytes. With RFC4106 this is going to be 8 or 12 bytes
1453* u8 *auth_tag, // Authenticated Tag output.
1454* u64 auth_tag_len); // Authenticated Tag Length in bytes. Valid values are 16 (most likely),
1455* // 12 or 8.
1456*
1457* Assumptions:
1458*
1459* keys:
1460* Keys are pre-expanded and aligned to 16 bytes. We use only the
1461* first set of 11 round keys (AES-128) in the data structure void *aes_ctx.
1462*
1463*
1464* iv:
1465* 0 1 2 3
1466* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1467* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1468* | Salt (From the SA) |
1469* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1470* | Initialization Vector |
1471* | (This is the sequence number from IPSec header) |
1472* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1473* | 0x1 |
1474* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1475*
1476*
1477*
1478* AAD:
1479* AAD padded to 128 bits with 0
1480* for example, assume AAD is a u32 vector
1481*
1482* if AAD is 8 bytes:
1483* AAD[2] = {A0, A1};
1484* padded AAD in xmm register = {A1 A0 0 0}
1485*
1486* 0 1 2 3
1487* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1488* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1489* | SPI (A1) |
1490* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1491* | 32-bit Sequence Number (A0) |
1492* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1493* | 0x0 |
1494* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1495*
1496* AAD Format with 32-bit Sequence Number
1497*
1498* if AAD is 12 bytes:
1499* AAD[3] = {A0, A1, A2};
1500* padded AAD in xmm register = {A2 A1 A0 0}
1501*
1502* 0 1 2 3
1503* 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1504* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1505* | SPI (A2) |
1506* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1507* | 64-bit Extended Sequence Number {A1,A0} |
1508* | |
1509* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1510* | 0x0 |
1511* +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1512*
1513* AAD Format with 64-bit Extended Sequence Number
1514*
1515* aadLen:
1516* from the definition of the spec, aadLen can only be 8 or 12 bytes.
1517* The code also supports 16; any other size will fail.
1518*
1519* TLen:
1520* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
1521* For other sizes, the code will fail.
1522*
1523* poly = x^128 + x^127 + x^126 + x^121 + 1
1524***************************************************************************/
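The calling convention is identical to aesni_gcm_dec() above; only the direction differs, and the computed tag is simply written out rather than compared. As a short sketch of how the glue code below drives it: in-place operation is allowed and the tag is placed directly after the ciphertext (all names here are illustrative):

/* Encrypt-side sketch: in-place encryption with the tag appended.
 * buf holds plaintext_len bytes of plaintext plus 16 spare bytes for
 * the tag; iv is the 16-byte j0 block built as documented above. */
static void gcm_enc_sketch(void *aes_ctx, u8 *hash_subkey, u8 *iv,
			   const u8 *aad, u64 aad_len,
			   u8 *buf, u64 plaintext_len)
{
	aesni_gcm_enc(aes_ctx, buf, buf, plaintext_len, iv,
		      hash_subkey, aad, aad_len,
		      buf + plaintext_len, 16);	/* tag lands after the data */
}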
1525ENTRY(aesni_gcm_enc)
1526 push %r12
1527 push %r13
1528 push %r14
1529 mov %rsp, %r14
1530#
1531# states of %xmm registers %xmm6:%xmm15 not saved
1532# all %xmm registers are clobbered
1533#
1534 sub $VARIABLE_OFFSET, %rsp
1535 and $~63, %rsp
1536 mov %arg6, %r12
1537 movdqu (%r12), %xmm13
1538 movdqa SHUF_MASK(%rip), %xmm2
1539 PSHUFB_XMM %xmm2, %xmm13
1540
1541
1542# precompute HashKey<<1 mod poly from the HashKey (required for GHASH)
1543
1544 movdqa %xmm13, %xmm2
1545 psllq $1, %xmm13
1546 psrlq $63, %xmm2
1547 movdqa %xmm2, %xmm1
1548 pslldq $8, %xmm2
1549 psrldq $8, %xmm1
1550 por %xmm2, %xmm13
1551
1552 # reduce HashKey<<1
1553
1554 pshufd $0x24, %xmm1, %xmm2
1555 pcmpeqd TWOONE(%rip), %xmm2
1556 pand POLY(%rip), %xmm2
1557 pxor %xmm2, %xmm13
1558	movdqa	%xmm13, HashKey(%rsp)	# store HashKey<<1 (mod poly)
1559	mov	%arg4, %r13		# save the number of bytes of plaintext/ciphertext
1560 and $-16, %r13
1561 mov %r13, %r12
1562
1563 # Encrypt first few blocks
1564
1565 and $(3<<4), %r12
1566 jz _initial_num_blocks_is_0_encrypt
1567 cmp $(2<<4), %r12
1568 jb _initial_num_blocks_is_1_encrypt
1569 je _initial_num_blocks_is_2_encrypt
1570_initial_num_blocks_is_3_encrypt:
1571 INITIAL_BLOCKS_ENC 3, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1572%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 5, 678, enc
1573 sub $48, %r13
1574 jmp _initial_blocks_encrypted
1575_initial_num_blocks_is_2_encrypt:
1576 INITIAL_BLOCKS_ENC 2, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1577%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 6, 78, enc
1578 sub $32, %r13
1579 jmp _initial_blocks_encrypted
1580_initial_num_blocks_is_1_encrypt:
1581 INITIAL_BLOCKS_ENC 1, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1582%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 7, 8, enc
1583 sub $16, %r13
1584 jmp _initial_blocks_encrypted
1585_initial_num_blocks_is_0_encrypt:
1586 INITIAL_BLOCKS_ENC 0, %xmm9, %xmm10, %xmm13, %xmm11, %xmm12, %xmm0, \
1587%xmm1, %xmm2, %xmm3, %xmm4, %xmm8, %xmm5, %xmm6, 8, 0, enc
1588_initial_blocks_encrypted:
1589
1590 # Main loop - Encrypt remaining blocks
1591
1592 cmp $0, %r13
1593 je _zero_cipher_left_encrypt
1594 sub $64, %r13
1595 je _four_cipher_left_encrypt
1596_encrypt_by_4_encrypt:
1597 GHASH_4_ENCRYPT_4_PARALLEL_ENC %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, \
1598%xmm14, %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, %xmm8, enc
1599 add $64, %r11
1600 sub $64, %r13
1601 jne _encrypt_by_4_encrypt
1602_four_cipher_left_encrypt:
1603 GHASH_LAST_4 %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, \
1604%xmm15, %xmm1, %xmm2, %xmm3, %xmm4, %xmm8
1605_zero_cipher_left_encrypt:
1606 mov %arg4, %r13
1607 and $15, %r13 # %r13 = arg4 (mod 16)
1608 je _multiple_of_16_bytes_encrypt
1609
1610 # Handle the last <16 Byte block separately
1611 paddd ONE(%rip), %xmm0 # INCR CNT to get Yn
1612 movdqa SHUF_MASK(%rip), %xmm10
1613 PSHUFB_XMM %xmm10, %xmm0
1614
1615
1616 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
1617 sub $16, %r11
1618 add %r13, %r11
1619	movdqu	(%arg3,%r11,1), %xmm1	# receive the last <16 byte block
1620 lea SHIFT_MASK+16(%rip), %r12
1621 sub %r13, %r12
1622 # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
1623 # (%r13 is the number of bytes in plaintext mod 16)
1624 movdqu (%r12), %xmm2 # get the appropriate shuffle mask
1625	PSHUFB_XMM %xmm2, %xmm1		# shift right 16-r13 bytes
1626 pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
1627 movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
1628 # get the appropriate mask to mask out top 16-r13 bytes of xmm0
1629 pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
1630 movdqa SHUF_MASK(%rip), %xmm10
1631 PSHUFB_XMM %xmm10,%xmm0
1632
1633 pxor %xmm0, %xmm8
1634 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1635 # GHASH computation for the last <16 byte block
1636 sub %r13, %r11
1637 add $16, %r11
1638
1639 movdqa SHUF_MASK(%rip), %xmm10
1640 PSHUFB_XMM %xmm10, %xmm0
1641
1642 # shuffle xmm0 back to output as ciphertext
1643
1644 # Output %r13 bytes
1645 MOVQ_R64_XMM %xmm0, %rax
1646 cmp $8, %r13
1647 jle _less_than_8_bytes_left_encrypt
1648 mov %rax, (%arg2 , %r11, 1)
1649 add $8, %r11
1650 psrldq $8, %xmm0
1651 MOVQ_R64_XMM %xmm0, %rax
1652 sub $8, %r13
1653_less_than_8_bytes_left_encrypt:
1654 mov %al, (%arg2, %r11, 1)
1655 add $1, %r11
1656 shr $8, %rax
1657 sub $1, %r13
1658 jne _less_than_8_bytes_left_encrypt
1659_multiple_of_16_bytes_encrypt:
1660	mov	arg8, %r12		# %r12 = aadLen (number of bytes)
1661 shl $3, %r12
1662 movd %r12d, %xmm15 # len(A) in %xmm15
1663	shl	$3, %arg4		# len(C) in bits (*8)
1664 MOVQ_R64_XMM %arg4, %xmm1
1665 pslldq $8, %xmm15 # %xmm15 = len(A)||0x0000000000000000
1666 pxor %xmm1, %xmm15 # %xmm15 = len(A)||len(C)
1667 pxor %xmm15, %xmm8
1668 GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
1669 # final GHASH computation
1670 movdqa SHUF_MASK(%rip), %xmm10
1671 PSHUFB_XMM %xmm10, %xmm8 # perform a 16 byte swap
1672
1673 mov %arg5, %rax # %rax = *Y0
1674 movdqu (%rax), %xmm0 # %xmm0 = Y0
1675 ENCRYPT_SINGLE_BLOCK %xmm0, %xmm15 # Encrypt(K, Y0)
1676 pxor %xmm8, %xmm0
1677_return_T_encrypt:
1678 mov arg9, %r10 # %r10 = authTag
1679 mov arg10, %r11 # %r11 = auth_tag_len
1680 cmp $16, %r11
1681 je _T_16_encrypt
1682 cmp $12, %r11
1683 je _T_12_encrypt
1684_T_8_encrypt:
1685 MOVQ_R64_XMM %xmm0, %rax
1686 mov %rax, (%r10)
1687 jmp _return_T_done_encrypt
1688_T_12_encrypt:
1689 MOVQ_R64_XMM %xmm0, %rax
1690 mov %rax, (%r10)
1691 psrldq $8, %xmm0
1692 movd %xmm0, %eax
1693 mov %eax, 8(%r10)
1694 jmp _return_T_done_encrypt
1695_T_16_encrypt:
1696 movdqu %xmm0, (%r10)
1697_return_T_done_encrypt:
1698 mov %r14, %rsp
1699 pop %r14
1700 pop %r13
1701 pop %r12
1702 ret
1703
1704#endif
1705
49 1706
50_key_expansion_128: 1707_key_expansion_128:
51_key_expansion_256a: 1708_key_expansion_256a:
@@ -55,10 +1712,11 @@ _key_expansion_256a:
55 shufps $0b10001100, %xmm0, %xmm4 1712 shufps $0b10001100, %xmm0, %xmm4
56 pxor %xmm4, %xmm0 1713 pxor %xmm4, %xmm0
57 pxor %xmm1, %xmm0 1714 pxor %xmm1, %xmm0
58 movaps %xmm0, (%rcx) 1715 movaps %xmm0, (TKEYP)
59 add $0x10, %rcx 1716 add $0x10, TKEYP
60 ret 1717 ret
61 1718
1719.align 4
62_key_expansion_192a: 1720_key_expansion_192a:
63 pshufd $0b01010101, %xmm1, %xmm1 1721 pshufd $0b01010101, %xmm1, %xmm1
64 shufps $0b00010000, %xmm0, %xmm4 1722 shufps $0b00010000, %xmm0, %xmm4
@@ -76,12 +1734,13 @@ _key_expansion_192a:
76 1734
77 movaps %xmm0, %xmm1 1735 movaps %xmm0, %xmm1
78 shufps $0b01000100, %xmm0, %xmm6 1736 shufps $0b01000100, %xmm0, %xmm6
79 movaps %xmm6, (%rcx) 1737 movaps %xmm6, (TKEYP)
80 shufps $0b01001110, %xmm2, %xmm1 1738 shufps $0b01001110, %xmm2, %xmm1
81 movaps %xmm1, 16(%rcx) 1739 movaps %xmm1, 0x10(TKEYP)
82 add $0x20, %rcx 1740 add $0x20, TKEYP
83 ret 1741 ret
84 1742
1743.align 4
85_key_expansion_192b: 1744_key_expansion_192b:
86 pshufd $0b01010101, %xmm1, %xmm1 1745 pshufd $0b01010101, %xmm1, %xmm1
87 shufps $0b00010000, %xmm0, %xmm4 1746 shufps $0b00010000, %xmm0, %xmm4
@@ -96,10 +1755,11 @@ _key_expansion_192b:
96 pxor %xmm3, %xmm2 1755 pxor %xmm3, %xmm2
97 pxor %xmm5, %xmm2 1756 pxor %xmm5, %xmm2
98 1757
99 movaps %xmm0, (%rcx) 1758 movaps %xmm0, (TKEYP)
100 add $0x10, %rcx 1759 add $0x10, TKEYP
101 ret 1760 ret
102 1761
1762.align 4
103_key_expansion_256b: 1763_key_expansion_256b:
104 pshufd $0b10101010, %xmm1, %xmm1 1764 pshufd $0b10101010, %xmm1, %xmm1
105 shufps $0b00010000, %xmm2, %xmm4 1765 shufps $0b00010000, %xmm2, %xmm4
@@ -107,8 +1767,8 @@ _key_expansion_256b:
107 shufps $0b10001100, %xmm2, %xmm4 1767 shufps $0b10001100, %xmm2, %xmm4
108 pxor %xmm4, %xmm2 1768 pxor %xmm4, %xmm2
109 pxor %xmm1, %xmm2 1769 pxor %xmm1, %xmm2
110 movaps %xmm2, (%rcx) 1770 movaps %xmm2, (TKEYP)
111 add $0x10, %rcx 1771 add $0x10, TKEYP
112 ret 1772 ret
113 1773
114/* 1774/*
@@ -116,17 +1776,23 @@ _key_expansion_256b:
116 * unsigned int key_len) 1776 * unsigned int key_len)
117 */ 1777 */
118ENTRY(aesni_set_key) 1778ENTRY(aesni_set_key)
119 movups (%rsi), %xmm0 # user key (first 16 bytes) 1779#ifndef __x86_64__
120 movaps %xmm0, (%rdi) 1780 pushl KEYP
121 lea 0x10(%rdi), %rcx # key addr 1781 movl 8(%esp), KEYP # ctx
122 movl %edx, 480(%rdi) 1782 movl 12(%esp), UKEYP # in_key
1783 movl 16(%esp), %edx # key_len
1784#endif
1785 movups (UKEYP), %xmm0 # user key (first 16 bytes)
1786 movaps %xmm0, (KEYP)
1787 lea 0x10(KEYP), TKEYP # key addr
1788 movl %edx, 480(KEYP)
123 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x 1789 pxor %xmm4, %xmm4 # xmm4 is assumed 0 in _key_expansion_x
124 cmp $24, %dl 1790 cmp $24, %dl
125 jb .Lenc_key128 1791 jb .Lenc_key128
126 je .Lenc_key192 1792 je .Lenc_key192
127 movups 0x10(%rsi), %xmm2 # other user key 1793 movups 0x10(UKEYP), %xmm2 # other user key
128 movaps %xmm2, (%rcx) 1794 movaps %xmm2, (TKEYP)
129 add $0x10, %rcx 1795 add $0x10, TKEYP
130 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1796 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
131 call _key_expansion_256a 1797 call _key_expansion_256a
132 AESKEYGENASSIST 0x1 %xmm0 %xmm1 1798 AESKEYGENASSIST 0x1 %xmm0 %xmm1
@@ -155,7 +1821,7 @@ ENTRY(aesni_set_key)
155 call _key_expansion_256a 1821 call _key_expansion_256a
156 jmp .Ldec_key 1822 jmp .Ldec_key
157.Lenc_key192: 1823.Lenc_key192:
158 movq 0x10(%rsi), %xmm2 # other user key 1824 movq 0x10(UKEYP), %xmm2 # other user key
159 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1 1825 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
160 call _key_expansion_192a 1826 call _key_expansion_192a
161 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2 1827 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
@@ -195,33 +1861,47 @@ ENTRY(aesni_set_key)
195 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10 1861 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
196 call _key_expansion_128 1862 call _key_expansion_128
197.Ldec_key: 1863.Ldec_key:
198 sub $0x10, %rcx 1864 sub $0x10, TKEYP
199 movaps (%rdi), %xmm0 1865 movaps (KEYP), %xmm0
200 movaps (%rcx), %xmm1 1866 movaps (TKEYP), %xmm1
201 movaps %xmm0, 240(%rcx) 1867 movaps %xmm0, 240(TKEYP)
202 movaps %xmm1, 240(%rdi) 1868 movaps %xmm1, 240(KEYP)
203 add $0x10, %rdi 1869 add $0x10, KEYP
204 lea 240-16(%rcx), %rsi 1870 lea 240-16(TKEYP), UKEYP
205.align 4 1871.align 4
206.Ldec_key_loop: 1872.Ldec_key_loop:
207 movaps (%rdi), %xmm0 1873 movaps (KEYP), %xmm0
208 AESIMC %xmm0 %xmm1 1874 AESIMC %xmm0 %xmm1
209 movaps %xmm1, (%rsi) 1875 movaps %xmm1, (UKEYP)
210 add $0x10, %rdi 1876 add $0x10, KEYP
211 sub $0x10, %rsi 1877 sub $0x10, UKEYP
212 cmp %rcx, %rdi 1878 cmp TKEYP, KEYP
213 jb .Ldec_key_loop 1879 jb .Ldec_key_loop
214 xor %rax, %rax 1880 xor AREG, AREG
1881#ifndef __x86_64__
1882 popl KEYP
1883#endif
215 ret 1884 ret
216 1885
217/* 1886/*
218 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 1887 * void aesni_enc(struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
219 */ 1888 */
220ENTRY(aesni_enc) 1889ENTRY(aesni_enc)
1890#ifndef __x86_64__
1891 pushl KEYP
1892 pushl KLEN
1893 movl 12(%esp), KEYP
1894 movl 16(%esp), OUTP
1895 movl 20(%esp), INP
1896#endif
221 movl 480(KEYP), KLEN # key length 1897 movl 480(KEYP), KLEN # key length
222 movups (INP), STATE # input 1898 movups (INP), STATE # input
223 call _aesni_enc1 1899 call _aesni_enc1
224 movups STATE, (OUTP) # output 1900 movups STATE, (OUTP) # output
1901#ifndef __x86_64__
1902 popl KLEN
1903 popl KEYP
1904#endif
225 ret 1905 ret
226 1906
227/* 1907/*
@@ -236,6 +1916,7 @@ ENTRY(aesni_enc)
236 * KEY 1916 * KEY
237 * TKEYP (T1) 1917 * TKEYP (T1)
238 */ 1918 */
1919.align 4
239_aesni_enc1: 1920_aesni_enc1:
240 movaps (KEYP), KEY # key 1921 movaps (KEYP), KEY # key
241 mov KEYP, TKEYP 1922 mov KEYP, TKEYP
@@ -298,6 +1979,7 @@ _aesni_enc1:
298 * KEY 1979 * KEY
299 * TKEYP (T1) 1980 * TKEYP (T1)
300 */ 1981 */
1982.align 4
301_aesni_enc4: 1983_aesni_enc4:
302 movaps (KEYP), KEY # key 1984 movaps (KEYP), KEY # key
303 mov KEYP, TKEYP 1985 mov KEYP, TKEYP
@@ -391,11 +2073,22 @@ _aesni_enc4:
391 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src) 2073 * void aesni_dec (struct crypto_aes_ctx *ctx, u8 *dst, const u8 *src)
392 */ 2074 */
393ENTRY(aesni_dec) 2075ENTRY(aesni_dec)
2076#ifndef __x86_64__
2077 pushl KEYP
2078 pushl KLEN
2079 movl 12(%esp), KEYP
2080 movl 16(%esp), OUTP
2081 movl 20(%esp), INP
2082#endif
394 mov 480(KEYP), KLEN # key length 2083 mov 480(KEYP), KLEN # key length
395 add $240, KEYP 2084 add $240, KEYP
396 movups (INP), STATE # input 2085 movups (INP), STATE # input
397 call _aesni_dec1 2086 call _aesni_dec1
398 movups STATE, (OUTP) #output 2087 movups STATE, (OUTP) #output
2088#ifndef __x86_64__
2089 popl KLEN
2090 popl KEYP
2091#endif
399 ret 2092 ret
400 2093
401/* 2094/*
@@ -410,6 +2103,7 @@ ENTRY(aesni_dec)
410 * KEY 2103 * KEY
411 * TKEYP (T1) 2104 * TKEYP (T1)
412 */ 2105 */
2106.align 4
413_aesni_dec1: 2107_aesni_dec1:
414 movaps (KEYP), KEY # key 2108 movaps (KEYP), KEY # key
415 mov KEYP, TKEYP 2109 mov KEYP, TKEYP
@@ -472,6 +2166,7 @@ _aesni_dec1:
472 * KEY 2166 * KEY
473 * TKEYP (T1) 2167 * TKEYP (T1)
474 */ 2168 */
2169.align 4
475_aesni_dec4: 2170_aesni_dec4:
476 movaps (KEYP), KEY # key 2171 movaps (KEYP), KEY # key
477 mov KEYP, TKEYP 2172 mov KEYP, TKEYP
@@ -566,6 +2261,15 @@ _aesni_dec4:
566 * size_t len) 2261 * size_t len)
567 */ 2262 */
568ENTRY(aesni_ecb_enc) 2263ENTRY(aesni_ecb_enc)
2264#ifndef __x86_64__
2265 pushl LEN
2266 pushl KEYP
2267 pushl KLEN
2268 movl 16(%esp), KEYP
2269 movl 20(%esp), OUTP
2270 movl 24(%esp), INP
2271 movl 28(%esp), LEN
2272#endif
569 test LEN, LEN # check length 2273 test LEN, LEN # check length
570 jz .Lecb_enc_ret 2274 jz .Lecb_enc_ret
571 mov 480(KEYP), KLEN 2275 mov 480(KEYP), KLEN
@@ -602,6 +2306,11 @@ ENTRY(aesni_ecb_enc)
602 cmp $16, LEN 2306 cmp $16, LEN
603 jge .Lecb_enc_loop1 2307 jge .Lecb_enc_loop1
604.Lecb_enc_ret: 2308.Lecb_enc_ret:
2309#ifndef __x86_64__
2310 popl KLEN
2311 popl KEYP
2312 popl LEN
2313#endif
605 ret 2314 ret
606 2315
607/* 2316/*
@@ -609,6 +2318,15 @@ ENTRY(aesni_ecb_enc)
609 * size_t len); 2318 * size_t len);
610 */ 2319 */
611ENTRY(aesni_ecb_dec) 2320ENTRY(aesni_ecb_dec)
2321#ifndef __x86_64__
2322 pushl LEN
2323 pushl KEYP
2324 pushl KLEN
2325 movl 16(%esp), KEYP
2326 movl 20(%esp), OUTP
2327 movl 24(%esp), INP
2328 movl 28(%esp), LEN
2329#endif
612 test LEN, LEN 2330 test LEN, LEN
613 jz .Lecb_dec_ret 2331 jz .Lecb_dec_ret
614 mov 480(KEYP), KLEN 2332 mov 480(KEYP), KLEN
@@ -646,6 +2364,11 @@ ENTRY(aesni_ecb_dec)
646 cmp $16, LEN 2364 cmp $16, LEN
647 jge .Lecb_dec_loop1 2365 jge .Lecb_dec_loop1
648.Lecb_dec_ret: 2366.Lecb_dec_ret:
2367#ifndef __x86_64__
2368 popl KLEN
2369 popl KEYP
2370 popl LEN
2371#endif
649 ret 2372 ret
650 2373
651/* 2374/*
@@ -653,6 +2376,17 @@ ENTRY(aesni_ecb_dec)
653 * size_t len, u8 *iv) 2376 * size_t len, u8 *iv)
654 */ 2377 */
655ENTRY(aesni_cbc_enc) 2378ENTRY(aesni_cbc_enc)
2379#ifndef __x86_64__
2380 pushl IVP
2381 pushl LEN
2382 pushl KEYP
2383 pushl KLEN
2384 movl 20(%esp), KEYP
2385 movl 24(%esp), OUTP
2386 movl 28(%esp), INP
2387 movl 32(%esp), LEN
2388 movl 36(%esp), IVP
2389#endif
656 cmp $16, LEN 2390 cmp $16, LEN
657 jb .Lcbc_enc_ret 2391 jb .Lcbc_enc_ret
658 mov 480(KEYP), KLEN 2392 mov 480(KEYP), KLEN
@@ -670,6 +2404,12 @@ ENTRY(aesni_cbc_enc)
670 jge .Lcbc_enc_loop 2404 jge .Lcbc_enc_loop
671 movups STATE, (IVP) 2405 movups STATE, (IVP)
672.Lcbc_enc_ret: 2406.Lcbc_enc_ret:
2407#ifndef __x86_64__
2408 popl KLEN
2409 popl KEYP
2410 popl LEN
2411 popl IVP
2412#endif
673 ret 2413 ret
674 2414
675/* 2415/*
@@ -677,6 +2417,17 @@ ENTRY(aesni_cbc_enc)
677 * size_t len, u8 *iv) 2417 * size_t len, u8 *iv)
678 */ 2418 */
679ENTRY(aesni_cbc_dec) 2419ENTRY(aesni_cbc_dec)
2420#ifndef __x86_64__
2421 pushl IVP
2422 pushl LEN
2423 pushl KEYP
2424 pushl KLEN
2425 movl 20(%esp), KEYP
2426 movl 24(%esp), OUTP
2427 movl 28(%esp), INP
2428 movl 32(%esp), LEN
2429 movl 36(%esp), IVP
2430#endif
680 cmp $16, LEN 2431 cmp $16, LEN
681 jb .Lcbc_dec_just_ret 2432 jb .Lcbc_dec_just_ret
682 mov 480(KEYP), KLEN 2433 mov 480(KEYP), KLEN
@@ -690,16 +2441,30 @@ ENTRY(aesni_cbc_dec)
690 movaps IN1, STATE1 2441 movaps IN1, STATE1
691 movups 0x10(INP), IN2 2442 movups 0x10(INP), IN2
692 movaps IN2, STATE2 2443 movaps IN2, STATE2
2444#ifdef __x86_64__
693 movups 0x20(INP), IN3 2445 movups 0x20(INP), IN3
694 movaps IN3, STATE3 2446 movaps IN3, STATE3
695 movups 0x30(INP), IN4 2447 movups 0x30(INP), IN4
696 movaps IN4, STATE4 2448 movaps IN4, STATE4
2449#else
2450 movups 0x20(INP), IN1
2451 movaps IN1, STATE3
2452 movups 0x30(INP), IN2
2453 movaps IN2, STATE4
2454#endif
697 call _aesni_dec4 2455 call _aesni_dec4
698 pxor IV, STATE1 2456 pxor IV, STATE1
2457#ifdef __x86_64__
699 pxor IN1, STATE2 2458 pxor IN1, STATE2
700 pxor IN2, STATE3 2459 pxor IN2, STATE3
701 pxor IN3, STATE4 2460 pxor IN3, STATE4
702 movaps IN4, IV 2461 movaps IN4, IV
2462#else
2463 pxor (INP), STATE2
2464 pxor 0x10(INP), STATE3
2465 pxor IN1, STATE4
2466 movaps IN2, IV
2467#endif
703 movups STATE1, (OUTP) 2468 movups STATE1, (OUTP)
704 movups STATE2, 0x10(OUTP) 2469 movups STATE2, 0x10(OUTP)
705 movups STATE3, 0x20(OUTP) 2470 movups STATE3, 0x20(OUTP)
@@ -727,8 +2492,15 @@ ENTRY(aesni_cbc_dec)
727.Lcbc_dec_ret: 2492.Lcbc_dec_ret:
728 movups IV, (IVP) 2493 movups IV, (IVP)
729.Lcbc_dec_just_ret: 2494.Lcbc_dec_just_ret:
2495#ifndef __x86_64__
2496 popl KLEN
2497 popl KEYP
2498 popl LEN
2499 popl IVP
2500#endif
730 ret 2501 ret
731 2502
2503#ifdef __x86_64__
732.align 16 2504.align 16
733.Lbswap_mask: 2505.Lbswap_mask:
734 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 2506 .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
@@ -744,6 +2516,7 @@ ENTRY(aesni_cbc_dec)
744 * INC: == 1, in little endian 2516 * INC: == 1, in little endian
745 * BSWAP_MASK == endian swapping mask 2517 * BSWAP_MASK == endian swapping mask
746 */ 2518 */
2519.align 4
747_aesni_inc_init: 2520_aesni_inc_init:
748 movaps .Lbswap_mask, BSWAP_MASK 2521 movaps .Lbswap_mask, BSWAP_MASK
749 movaps IV, CTR 2522 movaps IV, CTR
@@ -768,6 +2541,7 @@ _aesni_inc_init:
768 * CTR: == output IV, in little endian 2541 * CTR: == output IV, in little endian
769 * TCTR_LOW: == lower qword of CTR 2542 * TCTR_LOW: == lower qword of CTR
770 */ 2543 */
2544.align 4
771_aesni_inc: 2545_aesni_inc:
772 paddq INC, CTR 2546 paddq INC, CTR
773 add $1, TCTR_LOW 2547 add $1, TCTR_LOW
@@ -839,3 +2613,4 @@ ENTRY(aesni_ctr_enc)
839 movups IV, (IVP) 2613 movups IV, (IVP)
840.Lctr_enc_just_ret: 2614.Lctr_enc_just_ret:
841 ret 2615 ret
2616#endif
diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c
index 2cb3dcc4490a..feee8ff1d05e 100644
--- a/arch/x86/crypto/aesni-intel_glue.c
+++ b/arch/x86/crypto/aesni-intel_glue.c
@@ -5,6 +5,14 @@
5 * Copyright (C) 2008, Intel Corp. 5 * Copyright (C) 2008, Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com> 6 * Author: Huang Ying <ying.huang@intel.com>
7 * 7 *
8 * Added RFC4106 AES-GCM support for 128-bit keys under the AEAD
9 * interface for 64-bit kernels.
10 * Authors: Adrian Hoban <adrian.hoban@intel.com>
11 * Gabriele Paoloni <gabriele.paoloni@intel.com>
12 * Tadeusz Struk (tadeusz.struk@intel.com)
13 * Aidan O'Mahony (aidan.o.mahony@intel.com)
14 * Copyright (c) 2010, Intel Corporation.
15 *
8 * This program is free software; you can redistribute it and/or modify 16 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by 17 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or 18 * the Free Software Foundation; either version 2 of the License, or
@@ -21,6 +29,10 @@
21#include <crypto/ctr.h> 29#include <crypto/ctr.h>
22#include <asm/i387.h> 30#include <asm/i387.h>
23#include <asm/aes.h> 31#include <asm/aes.h>
32#include <crypto/scatterwalk.h>
33#include <crypto/internal/aead.h>
34#include <linux/workqueue.h>
35#include <linux/spinlock.h>
24 36
25#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE) 37#if defined(CONFIG_CRYPTO_CTR) || defined(CONFIG_CRYPTO_CTR_MODULE)
26#define HAS_CTR 38#define HAS_CTR
@@ -42,8 +54,31 @@ struct async_aes_ctx {
42 struct cryptd_ablkcipher *cryptd_tfm; 54 struct cryptd_ablkcipher *cryptd_tfm;
43}; 55};
44 56
45#define AESNI_ALIGN 16 57/* This data is stored at the end of the crypto_tfm struct.
58 * It is per-"session" (per-transform) data storage.
59 * This needs to be 16 byte aligned.
60 */
61struct aesni_rfc4106_gcm_ctx {
62 u8 hash_subkey[16];
63 struct crypto_aes_ctx aes_key_expanded;
64 u8 nonce[4];
65 struct cryptd_aead *cryptd_tfm;
66};
67
68struct aesni_gcm_set_hash_subkey_result {
69 int err;
70 struct completion completion;
71};
72
73struct aesni_hash_subkey_req_data {
74 u8 iv[16];
75 struct aesni_gcm_set_hash_subkey_result result;
76 struct scatterlist sg;
77};
78
79#define AESNI_ALIGN (16)
46#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1)) 80#define AES_BLOCK_MASK (~(AES_BLOCK_SIZE-1))
81#define RFC4106_HASH_SUBKEY_SIZE 16
47 82
48asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key, 83asmlinkage int aesni_set_key(struct crypto_aes_ctx *ctx, const u8 *in_key,
49 unsigned int key_len); 84 unsigned int key_len);
@@ -59,9 +94,66 @@ asmlinkage void aesni_cbc_enc(struct crypto_aes_ctx *ctx, u8 *out,
59 const u8 *in, unsigned int len, u8 *iv); 94 const u8 *in, unsigned int len, u8 *iv);
60asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out, 95asmlinkage void aesni_cbc_dec(struct crypto_aes_ctx *ctx, u8 *out,
61 const u8 *in, unsigned int len, u8 *iv); 96 const u8 *in, unsigned int len, u8 *iv);
97
98int crypto_fpu_init(void);
99void crypto_fpu_exit(void);
100
101#ifdef CONFIG_X86_64
62asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out, 102asmlinkage void aesni_ctr_enc(struct crypto_aes_ctx *ctx, u8 *out,
63 const u8 *in, unsigned int len, u8 *iv); 103 const u8 *in, unsigned int len, u8 *iv);
64 104
105/* asmlinkage void aesni_gcm_enc()
106 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
107 * u8 *out, Ciphertext output. Encrypt in-place is allowed.
108 * const u8 *in, Plaintext input
109 * unsigned long plaintext_len, Length of data in bytes for encryption.
110 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
111 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
112 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
113 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
114 * const u8 *aad, Additional Authentication Data (AAD)
115 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this
116 * is going to be 8 or 12 bytes
117 * u8 *auth_tag, Authenticated Tag output.
118 * unsigned long auth_tag_len), Authenticated Tag Length in bytes.
119 * Valid values are 16 (most likely), 12 or 8.
120 */
121asmlinkage void aesni_gcm_enc(void *ctx, u8 *out,
122 const u8 *in, unsigned long plaintext_len, u8 *iv,
123 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
124 u8 *auth_tag, unsigned long auth_tag_len);
125
126/* asmlinkage void aesni_gcm_dec()
127 * void *ctx, AES Key schedule. Starts on a 16 byte boundary.
128 * u8 *out, Plaintext output. Decrypt in-place is allowed.
129 * const u8 *in, Ciphertext input
130 * unsigned long ciphertext_len, Length of data in bytes for decryption.
131 * u8 *iv, Pre-counter block j0: 4 byte salt (from Security Association)
132 * concatenated with 8 byte Initialisation Vector (from IPSec ESP
133 * Payload) concatenated with 0x00000001. 16-byte aligned pointer.
134 * u8 *hash_subkey, the Hash sub key input. Data starts on a 16-byte boundary.
135 * const u8 *aad, Additional Authentication Data (AAD)
136 * unsigned long aad_len, Length of AAD in bytes. With RFC4106 this is going
137 * to be 8 or 12 bytes
138 * u8 *auth_tag, Authenticated Tag output.
139 * unsigned long auth_tag_len) Authenticated Tag Length in bytes.
140 * Valid values are 16 (most likely), 12 or 8.
141 */
142asmlinkage void aesni_gcm_dec(void *ctx, u8 *out,
143 const u8 *in, unsigned long ciphertext_len, u8 *iv,
144 u8 *hash_subkey, const u8 *aad, unsigned long aad_len,
145 u8 *auth_tag, unsigned long auth_tag_len);
146
147static inline struct
148aesni_rfc4106_gcm_ctx *aesni_rfc4106_gcm_ctx_get(struct crypto_aead *tfm)
149{
150 return
151 (struct aesni_rfc4106_gcm_ctx *)
152 PTR_ALIGN((u8 *)
153 crypto_tfm_ctx(crypto_aead_tfm(tfm)), AESNI_ALIGN);
154}
155#endif
156
65static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx) 157static inline struct crypto_aes_ctx *aes_ctx(void *raw_ctx)
66{ 158{
67 unsigned long addr = (unsigned long)raw_ctx; 159 unsigned long addr = (unsigned long)raw_ctx;
@@ -324,6 +416,7 @@ static struct crypto_alg blk_cbc_alg = {
324 }, 416 },
325}; 417};
326 418
419#ifdef CONFIG_X86_64
327static void ctr_crypt_final(struct crypto_aes_ctx *ctx, 420static void ctr_crypt_final(struct crypto_aes_ctx *ctx,
328 struct blkcipher_walk *walk) 421 struct blkcipher_walk *walk)
329{ 422{
@@ -389,6 +482,7 @@ static struct crypto_alg blk_ctr_alg = {
389 }, 482 },
390 }, 483 },
391}; 484};
485#endif
392 486
393static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key, 487static int ablk_set_key(struct crypto_ablkcipher *tfm, const u8 *key,
394 unsigned int key_len) 488 unsigned int key_len)
@@ -536,6 +630,7 @@ static struct crypto_alg ablk_cbc_alg = {
536 }, 630 },
537}; 631};
538 632
633#ifdef CONFIG_X86_64
539static int ablk_ctr_init(struct crypto_tfm *tfm) 634static int ablk_ctr_init(struct crypto_tfm *tfm)
540{ 635{
541 struct cryptd_ablkcipher *cryptd_tfm; 636 struct cryptd_ablkcipher *cryptd_tfm;
@@ -612,6 +707,7 @@ static struct crypto_alg ablk_rfc3686_ctr_alg = {
612 }, 707 },
613}; 708};
614#endif 709#endif
710#endif
615 711
616#ifdef HAS_LRW 712#ifdef HAS_LRW
617static int ablk_lrw_init(struct crypto_tfm *tfm) 713static int ablk_lrw_init(struct crypto_tfm *tfm)
@@ -730,6 +826,432 @@ static struct crypto_alg ablk_xts_alg = {
730}; 826};
731#endif 827#endif
732 828
829#ifdef CONFIG_X86_64
830static int rfc4106_init(struct crypto_tfm *tfm)
831{
832 struct cryptd_aead *cryptd_tfm;
833 struct aesni_rfc4106_gcm_ctx *ctx = (struct aesni_rfc4106_gcm_ctx *)
834 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
835 struct crypto_aead *cryptd_child;
836 struct aesni_rfc4106_gcm_ctx *child_ctx;
837 cryptd_tfm = cryptd_alloc_aead("__driver-gcm-aes-aesni", 0, 0);
838 if (IS_ERR(cryptd_tfm))
839 return PTR_ERR(cryptd_tfm);
840
841 cryptd_child = cryptd_aead_child(cryptd_tfm);
842 child_ctx = aesni_rfc4106_gcm_ctx_get(cryptd_child);
843 memcpy(child_ctx, ctx, sizeof(*ctx));
844 ctx->cryptd_tfm = cryptd_tfm;
845 tfm->crt_aead.reqsize = sizeof(struct aead_request)
846 + crypto_aead_reqsize(&cryptd_tfm->base);
847 return 0;
848}
849
850static void rfc4106_exit(struct crypto_tfm *tfm)
851{
852 struct aesni_rfc4106_gcm_ctx *ctx =
853 (struct aesni_rfc4106_gcm_ctx *)
854 PTR_ALIGN((u8 *)crypto_tfm_ctx(tfm), AESNI_ALIGN);
855 if (!IS_ERR(ctx->cryptd_tfm))
856 cryptd_free_aead(ctx->cryptd_tfm);
857 return;
858}
859
860static void
861rfc4106_set_hash_subkey_done(struct crypto_async_request *req, int err)
862{
863 struct aesni_gcm_set_hash_subkey_result *result = req->data;
864
865 if (err == -EINPROGRESS)
866 return;
867 result->err = err;
868 complete(&result->completion);
869}
870
871static int
872rfc4106_set_hash_subkey(u8 *hash_subkey, const u8 *key, unsigned int key_len)
873{
874 struct crypto_ablkcipher *ctr_tfm;
875 struct ablkcipher_request *req;
876 int ret = -EINVAL;
877 struct aesni_hash_subkey_req_data *req_data;
878
879 ctr_tfm = crypto_alloc_ablkcipher("ctr(aes)", 0, 0);
880 if (IS_ERR(ctr_tfm))
881 return PTR_ERR(ctr_tfm);
882
883 crypto_ablkcipher_clear_flags(ctr_tfm, ~0);
884
885 ret = crypto_ablkcipher_setkey(ctr_tfm, key, key_len);
886 if (ret)
887 goto out_free_ablkcipher;
888
889 ret = -ENOMEM;
890 req = ablkcipher_request_alloc(ctr_tfm, GFP_KERNEL);
891 if (!req)
892 goto out_free_ablkcipher;
893
894 req_data = kmalloc(sizeof(*req_data), GFP_KERNEL);
895 if (!req_data)
896 goto out_free_request;
897
898 memset(req_data->iv, 0, sizeof(req_data->iv));
899
900 /* Clear the data in the hash sub key container to zero.*/
901 /* We want to cipher all zeros to create the hash sub key. */
902 memset(hash_subkey, 0, RFC4106_HASH_SUBKEY_SIZE);
903
904 init_completion(&req_data->result.completion);
905 sg_init_one(&req_data->sg, hash_subkey, RFC4106_HASH_SUBKEY_SIZE);
906 ablkcipher_request_set_tfm(req, ctr_tfm);
907 ablkcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP |
908 CRYPTO_TFM_REQ_MAY_BACKLOG,
909 rfc4106_set_hash_subkey_done,
910 &req_data->result);
911
912 ablkcipher_request_set_crypt(req, &req_data->sg,
913 &req_data->sg, RFC4106_HASH_SUBKEY_SIZE, req_data->iv);
914
915 ret = crypto_ablkcipher_encrypt(req);
916 if (ret == -EINPROGRESS || ret == -EBUSY) {
917 ret = wait_for_completion_interruptible
918 (&req_data->result.completion);
919 if (!ret)
920 ret = req_data->result.err;
921 }
922 kfree(req_data);
923out_free_request:
924 ablkcipher_request_free(req);
925out_free_ablkcipher:
926 crypto_free_ablkcipher(ctr_tfm);
927 return ret;
928}
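The detour through ctr(aes) above is just an asynchronous way of computing a single AES block: with an all-zero IV, encrypting one all-zero block yields exactly H = E(K, 0^128), which is how GCM defines the hash subkey. A hypothetical direct equivalent using the synchronous single-block cipher API, shown only to make that explicit (the driver itself keeps the ctr(aes) path):

/* Hypothetical equivalent: hash_subkey = E(K, 0^128). */
static int rfc4106_hash_subkey_direct(u8 *hash_subkey, const u8 *key,
				      unsigned int key_len)
{
	struct crypto_cipher *aes;
	static const u8 zeroes[16];
	int err;

	aes = crypto_alloc_cipher("aes", 0, 0);
	if (IS_ERR(aes))
		return PTR_ERR(aes);

	err = crypto_cipher_setkey(aes, key, key_len);
	if (!err)
		crypto_cipher_encrypt_one(aes, hash_subkey, zeroes);
	crypto_free_cipher(aes);
	return err;
}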
929
930static int rfc4106_set_key(struct crypto_aead *parent, const u8 *key,
931 unsigned int key_len)
932{
933 int ret = 0;
934 struct crypto_tfm *tfm = crypto_aead_tfm(parent);
935 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
936 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
937 struct aesni_rfc4106_gcm_ctx *child_ctx =
938 aesni_rfc4106_gcm_ctx_get(cryptd_child);
939 u8 *new_key_mem = NULL;
940
941 if (key_len < 4) {
942 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
943 return -EINVAL;
944 }
945 /*Account for 4 byte nonce at the end.*/
946 key_len -= 4;
947 if (key_len != AES_KEYSIZE_128) {
948 crypto_tfm_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
949 return -EINVAL;
950 }
951
952 memcpy(ctx->nonce, key + key_len, sizeof(ctx->nonce));
953 /*This must be on a 16 byte boundary!*/
954 if ((unsigned long)(&(ctx->aes_key_expanded.key_enc[0])) % AESNI_ALIGN)
955 return -EINVAL;
956
957 if ((unsigned long)key % AESNI_ALIGN) {
 958 /*key is not aligned: use an auxiliary aligned pointer*/
959 new_key_mem = kmalloc(key_len+AESNI_ALIGN, GFP_KERNEL);
960 if (!new_key_mem)
961 return -ENOMEM;
962
963 new_key_mem = PTR_ALIGN(new_key_mem, AESNI_ALIGN);
964 memcpy(new_key_mem, key, key_len);
965 key = new_key_mem;
966 }
967
968 if (!irq_fpu_usable())
969 ret = crypto_aes_expand_key(&(ctx->aes_key_expanded),
970 key, key_len);
971 else {
972 kernel_fpu_begin();
973 ret = aesni_set_key(&(ctx->aes_key_expanded), key, key_len);
974 kernel_fpu_end();
975 }
976 /*This must be on a 16 byte boundary!*/
977 if ((unsigned long)(&(ctx->hash_subkey[0])) % AESNI_ALIGN) {
978 ret = -EINVAL;
979 goto exit;
980 }
981 ret = rfc4106_set_hash_subkey(ctx->hash_subkey, key, key_len);
982 memcpy(child_ctx, ctx, sizeof(*ctx));
983exit:
984 kfree(new_key_mem);
985 return ret;
986}
987
 988/* This is the Integrity Check Value (aka the authentication tag) and can
989 * be 8, 12 or 16 bytes long. */
990static int rfc4106_set_authsize(struct crypto_aead *parent,
991 unsigned int authsize)
992{
993 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(parent);
994 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
995
996 switch (authsize) {
997 case 8:
998 case 12:
999 case 16:
1000 break;
1001 default:
1002 return -EINVAL;
1003 }
1004 crypto_aead_crt(parent)->authsize = authsize;
1005 crypto_aead_crt(cryptd_child)->authsize = authsize;
1006 return 0;
1007}
1008
1009static int rfc4106_encrypt(struct aead_request *req)
1010{
1011 int ret;
1012 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1013 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1014
1015 if (!irq_fpu_usable()) {
1016 struct aead_request *cryptd_req =
1017 (struct aead_request *) aead_request_ctx(req);
1018 memcpy(cryptd_req, req, sizeof(*req));
1019 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1020 return crypto_aead_encrypt(cryptd_req);
1021 } else {
1022 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1023 kernel_fpu_begin();
1024 ret = cryptd_child->base.crt_aead.encrypt(req);
1025 kernel_fpu_end();
1026 return ret;
1027 }
1028}
1029
1030static int rfc4106_decrypt(struct aead_request *req)
1031{
1032 int ret;
1033 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1034 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1035
1036 if (!irq_fpu_usable()) {
1037 struct aead_request *cryptd_req =
1038 (struct aead_request *) aead_request_ctx(req);
1039 memcpy(cryptd_req, req, sizeof(*req));
1040 aead_request_set_tfm(cryptd_req, &ctx->cryptd_tfm->base);
1041 return crypto_aead_decrypt(cryptd_req);
1042 } else {
1043 struct crypto_aead *cryptd_child = cryptd_aead_child(ctx->cryptd_tfm);
1044 kernel_fpu_begin();
1045 ret = cryptd_child->base.crt_aead.decrypt(req);
1046 kernel_fpu_end();
1047 return ret;
1048 }
1049}
1050
1051static struct crypto_alg rfc4106_alg = {
1052 .cra_name = "rfc4106(gcm(aes))",
1053 .cra_driver_name = "rfc4106-gcm-aesni",
1054 .cra_priority = 400,
1055 .cra_flags = CRYPTO_ALG_TYPE_AEAD | CRYPTO_ALG_ASYNC,
1056 .cra_blocksize = 1,
1057 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1058 .cra_alignmask = 0,
1059 .cra_type = &crypto_nivaead_type,
1060 .cra_module = THIS_MODULE,
1061 .cra_list = LIST_HEAD_INIT(rfc4106_alg.cra_list),
1062 .cra_init = rfc4106_init,
1063 .cra_exit = rfc4106_exit,
1064 .cra_u = {
1065 .aead = {
1066 .setkey = rfc4106_set_key,
1067 .setauthsize = rfc4106_set_authsize,
1068 .encrypt = rfc4106_encrypt,
1069 .decrypt = rfc4106_decrypt,
1070 .geniv = "seqiv",
1071 .ivsize = 8,
1072 .maxauthsize = 16,
1073 },
1074 },
1075};
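Users never call the assembly helpers directly; they allocate the "rfc4106(gcm(aes))" AEAD registered above and hand it a 20-byte key, i.e. the 16-byte AES-128 key followed by the 4-byte salt that rfc4106_set_key() strips off into ctx->nonce. A hedged setup sketch (error handling trimmed to the essentials, names illustrative):

/* Setup sketch for the AEAD registered above: 20-byte key = AES-128
 * key + 4-byte RFC4106 salt; the 8-byte per-packet IV is supplied
 * later with each aead_request. */
static struct crypto_aead *rfc4106_setup_sketch(const u8 key20[20])
{
	struct crypto_aead *tfm;

	tfm = crypto_alloc_aead("rfc4106(gcm(aes))", 0, 0);
	if (IS_ERR(tfm))
		return tfm;

	if (crypto_aead_setkey(tfm, key20, 20) ||	/* AES key + salt         */
	    crypto_aead_setauthsize(tfm, 16)) {		/* ICV: 16, 12 or 8 bytes */
		crypto_free_aead(tfm);
		return ERR_PTR(-EINVAL);
	}
	return tfm;
}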
1076
1077static int __driver_rfc4106_encrypt(struct aead_request *req)
1078{
1079 u8 one_entry_in_sg = 0;
1080 u8 *src, *dst, *assoc;
1081 __be32 counter = cpu_to_be32(1);
1082 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1083 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1084 void *aes_ctx = &(ctx->aes_key_expanded);
1085 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1086 u8 iv_tab[16+AESNI_ALIGN];
1087 u8* iv = (u8 *) PTR_ALIGN((u8 *)iv_tab, AESNI_ALIGN);
1088 struct scatter_walk src_sg_walk;
1089 struct scatter_walk assoc_sg_walk;
1090 struct scatter_walk dst_sg_walk;
1091 unsigned int i;
1092
1093 /* Assuming we are supporting rfc4106 64-bit extended */
 1094 /* sequence numbers. We need to have the AAD length equal */
1095 /* to 8 or 12 bytes */
1096 if (unlikely(req->assoclen != 8 && req->assoclen != 12))
1097 return -EINVAL;
 1098 /* Build the IV: 4-byte salt, 8-byte ESP IV, then 0x00000001 */
1099 for (i = 0; i < 4; i++)
1100 *(iv+i) = ctx->nonce[i];
1101 for (i = 0; i < 8; i++)
1102 *(iv+4+i) = req->iv[i];
1103 *((__be32 *)(iv+12)) = counter;
1104
1105 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1106 one_entry_in_sg = 1;
1107 scatterwalk_start(&src_sg_walk, req->src);
1108 scatterwalk_start(&assoc_sg_walk, req->assoc);
1109 src = scatterwalk_map(&src_sg_walk, 0);
1110 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1111 dst = src;
1112 if (unlikely(req->src != req->dst)) {
1113 scatterwalk_start(&dst_sg_walk, req->dst);
1114 dst = scatterwalk_map(&dst_sg_walk, 0);
1115 }
1116
1117 } else {
1118 /* Allocate memory for src, dst, assoc */
1119 src = kmalloc(req->cryptlen + auth_tag_len + req->assoclen,
1120 GFP_ATOMIC);
1121 if (unlikely(!src))
1122 return -ENOMEM;
1123 assoc = (src + req->cryptlen + auth_tag_len);
1124 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1125 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1126 req->assoclen, 0);
1127 dst = src;
1128 }
1129
1130 aesni_gcm_enc(aes_ctx, dst, src, (unsigned long)req->cryptlen, iv,
1131 ctx->hash_subkey, assoc, (unsigned long)req->assoclen, dst
1132 + ((unsigned long)req->cryptlen), auth_tag_len);
1133
1134 /* The authTag (aka the Integrity Check Value) needs to be written
1135 * back to the packet. */
1136 if (one_entry_in_sg) {
1137 if (unlikely(req->src != req->dst)) {
1138 scatterwalk_unmap(dst, 0);
1139 scatterwalk_done(&dst_sg_walk, 0, 0);
1140 }
1141 scatterwalk_unmap(src, 0);
1142 scatterwalk_unmap(assoc, 0);
1143 scatterwalk_done(&src_sg_walk, 0, 0);
1144 scatterwalk_done(&assoc_sg_walk, 0, 0);
1145 } else {
1146 scatterwalk_map_and_copy(dst, req->dst, 0,
1147 req->cryptlen + auth_tag_len, 1);
1148 kfree(src);
1149 }
1150 return 0;
1151}
1152
1153static int __driver_rfc4106_decrypt(struct aead_request *req)
1154{
1155 u8 one_entry_in_sg = 0;
1156 u8 *src, *dst, *assoc;
1157 unsigned long tempCipherLen = 0;
1158 __be32 counter = cpu_to_be32(1);
1159 int retval = 0;
1160 struct crypto_aead *tfm = crypto_aead_reqtfm(req);
1161 struct aesni_rfc4106_gcm_ctx *ctx = aesni_rfc4106_gcm_ctx_get(tfm);
1162 void *aes_ctx = &(ctx->aes_key_expanded);
1163 unsigned long auth_tag_len = crypto_aead_authsize(tfm);
1164 u8 iv_and_authTag[32+AESNI_ALIGN];
1165 u8 *iv = (u8 *) PTR_ALIGN((u8 *)iv_and_authTag, AESNI_ALIGN);
1166 u8 *authTag = iv + 16;
1167 struct scatter_walk src_sg_walk;
1168 struct scatter_walk assoc_sg_walk;
1169 struct scatter_walk dst_sg_walk;
1170 unsigned int i;
1171
1172 if (unlikely((req->cryptlen < auth_tag_len) ||
1173 (req->assoclen != 8 && req->assoclen != 12)))
1174 return -EINVAL;
1175 /* Assuming we are supporting rfc4106 64-bit extended */
 1176 /* sequence numbers. We need to have the AAD length */
1177 /* equal to 8 or 12 bytes */
1178
1179 tempCipherLen = (unsigned long)(req->cryptlen - auth_tag_len);
 1180 /* Build the IV: 4-byte salt, 8-byte ESP IV, then 0x00000001 */
1181 for (i = 0; i < 4; i++)
1182 *(iv+i) = ctx->nonce[i];
1183 for (i = 0; i < 8; i++)
1184 *(iv+4+i) = req->iv[i];
1185 *((__be32 *)(iv+12)) = counter;
1186
1187 if ((sg_is_last(req->src)) && (sg_is_last(req->assoc))) {
1188 one_entry_in_sg = 1;
1189 scatterwalk_start(&src_sg_walk, req->src);
1190 scatterwalk_start(&assoc_sg_walk, req->assoc);
1191 src = scatterwalk_map(&src_sg_walk, 0);
1192 assoc = scatterwalk_map(&assoc_sg_walk, 0);
1193 dst = src;
1194 if (unlikely(req->src != req->dst)) {
1195 scatterwalk_start(&dst_sg_walk, req->dst);
1196 dst = scatterwalk_map(&dst_sg_walk, 0);
1197 }
1198
1199 } else {
1200 /* Allocate memory for src, dst, assoc */
1201 src = kmalloc(req->cryptlen + req->assoclen, GFP_ATOMIC);
1202 if (!src)
1203 return -ENOMEM;
1204 assoc = (src + req->cryptlen + auth_tag_len);
1205 scatterwalk_map_and_copy(src, req->src, 0, req->cryptlen, 0);
1206 scatterwalk_map_and_copy(assoc, req->assoc, 0,
1207 req->assoclen, 0);
1208 dst = src;
1209 }
1210
1211 aesni_gcm_dec(aes_ctx, dst, src, tempCipherLen, iv,
1212 ctx->hash_subkey, assoc, (unsigned long)req->assoclen,
1213 authTag, auth_tag_len);
1214
1215 /* Compare generated tag with passed in tag. */
1216 retval = memcmp(src + tempCipherLen, authTag, auth_tag_len) ?
1217 -EBADMSG : 0;
1218
1219 if (one_entry_in_sg) {
1220 if (unlikely(req->src != req->dst)) {
1221 scatterwalk_unmap(dst, 0);
1222 scatterwalk_done(&dst_sg_walk, 0, 0);
1223 }
1224 scatterwalk_unmap(src, 0);
1225 scatterwalk_unmap(assoc, 0);
1226 scatterwalk_done(&src_sg_walk, 0, 0);
1227 scatterwalk_done(&assoc_sg_walk, 0, 0);
1228 } else {
1229 scatterwalk_map_and_copy(dst, req->dst, 0, req->cryptlen, 1);
1230 kfree(src);
1231 }
1232 return retval;
1233}
1234
1235static struct crypto_alg __rfc4106_alg = {
1236 .cra_name = "__gcm-aes-aesni",
1237 .cra_driver_name = "__driver-gcm-aes-aesni",
1238 .cra_priority = 0,
1239 .cra_flags = CRYPTO_ALG_TYPE_AEAD,
1240 .cra_blocksize = 1,
1241 .cra_ctxsize = sizeof(struct aesni_rfc4106_gcm_ctx) + AESNI_ALIGN,
1242 .cra_alignmask = 0,
1243 .cra_type = &crypto_aead_type,
1244 .cra_module = THIS_MODULE,
1245 .cra_list = LIST_HEAD_INIT(__rfc4106_alg.cra_list),
1246 .cra_u = {
1247 .aead = {
1248 .encrypt = __driver_rfc4106_encrypt,
1249 .decrypt = __driver_rfc4106_decrypt,
1250 },
1251 },
1252};
1253#endif
1254
733static int __init aesni_init(void) 1255static int __init aesni_init(void)
734{ 1256{
735 int err; 1257 int err;
@@ -738,6 +1260,9 @@ static int __init aesni_init(void)
738 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n"); 1260 printk(KERN_INFO "Intel AES-NI instructions are not detected.\n");
739 return -ENODEV; 1261 return -ENODEV;
740 } 1262 }
1263
1264 if ((err = crypto_fpu_init()))
1265 goto fpu_err;
741 if ((err = crypto_register_alg(&aesni_alg))) 1266 if ((err = crypto_register_alg(&aesni_alg)))
742 goto aes_err; 1267 goto aes_err;
743 if ((err = crypto_register_alg(&__aesni_alg))) 1268 if ((err = crypto_register_alg(&__aesni_alg)))
@@ -746,18 +1271,24 @@ static int __init aesni_init(void)
746 goto blk_ecb_err; 1271 goto blk_ecb_err;
747 if ((err = crypto_register_alg(&blk_cbc_alg))) 1272 if ((err = crypto_register_alg(&blk_cbc_alg)))
748 goto blk_cbc_err; 1273 goto blk_cbc_err;
749 if ((err = crypto_register_alg(&blk_ctr_alg)))
750 goto blk_ctr_err;
751 if ((err = crypto_register_alg(&ablk_ecb_alg))) 1274 if ((err = crypto_register_alg(&ablk_ecb_alg)))
752 goto ablk_ecb_err; 1275 goto ablk_ecb_err;
753 if ((err = crypto_register_alg(&ablk_cbc_alg))) 1276 if ((err = crypto_register_alg(&ablk_cbc_alg)))
754 goto ablk_cbc_err; 1277 goto ablk_cbc_err;
1278#ifdef CONFIG_X86_64
1279 if ((err = crypto_register_alg(&blk_ctr_alg)))
1280 goto blk_ctr_err;
755 if ((err = crypto_register_alg(&ablk_ctr_alg))) 1281 if ((err = crypto_register_alg(&ablk_ctr_alg)))
756 goto ablk_ctr_err; 1282 goto ablk_ctr_err;
1283 if ((err = crypto_register_alg(&__rfc4106_alg)))
1284 goto __aead_gcm_err;
1285 if ((err = crypto_register_alg(&rfc4106_alg)))
1286 goto aead_gcm_err;
757#ifdef HAS_CTR 1287#ifdef HAS_CTR
758 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg))) 1288 if ((err = crypto_register_alg(&ablk_rfc3686_ctr_alg)))
759 goto ablk_rfc3686_ctr_err; 1289 goto ablk_rfc3686_ctr_err;
760#endif 1290#endif
1291#endif
761#ifdef HAS_LRW 1292#ifdef HAS_LRW
762 if ((err = crypto_register_alg(&ablk_lrw_alg))) 1293 if ((err = crypto_register_alg(&ablk_lrw_alg)))
763 goto ablk_lrw_err; 1294 goto ablk_lrw_err;
@@ -770,7 +1301,6 @@ static int __init aesni_init(void)
770 if ((err = crypto_register_alg(&ablk_xts_alg))) 1301 if ((err = crypto_register_alg(&ablk_xts_alg)))
771 goto ablk_xts_err; 1302 goto ablk_xts_err;
772#endif 1303#endif
773
774 return err; 1304 return err;
775 1305
776#ifdef HAS_XTS 1306#ifdef HAS_XTS
@@ -784,18 +1314,24 @@ ablk_pcbc_err:
784 crypto_unregister_alg(&ablk_lrw_alg); 1314 crypto_unregister_alg(&ablk_lrw_alg);
785ablk_lrw_err: 1315ablk_lrw_err:
786#endif 1316#endif
1317#ifdef CONFIG_X86_64
787#ifdef HAS_CTR 1318#ifdef HAS_CTR
788 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1319 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
789ablk_rfc3686_ctr_err: 1320ablk_rfc3686_ctr_err:
790#endif 1321#endif
1322 crypto_unregister_alg(&rfc4106_alg);
1323aead_gcm_err:
1324 crypto_unregister_alg(&__rfc4106_alg);
1325__aead_gcm_err:
791 crypto_unregister_alg(&ablk_ctr_alg); 1326 crypto_unregister_alg(&ablk_ctr_alg);
792ablk_ctr_err: 1327ablk_ctr_err:
1328 crypto_unregister_alg(&blk_ctr_alg);
1329blk_ctr_err:
1330#endif
793 crypto_unregister_alg(&ablk_cbc_alg); 1331 crypto_unregister_alg(&ablk_cbc_alg);
794ablk_cbc_err: 1332ablk_cbc_err:
795 crypto_unregister_alg(&ablk_ecb_alg); 1333 crypto_unregister_alg(&ablk_ecb_alg);
796ablk_ecb_err: 1334ablk_ecb_err:
797 crypto_unregister_alg(&blk_ctr_alg);
798blk_ctr_err:
799 crypto_unregister_alg(&blk_cbc_alg); 1335 crypto_unregister_alg(&blk_cbc_alg);
800blk_cbc_err: 1336blk_cbc_err:
801 crypto_unregister_alg(&blk_ecb_alg); 1337 crypto_unregister_alg(&blk_ecb_alg);
@@ -804,6 +1340,7 @@ blk_ecb_err:
804__aes_err: 1340__aes_err:
805 crypto_unregister_alg(&aesni_alg); 1341 crypto_unregister_alg(&aesni_alg);
806aes_err: 1342aes_err:
1343fpu_err:
807 return err; 1344 return err;
808} 1345}
809 1346
@@ -818,17 +1355,23 @@ static void __exit aesni_exit(void)
818#ifdef HAS_LRW 1355#ifdef HAS_LRW
819 crypto_unregister_alg(&ablk_lrw_alg); 1356 crypto_unregister_alg(&ablk_lrw_alg);
820#endif 1357#endif
1358#ifdef CONFIG_X86_64
821#ifdef HAS_CTR 1359#ifdef HAS_CTR
822 crypto_unregister_alg(&ablk_rfc3686_ctr_alg); 1360 crypto_unregister_alg(&ablk_rfc3686_ctr_alg);
823#endif 1361#endif
1362 crypto_unregister_alg(&rfc4106_alg);
1363 crypto_unregister_alg(&__rfc4106_alg);
824 crypto_unregister_alg(&ablk_ctr_alg); 1364 crypto_unregister_alg(&ablk_ctr_alg);
1365 crypto_unregister_alg(&blk_ctr_alg);
1366#endif
825 crypto_unregister_alg(&ablk_cbc_alg); 1367 crypto_unregister_alg(&ablk_cbc_alg);
826 crypto_unregister_alg(&ablk_ecb_alg); 1368 crypto_unregister_alg(&ablk_ecb_alg);
827 crypto_unregister_alg(&blk_ctr_alg);
828 crypto_unregister_alg(&blk_cbc_alg); 1369 crypto_unregister_alg(&blk_cbc_alg);
829 crypto_unregister_alg(&blk_ecb_alg); 1370 crypto_unregister_alg(&blk_ecb_alg);
830 crypto_unregister_alg(&__aesni_alg); 1371 crypto_unregister_alg(&__aesni_alg);
831 crypto_unregister_alg(&aesni_alg); 1372 crypto_unregister_alg(&aesni_alg);
1373
1374 crypto_fpu_exit();
832} 1375}
833 1376
834module_init(aesni_init); 1377module_init(aesni_init);
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index 1a8f8649c035..98d7a188f46b 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -150,18 +150,12 @@ static struct crypto_template crypto_fpu_tmpl = {
150 .module = THIS_MODULE, 150 .module = THIS_MODULE,
151}; 151};
152 152
153static int __init crypto_fpu_module_init(void) 153int __init crypto_fpu_init(void)
154{ 154{
155 return crypto_register_template(&crypto_fpu_tmpl); 155 return crypto_register_template(&crypto_fpu_tmpl);
156} 156}
157 157
158static void __exit crypto_fpu_module_exit(void) 158void __exit crypto_fpu_exit(void)
159{ 159{
160 crypto_unregister_template(&crypto_fpu_tmpl); 160 crypto_unregister_template(&crypto_fpu_tmpl);
161} 161}
162
163module_init(crypto_fpu_module_init);
164module_exit(crypto_fpu_module_exit);
165
166MODULE_LICENSE("GPL");
167MODULE_DESCRIPTION("FPU block cipher wrapper");
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
index cbcc8d8ea93a..7a6e68e4f748 100644
--- a/arch/x86/crypto/ghash-clmulni-intel_glue.c
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -10,6 +10,7 @@
10 * by the Free Software Foundation. 10 * by the Free Software Foundation.
11 */ 11 */
12 12
13#include <linux/err.h>
13#include <linux/module.h> 14#include <linux/module.h>
14#include <linux/init.h> 15#include <linux/init.h>
15#include <linux/kernel.h> 16#include <linux/kernel.h>
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2d93bdbc9ac0..fd843877e841 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -298,6 +298,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
298 /* OK, This is the point of no return */ 298 /* OK, This is the point of no return */
299 set_personality(PER_LINUX); 299 set_personality(PER_LINUX);
300 set_thread_flag(TIF_IA32); 300 set_thread_flag(TIF_IA32);
301 current->mm->context.ia32_compat = 1;
301 302
302 setup_new_exec(bprm); 303 setup_new_exec(bprm);
303 304
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 518bb99c3394..c1870dddd322 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -25,6 +25,8 @@
25#define sysretl_audit ia32_ret_from_sys_call 25#define sysretl_audit ia32_ret_from_sys_call
26#endif 26#endif
27 27
28 .section .entry.text, "ax"
29
28#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8) 30#define IA32_NR_syscalls ((ia32_syscall_end - ia32_sys_call_table)/8)
29 31
30 .macro IA32_ARG_FIXUP noebp=0 32 .macro IA32_ARG_FIXUP noebp=0
@@ -126,26 +128,20 @@ ENTRY(ia32_sysenter_target)
126 */ 128 */
127 ENABLE_INTERRUPTS(CLBR_NONE) 129 ENABLE_INTERRUPTS(CLBR_NONE)
128 movl %ebp,%ebp /* zero extension */ 130 movl %ebp,%ebp /* zero extension */
129 pushq $__USER32_DS 131 pushq_cfi $__USER32_DS
130 CFI_ADJUST_CFA_OFFSET 8
131 /*CFI_REL_OFFSET ss,0*/ 132 /*CFI_REL_OFFSET ss,0*/
132 pushq %rbp 133 pushq_cfi %rbp
133 CFI_ADJUST_CFA_OFFSET 8
134 CFI_REL_OFFSET rsp,0 134 CFI_REL_OFFSET rsp,0
135 pushfq 135 pushfq_cfi
136 CFI_ADJUST_CFA_OFFSET 8
137 /*CFI_REL_OFFSET rflags,0*/ 136 /*CFI_REL_OFFSET rflags,0*/
138 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d 137 movl 8*3-THREAD_SIZE+TI_sysenter_return(%rsp), %r10d
139 CFI_REGISTER rip,r10 138 CFI_REGISTER rip,r10
140 pushq $__USER32_CS 139 pushq_cfi $__USER32_CS
141 CFI_ADJUST_CFA_OFFSET 8
142 /*CFI_REL_OFFSET cs,0*/ 140 /*CFI_REL_OFFSET cs,0*/
143 movl %eax, %eax 141 movl %eax, %eax
144 pushq %r10 142 pushq_cfi %r10
145 CFI_ADJUST_CFA_OFFSET 8
146 CFI_REL_OFFSET rip,0 143 CFI_REL_OFFSET rip,0
147 pushq %rax 144 pushq_cfi %rax
148 CFI_ADJUST_CFA_OFFSET 8
149 cld 145 cld
150 SAVE_ARGS 0,0,1 146 SAVE_ARGS 0,0,1
151 /* no need to do an access_ok check here because rbp has been 147 /* no need to do an access_ok check here because rbp has been
@@ -182,11 +178,9 @@ sysexit_from_sys_call:
182 xorq %r9,%r9 178 xorq %r9,%r9
183 xorq %r10,%r10 179 xorq %r10,%r10
184 xorq %r11,%r11 180 xorq %r11,%r11
185 popfq 181 popfq_cfi
186 CFI_ADJUST_CFA_OFFSET -8
187 /*CFI_RESTORE rflags*/ 182 /*CFI_RESTORE rflags*/
188 popq %rcx /* User %esp */ 183 popq_cfi %rcx /* User %esp */
189 CFI_ADJUST_CFA_OFFSET -8
190 CFI_REGISTER rsp,rcx 184 CFI_REGISTER rsp,rcx
191 TRACE_IRQS_ON 185 TRACE_IRQS_ON
192 ENABLE_INTERRUPTS_SYSEXIT32 186 ENABLE_INTERRUPTS_SYSEXIT32
@@ -421,8 +415,7 @@ ENTRY(ia32_syscall)
421 */ 415 */
422 ENABLE_INTERRUPTS(CLBR_NONE) 416 ENABLE_INTERRUPTS(CLBR_NONE)
423 movl %eax,%eax 417 movl %eax,%eax
424 pushq %rax 418 pushq_cfi %rax
425 CFI_ADJUST_CFA_OFFSET 8
426 cld 419 cld
427 /* note the registers are not zero extended to the sf. 420 /* note the registers are not zero extended to the sf.
428 this could be a problem. */ 421 this could be a problem. */
@@ -851,4 +844,10 @@ ia32_sys_call_table:
851 .quad sys_fanotify_init 844 .quad sys_fanotify_init
852 .quad sys32_fanotify_mark 845 .quad sys32_fanotify_mark
853 .quad sys_prlimit64 /* 340 */ 846 .quad sys_prlimit64 /* 340 */
847 .quad sys_name_to_handle_at
848 .quad compat_sys_open_by_handle_at
849 .quad compat_sys_clock_adjtime
850 .quad sys_syncfs
851 .quad compat_sys_sendmmsg /* 345 */
852 .quad sys_setns
854ia32_syscall_end: 853ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 849813f398e7..5852519b2d0f 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -28,7 +28,6 @@
28#include <linux/syscalls.h> 28#include <linux/syscalls.h>
29#include <linux/times.h> 29#include <linux/times.h>
30#include <linux/utsname.h> 30#include <linux/utsname.h>
31#include <linux/smp_lock.h>
32#include <linux/mm.h> 31#include <linux/mm.h>
33#include <linux/uio.h> 32#include <linux/uio.h>
34#include <linux/poll.h> 33#include <linux/poll.h>
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 92091de11113..610001d385dd 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -29,6 +29,7 @@
29#include <asm/processor.h> 29#include <asm/processor.h>
30#include <asm/mmu.h> 30#include <asm/mmu.h>
31#include <asm/mpspec.h> 31#include <asm/mpspec.h>
32#include <asm/trampoline.h>
32 33
33#define COMPILER_DEPENDENT_INT64 long long 34#define COMPILER_DEPENDENT_INT64 long long
34#define COMPILER_DEPENDENT_UINT64 unsigned long long 35#define COMPILER_DEPENDENT_UINT64 unsigned long long
@@ -88,11 +89,15 @@ extern int acpi_disabled;
88extern int acpi_pci_disabled; 89extern int acpi_pci_disabled;
89extern int acpi_skip_timer_override; 90extern int acpi_skip_timer_override;
90extern int acpi_use_timer_override; 91extern int acpi_use_timer_override;
92extern int acpi_fix_pin2_polarity;
91 93
92extern u8 acpi_sci_flags; 94extern u8 acpi_sci_flags;
93extern int acpi_sci_override_gsi; 95extern int acpi_sci_override_gsi;
94void acpi_pic_sci_set_trigger(unsigned int, u16); 96void acpi_pic_sci_set_trigger(unsigned int, u16);
95 97
98extern int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
99 int trigger, int polarity);
100
96static inline void disable_acpi(void) 101static inline void disable_acpi(void)
97{ 102{
98 acpi_disabled = 1; 103 acpi_disabled = 1;
@@ -109,11 +114,11 @@ static inline void acpi_disable_pci(void)
109 acpi_noirq_set(); 114 acpi_noirq_set();
110} 115}
111 116
112/* routines for saving/restoring kernel state */ 117/* Low-level suspend routine. */
113extern int acpi_save_state_mem(void); 118extern int acpi_suspend_lowlevel(void);
114extern void acpi_restore_state_mem(void);
115 119
116extern unsigned long acpi_wakeup_address; 120extern const unsigned char acpi_wakeup_code[];
121#define acpi_wakeup_address (__pa(TRAMPOLINE_SYM(acpi_wakeup_code)))
117 122
118/* early initialization routine */ 123/* early initialization routine */
119extern void acpi_reserve_wakeup_memory(void); 124extern void acpi_reserve_wakeup_memory(void);
@@ -134,7 +139,7 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
134 boot_cpu_data.x86_model <= 0x05 && 139 boot_cpu_data.x86_model <= 0x05 &&
135 boot_cpu_data.x86_mask < 0x0A) 140 boot_cpu_data.x86_mask < 0x0A)
136 return 1; 141 return 1;
137 else if (c1e_detected) 142 else if (amd_e400_c1e_detected)
138 return 1; 143 return 1;
139 else 144 else
140 return max_cstate; 145 return max_cstate;
@@ -178,21 +183,10 @@ static inline void disable_acpi(void) { }
178 183
179#define ARCH_HAS_POWER_INIT 1 184#define ARCH_HAS_POWER_INIT 1
180 185
181struct bootnode;
182
183#ifdef CONFIG_ACPI_NUMA 186#ifdef CONFIG_ACPI_NUMA
184extern int acpi_numa; 187extern int acpi_numa;
185extern int acpi_get_nodes(struct bootnode *physnodes); 188extern int x86_acpi_numa_init(void);
186extern int acpi_scan_nodes(unsigned long start, unsigned long end); 189#endif /* CONFIG_ACPI_NUMA */
187#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
188extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
189 int num_nodes);
190#else
191static inline void acpi_fake_nodes(const struct bootnode *fake_nodes,
192 int num_nodes)
193{
194}
195#endif
196 190
197#define acpi_unlazy_tlb(x) leave_mm(x) 191#define acpi_unlazy_tlb(x) leave_mm(x)
198 192
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index a63a68be1cce..94d420b360d1 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -15,4 +15,13 @@
15 .endm 15 .endm
16#endif 16#endif
17 17
18.macro altinstruction_entry orig alt feature orig_len alt_len
19 .align 8
20 .quad \orig
21 .quad \alt
22 .word \feature
23 .byte \orig_len
24 .byte \alt_len
25.endm
26
18#endif /* __ASSEMBLY__ */ 27#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index bc6abb7bc7ee..bf535f947e8c 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,6 +65,7 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
65extern void alternatives_smp_module_del(struct module *mod); 65extern void alternatives_smp_module_del(struct module *mod);
66extern void alternatives_smp_switch(int smp); 66extern void alternatives_smp_switch(int smp);
67extern int alternatives_text_reserved(void *start, void *end); 67extern int alternatives_text_reserved(void *start, void *end);
68extern bool skip_smp_alternatives;
68#else 69#else
69static inline void alternatives_smp_module_add(struct module *mod, char *name, 70static inline void alternatives_smp_module_add(struct module *mod, char *name,
70 void *locks, void *locks_end, 71 void *locks, void *locks_end,
@@ -160,6 +161,8 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
160#define __parainstructions_end NULL 161#define __parainstructions_end NULL
161#endif 162#endif
162 163
164extern void *text_poke_early(void *addr, const void *opcode, size_t len);
165
163/* 166/*
164 * Clear and restore the kernel write-protection flag on the local CPU. 167 * Clear and restore the kernel write-protection flag on the local CPU.
165 * Allows the kernel to edit read-only pages. 168 * Allows the kernel to edit read-only pages.
@@ -177,7 +180,14 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
177 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 180 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
178 * inconsistent instruction while you patch. 181 * inconsistent instruction while you patch.
179 */ 182 */
183struct text_poke_param {
184 void *addr;
185 const void *opcode;
186 size_t len;
187};
188
180extern void *text_poke(void *addr, const void *opcode, size_t len); 189extern void *text_poke(void *addr, const void *opcode, size_t len);
181extern void *text_poke_smp(void *addr, const void *opcode, size_t len); 190extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
191extern void text_poke_smp_batch(struct text_poke_param *params, int n);
182 192
183#endif /* _ASM_X86_ALTERNATIVE_H */ 193#endif /* _ASM_X86_ALTERNATIVE_H */
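Along with struct text_poke_param, text_poke_smp_batch() lets a caller patch several instruction sites in one synchronized pass instead of paying the text_poke_smp() cost once per site. A minimal sketch, where the example_* sites and the 5-byte replacement buffers are hypothetical and assumed to be defined elsewhere:

#include <linux/kernel.h>
#include <asm/alternative.h>

/* Hypothetical patch sites and replacement bytes, defined elsewhere. */
extern void *example_site1, *example_site2;
extern const unsigned char example_jmp5[5], example_nop5[5];

static void example_patch_both(void)
{
        struct text_poke_param params[2] = {
                { .addr = example_site1, .opcode = example_jmp5, .len = 5 },
                { .addr = example_site2, .opcode = example_nop5, .len = 5 },
        };

        /* One batched call instead of two separate text_poke_smp() rounds. */
        text_poke_smp_batch(params, ARRAY_SIZE(params));
}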
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 5af2982133b5..a6863a2dec1f 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -24,11 +24,11 @@
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26 26
27extern void amd_iommu_detect(void); 27extern int amd_iommu_detect(void);
28 28
29#else 29#else
30 30
31static inline void amd_iommu_detect(void) { } 31static inline int amd_iommu_detect(void) { return -ENODEV; }
32 32
33#endif 33#endif
34 34
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
index cb030374b90a..55d95eb789b3 100644
--- a/arch/x86/include/asm/amd_iommu_proto.h
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2009-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * 4 *
5 * This program is free software; you can redistribute it and/or modify it 5 * This program is free software; you can redistribute it and/or modify it
@@ -19,13 +19,12 @@
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H 19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H 20#define _ASM_X86_AMD_IOMMU_PROTO_H
21 21
22struct amd_iommu; 22#include <asm/amd_iommu_types.h>
23 23
24extern int amd_iommu_init_dma_ops(void); 24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void); 25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_thread(int irq, void *data);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 27extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid); 28extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu); 29extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void); 30extern int amd_iommu_init_devices(void);
@@ -44,4 +43,12 @@ static inline bool is_rd890_iommu(struct pci_dev *pdev)
44 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU); 43 (pdev->device == PCI_DEVICE_ID_RD890_IOMMU);
45} 44}
46 45
46static inline bool iommu_feature(struct amd_iommu *iommu, u64 f)
47{
48 if (!(iommu->cap & (1 << IOMMU_CAP_EFR)))
49 return false;
50
51 return !!(iommu->features & f);
52}
53
47#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */ 54#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
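The new iommu_feature() helper only looks at iommu->features when the IOMMU_CAP_EFR capability bit says the extended feature register exists, so callers can test a FEATURE_* flag without doing that check themselves. A small sketch, assuming the struct amd_iommu pointer comes from the driver's list of initialized IOMMUs:

#include <linux/types.h>
#include <asm/amd_iommu_proto.h>

/* Sketch: does this IOMMU advertise the IA (invalidate-all) feature? */
static bool example_supports_invalidate_all(struct amd_iommu *iommu)
{
        return iommu_feature(iommu, FEATURE_IA);
}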
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 08616180deaf..4c9982995414 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -68,12 +68,25 @@
68#define MMIO_CONTROL_OFFSET 0x0018 68#define MMIO_CONTROL_OFFSET 0x0018
69#define MMIO_EXCL_BASE_OFFSET 0x0020 69#define MMIO_EXCL_BASE_OFFSET 0x0020
70#define MMIO_EXCL_LIMIT_OFFSET 0x0028 70#define MMIO_EXCL_LIMIT_OFFSET 0x0028
71#define MMIO_EXT_FEATURES 0x0030
71#define MMIO_CMD_HEAD_OFFSET 0x2000 72#define MMIO_CMD_HEAD_OFFSET 0x2000
72#define MMIO_CMD_TAIL_OFFSET 0x2008 73#define MMIO_CMD_TAIL_OFFSET 0x2008
73#define MMIO_EVT_HEAD_OFFSET 0x2010 74#define MMIO_EVT_HEAD_OFFSET 0x2010
74#define MMIO_EVT_TAIL_OFFSET 0x2018 75#define MMIO_EVT_TAIL_OFFSET 0x2018
75#define MMIO_STATUS_OFFSET 0x2020 76#define MMIO_STATUS_OFFSET 0x2020
76 77
78
79/* Extended Feature Bits */
80#define FEATURE_PREFETCH (1ULL<<0)
81#define FEATURE_PPR (1ULL<<1)
82#define FEATURE_X2APIC (1ULL<<2)
83#define FEATURE_NX (1ULL<<3)
84#define FEATURE_GT (1ULL<<4)
85#define FEATURE_IA (1ULL<<6)
86#define FEATURE_GA (1ULL<<7)
87#define FEATURE_HE (1ULL<<8)
88#define FEATURE_PC (1ULL<<9)
89
77/* MMIO status bits */ 90/* MMIO status bits */
78#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04 91#define MMIO_STATUS_COM_WAIT_INT_MASK 0x04
79 92
@@ -113,7 +126,9 @@
113/* command specific defines */ 126/* command specific defines */
114#define CMD_COMPL_WAIT 0x01 127#define CMD_COMPL_WAIT 0x01
115#define CMD_INV_DEV_ENTRY 0x02 128#define CMD_INV_DEV_ENTRY 0x02
116#define CMD_INV_IOMMU_PAGES 0x03 129#define CMD_INV_IOMMU_PAGES 0x03
130#define CMD_INV_IOTLB_PAGES 0x04
131#define CMD_INV_ALL 0x08
117 132
118#define CMD_COMPL_WAIT_STORE_MASK 0x01 133#define CMD_COMPL_WAIT_STORE_MASK 0x01
119#define CMD_COMPL_WAIT_INT_MASK 0x02 134#define CMD_COMPL_WAIT_INT_MASK 0x02
@@ -215,6 +230,8 @@
215#define IOMMU_PTE_IR (1ULL << 61) 230#define IOMMU_PTE_IR (1ULL << 61)
216#define IOMMU_PTE_IW (1ULL << 62) 231#define IOMMU_PTE_IW (1ULL << 62)
217 232
233#define DTE_FLAG_IOTLB 0x01
234
218#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL) 235#define IOMMU_PAGE_MASK (((1ULL << 52) - 1) & ~0xfffULL)
219#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P) 236#define IOMMU_PTE_PRESENT(pte) ((pte) & IOMMU_PTE_P)
220#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK)) 237#define IOMMU_PTE_PAGE(pte) (phys_to_virt((pte) & IOMMU_PAGE_MASK))
@@ -227,6 +244,7 @@
227/* IOMMU capabilities */ 244/* IOMMU capabilities */
228#define IOMMU_CAP_IOTLB 24 245#define IOMMU_CAP_IOTLB 24
229#define IOMMU_CAP_NPCACHE 26 246#define IOMMU_CAP_NPCACHE 26
247#define IOMMU_CAP_EFR 27
230 248
231#define MAX_DOMAIN_ID 65536 249#define MAX_DOMAIN_ID 65536
232 250
@@ -249,6 +267,8 @@ extern bool amd_iommu_dump;
249 267
250/* global flag if IOMMUs cache non-present entries */ 268/* global flag if IOMMUs cache non-present entries */
251extern bool amd_iommu_np_cache; 269extern bool amd_iommu_np_cache;
270/* Only true if all IOMMUs support device IOTLBs */
271extern bool amd_iommu_iotlb_sup;
252 272
253/* 273/*
254 * Make iterating over all IOMMUs easier 274 * Make iterating over all IOMMUs easier
@@ -371,6 +391,9 @@ struct amd_iommu {
371 /* flags read from acpi table */ 391 /* flags read from acpi table */
372 u8 acpi_flags; 392 u8 acpi_flags;
373 393
394 /* Extended features */
395 u64 features;
396
374 /* 397 /*
375 * Capability pointer. There could be more than one IOMMU per PCI 398 * Capability pointer. There could be more than one IOMMU per PCI
376 * device function if there are more than one AMD IOMMU capability 399 * device function if there are more than one AMD IOMMU capability
@@ -409,20 +432,26 @@ struct amd_iommu {
409 /* if one, we need to send a completion wait command */ 432 /* if one, we need to send a completion wait command */
410 bool need_sync; 433 bool need_sync;
411 434
412 /* becomes true if a command buffer reset is running */
413 bool reset_in_progress;
414
415 /* default dma_ops domain for that IOMMU */ 435 /* default dma_ops domain for that IOMMU */
416 struct dma_ops_domain *default_dom; 436 struct dma_ops_domain *default_dom;
417 437
418 /* 438 /*
419 * This array is required to work around a potential BIOS bug. 439 * We can't rely on the BIOS to restore all values on reinit, so we
420 * The BIOS may miss to restore parts of the PCI configuration 440 * need to stash them
421 * space when the system resumes from S3. The result is that the
422 * IOMMU does not execute commands anymore which leads to system
423 * failure.
424 */ 441 */
425 u32 cache_cfg[4]; 442
443 /* The iommu BAR */
444 u32 stored_addr_lo;
445 u32 stored_addr_hi;
446
447 /*
448 * Each iommu has 6 l1s, each of which is documented as having 0x12
449 * registers
450 */
451 u32 stored_l1[6][0x12];
452
453 /* The l2 indirect registers */
454 u32 stored_l2[0x83];
426}; 455};
427 456
428/* 457/*
diff --git a/arch/x86/include/asm/amd_nb.h b/arch/x86/include/asm/amd_nb.h
new file mode 100644
index 000000000000..67f87f257611
--- /dev/null
+++ b/arch/x86/include/asm/amd_nb.h
@@ -0,0 +1,64 @@
1#ifndef _ASM_X86_AMD_NB_H
2#define _ASM_X86_AMD_NB_H
3
4#include <linux/pci.h>
5
6struct amd_nb_bus_dev_range {
7 u8 bus;
8 u8 dev_base;
9 u8 dev_limit;
10};
11
12extern const struct pci_device_id amd_nb_misc_ids[];
13extern const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[];
14
15extern bool early_is_amd_nb(u32 value);
16extern int amd_cache_northbridges(void);
17extern void amd_flush_garts(void);
18extern int amd_numa_init(void);
19extern int amd_get_subcaches(int);
20extern int amd_set_subcaches(int, int);
21
22struct amd_northbridge {
23 struct pci_dev *misc;
24 struct pci_dev *link;
25};
26
27struct amd_northbridge_info {
28 u16 num;
29 u64 flags;
30 struct amd_northbridge *nb;
31};
32extern struct amd_northbridge_info amd_northbridges;
33
34#define AMD_NB_GART BIT(0)
35#define AMD_NB_L3_INDEX_DISABLE BIT(1)
36#define AMD_NB_L3_PARTITIONING BIT(2)
37
38#ifdef CONFIG_AMD_NB
39
40static inline u16 amd_nb_num(void)
41{
42 return amd_northbridges.num;
43}
44
45static inline bool amd_nb_has_feature(unsigned feature)
46{
47 return ((amd_northbridges.flags & feature) == feature);
48}
49
50static inline struct amd_northbridge *node_to_amd_nb(int node)
51{
52 return (node < amd_northbridges.num) ? &amd_northbridges.nb[node] : NULL;
53}
54
55#else
56
57#define amd_nb_num(x) 0
58#define amd_nb_has_feature(x) false
59#define node_to_amd_nb(x) NULL
60
61#endif
62
63
64#endif /* _ASM_X86_AMD_NB_H */
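The header gathers the AMD northbridge bookkeeping behind a small API: amd_cache_northbridges() fills in amd_northbridges, and the inline accessors answer the common questions afterwards. A small sketch of typical use; the example_ function and the printed messages are illustrative only:

#include <linux/kernel.h>
#include <linux/pci.h>
#include <asm/amd_nb.h>

static void example_dump_northbridges(void)
{
        int i;

        if (amd_cache_northbridges() < 0)
                return;                 /* error or no northbridges */

        for (i = 0; i < amd_nb_num(); i++) {
                struct amd_northbridge *nb = node_to_amd_nb(i);

                if (nb && nb->misc)
                        pr_info("node %d: NB misc function %04x:%04x\n",
                                i, nb->misc->vendor, nb->misc->device);
        }

        if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
                pr_info("L3 cache partitioning supported\n");
}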
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
index a69b1ac9eaf8..af60d8a2e288 100644
--- a/arch/x86/include/asm/apb_timer.h
+++ b/arch/x86/include/asm/apb_timer.h
@@ -54,7 +54,6 @@ extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void); 54extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu); 55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void); 56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id;
58 57
59extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint); 58extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
60extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr); 59extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
@@ -63,7 +62,7 @@ extern int sfi_mtimer_num;
63#else /* CONFIG_APB_TIMER */ 62#else /* CONFIG_APB_TIMER */
64 63
65static inline unsigned long apbt_quick_calibrate(void) {return 0; } 64static inline unsigned long apbt_quick_calibrate(void) {return 0; }
66static inline void apbt_time_init(void) {return 0; } 65static inline void apbt_time_init(void) { }
67 66
68#endif 67#endif
69#endif /* ASM_X86_APBT_H */ 68#endif /* ASM_X86_APBT_H */
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 1fa03e04ae44..4a0b7c7e2cce 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -2,7 +2,6 @@
2#define _ASM_X86_APIC_H 2#define _ASM_X86_APIC_H
3 3
4#include <linux/cpumask.h> 4#include <linux/cpumask.h>
5#include <linux/delay.h>
6#include <linux/pm.h> 5#include <linux/pm.h>
7 6
8#include <asm/alternative.h> 7#include <asm/alternative.h>
@@ -141,13 +140,13 @@ static inline void native_apic_msr_write(u32 reg, u32 v)
141 140
142static inline u32 native_apic_msr_read(u32 reg) 141static inline u32 native_apic_msr_read(u32 reg)
143{ 142{
144 u32 low, high; 143 u64 msr;
145 144
146 if (reg == APIC_DFR) 145 if (reg == APIC_DFR)
147 return -1; 146 return -1;
148 147
149 rdmsr(APIC_BASE_MSR + (reg >> 4), low, high); 148 rdmsrl(APIC_BASE_MSR + (reg >> 4), msr);
150 return low; 149 return (u32)msr;
151} 150}
152 151
153static inline void native_x2apic_wait_icr_idle(void) 152static inline void native_x2apic_wait_icr_idle(void)
@@ -181,12 +180,12 @@ extern void enable_x2apic(void);
181extern void x2apic_icr_write(u32 low, u32 id); 180extern void x2apic_icr_write(u32 low, u32 id);
182static inline int x2apic_enabled(void) 181static inline int x2apic_enabled(void)
183{ 182{
184 int msr, msr2; 183 u64 msr;
185 184
186 if (!cpu_has_x2apic) 185 if (!cpu_has_x2apic)
187 return 0; 186 return 0;
188 187
189 rdmsr(MSR_IA32_APICBASE, msr, msr2); 188 rdmsrl(MSR_IA32_APICBASE, msr);
190 if (msr & X2APIC_ENABLE) 189 if (msr & X2APIC_ENABLE)
191 return 1; 190 return 1;
192 return 0; 191 return 0;
@@ -220,7 +219,6 @@ extern void enable_IR_x2apic(void);
220 219
221extern int get_physical_broadcast(void); 220extern int get_physical_broadcast(void);
222 221
223extern void apic_disable(void);
224extern int lapic_get_maxlvt(void); 222extern int lapic_get_maxlvt(void);
225extern void clear_local_APIC(void); 223extern void clear_local_APIC(void);
226extern void connect_bsp_APIC(void); 224extern void connect_bsp_APIC(void);
@@ -228,22 +226,22 @@ extern void disconnect_bsp_APIC(int virt_wire_setup);
228extern void disable_local_APIC(void); 226extern void disable_local_APIC(void);
229extern void lapic_shutdown(void); 227extern void lapic_shutdown(void);
230extern int verify_local_APIC(void); 228extern int verify_local_APIC(void);
231extern void cache_APIC_registers(void);
232extern void sync_Arb_IDs(void); 229extern void sync_Arb_IDs(void);
233extern void init_bsp_APIC(void); 230extern void init_bsp_APIC(void);
234extern void setup_local_APIC(void); 231extern void setup_local_APIC(void);
235extern void end_local_APIC_setup(void); 232extern void end_local_APIC_setup(void);
233extern void bsp_end_local_APIC_setup(void);
236extern void init_apic_mappings(void); 234extern void init_apic_mappings(void);
235void register_lapic_address(unsigned long address);
237extern void setup_boot_APIC_clock(void); 236extern void setup_boot_APIC_clock(void);
238extern void setup_secondary_APIC_clock(void); 237extern void setup_secondary_APIC_clock(void);
239extern int APIC_init_uniprocessor(void); 238extern int APIC_init_uniprocessor(void);
240extern void enable_NMI_through_LVT0(void); 239extern int apic_force_enable(unsigned long addr);
241 240
242/* 241/*
243 * On 32bit this is mach-xxx local 242 * On 32bit this is mach-xxx local
244 */ 243 */
245#ifdef CONFIG_X86_64 244#ifdef CONFIG_X86_64
246extern void early_init_lapic_mapping(void);
247extern int apic_is_clustered_box(void); 245extern int apic_is_clustered_box(void);
248#else 246#else
249static inline int apic_is_clustered_box(void) 247static inline int apic_is_clustered_box(void)
@@ -252,16 +250,13 @@ static inline int apic_is_clustered_box(void)
252} 250}
253#endif 251#endif
254 252
255extern u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask); 253extern int setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask);
256extern u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask);
257
258 254
259#else /* !CONFIG_X86_LOCAL_APIC */ 255#else /* !CONFIG_X86_LOCAL_APIC */
260static inline void lapic_shutdown(void) { } 256static inline void lapic_shutdown(void) { }
261#define local_apic_timer_c2_ok 1 257#define local_apic_timer_c2_ok 1
262static inline void init_apic_mappings(void) { } 258static inline void init_apic_mappings(void) { }
263static inline void disable_local_APIC(void) { } 259static inline void disable_local_APIC(void) { }
264static inline void apic_disable(void) { }
265# define setup_boot_APIC_clock x86_init_noop 260# define setup_boot_APIC_clock x86_init_noop
266# define setup_secondary_APIC_clock x86_init_noop 261# define setup_secondary_APIC_clock x86_init_noop
267#endif /* !CONFIG_X86_LOCAL_APIC */ 262#endif /* !CONFIG_X86_LOCAL_APIC */
@@ -307,8 +302,6 @@ struct apic {
307 302
308 void (*setup_apic_routing)(void); 303 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 304 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 305 int (*cpu_present_to_apicid)(int mps_cpu);
313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap); 306 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 307 void (*setup_portio_remap)(void);
@@ -356,6 +349,28 @@ struct apic {
356 void (*icr_write)(u32 low, u32 high); 349 void (*icr_write)(u32 low, u32 high);
357 void (*wait_icr_idle)(void); 350 void (*wait_icr_idle)(void);
358 u32 (*safe_wait_icr_idle)(void); 351 u32 (*safe_wait_icr_idle)(void);
352
353#ifdef CONFIG_X86_32
354 /*
355 * Called very early during boot from get_smp_config(). It should
356 * return the logical apicid. x86_[bios]_cpu_to_apicid is
357 * initialized before this function is called.
358 *
359 * If logical apicid can't be determined that early, the function
360 * may return BAD_APICID. Logical apicid will be configured after
361 * init_apic_ldr() while bringing up CPUs. Note that NUMA affinity
362 * won't be applied properly during early boot in this case.
363 */
364 int (*x86_32_early_logical_apicid)(int cpu);
365
366 /*
367 * Optional method called from setup_local_APIC() after logical
368 * apicid is guaranteed to be known to initialize apicid -> node
369 * mapping if NUMA initialization hasn't done so already. Don't
370 * add new users.
371 */
372 int (*x86_32_numa_cpu_node)(int cpu);
373#endif
359}; 374};
360 375
361/* 376/*
@@ -366,6 +381,26 @@ struct apic {
366extern struct apic *apic; 381extern struct apic *apic;
367 382
368/* 383/*
384 * APIC drivers are probed based on how they are listed in the .apicdrivers
385 * section. So the order is important and enforced by the ordering
386 * of different apic driver files in the Makefile.
387 *
388 * For the files having two apic drivers, we use apic_drivers()
389 * to enforce the order with in them.
390 */
391#define apic_driver(sym) \
392 static struct apic *__apicdrivers_##sym __used \
393 __aligned(sizeof(struct apic *)) \
394 __section(.apicdrivers) = { &sym }
395
396#define apic_drivers(sym1, sym2) \
397 static struct apic *__apicdrivers_##sym1##sym2[2] __used \
398 __aligned(sizeof(struct apic *)) \
399 __section(.apicdrivers) = { &sym1, &sym2 }
400
401extern struct apic *__apicdrivers[], *__apicdrivers_end[];
402
403/*
369 * APIC functionality to boot other CPUs - only used on SMP: 404 * APIC functionality to boot other CPUs - only used on SMP:
370 */ 405 */
371#ifdef CONFIG_SMP 406#ifdef CONFIG_SMP
@@ -443,15 +478,10 @@ static inline unsigned default_get_apic_id(unsigned long x)
443#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469 478#define DEFAULT_TRAMPOLINE_PHYS_HIGH 0x469
444 479
445#ifdef CONFIG_X86_64 480#ifdef CONFIG_X86_64
446extern struct apic apic_flat;
447extern struct apic apic_physflat;
448extern struct apic apic_x2apic_cluster;
449extern struct apic apic_x2apic_phys;
450extern int default_acpi_madt_oem_check(char *, char *); 481extern int default_acpi_madt_oem_check(char *, char *);
451 482
452extern void apic_send_IPI_self(int vector); 483extern void apic_send_IPI_self(int vector);
453 484
454extern struct apic apic_x2apic_uv_x;
455DECLARE_PER_CPU(int, x2apic_extra_bits); 485DECLARE_PER_CPU(int, x2apic_extra_bits);
456 486
457extern int default_cpu_present_to_apicid(int mps_cpu); 487extern int default_cpu_present_to_apicid(int mps_cpu);
@@ -465,7 +495,7 @@ static inline void default_wait_for_init_deassert(atomic_t *deassert)
465 return; 495 return;
466} 496}
467 497
468extern void generic_bigsmp_probe(void); 498extern struct apic *generic_bigsmp_probe(void);
469 499
470 500
471#ifdef CONFIG_X86_LOCAL_APIC 501#ifdef CONFIG_X86_LOCAL_APIC
@@ -501,7 +531,10 @@ extern struct apic apic_noop;
501 531
502#ifdef CONFIG_X86_32 532#ifdef CONFIG_X86_32
503 533
504extern struct apic apic_default; 534static inline int noop_x86_32_early_logical_apicid(int cpu)
535{
536 return BAD_APICID;
537}
505 538
506/* 539/*
507 * Set up the logical destination ID. 540 * Set up the logical destination ID.
@@ -522,8 +555,6 @@ static inline int default_phys_pkg_id(int cpuid_apic, int index_msb)
522 return cpuid_apic >> index_msb; 555 return cpuid_apic >> index_msb;
523} 556}
524 557
525extern int default_apicid_to_node(int logical_apicid);
526
527#endif 558#endif
528 559
529static inline unsigned int 560static inline unsigned int
@@ -558,12 +589,6 @@ static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_ma
558 *retmap = *phys_map; 589 *retmap = *phys_map;
559} 590}
560 591
561/* Mapping from cpu number to logical apicid */
562static inline int default_cpu_to_logical_apicid(int cpu)
563{
564 return 1 << cpu;
565}
566
567static inline int __default_cpu_present_to_apicid(int mps_cpu) 592static inline int __default_cpu_present_to_apicid(int mps_cpu)
568{ 593{
569 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu)) 594 if (mps_cpu < nr_cpu_ids && cpu_present(mps_cpu))
@@ -596,8 +621,4 @@ extern int default_check_phys_apicid_present(int phys_apicid);
596 621
597#endif /* CONFIG_X86_LOCAL_APIC */ 622#endif /* CONFIG_X86_LOCAL_APIC */
598 623
599#ifdef CONFIG_X86_32
600extern u8 cpu_2_logical_apicid[NR_CPUS];
601#endif
602
603#endif /* _ASM_X86_APIC_H */ 624#endif /* _ASM_X86_APIC_H */
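With the .apicdrivers section, registering an APIC driver is reduced to dropping a pointer into that section; probing walks the section in link order, which is why the comment above stresses Makefile ordering. A minimal sketch, where apic_foo stands in for a hypothetical, fully initialized struct apic defined elsewhere:

#include <asm/apic.h>

/* Hypothetical driver instance, defined and filled in elsewhere. */
extern struct apic apic_foo;

/* Emits &apic_foo into the .apicdrivers section; probe order follows the
 * order of the driver objects in the Makefile. */
apic_driver(apic_foo);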
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 7fe3b3060f08..34595d5e1038 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -78,6 +78,7 @@
78#define APIC_DEST_LOGICAL 0x00800 78#define APIC_DEST_LOGICAL 0x00800
79#define APIC_DEST_PHYSICAL 0x00000 79#define APIC_DEST_PHYSICAL 0x00000
80#define APIC_DM_FIXED 0x00000 80#define APIC_DM_FIXED 0x00000
81#define APIC_DM_FIXED_MASK 0x00700
81#define APIC_DM_LOWEST 0x00100 82#define APIC_DM_LOWEST 0x00100
82#define APIC_DM_SMI 0x00200 83#define APIC_DM_SMI 0x00200
83#define APIC_DM_REMRD 0x00300 84#define APIC_DM_REMRD 0x00300
@@ -131,6 +132,7 @@
131#define APIC_EILVTn(n) (0x500 + 0x10 * n) 132#define APIC_EILVTn(n) (0x500 + 0x10 * n)
132#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */ 133#define APIC_EILVT_NR_AMD_K8 1 /* # of extended interrupts */
133#define APIC_EILVT_NR_AMD_10H 4 134#define APIC_EILVT_NR_AMD_10H 4
135#define APIC_EILVT_NR_MAX APIC_EILVT_NR_AMD_10H
134#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF) 136#define APIC_EILVT_LVTOFF(x) (((x) >> 4) & 0xF)
135#define APIC_EILVT_MSG_FIX 0x0 137#define APIC_EILVT_MSG_FIX 0x0
136#define APIC_EILVT_MSG_SMI 0x2 138#define APIC_EILVT_MSG_SMI 0x2
@@ -144,6 +146,7 @@
144 146
145#ifdef CONFIG_X86_32 147#ifdef CONFIG_X86_32
146# define MAX_IO_APICS 64 148# define MAX_IO_APICS 64
149# define MAX_LOCAL_APIC 256
147#else 150#else
148# define MAX_IO_APICS 128 151# define MAX_IO_APICS 128
149# define MAX_LOCAL_APIC 32768 152# define MAX_LOCAL_APIC 32768
@@ -424,4 +427,16 @@ struct local_apic {
424#else 427#else
425 #define BAD_APICID 0xFFFFu 428 #define BAD_APICID 0xFFFFu
426#endif 429#endif
430
431enum ioapic_irq_destination_types {
432 dest_Fixed = 0,
433 dest_LowestPrio = 1,
434 dest_SMI = 2,
435 dest__reserved_1 = 3,
436 dest_NMI = 4,
437 dest_INIT = 5,
438 dest__reserved_2 = 6,
439 dest_ExtINT = 7
440};
441
427#endif /* _ASM_X86_APICDEF_H */ 442#endif /* _ASM_X86_APICDEF_H */
diff --git a/arch/x86/include/asm/bios_ebda.h b/arch/x86/include/asm/bios_ebda.h
index 3c7521063d3f..aa6a3170ab5a 100644
--- a/arch/x86/include/asm/bios_ebda.h
+++ b/arch/x86/include/asm/bios_ebda.h
@@ -4,16 +4,40 @@
4#include <asm/io.h> 4#include <asm/io.h>
5 5
6/* 6/*
7 * there is a real-mode segmented pointer pointing to the 7 * Returns physical address of EBDA. Returns 0 if there is no EBDA.
8 * 4K EBDA area at 0x40E.
9 */ 8 */
10static inline unsigned int get_bios_ebda(void) 9static inline unsigned int get_bios_ebda(void)
11{ 10{
11 /*
12 * There is a real-mode segmented pointer pointing to the
13 * 4K EBDA area at 0x40E.
14 */
12 unsigned int address = *(unsigned short *)phys_to_virt(0x40E); 15 unsigned int address = *(unsigned short *)phys_to_virt(0x40E);
13 address <<= 4; 16 address <<= 4;
14 return address; /* 0 means none */ 17 return address; /* 0 means none */
15} 18}
16 19
20/*
21 * Return the sanitized length of the EBDA in bytes, if it exists.
22 */
23static inline unsigned int get_bios_ebda_length(void)
24{
25 unsigned int address;
26 unsigned int length;
27
28 address = get_bios_ebda();
29 if (!address)
30 return 0;
31
32 /* EBDA length is byte 0 of the EBDA (stored in KiB) */
33 length = *(unsigned char *)phys_to_virt(address);
34 length <<= 10;
35
36 /* Trim the length if it extends beyond 640KiB */
37 length = min_t(unsigned int, (640 * 1024) - address, length);
38 return length;
39}
40
17void reserve_ebda_region(void); 41void reserve_ebda_region(void);
18 42
19#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION 43#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
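get_bios_ebda_length() pairs with get_bios_ebda(): the first byte of the EBDA stores its size in KiB, and the helper clamps the result so the region never extends past 640 KiB. A small sketch that just reports the region; it assumes it runs on a BIOS platform where phys_to_virt() of low memory is already valid:

#include <linux/kernel.h>
#include <asm/bios_ebda.h>

static void example_report_ebda(void)
{
        unsigned int base = get_bios_ebda();
        unsigned int len = get_bios_ebda_length();

        if (!base)
                return;                 /* no EBDA on this machine */

        pr_info("EBDA at %#x, %u bytes\n", base, len);
}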
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index bafd80defa43..69d58131bc8e 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -440,6 +440,8 @@ static inline int fls(int x)
440 440
441#ifdef __KERNEL__ 441#ifdef __KERNEL__
442 442
443#include <asm-generic/bitops/find.h>
444
443#include <asm-generic/bitops/sched.h> 445#include <asm-generic/bitops/sched.h>
444 446
445#define ARCH_HAS_FAST_MULTIPLIER 1 447#define ARCH_HAS_FAST_MULTIPLIER 1
@@ -454,14 +456,12 @@ static inline int fls(int x)
454 456
455#ifdef __KERNEL__ 457#ifdef __KERNEL__
456 458
457#include <asm-generic/bitops/ext2-non-atomic.h> 459#include <asm-generic/bitops/le.h>
458 460
459#define ext2_set_bit_atomic(lock, nr, addr) \ 461#define ext2_set_bit_atomic(lock, nr, addr) \
460 test_and_set_bit((nr), (unsigned long *)(addr)) 462 test_and_set_bit((nr), (unsigned long *)(addr))
461#define ext2_clear_bit_atomic(lock, nr, addr) \ 463#define ext2_clear_bit_atomic(lock, nr, addr) \
462 test_and_clear_bit((nr), (unsigned long *)(addr)) 464 test_and_clear_bit((nr), (unsigned long *)(addr))
463 465
464#include <asm-generic/bitops/minix.h>
465
466#endif /* __KERNEL__ */ 466#endif /* __KERNEL__ */
467#endif /* _ASM_X86_BITOPS_H */ 467#endif /* _ASM_X86_BITOPS_H */
diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h
index 3b62ab56c7a0..5e1a2eef3e7c 100644
--- a/arch/x86/include/asm/boot.h
+++ b/arch/x86/include/asm/boot.h
@@ -32,11 +32,7 @@
32#define BOOT_HEAP_SIZE 0x400000 32#define BOOT_HEAP_SIZE 0x400000
33#else /* !CONFIG_KERNEL_BZIP2 */ 33#else /* !CONFIG_KERNEL_BZIP2 */
34 34
35#ifdef CONFIG_X86_64 35#define BOOT_HEAP_SIZE 0x8000
36#define BOOT_HEAP_SIZE 0x7000
37#else
38#define BOOT_HEAP_SIZE 0x4000
39#endif
40 36
41#endif /* !CONFIG_KERNEL_BZIP2 */ 37#endif /* !CONFIG_KERNEL_BZIP2 */
42 38
diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h
index 8e6218550e77..e020d88ec02d 100644
--- a/arch/x86/include/asm/bootparam.h
+++ b/arch/x86/include/asm/bootparam.h
@@ -12,6 +12,7 @@
12/* setup data types */ 12/* setup data types */
13#define SETUP_NONE 0 13#define SETUP_NONE 0
14#define SETUP_E820_EXT 1 14#define SETUP_E820_EXT 1
15#define SETUP_DTB 2
15 16
16/* extensible setup data list node */ 17/* extensible setup data list node */
17struct setup_data { 18struct setup_data {
@@ -124,6 +125,7 @@ enum {
124 X86_SUBARCH_LGUEST, 125 X86_SUBARCH_LGUEST,
125 X86_SUBARCH_XEN, 126 X86_SUBARCH_XEN,
126 X86_SUBARCH_MRST, 127 X86_SUBARCH_MRST,
128 X86_SUBARCH_CE4100,
127 X86_NR_SUBARCHS, 129 X86_NR_SUBARCHS,
128}; 130};
129 131
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 63e35ec9075c..4e12668711e5 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -1,48 +1,8 @@
1#ifndef _ASM_X86_CACHEFLUSH_H 1#ifndef _ASM_X86_CACHEFLUSH_H
2#define _ASM_X86_CACHEFLUSH_H 2#define _ASM_X86_CACHEFLUSH_H
3 3
4/* Keep includes the same across arches. */
5#include <linux/mm.h>
6
7/* Caches aren't brain-dead on the intel. */ 4/* Caches aren't brain-dead on the intel. */
8static inline void flush_cache_all(void) { } 5#include <asm-generic/cacheflush.h>
9static inline void flush_cache_mm(struct mm_struct *mm) { }
10static inline void flush_cache_dup_mm(struct mm_struct *mm) { }
11static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
16static inline void flush_dcache_page(struct page *page) { }
17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
19static inline void flush_icache_range(unsigned long start,
20 unsigned long end) { }
21static inline void flush_icache_page(struct vm_area_struct *vma,
22 struct page *page) { }
23static inline void flush_icache_user_range(struct vm_area_struct *vma,
24 struct page *page,
25 unsigned long addr,
26 unsigned long len) { }
27static inline void flush_cache_vmap(unsigned long start, unsigned long end) { }
28static inline void flush_cache_vunmap(unsigned long start,
29 unsigned long end) { }
30
31static inline void copy_to_user_page(struct vm_area_struct *vma,
32 struct page *page, unsigned long vaddr,
33 void *dst, const void *src,
34 unsigned long len)
35{
36 memcpy(dst, src, len);
37}
38
39static inline void copy_from_user_page(struct vm_area_struct *vma,
40 struct page *page, unsigned long vaddr,
41 void *dst, const void *src,
42 unsigned long len)
43{
44 memcpy(dst, src, len);
45}
46 6
47#ifdef CONFIG_X86_PAT 7#ifdef CONFIG_X86_PAT
48/* 8/*
@@ -111,7 +71,7 @@ static inline void set_page_memtype(struct page *pg, unsigned long memtype) { }
111 * Read/Write : ReadOnly, ReadWrite 71 * Read/Write : ReadOnly, ReadWrite
112 * Presence : NotPresent 72 * Presence : NotPresent
113 * 73 *
114 * Within a catagory, the attributes are mutually exclusive. 74 * Within a category, the attributes are mutually exclusive.
115 * 75 *
116 * The implementation of this API will take care of various aspects that 76 * The implementation of this API will take care of various aspects that
117 * are associated with changing such attributes, such as: 77 * are associated with changing such attributes, such as:
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index 0918654305af..0d467b338835 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,9 +62,9 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern void detect_calgary(void); 65extern int detect_calgary(void);
66#else 66#else
67static inline void detect_calgary(void) { return; } 67static inline int detect_calgary(void) { return -ENODEV; }
68#endif 68#endif
69 69
70#endif /* _ASM_X86_CALGARY_H */ 70#endif /* _ASM_X86_CALGARY_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 0e63c9a2a8d0..30af5a832163 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -48,36 +48,38 @@ For 32-bit we have the following conventions - kernel is built with
48 48
49 49
50/* 50/*
51 * 64-bit system call stack frame layout defines and helpers, 51 * 64-bit system call stack frame layout defines and helpers, for
52 * for assembly code: 52 * assembly code (note that the seemingly unnecessary parentheses
53 * are to prevent cpp from inserting spaces in expressions that get
54 * passed to macros):
53 */ 55 */
54 56
55#define R15 0 57#define R15 (0)
56#define R14 8 58#define R14 (8)
57#define R13 16 59#define R13 (16)
58#define R12 24 60#define R12 (24)
59#define RBP 32 61#define RBP (32)
60#define RBX 40 62#define RBX (40)
61 63
62/* arguments: interrupts/non tracing syscalls only save up to here: */ 64/* arguments: interrupts/non tracing syscalls only save up to here: */
63#define R11 48 65#define R11 (48)
64#define R10 56 66#define R10 (56)
65#define R9 64 67#define R9 (64)
66#define R8 72 68#define R8 (72)
67#define RAX 80 69#define RAX (80)
68#define RCX 88 70#define RCX (88)
69#define RDX 96 71#define RDX (96)
70#define RSI 104 72#define RSI (104)
71#define RDI 112 73#define RDI (112)
72#define ORIG_RAX 120 /* + error_code */ 74#define ORIG_RAX (120) /* + error_code */
73/* end of arguments */ 75/* end of arguments */
74 76
75/* cpu exception frame or undefined in case of fast syscall: */ 77/* cpu exception frame or undefined in case of fast syscall: */
76#define RIP 128 78#define RIP (128)
77#define CS 136 79#define CS (136)
78#define EFLAGS 144 80#define EFLAGS (144)
79#define RSP 152 81#define RSP (152)
80#define SS 160 82#define SS (160)
81 83
82#define ARGOFFSET R11 84#define ARGOFFSET R11
83#define SWFRAME ORIG_RAX 85#define SWFRAME ORIG_RAX
@@ -111,7 +113,7 @@ For 32-bit we have the following conventions - kernel is built with
111 .endif 113 .endif
112 .endm 114 .endm
113 115
114#define ARG_SKIP 9*8 116#define ARG_SKIP (9*8)
115 117
116 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \ 118 .macro RESTORE_ARGS skiprax=0, addskip=0, skiprcx=0, skipr11=0, \
117 skipr8910=0, skiprdx=0 119 skipr8910=0, skiprdx=0
@@ -169,7 +171,7 @@ For 32-bit we have the following conventions - kernel is built with
169 .endif 171 .endif
170 .endm 172 .endm
171 173
172#define REST_SKIP 6*8 174#define REST_SKIP (6*8)
173 175
174 .macro SAVE_REST 176 .macro SAVE_REST
175 subq $REST_SKIP, %rsp 177 subq $REST_SKIP, %rsp
diff --git a/arch/x86/include/asm/ce4100.h b/arch/x86/include/asm/ce4100.h
new file mode 100644
index 000000000000..e656ad8c0a2e
--- /dev/null
+++ b/arch/x86/include/asm/ce4100.h
@@ -0,0 +1,6 @@
1#ifndef _ASM_CE4100_H_
2#define _ASM_CE4100_H_
3
4int ce4100_pci_init(void);
5
6#endif
diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index b185091bf19c..4564c8e28a33 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -32,6 +32,6 @@ extern void arch_unregister_cpu(int);
32 32
33DECLARE_PER_CPU(int, cpu_state); 33DECLARE_PER_CPU(int, cpu_state);
34 34
35extern unsigned int boot_cpu_id; 35int mwait_usable(const struct cpuinfo_x86 *);
36 36
37#endif /* _ASM_X86_CPU_H */ 37#endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 3f76523589af..71cc3800712c 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -125,7 +125,7 @@
125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */ 125#define X86_FEATURE_OSXSAVE (4*32+27) /* "" XSAVE enabled in the OS */
126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */ 126#define X86_FEATURE_AVX (4*32+28) /* Advanced Vector Extensions */
127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */ 127#define X86_FEATURE_F16C (4*32+29) /* 16-bit fp conversions */
128#define X86_FEATURE_RDRND (4*32+30) /* The RDRAND instruction */ 128#define X86_FEATURE_RDRAND (4*32+30) /* The RDRAND instruction */
129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */ 129#define X86_FEATURE_HYPERVISOR (4*32+31) /* Running on a hypervisor */
130 130
131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ 131/* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
@@ -152,10 +152,15 @@
152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */ 152#define X86_FEATURE_3DNOWPREFETCH (6*32+ 8) /* 3DNow prefetch instructions */
153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */ 153#define X86_FEATURE_OSVW (6*32+ 9) /* OS Visible Workaround */
154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */ 154#define X86_FEATURE_IBS (6*32+10) /* Instruction Based Sampling */
155#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ 155#define X86_FEATURE_XOP (6*32+11) /* extended AVX instructions */
156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ 156#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ 157#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
158#define X86_FEATURE_LWP (6*32+15) /* Light Weight Profiling */
159#define X86_FEATURE_FMA4 (6*32+16) /* 4 operands MAC instructions */
158#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */ 160#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
161#define X86_FEATURE_TBM (6*32+21) /* trailing bit manipulations */
162#define X86_FEATURE_TOPOEXT (6*32+22) /* topology extensions CPUID leafs */
163#define X86_FEATURE_PERFCTR_CORE (6*32+23) /* core performance counter extensions */
159 164
160/* 165/*
161 * Auxiliary flags: Linux defined - For features scattered in various 166 * Auxiliary flags: Linux defined - For features scattered in various
@@ -180,9 +185,18 @@
180#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */ 185#define X86_FEATURE_LBRV (8*32+ 6) /* AMD LBR Virtualization support */
181#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */ 186#define X86_FEATURE_SVML (8*32+ 7) /* "svm_lock" AMD SVM locking MSR */
182#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */ 187#define X86_FEATURE_NRIPS (8*32+ 8) /* "nrip_save" AMD SVM next_rip save */
188#define X86_FEATURE_TSCRATEMSR (8*32+ 9) /* "tsc_scale" AMD TSC scaling support */
189#define X86_FEATURE_VMCBCLEAN (8*32+10) /* "vmcb_clean" AMD VMCB clean bits support */
190#define X86_FEATURE_FLUSHBYASID (8*32+11) /* AMD flush-by-ASID support */
191#define X86_FEATURE_DECODEASSISTS (8*32+12) /* AMD Decode Assists support */
192#define X86_FEATURE_PAUSEFILTER (8*32+13) /* AMD filtered pause intercept */
193#define X86_FEATURE_PFTHRESHOLD (8*32+14) /* AMD pause filter threshold */
194
183 195
184/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ 196/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
185#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ 197#define X86_FEATURE_FSGSBASE (9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
198#define X86_FEATURE_SMEP (9*32+ 7) /* Supervisor Mode Execution Protection */
199#define X86_FEATURE_ERMS (9*32+ 9) /* Enhanced REP MOVSB/STOSB */
186 200
187#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 201#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
188 202
@@ -195,8 +209,7 @@ extern const char * const x86_power_flags[32];
195#define test_cpu_cap(c, bit) \ 209#define test_cpu_cap(c, bit) \
196 test_bit(bit, (unsigned long *)((c)->x86_capability)) 210 test_bit(bit, (unsigned long *)((c)->x86_capability))
197 211
198#define cpu_has(c, bit) \ 212#define REQUIRED_MASK_BIT_SET(bit) \
199 (__builtin_constant_p(bit) && \
200 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \ 213 ( (((bit)>>5)==0 && (1UL<<((bit)&31) & REQUIRED_MASK0)) || \
201 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \ 214 (((bit)>>5)==1 && (1UL<<((bit)&31) & REQUIRED_MASK1)) || \
202 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \ 215 (((bit)>>5)==2 && (1UL<<((bit)&31) & REQUIRED_MASK2)) || \
@@ -206,10 +219,16 @@ extern const char * const x86_power_flags[32];
206 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \ 219 (((bit)>>5)==6 && (1UL<<((bit)&31) & REQUIRED_MASK6)) || \
207 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \ 220 (((bit)>>5)==7 && (1UL<<((bit)&31) & REQUIRED_MASK7)) || \
208 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \ 221 (((bit)>>5)==8 && (1UL<<((bit)&31) & REQUIRED_MASK8)) || \
209 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) ) \ 222 (((bit)>>5)==9 && (1UL<<((bit)&31) & REQUIRED_MASK9)) )
210 ? 1 : \ 223
224#define cpu_has(c, bit) \
225 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
211 test_cpu_cap(c, bit)) 226 test_cpu_cap(c, bit))
212 227
228#define this_cpu_has(bit) \
229 (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \
230 x86_this_cpu_test_bit(bit, (unsigned long *)&cpu_info.x86_capability))
231
213#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) 232#define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit)
214 233
215#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) 234#define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability))
@@ -268,6 +287,7 @@ extern const char * const x86_power_flags[32];
268#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 287#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
269#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 288#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
270#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ) 289#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
290#define cpu_has_perfctr_core boot_cpu_has(X86_FEATURE_PERFCTR_CORE)
271 291
272#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 292#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
273# define cpu_has_invlpg 1 293# define cpu_has_invlpg 1
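Splitting the REQUIRED_MASK short-circuit out into REQUIRED_MASK_BIT_SET() lets the new this_cpu_has() reuse it: features the kernel is compiled to require resolve at build time, and everything else falls back to a per-cpu bit test of the running CPU's capability words. A short sketch contrasting it with the boot-CPU check, using feature bits added in this header; the example function and messages are illustrative:

#include <linux/kernel.h>
#include <asm/cpufeature.h>

static void example_feature_checks(void)
{
        /* Boot-CPU view: fine for one-time setup decisions. */
        if (boot_cpu_has(X86_FEATURE_PERFCTR_CORE))
                pr_info("core performance counter extensions available\n");

        /* Per-CPU view: tests the calling CPU's own capability bits. */
        if (this_cpu_has(X86_FEATURE_ERMS))
                pr_info("this CPU has enhanced REP MOVSB/STOSB\n");
}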
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index b81002f23614..078ad0caefc6 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -94,7 +94,7 @@ static inline void hw_breakpoint_disable(void)
94 94
95static inline int hw_breakpoint_active(void) 95static inline int hw_breakpoint_active(void)
96{ 96{
97 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK; 97 return __this_cpu_read(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
98} 98}
99 99
100extern void aout_dump_debugregs(struct user *dump); 100extern void aout_dump_debugregs(struct user *dump);
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index 617bd56b3070..7b439d9aea2a 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -4,30 +4,33 @@
4#include <asm/desc_defs.h> 4#include <asm/desc_defs.h>
5#include <asm/ldt.h> 5#include <asm/ldt.h>
6#include <asm/mmu.h> 6#include <asm/mmu.h>
7
7#include <linux/smp.h> 8#include <linux/smp.h>
8 9
9static inline void fill_ldt(struct desc_struct *desc, 10static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *info)
10 const struct user_desc *info) 11{
11{ 12 desc->limit0 = info->limit & 0x0ffff;
12 desc->limit0 = info->limit & 0x0ffff; 13
13 desc->base0 = info->base_addr & 0x0000ffff; 14 desc->base0 = (info->base_addr & 0x0000ffff);
14 15 desc->base1 = (info->base_addr & 0x00ff0000) >> 16;
15 desc->base1 = (info->base_addr & 0x00ff0000) >> 16; 16
16 desc->type = (info->read_exec_only ^ 1) << 1; 17 desc->type = (info->read_exec_only ^ 1) << 1;
17 desc->type |= info->contents << 2; 18 desc->type |= info->contents << 2;
18 desc->s = 1; 19
19 desc->dpl = 0x3; 20 desc->s = 1;
20 desc->p = info->seg_not_present ^ 1; 21 desc->dpl = 0x3;
21 desc->limit = (info->limit & 0xf0000) >> 16; 22 desc->p = info->seg_not_present ^ 1;
22 desc->avl = info->useable; 23 desc->limit = (info->limit & 0xf0000) >> 16;
23 desc->d = info->seg_32bit; 24 desc->avl = info->useable;
24 desc->g = info->limit_in_pages; 25 desc->d = info->seg_32bit;
25 desc->base2 = (info->base_addr & 0xff000000) >> 24; 26 desc->g = info->limit_in_pages;
27
28 desc->base2 = (info->base_addr & 0xff000000) >> 24;
26 /* 29 /*
27 * Don't allow setting of the lm bit. It is useless anyway 30 * Don't allow setting of the lm bit. It is useless anyway
28 * because 64bit system calls require __USER_CS: 31 * because 64bit system calls require __USER_CS:
29 */ 32 */
30 desc->l = 0; 33 desc->l = 0;
31} 34}
32 35
33extern struct desc_ptr idt_descr; 36extern struct desc_ptr idt_descr;
@@ -36,6 +39,7 @@ extern gate_desc idt_table[];
36struct gdt_page { 39struct gdt_page {
37 struct desc_struct gdt[GDT_ENTRIES]; 40 struct desc_struct gdt[GDT_ENTRIES];
38} __attribute__((aligned(PAGE_SIZE))); 41} __attribute__((aligned(PAGE_SIZE)));
42
39DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page); 43DECLARE_PER_CPU_PAGE_ALIGNED(struct gdt_page, gdt_page);
40 44
41static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu) 45static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
@@ -48,16 +52,16 @@ static inline struct desc_struct *get_cpu_gdt_table(unsigned int cpu)
48static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func, 52static inline void pack_gate(gate_desc *gate, unsigned type, unsigned long func,
49 unsigned dpl, unsigned ist, unsigned seg) 53 unsigned dpl, unsigned ist, unsigned seg)
50{ 54{
51 gate->offset_low = PTR_LOW(func); 55 gate->offset_low = PTR_LOW(func);
52 gate->segment = __KERNEL_CS; 56 gate->segment = __KERNEL_CS;
53 gate->ist = ist; 57 gate->ist = ist;
54 gate->p = 1; 58 gate->p = 1;
55 gate->dpl = dpl; 59 gate->dpl = dpl;
56 gate->zero0 = 0; 60 gate->zero0 = 0;
57 gate->zero1 = 0; 61 gate->zero1 = 0;
58 gate->type = type; 62 gate->type = type;
59 gate->offset_middle = PTR_MIDDLE(func); 63 gate->offset_middle = PTR_MIDDLE(func);
60 gate->offset_high = PTR_HIGH(func); 64 gate->offset_high = PTR_HIGH(func);
61} 65}
62 66
63#else 67#else
@@ -66,8 +70,7 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
66 unsigned short seg) 70 unsigned short seg)
67{ 71{
68 gate->a = (seg << 16) | (base & 0xffff); 72 gate->a = (seg << 16) | (base & 0xffff);
69 gate->b = (base & 0xffff0000) | 73 gate->b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
70 (((0x80 | type | (dpl << 5)) & 0xff) << 8);
71} 74}
72 75
73#endif 76#endif
@@ -75,31 +78,29 @@ static inline void pack_gate(gate_desc *gate, unsigned char type,
75static inline int desc_empty(const void *ptr) 78static inline int desc_empty(const void *ptr)
76{ 79{
77 const u32 *desc = ptr; 80 const u32 *desc = ptr;
81
78 return !(desc[0] | desc[1]); 82 return !(desc[0] | desc[1]);
79} 83}
80 84
81#ifdef CONFIG_PARAVIRT 85#ifdef CONFIG_PARAVIRT
82#include <asm/paravirt.h> 86#include <asm/paravirt.h>
83#else 87#else
84#define load_TR_desc() native_load_tr_desc() 88#define load_TR_desc() native_load_tr_desc()
85#define load_gdt(dtr) native_load_gdt(dtr) 89#define load_gdt(dtr) native_load_gdt(dtr)
86#define load_idt(dtr) native_load_idt(dtr) 90#define load_idt(dtr) native_load_idt(dtr)
87#define load_tr(tr) asm volatile("ltr %0"::"m" (tr)) 91#define load_tr(tr) asm volatile("ltr %0"::"m" (tr))
88#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) 92#define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt))
89 93
90#define store_gdt(dtr) native_store_gdt(dtr) 94#define store_gdt(dtr) native_store_gdt(dtr)
91#define store_idt(dtr) native_store_idt(dtr) 95#define store_idt(dtr) native_store_idt(dtr)
92#define store_tr(tr) (tr = native_store_tr()) 96#define store_tr(tr) (tr = native_store_tr())
93 97
94#define load_TLS(t, cpu) native_load_tls(t, cpu) 98#define load_TLS(t, cpu) native_load_tls(t, cpu)
95#define set_ldt native_set_ldt 99#define set_ldt native_set_ldt
96 100
97#define write_ldt_entry(dt, entry, desc) \ 101#define write_ldt_entry(dt, entry, desc) native_write_ldt_entry(dt, entry, desc)
98 native_write_ldt_entry(dt, entry, desc) 102#define write_gdt_entry(dt, entry, desc, type) native_write_gdt_entry(dt, entry, desc, type)
99#define write_gdt_entry(dt, entry, desc, type) \ 103#define write_idt_entry(dt, entry, g) native_write_idt_entry(dt, entry, g)
100 native_write_gdt_entry(dt, entry, desc, type)
101#define write_idt_entry(dt, entry, g) \
102 native_write_idt_entry(dt, entry, g)
103 104
104static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries) 105static inline void paravirt_alloc_ldt(struct desc_struct *ldt, unsigned entries)
105{ 106{
@@ -112,33 +113,27 @@ static inline void paravirt_free_ldt(struct desc_struct *ldt, unsigned entries)
112 113
113#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt)) 114#define store_ldt(ldt) asm("sldt %0" : "=m"(ldt))
114 115
115static inline void native_write_idt_entry(gate_desc *idt, int entry, 116static inline void native_write_idt_entry(gate_desc *idt, int entry, const gate_desc *gate)
116 const gate_desc *gate)
117{ 117{
118 memcpy(&idt[entry], gate, sizeof(*gate)); 118 memcpy(&idt[entry], gate, sizeof(*gate));
119} 119}
120 120
121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, 121static inline void native_write_ldt_entry(struct desc_struct *ldt, int entry, const void *desc)
122 const void *desc)
123{ 122{
124 memcpy(&ldt[entry], desc, 8); 123 memcpy(&ldt[entry], desc, 8);
125} 124}
126 125
127static inline void native_write_gdt_entry(struct desc_struct *gdt, int entry, 126static inline void
128 const void *desc, int type) 127native_write_gdt_entry(struct desc_struct *gdt, int entry, const void *desc, int type)
129{ 128{
130 unsigned int size; 129 unsigned int size;
130
131 switch (type) { 131 switch (type) {
132 case DESC_TSS: 132 case DESC_TSS: size = sizeof(tss_desc); break;
133 size = sizeof(tss_desc); 133 case DESC_LDT: size = sizeof(ldt_desc); break;
134 break; 134 default: size = sizeof(*gdt); break;
135 case DESC_LDT:
136 size = sizeof(ldt_desc);
137 break;
138 default:
139 size = sizeof(struct desc_struct);
140 break;
141 } 135 }
136
142 memcpy(&gdt[entry], desc, size); 137 memcpy(&gdt[entry], desc, size);
143} 138}
144 139
@@ -154,20 +149,21 @@ static inline void pack_descriptor(struct desc_struct *desc, unsigned long base,
154} 149}
155 150
156 151
157static inline void set_tssldt_descriptor(void *d, unsigned long addr, 152static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size)
158 unsigned type, unsigned size)
159{ 153{
160#ifdef CONFIG_X86_64 154#ifdef CONFIG_X86_64
161 struct ldttss_desc64 *desc = d; 155 struct ldttss_desc64 *desc = d;
156
162 memset(desc, 0, sizeof(*desc)); 157 memset(desc, 0, sizeof(*desc));
163 desc->limit0 = size & 0xFFFF; 158
164 desc->base0 = PTR_LOW(addr); 159 desc->limit0 = size & 0xFFFF;
165 desc->base1 = PTR_MIDDLE(addr) & 0xFF; 160 desc->base0 = PTR_LOW(addr);
166 desc->type = type; 161 desc->base1 = PTR_MIDDLE(addr) & 0xFF;
167 desc->p = 1; 162 desc->type = type;
168 desc->limit1 = (size >> 16) & 0xF; 163 desc->p = 1;
169 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; 164 desc->limit1 = (size >> 16) & 0xF;
170 desc->base3 = PTR_HIGH(addr); 165 desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF;
166 desc->base3 = PTR_HIGH(addr);
171#else 167#else
172 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); 168 pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0);
173#endif 169#endif
@@ -237,14 +233,16 @@ static inline void native_store_idt(struct desc_ptr *dtr)
237static inline unsigned long native_store_tr(void) 233static inline unsigned long native_store_tr(void)
238{ 234{
239 unsigned long tr; 235 unsigned long tr;
236
240 asm volatile("str %0":"=r" (tr)); 237 asm volatile("str %0":"=r" (tr));
238
241 return tr; 239 return tr;
242} 240}
243 241
244static inline void native_load_tls(struct thread_struct *t, unsigned int cpu) 242static inline void native_load_tls(struct thread_struct *t, unsigned int cpu)
245{ 243{
246 unsigned int i;
247 struct desc_struct *gdt = get_cpu_gdt_table(cpu); 244 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
245 unsigned int i;
248 246
249 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++) 247 for (i = 0; i < GDT_ENTRY_TLS_ENTRIES; i++)
250 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i]; 248 gdt[GDT_ENTRY_TLS_MIN + i] = t->tls_array[i];
@@ -313,6 +311,7 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
313 unsigned dpl, unsigned ist, unsigned seg) 311 unsigned dpl, unsigned ist, unsigned seg)
314{ 312{
315 gate_desc s; 313 gate_desc s;
314
316 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg); 315 pack_gate(&s, type, (unsigned long)addr, dpl, ist, seg);
317 /* 316 /*
318 * does not need to be atomic because it is only done once at 317 * does not need to be atomic because it is only done once at
@@ -343,8 +342,9 @@ static inline void alloc_system_vector(int vector)
343 set_bit(vector, used_vectors); 342 set_bit(vector, used_vectors);
344 if (first_system_vector > vector) 343 if (first_system_vector > vector)
345 first_system_vector = vector; 344 first_system_vector = vector;
346 } else 345 } else {
347 BUG(); 346 BUG();
347 }
348} 348}
349 349
350static inline void alloc_intr_gate(unsigned int n, void *addr) 350static inline void alloc_intr_gate(unsigned int n, void *addr)
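The 32-bit pack_gate() variant above builds the whole gate out of two 32-bit words with plain shifts and masks. Below is a minimal stand-alone sketch of that arithmetic so the resulting words can be inspected outside the kernel; the parameter list is simplified and the selector, base and type values are arbitrary examples, not anything the kernel actually installs.

#include <stdio.h>
#include <stdint.h>

/* Same packing formula as the 32-bit pack_gate() hunk above; simplified. */
static void demo_pack_gate(uint32_t *a, uint32_t *b, unsigned char type,
                           uint32_t base, unsigned dpl, uint16_t seg)
{
    *a = ((uint32_t)seg << 16) | (base & 0xffff);
    *b = (base & 0xffff0000) | (((0x80 | type | (dpl << 5)) & 0xff) << 8);
}

int main(void)
{
    uint32_t a, b;

    /* hypothetical: interrupt gate (type 14), DPL 0, selector 0x60 */
    demo_pack_gate(&a, &b, 14, 0xc0101234u, 0, 0x60);
    printf("a = %#010x\nb = %#010x\n", a, b);
    return 0;
}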
diff --git a/arch/x86/include/asm/dma.h b/arch/x86/include/asm/dma.h
index ca1098a7e580..0bdb0c54d9a1 100644
--- a/arch/x86/include/asm/dma.h
+++ b/arch/x86/include/asm/dma.h
@@ -10,7 +10,6 @@
10 10
11#include <linux/spinlock.h> /* And spinlocks */ 11#include <linux/spinlock.h> /* And spinlocks */
12#include <asm/io.h> /* need byte IO */ 12#include <asm/io.h> /* need byte IO */
13#include <linux/delay.h>
14 13
15#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER 14#ifdef HAVE_REALLY_SLOW_DMA_CONTROLLER
16#define dma_outb outb_p 15#define dma_outb outb_p
@@ -70,22 +69,18 @@
70 69
71#define MAX_DMA_CHANNELS 8 70#define MAX_DMA_CHANNELS 8
72 71
73#ifdef CONFIG_X86_32
74
75/* The maximum address that we can perform a DMA transfer to on this platform */
76#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
77
78#else
79
80/* 16MB ISA DMA zone */ 72/* 16MB ISA DMA zone */
81#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT) 73#define MAX_DMA_PFN ((16 * 1024 * 1024) >> PAGE_SHIFT)
82 74
83/* 4GB broken PCI/AGP hardware bus master zone */ 75/* 4GB broken PCI/AGP hardware bus master zone */
84#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT) 76#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)
85 77
78#ifdef CONFIG_X86_32
79/* The maximum address that we can perform a DMA transfer to on this platform */
80#define MAX_DMA_ADDRESS (PAGE_OFFSET + 0x1000000)
81#else
86/* Compat define for old dma zone */ 82/* Compat define for old dma zone */
87#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT)) 83#define MAX_DMA_ADDRESS ((unsigned long)__va(MAX_DMA_PFN << PAGE_SHIFT))
88
89#endif 84#endif
90 85
91/* 8237 DMA controllers */ 86/* 8237 DMA controllers */
@@ -151,6 +146,7 @@
151#define DMA_AUTOINIT 0x10 146#define DMA_AUTOINIT 0x10
152 147
153 148
149#ifdef CONFIG_ISA_DMA_API
154extern spinlock_t dma_spin_lock; 150extern spinlock_t dma_spin_lock;
155 151
156static inline unsigned long claim_dma_lock(void) 152static inline unsigned long claim_dma_lock(void)
@@ -164,6 +160,7 @@ static inline void release_dma_lock(unsigned long flags)
164{ 160{
165 spin_unlock_irqrestore(&dma_spin_lock, flags); 161 spin_unlock_irqrestore(&dma_spin_lock, flags);
166} 162}
163#endif /* CONFIG_ISA_DMA_API */
167 164
168/* enable/disable a specific DMA channel */ 165/* enable/disable a specific DMA channel */
169static inline void enable_dma(unsigned int dmanr) 166static inline void enable_dma(unsigned int dmanr)
@@ -303,9 +300,11 @@ static inline int get_dma_residue(unsigned int dmanr)
303} 300}
304 301
305 302
306/* These are in kernel/dma.c: */ 303/* These are in kernel/dma.c because x86 uses CONFIG_GENERIC_ISA_DMA */
304#ifdef CONFIG_ISA_DMA_API
307extern int request_dma(unsigned int dmanr, const char *device_id); 305extern int request_dma(unsigned int dmanr, const char *device_id);
308extern void free_dma(unsigned int dmanr); 306extern void free_dma(unsigned int dmanr);
307#endif
309 308
310/* From PCI */ 309/* From PCI */
311 310
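The DMA zone limits kept above are plain shift arithmetic. A quick stand-alone check of the two PFN limits, assuming the usual x86 PAGE_SHIFT of 12 (4 KiB pages) and a 64-bit host so the 4 GiB constant fits in unsigned long:

#include <stdio.h>

#define PAGE_SHIFT 12
#define MAX_DMA_PFN   ((16UL * 1024 * 1024) >> PAGE_SHIFT)
#define MAX_DMA32_PFN ((4UL * 1024 * 1024 * 1024) >> PAGE_SHIFT)

int main(void)
{
    printf("MAX_DMA_PFN   = %lu pages (%#lx)\n", MAX_DMA_PFN, MAX_DMA_PFN);
    printf("MAX_DMA32_PFN = %lu pages (%#lx)\n", MAX_DMA32_PFN, MAX_DMA32_PFN);
    return 0;
}

Under those assumptions MAX_DMA_PFN comes out as 4096 pages (the first 16 MiB) and MAX_DMA32_PFN as 1048576 pages (the first 4 GiB).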
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index 733f7e91e7a9..326099199318 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -89,6 +89,16 @@
89 CFI_ADJUST_CFA_OFFSET -8 89 CFI_ADJUST_CFA_OFFSET -8
90 .endm 90 .endm
91 91
92 .macro pushfq_cfi
93 pushfq
94 CFI_ADJUST_CFA_OFFSET 8
95 .endm
96
97 .macro popfq_cfi
98 popfq
99 CFI_ADJUST_CFA_OFFSET -8
100 .endm
101
92 .macro movq_cfi reg offset=0 102 .macro movq_cfi reg offset=0
93 movq %\reg, \offset(%rsp) 103 movq %\reg, \offset(%rsp)
94 CFI_REL_OFFSET \reg, \offset 104 CFI_REL_OFFSET \reg, \offset
@@ -109,6 +119,16 @@
109 CFI_ADJUST_CFA_OFFSET -4 119 CFI_ADJUST_CFA_OFFSET -4
110 .endm 120 .endm
111 121
122 .macro pushfl_cfi
123 pushfl
124 CFI_ADJUST_CFA_OFFSET 4
125 .endm
126
127 .macro popfl_cfi
128 popfl
129 CFI_ADJUST_CFA_OFFSET -4
130 .endm
131
112 .macro movl_cfi reg offset=0 132 .macro movl_cfi reg offset=0
113 movl %\reg, \offset(%esp) 133 movl %\reg, \offset(%esp)
114 CFI_REL_OFFSET \reg, \offset 134 CFI_REL_OFFSET \reg, \offset
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index ec8a52d14ab1..908b96957d88 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -72,6 +72,9 @@ struct e820map {
72#define BIOS_BEGIN 0x000a0000 72#define BIOS_BEGIN 0x000a0000
73#define BIOS_END 0x00100000 73#define BIOS_END 0x00100000
74 74
75#define BIOS_ROM_BASE 0xffe00000
76#define BIOS_ROM_END 0xffffffff
77
75#ifdef __KERNEL__ 78#ifdef __KERNEL__
76/* see comment in arch/x86/kernel/e820.c */ 79/* see comment in arch/x86/kernel/e820.c */
77extern struct e820map e820; 80extern struct e820map e820;
@@ -93,7 +96,7 @@ extern void e820_setup_gap(void);
93extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, 96extern int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize,
94 unsigned long start_addr, unsigned long long end_addr); 97 unsigned long start_addr, unsigned long long end_addr);
95struct setup_data; 98struct setup_data;
96extern void parse_e820_ext(struct setup_data *data, unsigned long pa_data); 99extern void parse_e820_ext(struct setup_data *data);
97 100
98#if defined(CONFIG_X86_64) || \ 101#if defined(CONFIG_X86_64) || \
99 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION)) 102 (defined(CONFIG_X86_32) && defined(CONFIG_HIBERNATION))
@@ -112,23 +115,13 @@ static inline void early_memtest(unsigned long start, unsigned long end)
112} 115}
113#endif 116#endif
114 117
115extern unsigned long end_user_pfn;
116
117extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
118extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
119extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
120#include <linux/early_res.h>
121
122extern unsigned long e820_end_of_ram_pfn(void); 118extern unsigned long e820_end_of_ram_pfn(void);
123extern unsigned long e820_end_of_low_ram_pfn(void); 119extern unsigned long e820_end_of_low_ram_pfn(void);
124extern int e820_find_active_region(const struct e820entry *ei, 120extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
125 unsigned long start_pfn, 121
126 unsigned long last_pfn, 122void memblock_x86_fill(void);
127 unsigned long *ei_startpfn, 123void memblock_find_dma_reserve(void);
128 unsigned long *ei_endpfn); 124
129extern void e820_register_active_regions(int nid, unsigned long start_pfn,
130 unsigned long end_pfn);
131extern u64 e820_hole_size(u64 start, u64 end);
132extern void finish_e820_parsing(void); 125extern void finish_e820_parsing(void);
133extern void e820_reserve_resources(void); 126extern void e820_reserve_resources(void);
134extern void e820_reserve_resources_late(void); 127extern void e820_reserve_resources_late(void);
diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h
index 8406ed7f9926..7093e4a6a0bc 100644
--- a/arch/x86/include/asm/efi.h
+++ b/arch/x86/include/asm/efi.h
@@ -90,7 +90,8 @@ extern void __iomem *efi_ioremap(unsigned long addr, unsigned long size,
90#endif /* CONFIG_X86_32 */ 90#endif /* CONFIG_X86_32 */
91 91
92extern int add_efi_memmap; 92extern int add_efi_memmap;
93extern void efi_reserve_early(void); 93extern void efi_set_executable(efi_memory_desc_t *md, bool executable);
94extern void efi_memblock_x86_reserve_range(void);
94extern void efi_call_phys_prelog(void); 95extern void efi_call_phys_prelog(void);
95extern void efi_call_phys_epilog(void); 96extern void efi_call_phys_epilog(void);
96 97
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 5d07dea2ebb8..3b0d7ef959b8 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -17,22 +17,14 @@ BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR)
17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) 17BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
18BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR) 18BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
19 19
20BUILD_INTERRUPT3(invalidate_interrupt0,INVALIDATE_TLB_VECTOR_START+0, 20.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
21 smp_invalidate_interrupt) 21 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
22BUILD_INTERRUPT3(invalidate_interrupt1,INVALIDATE_TLB_VECTOR_START+1, 22.if NUM_INVALIDATE_TLB_VECTORS > \idx
23 smp_invalidate_interrupt) 23BUILD_INTERRUPT3(invalidate_interrupt\idx,
24BUILD_INTERRUPT3(invalidate_interrupt2,INVALIDATE_TLB_VECTOR_START+2, 24 (INVALIDATE_TLB_VECTOR_START)+\idx,
25 smp_invalidate_interrupt) 25 smp_invalidate_interrupt)
26BUILD_INTERRUPT3(invalidate_interrupt3,INVALIDATE_TLB_VECTOR_START+3, 26.endif
27 smp_invalidate_interrupt) 27.endr
28BUILD_INTERRUPT3(invalidate_interrupt4,INVALIDATE_TLB_VECTOR_START+4,
29 smp_invalidate_interrupt)
30BUILD_INTERRUPT3(invalidate_interrupt5,INVALIDATE_TLB_VECTOR_START+5,
31 smp_invalidate_interrupt)
32BUILD_INTERRUPT3(invalidate_interrupt6,INVALIDATE_TLB_VECTOR_START+6,
33 smp_invalidate_interrupt)
34BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
35 smp_invalidate_interrupt)
36#endif 28#endif
37 29
38BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR) 30BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
@@ -50,8 +42,8 @@ BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR)
50BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) 42BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR)
51BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) 43BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR)
52 44
53#ifdef CONFIG_PERF_EVENTS 45#ifdef CONFIG_IRQ_WORK
54BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) 46BUILD_INTERRUPT(irq_work_interrupt, IRQ_WORK_VECTOR)
55#endif 47#endif
56 48
57#ifdef CONFIG_X86_THERMAL_VECTOR 49#ifdef CONFIG_X86_THERMAL_VECTOR
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index d07b44f7d1dc..4729b2b63117 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -116,7 +116,11 @@ enum fixed_addresses {
116#endif 116#endif
117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */ 117 FIX_TEXT_POKE1, /* reserve 2 pages for text_poke() */
118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */ 118 FIX_TEXT_POKE0, /* first page is last, because allocation is backward */
119#ifdef CONFIG_X86_MRST
120 FIX_LNW_VRTC,
121#endif
119 __end_of_permanent_fixed_addresses, 122 __end_of_permanent_fixed_addresses,
123
120 /* 124 /*
121 * 256 temporary boot-time mappings, used by early_ioremap(), 125 * 256 temporary boot-time mappings, used by early_ioremap(),
122 * before ioremap() is functional. 126 * before ioremap() is functional.
@@ -214,5 +218,20 @@ static inline unsigned long virt_to_fix(const unsigned long vaddr)
214 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START); 218 BUG_ON(vaddr >= FIXADDR_TOP || vaddr < FIXADDR_START);
215 return __virt_to_fix(vaddr); 219 return __virt_to_fix(vaddr);
216} 220}
221
222/* Return an pointer with offset calculated */
223static __always_inline unsigned long
224__set_fixmap_offset(enum fixed_addresses idx, phys_addr_t phys, pgprot_t flags)
225{
226 __set_fixmap(idx, phys, flags);
227 return fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));
228}
229
230#define set_fixmap_offset(idx, phys) \
231 __set_fixmap_offset(idx, phys, PAGE_KERNEL)
232
233#define set_fixmap_offset_nocache(idx, phys) \
234 __set_fixmap_offset(idx, phys, PAGE_KERNEL_NOCACHE)
235
217#endif /* !__ASSEMBLY__ */ 236#endif /* !__ASSEMBLY__ */
218#endif /* _ASM_X86_FIXMAP_H */ 237#endif /* _ASM_X86_FIXMAP_H */
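The new set_fixmap_offset() helpers above return the fixmap virtual address plus the offset of phys within its page. The same arithmetic in isolation, with an assumed 4 KiB page size and a made-up fake_fix_to_virt() standing in for fix_to_virt():

#include <stdio.h>
#include <stdint.h>

#define PAGE_SIZE 4096ULL

/* hypothetical stand-in for fix_to_virt(idx); the base address is made up */
static uint64_t fake_fix_to_virt(unsigned int idx)
{
    return 0xffffffffff200000ULL - (uint64_t)idx * PAGE_SIZE;
}

int main(void)
{
    uint64_t phys = 0xfed00123ULL;   /* example physical address */
    unsigned int idx = 7;            /* arbitrary fixmap slot */
    uint64_t virt = fake_fix_to_virt(idx) + (phys & (PAGE_SIZE - 1));

    printf("virt = %#llx, page offset kept = %#llx\n",
           (unsigned long long)virt,
           (unsigned long long)(phys & (PAGE_SIZE - 1)));
    return 0;
}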
diff --git a/arch/x86/include/asm/frame.h b/arch/x86/include/asm/frame.h
index 06850a7194e1..2c6fc9e62812 100644
--- a/arch/x86/include/asm/frame.h
+++ b/arch/x86/include/asm/frame.h
@@ -7,14 +7,12 @@
7 frame pointer later */ 7 frame pointer later */
8#ifdef CONFIG_FRAME_POINTER 8#ifdef CONFIG_FRAME_POINTER
9 .macro FRAME 9 .macro FRAME
10 pushl %ebp 10 pushl_cfi %ebp
11 CFI_ADJUST_CFA_OFFSET 4
12 CFI_REL_OFFSET ebp,0 11 CFI_REL_OFFSET ebp,0
13 movl %esp,%ebp 12 movl %esp,%ebp
14 .endm 13 .endm
15 .macro ENDFRAME 14 .macro ENDFRAME
16 popl %ebp 15 popl_cfi %ebp
17 CFI_ADJUST_CFA_OFFSET -4
18 CFI_RESTORE ebp 16 CFI_RESTORE ebp
19 .endm 17 .endm
20#else 18#else
diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index db24c2278be0..268c783ab1c0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -38,11 +38,10 @@ extern void mcount(void);
38static inline unsigned long ftrace_call_adjust(unsigned long addr) 38static inline unsigned long ftrace_call_adjust(unsigned long addr)
39{ 39{
40 /* 40 /*
41 * call mcount is "e8 <4 byte offset>" 41 * addr is the address of the mcount call instruction.
42 * The addr points to the 4 byte offset and the caller of this 42 * recordmcount does the necessary offset calculation.
43 * function wants the pointer to e8. Simply subtract one.
44 */ 43 */
45 return addr - 1; 44 return addr;
46} 45}
47 46
48#ifdef CONFIG_DYNAMIC_FTRACE 47#ifdef CONFIG_DYNAMIC_FTRACE
diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h
index 1f11ce44e956..d09bb03653f0 100644
--- a/arch/x86/include/asm/futex.h
+++ b/arch/x86/include/asm/futex.h
@@ -37,7 +37,7 @@
37 "+m" (*uaddr), "=&r" (tem) \ 37 "+m" (*uaddr), "=&r" (tem) \
38 : "r" (oparg), "i" (-EFAULT), "1" (0)) 38 : "r" (oparg), "i" (-EFAULT), "1" (0))
39 39
40static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr) 40static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr)
41{ 41{
42 int op = (encoded_op >> 28) & 7; 42 int op = (encoded_op >> 28) & 7;
43 int cmp = (encoded_op >> 24) & 15; 43 int cmp = (encoded_op >> 24) & 15;
@@ -48,7 +48,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) 48 if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
49 oparg = 1 << oparg; 49 oparg = 1 << oparg;
50 50
51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 51 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
52 return -EFAULT; 52 return -EFAULT;
53 53
54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 54#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
@@ -109,9 +109,10 @@ static inline int futex_atomic_op_inuser(int encoded_op, int __user *uaddr)
109 return ret; 109 return ret;
110} 110}
111 111
112static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval, 112static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
113 int newval) 113 u32 oldval, u32 newval)
114{ 114{
115 int ret = 0;
115 116
116#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP) 117#if defined(CONFIG_X86_32) && !defined(CONFIG_X86_BSWAP)
117 /* Real i386 machines have no cmpxchg instruction */ 118 /* Real i386 machines have no cmpxchg instruction */
@@ -119,21 +120,22 @@ static inline int futex_atomic_cmpxchg_inatomic(int __user *uaddr, int oldval,
119 return -ENOSYS; 120 return -ENOSYS;
120#endif 121#endif
121 122
122 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) 123 if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
123 return -EFAULT; 124 return -EFAULT;
124 125
125 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %3, %1\n" 126 asm volatile("1:\t" LOCK_PREFIX "cmpxchgl %4, %2\n"
126 "2:\t.section .fixup, \"ax\"\n" 127 "2:\t.section .fixup, \"ax\"\n"
127 "3:\tmov %2, %0\n" 128 "3:\tmov %3, %0\n"
128 "\tjmp 2b\n" 129 "\tjmp 2b\n"
129 "\t.previous\n" 130 "\t.previous\n"
130 _ASM_EXTABLE(1b, 3b) 131 _ASM_EXTABLE(1b, 3b)
131 : "=a" (oldval), "+m" (*uaddr) 132 : "+r" (ret), "=a" (oldval), "+m" (*uaddr)
132 : "i" (-EFAULT), "r" (newval), "0" (oldval) 133 : "i" (-EFAULT), "r" (newval), "1" (oldval)
133 : "memory" 134 : "memory"
134 ); 135 );
135 136
136 return oldval; 137 *uval = oldval;
138 return ret;
137} 139}
138 140
139#endif 141#endif
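futex_atomic_op_inuser() above unpacks everything it needs from the single encoded_op word. Here is a stand-alone decode of one made-up value; FUTEX_OP_OPARG_SHIFT is the flag value from <linux/futex.h>, and the oparg/cmparg extraction is simplified to unsigned masks rather than the kernel's sign-extending shifts:

#include <stdio.h>

#define FUTEX_OP_OPARG_SHIFT 8u   /* flag: treat oparg as a shift count */

int main(void)
{
    unsigned int encoded_op = 0x80002003u;   /* made-up example */
    unsigned int op     = (encoded_op >> 28) & 7;
    unsigned int cmp    = (encoded_op >> 24) & 15;
    unsigned int oparg  = (encoded_op >> 12) & 0xfff;
    unsigned int cmparg = encoded_op & 0xfff;

    /* FUTEX_OP_OPARG_SHIFT << 28 is the top bit of the encoded word */
    if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
        oparg = 1u << oparg;

    printf("op=%u cmp=%u oparg=%u cmparg=%u\n", op, cmp, oparg, cmparg);
    return 0;
}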
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 4ac5b0f33fc1..156cd5d18d2a 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -17,6 +17,7 @@ extern int fix_aperture;
17#define GARTEN (1<<0) 17#define GARTEN (1<<0)
18#define DISGARTCPU (1<<4) 18#define DISGARTCPU (1<<4)
19#define DISGARTIO (1<<5) 19#define DISGARTIO (1<<5)
20#define DISTLBWALKPRB (1<<6)
20 21
21/* GART cache control register bits. */ 22/* GART cache control register bits. */
22#define INVGART (1<<0) 23#define INVGART (1<<0)
@@ -27,7 +28,6 @@ extern int fix_aperture;
27#define AMD64_GARTAPERTUREBASE 0x94 28#define AMD64_GARTAPERTUREBASE 0x94
28#define AMD64_GARTTABLEBASE 0x98 29#define AMD64_GARTTABLEBASE 0x98
29#define AMD64_GARTCACHECTL 0x9c 30#define AMD64_GARTCACHECTL 0x9c
30#define AMD64_GARTEN (1<<0)
31 31
32#ifdef CONFIG_GART_IOMMU 32#ifdef CONFIG_GART_IOMMU
33extern int gart_iommu_aperture; 33extern int gart_iommu_aperture;
@@ -37,7 +37,7 @@ extern int gart_iommu_aperture_disabled;
37extern void early_gart_iommu_check(void); 37extern void early_gart_iommu_check(void);
38extern int gart_iommu_init(void); 38extern int gart_iommu_init(void);
39extern void __init gart_parse_options(char *); 39extern void __init gart_parse_options(char *);
40extern void gart_iommu_hole_init(void); 40extern int gart_iommu_hole_init(void);
41 41
42#else 42#else
43#define gart_iommu_aperture 0 43#define gart_iommu_aperture 0
@@ -50,28 +50,42 @@ static inline void early_gart_iommu_check(void)
50static inline void gart_parse_options(char *options) 50static inline void gart_parse_options(char *options)
51{ 51{
52} 52}
53static inline void gart_iommu_hole_init(void) 53static inline int gart_iommu_hole_init(void)
54{ 54{
55 return -ENODEV;
55} 56}
56#endif 57#endif
57 58
58extern int agp_amd64_init(void); 59extern int agp_amd64_init(void);
59 60
61static inline void gart_set_size_and_enable(struct pci_dev *dev, u32 order)
62{
63 u32 ctl;
64
65 /*
66 * Don't enable translation but enable GART IO and CPU accesses.
67 * Also, set DISTLBWALKPRB since GART tables memory is UC.
68 */
69 ctl = order << 1;
70
71 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
72}
73
60static inline void enable_gart_translation(struct pci_dev *dev, u64 addr) 74static inline void enable_gart_translation(struct pci_dev *dev, u64 addr)
61{ 75{
62 u32 tmp, ctl; 76 u32 tmp, ctl;
63 77
64 /* address of the mappings table */ 78 /* address of the mappings table */
65 addr >>= 12; 79 addr >>= 12;
66 tmp = (u32) addr<<4; 80 tmp = (u32) addr<<4;
67 tmp &= ~0xf; 81 tmp &= ~0xf;
68 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp); 82 pci_write_config_dword(dev, AMD64_GARTTABLEBASE, tmp);
69 83
70 /* Enable GART translation for this hammer. */ 84 /* Enable GART translation for this hammer. */
71 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 85 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
72 ctl |= GARTEN; 86 ctl |= GARTEN | DISTLBWALKPRB;
73 ctl &= ~(DISGARTCPU | DISGARTIO); 87 ctl &= ~(DISGARTCPU | DISGARTIO);
74 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl); 88 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, ctl);
75} 89}
76 90
77static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size) 91static inline int aperture_valid(u64 aper_base, u32 aper_size, u32 min_size)
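gart_set_size_and_enable() and enable_gart_translation() above are mostly register bit arithmetic. The same calculations reproduced stand-alone (no PCI accesses; the order and table address are made-up examples, and the size-only control value is reused here as if it had just been read back from the device):

#include <stdio.h>
#include <stdint.h>

#define GARTEN        (1u << 0)
#define DISGARTCPU    (1u << 4)
#define DISGARTIO     (1u << 5)
#define DISTLBWALKPRB (1u << 6)

int main(void)
{
    uint32_t order = 3;              /* made-up aperture order */
    uint64_t addr  = 0x12345000ULL;  /* made-up GART table address */
    uint32_t ctl, tmp;

    /* gart_set_size_and_enable(): size field only, translation still off */
    ctl = order << 1;
    printf("APERTURECTL (size only) = %#x\n", ctl);

    /* enable_gart_translation(): encode the mappings table base */
    addr >>= 12;
    tmp = (uint32_t)addr << 4;
    tmp &= ~0xfu;
    printf("GARTTABLEBASE           = %#x\n", tmp);

    /* ...then turn translation on, keeping CPU/IO accesses enabled */
    ctl |= GARTEN | DISTLBWALKPRB;
    ctl &= ~(DISGARTCPU | DISGARTIO);
    printf("APERTURECTL (enabled)   = %#x\n", ctl);
    return 0;
}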
diff --git a/arch/x86/include/asm/gpio.h b/arch/x86/include/asm/gpio.h
index 49dbfdfa50f9..91d915a65259 100644
--- a/arch/x86/include/asm/gpio.h
+++ b/arch/x86/include/asm/gpio.h
@@ -38,12 +38,9 @@ static inline int gpio_cansleep(unsigned int gpio)
38 return __gpio_cansleep(gpio); 38 return __gpio_cansleep(gpio);
39} 39}
40 40
41/*
42 * Not implemented, yet.
43 */
44static inline int gpio_to_irq(unsigned int gpio) 41static inline int gpio_to_irq(unsigned int gpio)
45{ 42{
46 return -ENOSYS; 43 return __gpio_to_irq(gpio);
47} 44}
48 45
49static inline int irq_to_gpio(unsigned int irq) 46static inline int irq_to_gpio(unsigned int irq)
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index aeab29aee617..55e4de613f0e 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -14,7 +14,7 @@ typedef struct {
14#endif 14#endif
15 unsigned int x86_platform_ipis; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_irq_work_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
19 unsigned int irq_resched_count; 19 unsigned int irq_resched_count;
20 unsigned int irq_call_count; 20 unsigned int irq_call_count;
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 8caac76ac324..3bd04022fd0c 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -59,11 +59,12 @@ extern void kunmap_high(struct page *page);
59 59
60void *kmap(struct page *page); 60void *kmap(struct page *page);
61void kunmap(struct page *page); 61void kunmap(struct page *page);
62void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot); 62
63void *kmap_atomic(struct page *page, enum km_type type); 63void *kmap_atomic_prot(struct page *page, pgprot_t prot);
64void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type); 64void *__kmap_atomic(struct page *page);
65void *kmap_atomic_pfn(unsigned long pfn, enum km_type type); 65void __kunmap_atomic(void *kvaddr);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_pfn(unsigned long pfn);
67void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 68struct page *kmap_atomic_to_page(void *ptr);
68 69
69#define flush_cache_kmaps() do { } while (0) 70#define flush_cache_kmaps() do { } while (0)
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1d5c08a1bdfd..2c392d663dce 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -74,10 +74,12 @@ extern void hpet_disable(void);
74extern unsigned int hpet_readl(unsigned int a); 74extern unsigned int hpet_readl(unsigned int a);
75extern void force_hpet_resume(void); 75extern void force_hpet_resume(void);
76 76
77extern void hpet_msi_unmask(unsigned int irq); 77struct irq_data;
78extern void hpet_msi_mask(unsigned int irq); 78extern void hpet_msi_unmask(struct irq_data *data);
79extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg); 79extern void hpet_msi_mask(struct irq_data *data);
80extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 80struct hpet_dev;
81extern void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg);
82extern void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg);
81 83
82#ifdef CONFIG_PCI_MSI 84#ifdef CONFIG_PCI_MSI
83extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id); 85extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index c17411503f28..c490d89a9b7b 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -29,7 +29,7 @@
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void x86_platform_ipi(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void irq_work_interrupt(void);
33 33
34extern void spurious_interrupt(void); 34extern void spurious_interrupt(void);
35extern void thermal_interrupt(void); 35extern void thermal_interrupt(void);
@@ -45,6 +45,30 @@ extern void invalidate_interrupt4(void);
45extern void invalidate_interrupt5(void); 45extern void invalidate_interrupt5(void);
46extern void invalidate_interrupt6(void); 46extern void invalidate_interrupt6(void);
47extern void invalidate_interrupt7(void); 47extern void invalidate_interrupt7(void);
48extern void invalidate_interrupt8(void);
49extern void invalidate_interrupt9(void);
50extern void invalidate_interrupt10(void);
51extern void invalidate_interrupt11(void);
52extern void invalidate_interrupt12(void);
53extern void invalidate_interrupt13(void);
54extern void invalidate_interrupt14(void);
55extern void invalidate_interrupt15(void);
56extern void invalidate_interrupt16(void);
57extern void invalidate_interrupt17(void);
58extern void invalidate_interrupt18(void);
59extern void invalidate_interrupt19(void);
60extern void invalidate_interrupt20(void);
61extern void invalidate_interrupt21(void);
62extern void invalidate_interrupt22(void);
63extern void invalidate_interrupt23(void);
64extern void invalidate_interrupt24(void);
65extern void invalidate_interrupt25(void);
66extern void invalidate_interrupt26(void);
67extern void invalidate_interrupt27(void);
68extern void invalidate_interrupt28(void);
69extern void invalidate_interrupt29(void);
70extern void invalidate_interrupt30(void);
71extern void invalidate_interrupt31(void);
48 72
49extern void irq_move_cleanup_interrupt(void); 73extern void irq_move_cleanup_interrupt(void);
50extern void reboot_interrupt(void); 74extern void reboot_interrupt(void);
@@ -80,6 +104,13 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
80 irq_attr->polarity = polarity; 104 irq_attr->polarity = polarity;
81} 105}
82 106
107struct irq_2_iommu {
108 struct intel_iommu *iommu;
109 u16 irte_index;
110 u16 sub_handle;
111 u8 irte_mask;
112};
113
83/* 114/*
84 * This is performance-critical, we want to do it O(1) 115 * This is performance-critical, we want to do it O(1)
85 * 116 *
@@ -91,15 +122,17 @@ struct irq_cfg {
91 cpumask_var_t old_domain; 122 cpumask_var_t old_domain;
92 u8 vector; 123 u8 vector;
93 u8 move_in_progress : 1; 124 u8 move_in_progress : 1;
125#ifdef CONFIG_INTR_REMAP
126 struct irq_2_iommu irq_2_iommu;
127#endif
94}; 128};
95 129
96extern struct irq_cfg *irq_cfg(unsigned int);
97extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *); 130extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
98extern void send_cleanup_vector(struct irq_cfg *); 131extern void send_cleanup_vector(struct irq_cfg *);
99 132
100struct irq_desc; 133struct irq_data;
101extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *, 134int __ioapic_set_affinity(struct irq_data *, const struct cpumask *,
102 unsigned int *dest_id); 135 unsigned int *dest_id);
103extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr); 136extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
104extern void setup_ioapic_dest(void); 137extern void setup_ioapic_dest(void);
105 138
diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h
index ff2546ce7178..7a15153c675d 100644
--- a/arch/x86/include/asm/hypervisor.h
+++ b/arch/x86/include/asm/hypervisor.h
@@ -20,6 +20,9 @@
20#ifndef _ASM_X86_HYPERVISOR_H 20#ifndef _ASM_X86_HYPERVISOR_H
21#define _ASM_X86_HYPERVISOR_H 21#define _ASM_X86_HYPERVISOR_H
22 22
23#include <asm/kvm_para.h>
24#include <asm/xen/hypervisor.h>
25
23extern void init_hypervisor(struct cpuinfo_x86 *c); 26extern void init_hypervisor(struct cpuinfo_x86 *c);
24extern void init_hypervisor_platform(void); 27extern void init_hypervisor_platform(void);
25 28
@@ -47,4 +50,13 @@ extern const struct hypervisor_x86 x86_hyper_vmware;
47extern const struct hypervisor_x86 x86_hyper_ms_hyperv; 50extern const struct hypervisor_x86 x86_hyper_ms_hyperv;
48extern const struct hypervisor_x86 x86_hyper_xen_hvm; 51extern const struct hypervisor_x86 x86_hyper_xen_hvm;
49 52
53static inline bool hypervisor_x2apic_available(void)
54{
55 if (kvm_para_available())
56 return true;
57 if (xen_x2apic_para_available())
58 return true;
59 return false;
60}
61
50#endif 62#endif
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index a73a8d5a5e69..c9e09ea05644 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -55,6 +55,12 @@ extern int save_i387_xstate_ia32(void __user *buf);
55extern int restore_i387_xstate_ia32(void __user *buf); 55extern int restore_i387_xstate_ia32(void __user *buf);
56#endif 56#endif
57 57
58#ifdef CONFIG_MATH_EMULATION
59extern void finit_soft_fpu(struct i387_soft_struct *soft);
60#else
61static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
62#endif
63
58#define X87_FSW_ES (1 << 7) /* Exception Summary */ 64#define X87_FSW_ES (1 << 7) /* Exception Summary */
59 65
60static __always_inline __pure bool use_xsaveopt(void) 66static __always_inline __pure bool use_xsaveopt(void)
@@ -67,6 +73,11 @@ static __always_inline __pure bool use_xsave(void)
67 return static_cpu_has(X86_FEATURE_XSAVE); 73 return static_cpu_has(X86_FEATURE_XSAVE);
68} 74}
69 75
76static __always_inline __pure bool use_fxsr(void)
77{
78 return static_cpu_has(X86_FEATURE_FXSR);
79}
80
70extern void __sanitize_i387_state(struct task_struct *); 81extern void __sanitize_i387_state(struct task_struct *);
71 82
72static inline void sanitize_i387_state(struct task_struct *tsk) 83static inline void sanitize_i387_state(struct task_struct *tsk)
@@ -77,20 +88,13 @@ static inline void sanitize_i387_state(struct task_struct *tsk)
77} 88}
78 89
79#ifdef CONFIG_X86_64 90#ifdef CONFIG_X86_64
80
81/* Ignore delayed exceptions from user space */
82static inline void tolerant_fwait(void)
83{
84 asm volatile("1: fwait\n"
85 "2:\n"
86 _ASM_EXTABLE(1b, 2b));
87}
88
89static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 91static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
90{ 92{
91 int err; 93 int err;
92 94
93 asm volatile("1: rex64/fxrstor (%[fx])\n\t" 95 /* See comment in fxsave() below. */
96#ifdef CONFIG_AS_FXSAVEQ
97 asm volatile("1: fxrstorq %[fx]\n\t"
94 "2:\n" 98 "2:\n"
95 ".section .fixup,\"ax\"\n" 99 ".section .fixup,\"ax\"\n"
96 "3: movl $-1,%[err]\n" 100 "3: movl $-1,%[err]\n"
@@ -98,44 +102,21 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
98 ".previous\n" 102 ".previous\n"
99 _ASM_EXTABLE(1b, 3b) 103 _ASM_EXTABLE(1b, 3b)
100 : [err] "=r" (err) 104 : [err] "=r" (err)
101#if 0 /* See comment in fxsave() below. */ 105 : [fx] "m" (*fx), "0" (0));
102 : [fx] "r" (fx), "m" (*fx), "0" (0));
103#else 106#else
104 : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); 107 asm volatile("1: rex64/fxrstor (%[fx])\n\t"
108 "2:\n"
109 ".section .fixup,\"ax\"\n"
110 "3: movl $-1,%[err]\n"
111 " jmp 2b\n"
112 ".previous\n"
113 _ASM_EXTABLE(1b, 3b)
114 : [err] "=r" (err)
115 : [fx] "R" (fx), "m" (*fx), "0" (0));
105#endif 116#endif
106 return err; 117 return err;
107} 118}
108 119
109/* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception
110 is pending. Clear the x87 state here by setting it to fixed
111 values. The kernel data segment can be sometimes 0 and sometimes
112 new user value. Both should be ok.
113 Use the PDA as safe address because it should be already in L1. */
114static inline void fpu_clear(struct fpu *fpu)
115{
116 struct xsave_struct *xstate = &fpu->state->xsave;
117 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
118
119 /*
120 * xsave header may indicate the init state of the FP.
121 */
122 if (use_xsave() &&
123 !(xstate->xsave_hdr.xstate_bv & XSTATE_FP))
124 return;
125
126 if (unlikely(fx->swd & X87_FSW_ES))
127 asm volatile("fnclex");
128 alternative_input(ASM_NOP8 ASM_NOP2,
129 " emms\n" /* clear stack tags */
130 " fildl %%gs:0", /* load to clear state */
131 X86_FEATURE_FXSAVE_LEAK);
132}
133
134static inline void clear_fpu_state(struct task_struct *tsk)
135{
136 fpu_clear(&tsk->thread.fpu);
137}
138
139static inline int fxsave_user(struct i387_fxsave_struct __user *fx) 120static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
140{ 121{
141 int err; 122 int err;
@@ -149,6 +130,18 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
149 if (unlikely(err)) 130 if (unlikely(err))
150 return -EFAULT; 131 return -EFAULT;
151 132
133 /* See comment in fxsave() below. */
134#ifdef CONFIG_AS_FXSAVEQ
135 asm volatile("1: fxsaveq %[fx]\n\t"
136 "2:\n"
137 ".section .fixup,\"ax\"\n"
138 "3: movl $-1,%[err]\n"
139 " jmp 2b\n"
140 ".previous\n"
141 _ASM_EXTABLE(1b, 3b)
142 : [err] "=r" (err), [fx] "=m" (*fx)
143 : "0" (0));
144#else
152 asm volatile("1: rex64/fxsave (%[fx])\n\t" 145 asm volatile("1: rex64/fxsave (%[fx])\n\t"
153 "2:\n" 146 "2:\n"
154 ".section .fixup,\"ax\"\n" 147 ".section .fixup,\"ax\"\n"
@@ -157,10 +150,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx)
157 ".previous\n" 150 ".previous\n"
158 _ASM_EXTABLE(1b, 3b) 151 _ASM_EXTABLE(1b, 3b)
159 : [err] "=r" (err), "=m" (*fx) 152 : [err] "=r" (err), "=m" (*fx)
160#if 0 /* See comment in fxsave() below. */ 153 : [fx] "R" (fx), "0" (0));
161 : [fx] "r" (fx), "0" (0));
162#else
163 : [fx] "cdaSDb" (fx), "0" (0));
164#endif 154#endif
165 if (unlikely(err) && 155 if (unlikely(err) &&
166 __clear_user(fx, sizeof(struct i387_fxsave_struct))) 156 __clear_user(fx, sizeof(struct i387_fxsave_struct)))
@@ -175,56 +165,29 @@ static inline void fpu_fxsave(struct fpu *fpu)
175 uses any extended registers for addressing, a second REX prefix 165 uses any extended registers for addressing, a second REX prefix
176 will be generated (to the assembler, rex64 followed by semicolon 166 will be generated (to the assembler, rex64 followed by semicolon
177 is a separate instruction), and hence the 64-bitness is lost. */ 167 is a separate instruction), and hence the 64-bitness is lost. */
178#if 0 168
169#ifdef CONFIG_AS_FXSAVEQ
179 /* Using "fxsaveq %0" would be the ideal choice, but is only supported 170 /* Using "fxsaveq %0" would be the ideal choice, but is only supported
180 starting with gas 2.16. */ 171 starting with gas 2.16. */
181 __asm__ __volatile__("fxsaveq %0" 172 __asm__ __volatile__("fxsaveq %0"
182 : "=m" (fpu->state->fxsave)); 173 : "=m" (fpu->state->fxsave));
183#elif 0 174#else
184 /* Using, as a workaround, the properly prefixed form below isn't 175 /* Using, as a workaround, the properly prefixed form below isn't
185 accepted by any binutils version so far released, complaining that 176 accepted by any binutils version so far released, complaining that
186 the same type of prefix is used twice if an extended register is 177 the same type of prefix is used twice if an extended register is
187 needed for addressing (fix submitted to mainline 2005-11-21). */ 178 needed for addressing (fix submitted to mainline 2005-11-21).
188 __asm__ __volatile__("rex64/fxsave %0" 179 asm volatile("rex64/fxsave %0"
189 : "=m" (fpu->state->fxsave)); 180 : "=m" (fpu->state->fxsave));
190#else 181 This, however, we can work around by forcing the compiler to select
191 /* This, however, we can work around by forcing the compiler to select
192 an addressing mode that doesn't require extended registers. */ 182 an addressing mode that doesn't require extended registers. */
193 __asm__ __volatile__("rex64/fxsave (%1)" 183 asm volatile("rex64/fxsave (%[fx])"
194 : "=m" (fpu->state->fxsave) 184 : "=m" (fpu->state->fxsave)
195 : "cdaSDb" (&fpu->state->fxsave)); 185 : [fx] "R" (&fpu->state->fxsave));
196#endif 186#endif
197} 187}
198 188
199static inline void fpu_save_init(struct fpu *fpu)
200{
201 if (use_xsave())
202 fpu_xsave(fpu);
203 else
204 fpu_fxsave(fpu);
205
206 fpu_clear(fpu);
207}
208
209static inline void __save_init_fpu(struct task_struct *tsk)
210{
211 fpu_save_init(&tsk->thread.fpu);
212 task_thread_info(tsk)->status &= ~TS_USEDFPU;
213}
214
215#else /* CONFIG_X86_32 */ 189#else /* CONFIG_X86_32 */
216 190
217#ifdef CONFIG_MATH_EMULATION
218extern void finit_soft_fpu(struct i387_soft_struct *soft);
219#else
220static inline void finit_soft_fpu(struct i387_soft_struct *soft) {}
221#endif
222
223static inline void tolerant_fwait(void)
224{
225 asm volatile("fnclex ; fwait");
226}
227
228/* perform fxrstor iff the processor has extended states, otherwise frstor */ 191/* perform fxrstor iff the processor has extended states, otherwise frstor */
229static inline int fxrstor_checking(struct i387_fxsave_struct *fx) 192static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
230{ 193{
@@ -241,6 +204,14 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
241 return 0; 204 return 0;
242} 205}
243 206
207static inline void fpu_fxsave(struct fpu *fpu)
208{
209 asm volatile("fxsave %[fx]"
210 : [fx] "=m" (fpu->state->fxsave));
211}
212
213#endif /* CONFIG_X86_64 */
214
244/* We need a safe address that is cheap to find and that is already 215/* We need a safe address that is cheap to find and that is already
245 in L1 during context switch. The best choices are unfortunately 216 in L1 during context switch. The best choices are unfortunately
246 different for UP and SMP */ 217 different for UP and SMP */
@@ -256,47 +227,33 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx)
256static inline void fpu_save_init(struct fpu *fpu) 227static inline void fpu_save_init(struct fpu *fpu)
257{ 228{
258 if (use_xsave()) { 229 if (use_xsave()) {
259 struct xsave_struct *xstate = &fpu->state->xsave;
260 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
261
262 fpu_xsave(fpu); 230 fpu_xsave(fpu);
263 231
264 /* 232 /*
265 * xsave header may indicate the init state of the FP. 233 * xsave header may indicate the init state of the FP.
266 */ 234 */
267 if (!(xstate->xsave_hdr.xstate_bv & XSTATE_FP)) 235 if (!(fpu->state->xsave.xsave_hdr.xstate_bv & XSTATE_FP))
268 goto end; 236 return;
269 237 } else if (use_fxsr()) {
270 if (unlikely(fx->swd & X87_FSW_ES)) 238 fpu_fxsave(fpu);
271 asm volatile("fnclex"); 239 } else {
272 240 asm volatile("fnsave %[fx]; fwait"
273 /* 241 : [fx] "=m" (fpu->state->fsave));
274 * we can do a simple return here or be paranoid :) 242 return;
275 */
276 goto clear_state;
277 } 243 }
278 244
279 /* Use more nops than strictly needed in case the compiler 245 if (unlikely(fpu->state->fxsave.swd & X87_FSW_ES))
280 varies code */ 246 asm volatile("fnclex");
281 alternative_input( 247
282 "fnsave %[fx] ;fwait;" GENERIC_NOP8 GENERIC_NOP4,
283 "fxsave %[fx]\n"
284 "bt $7,%[fsw] ; jnc 1f ; fnclex\n1:",
285 X86_FEATURE_FXSR,
286 [fx] "m" (fpu->state->fxsave),
287 [fsw] "m" (fpu->state->fxsave.swd) : "memory");
288clear_state:
289 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception 248 /* AMD K7/K8 CPUs don't save/restore FDP/FIP/FOP unless an exception
290 is pending. Clear the x87 state here by setting it to fixed 249 is pending. Clear the x87 state here by setting it to fixed
291 values. safe_address is a random variable that should be in L1 */ 250 values. safe_address is a random variable that should be in L1 */
292 alternative_input( 251 alternative_input(
293 GENERIC_NOP8 GENERIC_NOP2, 252 ASM_NOP8 ASM_NOP2,
294 "emms\n\t" /* clear stack tags */ 253 "emms\n\t" /* clear stack tags */
295 "fildl %[addr]", /* set F?P to defined value */ 254 "fildl %P[addr]", /* set F?P to defined value */
296 X86_FEATURE_FXSAVE_LEAK, 255 X86_FEATURE_FXSAVE_LEAK,
297 [addr] "m" (safe_address)); 256 [addr] "m" (safe_address));
298end:
299 ;
300} 257}
301 258
302static inline void __save_init_fpu(struct task_struct *tsk) 259static inline void __save_init_fpu(struct task_struct *tsk)
@@ -305,9 +262,6 @@ static inline void __save_init_fpu(struct task_struct *tsk)
305 task_thread_info(tsk)->status &= ~TS_USEDFPU; 262 task_thread_info(tsk)->status &= ~TS_USEDFPU;
306} 263}
307 264
308
309#endif /* CONFIG_X86_64 */
310
311static inline int fpu_fxrstor_checking(struct fpu *fpu) 265static inline int fpu_fxrstor_checking(struct fpu *fpu)
312{ 266{
313 return fxrstor_checking(&fpu->state->fxsave); 267 return fxrstor_checking(&fpu->state->fxsave);
@@ -344,7 +298,10 @@ static inline void __unlazy_fpu(struct task_struct *tsk)
344static inline void __clear_fpu(struct task_struct *tsk) 298static inline void __clear_fpu(struct task_struct *tsk)
345{ 299{
346 if (task_thread_info(tsk)->status & TS_USEDFPU) { 300 if (task_thread_info(tsk)->status & TS_USEDFPU) {
347 tolerant_fwait(); 301 /* Ignore delayed exceptions from user space */
302 asm volatile("1: fwait\n"
303 "2:\n"
304 _ASM_EXTABLE(1b, 2b));
348 task_thread_info(tsk)->status &= ~TS_USEDFPU; 305 task_thread_info(tsk)->status &= ~TS_USEDFPU;
349 stts(); 306 stts();
350 } 307 }
@@ -405,19 +362,6 @@ static inline void irq_ts_restore(int TS_state)
405 stts(); 362 stts();
406} 363}
407 364
408#ifdef CONFIG_X86_64
409
410static inline void save_init_fpu(struct task_struct *tsk)
411{
412 __save_init_fpu(tsk);
413 stts();
414}
415
416#define unlazy_fpu __unlazy_fpu
417#define clear_fpu __clear_fpu
418
419#else /* CONFIG_X86_32 */
420
421/* 365/*
422 * These disable preemption on their own and are safe 366 * These disable preemption on their own and are safe
423 */ 367 */
@@ -443,8 +387,6 @@ static inline void clear_fpu(struct task_struct *tsk)
443 preempt_enable(); 387 preempt_enable();
444} 388}
445 389
446#endif /* CONFIG_X86_64 */
447
448/* 390/*
449 * i387 state interaction 391 * i387 state interaction
450 */ 392 */
@@ -508,7 +450,4 @@ extern void fpu_finit(struct fpu *fpu);
508 450
509#endif /* __ASSEMBLY__ */ 451#endif /* __ASSEMBLY__ */
510 452
511#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
512#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
513
514#endif /* _ASM_X86_I387_H */ 453#endif /* _ASM_X86_I387_H */
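The reworked fpu_save_init() above picks one of three save mechanisms and only then does the fnclex and FXSAVE-leak cleanup. Below is a small model of that control flow with stub save routines so the branches can be followed in isolation; the feature flags, the fake_fpu fields and the stubs are stand-ins, and only the X87_FSW_ES and XSTATE_FP bit tests mirror the header.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define X87_FSW_ES (1 << 7)    /* Exception Summary bit, as in the header */
#define XSTATE_FP  0x1         /* FP bit in the xsave header */

static bool cpu_has_xsave = false;  /* pretend CPU feature flags */
static bool cpu_has_fxsr  = true;

struct fake_fpu { uint64_t xstate_bv; uint16_t swd; };

static void do_xsave(struct fake_fpu *f)  { (void)f; puts("xsave"); }
static void do_fxsave(struct fake_fpu *f) { (void)f; puts("fxsave"); }
static void do_fnsave(struct fake_fpu *f) { (void)f; puts("fnsave; fwait"); }

static void model_fpu_save_init(struct fake_fpu *f)
{
    if (cpu_has_xsave) {
        do_xsave(f);
        if (!(f->xstate_bv & XSTATE_FP))
            return;            /* FP state still in its init state */
    } else if (cpu_has_fxsr) {
        do_fxsave(f);
    } else {
        do_fnsave(f);
        return;                /* fnsave already reinitialized the FPU */
    }

    if (f->swd & X87_FSW_ES)
        puts("fnclex (exception summary was pending)");
    puts("emms/fildl leak workaround");
}

int main(void)
{
    struct fake_fpu f = { .xstate_bv = XSTATE_FP, .swd = X87_FSW_ES };

    model_fpu_save_init(&f);
    return 0;
}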
diff --git a/arch/x86/include/asm/i8253.h b/arch/x86/include/asm/i8253.h
index fc1f579fb965..65aaa91d5850 100644
--- a/arch/x86/include/asm/i8253.h
+++ b/arch/x86/include/asm/i8253.h
@@ -6,6 +6,8 @@
6#define PIT_CH0 0x40 6#define PIT_CH0 0x40
7#define PIT_CH2 0x42 7#define PIT_CH2 0x42
8 8
9#define PIT_LATCH LATCH
10
9extern raw_spinlock_t i8253_lock; 11extern raw_spinlock_t i8253_lock;
10 12
11extern struct clock_event_device *global_clock_event; 13extern struct clock_event_device *global_clock_event;
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 1655147646aa..a20365953bf8 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -55,6 +55,8 @@ extern struct irq_chip i8259A_chip;
55struct legacy_pic { 55struct legacy_pic {
56 int nr_legacy_irqs; 56 int nr_legacy_irqs;
57 struct irq_chip *chip; 57 struct irq_chip *chip;
58 void (*mask)(unsigned int irq);
59 void (*unmask)(unsigned int irq);
58 void (*mask_all)(void); 60 void (*mask_all)(void);
59 void (*restore_mask)(void); 61 void (*restore_mask)(void);
60 void (*init)(int auto_eoi); 62 void (*init)(int auto_eoi);
diff --git a/arch/x86/include/asm/idle.h b/arch/x86/include/asm/idle.h
index 38d87379e270..f49253d75710 100644
--- a/arch/x86/include/asm/idle.h
+++ b/arch/x86/include/asm/idle.h
@@ -16,6 +16,6 @@ static inline void enter_idle(void) { }
16static inline void exit_idle(void) { } 16static inline void exit_idle(void) { }
17#endif /* CONFIG_X86_64 */ 17#endif /* CONFIG_X86_64 */
18 18
19void c1e_remove_cpu(int cpu); 19void amd_e400_remove_cpu(int cpu);
20 20
21#endif /* _ASM_X86_IDLE_H */ 21#endif /* _ASM_X86_IDLE_H */
diff --git a/arch/x86/include/asm/init.h b/arch/x86/include/asm/init.h
index 36fb1a6a5109..8dbe353e41e1 100644
--- a/arch/x86/include/asm/init.h
+++ b/arch/x86/include/asm/init.h
@@ -11,8 +11,8 @@ kernel_physical_mapping_init(unsigned long start,
11 unsigned long page_size_mask); 11 unsigned long page_size_mask);
12 12
13 13
14extern unsigned long __initdata e820_table_start; 14extern unsigned long __initdata pgt_buf_start;
15extern unsigned long __meminitdata e820_table_end; 15extern unsigned long __meminitdata pgt_buf_end;
16extern unsigned long __meminitdata e820_table_top; 16extern unsigned long __meminitdata pgt_buf_top;
17 17
18#endif /* _ASM_X86_INIT_32_H */ 18#endif /* _ASM_X86_INIT_32_H */
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 30a3e9776123..d02804d650c4 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -38,9 +38,10 @@
38 38
39#include <linux/string.h> 39#include <linux/string.h>
40#include <linux/compiler.h> 40#include <linux/compiler.h>
41#include <asm-generic/int-ll64.h>
42#include <asm/page.h> 41#include <asm/page.h>
43 42
43#include <xen/xen.h>
44
44#define build_mmio_read(name, size, type, reg, barrier) \ 45#define build_mmio_read(name, size, type, reg, barrier) \
45static inline type name(const volatile void __iomem *addr) \ 46static inline type name(const volatile void __iomem *addr) \
46{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \ 47{ type ret; asm volatile("mov" size " %1,%0":reg (ret) \
@@ -85,27 +86,6 @@ build_mmio_write(__writel, "l", unsigned int, "r", )
85build_mmio_read(readq, "q", unsigned long, "=r", :"memory") 86build_mmio_read(readq, "q", unsigned long, "=r", :"memory")
86build_mmio_write(writeq, "q", unsigned long, "r", :"memory") 87build_mmio_write(writeq, "q", unsigned long, "r", :"memory")
87 88
88#else
89
90static inline __u64 readq(const volatile void __iomem *addr)
91{
92 const volatile u32 __iomem *p = addr;
93 u32 low, high;
94
95 low = readl(p);
96 high = readl(p + 1);
97
98 return low + ((u64)high << 32);
99}
100
101static inline void writeq(__u64 val, volatile void __iomem *addr)
102{
103 writel(val, addr);
104 writel(val >> 32, addr+4);
105}
106
107#endif
108
109#define readq_relaxed(a) readq(a) 89#define readq_relaxed(a) readq(a)
110 90
111#define __raw_readq(a) readq(a) 91#define __raw_readq(a) readq(a)
@@ -115,6 +95,8 @@ static inline void writeq(__u64 val, volatile void __iomem *addr)
115#define readq readq 95#define readq readq
116#define writeq writeq 96#define writeq writeq
117 97
98#endif
99
118/** 100/**
119 * virt_to_phys - map virtual addresses to physical 101 * virt_to_phys - map virtual addresses to physical
120 * @address: address to remap 102 * @address: address to remap
@@ -206,6 +188,7 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
206 188
207extern void iounmap(volatile void __iomem *addr); 189extern void iounmap(volatile void __iomem *addr);
208 190
191extern void set_iounmap_nonlazy(void);
209 192
210#ifdef __KERNEL__ 193#ifdef __KERNEL__
211 194
@@ -348,6 +331,18 @@ extern void __iomem *early_memremap(resource_size_t phys_addr,
348 unsigned long size); 331 unsigned long size);
349extern void early_iounmap(void __iomem *addr, unsigned long size); 332extern void early_iounmap(void __iomem *addr, unsigned long size);
350extern void fixup_early_ioremap(void); 333extern void fixup_early_ioremap(void);
334extern bool is_early_ioremap_ptep(pte_t *ptep);
335
336#ifdef CONFIG_XEN
337struct bio_vec;
338
339extern bool xen_biovec_phys_mergeable(const struct bio_vec *vec1,
340 const struct bio_vec *vec2);
341
342#define BIOVEC_PHYS_MERGEABLE(vec1, vec2) \
343 (__BIOVEC_PHYS_MERGEABLE(vec1, vec2) && \
344 (!xen_domain() || xen_biovec_phys_mergeable(vec1, vec2)))
345#endif /* CONFIG_XEN */
351 346
352#define IO_SPACE_LIMIT 0xffff 347#define IO_SPACE_LIMIT 0xffff
353 348
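The 32-bit readq/writeq fallback removed above composed a 64-bit value out of two 32-bit accesses. The same composition and splitting on ordinary variables, purely as an illustration of the arithmetic:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
    uint32_t low = 0x89abcdefu, high = 0x01234567u;
    uint64_t val = low + ((uint64_t)high << 32);

    printf("composed: %#llx\n", (unsigned long long)val);
    printf("split:    low=%#x high=%#x\n",
           (uint32_t)val, (uint32_t)(val >> 32));
    return 0;
}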
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 9cb2edb87c2f..690d1cc9a877 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -63,17 +63,6 @@ union IO_APIC_reg_03 {
63 } __attribute__ ((packed)) bits; 63 } __attribute__ ((packed)) bits;
64}; 64};
65 65
66enum ioapic_irq_destination_types {
67 dest_Fixed = 0,
68 dest_LowestPrio = 1,
69 dest_SMI = 2,
70 dest__reserved_1 = 3,
71 dest_NMI = 4,
72 dest_INIT = 5,
73 dest__reserved_2 = 6,
74 dest_ExtINT = 7
75};
76
77struct IO_APIC_route_entry { 66struct IO_APIC_route_entry {
78 __u32 vector : 8, 67 __u32 vector : 8,
79 delivery_mode : 3, /* 000: FIXED 68 delivery_mode : 3, /* 000: FIXED
@@ -106,18 +95,22 @@ struct IR_IO_APIC_route_entry {
106 index : 15; 95 index : 15;
107} __attribute__ ((packed)); 96} __attribute__ ((packed));
108 97
98#define IOAPIC_AUTO -1
99#define IOAPIC_EDGE 0
100#define IOAPIC_LEVEL 1
101
109#ifdef CONFIG_X86_IO_APIC 102#ifdef CONFIG_X86_IO_APIC
110 103
111/* 104/*
112 * # of IO-APICs and # of IRQ routing registers 105 * # of IO-APICs and # of IRQ routing registers
113 */ 106 */
114extern int nr_ioapics; 107extern int nr_ioapics;
115extern int nr_ioapic_registers[MAX_IO_APICS];
116 108
117#define MP_MAX_IOAPIC_PIN 127 109extern int mpc_ioapic_id(int ioapic);
110extern unsigned int mpc_ioapic_addr(int ioapic);
111extern struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int ioapic);
118 112
119/* I/O APIC entries */ 113#define MP_MAX_IOAPIC_PIN 127
120extern struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
121 114
122/* # of MP IRQ source entries */ 115/* # of MP IRQ source entries */
123extern int mp_irq_entries; 116extern int mp_irq_entries;
@@ -150,33 +143,23 @@ extern int timer_through_8259;
150#define io_apic_assign_pci_irqs \ 143#define io_apic_assign_pci_irqs \
151 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs) 144 (mp_irq_entries && !skip_ioapic_setup && io_apic_irqs)
152 145
153extern u8 io_apic_unique_id(u8 id);
154extern int io_apic_get_unique_id(int ioapic, int apic_id);
155extern int io_apic_get_version(int ioapic);
156extern int io_apic_get_redir_entries(int ioapic);
157
158struct io_apic_irq_attr; 146struct io_apic_irq_attr;
159extern int io_apic_set_pci_routing(struct device *dev, int irq, 147extern int io_apic_set_pci_routing(struct device *dev, int irq,
160 struct io_apic_irq_attr *irq_attr); 148 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi); 149void setup_IO_APIC_irq_extra(u32 gsi);
162extern void ioapic_init_mappings(void); 150extern void ioapic_and_gsi_init(void);
163extern void ioapic_insert_resources(void); 151extern void ioapic_insert_resources(void);
164 152
165extern struct IO_APIC_route_entry **alloc_ioapic_entries(void); 153int io_apic_setup_irq_pin_once(unsigned int irq, int node, struct io_apic_irq_attr *attr);
166extern void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries);
167extern int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
168extern void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
169extern int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries);
170 154
171extern void probe_nr_irqs_gsi(void); 155extern int save_ioapic_entries(void);
156extern void mask_ioapic_entries(void);
157extern int restore_ioapic_entries(void);
158
159extern int get_nr_irqs_gsi(void);
172 160
173extern int setup_ioapic_entry(int apic, int irq,
174 struct IO_APIC_route_entry *entry,
175 unsigned int destination, int trigger,
176 int polarity, int vector, int pin);
177extern void ioapic_write_entry(int apic, int pin,
178 struct IO_APIC_route_entry e);
179extern void setup_ioapic_ids_from_mpc(void); 161extern void setup_ioapic_ids_from_mpc(void);
162extern void setup_ioapic_ids_from_mpc_nocheck(void);
180 163
181struct mp_ioapic_gsi{ 164struct mp_ioapic_gsi{
182 u32 gsi_base; 165 u32 gsi_base;
@@ -189,20 +172,37 @@ int mp_find_ioapic_pin(int ioapic, u32 gsi);
189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 172void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void); 173extern void __init pre_init_apic_IRQ0(void);
191 174
175extern void mp_save_irq(struct mpc_intsrc *m);
176
177extern void disable_ioapic_support(void);
178
192#else /* !CONFIG_X86_IO_APIC */ 179#else /* !CONFIG_X86_IO_APIC */
193 180
194#define io_apic_assign_pci_irqs 0 181#define io_apic_assign_pci_irqs 0
195#define setup_ioapic_ids_from_mpc x86_init_noop 182#define setup_ioapic_ids_from_mpc x86_init_noop
196static const int timer_through_8259 = 0; 183static const int timer_through_8259 = 0;
197static inline void ioapic_init_mappings(void) { } 184static inline void ioapic_and_gsi_init(void) { }
198static inline void ioapic_insert_resources(void) { } 185static inline void ioapic_insert_resources(void) { }
199static inline void probe_nr_irqs_gsi(void) { }
200#define gsi_top (NR_IRQS_LEGACY) 186#define gsi_top (NR_IRQS_LEGACY)
201static inline int mp_find_ioapic(u32 gsi) { return 0; } 187static inline int mp_find_ioapic(u32 gsi) { return 0; }
202 188
203struct io_apic_irq_attr; 189struct io_apic_irq_attr;
204static inline int io_apic_set_pci_routing(struct device *dev, int irq, 190static inline int io_apic_set_pci_routing(struct device *dev, int irq,
205 struct io_apic_irq_attr *irq_attr) { return 0; } 191 struct io_apic_irq_attr *irq_attr) { return 0; }
192
193static inline int save_ioapic_entries(void)
194{
195 return -ENOMEM;
196}
197
198static inline void mask_ioapic_entries(void) { }
199static inline int restore_ioapic_entries(void)
200{
201 return -ENOMEM;
202}
203
204static inline void mp_save_irq(struct mpc_intsrc *m) { };
205static inline void disable_ioapic_support(void) { }
206#endif 206#endif
207 207
208#endif /* _ASM_X86_IO_APIC_H */ 208#endif /* _ASM_X86_IO_APIC_H */
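
In place of the alloc/free/save/mask/restore family removed above, callers now use three self-contained helpers that keep their snapshot internally. A rough sketch of the intended calling sequence, loosely modelled on the interrupt-remapping enable/resume path (the example_quiesce_ioapics() name is made up and error handling is abbreviated):

#include <asm/io_apic.h>

static int example_quiesce_ioapics(void)
{
	int ret;

	ret = save_ioapic_entries();	/* snapshot every RTE internally */
	if (ret)
		return ret;		/* -ENOMEM in the !CONFIG_X86_IO_APIC stub */

	mask_ioapic_entries();		/* mask all pins while reconfiguring */

	/* ... enable interrupt remapping, resume, or otherwise reprogram ... */

	return restore_ioapic_entries();
}
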
diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h
index c4191b3b7056..363e33eb6ec1 100644
--- a/arch/x86/include/asm/iomap.h
+++ b/arch/x86/include/asm/iomap.h
@@ -27,10 +27,10 @@
27#include <asm/tlbflush.h> 27#include <asm/tlbflush.h>
28 28
29void __iomem * 29void __iomem *
30iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 30iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot);
31 31
32void 32void
33iounmap_atomic(void __iomem *kvaddr, enum km_type type); 33iounmap_atomic(void __iomem *kvaddr);
34 34
35int 35int
36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); 36iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot);
diff --git a/arch/x86/include/asm/iommu_table.h b/arch/x86/include/asm/iommu_table.h
new file mode 100644
index 000000000000..f229b13a5f30
--- /dev/null
+++ b/arch/x86/include/asm/iommu_table.h
@@ -0,0 +1,100 @@
1#ifndef _ASM_X86_IOMMU_TABLE_H
2#define _ASM_X86_IOMMU_TABLE_H
3
4#include <asm/swiotlb.h>
5
6/*
7 * History lesson:
8 * The execution chain of IOMMUs in 2.6.36 looks as so:
9 *
10 * [xen-swiotlb]
11 * |
12 * +----[swiotlb *]--+
13 * / | \
14 * / | \
15 * [GART] [Calgary] [Intel VT-d]
16 * /
17 * /
18 * [AMD-Vi]
19 *
20 * *: if SWIOTLB detected 'iommu=soft'/'swiotlb=force' it would skip
21 * over the rest of IOMMUs and unconditionally initialize the SWIOTLB.
 22 * Also it would surreptitiously set swiotlb=1 if there was more than
 23 * 4GB of memory and the user did not pass in 'iommu=off'. The swiotlb
24 * flag would be turned off by all IOMMUs except the Calgary one.
25 *
26 * The IOMMU_INIT* macros allow a similar tree (or more complex if desired)
27 * to be built by defining who we depend on.
28 *
 29 * All that an IOMMU driver needs to do is use one of these macros, and
 30 * pci-dma.c will take care of the rest.
31 */
32
33struct iommu_table_entry {
34 initcall_t detect;
35 initcall_t depend;
 36	void (*early_init)(void); /* No memory allocator available. */
37 void (*late_init)(void); /* Yes, can allocate memory. */
38#define IOMMU_FINISH_IF_DETECTED (1<<0)
39#define IOMMU_DETECTED (1<<1)
40 int flags;
41};
42/*
 43 * The macro fills out an entry in the .iommu_table section with the
 44 * fields of 'struct iommu_table_entry'. The entries placed in that
 45 * section are not in any particular order, hence during boot time we
 46 * will have to sort them based on
 47 * their dependencies. */
48
49
50#define __IOMMU_INIT(_detect, _depend, _early_init, _late_init, _finish)\
51 static const struct iommu_table_entry const \
52 __iommu_entry_##_detect __used \
53 __attribute__ ((unused, __section__(".iommu_table"), \
54 aligned((sizeof(void *))))) \
55 = {_detect, _depend, _early_init, _late_init, \
56 _finish ? IOMMU_FINISH_IF_DETECTED : 0}
57/*
58 * The simplest IOMMU definition. Provide the detection routine
59 * and it will be run after the SWIOTLB and the other IOMMUs
 60 * that utilize this macro. If the IOMMU is detected (i.e., the
61 * detect routine returns a positive value), the other IOMMUs
62 * are also checked. You can use IOMMU_INIT_POST_FINISH if you prefer
63 * to stop detecting the other IOMMUs after yours has been detected.
64 */
65#define IOMMU_INIT_POST(_detect) \
66 __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 0)
67
 68#define IOMMU_INIT_POST_FINISH(_detect) \
69 __IOMMU_INIT(_detect, pci_swiotlb_detect_4gb, 0, 0, 1)
70
71/*
72 * A more sophisticated version of IOMMU_INIT. This variant requires:
 73 * a). A detection routine.
 74 * b). The name of the detection routine we depend on to get called
 75 * before us.
 76 * c). The early init routine, which gets called from pci_iommu_alloc
 77 * if the detection routine returned a positive value. At this point
 78 * no memory allocator is available yet.
79 * d). Similar to the 'init', except that this gets called from pci_iommu_init
80 * where we do have a memory allocator.
81 *
 82 * The standard variant differs from the _FINISH one in that the standard
 83 * variant will continue detecting other IOMMUs in the call list after
 84 * its detection routine returns a positive number, while the _FINISH one
 85 * will stop the execution chain. Both will still call the 'init' and
86 * 'late_init' functions if they are set.
87 */
88#define IOMMU_INIT_FINISH(_detect, _depend, _init, _late_init) \
89 __IOMMU_INIT(_detect, _depend, _init, _late_init, 1)
90
91#define IOMMU_INIT(_detect, _depend, _init, _late_init) \
92 __IOMMU_INIT(_detect, _depend, _init, _late_init, 0)
93
94void sort_iommu_table(struct iommu_table_entry *start,
95 struct iommu_table_entry *finish);
96
97void check_iommu_entries(struct iommu_table_entry *start,
98 struct iommu_table_entry *finish);
99
100#endif /* _ASM_X86_IOMMU_TABLE_H */
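
The registration flow described above has no example user in the header itself; a minimal sketch of how a detection routine could hook into the .iommu_table chain might look as follows (the my_iommu_* names are hypothetical; pci_swiotlb_detect_4gb is the dependency the IOMMU_INIT_POST* macros above already reference):

#include <linux/init.h>
#include <asm/iommu_table.h>

static void __init my_iommu_early_init(void)
{
	/* Called from pci_iommu_alloc(): no memory allocator available yet. */
}

static void __init my_iommu_late_init(void)
{
	/* Called from pci_iommu_init(): the memory allocator is usable now. */
}

static int __init my_iommu_detect(void)
{
	/* Return a positive value when the hardware is found. */
	return 0;
}

/* Run after SWIOTLB's 4GB check and stop the chain once detected. */
IOMMU_INIT_FINISH(my_iommu_detect,
		  pci_swiotlb_detect_4gb,
		  my_iommu_early_init,
		  my_iommu_late_init);
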
diff --git a/arch/x86/include/asm/ipi.h b/arch/x86/include/asm/ipi.h
index 0b7228268a63..615fa9061b57 100644
--- a/arch/x86/include/asm/ipi.h
+++ b/arch/x86/include/asm/ipi.h
@@ -123,10 +123,6 @@ extern void default_send_IPI_mask_sequence_phys(const struct cpumask *mask,
123 int vector); 123 int vector);
124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask, 124extern void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
125 int vector); 125 int vector);
126extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
127 int vector);
128extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
129 int vector);
130 126
131/* Avoid include hell */ 127/* Avoid include hell */
132#define NMI_VECTOR 0x02 128#define NMI_VECTOR 0x02
@@ -150,6 +146,10 @@ static inline void __default_local_send_IPI_all(int vector)
150} 146}
151 147
152#ifdef CONFIG_X86_32 148#ifdef CONFIG_X86_32
149extern void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
150 int vector);
151extern void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
152 int vector);
153extern void default_send_IPI_mask_logical(const struct cpumask *mask, 153extern void default_send_IPI_mask_logical(const struct cpumask *mask,
154 int vector); 154 int vector);
155extern void default_send_IPI_allbutself(int vector); 155extern void default_send_IPI_allbutself(int vector);
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index 5458380b6ef8..ba870bb6dd8e 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -15,22 +15,14 @@ static inline int irq_canonicalize(int irq)
15 return ((irq == 2) ? 9 : irq); 15 return ((irq == 2) ? 9 : irq);
16} 16}
17 17
18#ifdef CONFIG_X86_LOCAL_APIC 18#ifdef CONFIG_X86_32
19# define ARCH_HAS_NMI_WATCHDOG 19extern void irq_ctx_init(int cpu);
20#endif
21
22#ifdef CONFIG_4KSTACKS
23 extern void irq_ctx_init(int cpu);
24 extern void irq_ctx_exit(int cpu);
25# define __ARCH_HAS_DO_SOFTIRQ
26#else 20#else
27# define irq_ctx_init(cpu) do { } while (0) 21# define irq_ctx_init(cpu) do { } while (0)
28# define irq_ctx_exit(cpu) do { } while (0)
29# ifdef CONFIG_X86_64
30# define __ARCH_HAS_DO_SOFTIRQ
31# endif
32#endif 22#endif
33 23
24#define __ARCH_HAS_DO_SOFTIRQ
25
34#ifdef CONFIG_HOTPLUG_CPU 26#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 27#include <linux/cpumask.h>
36extern void fixup_irqs(void); 28extern void fixup_irqs(void);
diff --git a/arch/x86/include/asm/irq_controller.h b/arch/x86/include/asm/irq_controller.h
new file mode 100644
index 000000000000..423bbbddf36d
--- /dev/null
+++ b/arch/x86/include/asm/irq_controller.h
@@ -0,0 +1,12 @@
1#ifndef __IRQ_CONTROLLER__
2#define __IRQ_CONTROLLER__
3
4struct irq_domain {
5 int (*xlate)(struct irq_domain *h, const u32 *intspec, u32 intsize,
6 u32 *out_hwirq, u32 *out_type);
7 void *priv;
8 struct device_node *controller;
9 struct list_head l;
10};
11
12#endif
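
struct irq_domain here is a minimal x86-local precursor of a generic IRQ domain: an xlate() callback plus bookkeeping. A hedged sketch of how a platform might fill one in (example_xlate() and the one-cell interrupt specifier are assumptions, not taken from this header):

#include <linux/irq.h>
#include <asm/irq_controller.h>

static int example_xlate(struct irq_domain *h, const u32 *intspec,
			 u32 intsize, u32 *out_hwirq, u32 *out_type)
{
	if (intsize < 1)
		return -EINVAL;
	*out_hwirq = intspec[0];	/* one-cell specifier: hw irq number */
	*out_type = IRQ_TYPE_NONE;	/* keep the caller's default trigger */
	return 0;
}

static struct irq_domain example_domain = {
	.xlate = example_xlate,
};
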
diff --git a/arch/x86/include/asm/irq_remapping.h b/arch/x86/include/asm/irq_remapping.h
index f275e2244505..1c23360fb2d8 100644
--- a/arch/x86/include/asm/irq_remapping.h
+++ b/arch/x86/include/asm/irq_remapping.h
@@ -3,4 +3,39 @@
3 3
4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8) 4#define IRTE_DEST(dest) ((x2apic_mode) ? dest : dest << 8)
5 5
6#ifdef CONFIG_INTR_REMAP
7static inline void prepare_irte(struct irte *irte, int vector,
8 unsigned int dest)
9{
10 memset(irte, 0, sizeof(*irte));
11
12 irte->present = 1;
13 irte->dst_mode = apic->irq_dest_mode;
14 /*
15 * Trigger mode in the IRTE will always be edge, and for IO-APIC, the
16 * actual level or edge trigger will be setup in the IO-APIC
17 * RTE. This will help simplify level triggered irq migration.
18 * For more details, see the comments (in io_apic.c) explainig IO-APIC
19 * irq migration in the presence of interrupt-remapping.
20 */
21 irte->trigger_mode = 0;
22 irte->dlvry_mode = apic->irq_delivery_mode;
23 irte->vector = vector;
24 irte->dest_id = IRTE_DEST(dest);
25 irte->redir_hint = 1;
26}
27static inline bool irq_remapped(struct irq_cfg *cfg)
28{
29 return cfg->irq_2_iommu.iommu != NULL;
30}
31#else
 32static inline void prepare_irte(struct irte *irte, int vector, unsigned int dest)
33{
34}
35static inline bool irq_remapped(struct irq_cfg *cfg)
36{
37 return false;
38}
39#endif
40
6#endif /* _ASM_X86_IRQ_REMAPPING_H */ 41#endif /* _ASM_X86_IRQ_REMAPPING_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6143ebeeebfa..99a44cf98453 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -1,6 +1,7 @@
1#ifndef _ASM_X86_IRQ_VECTORS_H 1#ifndef _ASM_X86_IRQ_VECTORS_H
2#define _ASM_X86_IRQ_VECTORS_H 2#define _ASM_X86_IRQ_VECTORS_H
3 3
4#include <linux/threads.h>
4/* 5/*
5 * Linux IRQ vector layout. 6 * Linux IRQ vector layout.
6 * 7 *
@@ -16,8 +17,8 @@
16 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events 17 * Vectors 0 ... 31 : system traps and exceptions - hardcoded events
17 * Vectors 32 ... 127 : device interrupts 18 * Vectors 32 ... 127 : device interrupts
18 * Vector 128 : legacy int80 syscall interface 19 * Vector 128 : legacy int80 syscall interface
19 * Vectors 129 ... 237 : device interrupts 20 * Vectors 129 ... INVALIDATE_TLB_VECTOR_START-1 : device interrupts
20 * Vectors 238 ... 255 : special interrupts 21 * Vectors INVALIDATE_TLB_VECTOR_START ... 255 : special interrupts
21 * 22 *
22 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table. 23 * 64-bit x86 has per CPU IDT tables, 32-bit has one shared IDT table.
23 * 24 *
@@ -96,42 +97,50 @@
96#define THRESHOLD_APIC_VECTOR 0xf9 97#define THRESHOLD_APIC_VECTOR 0xf9
97#define REBOOT_VECTOR 0xf8 98#define REBOOT_VECTOR 0xf8
98 99
99/* f0-f7 used for spreading out TLB flushes: */
100#define INVALIDATE_TLB_VECTOR_END 0xf7
101#define INVALIDATE_TLB_VECTOR_START 0xf0
102#define NUM_INVALIDATE_TLB_VECTORS 8
103
104/* 100/*
105 * Local APIC timer IRQ vector is on a different priority level, 101 * Generic system vector for platform specific use
106 * to work around the 'lost local interrupt if more than 2 IRQ
107 * sources per level' errata.
108 */ 102 */
109#define LOCAL_TIMER_VECTOR 0xef 103#define X86_PLATFORM_IPI_VECTOR 0xf7
110 104
111/* 105/*
112 * LITMUS^RT pull timers IRQ vector 106 * IRQ work vector:
113 */ 107 */
114#define PULL_TIMERS_VECTOR 0xee 108#define IRQ_WORK_VECTOR 0xf6
109
110#define UV_BAU_MESSAGE 0xf5
115 111
116/* 112/*
117 * Generic system vector for platform specific use 113 * Self IPI vector for machine checks
118 */ 114 */
119#define X86_PLATFORM_IPI_VECTOR 0xed 115#define MCE_SELF_VECTOR 0xf4
116
117/* Xen vector callback to receive events in a HVM domain */
118#define XEN_HVM_EVTCHN_CALLBACK 0xf3
120 119
121/* 120/*
122 * Performance monitoring pending work vector: 121 * Local APIC timer IRQ vector is on a different priority level,
122 * to work around the 'lost local interrupt if more than 2 IRQ
123 * sources per level' errata.
123 */ 124 */
124#define LOCAL_PENDING_VECTOR 0xec 125#define LOCAL_TIMER_VECTOR 0xef
125 126
126#define UV_BAU_MESSAGE 0xea 127/* up to 32 vectors used for spreading out TLB flushes: */
128#if NR_CPUS <= 32
129# define NUM_INVALIDATE_TLB_VECTORS (NR_CPUS)
130#else
131# define NUM_INVALIDATE_TLB_VECTORS (32)
132#endif
133
134#define INVALIDATE_TLB_VECTOR_END (0xee)
135#define INVALIDATE_TLB_VECTOR_START \
136 (INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
127 137
128/* 138/*
129 * Self IPI vector for machine checks 139 * LITMUS^RT pull timers IRQ vector
140 * Make sure it's below the above max 32 vectors.
130 */ 141 */
131#define MCE_SELF_VECTOR 0xeb 142#define PULL_TIMERS_VECTOR 0xce
132 143
133/* Xen vector callback to receive events in a HVM domain */
134#define XEN_HVM_EVTCHN_CALLBACK 0xe9
135 144
136#define NR_VECTORS 256 145#define NR_VECTORS 256
137 146
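
Worked numbers for the relocated TLB-shootdown window (derived from the definitions above, not stated in the patch): with NR_CPUS = 8 the window is 0xe7..0xee; with more than 32 CPUs it is capped at 32 vectors, 0xcf..0xee, which is why PULL_TIMERS_VECTOR was moved down to 0xce. A compile-time check along these lines could make the "below the above max 32 vectors" note explicit (illustrative, not part of the patch):

#include <asm/irq_vectors.h>

/* The LITMUS^RT pull-timers vector must stay below even the largest
 * possible TLB-flush window (0xcf..0xee when 32 vectors are in use). */
#if INVALIDATE_TLB_VECTOR_START <= PULL_TIMERS_VECTOR
# error "PULL_TIMERS_VECTOR collides with the TLB invalidate vectors"
#endif
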
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 9e2b952f810a..5745ce8bf108 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -61,22 +61,22 @@ static inline void native_halt(void)
61#else 61#else
62#ifndef __ASSEMBLY__ 62#ifndef __ASSEMBLY__
63 63
64static inline unsigned long __raw_local_save_flags(void) 64static inline unsigned long arch_local_save_flags(void)
65{ 65{
66 return native_save_fl(); 66 return native_save_fl();
67} 67}
68 68
69static inline void raw_local_irq_restore(unsigned long flags) 69static inline void arch_local_irq_restore(unsigned long flags)
70{ 70{
71 native_restore_fl(flags); 71 native_restore_fl(flags);
72} 72}
73 73
74static inline void raw_local_irq_disable(void) 74static inline void arch_local_irq_disable(void)
75{ 75{
76 native_irq_disable(); 76 native_irq_disable();
77} 77}
78 78
79static inline void raw_local_irq_enable(void) 79static inline void arch_local_irq_enable(void)
80{ 80{
81 native_irq_enable(); 81 native_irq_enable();
82} 82}
@@ -85,7 +85,7 @@ static inline void raw_local_irq_enable(void)
85 * Used in the idle loop; sti takes one instruction cycle 85 * Used in the idle loop; sti takes one instruction cycle
86 * to complete: 86 * to complete:
87 */ 87 */
88static inline void raw_safe_halt(void) 88static inline void arch_safe_halt(void)
89{ 89{
90 native_safe_halt(); 90 native_safe_halt();
91} 91}
@@ -102,12 +102,10 @@ static inline void halt(void)
102/* 102/*
103 * For spinlocks, etc: 103 * For spinlocks, etc:
104 */ 104 */
105static inline unsigned long __raw_local_irq_save(void) 105static inline unsigned long arch_local_irq_save(void)
106{ 106{
107 unsigned long flags = __raw_local_save_flags(); 107 unsigned long flags = arch_local_save_flags();
108 108 arch_local_irq_disable();
109 raw_local_irq_disable();
110
111 return flags; 109 return flags;
112} 110}
113#else 111#else
@@ -153,22 +151,16 @@ static inline unsigned long __raw_local_irq_save(void)
153#endif /* CONFIG_PARAVIRT */ 151#endif /* CONFIG_PARAVIRT */
154 152
155#ifndef __ASSEMBLY__ 153#ifndef __ASSEMBLY__
156#define raw_local_save_flags(flags) \ 154static inline int arch_irqs_disabled_flags(unsigned long flags)
157 do { (flags) = __raw_local_save_flags(); } while (0)
158
159#define raw_local_irq_save(flags) \
160 do { (flags) = __raw_local_irq_save(); } while (0)
161
162static inline int raw_irqs_disabled_flags(unsigned long flags)
163{ 155{
164 return !(flags & X86_EFLAGS_IF); 156 return !(flags & X86_EFLAGS_IF);
165} 157}
166 158
167static inline int raw_irqs_disabled(void) 159static inline int arch_irqs_disabled(void)
168{ 160{
169 unsigned long flags = __raw_local_save_flags(); 161 unsigned long flags = arch_local_save_flags();
170 162
171 return raw_irqs_disabled_flags(flags); 163 return arch_irqs_disabled_flags(flags);
172} 164}
173 165
174#else 166#else
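
A minimal consumer sketch of the raw_* to arch_* rename (assumption: the generic wrappers in include/linux/irqflags.h are layered on these arch_* helpers; critical_update() and shared_counter are illustrative names):

#include <linux/irqflags.h>

static unsigned long shared_counter;

static void critical_update(void)
{
	unsigned long flags;

	local_irq_save(flags);		/* ends up in arch_local_irq_save() */
	shared_counter++;		/* cannot be interrupted on this CPU */
	local_irq_restore(flags);	/* ends up in arch_local_irq_restore() */
}
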
diff --git a/arch/x86/include/asm/jump_label.h b/arch/x86/include/asm/jump_label.h
new file mode 100644
index 000000000000..a32b18ce6ead
--- /dev/null
+++ b/arch/x86/include/asm/jump_label.h
@@ -0,0 +1,42 @@
1#ifndef _ASM_X86_JUMP_LABEL_H
2#define _ASM_X86_JUMP_LABEL_H
3
4#ifdef __KERNEL__
5
6#include <linux/types.h>
7#include <asm/nops.h>
8#include <asm/asm.h>
9
10#define JUMP_LABEL_NOP_SIZE 5
11
12#define JUMP_LABEL_INITIAL_NOP ".byte 0xe9 \n\t .long 0\n\t"
13
14static __always_inline bool arch_static_branch(struct jump_label_key *key)
15{
16 asm goto("1:"
17 JUMP_LABEL_INITIAL_NOP
18 ".pushsection __jump_table, \"aw\" \n\t"
19 _ASM_ALIGN "\n\t"
20 _ASM_PTR "1b, %l[l_yes], %c0 \n\t"
21 ".popsection \n\t"
22 : : "i" (key) : : l_yes);
23 return false;
24l_yes:
25 return true;
26}
27
28#endif /* __KERNEL__ */
29
30#ifdef CONFIG_X86_64
31typedef u64 jump_label_t;
32#else
33typedef u32 jump_label_t;
34#endif
35
36struct jump_entry {
37 jump_label_t code;
38 jump_label_t target;
39 jump_label_t key;
40};
41
42#endif
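
A hedged sketch of how arch_static_branch() above is consumed through the generic jump label API of this series (static_branch()/jump_label_inc() from linux/jump_label.h); trace_key, do_slow_path() and the callers are made-up names:

#include <linux/jump_label.h>

static struct jump_label_key trace_key;		/* starts disabled */

static void do_slow_path(void)
{
	/* out-of-line work, e.g. emitting a tracepoint */
}

static inline void maybe_trace(void)
{
	/* Compiles down to the 5-byte placeholder emitted by
	 * JUMP_LABEL_INITIAL_NOP; once the key is enabled it is patched
	 * into a jump to the slow path. */
	if (static_branch(&trace_key))
		do_slow_path();
}

static void enable_tracing(void)
{
	jump_label_inc(&trace_key);	/* triggers the code patching */
}
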
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
deleted file mode 100644
index af00bd1d2089..000000000000
--- a/arch/x86/include/asm/k8.h
+++ /dev/null
@@ -1,36 +0,0 @@
1#ifndef _ASM_X86_K8_H
2#define _ASM_X86_K8_H
3
4#include <linux/pci.h>
5
6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
8
9extern int early_is_k8_nb(u32 value);
10extern struct pci_dev **k8_northbridges;
11extern int num_k8_northbridges;
12extern int cache_k8_northbridges(void);
13extern void k8_flush_garts(void);
14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
17
18#ifdef CONFIG_K8_NB
19extern int num_k8_northbridges;
20
21static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{
23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
24}
25
26#else
27#define num_k8_northbridges 0
28
29static inline struct pci_dev *node_to_k8_nb_misc(int node)
30{
31 return NULL;
32}
33#endif
34
35
36#endif /* _ASM_X86_K8_H */
diff --git a/arch/x86/include/asm/kdebug.h b/arch/x86/include/asm/kdebug.h
index 5bdfca86581b..fe2cc6e105fa 100644
--- a/arch/x86/include/asm/kdebug.h
+++ b/arch/x86/include/asm/kdebug.h
@@ -13,12 +13,10 @@ enum die_val {
13 DIE_PANIC, 13 DIE_PANIC,
14 DIE_NMI, 14 DIE_NMI,
15 DIE_DIE, 15 DIE_DIE,
16 DIE_NMIWATCHDOG,
17 DIE_KERNELDEBUG, 16 DIE_KERNELDEBUG,
18 DIE_TRAP, 17 DIE_TRAP,
19 DIE_GPF, 18 DIE_GPF,
20 DIE_CALL, 19 DIE_CALL,
21 DIE_NMI_IPI,
22 DIE_PAGE_FAULT, 20 DIE_PAGE_FAULT,
23 DIE_NMIUNKNOWN, 21 DIE_NMIUNKNOWN,
24}; 22};
diff --git a/arch/x86/include/asm/kgdb.h b/arch/x86/include/asm/kgdb.h
index 396f5b5fc4d7..77e95f54570a 100644
--- a/arch/x86/include/asm/kgdb.h
+++ b/arch/x86/include/asm/kgdb.h
@@ -77,6 +77,7 @@ static inline void arch_kgdb_breakpoint(void)
77} 77}
78#define BREAK_INSTR_SIZE 1 78#define BREAK_INSTR_SIZE 1
79#define CACHE_FLUSH_IS_SAFE 1 79#define CACHE_FLUSH_IS_SAFE 1
80#define GDB_ADJUSTS_BREAK_OFFSET
80 81
81extern int kgdb_ll_trap(int cmd, const char *str, 82extern int kgdb_ll_trap(int cmd, const char *str,
82 struct pt_regs *regs, long err, int trap, int sig); 83 struct pt_regs *regs, long err, int trap, int sig);
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index 1f99ecfc48e1..0049211959c0 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -14,6 +14,34 @@
14#include <asm/desc_defs.h> 14#include <asm/desc_defs.h>
15 15
16struct x86_emulate_ctxt; 16struct x86_emulate_ctxt;
17enum x86_intercept;
18enum x86_intercept_stage;
19
20struct x86_exception {
21 u8 vector;
22 bool error_code_valid;
23 u16 error_code;
24 bool nested_page_fault;
25 u64 address; /* cr2 or nested page fault gpa */
26};
27
28/*
29 * This struct is used to carry enough information from the instruction
30 * decoder to main KVM so that a decision can be made whether the
31 * instruction needs to be intercepted or not.
32 */
33struct x86_instruction_info {
34 u8 intercept; /* which intercept */
35 u8 rep_prefix; /* rep prefix? */
36 u8 modrm_mod; /* mod part of modrm */
37 u8 modrm_reg; /* index of register used */
38 u8 modrm_rm; /* rm part of modrm */
39 u64 src_val; /* value of source operand */
40 u8 src_bytes; /* size of source operand */
41 u8 dst_bytes; /* size of destination operand */
42 u8 ad_bytes; /* size of src/dst address */
43 u64 next_rip; /* rip following the instruction */
44};
17 45
18/* 46/*
19 * x86_emulate_ops: 47 * x86_emulate_ops:
@@ -54,6 +82,7 @@ struct x86_emulate_ctxt;
54#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */ 82#define X86EMUL_RETRY_INSTR 3 /* retry the instruction for some reason */
55#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */ 83#define X86EMUL_CMPXCHG_FAILED 4 /* cmpxchg did not see expected value */
56#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */ 84#define X86EMUL_IO_NEEDED 5 /* IO is needed to complete emulation */
85#define X86EMUL_INTERCEPTED 6 /* Intercepted by nested VMCB/VMCS */
57 86
58struct x86_emulate_ops { 87struct x86_emulate_ops {
59 /* 88 /*
@@ -63,8 +92,10 @@ struct x86_emulate_ops {
63 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 92 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
64 * @bytes: [IN ] Number of bytes to read from memory. 93 * @bytes: [IN ] Number of bytes to read from memory.
65 */ 94 */
66 int (*read_std)(unsigned long addr, void *val, 95 int (*read_std)(struct x86_emulate_ctxt *ctxt,
67 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 96 unsigned long addr, void *val,
97 unsigned int bytes,
98 struct x86_exception *fault);
68 99
69 /* 100 /*
70 * write_std: Write bytes of standard (non-emulated/special) memory. 101 * write_std: Write bytes of standard (non-emulated/special) memory.
@@ -73,8 +104,9 @@ struct x86_emulate_ops {
 73 * @val: [IN ] Value to write to memory. 104 * @val: [IN ] Value to write to memory.
74 * @bytes: [IN ] Number of bytes to write to memory. 105 * @bytes: [IN ] Number of bytes to write to memory.
75 */ 106 */
76 int (*write_std)(unsigned long addr, void *val, 107 int (*write_std)(struct x86_emulate_ctxt *ctxt,
77 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 108 unsigned long addr, void *val, unsigned int bytes,
109 struct x86_exception *fault);
78 /* 110 /*
79 * fetch: Read bytes of standard (non-emulated/special) memory. 111 * fetch: Read bytes of standard (non-emulated/special) memory.
80 * Used for instruction fetch. 112 * Used for instruction fetch.
@@ -82,8 +114,9 @@ struct x86_emulate_ops {
82 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 114 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
83 * @bytes: [IN ] Number of bytes to read from memory. 115 * @bytes: [IN ] Number of bytes to read from memory.
84 */ 116 */
85 int (*fetch)(unsigned long addr, void *val, 117 int (*fetch)(struct x86_emulate_ctxt *ctxt,
86 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error); 118 unsigned long addr, void *val, unsigned int bytes,
119 struct x86_exception *fault);
87 120
88 /* 121 /*
89 * read_emulated: Read bytes from emulated/special memory area. 122 * read_emulated: Read bytes from emulated/special memory area.
@@ -91,11 +124,9 @@ struct x86_emulate_ops {
91 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 124 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
92 * @bytes: [IN ] Number of bytes to read from memory. 125 * @bytes: [IN ] Number of bytes to read from memory.
93 */ 126 */
94 int (*read_emulated)(unsigned long addr, 127 int (*read_emulated)(struct x86_emulate_ctxt *ctxt,
95 void *val, 128 unsigned long addr, void *val, unsigned int bytes,
96 unsigned int bytes, 129 struct x86_exception *fault);
97 unsigned int *error,
98 struct kvm_vcpu *vcpu);
99 130
100 /* 131 /*
101 * write_emulated: Write bytes to emulated/special memory area. 132 * write_emulated: Write bytes to emulated/special memory area.
@@ -104,11 +135,10 @@ struct x86_emulate_ops {
104 * required). 135 * required).
105 * @bytes: [IN ] Number of bytes to write to memory. 136 * @bytes: [IN ] Number of bytes to write to memory.
106 */ 137 */
107 int (*write_emulated)(unsigned long addr, 138 int (*write_emulated)(struct x86_emulate_ctxt *ctxt,
108 const void *val, 139 unsigned long addr, const void *val,
109 unsigned int bytes, 140 unsigned int bytes,
110 unsigned int *error, 141 struct x86_exception *fault);
111 struct kvm_vcpu *vcpu);
112 142
113 /* 143 /*
114 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an 144 * cmpxchg_emulated: Emulate an atomic (LOCKed) CMPXCHG operation on an
@@ -118,49 +148,72 @@ struct x86_emulate_ops {
118 * @new: [IN ] Value to write to @addr. 148 * @new: [IN ] Value to write to @addr.
119 * @bytes: [IN ] Number of bytes to access using CMPXCHG. 149 * @bytes: [IN ] Number of bytes to access using CMPXCHG.
120 */ 150 */
121 int (*cmpxchg_emulated)(unsigned long addr, 151 int (*cmpxchg_emulated)(struct x86_emulate_ctxt *ctxt,
152 unsigned long addr,
122 const void *old, 153 const void *old,
123 const void *new, 154 const void *new,
124 unsigned int bytes, 155 unsigned int bytes,
125 unsigned int *error, 156 struct x86_exception *fault);
126 struct kvm_vcpu *vcpu); 157 void (*invlpg)(struct x86_emulate_ctxt *ctxt, ulong addr);
127 158
128 int (*pio_in_emulated)(int size, unsigned short port, void *val, 159 int (*pio_in_emulated)(struct x86_emulate_ctxt *ctxt,
129 unsigned int count, struct kvm_vcpu *vcpu); 160 int size, unsigned short port, void *val,
130 161 unsigned int count);
131 int (*pio_out_emulated)(int size, unsigned short port, const void *val, 162
132 unsigned int count, struct kvm_vcpu *vcpu); 163 int (*pio_out_emulated)(struct x86_emulate_ctxt *ctxt,
133 164 int size, unsigned short port, const void *val,
134 bool (*get_cached_descriptor)(struct desc_struct *desc, 165 unsigned int count);
135 int seg, struct kvm_vcpu *vcpu); 166
136 void (*set_cached_descriptor)(struct desc_struct *desc, 167 bool (*get_segment)(struct x86_emulate_ctxt *ctxt, u16 *selector,
137 int seg, struct kvm_vcpu *vcpu); 168 struct desc_struct *desc, u32 *base3, int seg);
138 u16 (*get_segment_selector)(int seg, struct kvm_vcpu *vcpu); 169 void (*set_segment)(struct x86_emulate_ctxt *ctxt, u16 selector,
139 void (*set_segment_selector)(u16 sel, int seg, struct kvm_vcpu *vcpu); 170 struct desc_struct *desc, u32 base3, int seg);
140 unsigned long (*get_cached_segment_base)(int seg, struct kvm_vcpu *vcpu); 171 unsigned long (*get_cached_segment_base)(struct x86_emulate_ctxt *ctxt,
141 void (*get_gdt)(struct desc_ptr *dt, struct kvm_vcpu *vcpu); 172 int seg);
142 ulong (*get_cr)(int cr, struct kvm_vcpu *vcpu); 173 void (*get_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
143 int (*set_cr)(int cr, ulong val, struct kvm_vcpu *vcpu); 174 void (*get_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
144 int (*cpl)(struct kvm_vcpu *vcpu); 175 void (*set_gdt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
145 int (*get_dr)(int dr, unsigned long *dest, struct kvm_vcpu *vcpu); 176 void (*set_idt)(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt);
146 int (*set_dr)(int dr, unsigned long value, struct kvm_vcpu *vcpu); 177 ulong (*get_cr)(struct x86_emulate_ctxt *ctxt, int cr);
147 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 178 int (*set_cr)(struct x86_emulate_ctxt *ctxt, int cr, ulong val);
148 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 179 int (*cpl)(struct x86_emulate_ctxt *ctxt);
180 int (*get_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong *dest);
181 int (*set_dr)(struct x86_emulate_ctxt *ctxt, int dr, ulong value);
182 int (*set_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 data);
183 int (*get_msr)(struct x86_emulate_ctxt *ctxt, u32 msr_index, u64 *pdata);
184 void (*halt)(struct x86_emulate_ctxt *ctxt);
185 void (*wbinvd)(struct x86_emulate_ctxt *ctxt);
186 int (*fix_hypercall)(struct x86_emulate_ctxt *ctxt);
187 void (*get_fpu)(struct x86_emulate_ctxt *ctxt); /* disables preempt */
188 void (*put_fpu)(struct x86_emulate_ctxt *ctxt); /* reenables preempt */
189 int (*intercept)(struct x86_emulate_ctxt *ctxt,
190 struct x86_instruction_info *info,
191 enum x86_intercept_stage stage);
149}; 192};
150 193
194typedef u32 __attribute__((vector_size(16))) sse128_t;
195
151/* Type, address-of, and value of an instruction's operand. */ 196/* Type, address-of, and value of an instruction's operand. */
152struct operand { 197struct operand {
153 enum { OP_REG, OP_MEM, OP_IMM, OP_NONE } type; 198 enum { OP_REG, OP_MEM, OP_IMM, OP_XMM, OP_NONE } type;
154 unsigned int bytes; 199 unsigned int bytes;
155 union { 200 union {
156 unsigned long orig_val; 201 unsigned long orig_val;
157 u64 orig_val64; 202 u64 orig_val64;
158 }; 203 };
159 unsigned long *ptr; 204 union {
205 unsigned long *reg;
206 struct segmented_address {
207 ulong ea;
208 unsigned seg;
209 } mem;
210 unsigned xmm;
211 } addr;
160 union { 212 union {
161 unsigned long val; 213 unsigned long val;
162 u64 val64; 214 u64 val64;
163 char valptr[sizeof(unsigned long) + 2]; 215 char valptr[sizeof(unsigned long) + 2];
216 sse128_t vec_val;
164 }; 217 };
165}; 218};
166 219
@@ -179,6 +232,7 @@ struct read_cache {
179struct decode_cache { 232struct decode_cache {
180 u8 twobyte; 233 u8 twobyte;
181 u8 b; 234 u8 b;
235 u8 intercept;
182 u8 lock_prefix; 236 u8 lock_prefix;
183 u8 rep_prefix; 237 u8 rep_prefix;
184 u8 op_bytes; 238 u8 op_bytes;
@@ -190,6 +244,8 @@ struct decode_cache {
190 bool has_seg_override; 244 bool has_seg_override;
191 u8 seg_override; 245 u8 seg_override;
192 unsigned int d; 246 unsigned int d;
247 int (*execute)(struct x86_emulate_ctxt *ctxt);
248 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
193 unsigned long regs[NR_VCPU_REGS]; 249 unsigned long regs[NR_VCPU_REGS];
194 unsigned long eip; 250 unsigned long eip;
195 /* modrm */ 251 /* modrm */
@@ -197,43 +253,39 @@ struct decode_cache {
197 u8 modrm_mod; 253 u8 modrm_mod;
198 u8 modrm_reg; 254 u8 modrm_reg;
199 u8 modrm_rm; 255 u8 modrm_rm;
200 u8 use_modrm_ea; 256 u8 modrm_seg;
201 bool rip_relative; 257 bool rip_relative;
202 unsigned long modrm_ea;
203 void *modrm_ptr;
204 unsigned long modrm_val;
205 struct fetch_cache fetch; 258 struct fetch_cache fetch;
206 struct read_cache io_read; 259 struct read_cache io_read;
207 struct read_cache mem_read; 260 struct read_cache mem_read;
208}; 261};
209 262
210struct x86_emulate_ctxt { 263struct x86_emulate_ctxt {
211 /* Register state before/after emulation. */ 264 struct x86_emulate_ops *ops;
212 struct kvm_vcpu *vcpu;
213 265
266 /* Register state before/after emulation. */
214 unsigned long eflags; 267 unsigned long eflags;
215 unsigned long eip; /* eip before instruction emulation */ 268 unsigned long eip; /* eip before instruction emulation */
216 /* Emulated execution mode, represented by an X86EMUL_MODE value. */ 269 /* Emulated execution mode, represented by an X86EMUL_MODE value. */
217 int mode; 270 int mode;
218 u32 cs_base;
219 271
220 /* interruptibility state, as a result of execution of STI or MOV SS */ 272 /* interruptibility state, as a result of execution of STI or MOV SS */
221 int interruptibility; 273 int interruptibility;
222 274
223 bool restart; /* restart string instruction after writeback */ 275 bool guest_mode; /* guest running a nested guest */
276 bool perm_ok; /* do not check permissions if true */
277 bool only_vendor_specific_insn;
224 278
225 int exception; /* exception that happens during emulation or -1 */ 279 bool have_exception;
226 u32 error_code; /* error code for exception */ 280 struct x86_exception exception;
227 bool error_code_valid;
228 unsigned long cr2; /* faulted address in case of #PF */
229 281
230 /* decode cache */ 282 /* decode cache */
231 struct decode_cache decode; 283 struct decode_cache decode;
232}; 284};
233 285
234/* Repeat String Operation Prefix */ 286/* Repeat String Operation Prefix */
235#define REPE_PREFIX 1 287#define REPE_PREFIX 0xf3
236#define REPNE_PREFIX 2 288#define REPNE_PREFIX 0xf2
237 289
238/* Execution mode, passed to the emulator. */ 290/* Execution mode, passed to the emulator. */
239#define X86EMUL_MODE_REAL 0 /* Real mode. */ 291#define X86EMUL_MODE_REAL 0 /* Real mode. */
@@ -242,6 +294,69 @@ struct x86_emulate_ctxt {
242#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 294#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
243#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 295#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
244 296
297/* any protected mode */
298#define X86EMUL_MODE_PROT (X86EMUL_MODE_PROT16|X86EMUL_MODE_PROT32| \
299 X86EMUL_MODE_PROT64)
300
301enum x86_intercept_stage {
302 X86_ICTP_NONE = 0, /* Allow zero-init to not match anything */
303 X86_ICPT_PRE_EXCEPT,
304 X86_ICPT_POST_EXCEPT,
305 X86_ICPT_POST_MEMACCESS,
306};
307
308enum x86_intercept {
309 x86_intercept_none,
310 x86_intercept_cr_read,
311 x86_intercept_cr_write,
312 x86_intercept_clts,
313 x86_intercept_lmsw,
314 x86_intercept_smsw,
315 x86_intercept_dr_read,
316 x86_intercept_dr_write,
317 x86_intercept_lidt,
318 x86_intercept_sidt,
319 x86_intercept_lgdt,
320 x86_intercept_sgdt,
321 x86_intercept_lldt,
322 x86_intercept_sldt,
323 x86_intercept_ltr,
324 x86_intercept_str,
325 x86_intercept_rdtsc,
326 x86_intercept_rdpmc,
327 x86_intercept_pushf,
328 x86_intercept_popf,
329 x86_intercept_cpuid,
330 x86_intercept_rsm,
331 x86_intercept_iret,
332 x86_intercept_intn,
333 x86_intercept_invd,
334 x86_intercept_pause,
335 x86_intercept_hlt,
336 x86_intercept_invlpg,
337 x86_intercept_invlpga,
338 x86_intercept_vmrun,
339 x86_intercept_vmload,
340 x86_intercept_vmsave,
341 x86_intercept_vmmcall,
342 x86_intercept_stgi,
343 x86_intercept_clgi,
344 x86_intercept_skinit,
345 x86_intercept_rdtscp,
346 x86_intercept_icebp,
347 x86_intercept_wbinvd,
348 x86_intercept_monitor,
349 x86_intercept_mwait,
350 x86_intercept_rdmsr,
351 x86_intercept_wrmsr,
352 x86_intercept_in,
353 x86_intercept_ins,
354 x86_intercept_out,
355 x86_intercept_outs,
356
357 nr_x86_intercepts
358};
359
245/* Host execution mode. */ 360/* Host execution mode. */
246#if defined(CONFIG_X86_32) 361#if defined(CONFIG_X86_32)
247#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32 362#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT32
@@ -249,13 +364,15 @@ struct x86_emulate_ctxt {
249#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64 364#define X86EMUL_MODE_HOST X86EMUL_MODE_PROT64
250#endif 365#endif
251 366
252int x86_decode_insn(struct x86_emulate_ctxt *ctxt, 367int x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len);
253 struct x86_emulate_ops *ops); 368#define EMULATION_FAILED -1
254int x86_emulate_insn(struct x86_emulate_ctxt *ctxt, 369#define EMULATION_OK 0
255 struct x86_emulate_ops *ops); 370#define EMULATION_RESTART 1
371#define EMULATION_INTERCEPTED 2
372int x86_emulate_insn(struct x86_emulate_ctxt *ctxt);
256int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 373int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
257 struct x86_emulate_ops *ops,
258 u16 tss_selector, int reason, 374 u16 tss_selector, int reason,
259 bool has_error_code, u32 error_code); 375 bool has_error_code, u32 error_code);
260 376int emulate_int_real(struct x86_emulate_ctxt *ctxt,
377 struct x86_emulate_ops *ops, int irq);
261#endif /* _ASM_X86_KVM_X86_EMULATE_H */ 378#endif /* _ASM_X86_KVM_X86_EMULATE_H */
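
A hedged sketch of how a caller drives the reworked entry points and the new EMULATION_* return codes; this loosely mirrors what x86_emulate_instruction() in arch/x86/kvm/x86.c does and is not a drop-in implementation:

#include <asm/kvm_emulate.h>

static int run_emulation(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
{
	int rc;

	/* The ops table now lives in ctxt->ops instead of being passed in. */
	rc = x86_decode_insn(ctxt, insn, insn_len);
	if (rc != EMULATION_OK)
		return EMULATION_FAILED;

	rc = x86_emulate_insn(ctxt);
	if (rc == EMULATION_INTERCEPTED)
		return rc;	/* a nested VMCB/VMCS took the instruction */
	if (rc == EMULATION_RESTART)
		return rc;	/* string instruction: caller re-enters later */

	return rc;		/* EMULATION_OK or EMULATION_FAILED */
}
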
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index c52e2eb40a1e..d2ac8e2ee897 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -30,14 +30,30 @@
30#define KVM_MEMORY_SLOTS 32 30#define KVM_MEMORY_SLOTS 32
 31/* memory slots that are not exposed to userspace */ 31/* memory slots that are not exposed to userspace */
32#define KVM_PRIVATE_MEM_SLOTS 4 32#define KVM_PRIVATE_MEM_SLOTS 4
33#define KVM_MMIO_SIZE 16
33 34
34#define KVM_PIO_PAGE_OFFSET 1 35#define KVM_PIO_PAGE_OFFSET 1
35#define KVM_COALESCED_MMIO_PAGE_OFFSET 2 36#define KVM_COALESCED_MMIO_PAGE_OFFSET 2
36 37
38#define CR0_RESERVED_BITS \
39 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
40 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
41 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
42
37#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1) 43#define CR3_PAE_RESERVED_BITS ((X86_CR3_PWT | X86_CR3_PCD) - 1)
38#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD)) 44#define CR3_NONPAE_RESERVED_BITS ((PAGE_SIZE-1) & ~(X86_CR3_PWT | X86_CR3_PCD))
39#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 45#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
40 0xFFFFFF0000000000ULL) 46 0xFFFFFF0000000000ULL)
47#define CR4_RESERVED_BITS \
48 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
49 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
50 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
51 | X86_CR4_OSXSAVE \
52 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
53
54#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
55
56
41 57
42#define INVALID_PAGE (~(hpa_t)0) 58#define INVALID_PAGE (~(hpa_t)0)
43#define VALID_PAGE(x) ((x) != INVALID_PAGE) 59#define VALID_PAGE(x) ((x) != INVALID_PAGE)
@@ -79,15 +95,18 @@
79#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT) 95#define KVM_NUM_MMU_PAGES (1 << KVM_MMU_HASH_SHIFT)
80#define KVM_MIN_FREE_MMU_PAGES 5 96#define KVM_MIN_FREE_MMU_PAGES 5
81#define KVM_REFILL_PAGES 25 97#define KVM_REFILL_PAGES 25
82#define KVM_MAX_CPUID_ENTRIES 40 98#define KVM_MAX_CPUID_ENTRIES 80
83#define KVM_NR_FIXED_MTRR_REGION 88 99#define KVM_NR_FIXED_MTRR_REGION 88
84#define KVM_NR_VAR_MTRR 8 100#define KVM_NR_VAR_MTRR 8
85 101
86extern spinlock_t kvm_lock; 102#define ASYNC_PF_PER_VCPU 64
103
104extern raw_spinlock_t kvm_lock;
87extern struct list_head vm_list; 105extern struct list_head vm_list;
88 106
89struct kvm_vcpu; 107struct kvm_vcpu;
90struct kvm; 108struct kvm;
109struct kvm_async_pf;
91 110
92enum kvm_reg { 111enum kvm_reg {
93 VCPU_REGS_RAX = 0, 112 VCPU_REGS_RAX = 0,
@@ -114,6 +133,10 @@ enum kvm_reg {
114 133
115enum kvm_reg_ex { 134enum kvm_reg_ex {
116 VCPU_EXREG_PDPTR = NR_VCPU_REGS, 135 VCPU_EXREG_PDPTR = NR_VCPU_REGS,
136 VCPU_EXREG_CR3,
137 VCPU_EXREG_RFLAGS,
138 VCPU_EXREG_CPL,
139 VCPU_EXREG_SEGMENTS,
117}; 140};
118 141
119enum { 142enum {
@@ -236,26 +259,39 @@ struct kvm_pio_request {
236 */ 259 */
237struct kvm_mmu { 260struct kvm_mmu {
238 void (*new_cr3)(struct kvm_vcpu *vcpu); 261 void (*new_cr3)(struct kvm_vcpu *vcpu);
239 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 262 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long root);
263 unsigned long (*get_cr3)(struct kvm_vcpu *vcpu);
264 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err,
265 bool prefault);
266 void (*inject_page_fault)(struct kvm_vcpu *vcpu,
267 struct x86_exception *fault);
240 void (*free)(struct kvm_vcpu *vcpu); 268 void (*free)(struct kvm_vcpu *vcpu);
241 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access, 269 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
242 u32 *error); 270 struct x86_exception *exception);
271 gpa_t (*translate_gpa)(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access);
243 void (*prefetch_page)(struct kvm_vcpu *vcpu, 272 void (*prefetch_page)(struct kvm_vcpu *vcpu,
244 struct kvm_mmu_page *page); 273 struct kvm_mmu_page *page);
245 int (*sync_page)(struct kvm_vcpu *vcpu, 274 int (*sync_page)(struct kvm_vcpu *vcpu,
246 struct kvm_mmu_page *sp, bool clear_unsync); 275 struct kvm_mmu_page *sp);
247 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva); 276 void (*invlpg)(struct kvm_vcpu *vcpu, gva_t gva);
277 void (*update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
278 u64 *spte, const void *pte);
248 hpa_t root_hpa; 279 hpa_t root_hpa;
249 int root_level; 280 int root_level;
250 int shadow_root_level; 281 int shadow_root_level;
251 union kvm_mmu_page_role base_role; 282 union kvm_mmu_page_role base_role;
283 bool direct_map;
252 284
253 u64 *pae_root; 285 u64 *pae_root;
286 u64 *lm_root;
254 u64 rsvd_bits_mask[2][4]; 287 u64 rsvd_bits_mask[2][4];
288
289 bool nx;
290
291 u64 pdptrs[4]; /* pae */
255}; 292};
256 293
257struct kvm_vcpu_arch { 294struct kvm_vcpu_arch {
258 u64 host_tsc;
259 /* 295 /*
260 * rip and regs accesses must go through 296 * rip and regs accesses must go through
261 * kvm_{register,rip}_{read,write} functions. 297 * kvm_{register,rip}_{read,write} functions.
@@ -272,7 +308,6 @@ struct kvm_vcpu_arch {
272 unsigned long cr4_guest_owned_bits; 308 unsigned long cr4_guest_owned_bits;
273 unsigned long cr8; 309 unsigned long cr8;
274 u32 hflags; 310 u32 hflags;
275 u64 pdptrs[4]; /* pae */
276 u64 efer; 311 u64 efer;
277 u64 apic_base; 312 u64 apic_base;
278 struct kvm_lapic *apic; /* kernel irqchip context */ 313 struct kvm_lapic *apic; /* kernel irqchip context */
@@ -282,7 +317,31 @@ struct kvm_vcpu_arch {
282 u64 ia32_misc_enable_msr; 317 u64 ia32_misc_enable_msr;
283 bool tpr_access_reporting; 318 bool tpr_access_reporting;
284 319
320 /*
321 * Paging state of the vcpu
322 *
323 * If the vcpu runs in guest mode with two level paging this still saves
324 * the paging mode of the l1 guest. This context is always used to
325 * handle faults.
326 */
285 struct kvm_mmu mmu; 327 struct kvm_mmu mmu;
328
329 /*
330 * Paging state of an L2 guest (used for nested npt)
331 *
332 * This context will save all necessary information to walk page tables
333 * of the an L2 guest. This context is only initialized for page table
334 * walking and not for faulting since we never handle l2 page faults on
335 * the host.
336 */
337 struct kvm_mmu nested_mmu;
338
339 /*
340 * Pointer to the mmu context currently used for
341 * gva_to_gpa translations.
342 */
343 struct kvm_mmu *walk_mmu;
344
286 /* only needed in kvm_pv_mmu_op() path, but it's hot so 345 /* only needed in kvm_pv_mmu_op() path, but it's hot so
287 * put it here to avoid allocation */ 346 * put it here to avoid allocation */
288 struct kvm_pv_mmu_op_buffer mmu_op_buffer; 347 struct kvm_pv_mmu_op_buffer mmu_op_buffer;
@@ -297,16 +356,9 @@ struct kvm_vcpu_arch {
297 u64 *last_pte_updated; 356 u64 *last_pte_updated;
298 gfn_t last_pte_gfn; 357 gfn_t last_pte_gfn;
299 358
300 struct {
301 gfn_t gfn; /* presumed gfn during guest pte update */
302 pfn_t pfn; /* pfn corresponding to that gfn */
303 unsigned long mmu_seq;
304 } update_pte;
305
306 struct fpu guest_fpu; 359 struct fpu guest_fpu;
307 u64 xcr0; 360 u64 xcr0;
308 361
309 gva_t mmio_fault_cr2;
310 struct kvm_pio_request pio; 362 struct kvm_pio_request pio;
311 void *pio_data; 363 void *pio_data;
312 364
@@ -333,12 +385,22 @@ struct kvm_vcpu_arch {
333 /* emulate context */ 385 /* emulate context */
334 386
335 struct x86_emulate_ctxt emulate_ctxt; 387 struct x86_emulate_ctxt emulate_ctxt;
388 bool emulate_regs_need_sync_to_vcpu;
389 bool emulate_regs_need_sync_from_vcpu;
336 390
337 gpa_t time; 391 gpa_t time;
338 struct pvclock_vcpu_time_info hv_clock; 392 struct pvclock_vcpu_time_info hv_clock;
339 unsigned int hv_clock_tsc_khz; 393 unsigned int hw_tsc_khz;
340 unsigned int time_offset; 394 unsigned int time_offset;
341 struct page *time_page; 395 struct page *time_page;
396 u64 last_guest_tsc;
397 u64 last_kernel_ns;
398 u64 last_tsc_nsec;
399 u64 last_tsc_write;
400 u32 virtual_tsc_khz;
401 bool tsc_catchup;
402 u32 tsc_catchup_mult;
403 s8 tsc_catchup_shift;
342 404
343 bool nmi_pending; 405 bool nmi_pending;
344 bool nmi_injected; 406 bool nmi_injected;
@@ -364,12 +426,21 @@ struct kvm_vcpu_arch {
364 u64 hv_vapic; 426 u64 hv_vapic;
365 427
366 cpumask_var_t wbinvd_dirty_mask; 428 cpumask_var_t wbinvd_dirty_mask;
429
430 struct {
431 bool halted;
432 gfn_t gfns[roundup_pow_of_two(ASYNC_PF_PER_VCPU)];
433 struct gfn_to_hva_cache data;
434 u64 msr_val;
435 u32 id;
436 bool send_user_only;
437 } apf;
367}; 438};
368 439
369struct kvm_arch { 440struct kvm_arch {
370 unsigned int n_free_mmu_pages; 441 unsigned int n_used_mmu_pages;
371 unsigned int n_requested_mmu_pages; 442 unsigned int n_requested_mmu_pages;
372 unsigned int n_alloc_mmu_pages; 443 unsigned int n_max_mmu_pages;
373 atomic_t invlpg_counter; 444 atomic_t invlpg_counter;
374 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES]; 445 struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
375 /* 446 /*
@@ -394,14 +465,21 @@ struct kvm_arch {
394 gpa_t ept_identity_map_addr; 465 gpa_t ept_identity_map_addr;
395 466
396 unsigned long irq_sources_bitmap; 467 unsigned long irq_sources_bitmap;
397 u64 vm_init_tsc;
398 s64 kvmclock_offset; 468 s64 kvmclock_offset;
469 raw_spinlock_t tsc_write_lock;
470 u64 last_tsc_nsec;
471 u64 last_tsc_offset;
472 u64 last_tsc_write;
399 473
400 struct kvm_xen_hvm_config xen_hvm_config; 474 struct kvm_xen_hvm_config xen_hvm_config;
401 475
402 /* fields used by HYPER-V emulation */ 476 /* fields used by HYPER-V emulation */
403 u64 hv_guest_os_id; 477 u64 hv_guest_os_id;
404 u64 hv_hypercall; 478 u64 hv_hypercall;
479
480 #ifdef CONFIG_KVM_MMU_AUDIT
481 int audit_point;
482 #endif
405}; 483};
406 484
407struct kvm_vm_stat { 485struct kvm_vm_stat {
@@ -443,6 +521,8 @@ struct kvm_vcpu_stat {
443 u32 nmi_injections; 521 u32 nmi_injections;
444}; 522};
445 523
524struct x86_instruction_info;
525
446struct kvm_x86_ops { 526struct kvm_x86_ops {
447 int (*cpu_has_kvm_support)(void); /* __init */ 527 int (*cpu_has_kvm_support)(void); /* __init */
448 int (*disabled_by_bios)(void); /* __init */ 528 int (*disabled_by_bios)(void); /* __init */
@@ -475,6 +555,7 @@ struct kvm_x86_ops {
475 struct kvm_segment *var, int seg); 555 struct kvm_segment *var, int seg);
476 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 556 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
477 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu); 557 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
558 void (*decache_cr3)(struct kvm_vcpu *vcpu);
478 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 559 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
479 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 560 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
480 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 561 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -505,6 +586,7 @@ struct kvm_x86_ops {
505 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr, 586 void (*queue_exception)(struct kvm_vcpu *vcpu, unsigned nr,
506 bool has_error_code, u32 error_code, 587 bool has_error_code, u32 error_code,
507 bool reinject); 588 bool reinject);
589 void (*cancel_injection)(struct kvm_vcpu *vcpu);
508 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 590 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
509 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 591 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
510 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu); 592 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
@@ -517,14 +599,35 @@ struct kvm_x86_ops {
517 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 599 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
518 int (*get_lpage_level)(void); 600 int (*get_lpage_level)(void);
519 bool (*rdtscp_supported)(void); 601 bool (*rdtscp_supported)(void);
602 void (*adjust_tsc_offset)(struct kvm_vcpu *vcpu, s64 adjustment);
603
604 void (*set_tdp_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
520 605
521 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry); 606 void (*set_supported_cpuid)(u32 func, struct kvm_cpuid_entry2 *entry);
522 607
523 bool (*has_wbinvd_exit)(void); 608 bool (*has_wbinvd_exit)(void);
524 609
610 void (*set_tsc_khz)(struct kvm_vcpu *vcpu, u32 user_tsc_khz);
611 void (*write_tsc_offset)(struct kvm_vcpu *vcpu, u64 offset);
612
613 u64 (*compute_tsc_offset)(struct kvm_vcpu *vcpu, u64 target_tsc);
614
615 void (*get_exit_info)(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2);
616
617 int (*check_intercept)(struct kvm_vcpu *vcpu,
618 struct x86_instruction_info *info,
619 enum x86_intercept_stage stage);
620
525 const struct trace_print_flags *exit_reasons_str; 621 const struct trace_print_flags *exit_reasons_str;
526}; 622};
527 623
624struct kvm_arch_async_pf {
625 u32 token;
626 gfn_t gfn;
627 unsigned long cr3;
628 bool direct_map;
629};
630
528extern struct kvm_x86_ops *kvm_x86_ops; 631extern struct kvm_x86_ops *kvm_x86_ops;
529 632
530int kvm_mmu_module_init(void); 633int kvm_mmu_module_init(void);
@@ -534,7 +637,6 @@ void kvm_mmu_destroy(struct kvm_vcpu *vcpu);
534int kvm_mmu_create(struct kvm_vcpu *vcpu); 637int kvm_mmu_create(struct kvm_vcpu *vcpu);
535int kvm_mmu_setup(struct kvm_vcpu *vcpu); 638int kvm_mmu_setup(struct kvm_vcpu *vcpu);
536void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte); 639void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte);
537void kvm_mmu_set_base_ptes(u64 base_pte);
538void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 640void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
539 u64 dirty_mask, u64 nx_mask, u64 x_mask); 641 u64 dirty_mask, u64 nx_mask, u64 x_mask);
540 642
@@ -544,7 +646,7 @@ void kvm_mmu_zap_all(struct kvm *kvm);
544unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm); 646unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm);
545void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages); 647void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages);
546 648
547int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 649int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3);
548 650
549int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 651int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
550 const void *val, int bytes); 652 const void *val, int bytes);
@@ -554,6 +656,13 @@ u8 kvm_get_guest_memory_type(struct kvm_vcpu *vcpu, gfn_t gfn);
554 656
555extern bool tdp_enabled; 657extern bool tdp_enabled;
556 658
659/* control of guest tsc rate supported? */
660extern bool kvm_has_tsc_control;
661/* minimum supported tsc_khz for guests */
662extern u32 kvm_min_guest_tsc_khz;
663/* maximum supported tsc_khz for guests */
664extern u32 kvm_max_guest_tsc_khz;
665
557enum emulation_result { 666enum emulation_result {
558 EMULATE_DONE, /* no further processing */ 667 EMULATE_DONE, /* no further processing */
559 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */ 668 EMULATE_DO_MMIO, /* kvm_run filled with mmio request */
@@ -563,10 +672,14 @@ enum emulation_result {
563#define EMULTYPE_NO_DECODE (1 << 0) 672#define EMULTYPE_NO_DECODE (1 << 0)
564#define EMULTYPE_TRAP_UD (1 << 1) 673#define EMULTYPE_TRAP_UD (1 << 1)
565#define EMULTYPE_SKIP (1 << 2) 674#define EMULTYPE_SKIP (1 << 2)
566int emulate_instruction(struct kvm_vcpu *vcpu, 675int x86_emulate_instruction(struct kvm_vcpu *vcpu, unsigned long cr2,
567 unsigned long cr2, u16 error_code, int emulation_type); 676 int emulation_type, void *insn, int insn_len);
568void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 677
569void realmode_lidt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 678static inline int emulate_instruction(struct kvm_vcpu *vcpu,
679 int emulation_type)
680{
681 return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
682}
570 683
571void kvm_enable_efer_bits(u64); 684void kvm_enable_efer_bits(u64);
572int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data); 685int kvm_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *data);
@@ -577,8 +690,6 @@ struct x86_emulate_ctxt;
577int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port); 690int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
578void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 691void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
579int kvm_emulate_halt(struct kvm_vcpu *vcpu); 692int kvm_emulate_halt(struct kvm_vcpu *vcpu);
580int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address);
581int emulate_clts(struct kvm_vcpu *vcpu);
582int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu); 693int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
583 694
584void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 695void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
@@ -590,7 +701,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
590int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0); 701int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0);
591int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3); 702int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
592int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); 703int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4);
593void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8); 704int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8);
594int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val); 705int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val);
595int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val); 706int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val);
596unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu); 707unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu);
@@ -608,8 +719,11 @@ void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
608void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 719void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
609void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); 720void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
610void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 721void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
611void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 722void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
612 u32 error_code); 723int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
724 gfn_t gfn, void *data, int offset, int len,
725 u32 access);
726void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
613bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl); 727bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl);
614 728
615int kvm_pic_set_irq(void *opaque, int irq, int level); 729int kvm_pic_set_irq(void *opaque, int irq, int level);
@@ -627,16 +741,19 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
627int kvm_mmu_load(struct kvm_vcpu *vcpu); 741int kvm_mmu_load(struct kvm_vcpu *vcpu);
628void kvm_mmu_unload(struct kvm_vcpu *vcpu); 742void kvm_mmu_unload(struct kvm_vcpu *vcpu);
629void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 743void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
630gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 744gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
631gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 745 struct x86_exception *exception);
632gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 746gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
633gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error); 747 struct x86_exception *exception);
748gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
749 struct x86_exception *exception);
750gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
751 struct x86_exception *exception);
634 752
635int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 753int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
636 754
637int kvm_fix_hypercall(struct kvm_vcpu *vcpu); 755int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code,
638 756 void *insn, int insn_len);
639int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t gva, u32 error_code);
640void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva); 757void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva);
641 758
642void kvm_enable_tdp(void); 759void kvm_enable_tdp(void);
@@ -703,20 +820,25 @@ enum {
703#define HF_VINTR_MASK (1 << 2) 820#define HF_VINTR_MASK (1 << 2)
704#define HF_NMI_MASK (1 << 3) 821#define HF_NMI_MASK (1 << 3)
705#define HF_IRET_MASK (1 << 4) 822#define HF_IRET_MASK (1 << 4)
823#define HF_GUEST_MASK (1 << 5) /* VCPU is in guest-mode */
706 824
707/* 825/*
708 * Hardware virtualization extension instructions may fault if a 826 * Hardware virtualization extension instructions may fault if a
709 * reboot turns off virtualization while processes are running. 827 * reboot turns off virtualization while processes are running.
710 * Trap the fault and ignore the instruction if that happens. 828 * Trap the fault and ignore the instruction if that happens.
711 */ 829 */
712asmlinkage void kvm_handle_fault_on_reboot(void); 830asmlinkage void kvm_spurious_fault(void);
831extern bool kvm_rebooting;
713 832
714#define __kvm_handle_fault_on_reboot(insn) \ 833#define __kvm_handle_fault_on_reboot(insn) \
715 "666: " insn "\n\t" \ 834 "666: " insn "\n\t" \
835 "668: \n\t" \
716 ".pushsection .fixup, \"ax\" \n" \ 836 ".pushsection .fixup, \"ax\" \n" \
717 "667: \n\t" \ 837 "667: \n\t" \
838 "cmpb $0, kvm_rebooting \n\t" \
839 "jne 668b \n\t" \
718 __ASM_SIZE(push) " $666b \n\t" \ 840 __ASM_SIZE(push) " $666b \n\t" \
719 "jmp kvm_handle_fault_on_reboot \n\t" \ 841 "call kvm_spurious_fault \n\t" \
720 ".popsection \n\t" \ 842 ".popsection \n\t" \
721 ".pushsection __ex_table, \"a\" \n\t" \ 843 ".pushsection __ex_table, \"a\" \n\t" \
722 _ASM_PTR " 666b, 667b \n\t" \ 844 _ASM_PTR " 666b, 667b \n\t" \
@@ -725,6 +847,7 @@ asmlinkage void kvm_handle_fault_on_reboot(void);
725#define KVM_ARCH_WANT_MMU_NOTIFIER 847#define KVM_ARCH_WANT_MMU_NOTIFIER
726int kvm_unmap_hva(struct kvm *kvm, unsigned long hva); 848int kvm_unmap_hva(struct kvm *kvm, unsigned long hva);
727int kvm_age_hva(struct kvm *kvm, unsigned long hva); 849int kvm_age_hva(struct kvm *kvm, unsigned long hva);
850int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
728void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte); 851void kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
729int cpuid_maxphyaddr(struct kvm_vcpu *vcpu); 852int cpuid_maxphyaddr(struct kvm_vcpu *vcpu);
730int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu); 853int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
@@ -736,4 +859,15 @@ void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
736 859
737bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip); 860bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip);
738 861
862void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
863 struct kvm_async_pf *work);
864void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
865 struct kvm_async_pf *work);
866void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu,
867 struct kvm_async_pf *work);
868bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu);
869extern bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn);
870
871void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err);
872
739#endif /* _ASM_X86_KVM_HOST_H */ 873#endif /* _ASM_X86_KVM_HOST_H */
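
The reworked fault-on-reboot fixup above no longer funnels every fault into a handler: it first tests kvm_rebooting and, if a reboot is tearing virtualization down, resumes at label 668 (immediately after the faulting instruction); only an unexpected fault reaches kvm_spurious_fault(). A minimal sketch of how a caller wraps a virtualization instruction with this macro; the __ex() alias and the choice of vmxoff are illustrative assumptions, not part of this diff.

#include <asm/kvm_host.h>

#define __ex(insn)      __kvm_handle_fault_on_reboot(insn)

static inline void example_vmxoff(void)
{
        /* If this faults while kvm_rebooting is set, the fixup jumps
         * back to label 668 and execution continues right after the
         * instruction; otherwise kvm_spurious_fault() is called. */
        asm volatile (__ex("vmxoff") : : : "cc");
}
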
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index 05eba5e9a8e8..a427bf77a93d 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -20,6 +20,7 @@
20 * are available. The use of 0x11 and 0x12 is deprecated 20 * are available. The use of 0x11 and 0x12 is deprecated
21 */ 21 */
22#define KVM_FEATURE_CLOCKSOURCE2 3 22#define KVM_FEATURE_CLOCKSOURCE2 3
23#define KVM_FEATURE_ASYNC_PF 4
23 24
24/* The last 8 bits are used to indicate how to interpret the flags field 25/* The last 8 bits are used to indicate how to interpret the flags field
25 * in pvclock structure. If no bits are set, all flags are ignored. 26 * in pvclock structure. If no bits are set, all flags are ignored.
@@ -32,9 +33,13 @@
32/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */ 33/* Custom MSRs falls in the range 0x4b564d00-0x4b564dff */
33#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00 34#define MSR_KVM_WALL_CLOCK_NEW 0x4b564d00
34#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01 35#define MSR_KVM_SYSTEM_TIME_NEW 0x4b564d01
36#define MSR_KVM_ASYNC_PF_EN 0x4b564d02
35 37
36#define KVM_MAX_MMU_OP_BATCH 32 38#define KVM_MAX_MMU_OP_BATCH 32
37 39
40#define KVM_ASYNC_PF_ENABLED (1 << 0)
41#define KVM_ASYNC_PF_SEND_ALWAYS (1 << 1)
42
38/* Operations for KVM_HC_MMU_OP */ 43/* Operations for KVM_HC_MMU_OP */
39#define KVM_MMU_OP_WRITE_PTE 1 44#define KVM_MMU_OP_WRITE_PTE 1
40#define KVM_MMU_OP_FLUSH_TLB 2 45#define KVM_MMU_OP_FLUSH_TLB 2
@@ -61,10 +66,20 @@ struct kvm_mmu_op_release_pt {
61 __u64 pt_phys; 66 __u64 pt_phys;
62}; 67};
63 68
69#define KVM_PV_REASON_PAGE_NOT_PRESENT 1
70#define KVM_PV_REASON_PAGE_READY 2
71
72struct kvm_vcpu_pv_apf_data {
73 __u32 reason;
74 __u8 pad[60];
75 __u32 enabled;
76};
77
64#ifdef __KERNEL__ 78#ifdef __KERNEL__
65#include <asm/processor.h> 79#include <asm/processor.h>
66 80
67extern void kvmclock_init(void); 81extern void kvmclock_init(void);
82extern int kvm_register_clock(char *txt);
68 83
69 84
70/* This instruction is vmcall. On non-VT architectures, it will generate a 85/* This instruction is vmcall. On non-VT architectures, it will generate a
@@ -158,6 +173,21 @@ static inline unsigned int kvm_arch_para_features(void)
158 return cpuid_eax(KVM_CPUID_FEATURES); 173 return cpuid_eax(KVM_CPUID_FEATURES);
159} 174}
160 175
176#ifdef CONFIG_KVM_GUEST
177void __init kvm_guest_init(void);
178void kvm_async_pf_task_wait(u32 token);
179void kvm_async_pf_task_wake(u32 token);
180u32 kvm_read_and_reset_pf_reason(void);
181#else
182#define kvm_guest_init() do { } while (0)
183#define kvm_async_pf_task_wait(T) do {} while(0)
184#define kvm_async_pf_task_wake(T) do {} while(0)
185static inline u32 kvm_read_and_reset_pf_reason(void)
186{
187 return 0;
188}
161#endif 189#endif
162 190
191#endif /* __KERNEL__ */
192
163#endif /* _ASM_X86_KVM_PARA_H */ 193#endif /* _ASM_X86_KVM_PARA_H */
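
Taken together, the additions define the asynchronous page fault ABI: the guest advertises a per-CPU struct kvm_vcpu_pv_apf_data to the host through MSR_KVM_ASYNC_PF_EN (the low bits carry the enable flags, the rest the physical address), and the host later writes KVM_PV_REASON_PAGE_NOT_PRESENT or KVM_PV_REASON_PAGE_READY into its reason field. A guest-side sketch of enabling the feature on one CPU; the apf_reason variable name is an assumption, everything else is either added above or a standard kernel API.

#include <linux/kvm_para.h>
#include <linux/percpu.h>
#include <asm/msr.h>
#include <asm/page.h>

static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);

static void example_enable_async_pf(void)
{
        u64 pa;

        if (!kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
                return;

        pa = __pa(this_cpu_ptr(&apf_reason)) | KVM_ASYNC_PF_ENABLED;
        wrmsrl(MSR_KVM_ASYNC_PF_EN, pa);        /* hand the area to the host */
        this_cpu_ptr(&apf_reason)->enabled = 1;
}

The guest page-fault path then calls kvm_read_and_reset_pf_reason() and, depending on the value, parks the task with kvm_async_pf_task_wait() or wakes it with kvm_async_pf_task_wake().
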
diff --git a/arch/x86/include/asm/linkage.h b/arch/x86/include/asm/linkage.h
index 12d55e773eb6..48142971b25d 100644
--- a/arch/x86/include/asm/linkage.h
+++ b/arch/x86/include/asm/linkage.h
@@ -8,11 +8,6 @@
8 8
9#ifdef CONFIG_X86_32 9#ifdef CONFIG_X86_32
10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0))) 10#define asmlinkage CPP_ASMLINKAGE __attribute__((regparm(0)))
11/*
12 * For 32-bit UML - mark functions implemented in assembly that use
13 * regparm input parameters:
14 */
15#define asmregparm __attribute__((regparm(3)))
16 11
17/* 12/*
18 * Make sure the compiler doesn't do anything stupid with the 13 * Make sure the compiler doesn't do anything stupid with the
diff --git a/arch/x86/include/asm/mach_traps.h b/arch/x86/include/asm/mach_traps.h
index f7920601e472..72a8b52e7dfd 100644
--- a/arch/x86/include/asm/mach_traps.h
+++ b/arch/x86/include/asm/mach_traps.h
@@ -7,9 +7,19 @@
7 7
8#include <asm/mc146818rtc.h> 8#include <asm/mc146818rtc.h>
9 9
10#define NMI_REASON_PORT 0x61
11
12#define NMI_REASON_SERR 0x80
13#define NMI_REASON_IOCHK 0x40
14#define NMI_REASON_MASK (NMI_REASON_SERR | NMI_REASON_IOCHK)
15
16#define NMI_REASON_CLEAR_SERR 0x04
17#define NMI_REASON_CLEAR_IOCHK 0x08
18#define NMI_REASON_CLEAR_MASK 0x0f
19
10static inline unsigned char get_nmi_reason(void) 20static inline unsigned char get_nmi_reason(void)
11{ 21{
12 return inb(0x61); 22 return inb(NMI_REASON_PORT);
13} 23}
14 24
15static inline void reassert_nmi(void) 25static inline void reassert_nmi(void)
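
With the raw 0x61 constants replaced by named bits, decoding and clearing a latched NMI reason can be written against this header alone. A sketch; the real handling lives in traps.c, and the pulse-the-clear-bit sequence shown here is the conventional way to reset the latch, not a quote of that code.

#include <asm/io.h>
#include <asm/mach_traps.h>

static void example_handle_nmi_reason(void)
{
        unsigned char reason = get_nmi_reason();

        if (!(reason & NMI_REASON_MASK))
                return;                         /* not a SERR/IOCHK NMI */

        if (reason & NMI_REASON_SERR) {         /* PCI SERR#, e.g. bus parity error */
                unsigned char v = (reason & NMI_REASON_CLEAR_MASK) |
                                  NMI_REASON_CLEAR_SERR;
                outb(v, NMI_REASON_PORT);                       /* set the clear bit */
                outb(v & ~NMI_REASON_CLEAR_SERR, NMI_REASON_PORT); /* then drop it   */
        }

        if (reason & NMI_REASON_IOCHK) {        /* ISA IOCHK# */
                unsigned char v = (reason & NMI_REASON_CLEAR_MASK) |
                                  NMI_REASON_CLEAR_IOCHK;
                outb(v, NMI_REASON_PORT);
                outb(v & ~NMI_REASON_CLEAR_IOCHK, NMI_REASON_PORT);
        }
}
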
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index c62c13cb9788..021979a6e23f 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -142,8 +142,6 @@ static inline void winchip_mcheck_init(struct cpuinfo_x86 *c) {}
142static inline void enable_p5_mce(void) {} 142static inline void enable_p5_mce(void) {}
143#endif 143#endif
144 144
145extern void (*x86_mce_decode_callback)(struct mce *m);
146
147void mce_setup(struct mce *m); 145void mce_setup(struct mce *m);
148void mce_log(struct mce *m); 146void mce_log(struct mce *m);
149DECLARE_PER_CPU(struct sys_device, mce_dev); 147DECLARE_PER_CPU(struct sys_device, mce_dev);
@@ -223,6 +221,9 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
223 221
224void mce_log_therm_throt_event(__u64 status); 222void mce_log_therm_throt_event(__u64 status);
225 223
224/* Interrupt Handler for core thermal thresholds */
225extern int (*platform_thermal_notify)(__u64 msr_val);
226
226#ifdef CONFIG_X86_THERMAL_VECTOR 227#ifdef CONFIG_X86_THERMAL_VECTOR
227extern void mcheck_intel_therm_init(void); 228extern void mcheck_intel_therm_init(void);
228#else 229#else
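
platform_thermal_notify is a hook the core thermal-threshold interrupt code invokes with the raw thermal status MSR value when a programmed threshold is crossed; a platform driver claims it by assigning a function pointer. A sketch under that assumption (driver name and behaviour are made up).

#include <linux/init.h>
#include <linux/kernel.h>
#include <asm/mce.h>

static int example_thermal_notify(__u64 msr_val)
{
        pr_info("core thermal threshold crossed, status=%llx\n",
                (unsigned long long)msr_val);
        return 0;
}

static int __init example_thermal_init(void)
{
        platform_thermal_notify = example_thermal_notify;
        return 0;
}
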
diff --git a/arch/x86/include/asm/memblock.h b/arch/x86/include/asm/memblock.h
new file mode 100644
index 000000000000..0cd3800f33b9
--- /dev/null
+++ b/arch/x86/include/asm/memblock.h
@@ -0,0 +1,23 @@
1#ifndef _X86_MEMBLOCK_H
2#define _X86_MEMBLOCK_H
3
4#define ARCH_DISCARD_MEMBLOCK
5
6u64 memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align);
7
8void memblock_x86_reserve_range(u64 start, u64 end, char *name);
9void memblock_x86_free_range(u64 start, u64 end);
10struct range;
11int __get_free_all_memory_range(struct range **range, int nodeid,
12 unsigned long start_pfn, unsigned long end_pfn);
13int get_free_all_memory_range(struct range **rangep, int nodeid);
14
15void memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
16 unsigned long last_pfn);
17u64 memblock_x86_hole_size(u64 start, u64 end);
18u64 memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align);
19u64 memblock_x86_free_memory_in_range(u64 addr, u64 limit);
20u64 memblock_x86_memory_in_range(u64 addr, u64 limit);
21bool memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align);
22
23#endif
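
All of the new wrappers take physical byte ranges. A sketch of typical early-boot usage; the addresses are purely illustrative and only the prototypes declared above are assumed.

#include <linux/init.h>
#include <linux/types.h>
#include <asm/memblock.h>

static void __init example_memblock_usage(void)
{
        u64 usable, holes;

        /* Keep the legacy hole 0xa0000-0x100000 away from the allocator. */
        memblock_x86_reserve_range(0xa0000, 0x100000, "EXAMPLE RESERVE");

        /* Free (i.e. not reserved) memory below 4 GiB ...           */
        usable = memblock_x86_free_memory_in_range(0, 1ULL << 32);
        /* ... and the total size of holes not backed by RAM at all. */
        holes  = memblock_x86_hole_size(0, 1ULL << 32);

        (void)usable;
        (void)holes;
}
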
diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h
index ef51b501e22a..24215072d0e1 100644
--- a/arch/x86/include/asm/microcode.h
+++ b/arch/x86/include/asm/microcode.h
@@ -48,6 +48,12 @@ static inline struct microcode_ops * __init init_intel_microcode(void)
48 48
49#ifdef CONFIG_MICROCODE_AMD 49#ifdef CONFIG_MICROCODE_AMD
50extern struct microcode_ops * __init init_amd_microcode(void); 50extern struct microcode_ops * __init init_amd_microcode(void);
51
52static inline void get_ucode_data(void *to, const u8 *from, size_t n)
53{
54 memcpy(to, from, n);
55}
56
51#else 57#else
52static inline struct microcode_ops * __init init_amd_microcode(void) 58static inline struct microcode_ops * __init init_amd_microcode(void)
53{ 59{
diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h
index 80a1dee5bea5..5f55e6962769 100644
--- a/arch/x86/include/asm/mmu.h
+++ b/arch/x86/include/asm/mmu.h
@@ -11,6 +11,12 @@
11typedef struct { 11typedef struct {
12 void *ldt; 12 void *ldt;
13 int size; 13 int size;
14
15#ifdef CONFIG_X86_64
16 /* True if mm supports a task running in 32 bit compatibility mode. */
17 unsigned short ia32_compat;
18#endif
19
14 struct mutex lock; 20 struct mutex lock;
15 void *vdso; 21 void *vdso;
16} mm_context_t; 22} mm_context_t;
diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h
index 4a2d4e0c18d9..8b5393ec1080 100644
--- a/arch/x86/include/asm/mmu_context.h
+++ b/arch/x86/include/asm/mmu_context.h
@@ -36,8 +36,6 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
36 unsigned cpu = smp_processor_id(); 36 unsigned cpu = smp_processor_id();
37 37
38 if (likely(prev != next)) { 38 if (likely(prev != next)) {
39 /* stop flush ipis for the previous mm */
40 cpumask_clear_cpu(cpu, mm_cpumask(prev));
41#ifdef CONFIG_SMP 39#ifdef CONFIG_SMP
42 percpu_write(cpu_tlbstate.state, TLBSTATE_OK); 40 percpu_write(cpu_tlbstate.state, TLBSTATE_OK);
43 percpu_write(cpu_tlbstate.active_mm, next); 41 percpu_write(cpu_tlbstate.active_mm, next);
@@ -47,6 +45,9 @@ static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next,
47 /* Re-load page tables */ 45 /* Re-load page tables */
48 load_cr3(next->pgd); 46 load_cr3(next->pgd);
49 47
48 /* stop flush ipis for the previous mm */
49 cpumask_clear_cpu(cpu, mm_cpumask(prev));
50
50 /* 51 /*
51 * load the LDT, if the LDT is different: 52 * load the LDT, if the LDT is different:
52 */ 53 */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index 91df7c51806c..ffa037f28d39 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -13,31 +13,11 @@ extern struct pglist_data *node_data[];
13#define NODE_DATA(nid) (node_data[nid]) 13#define NODE_DATA(nid) (node_data[nid])
14 14
15#include <asm/numaq.h> 15#include <asm/numaq.h>
16/* summit or generic arch */
17#include <asm/srat.h>
18
19extern int get_memcfg_numa_flat(void);
20/*
21 * This allows any one NUMA architecture to be compiled
22 * for, and still fall back to the flat function if it
23 * fails.
24 */
25static inline void get_memcfg_numa(void)
26{
27
28 if (get_memcfg_numaq())
29 return;
30 if (get_memcfg_from_srat())
31 return;
32 get_memcfg_numa_flat();
33}
34 16
35extern void resume_map_numa_kva(pgd_t *pgd); 17extern void resume_map_numa_kva(pgd_t *pgd);
36 18
37#else /* !CONFIG_NUMA */ 19#else /* !CONFIG_NUMA */
38 20
39#define get_memcfg_numa get_memcfg_numa_flat
40
41static inline void resume_map_numa_kva(pgd_t *pgd) {} 21static inline void resume_map_numa_kva(pgd_t *pgd) {}
42 22
43#endif /* CONFIG_NUMA */ 23#endif /* CONFIG_NUMA */
@@ -68,17 +48,6 @@ static inline int pfn_to_nid(unsigned long pfn)
68#endif 48#endif
69} 49}
70 50
71/*
72 * Following are macros that each numa implmentation must define.
73 */
74
75#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
76#define node_end_pfn(nid) \
77({ \
78 pg_data_t *__pgdat = NODE_DATA(nid); \
79 __pgdat->node_start_pfn + __pgdat->node_spanned_pages; \
80})
81
82static inline int pfn_valid(int pfn) 51static inline int pfn_valid(int pfn)
83{ 52{
84 int nid = pfn_to_nid(pfn); 53 int nid = pfn_to_nid(pfn);
@@ -88,6 +57,8 @@ static inline int pfn_valid(int pfn)
88 return 0; 57 return 0;
89} 58}
90 59
60#define early_pfn_valid(pfn) pfn_valid((pfn))
61
91#endif /* CONFIG_DISCONTIGMEM */ 62#endif /* CONFIG_DISCONTIGMEM */
92 63
93#ifdef CONFIG_NEED_MULTIPLE_NODES 64#ifdef CONFIG_NEED_MULTIPLE_NODES
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index 288b96f815a6..129d9aa3ceb3 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -4,40 +4,14 @@
4#ifndef _ASM_X86_MMZONE_64_H 4#ifndef _ASM_X86_MMZONE_64_H
5#define _ASM_X86_MMZONE_64_H 5#define _ASM_X86_MMZONE_64_H
6 6
7
8#ifdef CONFIG_NUMA 7#ifdef CONFIG_NUMA
9 8
10#include <linux/mmdebug.h> 9#include <linux/mmdebug.h>
11
12#include <asm/smp.h> 10#include <asm/smp.h>
13 11
14/* Simple perfect hash to map physical addresses to node numbers */
15struct memnode {
16 int shift;
17 unsigned int mapsize;
18 s16 *map;
19 s16 embedded_map[64 - 8];
20} ____cacheline_aligned; /* total size = 128 bytes */
21extern struct memnode memnode;
22#define memnode_shift memnode.shift
23#define memnodemap memnode.map
24#define memnodemapsize memnode.mapsize
25
26extern struct pglist_data *node_data[]; 12extern struct pglist_data *node_data[];
27 13
28static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
29{
30 unsigned nid;
31 VIRTUAL_BUG_ON(!memnodemap);
32 nid = memnodemap[addr >> memnode_shift];
33 VIRTUAL_BUG_ON(nid >= MAX_NUMNODES || !node_data[nid]);
34 return nid;
35}
36
37#define NODE_DATA(nid) (node_data[nid]) 14#define NODE_DATA(nid) (node_data[nid])
38 15
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages)
42#endif 16#endif
43#endif /* _ASM_X86_MMZONE_64_H */ 17#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 3e2ce58a31a3..9eae7752ae9b 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -35,7 +35,7 @@
35#define MODULE_PROC_FAMILY "K7 " 35#define MODULE_PROC_FAMILY "K7 "
36#elif defined CONFIG_MK8 36#elif defined CONFIG_MK8
37#define MODULE_PROC_FAMILY "K8 " 37#define MODULE_PROC_FAMILY "K8 "
38#elif defined CONFIG_X86_ELAN 38#elif defined CONFIG_MELAN
39#define MODULE_PROC_FAMILY "ELAN " 39#define MODULE_PROC_FAMILY "ELAN "
40#elif defined CONFIG_MCRUSOE 40#elif defined CONFIG_MCRUSOE
41#define MODULE_PROC_FAMILY "CRUSOE " 41#define MODULE_PROC_FAMILY "CRUSOE "
@@ -60,12 +60,7 @@
60#endif 60#endif
61 61
62#ifdef CONFIG_X86_32 62#ifdef CONFIG_X86_32
63# ifdef CONFIG_4KSTACKS 63# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY
64# define MODULE_STACKSIZE "4KSTACKS "
65# else
66# define MODULE_STACKSIZE ""
67# endif
68# define MODULE_ARCH_VERMAGIC MODULE_PROC_FAMILY MODULE_STACKSIZE
69#endif 64#endif
70 65
71#endif /* _ASM_X86_MODULE_H */ 66#endif /* _ASM_X86_MODULE_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index c82868e9f905..9c7d95f6174b 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -5,8 +5,9 @@
5 5
6#include <asm/mpspec_def.h> 6#include <asm/mpspec_def.h>
7#include <asm/x86_init.h> 7#include <asm/x86_init.h>
8#include <asm/apicdef.h>
8 9
9extern int apic_version[MAX_APICS]; 10extern int apic_version[];
10extern int pic_mode; 11extern int pic_mode;
11 12
12#ifdef CONFIG_X86_32 13#ifdef CONFIG_X86_32
@@ -24,7 +25,6 @@ extern int pic_mode;
24#define MAX_IRQ_SOURCES 256 25#define MAX_IRQ_SOURCES 256
25 26
26extern unsigned int def_to_bigsmp; 27extern unsigned int def_to_bigsmp;
27extern u8 apicid_2_node[];
28 28
29#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
30extern int mp_bus_id_to_node[MAX_MP_BUSSES]; 30extern int mp_bus_id_to_node[MAX_MP_BUSSES];
@@ -32,8 +32,6 @@ extern int mp_bus_id_to_local[MAX_MP_BUSSES];
32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4]; 32extern int quad_local_to_mp_bus_id [NR_CPUS/4][4];
33#endif 33#endif
34 34
35#define MAX_APICID 256
36
37#else /* CONFIG_X86_64: */ 35#else /* CONFIG_X86_64: */
38 36
39#define MAX_MP_BUSSES 256 37#define MAX_MP_BUSSES 256
@@ -107,7 +105,7 @@ extern int mp_register_gsi(struct device *dev, u32 gsi, int edge_level,
107 int active_high_low); 105 int active_high_low);
108#endif /* CONFIG_ACPI */ 106#endif /* CONFIG_ACPI */
109 107
110#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_APICS) 108#define PHYSID_ARRAY_SIZE BITS_TO_LONGS(MAX_LOCAL_APIC)
111 109
112struct physid_mask { 110struct physid_mask {
113 unsigned long mask[PHYSID_ARRAY_SIZE]; 111 unsigned long mask[PHYSID_ARRAY_SIZE];
@@ -122,31 +120,31 @@ typedef struct physid_mask physid_mask_t;
122 test_and_set_bit(physid, (map).mask) 120 test_and_set_bit(physid, (map).mask)
123 121
124#define physids_and(dst, src1, src2) \ 122#define physids_and(dst, src1, src2) \
125 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 123 bitmap_and((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
126 124
127#define physids_or(dst, src1, src2) \ 125#define physids_or(dst, src1, src2) \
128 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_APICS) 126 bitmap_or((dst).mask, (src1).mask, (src2).mask, MAX_LOCAL_APIC)
129 127
130#define physids_clear(map) \ 128#define physids_clear(map) \
131 bitmap_zero((map).mask, MAX_APICS) 129 bitmap_zero((map).mask, MAX_LOCAL_APIC)
132 130
133#define physids_complement(dst, src) \ 131#define physids_complement(dst, src) \
134 bitmap_complement((dst).mask, (src).mask, MAX_APICS) 132 bitmap_complement((dst).mask, (src).mask, MAX_LOCAL_APIC)
135 133
136#define physids_empty(map) \ 134#define physids_empty(map) \
137 bitmap_empty((map).mask, MAX_APICS) 135 bitmap_empty((map).mask, MAX_LOCAL_APIC)
138 136
139#define physids_equal(map1, map2) \ 137#define physids_equal(map1, map2) \
140 bitmap_equal((map1).mask, (map2).mask, MAX_APICS) 138 bitmap_equal((map1).mask, (map2).mask, MAX_LOCAL_APIC)
141 139
142#define physids_weight(map) \ 140#define physids_weight(map) \
143 bitmap_weight((map).mask, MAX_APICS) 141 bitmap_weight((map).mask, MAX_LOCAL_APIC)
144 142
145#define physids_shift_right(d, s, n) \ 143#define physids_shift_right(d, s, n) \
146 bitmap_shift_right((d).mask, (s).mask, n, MAX_APICS) 144 bitmap_shift_right((d).mask, (s).mask, n, MAX_LOCAL_APIC)
147 145
148#define physids_shift_left(d, s, n) \ 146#define physids_shift_left(d, s, n) \
149 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 147 bitmap_shift_left((d).mask, (s).mask, n, MAX_LOCAL_APIC)
150 148
151static inline unsigned long physids_coerce(physid_mask_t *map) 149static inline unsigned long physids_coerce(physid_mask_t *map)
152{ 150{
@@ -159,14 +157,6 @@ static inline void physids_promote(unsigned long physids, physid_mask_t *map)
159 map->mask[0] = physids; 157 map->mask[0] = physids;
160} 158}
161 159
162/* Note: will create very large stack frames if physid_mask_t is big */
163#define physid_mask_of_physid(physid) \
164 ({ \
165 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \
166 physid_set(physid, __physid_mask); \
167 __physid_mask; \
168 })
169
170static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map) 160static inline void physid_set_mask_of_physid(int physid, physid_mask_t *map)
171{ 161{
172 physids_clear(*map); 162 physids_clear(*map);
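
Removing physid_mask_of_physid() drops the only helper that returned a physid_mask_t by value; with the mask now sized by MAX_LOCAL_APIC that temporary could be a large stack object. Callers switch to the in-place variant kept above, as in this sketch:

#include <asm/mpspec.h>

static void example_single_apicid_mask(int apicid)
{
        physid_mask_t mask;

        /* old (removed): mask = physid_mask_of_physid(apicid);   */
        /* new: the helper clears the mask and sets the one bit:  */
        physid_set_mask_of_physid(apicid, &mask);

        (void)mask;
}
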
diff --git a/arch/x86/include/asm/mpspec_def.h b/arch/x86/include/asm/mpspec_def.h
index 4a7f96d7c188..c0a955a9a087 100644
--- a/arch/x86/include/asm/mpspec_def.h
+++ b/arch/x86/include/asm/mpspec_def.h
@@ -15,13 +15,6 @@
15 15
16#ifdef CONFIG_X86_32 16#ifdef CONFIG_X86_32
17# define MAX_MPC_ENTRY 1024 17# define MAX_MPC_ENTRY 1024
18# define MAX_APICS 256
19#else
20# if NR_CPUS <= 255
21# define MAX_APICS 255
22# else
23# define MAX_APICS 32768
24# endif
25#endif 18#endif
26 19
27/* Intel MP Floating Pointer Structure */ 20/* Intel MP Floating Pointer Structure */
diff --git a/arch/x86/include/asm/mrst-vrtc.h b/arch/x86/include/asm/mrst-vrtc.h
new file mode 100644
index 000000000000..73668abdbedf
--- /dev/null
+++ b/arch/x86/include/asm/mrst-vrtc.h
@@ -0,0 +1,9 @@
1#ifndef _MRST_VRTC_H
2#define _MRST_VRTC_H
3
4extern unsigned char vrtc_cmos_read(unsigned char reg);
5extern void vrtc_cmos_write(unsigned char val, unsigned char reg);
6extern unsigned long vrtc_get_time(void);
7extern int vrtc_set_mmss(unsigned long nowtime);
8
9#endif
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
index 16350740edf6..719f00b28ff5 100644
--- a/arch/x86/include/asm/mrst.h
+++ b/arch/x86/include/asm/mrst.h
@@ -10,8 +10,13 @@
10 */ 10 */
11#ifndef _ASM_X86_MRST_H 11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H 12#define _ASM_X86_MRST_H
13
14#include <linux/sfi.h>
15
13extern int pci_mrst_init(void); 16extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table); 17extern int __init sfi_parse_mrtc(struct sfi_table_header *table);
18extern int sfi_mrtc_num;
19extern struct sfi_rtc_table_entry sfi_mrtc_array[];
15 20
16/* 21/*
17 * Medfield is the follow-up of Moorestown, it combines two chip solution into 22 * Medfield is the follow-up of Moorestown, it combines two chip solution into
@@ -26,7 +31,7 @@ enum mrst_cpu_type {
26}; 31};
27 32
28extern enum mrst_cpu_type __mrst_cpu_chip; 33extern enum mrst_cpu_type __mrst_cpu_chip;
29static enum mrst_cpu_type mrst_identify_cpu(void) 34static inline enum mrst_cpu_type mrst_identify_cpu(void)
30{ 35{
31 return __mrst_cpu_chip; 36 return __mrst_cpu_chip;
32} 37}
@@ -42,4 +47,19 @@ extern enum mrst_timer_options mrst_timer_options;
42#define SFI_MTMR_MAX_NUM 8 47#define SFI_MTMR_MAX_NUM 8
43#define SFI_MRTC_MAX 8 48#define SFI_MRTC_MAX 8
44 49
50extern struct console early_mrst_console;
51extern void mrst_early_console_init(void);
52
53extern struct console early_hsu_console;
54extern void hsu_early_console_init(void);
55
56extern void intel_scu_devices_create(void);
57extern void intel_scu_devices_destroy(void);
58
59/* VRTC timer */
60#define MRST_VRTC_MAP_SZ (1024)
61/*#define MRST_VRTC_PGOFFSET (0xc00) */
62
63extern void mrst_rtc_init(void);
64
45#endif /* _ASM_X86_MRST_H */ 65#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 986f7790fdb2..485b4f1f079b 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -36,8 +36,14 @@
36#define MSR_IA32_PERFCTR1 0x000000c2 36#define MSR_IA32_PERFCTR1 0x000000c2
37#define MSR_FSB_FREQ 0x000000cd 37#define MSR_FSB_FREQ 0x000000cd
38 38
39#define MSR_NHM_SNB_PKG_CST_CFG_CTL 0x000000e2
40#define NHM_C3_AUTO_DEMOTE (1UL << 25)
41#define NHM_C1_AUTO_DEMOTE (1UL << 26)
42#define ATM_LNC_C6_AUTO_DEMOTE (1UL << 25)
43
39#define MSR_MTRRcap 0x000000fe 44#define MSR_MTRRcap 0x000000fe
40#define MSR_IA32_BBL_CR_CTL 0x00000119 45#define MSR_IA32_BBL_CR_CTL 0x00000119
46#define MSR_IA32_BBL_CR_CTL3 0x0000011e
41 47
42#define MSR_IA32_SYSENTER_CS 0x00000174 48#define MSR_IA32_SYSENTER_CS 0x00000174
43#define MSR_IA32_SYSENTER_ESP 0x00000175 49#define MSR_IA32_SYSENTER_ESP 0x00000175
@@ -47,6 +53,9 @@
47#define MSR_IA32_MCG_STATUS 0x0000017a 53#define MSR_IA32_MCG_STATUS 0x0000017a
48#define MSR_IA32_MCG_CTL 0x0000017b 54#define MSR_IA32_MCG_CTL 0x0000017b
49 55
56#define MSR_OFFCORE_RSP_0 0x000001a6
57#define MSR_OFFCORE_RSP_1 0x000001a7
58
50#define MSR_IA32_PEBS_ENABLE 0x000003f1 59#define MSR_IA32_PEBS_ENABLE 0x000003f1
51#define MSR_IA32_DS_AREA 0x00000600 60#define MSR_IA32_DS_AREA 0x00000600
52#define MSR_IA32_PERF_CAPABILITIES 0x00000345 61#define MSR_IA32_PERF_CAPABILITIES 0x00000345
@@ -87,11 +96,15 @@
87#define MSR_IA32_MC0_ADDR 0x00000402 96#define MSR_IA32_MC0_ADDR 0x00000402
88#define MSR_IA32_MC0_MISC 0x00000403 97#define MSR_IA32_MC0_MISC 0x00000403
89 98
99#define MSR_AMD64_MC0_MASK 0xc0010044
100
90#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x)) 101#define MSR_IA32_MCx_CTL(x) (MSR_IA32_MC0_CTL + 4*(x))
91#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x)) 102#define MSR_IA32_MCx_STATUS(x) (MSR_IA32_MC0_STATUS + 4*(x))
92#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x)) 103#define MSR_IA32_MCx_ADDR(x) (MSR_IA32_MC0_ADDR + 4*(x))
93#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x)) 104#define MSR_IA32_MCx_MISC(x) (MSR_IA32_MC0_MISC + 4*(x))
94 105
106#define MSR_AMD64_MCx_MASK(x) (MSR_AMD64_MC0_MASK + (x))
107
95/* These are consecutive and not in the normal 4er MCE bank block */ 108/* These are consecutive and not in the normal 4er MCE bank block */
96#define MSR_IA32_MC0_CTL2 0x00000280 109#define MSR_IA32_MC0_CTL2 0x00000280
97#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x)) 110#define MSR_IA32_MCx_CTL2(x) (MSR_IA32_MC0_CTL2 + (x))
@@ -105,6 +118,7 @@
105 complete list. */ 118 complete list. */
106 119
107#define MSR_AMD64_PATCH_LEVEL 0x0000008b 120#define MSR_AMD64_PATCH_LEVEL 0x0000008b
121#define MSR_AMD64_TSC_RATIO 0xc0000104
108#define MSR_AMD64_NB_CFG 0xc001001f 122#define MSR_AMD64_NB_CFG 0xc001001f
109#define MSR_AMD64_PATCH_LOADER 0xc0010020 123#define MSR_AMD64_PATCH_LOADER 0xc0010020
110#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140 124#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
@@ -121,13 +135,18 @@
121#define MSR_AMD64_IBSDCLINAD 0xc0011038 135#define MSR_AMD64_IBSDCLINAD 0xc0011038
122#define MSR_AMD64_IBSDCPHYSAD 0xc0011039 136#define MSR_AMD64_IBSDCPHYSAD 0xc0011039
123#define MSR_AMD64_IBSCTL 0xc001103a 137#define MSR_AMD64_IBSCTL 0xc001103a
138#define MSR_AMD64_IBSBRTARGET 0xc001103b
139
140/* Fam 15h MSRs */
141#define MSR_F15H_PERF_CTL 0xc0010200
142#define MSR_F15H_PERF_CTR 0xc0010201
124 143
125/* Fam 10h MSRs */ 144/* Fam 10h MSRs */
126#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 145#define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058
127#define FAM10H_MMIO_CONF_ENABLE (1<<0) 146#define FAM10H_MMIO_CONF_ENABLE (1<<0)
128#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf 147#define FAM10H_MMIO_CONF_BUSRANGE_MASK 0xf
129#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 148#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
130#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff 149#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL
131#define FAM10H_MMIO_CONF_BASE_SHIFT 20 150#define FAM10H_MMIO_CONF_BASE_SHIFT 20
132#define MSR_FAM10H_NODE_ID 0xc001100c 151#define MSR_FAM10H_NODE_ID 0xc001100c
133 152
@@ -198,6 +217,7 @@
198#define MSR_IA32_TSC 0x00000010 217#define MSR_IA32_TSC 0x00000010
199#define MSR_IA32_PLATFORM_ID 0x00000017 218#define MSR_IA32_PLATFORM_ID 0x00000017
200#define MSR_IA32_EBL_CR_POWERON 0x0000002a 219#define MSR_IA32_EBL_CR_POWERON 0x0000002a
220#define MSR_EBC_FREQUENCY_ID 0x0000002c
201#define MSR_IA32_FEATURE_CONTROL 0x0000003a 221#define MSR_IA32_FEATURE_CONTROL 0x0000003a
202 222
203#define FEATURE_CONTROL_LOCKED (1<<0) 223#define FEATURE_CONTROL_LOCKED (1<<0)
@@ -251,6 +271,18 @@
251#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1) 271#define PACKAGE_THERM_INT_LOW_ENABLE (1 << 1)
252#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24) 272#define PACKAGE_THERM_INT_PLN_ENABLE (1 << 24)
253 273
274/* Thermal Thresholds Support */
275#define THERM_INT_THRESHOLD0_ENABLE (1 << 15)
276#define THERM_SHIFT_THRESHOLD0 8
277#define THERM_MASK_THRESHOLD0 (0x7f << THERM_SHIFT_THRESHOLD0)
278#define THERM_INT_THRESHOLD1_ENABLE (1 << 23)
279#define THERM_SHIFT_THRESHOLD1 16
280#define THERM_MASK_THRESHOLD1 (0x7f << THERM_SHIFT_THRESHOLD1)
281#define THERM_STATUS_THRESHOLD0 (1 << 6)
282#define THERM_LOG_THRESHOLD0 (1 << 7)
283#define THERM_STATUS_THRESHOLD1 (1 << 8)
284#define THERM_LOG_THRESHOLD1 (1 << 9)
285
254/* MISC_ENABLE bits: architectural */ 286/* MISC_ENABLE bits: architectural */
255#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0) 287#define MSR_IA32_MISC_ENABLE_FAST_STRING (1ULL << 0)
256#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1) 288#define MSR_IA32_MISC_ENABLE_TCC (1ULL << 1)
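
The new THERM_* threshold fields are 7-bit trip points at the given shifts, each with its own enable bit, while the STATUS/LOG bits report crossings. A sketch of programming threshold 0; it assumes the target register is MSR_IA32_THERM_INTERRUPT (defined elsewhere in this header) and that 'offset' is expressed in the same relative-to-TCC scale as the digital temperature readout.

#include <asm/msr.h>

static void example_set_therm_threshold0(unsigned int offset)
{
        u64 val;

        rdmsrl(MSR_IA32_THERM_INTERRUPT, val);
        val &= ~(u64)THERM_MASK_THRESHOLD0;
        val |= ((u64)offset << THERM_SHIFT_THRESHOLD0) & THERM_MASK_THRESHOLD0;
        val |= THERM_INT_THRESHOLD0_ENABLE;
        wrmsrl(MSR_IA32_THERM_INTERRUPT, val);
}
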
diff --git a/arch/x86/include/asm/mwait.h b/arch/x86/include/asm/mwait.h
new file mode 100644
index 000000000000..bcdff997668c
--- /dev/null
+++ b/arch/x86/include/asm/mwait.h
@@ -0,0 +1,15 @@
1#ifndef _ASM_X86_MWAIT_H
2#define _ASM_X86_MWAIT_H
3
4#define MWAIT_SUBSTATE_MASK 0xf
5#define MWAIT_CSTATE_MASK 0xf
6#define MWAIT_SUBSTATE_SIZE 4
7#define MWAIT_MAX_NUM_CSTATES 8
8
9#define CPUID_MWAIT_LEAF 5
10#define CPUID5_ECX_EXTENSIONS_SUPPORTED 0x1
11#define CPUID5_ECX_INTERRUPT_BREAK 0x2
12
13#define MWAIT_ECX_INTERRUPT_BREAK 0x1
14
15#endif /* _ASM_X86_MWAIT_H */
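
These constants describe CPUID leaf 5 (MONITOR/MWAIT): ECX carries the extension and interrupt-break capability bits, EDX packs a 4-bit count of supported sub-states for each of up to eight C-states, and MWAIT_ECX_INTERRUPT_BREAK is the hint a caller passes in ECX so interrupts break MWAIT even while masked. A decode sketch:

#include <asm/mwait.h>
#include <asm/processor.h>      /* cpuid() */

static void example_dump_mwait_substates(void)
{
        unsigned int eax, ebx, ecx, edx;
        int cstate;

        cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

        if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED))
                return;         /* sub-states not enumerated */

        for (cstate = 0; cstate < MWAIT_MAX_NUM_CSTATES; cstate++) {
                unsigned int substates =
                        (edx >> (cstate * MWAIT_SUBSTATE_SIZE)) &
                        MWAIT_SUBSTATE_MASK;
                /* 'substates' MWAIT sub C-states exist for this C-state */
                (void)substates;
        }
}
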
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 932f0f86b4b7..4886a68f267e 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -5,41 +5,14 @@
5#include <asm/irq.h> 5#include <asm/irq.h>
6#include <asm/io.h> 6#include <asm/io.h>
7 7
8#ifdef ARCH_HAS_NMI_WATCHDOG 8#ifdef CONFIG_X86_LOCAL_APIC
9 9
10/**
11 * do_nmi_callback
12 *
13 * Check to see if a callback exists and execute it. Return 1
14 * if the handler exists and was handled successfully.
15 */
16int do_nmi_callback(struct pt_regs *regs, int cpu);
17
18extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void);
20#if !defined(CONFIG_LOCKUP_DETECTOR)
21extern int nmi_watchdog_enabled;
22#endif
23extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 10extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
24extern int reserve_perfctr_nmi(unsigned int); 11extern int reserve_perfctr_nmi(unsigned int);
25extern void release_perfctr_nmi(unsigned int); 12extern void release_perfctr_nmi(unsigned int);
26extern int reserve_evntsel_nmi(unsigned int); 13extern int reserve_evntsel_nmi(unsigned int);
27extern void release_evntsel_nmi(unsigned int); 14extern void release_evntsel_nmi(unsigned int);
28 15
29extern void setup_apic_nmi_watchdog(void *);
30extern void stop_apic_nmi_watchdog(void *);
31extern void disable_timer_nmi_watchdog(void);
32extern void enable_timer_nmi_watchdog(void);
33extern int nmi_watchdog_tick(struct pt_regs *regs, unsigned reason);
34extern void cpu_nmi_set_wd_enabled(void);
35
36extern atomic_t nmi_active;
37extern unsigned int nmi_watchdog;
38#define NMI_NONE 0
39#define NMI_IO_APIC 1
40#define NMI_LOCAL_APIC 2
41#define NMI_INVALID 3
42
43struct ctl_table; 16struct ctl_table;
44extern int proc_nmi_enabled(struct ctl_table *, int , 17extern int proc_nmi_enabled(struct ctl_table *, int ,
45 void __user *, size_t *, loff_t *); 18 void __user *, size_t *, loff_t *);
@@ -47,33 +20,28 @@ extern int unknown_nmi_panic;
47 20
48void arch_trigger_all_cpu_backtrace(void); 21void arch_trigger_all_cpu_backtrace(void);
49#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace 22#define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace
23#endif
50 24
51static inline void localise_nmi_watchdog(void) 25/*
52{ 26 * Define some priorities for the nmi notifier call chain.
53 if (nmi_watchdog == NMI_IO_APIC) 27 *
54 nmi_watchdog = NMI_LOCAL_APIC; 28 * Create a local nmi bit that has a higher priority than
55} 29 * external nmis, because the local ones are more frequent.
30 *
31 * Also setup some default high/normal/low settings for
32 * subsystems to registers with. Using 4 bits to separate
33 * the priorities. This can go a lot higher if needed be.
34 */
56 35
57/* check if nmi_watchdog is active (ie was specified at boot) */ 36#define NMI_LOCAL_SHIFT 16 /* randomly picked */
58static inline int nmi_watchdog_active(void) 37#define NMI_LOCAL_BIT (1ULL << NMI_LOCAL_SHIFT)
59{ 38#define NMI_HIGH_PRIOR (1ULL << 8)
60 /* 39#define NMI_NORMAL_PRIOR (1ULL << 4)
61 * actually it should be: 40#define NMI_LOW_PRIOR (1ULL << 0)
62 * return (nmi_watchdog == NMI_LOCAL_APIC || 41#define NMI_LOCAL_HIGH_PRIOR (NMI_LOCAL_BIT | NMI_HIGH_PRIOR)
63 * nmi_watchdog == NMI_IO_APIC) 42#define NMI_LOCAL_NORMAL_PRIOR (NMI_LOCAL_BIT | NMI_NORMAL_PRIOR)
64 * but since they are power of two we could use a 43#define NMI_LOCAL_LOW_PRIOR (NMI_LOCAL_BIT | NMI_LOW_PRIOR)
65 * cheaper way --cvg
66 */
67 return nmi_watchdog & (NMI_LOCAL_APIC | NMI_IO_APIC);
68}
69#endif
70 44
71void lapic_watchdog_stop(void);
72int lapic_watchdog_init(unsigned nmi_hz);
73int lapic_wd_event(unsigned nmi_hz);
74unsigned lapic_adjust_nmi_hz(unsigned hz);
75void disable_lapic_nmi_watchdog(void);
76void enable_lapic_nmi_watchdog(void);
77void stop_nmi(void); 45void stop_nmi(void);
78void restart_nmi(void); 46void restart_nmi(void);
79 47
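
At this point NMI consumers still hook the die-notifier chain; the new priority values only order them, so local (per-CPU, e.g. perf counter) NMI handlers register ahead of handlers for external NMIs. A registration sketch; the handler body is an assumption, only the priority constant and the general notifier pattern are taken as given.

#include <linux/init.h>
#include <linux/kdebug.h>       /* register_die_notifier(), DIE_NMI */
#include <linux/notifier.h>
#include <asm/nmi.h>

static int example_nmi_notify(struct notifier_block *self,
                              unsigned long cmd, void *data)
{
        if (cmd != DIE_NMI)
                return NOTIFY_DONE;

        /* ... inspect the NMI source here ... */

        return NOTIFY_STOP;     /* claim it so lower-priority handlers skip it */
}

static struct notifier_block example_nmi_nb = {
        .notifier_call  = example_nmi_notify,
        .priority       = NMI_LOCAL_HIGH_PRIOR,
};

static int __init example_nmi_init(void)
{
        return register_die_notifier(&example_nmi_nb);
}
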
diff --git a/arch/x86/include/asm/nops.h b/arch/x86/include/asm/nops.h
index 6d8723a766cc..405b4032a60b 100644
--- a/arch/x86/include/asm/nops.h
+++ b/arch/x86/include/asm/nops.h
@@ -1,7 +1,13 @@
1#ifndef _ASM_X86_NOPS_H 1#ifndef _ASM_X86_NOPS_H
2#define _ASM_X86_NOPS_H 2#define _ASM_X86_NOPS_H
3 3
4/* Define nops for use with alternative() */ 4/*
5 * Define nops for use with alternative() and for tracing.
6 *
7 * *_NOP5_ATOMIC must be a single instruction.
8 */
9
10#define NOP_DS_PREFIX 0x3e
5 11
6/* generic versions from gas 12/* generic versions from gas
7 1: nop 13 1: nop
@@ -13,14 +19,15 @@
13 6: leal 0x00000000(%esi),%esi 19 6: leal 0x00000000(%esi),%esi
14 7: leal 0x00000000(,%esi,1),%esi 20 7: leal 0x00000000(,%esi,1),%esi
15*/ 21*/
16#define GENERIC_NOP1 ".byte 0x90\n" 22#define GENERIC_NOP1 0x90
17#define GENERIC_NOP2 ".byte 0x89,0xf6\n" 23#define GENERIC_NOP2 0x89,0xf6
18#define GENERIC_NOP3 ".byte 0x8d,0x76,0x00\n" 24#define GENERIC_NOP3 0x8d,0x76,0x00
19#define GENERIC_NOP4 ".byte 0x8d,0x74,0x26,0x00\n" 25#define GENERIC_NOP4 0x8d,0x74,0x26,0x00
20#define GENERIC_NOP5 GENERIC_NOP1 GENERIC_NOP4 26#define GENERIC_NOP5 GENERIC_NOP1,GENERIC_NOP4
21#define GENERIC_NOP6 ".byte 0x8d,0xb6,0x00,0x00,0x00,0x00\n" 27#define GENERIC_NOP6 0x8d,0xb6,0x00,0x00,0x00,0x00
22#define GENERIC_NOP7 ".byte 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00\n" 28#define GENERIC_NOP7 0x8d,0xb4,0x26,0x00,0x00,0x00,0x00
23#define GENERIC_NOP8 GENERIC_NOP1 GENERIC_NOP7 29#define GENERIC_NOP8 GENERIC_NOP1,GENERIC_NOP7
30#define GENERIC_NOP5_ATOMIC NOP_DS_PREFIX,GENERIC_NOP4
24 31
25/* Opteron 64bit nops 32/* Opteron 64bit nops
26 1: nop 33 1: nop
@@ -29,16 +36,17 @@
29 4: osp osp osp nop 36 4: osp osp osp nop
30*/ 37*/
31#define K8_NOP1 GENERIC_NOP1 38#define K8_NOP1 GENERIC_NOP1
32#define K8_NOP2 ".byte 0x66,0x90\n" 39#define K8_NOP2 0x66,K8_NOP1
33#define K8_NOP3 ".byte 0x66,0x66,0x90\n" 40#define K8_NOP3 0x66,K8_NOP2
34#define K8_NOP4 ".byte 0x66,0x66,0x66,0x90\n" 41#define K8_NOP4 0x66,K8_NOP3
35#define K8_NOP5 K8_NOP3 K8_NOP2 42#define K8_NOP5 K8_NOP3,K8_NOP2
36#define K8_NOP6 K8_NOP3 K8_NOP3 43#define K8_NOP6 K8_NOP3,K8_NOP3
37#define K8_NOP7 K8_NOP4 K8_NOP3 44#define K8_NOP7 K8_NOP4,K8_NOP3
38#define K8_NOP8 K8_NOP4 K8_NOP4 45#define K8_NOP8 K8_NOP4,K8_NOP4
46#define K8_NOP5_ATOMIC 0x66,K8_NOP4
39 47
40/* K7 nops 48/* K7 nops
41 uses eax dependencies (arbitary choice) 49 uses eax dependencies (arbitrary choice)
42 1: nop 50 1: nop
43 2: movl %eax,%eax 51 2: movl %eax,%eax
44 3: leal (,%eax,1),%eax 52 3: leal (,%eax,1),%eax
@@ -47,13 +55,14 @@
47 7: leal 0x00000000(,%eax,1),%eax 55 7: leal 0x00000000(,%eax,1),%eax
48*/ 56*/
49#define K7_NOP1 GENERIC_NOP1 57#define K7_NOP1 GENERIC_NOP1
50#define K7_NOP2 ".byte 0x8b,0xc0\n" 58#define K7_NOP2 0x8b,0xc0
51#define K7_NOP3 ".byte 0x8d,0x04,0x20\n" 59#define K7_NOP3 0x8d,0x04,0x20
52#define K7_NOP4 ".byte 0x8d,0x44,0x20,0x00\n" 60#define K7_NOP4 0x8d,0x44,0x20,0x00
53#define K7_NOP5 K7_NOP4 ASM_NOP1 61#define K7_NOP5 K7_NOP4,K7_NOP1
54#define K7_NOP6 ".byte 0x8d,0x80,0,0,0,0\n" 62#define K7_NOP6 0x8d,0x80,0,0,0,0
55#define K7_NOP7 ".byte 0x8D,0x04,0x05,0,0,0,0\n" 63#define K7_NOP7 0x8D,0x04,0x05,0,0,0,0
56#define K7_NOP8 K7_NOP7 ASM_NOP1 64#define K7_NOP8 K7_NOP7,K7_NOP1
65#define K7_NOP5_ATOMIC NOP_DS_PREFIX,K7_NOP4
57 66
58/* P6 nops 67/* P6 nops
59 uses eax dependencies (Intel-recommended choice) 68 uses eax dependencies (Intel-recommended choice)
@@ -69,52 +78,65 @@
69 There is kernel code that depends on this. 78 There is kernel code that depends on this.
70*/ 79*/
71#define P6_NOP1 GENERIC_NOP1 80#define P6_NOP1 GENERIC_NOP1
72#define P6_NOP2 ".byte 0x66,0x90\n" 81#define P6_NOP2 0x66,0x90
73#define P6_NOP3 ".byte 0x0f,0x1f,0x00\n" 82#define P6_NOP3 0x0f,0x1f,0x00
74#define P6_NOP4 ".byte 0x0f,0x1f,0x40,0\n" 83#define P6_NOP4 0x0f,0x1f,0x40,0
75#define P6_NOP5 ".byte 0x0f,0x1f,0x44,0x00,0\n" 84#define P6_NOP5 0x0f,0x1f,0x44,0x00,0
76#define P6_NOP6 ".byte 0x66,0x0f,0x1f,0x44,0x00,0\n" 85#define P6_NOP6 0x66,0x0f,0x1f,0x44,0x00,0
77#define P6_NOP7 ".byte 0x0f,0x1f,0x80,0,0,0,0\n" 86#define P6_NOP7 0x0f,0x1f,0x80,0,0,0,0
78#define P6_NOP8 ".byte 0x0f,0x1f,0x84,0x00,0,0,0,0\n" 87#define P6_NOP8 0x0f,0x1f,0x84,0x00,0,0,0,0
88#define P6_NOP5_ATOMIC P6_NOP5
89
90#define _ASM_MK_NOP(x) ".byte " __stringify(x) "\n"
79 91
80#if defined(CONFIG_MK7) 92#if defined(CONFIG_MK7)
81#define ASM_NOP1 K7_NOP1 93#define ASM_NOP1 _ASM_MK_NOP(K7_NOP1)
82#define ASM_NOP2 K7_NOP2 94#define ASM_NOP2 _ASM_MK_NOP(K7_NOP2)
83#define ASM_NOP3 K7_NOP3 95#define ASM_NOP3 _ASM_MK_NOP(K7_NOP3)
84#define ASM_NOP4 K7_NOP4 96#define ASM_NOP4 _ASM_MK_NOP(K7_NOP4)
85#define ASM_NOP5 K7_NOP5 97#define ASM_NOP5 _ASM_MK_NOP(K7_NOP5)
86#define ASM_NOP6 K7_NOP6 98#define ASM_NOP6 _ASM_MK_NOP(K7_NOP6)
87#define ASM_NOP7 K7_NOP7 99#define ASM_NOP7 _ASM_MK_NOP(K7_NOP7)
88#define ASM_NOP8 K7_NOP8 100#define ASM_NOP8 _ASM_MK_NOP(K7_NOP8)
101#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K7_NOP5_ATOMIC)
89#elif defined(CONFIG_X86_P6_NOP) 102#elif defined(CONFIG_X86_P6_NOP)
90#define ASM_NOP1 P6_NOP1 103#define ASM_NOP1 _ASM_MK_NOP(P6_NOP1)
91#define ASM_NOP2 P6_NOP2 104#define ASM_NOP2 _ASM_MK_NOP(P6_NOP2)
92#define ASM_NOP3 P6_NOP3 105#define ASM_NOP3 _ASM_MK_NOP(P6_NOP3)
93#define ASM_NOP4 P6_NOP4 106#define ASM_NOP4 _ASM_MK_NOP(P6_NOP4)
94#define ASM_NOP5 P6_NOP5 107#define ASM_NOP5 _ASM_MK_NOP(P6_NOP5)
95#define ASM_NOP6 P6_NOP6 108#define ASM_NOP6 _ASM_MK_NOP(P6_NOP6)
96#define ASM_NOP7 P6_NOP7 109#define ASM_NOP7 _ASM_MK_NOP(P6_NOP7)
97#define ASM_NOP8 P6_NOP8 110#define ASM_NOP8 _ASM_MK_NOP(P6_NOP8)
111#define ASM_NOP5_ATOMIC _ASM_MK_NOP(P6_NOP5_ATOMIC)
98#elif defined(CONFIG_X86_64) 112#elif defined(CONFIG_X86_64)
99#define ASM_NOP1 K8_NOP1 113#define ASM_NOP1 _ASM_MK_NOP(K8_NOP1)
100#define ASM_NOP2 K8_NOP2 114#define ASM_NOP2 _ASM_MK_NOP(K8_NOP2)
101#define ASM_NOP3 K8_NOP3 115#define ASM_NOP3 _ASM_MK_NOP(K8_NOP3)
102#define ASM_NOP4 K8_NOP4 116#define ASM_NOP4 _ASM_MK_NOP(K8_NOP4)
103#define ASM_NOP5 K8_NOP5 117#define ASM_NOP5 _ASM_MK_NOP(K8_NOP5)
104#define ASM_NOP6 K8_NOP6 118#define ASM_NOP6 _ASM_MK_NOP(K8_NOP6)
105#define ASM_NOP7 K8_NOP7 119#define ASM_NOP7 _ASM_MK_NOP(K8_NOP7)
106#define ASM_NOP8 K8_NOP8 120#define ASM_NOP8 _ASM_MK_NOP(K8_NOP8)
121#define ASM_NOP5_ATOMIC _ASM_MK_NOP(K8_NOP5_ATOMIC)
107#else 122#else
108#define ASM_NOP1 GENERIC_NOP1 123#define ASM_NOP1 _ASM_MK_NOP(GENERIC_NOP1)
109#define ASM_NOP2 GENERIC_NOP2 124#define ASM_NOP2 _ASM_MK_NOP(GENERIC_NOP2)
110#define ASM_NOP3 GENERIC_NOP3 125#define ASM_NOP3 _ASM_MK_NOP(GENERIC_NOP3)
111#define ASM_NOP4 GENERIC_NOP4 126#define ASM_NOP4 _ASM_MK_NOP(GENERIC_NOP4)
112#define ASM_NOP5 GENERIC_NOP5 127#define ASM_NOP5 _ASM_MK_NOP(GENERIC_NOP5)
113#define ASM_NOP6 GENERIC_NOP6 128#define ASM_NOP6 _ASM_MK_NOP(GENERIC_NOP6)
114#define ASM_NOP7 GENERIC_NOP7 129#define ASM_NOP7 _ASM_MK_NOP(GENERIC_NOP7)
115#define ASM_NOP8 GENERIC_NOP8 130#define ASM_NOP8 _ASM_MK_NOP(GENERIC_NOP8)
131#define ASM_NOP5_ATOMIC _ASM_MK_NOP(GENERIC_NOP5_ATOMIC)
116#endif 132#endif
117 133
118#define ASM_NOP_MAX 8 134#define ASM_NOP_MAX 8
135#define NOP_ATOMIC5 (ASM_NOP_MAX+1) /* Entry for the 5-byte atomic NOP */
136
137#ifndef __ASSEMBLY__
138extern const unsigned char * const *ideal_nops;
139extern void arch_init_ideal_nops(void);
140#endif
119 141
120#endif /* _ASM_X86_NOPS_H */ 142#endif /* _ASM_X86_NOPS_H */
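
Turning the NOP macros into bare comma-separated byte lists (with _ASM_MK_NOP() re-stringifying them for the ASM_NOP* inline-asm users) lets C code drop the same lists straight into byte arrays, which is what the new ideal_nops / arch_init_ideal_nops() machinery needs. A sketch of how such a table can be composed; the array and table names are illustrative, the real tables live in alternative.c.

#include <linux/stddef.h>
#include <asm/nops.h>

/* All K8 NOP encodings laid out back to back ...                  */
static const unsigned char k8nops[] = {
        K8_NOP1,
        K8_NOP2,
        K8_NOP3,
        K8_NOP4,
        K8_NOP5,
        K8_NOP6,
        K8_NOP7,
        K8_NOP8,
        K8_NOP5_ATOMIC
};

/* ... and a lookup table: entry [len] points at the len-byte NOP,
 * entry [NOP_ATOMIC5] at the single-instruction 5-byte variant.   */
static const unsigned char * const k8_nop_table[ASM_NOP_MAX + 2] = {
        NULL,
        k8nops,
        k8nops + 1,
        k8nops + 1 + 2,
        k8nops + 1 + 2 + 3,
        k8nops + 1 + 2 + 3 + 4,
        k8nops + 1 + 2 + 3 + 4 + 5,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
        k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8, /* NOP_ATOMIC5 */
};
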
diff --git a/arch/x86/include/asm/numa.h b/arch/x86/include/asm/numa.h
index 27da400d3138..bfacd2ccf651 100644
--- a/arch/x86/include/asm/numa.h
+++ b/arch/x86/include/asm/numa.h
@@ -1,5 +1,85 @@
1#ifndef _ASM_X86_NUMA_H
2#define _ASM_X86_NUMA_H
3
4#include <linux/nodemask.h>
5
6#include <asm/topology.h>
7#include <asm/apicdef.h>
8
9#ifdef CONFIG_NUMA
10
11#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
12#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
13
14/*
15 * Too small node sizes may confuse the VM badly. Usually they
16 * result from BIOS bugs. So dont recognize nodes as standalone
17 * NUMA entities that have less than this amount of RAM listed:
18 */
19#define NODE_MIN_SIZE (4*1024*1024)
20
21extern int numa_off;
22
23/*
24 * __apicid_to_node[] stores the raw mapping between physical apicid and
25 * node and is used to initialize cpu_to_node mapping.
26 *
27 * The mapping may be overridden by apic->numa_cpu_node() on 32bit and thus
28 * should be accessed by the accessors - set_apicid_to_node() and
29 * numa_cpu_node().
30 */
31extern s16 __apicid_to_node[MAX_LOCAL_APIC];
32extern nodemask_t numa_nodes_parsed __initdata;
33
34extern int __init numa_add_memblk(int nodeid, u64 start, u64 end);
35extern void __init numa_set_distance(int from, int to, int distance);
36
37static inline void set_apicid_to_node(int apicid, s16 node)
38{
39 __apicid_to_node[apicid] = node;
40}
41
42extern int __cpuinit numa_cpu_node(int cpu);
43
44#else /* CONFIG_NUMA */
45static inline void set_apicid_to_node(int apicid, s16 node)
46{
47}
48
49static inline int numa_cpu_node(int cpu)
50{
51 return NUMA_NO_NODE;
52}
53#endif /* CONFIG_NUMA */
54
1#ifdef CONFIG_X86_32 55#ifdef CONFIG_X86_32
2# include "numa_32.h" 56# include "numa_32.h"
3#else 57#else
4# include "numa_64.h" 58# include "numa_64.h"
5#endif 59#endif
60
61#ifdef CONFIG_NUMA
62extern void __cpuinit numa_set_node(int cpu, int node);
63extern void __cpuinit numa_clear_node(int cpu);
64extern void __init init_cpu_to_node(void);
65extern void __cpuinit numa_add_cpu(int cpu);
66extern void __cpuinit numa_remove_cpu(int cpu);
67#else /* CONFIG_NUMA */
68static inline void numa_set_node(int cpu, int node) { }
69static inline void numa_clear_node(int cpu) { }
70static inline void init_cpu_to_node(void) { }
71static inline void numa_add_cpu(int cpu) { }
72static inline void numa_remove_cpu(int cpu) { }
73#endif /* CONFIG_NUMA */
74
75#ifdef CONFIG_DEBUG_PER_CPU_MAPS
76void debug_cpumask_set_cpu(int cpu, int node, bool enable);
77#endif
78
79#ifdef CONFIG_NUMA_EMU
80#define FAKE_NODE_MIN_SIZE ((u64)32 << 20)
81#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
82void numa_emu_cmdline(char *);
83#endif /* CONFIG_NUMA_EMU */
84
85#endif /* _ASM_X86_NUMA_H */
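
A short sketch of how a firmware affinity parser is expected to use the new accessors: record the raw apicid-to-node mapping and mark the node as parsed, leaving cpu_to_node initialization to numa_cpu_node() later. The argument values are illustrative.

#include <linux/init.h>
#include <linux/nodemask.h>
#include <asm/numa.h>

static void __init example_record_affinity(int apicid, int node)
{
        set_apicid_to_node(apicid, node);       /* raw mapping, see comment above */
        node_set(node, numa_nodes_parsed);      /* remember that the node exists  */
}
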
diff --git a/arch/x86/include/asm/numa_32.h b/arch/x86/include/asm/numa_32.h
index a37229011b56..e7d6b8254742 100644
--- a/arch/x86/include/asm/numa_32.h
+++ b/arch/x86/include/asm/numa_32.h
@@ -1,9 +1,6 @@
1#ifndef _ASM_X86_NUMA_32_H 1#ifndef _ASM_X86_NUMA_32_H
2#define _ASM_X86_NUMA_32_H 2#define _ASM_X86_NUMA_32_H
3 3
4extern int pxm_to_nid(int pxm);
5extern void numa_remove_cpu(int cpu);
6
7#ifdef CONFIG_HIGHMEM 4#ifdef CONFIG_HIGHMEM
8extern void set_highmem_pages_init(void); 5extern void set_highmem_pages_init(void);
9#else 6#else
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index 823e070e7c26..0c05f7ae46e8 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -1,52 +1,6 @@
1#ifndef _ASM_X86_NUMA_64_H 1#ifndef _ASM_X86_NUMA_64_H
2#define _ASM_X86_NUMA_64_H 2#define _ASM_X86_NUMA_64_H
3 3
4#include <linux/nodemask.h>
5#include <asm/apicdef.h>
6
7struct bootnode {
8 u64 start;
9 u64 end;
10};
11
12extern int compute_hash_shift(struct bootnode *nodes, int numblks,
13 int *nodeids);
14
15#define ZONE_ALIGN (1UL << (MAX_ORDER+PAGE_SHIFT))
16
17extern void numa_init_array(void);
18extern int numa_off;
19
20extern s16 apicid_to_node[MAX_LOCAL_APIC];
21
22extern unsigned long numa_free_all_bootmem(void); 4extern unsigned long numa_free_all_bootmem(void);
23extern void setup_node_bootmem(int nodeid, unsigned long start,
24 unsigned long end);
25
26#ifdef CONFIG_NUMA
27/*
28 * Too small node sizes may confuse the VM badly. Usually they
29 * result from BIOS bugs. So dont recognize nodes as standalone
30 * NUMA entities that have less than this amount of RAM listed:
31 */
32#define NODE_MIN_SIZE (4*1024*1024)
33
34extern void __init init_cpu_to_node(void);
35extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
44#else
45static inline void init_cpu_to_node(void) { }
46static inline void numa_set_node(int cpu, int node) { }
47static inline void numa_clear_node(int cpu) { }
48static inline void numa_add_cpu(int cpu, int node) { }
49static inline void numa_remove_cpu(int cpu) { }
50#endif
51 5
52#endif /* _ASM_X86_NUMA_64_H */ 6#endif /* _ASM_X86_NUMA_64_H */
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 37c516545ec8..c3b3c322fd87 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -29,7 +29,7 @@
29#ifdef CONFIG_X86_NUMAQ 29#ifdef CONFIG_X86_NUMAQ
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int numaq_numa_init(void);
33extern int pci_numaq_init(void); 33extern int pci_numaq_init(void);
34 34
35extern void *xquad_portio; 35extern void *xquad_portio;
@@ -166,11 +166,6 @@ struct sys_cfg_data {
166 166
167void numaq_tsc_disable(void); 167void numaq_tsc_disable(void);
168 168
169#else
170static inline int get_memcfg_numaq(void)
171{
172 return 0;
173}
174#endif /* CONFIG_X86_NUMAQ */ 169#endif /* CONFIG_X86_NUMAQ */
175#endif /* _ASM_X86_NUMAQ_H */ 170#endif /* _ASM_X86_NUMAQ_H */
176 171
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 101229b0d8ed..5ca6801b75f3 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -20,7 +20,7 @@ extern struct olpc_platform_t olpc_platform_info;
20 20
21/* 21/*
22 * OLPC board IDs contain the major build number within the mask 0x0ff0, 22 * OLPC board IDs contain the major build number within the mask 0x0ff0,
23 * and the minor build number withing 0x000f. Pre-builds have a minor 23 * and the minor build number within 0x000f. Pre-builds have a minor
24 * number less than 8, and normal builds start at 8. For example, 0x0B10 24 * number less than 8, and normal builds start at 8. For example, 0x0B10
25 * is a PreB1, and 0x0C18 is a C1. 25 * is a PreB1, and 0x0C18 is a C1.
26 */ 26 */
@@ -89,6 +89,8 @@ extern int olpc_ec_mask_unset(uint8_t bits);
89/* EC commands */ 89/* EC commands */
90 90
91#define EC_FIRMWARE_REV 0x08 91#define EC_FIRMWARE_REV 0x08
92#define EC_WLAN_ENTER_RESET 0x35
93#define EC_WLAN_LEAVE_RESET 0x25
92 94
93/* SCI source values */ 95/* SCI source values */
94 96
@@ -105,10 +107,14 @@ extern int olpc_ec_mask_unset(uint8_t bits);
105/* GPIO assignments */ 107/* GPIO assignments */
106 108
107#define OLPC_GPIO_MIC_AC 1 109#define OLPC_GPIO_MIC_AC 1
108#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 110#define OLPC_GPIO_DCON_STAT0 5
111#define OLPC_GPIO_DCON_STAT1 6
112#define OLPC_GPIO_DCON_IRQ 7
109#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 113#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
110#define OLPC_GPIO_SMB_CLK geode_gpio(14) 114#define OLPC_GPIO_DCON_LOAD 11
111#define OLPC_GPIO_SMB_DATA geode_gpio(15) 115#define OLPC_GPIO_DCON_BLANK 12
116#define OLPC_GPIO_SMB_CLK 14
117#define OLPC_GPIO_SMB_DATA 15
112#define OLPC_GPIO_WORKAUX geode_gpio(24) 118#define OLPC_GPIO_WORKAUX geode_gpio(24)
113#define OLPC_GPIO_LID geode_gpio(26) 119#define OLPC_GPIO_LID geode_gpio(26)
114#define OLPC_GPIO_ECSCI geode_gpio(27) 120#define OLPC_GPIO_ECSCI geode_gpio(27)
diff --git a/arch/x86/include/asm/olpc_ofw.h b/arch/x86/include/asm/olpc_ofw.h
index 08fde475cb3b..24487712e0b1 100644
--- a/arch/x86/include/asm/olpc_ofw.h
+++ b/arch/x86/include/asm/olpc_ofw.h
@@ -6,7 +6,9 @@
6 6
7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */ 7#define OLPC_OFW_SIG 0x2057464F /* aka "OFW " */
8 8
9#ifdef CONFIG_OLPC_OPENFIRMWARE 9#ifdef CONFIG_OLPC
10
11extern bool olpc_ofw_is_installed(void);
10 12
11/* run an OFW command by calling into the firmware */ 13/* run an OFW command by calling into the firmware */
12#define olpc_ofw(name, args, res) \ 14#define olpc_ofw(name, args, res) \
@@ -21,11 +23,15 @@ extern void olpc_ofw_detect(void);
21/* install OFW's pde permanently into the kernel's pgtable */ 23/* install OFW's pde permanently into the kernel's pgtable */
22extern void setup_olpc_ofw_pgd(void); 24extern void setup_olpc_ofw_pgd(void);
23 25
24#else /* !CONFIG_OLPC_OPENFIRMWARE */ 26/* check if OFW was detected during boot */
27extern bool olpc_ofw_present(void);
28
29extern void olpc_dt_build_devicetree(void);
25 30
31#else /* !CONFIG_OLPC */
26static inline void olpc_ofw_detect(void) { } 32static inline void olpc_ofw_detect(void) { }
27static inline void setup_olpc_ofw_pgd(void) { } 33static inline void setup_olpc_ofw_pgd(void) { }
28 34static inline void olpc_dt_build_devicetree(void) { }
29#endif /* !CONFIG_OLPC_OPENFIRMWARE */ 35#endif /* !CONFIG_OLPC */
30 36
31#endif /* _ASM_X86_OLPC_OFW_H */ 37#endif /* _ASM_X86_OLPC_OFW_H */
diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h
index 6f1b7331313f..ade619ff9e2a 100644
--- a/arch/x86/include/asm/page_32_types.h
+++ b/arch/x86/include/asm/page_32_types.h
@@ -15,11 +15,7 @@
15 */ 15 */
16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL) 16#define __PAGE_OFFSET _AC(CONFIG_PAGE_OFFSET, UL)
17 17
18#ifdef CONFIG_4KSTACKS
19#define THREAD_ORDER 0
20#else
21#define THREAD_ORDER 1 18#define THREAD_ORDER 1
22#endif
23#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER) 19#define THREAD_SIZE (PAGE_SIZE << THREAD_ORDER)
24 20
25#define STACKFAULT_STACK 0 21#define STACKFAULT_STACK 0
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index a667f24c7254..bce688d54c12 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -2,13 +2,14 @@
2#define _ASM_X86_PAGE_DEFS_H 2#define _ASM_X86_PAGE_DEFS_H
3 3
4#include <linux/const.h> 4#include <linux/const.h>
5#include <linux/types.h>
5 6
6/* PAGE_SHIFT determines the page size */ 7/* PAGE_SHIFT determines the page size */
7#define PAGE_SHIFT 12 8#define PAGE_SHIFT 12
8#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT) 9#define PAGE_SIZE (_AC(1,UL) << PAGE_SHIFT)
9#define PAGE_MASK (~(PAGE_SIZE-1)) 10#define PAGE_MASK (~(PAGE_SIZE-1))
10 11
11#define __PHYSICAL_MASK ((phys_addr_t)(1ULL << __PHYSICAL_MASK_SHIFT) - 1) 12#define __PHYSICAL_MASK ((phys_addr_t)((1ULL << __PHYSICAL_MASK_SHIFT) - 1))
12#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1) 13#define __VIRTUAL_MASK ((1UL << __VIRTUAL_MASK_SHIFT) - 1)
13 14
14/* Cast PAGE_MASK to a signed type so that it is sign-extended if 15/* Cast PAGE_MASK to a signed type so that it is sign-extended if
@@ -45,11 +46,15 @@ extern int devmem_is_allowed(unsigned long pagenr);
45extern unsigned long max_low_pfn_mapped; 46extern unsigned long max_low_pfn_mapped;
46extern unsigned long max_pfn_mapped; 47extern unsigned long max_pfn_mapped;
47 48
49static inline phys_addr_t get_max_mapped(void)
50{
51 return (phys_addr_t)max_pfn_mapped << PAGE_SHIFT;
52}
53
48extern unsigned long init_memory_mapping(unsigned long start, 54extern unsigned long init_memory_mapping(unsigned long start,
49 unsigned long end); 55 unsigned long end);
50 56
51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn, 57extern void initmem_init(void);
52 int acpi, int k8);
53extern void free_initmem(void); 58extern void free_initmem(void);
54 59
55#endif /* !__ASSEMBLY__ */ 60#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 5653f43d90e5..ebbc4d8ab170 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -105,14 +105,14 @@ static inline void write_cr8(unsigned long x)
105} 105}
106#endif 106#endif
107 107
108static inline void raw_safe_halt(void) 108static inline void arch_safe_halt(void)
109{ 109{
110 PVOP_VCALL0(pv_irq_ops.safe_halt); 110 PVOP_VCALL0(pv_irq_ops.safe_halt);
111} 111}
112 112
113static inline void halt(void) 113static inline void halt(void)
114{ 114{
115 PVOP_VCALL0(pv_irq_ops.safe_halt); 115 PVOP_VCALL0(pv_irq_ops.halt);
116} 116}
117 117
118static inline void wbinvd(void) 118static inline void wbinvd(void)
@@ -416,11 +416,6 @@ static inline void paravirt_alloc_pmd(struct mm_struct *mm, unsigned long pfn)
416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn); 416 PVOP_VCALL2(pv_mmu_ops.alloc_pmd, mm, pfn);
417} 417}
418 418
419static inline void paravirt_alloc_pmd_clone(unsigned long pfn, unsigned long clonepfn,
420 unsigned long start, unsigned long count)
421{
422 PVOP_VCALL4(pv_mmu_ops.alloc_pmd_clone, pfn, clonepfn, start, count);
423}
424static inline void paravirt_release_pmd(unsigned long pfn) 419static inline void paravirt_release_pmd(unsigned long pfn)
425{ 420{
426 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn); 421 PVOP_VCALL1(pv_mmu_ops.release_pmd, pfn);
@@ -440,6 +435,11 @@ static inline void pte_update(struct mm_struct *mm, unsigned long addr,
440{ 435{
441 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); 436 PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep);
442} 437}
438static inline void pmd_update(struct mm_struct *mm, unsigned long addr,
439 pmd_t *pmdp)
440{
441 PVOP_VCALL3(pv_mmu_ops.pmd_update, mm, addr, pmdp);
442}
443 443
444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr, 444static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
445 pte_t *ptep) 445 pte_t *ptep)
@@ -447,6 +447,12 @@ static inline void pte_update_defer(struct mm_struct *mm, unsigned long addr,
447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep); 447 PVOP_VCALL3(pv_mmu_ops.pte_update_defer, mm, addr, ptep);
448} 448}
449 449
450static inline void pmd_update_defer(struct mm_struct *mm, unsigned long addr,
451 pmd_t *pmdp)
452{
453 PVOP_VCALL3(pv_mmu_ops.pmd_update_defer, mm, addr, pmdp);
454}
455
450static inline pte_t __pte(pteval_t val) 456static inline pte_t __pte(pteval_t val)
451{ 457{
452 pteval_t ret; 458 pteval_t ret;
@@ -548,6 +554,19 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
548 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); 554 PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte);
549} 555}
550 556
557#ifdef CONFIG_TRANSPARENT_HUGEPAGE
558static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
559 pmd_t *pmdp, pmd_t pmd)
560{
561 if (sizeof(pmdval_t) > sizeof(long))
562 /* 5 arg words */
563 pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd);
564 else
565 PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp,
566 native_pmd_val(pmd));
567}
568#endif
569
551static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) 570static inline void set_pmd(pmd_t *pmdp, pmd_t pmd)
552{ 571{
553 pmdval_t val = native_pmd_val(pmd); 572 pmdval_t val = native_pmd_val(pmd);
@@ -829,32 +848,32 @@ static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
829#define __PV_IS_CALLEE_SAVE(func) \ 848#define __PV_IS_CALLEE_SAVE(func) \
830 ((struct paravirt_callee_save) { func }) 849 ((struct paravirt_callee_save) { func })
831 850
832static inline unsigned long __raw_local_save_flags(void) 851static inline notrace unsigned long arch_local_save_flags(void)
833{ 852{
834 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl); 853 return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
835} 854}
836 855
837static inline void raw_local_irq_restore(unsigned long f) 856static inline notrace void arch_local_irq_restore(unsigned long f)
838{ 857{
839 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f); 858 PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
840} 859}
841 860
842static inline void raw_local_irq_disable(void) 861static inline notrace void arch_local_irq_disable(void)
843{ 862{
844 PVOP_VCALLEE0(pv_irq_ops.irq_disable); 863 PVOP_VCALLEE0(pv_irq_ops.irq_disable);
845} 864}
846 865
847static inline void raw_local_irq_enable(void) 866static inline notrace void arch_local_irq_enable(void)
848{ 867{
849 PVOP_VCALLEE0(pv_irq_ops.irq_enable); 868 PVOP_VCALLEE0(pv_irq_ops.irq_enable);
850} 869}
851 870
852static inline unsigned long __raw_local_irq_save(void) 871static inline notrace unsigned long arch_local_irq_save(void)
853{ 872{
854 unsigned long f; 873 unsigned long f;
855 874
856 f = __raw_local_save_flags(); 875 f = arch_local_save_flags();
857 raw_local_irq_disable(); 876 arch_local_irq_disable();
858 return f; 877 return f;
859} 878}
860 879
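
The paravirt changes above rename the raw_local_* irq helpers to arch_local_* (now notrace) while still routing them through the pv_irq_ops table, and arch_local_irq_save() is simply save_flags followed by irq_disable. A toy stand-in for that dispatch pattern, using ordinary function pointers rather than the kernel's patchable PVOP call sites; all names in the sketch are illustrative only:

    #include <stdio.h>

    /* Toy stand-in for pv_irq_ops: a table of indirect calls that a
     * hypervisor port could repopulate at boot. */
    struct irq_ops {
        unsigned long (*save_fl)(void);
        void (*irq_disable)(void);
        void (*restore_fl)(unsigned long flags);
    };

    static unsigned long fake_flags = 1;     /* 1 = interrupts enabled */

    static unsigned long native_save_fl(void)      { return fake_flags; }
    static void native_irq_disable(void)           { fake_flags = 0; }
    static void native_restore_fl(unsigned long f) { fake_flags = f; }

    static struct irq_ops pv_irq_ops = {
        .save_fl     = native_save_fl,
        .irq_disable = native_irq_disable,
        .restore_fl  = native_restore_fl,
    };

    /* arch_local_irq_save() is "save the flags, then disable",
     * exactly the composition used in the header above. */
    static unsigned long arch_local_irq_save(void)
    {
        unsigned long f = pv_irq_ops.save_fl();
        pv_irq_ops.irq_disable();
        return f;
    }

    int main(void)
    {
        unsigned long f = arch_local_irq_save();
        printf("saved flags %lu, now %lu\n", f, fake_flags);
        pv_irq_ops.restore_fl(f);
        printf("restored flags %lu\n", fake_flags);
        return 0;
    }
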
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index db9ef5532341..82885099c869 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -255,7 +255,6 @@ struct pv_mmu_ops {
255 */ 255 */
256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn); 256 void (*alloc_pte)(struct mm_struct *mm, unsigned long pfn);
257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn); 257 void (*alloc_pmd)(struct mm_struct *mm, unsigned long pfn);
258 void (*alloc_pmd_clone)(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count);
259 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn); 258 void (*alloc_pud)(struct mm_struct *mm, unsigned long pfn);
260 void (*release_pte)(unsigned long pfn); 259 void (*release_pte)(unsigned long pfn);
261 void (*release_pmd)(unsigned long pfn); 260 void (*release_pmd)(unsigned long pfn);
@@ -266,10 +265,16 @@ struct pv_mmu_ops {
266 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, 265 void (*set_pte_at)(struct mm_struct *mm, unsigned long addr,
267 pte_t *ptep, pte_t pteval); 266 pte_t *ptep, pte_t pteval);
268 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); 267 void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval);
268 void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr,
269 pmd_t *pmdp, pmd_t pmdval);
269 void (*pte_update)(struct mm_struct *mm, unsigned long addr, 270 void (*pte_update)(struct mm_struct *mm, unsigned long addr,
270 pte_t *ptep); 271 pte_t *ptep);
271 void (*pte_update_defer)(struct mm_struct *mm, 272 void (*pte_update_defer)(struct mm_struct *mm,
272 unsigned long addr, pte_t *ptep); 273 unsigned long addr, pte_t *ptep);
274 void (*pmd_update)(struct mm_struct *mm, unsigned long addr,
275 pmd_t *pmdp);
276 void (*pmd_update_defer)(struct mm_struct *mm,
277 unsigned long addr, pmd_t *pmdp);
273 278
274 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, 279 pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr,
275 pte_t *ptep); 280 pte_t *ptep);
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index d395540ff894..d498943b906c 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -7,6 +7,7 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <asm/scatterlist.h> 8#include <asm/scatterlist.h>
9#include <asm/io.h> 9#include <asm/io.h>
10#include <asm/x86_init.h>
10 11
11#ifdef __KERNEL__ 12#ifdef __KERNEL__
12 13
@@ -64,6 +65,7 @@ extern unsigned long pci_mem_start;
64 65
65#define PCIBIOS_MIN_CARDBUS_IO 0x4000 66#define PCIBIOS_MIN_CARDBUS_IO 0x4000
66 67
68extern int pcibios_enabled;
67void pcibios_config_init(void); 69void pcibios_config_init(void);
68struct pci_bus *pcibios_scan_root(int bus); 70struct pci_bus *pcibios_scan_root(int bus);
69 71
@@ -94,8 +96,36 @@ static inline void early_quirks(void) { }
94 96
95extern void pci_iommu_alloc(void); 97extern void pci_iommu_alloc(void);
96 98
97/* MSI arch hook */ 99#ifdef CONFIG_PCI_MSI
98#define arch_setup_msi_irqs arch_setup_msi_irqs 100/* MSI arch specific hooks */
101static inline int x86_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
102{
103 return x86_msi.setup_msi_irqs(dev, nvec, type);
104}
105
106static inline void x86_teardown_msi_irqs(struct pci_dev *dev)
107{
108 x86_msi.teardown_msi_irqs(dev);
109}
110
111static inline void x86_teardown_msi_irq(unsigned int irq)
112{
113 x86_msi.teardown_msi_irq(irq);
114}
115#define arch_setup_msi_irqs x86_setup_msi_irqs
116#define arch_teardown_msi_irqs x86_teardown_msi_irqs
117#define arch_teardown_msi_irq x86_teardown_msi_irq
118/* implemented in arch/x86/kernel/apic/io_apic. */
119int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type);
120void native_teardown_msi_irq(unsigned int irq);
121/* default to the implementation in drivers/lib/msi.c */
122#define HAVE_DEFAULT_MSI_TEARDOWN_IRQS
123void default_teardown_msi_irqs(struct pci_dev *dev);
124#else
125#define native_setup_msi_irqs NULL
126#define native_teardown_msi_irq NULL
127#define default_teardown_msi_irqs NULL
128#endif
99 129
100#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 130#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
101 131
@@ -105,8 +135,6 @@ extern void pci_iommu_alloc(void);
105#include "pci_64.h" 135#include "pci_64.h"
106#endif 136#endif
107 137
108void dma32_reserve_bootmem(void);
109
110/* implement the pci_ DMA API in terms of the generic device dma_ one */ 138/* implement the pci_ DMA API in terms of the generic device dma_ one */
111#include <asm-generic/pci-dma-compat.h> 139#include <asm-generic/pci-dma-compat.h>
112 140
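
The pci.h hunk above turns the MSI arch hooks into thin wrappers around an x86_msi ops structure, so a platform can substitute its own setup/teardown handlers at boot instead of the native io_apic implementation. A compact sketch of that indirection, with made-up function names standing in for the native and platform-specific implementations:

    #include <stdio.h>

    /* Illustrative only: models the x86_msi ops indirection, not the
     * real kernel structures. */
    struct msi_ops {
        int (*setup_msi_irqs)(int dev, int nvec, int type);
    };

    static int native_setup(int dev, int nvec, int type)
    {
        printf("native MSI setup: dev %d, %d vectors, type %d\n", dev, nvec, type);
        return 0;
    }

    static int platform_setup(int dev, int nvec, int type)
    {
        printf("platform override: dev %d, %d vectors, type %d\n", dev, nvec, type);
        return 0;
    }

    static struct msi_ops x86_msi = { .setup_msi_irqs = native_setup };

    /* arch_setup_msi_irqs() resolves to a call through the ops table */
    #define arch_setup_msi_irqs(dev, nvec, type) \
        x86_msi.setup_msi_irqs(dev, nvec, type)

    int main(void)
    {
        arch_setup_msi_irqs(1, 2, 0);              /* default path */
        x86_msi.setup_msi_irqs = platform_setup;   /* install override */
        arch_setup_msi_irqs(1, 2, 0);              /* overridden path */
        return 0;
    }
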
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index 49c7219826f9..704526734bef 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -47,6 +47,7 @@ enum pci_bf_sort_state {
47extern unsigned int pcibios_max_latency; 47extern unsigned int pcibios_max_latency;
48 48
49void pcibios_resource_survey(void); 49void pcibios_resource_survey(void);
50void pcibios_set_cache_line_size(void);
50 51
51/* pci-pc.c */ 52/* pci-pc.c */
52 53
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index cd28f9ad910d..a0a9779084d1 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,12 +45,28 @@
45#include <linux/stringify.h> 45#include <linux/stringify.h>
46 46
47#ifdef CONFIG_SMP 47#ifdef CONFIG_SMP
48#define __percpu_arg(x) "%%"__stringify(__percpu_seg)":%P" #x 48#define __percpu_prefix "%%"__stringify(__percpu_seg)":"
49#define __my_cpu_offset percpu_read(this_cpu_off) 49#define __my_cpu_offset percpu_read(this_cpu_off)
50
51/*
52 * Compared to the generic __my_cpu_offset version, the following
53 * saves one instruction and avoids clobbering a temp register.
54 */
55#define __this_cpu_ptr(ptr) \
56({ \
57 unsigned long tcp_ptr__; \
58 __verify_pcpu_ptr(ptr); \
59 asm volatile("add " __percpu_arg(1) ", %0" \
60 : "=r" (tcp_ptr__) \
61 : "m" (this_cpu_off), "0" (ptr)); \
62 (typeof(*(ptr)) __kernel __force *)tcp_ptr__; \
63})
50#else 64#else
51#define __percpu_arg(x) "%P" #x 65#define __percpu_prefix ""
52#endif 66#endif
53 67
68#define __percpu_arg(x) __percpu_prefix "%P" #x
69
54/* 70/*
55 * Initialized pointers to per-cpu variables needed for the boot 71 * Initialized pointers to per-cpu variables needed for the boot
56 * processor need to use these macros to get the proper address 72 * processor need to use these macros to get the proper address
@@ -216,6 +232,125 @@ do { \
216}) 232})
217 233
218/* 234/*
235 * Add return operation
236 */
237#define percpu_add_return_op(var, val) \
238({ \
239 typeof(var) paro_ret__ = val; \
240 switch (sizeof(var)) { \
241 case 1: \
242 asm("xaddb %0, "__percpu_arg(1) \
243 : "+q" (paro_ret__), "+m" (var) \
244 : : "memory"); \
245 break; \
246 case 2: \
247 asm("xaddw %0, "__percpu_arg(1) \
248 : "+r" (paro_ret__), "+m" (var) \
249 : : "memory"); \
250 break; \
251 case 4: \
252 asm("xaddl %0, "__percpu_arg(1) \
253 : "+r" (paro_ret__), "+m" (var) \
254 : : "memory"); \
255 break; \
256 case 8: \
257 asm("xaddq %0, "__percpu_arg(1) \
258 : "+re" (paro_ret__), "+m" (var) \
259 : : "memory"); \
260 break; \
261 default: __bad_percpu_size(); \
262 } \
263 paro_ret__ += val; \
264 paro_ret__; \
265})
266
267/*
268 * xchg is implemented using cmpxchg without a lock prefix. xchg is
269 * expensive due to the implied lock prefix. The processor cannot prefetch
270 * cachelines if xchg is used.
271 */
272#define percpu_xchg_op(var, nval) \
273({ \
274 typeof(var) pxo_ret__; \
275 typeof(var) pxo_new__ = (nval); \
276 switch (sizeof(var)) { \
277 case 1: \
278 asm("\n\tmov "__percpu_arg(1)",%%al" \
279 "\n1:\tcmpxchgb %2, "__percpu_arg(1) \
280 "\n\tjnz 1b" \
281 : "=&a" (pxo_ret__), "+m" (var) \
282 : "q" (pxo_new__) \
283 : "memory"); \
284 break; \
285 case 2: \
286 asm("\n\tmov "__percpu_arg(1)",%%ax" \
287 "\n1:\tcmpxchgw %2, "__percpu_arg(1) \
288 "\n\tjnz 1b" \
289 : "=&a" (pxo_ret__), "+m" (var) \
290 : "r" (pxo_new__) \
291 : "memory"); \
292 break; \
293 case 4: \
294 asm("\n\tmov "__percpu_arg(1)",%%eax" \
295 "\n1:\tcmpxchgl %2, "__percpu_arg(1) \
296 "\n\tjnz 1b" \
297 : "=&a" (pxo_ret__), "+m" (var) \
298 : "r" (pxo_new__) \
299 : "memory"); \
300 break; \
301 case 8: \
302 asm("\n\tmov "__percpu_arg(1)",%%rax" \
303 "\n1:\tcmpxchgq %2, "__percpu_arg(1) \
304 "\n\tjnz 1b" \
305 : "=&a" (pxo_ret__), "+m" (var) \
306 : "r" (pxo_new__) \
307 : "memory"); \
308 break; \
309 default: __bad_percpu_size(); \
310 } \
311 pxo_ret__; \
312})
313
314/*
315 * cmpxchg has no such implied lock semantics as a result it is much
316 * more efficient for cpu local operations.
317 */
318#define percpu_cmpxchg_op(var, oval, nval) \
319({ \
320 typeof(var) pco_ret__; \
321 typeof(var) pco_old__ = (oval); \
322 typeof(var) pco_new__ = (nval); \
323 switch (sizeof(var)) { \
324 case 1: \
325 asm("cmpxchgb %2, "__percpu_arg(1) \
326 : "=a" (pco_ret__), "+m" (var) \
327 : "q" (pco_new__), "0" (pco_old__) \
328 : "memory"); \
329 break; \
330 case 2: \
331 asm("cmpxchgw %2, "__percpu_arg(1) \
332 : "=a" (pco_ret__), "+m" (var) \
333 : "r" (pco_new__), "0" (pco_old__) \
334 : "memory"); \
335 break; \
336 case 4: \
337 asm("cmpxchgl %2, "__percpu_arg(1) \
338 : "=a" (pco_ret__), "+m" (var) \
339 : "r" (pco_new__), "0" (pco_old__) \
340 : "memory"); \
341 break; \
342 case 8: \
343 asm("cmpxchgq %2, "__percpu_arg(1) \
344 : "=a" (pco_ret__), "+m" (var) \
345 : "r" (pco_new__), "0" (pco_old__) \
346 : "memory"); \
347 break; \
348 default: __bad_percpu_size(); \
349 } \
350 pco_ret__; \
351})
352
353/*
219 * percpu_read() makes gcc load the percpu variable every time it is 354 * percpu_read() makes gcc load the percpu variable every time it is
220 * accessed while percpu_read_stable() allows the value to be cached. 355 * accessed while percpu_read_stable() allows the value to be cached.
221 * percpu_read_stable() is more efficient and can be used if its value 356 * percpu_read_stable() is more efficient and can be used if its value
@@ -253,6 +388,12 @@ do { \
253#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 388#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
254#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 389#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
255#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 390#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
391/*
392 * Generic fallback operations for __this_cpu_xchg_[1-4] are okay and much
393 * faster than an xchg with forced lock semantics.
394 */
395#define __this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
396#define __this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
256 397
257#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 398#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
258#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 399#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
@@ -272,6 +413,9 @@ do { \
272#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 413#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
273#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 414#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
274#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 415#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
416#define this_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
417#define this_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
418#define this_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
275 419
276#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val) 420#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
277#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val) 421#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
@@ -285,6 +429,49 @@ do { \
285#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val) 429#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
286#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val) 430#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
287#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val) 431#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
432#define irqsafe_cpu_xchg_1(pcp, nval) percpu_xchg_op(pcp, nval)
433#define irqsafe_cpu_xchg_2(pcp, nval) percpu_xchg_op(pcp, nval)
434#define irqsafe_cpu_xchg_4(pcp, nval) percpu_xchg_op(pcp, nval)
435
436#ifndef CONFIG_M386
437#define __this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
438#define __this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
439#define __this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
440#define __this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
441#define __this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
442#define __this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
443
444#define this_cpu_add_return_1(pcp, val) percpu_add_return_op(pcp, val)
445#define this_cpu_add_return_2(pcp, val) percpu_add_return_op(pcp, val)
446#define this_cpu_add_return_4(pcp, val) percpu_add_return_op(pcp, val)
447#define this_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
448#define this_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
449#define this_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
450
451#define irqsafe_cpu_cmpxchg_1(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
452#define irqsafe_cpu_cmpxchg_2(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
453#define irqsafe_cpu_cmpxchg_4(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
454#endif /* !CONFIG_M386 */
455
456#ifdef CONFIG_X86_CMPXCHG64
457#define percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2) \
458({ \
459 char __ret; \
460 typeof(o1) __o1 = o1; \
461 typeof(o1) __n1 = n1; \
462 typeof(o2) __o2 = o2; \
463 typeof(o2) __n2 = n2; \
464 typeof(o2) __dummy = n2; \
465 asm volatile("cmpxchg8b "__percpu_arg(1)"\n\tsetz %0\n\t" \
466 : "=a"(__ret), "=m" (pcp1), "=d"(__dummy) \
467 : "b"(__n1), "c"(__n2), "a"(__o1), "d"(__o2)); \
468 __ret; \
469})
470
471#define __this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
472#define this_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
473#define irqsafe_cpu_cmpxchg_double_4(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg8b_double(pcp1, o1, o2, n1, n2)
474#endif /* CONFIG_X86_CMPXCHG64 */
288 475
289/* 476/*
290 * Per cpu atomic 64 bit operations are only available under 64 bit. 477 * Per cpu atomic 64 bit operations are only available under 64 bit.
@@ -297,6 +484,7 @@ do { \
297#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 484#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
298#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 485#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
299#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 486#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
487#define __this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
300 488
301#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp)) 489#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
302#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val) 490#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
@@ -304,11 +492,48 @@ do { \
304#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 492#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
305#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 493#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
306#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 494#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
495#define this_cpu_add_return_8(pcp, val) percpu_add_return_op(pcp, val)
496#define this_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
497#define this_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
307 498
308#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val) 499#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
309#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val) 500#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
310#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val) 501#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
311#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val) 502#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
503#define irqsafe_cpu_xchg_8(pcp, nval) percpu_xchg_op(pcp, nval)
504#define irqsafe_cpu_cmpxchg_8(pcp, oval, nval) percpu_cmpxchg_op(pcp, oval, nval)
505
506/*
507 * Pretty complex macro to generate cmpxchg16 instruction. The instruction
508 * is not supported on early AMD64 processors so we must be able to emulate
509 * it in software. The address used in the cmpxchg16 instruction must be
510 * aligned to a 16 byte boundary.
511 */
512#ifdef CONFIG_SMP
513#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP3
514#else
515#define CMPXCHG16B_EMU_CALL "call this_cpu_cmpxchg16b_emu\n\t" ASM_NOP2
516#endif
517#define percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2) \
518({ \
519 char __ret; \
520 typeof(o1) __o1 = o1; \
521 typeof(o1) __n1 = n1; \
522 typeof(o2) __o2 = o2; \
523 typeof(o2) __n2 = n2; \
524 typeof(o2) __dummy; \
525 alternative_io(CMPXCHG16B_EMU_CALL, \
526 "cmpxchg16b " __percpu_prefix "(%%rsi)\n\tsetz %0\n\t", \
527 X86_FEATURE_CX16, \
528 ASM_OUTPUT2("=a"(__ret), "=d"(__dummy)), \
529 "S" (&pcp1), "b"(__n1), "c"(__n2), \
530 "a"(__o1), "d"(__o2) : "memory"); \
531 __ret; \
532})
533
534#define __this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
535#define this_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
536#define irqsafe_cpu_cmpxchg_double_8(pcp1, pcp2, o1, o2, n1, n2) percpu_cmpxchg16b_double(pcp1, o1, o2, n1, n2)
312 537
313#endif 538#endif
314 539
@@ -322,6 +547,33 @@ do { \
322 old__; \ 547 old__; \
323}) 548})
324 549
550static __always_inline int x86_this_cpu_constant_test_bit(unsigned int nr,
551 const unsigned long __percpu *addr)
552{
553 unsigned long __percpu *a = (unsigned long *)addr + nr / BITS_PER_LONG;
554
555 return ((1UL << (nr % BITS_PER_LONG)) & percpu_read(*a)) != 0;
556}
557
558static inline int x86_this_cpu_variable_test_bit(int nr,
559 const unsigned long __percpu *addr)
560{
561 int oldbit;
562
563 asm volatile("bt "__percpu_arg(2)",%1\n\t"
564 "sbb %0,%0"
565 : "=r" (oldbit)
566 : "m" (*(unsigned long *)addr), "Ir" (nr));
567
568 return oldbit;
569}
570
571#define x86_this_cpu_test_bit(nr, addr) \
572 (__builtin_constant_p((nr)) \
573 ? x86_this_cpu_constant_test_bit((nr), (addr)) \
574 : x86_this_cpu_variable_test_bit((nr), (addr)))
575
576
325#include <asm-generic/percpu.h> 577#include <asm-generic/percpu.h>
326 578
327/* We can use this directly for local CPU (faster). */ 579/* We can use this directly for local CPU (faster). */
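
The percpu_add_return_op() macro above relies on xadd returning the value held before the addition, which is why it adds val once more before handing the result back, and percpu_cmpxchg_op() is preferred over xchg because cmpxchg carries no implied lock prefix. A small user-space illustration of those old-versus-new semantics using the GCC __sync builtins, with arbitrary sample values:

    #include <stdio.h>

    int main(void)
    {
        long counter = 40;
        long val = 2;

        /* xadd-style: returns the value before the add (40) */
        long old = __sync_fetch_and_add(&counter, val);
        /* the "paro_ret__ += val" step: report the post-add value */
        long add_return = old + val;

        /* cmpxchg-style update, as in percpu_cmpxchg_op(): succeeds
         * because counter is now 42, and returns the value it saw */
        long seen = __sync_val_compare_and_swap(&counter, 42, 100);

        printf("old=%ld add_return=%ld counter=%ld cmpxchg saw %ld\n",
               old, add_return, counter, seen);
        return 0;
    }
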
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index 6e742cc4251b..d9d4dae305f6 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -111,20 +111,20 @@ union cpuid10_edx {
111#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 111#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
112 112
113/* IbsFetchCtl bits/masks */ 113/* IbsFetchCtl bits/masks */
114#define IBS_FETCH_RAND_EN (1ULL<<57) 114#define IBS_FETCH_RAND_EN (1ULL<<57)
115#define IBS_FETCH_VAL (1ULL<<49) 115#define IBS_FETCH_VAL (1ULL<<49)
116#define IBS_FETCH_ENABLE (1ULL<<48) 116#define IBS_FETCH_ENABLE (1ULL<<48)
117#define IBS_FETCH_CNT 0xFFFF0000ULL 117#define IBS_FETCH_CNT 0xFFFF0000ULL
118#define IBS_FETCH_MAX_CNT 0x0000FFFFULL 118#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
119 119
120/* IbsOpCtl bits */ 120/* IbsOpCtl bits */
121#define IBS_OP_CNT_CTL (1ULL<<19) 121#define IBS_OP_CNT_CTL (1ULL<<19)
122#define IBS_OP_VAL (1ULL<<18) 122#define IBS_OP_VAL (1ULL<<18)
123#define IBS_OP_ENABLE (1ULL<<17) 123#define IBS_OP_ENABLE (1ULL<<17)
124#define IBS_OP_MAX_CNT 0x0000FFFFULL 124#define IBS_OP_MAX_CNT 0x0000FFFFULL
125#define IBS_OP_MAX_CNT_EXT 0x007FFFFFULL /* not a register bit mask */
125 126
126#ifdef CONFIG_PERF_EVENTS 127#ifdef CONFIG_PERF_EVENTS
127extern void init_hw_perf_events(void);
128extern void perf_events_lapic_init(void); 128extern void perf_events_lapic_init(void);
129 129
130#define PERF_EVENT_INDEX_OFFSET 0 130#define PERF_EVENT_INDEX_OFFSET 0
@@ -155,7 +155,6 @@ extern unsigned long perf_misc_flags(struct pt_regs *regs);
155} 155}
156 156
157#else 157#else
158static inline void init_hw_perf_events(void) { }
159static inline void perf_events_lapic_init(void) { } 158static inline void perf_events_lapic_init(void) { }
160#endif 159#endif
161 160
diff --git a/arch/x86/include/asm/perf_event_p4.h b/arch/x86/include/asm/perf_event_p4.h
index def500776b16..56fd9e3abbda 100644
--- a/arch/x86/include/asm/perf_event_p4.h
+++ b/arch/x86/include/asm/perf_event_p4.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 */ 3 */
4 4
5#ifndef PERF_EVENT_P4_H 5#ifndef PERF_EVENT_P4_H
@@ -9,7 +9,7 @@
9#include <linux/bitops.h> 9#include <linux/bitops.h>
10 10
11/* 11/*
12 * NetBurst has perfomance MSRs shared between 12 * NetBurst has performance MSRs shared between
13 * threads if HT is turned on, ie for both logical 13 * threads if HT is turned on, ie for both logical
14 * processors (mem: in turn in Atom with HT support 14 * processors (mem: in turn in Atom with HT support
15 * perf-MSRs are not shared and every thread has its 15 * perf-MSRs are not shared and every thread has its
@@ -20,6 +20,10 @@
20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR) 20#define ARCH_P4_MAX_ESCR (ARCH_P4_TOTAL_ESCR - ARCH_P4_RESERVED_ESCR)
21#define ARCH_P4_MAX_CCCR (18) 21#define ARCH_P4_MAX_CCCR (18)
22 22
23#define ARCH_P4_CNTRVAL_BITS (40)
24#define ARCH_P4_CNTRVAL_MASK ((1ULL << ARCH_P4_CNTRVAL_BITS) - 1)
25#define ARCH_P4_UNFLAGGED_BIT ((1ULL) << (ARCH_P4_CNTRVAL_BITS - 1))
26
23#define P4_ESCR_EVENT_MASK 0x7e000000U 27#define P4_ESCR_EVENT_MASK 0x7e000000U
24#define P4_ESCR_EVENT_SHIFT 25 28#define P4_ESCR_EVENT_SHIFT 25
25#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U 29#define P4_ESCR_EVENTMASK_MASK 0x01fffe00U
@@ -36,19 +40,6 @@
36#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT) 40#define P4_ESCR_EMASK(v) ((v) << P4_ESCR_EVENTMASK_SHIFT)
37#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT) 41#define P4_ESCR_TAG(v) ((v) << P4_ESCR_TAG_SHIFT)
38 42
39/* Non HT mask */
40#define P4_ESCR_MASK \
41 (P4_ESCR_EVENT_MASK | \
42 P4_ESCR_EVENTMASK_MASK | \
43 P4_ESCR_TAG_MASK | \
44 P4_ESCR_TAG_ENABLE | \
45 P4_ESCR_T0_OS | \
46 P4_ESCR_T0_USR)
47
48/* HT mask */
49#define P4_ESCR_MASK_HT \
50 (P4_ESCR_MASK | P4_ESCR_T1_OS | P4_ESCR_T1_USR)
51
52#define P4_CCCR_OVF 0x80000000U 43#define P4_CCCR_OVF 0x80000000U
53#define P4_CCCR_CASCADE 0x40000000U 44#define P4_CCCR_CASCADE 0x40000000U
54#define P4_CCCR_OVF_PMI_T0 0x04000000U 45#define P4_CCCR_OVF_PMI_T0 0x04000000U
@@ -70,23 +61,6 @@
70#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT) 61#define P4_CCCR_THRESHOLD(v) ((v) << P4_CCCR_THRESHOLD_SHIFT)
71#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT) 62#define P4_CCCR_ESEL(v) ((v) << P4_CCCR_ESCR_SELECT_SHIFT)
72 63
73/* Non HT mask */
74#define P4_CCCR_MASK \
75 (P4_CCCR_OVF | \
76 P4_CCCR_CASCADE | \
77 P4_CCCR_OVF_PMI_T0 | \
78 P4_CCCR_FORCE_OVF | \
79 P4_CCCR_EDGE | \
80 P4_CCCR_THRESHOLD_MASK | \
81 P4_CCCR_COMPLEMENT | \
82 P4_CCCR_COMPARE | \
83 P4_CCCR_ESCR_SELECT_MASK | \
84 P4_CCCR_ENABLE)
85
86/* HT mask */
87#define P4_CCCR_MASK_HT \
88 (P4_CCCR_MASK | P4_CCCR_OVF_PMI_T1 | P4_CCCR_THREAD_ANY)
89
90#define P4_GEN_ESCR_EMASK(class, name, bit) \ 64#define P4_GEN_ESCR_EMASK(class, name, bit) \
91 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT) 65 class##__##name = ((1 << bit) << P4_ESCR_EVENTMASK_SHIFT)
92#define P4_ESCR_EMASK_BIT(class, name) class##__##name 66#define P4_ESCR_EMASK_BIT(class, name) class##__##name
@@ -127,6 +101,28 @@
127#define P4_CONFIG_HT_SHIFT 63 101#define P4_CONFIG_HT_SHIFT 63
128#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT) 102#define P4_CONFIG_HT (1ULL << P4_CONFIG_HT_SHIFT)
129 103
104/*
105 * The bits we allow to pass for RAW events
106 */
107#define P4_CONFIG_MASK_ESCR \
108 P4_ESCR_EVENT_MASK | \
109 P4_ESCR_EVENTMASK_MASK | \
110 P4_ESCR_TAG_MASK | \
111 P4_ESCR_TAG_ENABLE
112
113#define P4_CONFIG_MASK_CCCR \
114 P4_CCCR_EDGE | \
115 P4_CCCR_THRESHOLD_MASK | \
116 P4_CCCR_COMPLEMENT | \
117 P4_CCCR_COMPARE | \
118 P4_CCCR_THREAD_ANY | \
119 P4_CCCR_RESERVED
120
121/* some dangerous bits are reserved for kernel internals */
122#define P4_CONFIG_MASK \
123 (p4_config_pack_escr(P4_CONFIG_MASK_ESCR)) | \
124 (p4_config_pack_cccr(P4_CONFIG_MASK_CCCR))
125
130static inline bool p4_is_event_cascaded(u64 config) 126static inline bool p4_is_event_cascaded(u64 config)
131{ 127{
132 u32 cccr = p4_config_unpack_cccr(config); 128 u32 cccr = p4_config_unpack_cccr(config);
@@ -752,14 +748,6 @@ enum P4_ESCR_EMASKS {
752}; 748};
753 749
754/* 750/*
755 * P4 PEBS specifics (Replay Event only)
756 *
757 * Format (bits):
758 * 0-6: metric from P4_PEBS_METRIC enum
759 * 7 : reserved
760 * 8 : reserved
761 * 9-11 : reserved
762 *
763 * Note we have UOP and PEBS bits reserved for now 751 * Note we have UOP and PEBS bits reserved for now
764 * just in case if we will need them once 752 * just in case if we will need them once
765 */ 753 */
@@ -796,5 +784,60 @@ enum P4_PEBS_METRIC {
796 P4_PEBS_METRIC__max 784 P4_PEBS_METRIC__max
797}; 785};
798 786
787/*
788 * Notes on internal configuration of ESCR+CCCR tuples
789 *
790 * Since P4 has quite the different architecture of
791 * performance registers in compare with "architectural"
792 * once and we have on 64 bits to keep configuration
793 * of performance event, the following trick is used.
794 *
795 * 1) Since both ESCR and CCCR registers have only low
796 * 32 bits valuable, we pack them into a single 64 bit
797 * configuration. Low 32 bits of such config correspond
798 * to low 32 bits of CCCR register and high 32 bits
799 * correspond to low 32 bits of ESCR register.
800 *
801 * 2) The meaning of every bit of such config field can
802 * be found in Intel SDM but it should be noted that
803 * we "borrow" some reserved bits for own usage and
804 * clean them or set to a proper value when we do
805 * a real write to hardware registers.
806 *
807 * 3) The format of bits of config is the following
808 * and should be either 0 or set to some predefined
809 * values:
810 *
811 * Low 32 bits
812 * -----------
813 * 0-6: P4_PEBS_METRIC enum
814 * 7-11: reserved
815 * 12: reserved (Enable)
816 * 13-15: reserved (ESCR select)
817 * 16-17: Active Thread
818 * 18: Compare
819 * 19: Complement
820 * 20-23: Threshold
821 * 24: Edge
822 * 25: reserved (FORCE_OVF)
823 * 26: reserved (OVF_PMI_T0)
824 * 27: reserved (OVF_PMI_T1)
825 * 28-29: reserved
826 * 30: reserved (Cascade)
827 * 31: reserved (OVF)
828 *
829 * High 32 bits
830 * ------------
831 * 0: reserved (T1_USR)
832 * 1: reserved (T1_OS)
833 * 2: reserved (T0_USR)
834 * 3: reserved (T0_OS)
835 * 4: Tag Enable
836 * 5-8: Tag Value
837 * 9-24: Event Mask (may use P4_ESCR_EMASK_BIT helper)
838 * 25-30: enum P4_EVENTS
839 * 31: reserved (HT thread)
840 */
841
799#endif /* PERF_EVENT_P4_H */ 842#endif /* PERF_EVENT_P4_H */
800 843
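
The long comment block above describes how a P4 event configuration packs the CCCR image into the low 32 bits of a 64-bit config and the ESCR image into the high 32 bits. A short sketch of that packing, mirroring what the p4_config_pack/unpack style helpers do; the sample register values are arbitrary:

    #include <stdio.h>
    #include <stdint.h>

    static uint64_t pack_escr(uint32_t escr) { return (uint64_t)escr << 32; }
    static uint64_t pack_cccr(uint32_t cccr) { return (uint64_t)cccr; }

    static uint32_t unpack_escr(uint64_t cfg) { return (uint32_t)(cfg >> 32); }
    static uint32_t unpack_cccr(uint64_t cfg) { return (uint32_t)cfg; }

    int main(void)
    {
        uint32_t escr = 0x0003b000;   /* event + event-mask bits (example) */
        uint32_t cccr = 0x00039000;   /* compare/threshold bits (example) */
        uint64_t config = pack_escr(escr) | pack_cccr(cccr);

        printf("config=%#018llx escr=%#010x cccr=%#010x\n",
               (unsigned long long)config,
               unpack_escr(config), unpack_cccr(config));
        return 0;
    }
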
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 271de94c3810..b4389a468fb6 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -92,7 +92,7 @@ static inline void pmd_free(struct mm_struct *mm, pmd_t *pmd)
92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd); 92extern void ___pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd);
93 93
94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd, 94static inline void __pmd_free_tlb(struct mmu_gather *tlb, pmd_t *pmd,
95 unsigned long adddress) 95 unsigned long address)
96{ 96{
97 ___pmd_free_tlb(tlb, pmd); 97 ___pmd_free_tlb(tlb, pmd);
98} 98}
diff --git a/arch/x86/include/asm/pgtable-2level.h b/arch/x86/include/asm/pgtable-2level.h
index 2334982b339e..98391db840c6 100644
--- a/arch/x86/include/asm/pgtable-2level.h
+++ b/arch/x86/include/asm/pgtable-2level.h
@@ -46,6 +46,15 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 46#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
47#endif 47#endif
48 48
49#ifdef CONFIG_SMP
50static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
51{
52 return __pmd(xchg((pmdval_t *)xp, 0));
53}
54#else
55#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
56#endif
57
49/* 58/*
50 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken, 59 * Bits _PAGE_BIT_PRESENT, _PAGE_BIT_FILE and _PAGE_BIT_PROTNONE are taken,
51 * split up the 29 bits of offset into this range: 60 * split up the 29 bits of offset into this range:
diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
index 177b0165ea01..effff47a3c82 100644
--- a/arch/x86/include/asm/pgtable-3level.h
+++ b/arch/x86/include/asm/pgtable-3level.h
@@ -69,8 +69,6 @@ static inline void native_pmd_clear(pmd_t *pmd)
69 69
70static inline void pud_clear(pud_t *pudp) 70static inline void pud_clear(pud_t *pudp)
71{ 71{
72 unsigned long pgd;
73
74 set_pud(pudp, __pud(0)); 72 set_pud(pudp, __pud(0));
75 73
76 /* 74 /*
@@ -79,13 +77,10 @@ static inline void pud_clear(pud_t *pudp)
79 * section 8.1: in PAE mode we explicitly have to flush the 77 * section 8.1: in PAE mode we explicitly have to flush the
80 * TLB via cr3 if the top-level pgd is changed... 78 * TLB via cr3 if the top-level pgd is changed...
81 * 79 *
82 * Make sure the pud entry we're updating is within the 80 * Currently all places where pud_clear() is called either have
83 * current pgd to avoid unnecessary TLB flushes. 81 * flush_tlb_mm() followed or don't need TLB flush (x86_64 code or
82 * pud_clear_bad()), so we don't need TLB flush here.
84 */ 83 */
85 pgd = read_cr3();
86 if (__pa(pudp) >= pgd && __pa(pudp) <
87 (pgd + sizeof(pgd_t)*PTRS_PER_PGD))
88 write_cr3(pgd);
89} 84}
90 85
91#ifdef CONFIG_SMP 86#ifdef CONFIG_SMP
@@ -104,6 +99,29 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
104#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) 99#define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
105#endif 100#endif
106 101
102#ifdef CONFIG_SMP
103union split_pmd {
104 struct {
105 u32 pmd_low;
106 u32 pmd_high;
107 };
108 pmd_t pmd;
109};
110static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
111{
112 union split_pmd res, *orig = (union split_pmd *)pmdp;
113
114 /* xchg acts as a barrier before setting of the high bits */
115 res.pmd_low = xchg(&orig->pmd_low, 0);
116 res.pmd_high = orig->pmd_high;
117 orig->pmd_high = 0;
118
119 return res.pmd;
120}
121#else
122#define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
123#endif
124
107/* 125/*
108 * Bits 0, 6 and 7 are taken in the low part of the pte, 126 * Bits 0, 6 and 7 are taken in the low part of the pte,
109 * put the 32 bits of offset into the high part. 127 * put the 32 bits of offset into the high part.
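
The PAE version of native_pmdp_get_and_clear() above reads the 64-bit pmd as two 32-bit halves and clears the low half, which carries the present bit, with an atomic xchg before touching the high half. A user-space sketch of the split_pmd layout and that ordering; it only illustrates the idea and makes no SMP guarantees of its own:

    #include <stdio.h>
    #include <stdint.h>

    union split_pmd {
        struct {
            uint32_t pmd_low;    /* holds the present bit */
            uint32_t pmd_high;
        };
        uint64_t pmd;
    };

    int main(void)
    {
        union split_pmd entry = { .pmd = 0x00000001234561e7ULL };
        union split_pmd res, *orig = &entry;

        /* xchg on the low half: the entry becomes non-present atomically */
        res.pmd_low = __sync_lock_test_and_set(&orig->pmd_low, 0);
        /* only then read and zero the high half */
        res.pmd_high = orig->pmd_high;
        orig->pmd_high = 0;

        printf("returned pmd=%#llx, slot now=%#llx\n",
               (unsigned long long)res.pmd, (unsigned long long)entry.pmd);
        return 0;
    }
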
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index a34c785c5a63..18601c86fab1 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -28,11 +28,14 @@ extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)];
28extern spinlock_t pgd_lock; 28extern spinlock_t pgd_lock;
29extern struct list_head pgd_list; 29extern struct list_head pgd_list;
30 30
31extern struct mm_struct *pgd_page_get_mm(struct page *page);
32
31#ifdef CONFIG_PARAVIRT 33#ifdef CONFIG_PARAVIRT
32#include <asm/paravirt.h> 34#include <asm/paravirt.h>
33#else /* !CONFIG_PARAVIRT */ 35#else /* !CONFIG_PARAVIRT */
34#define set_pte(ptep, pte) native_set_pte(ptep, pte) 36#define set_pte(ptep, pte) native_set_pte(ptep, pte)
35#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) 37#define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte)
38#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd)
36 39
37#define set_pte_atomic(ptep, pte) \ 40#define set_pte_atomic(ptep, pte) \
38 native_set_pte_atomic(ptep, pte) 41 native_set_pte_atomic(ptep, pte)
@@ -57,6 +60,8 @@ extern struct list_head pgd_list;
57 60
58#define pte_update(mm, addr, ptep) do { } while (0) 61#define pte_update(mm, addr, ptep) do { } while (0)
59#define pte_update_defer(mm, addr, ptep) do { } while (0) 62#define pte_update_defer(mm, addr, ptep) do { } while (0)
63#define pmd_update(mm, addr, ptep) do { } while (0)
64#define pmd_update_defer(mm, addr, ptep) do { } while (0)
60 65
61#define pgd_val(x) native_pgd_val(x) 66#define pgd_val(x) native_pgd_val(x)
62#define __pgd(x) native_make_pgd(x) 67#define __pgd(x) native_make_pgd(x)
@@ -92,6 +97,11 @@ static inline int pte_young(pte_t pte)
92 return pte_flags(pte) & _PAGE_ACCESSED; 97 return pte_flags(pte) & _PAGE_ACCESSED;
93} 98}
94 99
100static inline int pmd_young(pmd_t pmd)
101{
102 return pmd_flags(pmd) & _PAGE_ACCESSED;
103}
104
95static inline int pte_write(pte_t pte) 105static inline int pte_write(pte_t pte)
96{ 106{
97 return pte_flags(pte) & _PAGE_RW; 107 return pte_flags(pte) & _PAGE_RW;
@@ -140,6 +150,23 @@ static inline int pmd_large(pmd_t pte)
140 (_PAGE_PSE | _PAGE_PRESENT); 150 (_PAGE_PSE | _PAGE_PRESENT);
141} 151}
142 152
153#ifdef CONFIG_TRANSPARENT_HUGEPAGE
154static inline int pmd_trans_splitting(pmd_t pmd)
155{
156 return pmd_val(pmd) & _PAGE_SPLITTING;
157}
158
159static inline int pmd_trans_huge(pmd_t pmd)
160{
161 return pmd_val(pmd) & _PAGE_PSE;
162}
163
164static inline int has_transparent_hugepage(void)
165{
166 return cpu_has_pse;
167}
168#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
169
143static inline pte_t pte_set_flags(pte_t pte, pteval_t set) 170static inline pte_t pte_set_flags(pte_t pte, pteval_t set)
144{ 171{
145 pteval_t v = native_pte_val(pte); 172 pteval_t v = native_pte_val(pte);
@@ -214,6 +241,55 @@ static inline pte_t pte_mkspecial(pte_t pte)
214 return pte_set_flags(pte, _PAGE_SPECIAL); 241 return pte_set_flags(pte, _PAGE_SPECIAL);
215} 242}
216 243
244static inline pmd_t pmd_set_flags(pmd_t pmd, pmdval_t set)
245{
246 pmdval_t v = native_pmd_val(pmd);
247
248 return __pmd(v | set);
249}
250
251static inline pmd_t pmd_clear_flags(pmd_t pmd, pmdval_t clear)
252{
253 pmdval_t v = native_pmd_val(pmd);
254
255 return __pmd(v & ~clear);
256}
257
258static inline pmd_t pmd_mkold(pmd_t pmd)
259{
260 return pmd_clear_flags(pmd, _PAGE_ACCESSED);
261}
262
263static inline pmd_t pmd_wrprotect(pmd_t pmd)
264{
265 return pmd_clear_flags(pmd, _PAGE_RW);
266}
267
268static inline pmd_t pmd_mkdirty(pmd_t pmd)
269{
270 return pmd_set_flags(pmd, _PAGE_DIRTY);
271}
272
273static inline pmd_t pmd_mkhuge(pmd_t pmd)
274{
275 return pmd_set_flags(pmd, _PAGE_PSE);
276}
277
278static inline pmd_t pmd_mkyoung(pmd_t pmd)
279{
280 return pmd_set_flags(pmd, _PAGE_ACCESSED);
281}
282
283static inline pmd_t pmd_mkwrite(pmd_t pmd)
284{
285 return pmd_set_flags(pmd, _PAGE_RW);
286}
287
288static inline pmd_t pmd_mknotpresent(pmd_t pmd)
289{
290 return pmd_clear_flags(pmd, _PAGE_PRESENT);
291}
292
217/* 293/*
218 * Mask out unsupported bits in a present pgprot. Non-present pgprots 294 * Mask out unsupported bits in a present pgprot. Non-present pgprots
219 * can use those bits for other purposes, so leave them be. 295 * can use those bits for other purposes, so leave them be.
@@ -254,6 +330,16 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot)
254 return __pte(val); 330 return __pte(val);
255} 331}
256 332
333static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
334{
335 pmdval_t val = pmd_val(pmd);
336
337 val &= _HPAGE_CHG_MASK;
338 val |= massage_pgprot(newprot) & ~_HPAGE_CHG_MASK;
339
340 return __pmd(val);
341}
342
257/* mprotect needs to preserve PAT bits when updating vm_page_prot */ 343/* mprotect needs to preserve PAT bits when updating vm_page_prot */
258#define pgprot_modify pgprot_modify 344#define pgprot_modify pgprot_modify
259static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot) 345static inline pgprot_t pgprot_modify(pgprot_t oldprot, pgprot_t newprot)
@@ -348,7 +434,7 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd)
348 * Currently stuck as a macro due to indirect forward reference to 434 * Currently stuck as a macro due to indirect forward reference to
349 * linux/mmzone.h's __section_mem_map_addr() definition: 435 * linux/mmzone.h's __section_mem_map_addr() definition:
350 */ 436 */
351#define pmd_page(pmd) pfn_to_page(pmd_val(pmd) >> PAGE_SHIFT) 437#define pmd_page(pmd) pfn_to_page((pmd_val(pmd) & PTE_PFN_MASK) >> PAGE_SHIFT)
352 438
353/* 439/*
354 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD] 440 * the pmd page can be thought of an array like this: pmd_t[PTRS_PER_PMD]
@@ -522,12 +608,26 @@ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep)
522 return res; 608 return res;
523} 609}
524 610
611static inline pmd_t native_local_pmdp_get_and_clear(pmd_t *pmdp)
612{
613 pmd_t res = *pmdp;
614
615 native_pmd_clear(pmdp);
616 return res;
617}
618
525static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr, 619static inline void native_set_pte_at(struct mm_struct *mm, unsigned long addr,
526 pte_t *ptep , pte_t pte) 620 pte_t *ptep , pte_t pte)
527{ 621{
528 native_set_pte(ptep, pte); 622 native_set_pte(ptep, pte);
529} 623}
530 624
625static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr,
626 pmd_t *pmdp , pmd_t pmd)
627{
628 native_set_pmd(pmdp, pmd);
629}
630
531#ifndef CONFIG_PARAVIRT 631#ifndef CONFIG_PARAVIRT
532/* 632/*
533 * Rules for using pte_update - it must be called after any PTE update which 633 * Rules for using pte_update - it must be called after any PTE update which
@@ -603,6 +703,51 @@ static inline void ptep_set_wrprotect(struct mm_struct *mm,
603 pte_update(mm, addr, ptep); 703 pte_update(mm, addr, ptep);
604} 704}
605 705
706#define flush_tlb_fix_spurious_fault(vma, address)
707
708#define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot))
709
710#define __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
711extern int pmdp_set_access_flags(struct vm_area_struct *vma,
712 unsigned long address, pmd_t *pmdp,
713 pmd_t entry, int dirty);
714
715#define __HAVE_ARCH_PMDP_TEST_AND_CLEAR_YOUNG
716extern int pmdp_test_and_clear_young(struct vm_area_struct *vma,
717 unsigned long addr, pmd_t *pmdp);
718
719#define __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
720extern int pmdp_clear_flush_young(struct vm_area_struct *vma,
721 unsigned long address, pmd_t *pmdp);
722
723
724#define __HAVE_ARCH_PMDP_SPLITTING_FLUSH
725extern void pmdp_splitting_flush(struct vm_area_struct *vma,
726 unsigned long addr, pmd_t *pmdp);
727
728#define __HAVE_ARCH_PMD_WRITE
729static inline int pmd_write(pmd_t pmd)
730{
731 return pmd_flags(pmd) & _PAGE_RW;
732}
733
734#define __HAVE_ARCH_PMDP_GET_AND_CLEAR
735static inline pmd_t pmdp_get_and_clear(struct mm_struct *mm, unsigned long addr,
736 pmd_t *pmdp)
737{
738 pmd_t pmd = native_pmdp_get_and_clear(pmdp);
739 pmd_update(mm, addr, pmdp);
740 return pmd;
741}
742
743#define __HAVE_ARCH_PMDP_SET_WRPROTECT
744static inline void pmdp_set_wrprotect(struct mm_struct *mm,
745 unsigned long addr, pmd_t *pmdp)
746{
747 clear_bit(_PAGE_BIT_RW, (unsigned long *)pmdp);
748 pmd_update(mm, addr, pmdp);
749}
750
606/* 751/*
607 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); 752 * clone_pgd_range(pgd_t *dst, pgd_t *src, int count);
608 * 753 *
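
The pmd helpers added above are plain set-or-clear operations on individual flag bits of the raw pmd value, which is what lets the transparent hugepage code build and adjust 2 MB entries. A standalone sketch of that pattern, using the standard x86 bit positions (present=0, rw=1, accessed=5, dirty=6, pse=7) and an arbitrary pfn:

    #include <stdio.h>
    #include <stdint.h>

    #define _PAGE_PRESENT  (1ULL << 0)
    #define _PAGE_RW       (1ULL << 1)
    #define _PAGE_ACCESSED (1ULL << 5)
    #define _PAGE_DIRTY    (1ULL << 6)
    #define _PAGE_PSE      (1ULL << 7)

    typedef uint64_t pmdval_t;

    static pmdval_t pmd_set_flags(pmdval_t v, pmdval_t set)     { return v | set; }
    static pmdval_t pmd_clear_flags(pmdval_t v, pmdval_t clear) { return v & ~clear; }

    int main(void)
    {
        /* build a huge (PSE) mapping for pfn 0x200 */
        pmdval_t pmd = (0x200ULL << 12) | _PAGE_PRESENT;

        pmd = pmd_set_flags(pmd, _PAGE_PSE);                    /* pmd_mkhuge()   */
        pmd = pmd_set_flags(pmd, _PAGE_RW);                     /* pmd_mkwrite()  */
        pmd = pmd_set_flags(pmd, _PAGE_ACCESSED | _PAGE_DIRTY); /* young + dirty  */
        pmd = pmd_clear_flags(pmd, _PAGE_RW);                   /* pmd_wrprotect() */

        printf("pmd=%#llx write=%d huge=%d\n", (unsigned long long)pmd,
               !!(pmd & _PAGE_RW), !!(pmd & _PAGE_PSE));
        return 0;
    }
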
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index f686f49e8b7b..0c92113c4cb6 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -26,7 +26,7 @@ struct mm_struct;
26struct vm_area_struct; 26struct vm_area_struct;
27 27
28extern pgd_t swapper_pg_dir[1024]; 28extern pgd_t swapper_pg_dir[1024];
29extern pgd_t trampoline_pg_dir[1024]; 29extern pgd_t initial_page_table[1024];
30 30
31static inline void pgtable_cache_init(void) { } 31static inline void pgtable_cache_init(void) { }
32static inline void check_pgt_cache(void) { } 32static inline void check_pgt_cache(void) { }
@@ -49,24 +49,14 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
49#endif 49#endif
50 50
51#if defined(CONFIG_HIGHPTE) 51#if defined(CONFIG_HIGHPTE)
52#define __KM_PTE \
53 (in_nmi() ? KM_NMI_PTE : \
54 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0)
56#define pte_offset_map(dir, address) \ 52#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \ 53 ((pte_t *)kmap_atomic(pmd_page(*(dir))) + \
58 pte_index((address))) 54 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 55#define pte_unmap(pte) kunmap_atomic((pte))
60 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
64#else 56#else
65#define pte_offset_map(dir, address) \ 57#define pte_offset_map(dir, address) \
66 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address))) 58 ((pte_t *)page_address(pmd_page(*(dir))) + pte_index((address)))
67#define pte_offset_map_nested(dir, address) pte_offset_map((dir), (address))
68#define pte_unmap(pte) do { } while (0) 59#define pte_unmap(pte) do { } while (0)
69#define pte_unmap_nested(pte) do { } while (0)
70#endif 60#endif
71 61
72/* Clear a kernel PTE and flush it from the TLB */ 62/* Clear a kernel PTE and flush it from the TLB */
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index 076052cd62be..975f709e09ae 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -59,6 +59,16 @@ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
59 native_set_pte(ptep, pte); 59 native_set_pte(ptep, pte);
60} 60}
61 61
62static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd)
63{
64 *pmdp = pmd;
65}
66
67static inline void native_pmd_clear(pmd_t *pmd)
68{
69 native_set_pmd(pmd, native_make_pmd(0));
70}
71
62static inline pte_t native_ptep_get_and_clear(pte_t *xp) 72static inline pte_t native_ptep_get_and_clear(pte_t *xp)
63{ 73{
64#ifdef CONFIG_SMP 74#ifdef CONFIG_SMP
@@ -72,14 +82,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *xp)
72#endif 82#endif
73} 83}
74 84
75static inline void native_set_pmd(pmd_t *pmdp, pmd_t pmd) 85static inline pmd_t native_pmdp_get_and_clear(pmd_t *xp)
76{
77 *pmdp = pmd;
78}
79
80static inline void native_pmd_clear(pmd_t *pmd)
81{ 86{
82 native_set_pmd(pmd, native_make_pmd(0)); 87#ifdef CONFIG_SMP
88 return native_make_pmd(xchg(&xp->pmd, 0));
89#else
90 /* native_local_pmdp_get_and_clear,
91 but duplicated because of cyclic dependency */
92 pmd_t ret = *xp;
93 native_pmd_clear(xp);
94 return ret;
95#endif
83} 96}
84 97
85static inline void native_set_pud(pud_t *pudp, pud_t pud) 98static inline void native_set_pud(pud_t *pudp, pud_t pud)
@@ -102,6 +115,8 @@ static inline void native_pgd_clear(pgd_t *pgd)
102 native_set_pgd(pgd, native_make_pgd(0)); 115 native_set_pgd(pgd, native_make_pgd(0));
103} 116}
104 117
118extern void sync_global_pgds(unsigned long start, unsigned long end);
119
105/* 120/*
106 * Conversion functions: convert a page and protection to a page entry, 121 * Conversion functions: convert a page and protection to a page entry,
107 * and a page entry and page directory to the page they refer to. 122 * and a page entry and page directory to the page they refer to.
@@ -125,9 +140,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
125 140
126/* x86-64 always has all page tables mapped. */ 141/* x86-64 always has all page tables mapped. */
127#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address)) 142#define pte_offset_map(dir, address) pte_offset_kernel((dir), (address))
128#define pte_offset_map_nested(dir, address) pte_offset_kernel((dir), (address))
129#define pte_unmap(pte) ((void)(pte))/* NOP */ 143#define pte_unmap(pte) ((void)(pte))/* NOP */
130#define pte_unmap_nested(pte) ((void)(pte)) /* NOP */
131 144
132#define update_mmu_cache(vma, address, ptep) do { } while (0) 145#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 146
@@ -168,6 +181,7 @@ extern void cleanup_highmap(void);
168#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK) 181#define kc_offset_to_vaddr(o) ((o) | ~__VIRTUAL_MASK)
169 182
170#define __HAVE_ARCH_PTE_SAME 183#define __HAVE_ARCH_PTE_SAME
184
171#endif /* !__ASSEMBLY__ */ 185#endif /* !__ASSEMBLY__ */
172 186
173#endif /* _ASM_X86_PGTABLE_64_H */ 187#endif /* _ASM_X86_PGTABLE_64_H */
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index d1f4a760be23..d56187c6b838 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -22,6 +22,7 @@
22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */ 22#define _PAGE_BIT_PAT_LARGE 12 /* On 2MB or 1GB pages */
23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1 23#define _PAGE_BIT_SPECIAL _PAGE_BIT_UNUSED1
24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1 24#define _PAGE_BIT_CPA_TEST _PAGE_BIT_UNUSED1
25#define _PAGE_BIT_SPLITTING _PAGE_BIT_UNUSED1 /* only valid on a PSE pmd */
25#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */ 26#define _PAGE_BIT_NX 63 /* No execute: only valid after cpuid check */
26 27
27/* If _PAGE_BIT_PRESENT is clear, we use these: */ 28/* If _PAGE_BIT_PRESENT is clear, we use these: */
@@ -45,6 +46,7 @@
45#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE) 46#define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
46#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL) 47#define _PAGE_SPECIAL (_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
47#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST) 48#define _PAGE_CPA_TEST (_AT(pteval_t, 1) << _PAGE_BIT_CPA_TEST)
49#define _PAGE_SPLITTING (_AT(pteval_t, 1) << _PAGE_BIT_SPLITTING)
48#define __HAVE_ARCH_PTE_SPECIAL 50#define __HAVE_ARCH_PTE_SPECIAL
49 51
50#ifdef CONFIG_KMEMCHECK 52#ifdef CONFIG_KMEMCHECK
@@ -70,6 +72,7 @@
70/* Set of bits not changed in pte_modify */ 72/* Set of bits not changed in pte_modify */
71#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \ 73#define _PAGE_CHG_MASK (PTE_PFN_MASK | _PAGE_PCD | _PAGE_PWT | \
72 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY) 74 _PAGE_SPECIAL | _PAGE_ACCESSED | _PAGE_DIRTY)
75#define _HPAGE_CHG_MASK (_PAGE_CHG_MASK | _PAGE_PSE)
73 76
74#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT) 77#define _PAGE_CACHE_MASK (_PAGE_PCD | _PAGE_PWT)
75#define _PAGE_CACHE_WB (0) 78#define _PAGE_CACHE_WB (0)
@@ -296,6 +299,7 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
296/* Install a pte for a particular vaddr in kernel space. */ 299/* Install a pte for a particular vaddr in kernel space. */
297void set_pte_vaddr(unsigned long vaddr, pte_t pte); 300void set_pte_vaddr(unsigned long vaddr, pte_t pte);
298 301
302extern void native_pagetable_reserve(u64 start, u64 end);
299#ifdef CONFIG_X86_32 303#ifdef CONFIG_X86_32
300extern void native_pagetable_setup_start(pgd_t *base); 304extern void native_pagetable_setup_start(pgd_t *base);
301extern void native_pagetable_setup_done(pgd_t *base); 305extern void native_pagetable_setup_done(pgd_t *base);
diff --git a/arch/x86/include/asm/probe_roms.h b/arch/x86/include/asm/probe_roms.h
new file mode 100644
index 000000000000..4950a0b1d09c
--- /dev/null
+++ b/arch/x86/include/asm/probe_roms.h
@@ -0,0 +1,8 @@
1#ifndef _PROBE_ROMS_H_
2#define _PROBE_ROMS_H_
3struct pci_dev;
4
5extern void __iomem *pci_map_biosrom(struct pci_dev *pdev);
6extern void pci_unmap_biosrom(void __iomem *rom);
7extern size_t pci_biosrom_size(struct pci_dev *pdev);
8#endif
diff --git a/arch/x86/include/asm/processor-flags.h b/arch/x86/include/asm/processor-flags.h
index 7a3e836eb2a9..59ab4dffa377 100644
--- a/arch/x86/include/asm/processor-flags.h
+++ b/arch/x86/include/asm/processor-flags.h
@@ -7,7 +7,7 @@
7 */ 7 */
8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */ 8#define X86_EFLAGS_CF 0x00000001 /* Carry Flag */
9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */ 9#define X86_EFLAGS_PF 0x00000004 /* Parity Flag */
10#define X86_EFLAGS_AF 0x00000010 /* Auxillary carry Flag */ 10#define X86_EFLAGS_AF 0x00000010 /* Auxiliary carry Flag */
11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */ 11#define X86_EFLAGS_ZF 0x00000040 /* Zero Flag */
12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */ 12#define X86_EFLAGS_SF 0x00000080 /* Sign Flag */
13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */ 13#define X86_EFLAGS_TF 0x00000100 /* Trap Flag */
@@ -60,6 +60,7 @@
60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */ 60#define X86_CR4_OSXMMEXCPT 0x00000400 /* enable unmasked SSE exceptions */
61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */ 61#define X86_CR4_VMXE 0x00002000 /* enable VMX virtualization */
62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */ 62#define X86_CR4_OSXSAVE 0x00040000 /* enable xsave and xrestore */
63#define X86_CR4_SMEP 0x00100000 /* enable SMEP support */
63 64
64/* 65/*
65 * x86-64 Task Priority Register, CR8 66 * x86-64 Task Priority Register, CR8
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ebaa04a8d3af..b844edc69fe9 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -94,10 +94,6 @@ struct cpuinfo_x86 {
94 int x86_cache_alignment; /* In bytes */ 94 int x86_cache_alignment; /* In bytes */
95 int x86_power; 95 int x86_power;
96 unsigned long loops_per_jiffy; 96 unsigned long loops_per_jiffy;
97#ifdef CONFIG_SMP
98 /* cpus sharing the last level cache: */
99 cpumask_var_t llc_shared_map;
100#endif
101 /* cpuid returned max cores value: */ 97 /* cpuid returned max cores value: */
102 u16 x86_max_cores; 98 u16 x86_max_cores;
103 u16 apicid; 99 u16 apicid;
@@ -110,6 +106,8 @@ struct cpuinfo_x86 {
110 u16 phys_proc_id; 106 u16 phys_proc_id;
111 /* Core id: */ 107 /* Core id: */
112 u16 cpu_core_id; 108 u16 cpu_core_id;
109 /* Compute unit id */
110 u8 compute_unit_id;
113 /* Index into per_cpu list: */ 111 /* Index into per_cpu list: */
114 u16 cpu_index; 112 u16 cpu_index;
115#endif 113#endif
@@ -139,10 +137,9 @@ extern __u32 cpu_caps_set[NCAPINTS];
139#ifdef CONFIG_SMP 137#ifdef CONFIG_SMP
140DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 138DECLARE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
141#define cpu_data(cpu) per_cpu(cpu_info, cpu) 139#define cpu_data(cpu) per_cpu(cpu_info, cpu)
142#define current_cpu_data __get_cpu_var(cpu_info)
143#else 140#else
141#define cpu_info boot_cpu_data
144#define cpu_data(cpu) boot_cpu_data 142#define cpu_data(cpu) boot_cpu_data
145#define current_cpu_data boot_cpu_data
146#endif 143#endif
147 144
148extern const struct seq_operations cpuinfo_op; 145extern const struct seq_operations cpuinfo_op;
@@ -606,7 +603,7 @@ extern unsigned long mmu_cr4_features;
606 603
607static inline void set_in_cr4(unsigned long mask) 604static inline void set_in_cr4(unsigned long mask)
608{ 605{
609 unsigned cr4; 606 unsigned long cr4;
610 607
611 mmu_cr4_features |= mask; 608 mmu_cr4_features |= mask;
612 cr4 = read_cr4(); 609 cr4 = read_cr4();
@@ -616,7 +613,7 @@ static inline void set_in_cr4(unsigned long mask)
616 613
617static inline void clear_in_cr4(unsigned long mask) 614static inline void clear_in_cr4(unsigned long mask)
618{ 615{
619 unsigned cr4; 616 unsigned long cr4;
620 617
621 mmu_cr4_features &= ~mask; 618 mmu_cr4_features &= ~mask;
622 cr4 = read_cr4(); 619 cr4 = read_cr4();
@@ -761,35 +758,13 @@ static inline void __sti_mwait(unsigned long eax, unsigned long ecx)
761extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx); 758extern void mwait_idle_with_hints(unsigned long eax, unsigned long ecx);
762 759
763extern void select_idle_routine(const struct cpuinfo_x86 *c); 760extern void select_idle_routine(const struct cpuinfo_x86 *c);
764extern void init_c1e_mask(void); 761extern void init_amd_e400_c1e_mask(void);
765 762
766extern unsigned long boot_option_idle_override; 763extern unsigned long boot_option_idle_override;
767extern unsigned long idle_halt; 764extern bool amd_e400_c1e_detected;
768extern unsigned long idle_nomwait;
769extern bool c1e_detected;
770 765
771/* 766enum idle_boot_override {IDLE_NO_OVERRIDE=0, IDLE_HALT, IDLE_NOMWAIT,
772 * on systems with caches, caches must be flashed as the absolute 767 IDLE_POLL, IDLE_FORCE_MWAIT};
773 * last instruction before going into a suspended halt. Otherwise,
774 * dirty data can linger in the cache and become stale on resume,
775 * leading to strange errors.
776 *
777 * perform a variety of operations to guarantee that the compiler
778 * will not reorder instructions. wbinvd itself is serializing
779 * so the processor will not reorder.
780 *
781 * Systems without cache can just go into halt.
782 */
783static inline void wbinvd_halt(void)
784{
785 mb();
786 /* check for clflush to determine if wbinvd is legal */
787 if (cpu_has_clflush)
788 asm volatile("cli; wbinvd; 1: hlt; jmp 1b" : : : "memory");
789 else
790 while (1)
791 halt();
792}
793 768
794extern void enable_sep_cpu(void); 769extern void enable_sep_cpu(void);
795extern int sysenter_setup(void); 770extern int sysenter_setup(void);
@@ -927,7 +902,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
927/* 902/*
928 * The below -8 is to reserve 8 bytes on top of the ring0 stack. 903 * The below -8 is to reserve 8 bytes on top of the ring0 stack.
929 * This is necessary to guarantee that the entire "struct pt_regs" 904 * This is necessary to guarantee that the entire "struct pt_regs"
930 * is accessable even if the CPU haven't stored the SS/ESP registers 905 * is accessible even if the CPU haven't stored the SS/ESP registers
931 * on the stack (interrupt gate does not save these registers 906 * on the stack (interrupt gate does not save these registers
932 * when switching to the same priv ring). 907 * when switching to the same priv ring).
933 * Therefore beware: accessing the ss/esp fields of the 908 * Therefore beware: accessing the ss/esp fields of the
diff --git a/arch/x86/include/asm/prom.h b/arch/x86/include/asm/prom.h
new file mode 100644
index 000000000000..971e0b46446e
--- /dev/null
+++ b/arch/x86/include/asm/prom.h
@@ -0,0 +1,69 @@
1/*
2 * Definitions for Device tree / OpenFirmware handling on X86
3 *
4 * based on arch/powerpc/include/asm/prom.h which is
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; either version
10 * 2 of the License, or (at your option) any later version.
11 */
12
13#ifndef _ASM_X86_PROM_H
14#define _ASM_X86_PROM_H
15#ifndef __ASSEMBLY__
16
17#include <linux/of.h>
18#include <linux/types.h>
19#include <linux/pci.h>
20
21#include <asm/irq.h>
22#include <asm/atomic.h>
23#include <asm/setup.h>
24#include <asm/irq_controller.h>
25
26#ifdef CONFIG_OF
27extern int of_ioapic;
28extern u64 initial_dtb;
29extern void add_dtb(u64 data);
30extern void x86_add_irq_domains(void);
31void __cpuinit x86_of_pci_init(void);
32void x86_dtb_init(void);
33
34static inline struct device_node *pci_device_to_OF_node(struct pci_dev *pdev)
35{
36 return pdev ? pdev->dev.of_node : NULL;
37}
38
39static inline struct device_node *pci_bus_to_OF_node(struct pci_bus *bus)
40{
41 return pci_device_to_OF_node(bus->self);
42}
43
44#else
45static inline void add_dtb(u64 data) { }
46static inline void x86_add_irq_domains(void) { }
47static inline void x86_of_pci_init(void) { }
48static inline void x86_dtb_init(void) { }
49#define of_ioapic 0
50#endif
51
52extern char cmd_line[COMMAND_LINE_SIZE];
53
54#define pci_address_to_pio pci_address_to_pio
55unsigned long pci_address_to_pio(phys_addr_t addr);
56
57/**
58 * irq_dispose_mapping - Unmap an interrupt
59 * @virq: linux virq number of the interrupt to unmap
60 *
61 * FIXME: We really should implement proper virq handling like power,
62 * but that's going to be major surgery.
63 */
64static inline void irq_dispose_mapping(unsigned int virq) { }
65
66#define HAVE_ARCH_DEVTREE_FIXUPS
67
68#endif /* __ASSEMBLY__ */
69#endif
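Editor's note: a short sketch of how a caller might use the helpers added by this new header. The pci_dev pointer and the property name are hypothetical; the sketch assumes the usual of_get_property() accessor from <linux/of.h>, which prom.h already includes:

/* sketch: look up the device-tree node behind a PCI device */
static void example_report_of_node(struct pci_dev *pdev)
{
        struct device_node *np = pci_device_to_OF_node(pdev);
        const char *compat;

        if (!np)
                return;         /* no device-tree node attached */

        /* "compatible" is a standard DT property */
        compat = of_get_property(np, "compatible", NULL);
        if (compat)
                pr_info("%s: compatible = %s\n", dev_name(&pdev->dev), compat);
}

When CONFIG_OF is off, the stubbed helpers in the #else branch keep such callers compiling without further #ifdefs.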
diff --git a/arch/x86/include/asm/ptrace-abi.h b/arch/x86/include/asm/ptrace-abi.h
index 52b098a6eebb..7b0a55a88851 100644
--- a/arch/x86/include/asm/ptrace-abi.h
+++ b/arch/x86/include/asm/ptrace-abi.h
@@ -31,7 +31,7 @@
31#define R12 24 31#define R12 24
32#define RBP 32 32#define RBP 32
33#define RBX 40 33#define RBX 40
34/* arguments: interrupts/non tracing syscalls only save upto here*/ 34/* arguments: interrupts/non tracing syscalls only save up to here*/
35#define R11 48 35#define R11 48
36#define R10 56 36#define R10 56
37#define R9 64 37#define R9 64
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 78cd1ea94500..94e7618fcac8 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -73,7 +73,7 @@ struct pt_regs {
73 unsigned long r12; 73 unsigned long r12;
74 unsigned long rbp; 74 unsigned long rbp;
75 unsigned long rbx; 75 unsigned long rbx;
76/* arguments: non interrupts/non tracing syscalls only save upto here*/ 76/* arguments: non interrupts/non tracing syscalls only save up to here*/
77 unsigned long r11; 77 unsigned long r11;
78 unsigned long r10; 78 unsigned long r10;
79 unsigned long r9; 79 unsigned long r9;
@@ -103,7 +103,7 @@ struct pt_regs {
103 unsigned long r12; 103 unsigned long r12;
104 unsigned long bp; 104 unsigned long bp;
105 unsigned long bx; 105 unsigned long bx;
106/* arguments: non interrupts/non tracing syscalls only save upto here*/ 106/* arguments: non interrupts/non tracing syscalls only save up to here*/
107 unsigned long r11; 107 unsigned long r11;
108 unsigned long r10; 108 unsigned long r10;
109 unsigned long r9; 109 unsigned long r9;
@@ -136,6 +136,7 @@ struct cpuinfo_x86;
136struct task_struct; 136struct task_struct;
137 137
138extern unsigned long profile_pc(struct pt_regs *regs); 138extern unsigned long profile_pc(struct pt_regs *regs);
139#define profile_pc profile_pc
139 140
140extern unsigned long 141extern unsigned long
141convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs); 142convert_ip_to_linear(struct task_struct *child, struct pt_regs *regs);
@@ -202,20 +203,11 @@ static inline unsigned long kernel_stack_pointer(struct pt_regs *regs)
202#endif 203#endif
203} 204}
204 205
205static inline unsigned long instruction_pointer(struct pt_regs *regs) 206#define GET_IP(regs) ((regs)->ip)
206{ 207#define GET_FP(regs) ((regs)->bp)
207 return regs->ip; 208#define GET_USP(regs) ((regs)->sp)
208}
209
210static inline unsigned long frame_pointer(struct pt_regs *regs)
211{
212 return regs->bp;
213}
214 209
215static inline unsigned long user_stack_pointer(struct pt_regs *regs) 210#include <asm-generic/ptrace.h>
216{
217 return regs->sp;
218}
219 211
220/* Query offset/name of register from its name/offset */ 212/* Query offset/name of register from its name/offset */
221extern int regs_query_register_offset(const char *name); 213extern int regs_query_register_offset(const char *name);
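Editor's note: the instruction_pointer()/frame_pointer()/user_stack_pointer() helpers are not lost here; asm-generic/ptrace.h rebuilds them from the GET_IP/GET_FP/GET_USP macros. In outline (a sketch of the generic header's pattern, not a verbatim copy):

/* asm-generic/ptrace.h, roughly: each accessor wraps the arch-provided macro */
static inline unsigned long instruction_pointer(struct pt_regs *regs)
{
        return GET_IP(regs);            /* expands to regs->ip on x86 */
}

static inline unsigned long frame_pointer(struct pt_regs *regs)
{
        return GET_FP(regs);            /* regs->bp */
}

static inline unsigned long user_stack_pointer(struct pt_regs *regs)
{
        return GET_USP(regs);           /* regs->sp */
}

Callers keep using the same accessor names; only the definitions move to generic code.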
diff --git a/arch/x86/include/asm/pvclock.h b/arch/x86/include/asm/pvclock.h
index cd02f324aa6b..a518c0a45044 100644
--- a/arch/x86/include/asm/pvclock.h
+++ b/arch/x86/include/asm/pvclock.h
@@ -11,5 +11,49 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src);
11void pvclock_read_wallclock(struct pvclock_wall_clock *wall, 11void pvclock_read_wallclock(struct pvclock_wall_clock *wall,
12 struct pvclock_vcpu_time_info *vcpu, 12 struct pvclock_vcpu_time_info *vcpu,
13 struct timespec *ts); 13 struct timespec *ts);
14void pvclock_resume(void);
15
16/*
17 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
18 * yielding a 64-bit result.
19 */
20static inline u64 pvclock_scale_delta(u64 delta, u32 mul_frac, int shift)
21{
22 u64 product;
23#ifdef __i386__
24 u32 tmp1, tmp2;
25#else
26 ulong tmp;
27#endif
28
29 if (shift < 0)
30 delta >>= -shift;
31 else
32 delta <<= shift;
33
34#ifdef __i386__
35 __asm__ (
36 "mul %5 ; "
37 "mov %4,%%eax ; "
38 "mov %%edx,%4 ; "
39 "mul %5 ; "
40 "xor %5,%5 ; "
41 "add %4,%%eax ; "
42 "adc %5,%%edx ; "
43 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
44 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
45#elif defined(__x86_64__)
46 __asm__ (
47 "mul %[mul_frac] ; shrd $32, %[hi], %[lo]"
48 : [lo]"=a"(product),
49 [hi]"=d"(tmp)
50 : "0"(delta),
51 [mul_frac]"rm"((u64)mul_frac));
52#else
53#error implement me!
54#endif
55
56 return product;
57}
14 58
15#endif /* _ASM_X86_PVCLOCK_H */ 59#endif /* _ASM_X86_PVCLOCK_H */
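Editor's note: the inline-assembly paths above compute ((delta << shift) * mul_frac) >> 32 without losing the high bits of the up-to-96-bit intermediate product. A stand-alone check of the same arithmetic using a compiler-provided 128-bit type (GCC/Clang only; the sample inputs are arbitrary):

#include <stdint.h>
#include <stdio.h>

/* reference version of pvclock_scale_delta(): apply the shift, then take
 * bits [95:32] of the 64x32-bit product */
static uint64_t scale_delta_ref(uint64_t delta, uint32_t mul_frac, int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}

int main(void)
{
        /* mul_frac is a 0.32 fixed-point fraction: 0xC0000000 = 0.75;
         * together with shift = 1 the effective scale is 1.5 */
        uint64_t delta = 123456789ULL;
        uint32_t mul_frac = 0xC0000000u;

        printf("%llu\n", (unsigned long long)scale_delta_ref(delta, mul_frac, 1));
        return 0;
}

The x86_64 asm gets the same result from a single 64x64 mul followed by shrd; the i386 path decomposes it into two 32x32 multiplies.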
diff --git a/arch/x86/include/asm/reboot.h b/arch/x86/include/asm/reboot.h
index 562d4fd31ba8..3250e3d605d9 100644
--- a/arch/x86/include/asm/reboot.h
+++ b/arch/x86/include/asm/reboot.h
@@ -18,7 +18,10 @@ extern struct machine_ops machine_ops;
18 18
19void native_machine_crash_shutdown(struct pt_regs *regs); 19void native_machine_crash_shutdown(struct pt_regs *regs);
20void native_machine_shutdown(void); 20void native_machine_shutdown(void);
21void machine_real_restart(const unsigned char *code, int length); 21void machine_real_restart(unsigned int type);
22/* These must match dispatch_table in reboot_32.S */
23#define MRR_BIOS 0
24#define MRR_APM 1
22 25
23typedef void (*nmi_shootdown_cb)(int, struct die_args*); 26typedef void (*nmi_shootdown_cb)(int, struct die_args*);
24void nmi_shootdown_cpus(nmi_shootdown_cb callback); 27void nmi_shootdown_cpus(nmi_shootdown_cb callback);
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index d1e41b0f9b60..df4cd32b4cc6 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -37,26 +37,9 @@
37#endif 37#endif
38 38
39#ifdef __KERNEL__ 39#ifdef __KERNEL__
40
41#include <linux/list.h>
42#include <linux/spinlock.h>
43#include <linux/lockdep.h>
44#include <asm/asm.h> 40#include <asm/asm.h>
45 41
46struct rwsem_waiter;
47
48extern asmregparm struct rw_semaphore *
49 rwsem_down_read_failed(struct rw_semaphore *sem);
50extern asmregparm struct rw_semaphore *
51 rwsem_down_write_failed(struct rw_semaphore *sem);
52extern asmregparm struct rw_semaphore *
53 rwsem_wake(struct rw_semaphore *);
54extern asmregparm struct rw_semaphore *
55 rwsem_downgrade_wake(struct rw_semaphore *sem);
56
57/* 42/*
58 * the semaphore definition
59 *
60 * The bias values and the counter type limits the number of 43 * The bias values and the counter type limits the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647 44 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits. 45 * for 64 bits.
@@ -74,43 +57,6 @@ extern asmregparm struct rw_semaphore *
74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 57#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 58#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
76 59
77typedef signed long rwsem_count_t;
78
79struct rw_semaphore {
80 rwsem_count_t count;
81 spinlock_t wait_lock;
82 struct list_head wait_list;
83#ifdef CONFIG_DEBUG_LOCK_ALLOC
84 struct lockdep_map dep_map;
85#endif
86};
87
88#ifdef CONFIG_DEBUG_LOCK_ALLOC
89# define __RWSEM_DEP_MAP_INIT(lockname) , .dep_map = { .name = #lockname }
90#else
91# define __RWSEM_DEP_MAP_INIT(lockname)
92#endif
93
94
95#define __RWSEM_INITIALIZER(name) \
96{ \
97 RWSEM_UNLOCKED_VALUE, __SPIN_LOCK_UNLOCKED((name).wait_lock), \
98 LIST_HEAD_INIT((name).wait_list) __RWSEM_DEP_MAP_INIT(name) \
99}
100
101#define DECLARE_RWSEM(name) \
102 struct rw_semaphore name = __RWSEM_INITIALIZER(name)
103
104extern void __init_rwsem(struct rw_semaphore *sem, const char *name,
105 struct lock_class_key *key);
106
107#define init_rwsem(sem) \
108do { \
109 static struct lock_class_key __key; \
110 \
111 __init_rwsem((sem), #sem, &__key); \
112} while (0)
113
114/* 60/*
115 * lock for reading 61 * lock for reading
116 */ 62 */
@@ -133,7 +79,7 @@ static inline void __down_read(struct rw_semaphore *sem)
133 */ 79 */
134static inline int __down_read_trylock(struct rw_semaphore *sem) 80static inline int __down_read_trylock(struct rw_semaphore *sem)
135{ 81{
136 rwsem_count_t result, tmp; 82 long result, tmp;
137 asm volatile("# beginning __down_read_trylock\n\t" 83 asm volatile("# beginning __down_read_trylock\n\t"
138 " mov %0,%1\n\t" 84 " mov %0,%1\n\t"
139 "1:\n\t" 85 "1:\n\t"
@@ -155,7 +101,7 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
155 */ 101 */
156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 102static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
157{ 103{
158 rwsem_count_t tmp; 104 long tmp;
159 asm volatile("# beginning down_write\n\t" 105 asm volatile("# beginning down_write\n\t"
160 LOCK_PREFIX " xadd %1,(%2)\n\t" 106 LOCK_PREFIX " xadd %1,(%2)\n\t"
161 /* adds 0xffff0001, returns the old value */ 107 /* adds 0xffff0001, returns the old value */
@@ -180,9 +126,8 @@ static inline void __down_write(struct rw_semaphore *sem)
180 */ 126 */
181static inline int __down_write_trylock(struct rw_semaphore *sem) 127static inline int __down_write_trylock(struct rw_semaphore *sem)
182{ 128{
183 rwsem_count_t ret = cmpxchg(&sem->count, 129 long ret = cmpxchg(&sem->count, RWSEM_UNLOCKED_VALUE,
184 RWSEM_UNLOCKED_VALUE, 130 RWSEM_ACTIVE_WRITE_BIAS);
185 RWSEM_ACTIVE_WRITE_BIAS);
186 if (ret == RWSEM_UNLOCKED_VALUE) 131 if (ret == RWSEM_UNLOCKED_VALUE)
187 return 1; 132 return 1;
188 return 0; 133 return 0;
@@ -193,7 +138,7 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
193 */ 138 */
194static inline void __up_read(struct rw_semaphore *sem) 139static inline void __up_read(struct rw_semaphore *sem)
195{ 140{
196 rwsem_count_t tmp; 141 long tmp;
197 asm volatile("# beginning __up_read\n\t" 142 asm volatile("# beginning __up_read\n\t"
198 LOCK_PREFIX " xadd %1,(%2)\n\t" 143 LOCK_PREFIX " xadd %1,(%2)\n\t"
199 /* subtracts 1, returns the old value */ 144 /* subtracts 1, returns the old value */
@@ -211,7 +156,7 @@ static inline void __up_read(struct rw_semaphore *sem)
211 */ 156 */
212static inline void __up_write(struct rw_semaphore *sem) 157static inline void __up_write(struct rw_semaphore *sem)
213{ 158{
214 rwsem_count_t tmp; 159 long tmp;
215 asm volatile("# beginning __up_write\n\t" 160 asm volatile("# beginning __up_write\n\t"
216 LOCK_PREFIX " xadd %1,(%2)\n\t" 161 LOCK_PREFIX " xadd %1,(%2)\n\t"
217 /* subtracts 0xffff0001, returns the old value */ 162 /* subtracts 0xffff0001, returns the old value */
@@ -247,8 +192,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem)
247/* 192/*
248 * implement atomic add functionality 193 * implement atomic add functionality
249 */ 194 */
250static inline void rwsem_atomic_add(rwsem_count_t delta, 195static inline void rwsem_atomic_add(long delta, struct rw_semaphore *sem)
251 struct rw_semaphore *sem)
252{ 196{
253 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0" 197 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
254 : "+m" (sem->count) 198 : "+m" (sem->count)
@@ -258,10 +202,9 @@ static inline void rwsem_atomic_add(rwsem_count_t delta,
258/* 202/*
259 * implement exchange and add functionality 203 * implement exchange and add functionality
260 */ 204 */
261static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta, 205static inline long rwsem_atomic_update(long delta, struct rw_semaphore *sem)
262 struct rw_semaphore *sem)
263{ 206{
264 rwsem_count_t tmp = delta; 207 long tmp = delta;
265 208
266 asm volatile(LOCK_PREFIX "xadd %0,%1" 209 asm volatile(LOCK_PREFIX "xadd %0,%1"
267 : "+r" (tmp), "+m" (sem->count) 210 : "+r" (tmp), "+m" (sem->count)
@@ -270,10 +213,5 @@ static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
270 return tmp + delta; 213 return tmp + delta;
271} 214}
272 215
273static inline int rwsem_is_locked(struct rw_semaphore *sem)
274{
275 return (sem->count != 0);
276}
277
278#endif /* __KERNEL__ */ 216#endif /* __KERNEL__ */
279#endif /* _ASM_X86_RWSEM_H */ 217#endif /* _ASM_X86_RWSEM_H */
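Editor's note: the bias scheme behind these xadd/cmpxchg sequences can be checked in isolation. The sketch below uses the 32-bit values implied by the comments above (+1 per active reader or writer in the low 16 bits, a waiting/write bias of -0x10000, so taking a write lock adds 0xffff0001 as a u32); the exact constants live in the part of the header not shown in this hunk:

#include <stdio.h>

#define RWSEM_UNLOCKED_VALUE    0x00000000L
#define RWSEM_ACTIVE_BIAS       0x00000001L
#define RWSEM_WAITING_BIAS      (-0x00010000L)
#define RWSEM_ACTIVE_READ_BIAS  RWSEM_ACTIVE_BIAS
#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)

int main(void)
{
        long count = RWSEM_UNLOCKED_VALUE;

        count += RWSEM_ACTIVE_READ_BIAS;        /* first reader:  count = 1 */
        count += RWSEM_ACTIVE_READ_BIAS;        /* second reader: count = 2 */
        printf("two readers: count = %ld\n", count);

        count -= RWSEM_ACTIVE_READ_BIAS;        /* __up_read subtracts 1 */
        count -= RWSEM_ACTIVE_READ_BIAS;

        count += RWSEM_ACTIVE_WRITE_BIAS;       /* writer: the "adds 0xffff0001" case */
        printf("one writer:  count = %ld\n", count);

        /* a nonzero count is what the removed rwsem_is_locked() tested;
         * the generic rwsem code now provides that helper instead */
        return 0;
}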
diff --git a/arch/x86/include/asm/segment.h b/arch/x86/include/asm/segment.h
index 14e0ed86a6f9..cd84f7208f76 100644
--- a/arch/x86/include/asm/segment.h
+++ b/arch/x86/include/asm/segment.h
@@ -1,14 +1,16 @@
1#ifndef _ASM_X86_SEGMENT_H 1#ifndef _ASM_X86_SEGMENT_H
2#define _ASM_X86_SEGMENT_H 2#define _ASM_X86_SEGMENT_H
3 3
4#include <linux/const.h>
5
4/* Constructor for a conventional segment GDT (or LDT) entry */ 6/* Constructor for a conventional segment GDT (or LDT) entry */
5/* This is a macro so it can be used in initializers */ 7/* This is a macro so it can be used in initializers */
6#define GDT_ENTRY(flags, base, limit) \ 8#define GDT_ENTRY(flags, base, limit) \
7 ((((base) & 0xff000000ULL) << (56-24)) | \ 9 ((((base) & _AC(0xff000000,ULL)) << (56-24)) | \
8 (((flags) & 0x0000f0ffULL) << 40) | \ 10 (((flags) & _AC(0x0000f0ff,ULL)) << 40) | \
9 (((limit) & 0x000f0000ULL) << (48-16)) | \ 11 (((limit) & _AC(0x000f0000,ULL)) << (48-16)) | \
10 (((base) & 0x00ffffffULL) << 16) | \ 12 (((base) & _AC(0x00ffffff,ULL)) << 16) | \
11 (((limit) & 0x0000ffffULL))) 13 (((limit) & _AC(0x0000ffff,ULL))))
12 14
13/* Simple and small GDT entries for booting only */ 15/* Simple and small GDT entries for booting only */
14 16
@@ -73,31 +75,31 @@
73 75
74#define GDT_ENTRY_DEFAULT_USER_DS 15 76#define GDT_ENTRY_DEFAULT_USER_DS 15
75 77
76#define GDT_ENTRY_KERNEL_BASE 12 78#define GDT_ENTRY_KERNEL_BASE (12)
77 79
78#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE + 0) 80#define GDT_ENTRY_KERNEL_CS (GDT_ENTRY_KERNEL_BASE+0)
79 81
80#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE + 1) 82#define GDT_ENTRY_KERNEL_DS (GDT_ENTRY_KERNEL_BASE+1)
81 83
82#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE + 4) 84#define GDT_ENTRY_TSS (GDT_ENTRY_KERNEL_BASE+4)
83#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE + 5) 85#define GDT_ENTRY_LDT (GDT_ENTRY_KERNEL_BASE+5)
84 86
85#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 6) 87#define GDT_ENTRY_PNPBIOS_BASE (GDT_ENTRY_KERNEL_BASE+6)
86#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE + 11) 88#define GDT_ENTRY_APMBIOS_BASE (GDT_ENTRY_KERNEL_BASE+11)
87 89
88#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE + 14) 90#define GDT_ENTRY_ESPFIX_SS (GDT_ENTRY_KERNEL_BASE+14)
89#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS * 8) 91#define __ESPFIX_SS (GDT_ENTRY_ESPFIX_SS*8)
90 92
91#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE + 15) 93#define GDT_ENTRY_PERCPU (GDT_ENTRY_KERNEL_BASE+15)
92#ifdef CONFIG_SMP 94#ifdef CONFIG_SMP
93#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8) 95#define __KERNEL_PERCPU (GDT_ENTRY_PERCPU * 8)
94#else 96#else
95#define __KERNEL_PERCPU 0 97#define __KERNEL_PERCPU 0
96#endif 98#endif
97 99
98#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE + 16) 100#define GDT_ENTRY_STACK_CANARY (GDT_ENTRY_KERNEL_BASE+16)
99#ifdef CONFIG_CC_STACKPROTECTOR 101#ifdef CONFIG_CC_STACKPROTECTOR
100#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY * 8) 102#define __KERNEL_STACK_CANARY (GDT_ENTRY_STACK_CANARY*8)
101#else 103#else
102#define __KERNEL_STACK_CANARY 0 104#define __KERNEL_STACK_CANARY 0
103#endif 105#endif
@@ -182,10 +184,10 @@
182 184
183#endif 185#endif
184 186
185#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS * 8) 187#define __KERNEL_CS (GDT_ENTRY_KERNEL_CS*8)
186#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS * 8) 188#define __KERNEL_DS (GDT_ENTRY_KERNEL_DS*8)
187#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS* 8 + 3) 189#define __USER_DS (GDT_ENTRY_DEFAULT_USER_DS*8+3)
188#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS* 8 + 3) 190#define __USER_CS (GDT_ENTRY_DEFAULT_USER_CS*8+3)
189#ifndef CONFIG_PARAVIRT 191#ifndef CONFIG_PARAVIRT
190#define get_kernel_rpl() 0 192#define get_kernel_rpl() 0
191#endif 193#endif
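Editor's note: the _AC() wrapper only matters when the header is pulled into assembly; the arithmetic of GDT_ENTRY() is unchanged and easy to verify. A stand-alone check, using 0xc09a as an illustrative flags value for a flat 4 GiB ring-0 code segment:

#include <stdio.h>

/* same construction as the macro above, written for user space */
#define GDT_ENTRY(flags, base, limit)                           \
        ((((base)  & 0xff000000ULL) << (56-24)) |               \
         (((flags) & 0x0000f0ffULL) << 40) |                    \
         (((limit) & 0x000f0000ULL) << (48-16)) |               \
         (((base)  & 0x00ffffffULL) << 16) |                    \
         (((limit) & 0x0000ffffULL)))

int main(void)
{
        /* flags 0xc09a: present, DPL 0, code read/exec, 4 KiB granularity, 32-bit */
        unsigned long long desc = GDT_ENTRY(0xc09a, 0, 0xfffff);

        printf("code descriptor: %#018llx\n", desc);    /* 0x00cf9a000000ffff */
        return 0;
}

The selector values further down (__KERNEL_CS, __USER_CS, and so on) are simply the descriptor index times 8, with the low two bits of the user selectors carrying the requested privilege level, hence the "*8+3".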
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index ef292c792d74..9756551ec760 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -53,6 +53,12 @@ extern void x86_mrst_early_setup(void);
53static inline void x86_mrst_early_setup(void) { } 53static inline void x86_mrst_early_setup(void) { }
54#endif 54#endif
55 55
56#ifdef CONFIG_X86_INTEL_CE
57extern void x86_ce4100_early_setup(void);
58#else
59static inline void x86_ce4100_early_setup(void) { }
60#endif
61
56#ifndef _SETUP 62#ifndef _SETUP
57 63
58/* 64/*
@@ -82,7 +88,7 @@ void *extend_brk(size_t size, size_t align);
82 * executable.) 88 * executable.)
83 */ 89 */
84#define RESERVE_BRK(name,sz) \ 90#define RESERVE_BRK(name,sz) \
85 static void __section(.discard.text) __used \ 91 static void __section(.discard.text) __used notrace \
86 __brk_reservation_fn_##name##__(void) { \ 92 __brk_reservation_fn_##name##__(void) { \
87 asm volatile ( \ 93 asm volatile ( \
88 ".pushsection .brk_reservation,\"aw\",@nobits;" \ 94 ".pushsection .brk_reservation,\"aw\",@nobits;" \
@@ -93,10 +99,15 @@ void *extend_brk(size_t size, size_t align);
93 : : "i" (sz)); \ 99 : : "i" (sz)); \
94 } 100 }
95 101
102/* Helper for reserving space for arrays of things */
103#define RESERVE_BRK_ARRAY(type, name, entries) \
104 type *name; \
105 RESERVE_BRK(name, sizeof(type) * entries)
106
107extern void probe_roms(void);
96#ifdef __i386__ 108#ifdef __i386__
97 109
98void __init i386_start_kernel(void); 110void __init i386_start_kernel(void);
99extern void probe_roms(void);
100 111
101#else 112#else
102void __init x86_64_start_kernel(char *real_mode); 113void __init x86_64_start_kernel(char *real_mode);
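Editor's note: a hedged sketch of how the new RESERVE_BRK_ARRAY() helper is meant to be used. It declares the pointer and reserves brk space at link time; the pointer is then aimed at memory handed out by extend_brk() (declared earlier in this header) during early boot. The structure name and array size below are made up for illustration:

/* reserve room for 64 entries in the early brk region (hypothetical table) */
struct example_entry {
        unsigned long addr;
        unsigned long size;
};
RESERVE_BRK_ARRAY(struct example_entry, example_table, 64);

static void __init example_early_init(void)
{
        /* carve the reserved space out of the brk area */
        example_table = extend_brk(sizeof(*example_table) * 64,
                                   __alignof__(*example_table));
        example_table[0].addr = 0;
}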
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 4cfc90824068..73b11bc0ae6f 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -17,12 +17,24 @@
17#endif 17#endif
18#include <asm/thread_info.h> 18#include <asm/thread_info.h>
19#include <asm/cpumask.h> 19#include <asm/cpumask.h>
20#include <asm/cpufeature.h>
20 21
21extern int smp_num_siblings; 22extern int smp_num_siblings;
22extern unsigned int num_processors; 23extern unsigned int num_processors;
23 24
25static inline bool cpu_has_ht_siblings(void)
26{
27 bool has_siblings = false;
28#ifdef CONFIG_SMP
29 has_siblings = cpu_has_ht && smp_num_siblings > 1;
30#endif
31 return has_siblings;
32}
33
24DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map); 34DECLARE_PER_CPU(cpumask_var_t, cpu_sibling_map);
25DECLARE_PER_CPU(cpumask_var_t, cpu_core_map); 35DECLARE_PER_CPU(cpumask_var_t, cpu_core_map);
36/* cpus sharing the last level cache: */
37DECLARE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
26DECLARE_PER_CPU(u16, cpu_llc_id); 38DECLARE_PER_CPU(u16, cpu_llc_id);
27DECLARE_PER_CPU(int, cpu_number); 39DECLARE_PER_CPU(int, cpu_number);
28 40
@@ -36,21 +48,26 @@ static inline struct cpumask *cpu_core_mask(int cpu)
36 return per_cpu(cpu_core_map, cpu); 48 return per_cpu(cpu_core_map, cpu);
37} 49}
38 50
51static inline struct cpumask *cpu_llc_shared_mask(int cpu)
52{
53 return per_cpu(cpu_llc_shared_map, cpu);
54}
55
39DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid); 56DECLARE_EARLY_PER_CPU(u16, x86_cpu_to_apicid);
40DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid); 57DECLARE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid);
58#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_32)
59DECLARE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid);
60#endif
41 61
42/* Static state in head.S used to set up a CPU */ 62/* Static state in head.S used to set up a CPU */
43extern struct { 63extern unsigned long stack_start; /* Initial stack pointer address */
44 void *sp;
45 unsigned short ss;
46} stack_start;
47 64
48struct smp_ops { 65struct smp_ops {
49 void (*smp_prepare_boot_cpu)(void); 66 void (*smp_prepare_boot_cpu)(void);
50 void (*smp_prepare_cpus)(unsigned max_cpus); 67 void (*smp_prepare_cpus)(unsigned max_cpus);
51 void (*smp_cpus_done)(unsigned max_cpus); 68 void (*smp_cpus_done)(unsigned max_cpus);
52 69
53 void (*smp_send_stop)(void); 70 void (*stop_other_cpus)(int wait);
54 void (*smp_send_reschedule)(int cpu); 71 void (*smp_send_reschedule)(int cpu);
55 72
56 int (*cpu_up)(unsigned cpu); 73 int (*cpu_up)(unsigned cpu);
@@ -73,7 +90,12 @@ extern struct smp_ops smp_ops;
73 90
74static inline void smp_send_stop(void) 91static inline void smp_send_stop(void)
75{ 92{
76 smp_ops.smp_send_stop(); 93 smp_ops.stop_other_cpus(0);
94}
95
96static inline void stop_other_cpus(void)
97{
98 smp_ops.stop_other_cpus(1);
77} 99}
78 100
79static inline void smp_prepare_boot_cpu(void) 101static inline void smp_prepare_boot_cpu(void)
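Editor's note: a short usage sketch for the two helpers added to this header, cpu_has_ht_siblings() and cpu_llc_shared_mask(); the reporting function is illustrative:

/* sketch: report sibling information for one CPU */
static void example_report_topology(int cpu)
{
        if (cpu_has_ht_siblings())
                pr_info("SMT active: more than one sibling per core\n");

        pr_info("cpu%d shares its last-level cache with %u cpus\n",
                cpu, cpumask_weight(cpu_llc_shared_mask(cpu)));
}

The smp_send_stop()/stop_other_cpus() pair below differ only in the wait argument passed to the new stop_other_cpus op: the former returns without waiting for the other CPUs to stop, the latter waits.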
diff --git a/arch/x86/include/asm/smpboot_hooks.h b/arch/x86/include/asm/smpboot_hooks.h
index 1def60114906..725b77831993 100644
--- a/arch/x86/include/asm/smpboot_hooks.h
+++ b/arch/x86/include/asm/smpboot_hooks.h
@@ -34,7 +34,7 @@ static inline void smpboot_restore_warm_reset_vector(void)
34 */ 34 */
35 CMOS_WRITE(0, 0xf); 35 CMOS_WRITE(0, 0xf);
36 36
37 *((volatile long *)phys_to_virt(apic->trampoline_phys_low)) = 0; 37 *((volatile u32 *)phys_to_virt(apic->trampoline_phys_low)) = 0;
38} 38}
39 39
40static inline void __init smpboot_setup_io_apic(void) 40static inline void __init smpboot_setup_io_apic(void)
@@ -48,7 +48,6 @@ static inline void __init smpboot_setup_io_apic(void)
48 setup_IO_APIC(); 48 setup_IO_APIC();
49 else { 49 else {
50 nr_ioapics = 0; 50 nr_ioapics = 0;
51 localise_nmi_watchdog();
52 } 51 }
53#endif 52#endif
54} 53}
diff --git a/arch/x86/include/asm/srat.h b/arch/x86/include/asm/srat.h
deleted file mode 100644
index b508d639d1a7..000000000000
--- a/arch/x86/include/asm/srat.h
+++ /dev/null
@@ -1,39 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26
27#ifndef _ASM_X86_SRAT_H
28#define _ASM_X86_SRAT_H
29
30#ifdef CONFIG_ACPI_NUMA
31extern int get_memcfg_from_srat(void);
32#else
33static inline int get_memcfg_from_srat(void)
34{
35 return 0;
36}
37#endif
38
39#endif /* _ASM_X86_SRAT_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index 2b16a2ad23dc..70bbe39043a9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -7,6 +7,7 @@
7#define _ASM_X86_STACKTRACE_H 7#define _ASM_X86_STACKTRACE_H
8 8
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <linux/ptrace.h>
10 11
11extern int kstack_depth_to_print; 12extern int kstack_depth_to_print;
12 13
@@ -36,9 +37,6 @@ print_context_stack_bp(struct thread_info *tinfo,
36/* Generic stack tracer with callbacks */ 37/* Generic stack tracer with callbacks */
37 38
38struct stacktrace_ops { 39struct stacktrace_ops {
39 void (*warning)(void *data, char *msg);
40 /* msg must contain %s for the symbol */
41 void (*warning_symbol)(void *data, char *msg, unsigned long symbol);
42 void (*address)(void *data, unsigned long address, int reliable); 40 void (*address)(void *data, unsigned long address, int reliable);
43 /* On negative return stop dumping */ 41 /* On negative return stop dumping */
44 int (*stack)(void *data, char *name); 42 int (*stack)(void *data, char *name);
@@ -57,13 +55,39 @@ void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
57#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 55#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
58#endif 56#endif
59 57
58#ifdef CONFIG_FRAME_POINTER
59static inline unsigned long
60stack_frame(struct task_struct *task, struct pt_regs *regs)
61{
62 unsigned long bp;
63
64 if (regs)
65 return regs->bp;
66
67 if (task == current) {
68 /* Grab bp right from our regs */
69 get_bp(bp);
70 return bp;
71 }
72
73 /* bp is the last reg pushed by switch_to */
74 return *(unsigned long *)task->thread.sp;
75}
76#else
77static inline unsigned long
78stack_frame(struct task_struct *task, struct pt_regs *regs)
79{
80 return 0;
81}
82#endif
83
60extern void 84extern void
61show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 85show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
62 unsigned long *stack, unsigned long bp, char *log_lvl); 86 unsigned long *stack, unsigned long bp, char *log_lvl);
63 87
64extern void 88extern void
65show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 89show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
66 unsigned long *sp, unsigned long bp, char *log_lvl); 90 unsigned long *sp, unsigned long bp, char *log_lvl);
67 91
68extern unsigned int code_bytes; 92extern unsigned int code_bytes;
69 93
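Editor's note: the new stack_frame() helper centralizes the "where do I start unwinding from" decision that callers used to open-code. A hedged sketch of a caller, using only the show_trace_log_lvl() prototype visible in this hunk:

/* sketch: pick the starting frame pointer exactly as the unwinder would */
static void example_start_unwind(struct task_struct *task, struct pt_regs *regs)
{
        unsigned long bp = stack_frame(task, regs);

        /* bp is 0 when CONFIG_FRAME_POINTER is off; the unwinder then falls
         * back to scanning the stack for plausible return addresses */
        show_trace_log_lvl(task, regs, NULL, bp, KERN_DEBUG);
}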
diff --git a/arch/x86/include/asm/suspend_32.h b/arch/x86/include/asm/suspend_32.h
index fd921c3a6841..487055c8c1aa 100644
--- a/arch/x86/include/asm/suspend_32.h
+++ b/arch/x86/include/asm/suspend_32.h
@@ -9,8 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void) { return 0; }
13
14/* image of the saved processor state */ 12/* image of the saved processor state */
15struct saved_context { 13struct saved_context {
16 u16 es, fs, gs, ss; 14 u16 es, fs, gs, ss;
diff --git a/arch/x86/include/asm/suspend_64.h b/arch/x86/include/asm/suspend_64.h
index 8d942afae681..09b0bf104156 100644
--- a/arch/x86/include/asm/suspend_64.h
+++ b/arch/x86/include/asm/suspend_64.h
@@ -9,11 +9,6 @@
9#include <asm/desc.h> 9#include <asm/desc.h>
10#include <asm/i387.h> 10#include <asm/i387.h>
11 11
12static inline int arch_prepare_suspend(void)
13{
14 return 0;
15}
16
17/* 12/*
18 * Image of the saved processor state, used by the low level ACPI suspend to 13 * Image of the saved processor state, used by the low level ACPI suspend to
19 * RAM code and by the low level hibernation code. 14 * RAM code and by the low level hibernation code.
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 0e831059ac5a..f2b83bc7d784 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -47,14 +47,13 @@ enum {
47 INTERCEPT_MONITOR, 47 INTERCEPT_MONITOR,
48 INTERCEPT_MWAIT, 48 INTERCEPT_MWAIT,
49 INTERCEPT_MWAIT_COND, 49 INTERCEPT_MWAIT_COND,
50 INTERCEPT_XSETBV,
50}; 51};
51 52
52 53
53struct __attribute__ ((__packed__)) vmcb_control_area { 54struct __attribute__ ((__packed__)) vmcb_control_area {
54 u16 intercept_cr_read; 55 u32 intercept_cr;
55 u16 intercept_cr_write; 56 u32 intercept_dr;
56 u16 intercept_dr_read;
57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 57 u32 intercept_exceptions;
59 u64 intercept; 58 u64 intercept;
60 u8 reserved_1[42]; 59 u8 reserved_1[42];
@@ -81,14 +80,19 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
81 u32 event_inj_err; 80 u32 event_inj_err;
82 u64 nested_cr3; 81 u64 nested_cr3;
83 u64 lbr_ctl; 82 u64 lbr_ctl;
84 u64 reserved_5; 83 u32 clean;
84 u32 reserved_5;
85 u64 next_rip; 85 u64 next_rip;
86 u8 reserved_6[816]; 86 u8 insn_len;
87 u8 insn_bytes[15];
88 u8 reserved_6[800];
87}; 89};
88 90
89 91
90#define TLB_CONTROL_DO_NOTHING 0 92#define TLB_CONTROL_DO_NOTHING 0
91#define TLB_CONTROL_FLUSH_ALL_ASID 1 93#define TLB_CONTROL_FLUSH_ALL_ASID 1
94#define TLB_CONTROL_FLUSH_ASID 3
95#define TLB_CONTROL_FLUSH_ASID_LOCAL 7
92 96
93#define V_TPR_MASK 0x0f 97#define V_TPR_MASK 0x0f
94 98
@@ -204,19 +208,31 @@ struct __attribute__ ((__packed__)) vmcb {
204#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK 208#define SVM_SELECTOR_READ_MASK SVM_SELECTOR_WRITE_MASK
205#define SVM_SELECTOR_CODE_MASK (1 << 3) 209#define SVM_SELECTOR_CODE_MASK (1 << 3)
206 210
207#define INTERCEPT_CR0_MASK 1 211#define INTERCEPT_CR0_READ 0
208#define INTERCEPT_CR3_MASK (1 << 3) 212#define INTERCEPT_CR3_READ 3
209#define INTERCEPT_CR4_MASK (1 << 4) 213#define INTERCEPT_CR4_READ 4
210#define INTERCEPT_CR8_MASK (1 << 8) 214#define INTERCEPT_CR8_READ 8
211 215#define INTERCEPT_CR0_WRITE (16 + 0)
212#define INTERCEPT_DR0_MASK 1 216#define INTERCEPT_CR3_WRITE (16 + 3)
213#define INTERCEPT_DR1_MASK (1 << 1) 217#define INTERCEPT_CR4_WRITE (16 + 4)
214#define INTERCEPT_DR2_MASK (1 << 2) 218#define INTERCEPT_CR8_WRITE (16 + 8)
215#define INTERCEPT_DR3_MASK (1 << 3) 219
216#define INTERCEPT_DR4_MASK (1 << 4) 220#define INTERCEPT_DR0_READ 0
217#define INTERCEPT_DR5_MASK (1 << 5) 221#define INTERCEPT_DR1_READ 1
218#define INTERCEPT_DR6_MASK (1 << 6) 222#define INTERCEPT_DR2_READ 2
219#define INTERCEPT_DR7_MASK (1 << 7) 223#define INTERCEPT_DR3_READ 3
224#define INTERCEPT_DR4_READ 4
225#define INTERCEPT_DR5_READ 5
226#define INTERCEPT_DR6_READ 6
227#define INTERCEPT_DR7_READ 7
228#define INTERCEPT_DR0_WRITE (16 + 0)
229#define INTERCEPT_DR1_WRITE (16 + 1)
230#define INTERCEPT_DR2_WRITE (16 + 2)
231#define INTERCEPT_DR3_WRITE (16 + 3)
232#define INTERCEPT_DR4_WRITE (16 + 4)
233#define INTERCEPT_DR5_WRITE (16 + 5)
234#define INTERCEPT_DR6_WRITE (16 + 6)
235#define INTERCEPT_DR7_WRITE (16 + 7)
220 236
221#define SVM_EVTINJ_VEC_MASK 0xff 237#define SVM_EVTINJ_VEC_MASK 0xff
222 238
@@ -246,6 +262,8 @@ struct __attribute__ ((__packed__)) vmcb {
246#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38 262#define SVM_EXITINFOSHIFT_TS_REASON_JMP 38
247#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44 263#define SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE 44
248 264
265#define SVM_EXITINFO_REG_MASK 0x0F
266
249#define SVM_EXIT_READ_CR0 0x000 267#define SVM_EXIT_READ_CR0 0x000
250#define SVM_EXIT_READ_CR3 0x003 268#define SVM_EXIT_READ_CR3 0x003
251#define SVM_EXIT_READ_CR4 0x004 269#define SVM_EXIT_READ_CR4 0x004
@@ -316,6 +334,7 @@ struct __attribute__ ((__packed__)) vmcb {
316#define SVM_EXIT_MONITOR 0x08a 334#define SVM_EXIT_MONITOR 0x08a
317#define SVM_EXIT_MWAIT 0x08b 335#define SVM_EXIT_MWAIT 0x08b
318#define SVM_EXIT_MWAIT_COND 0x08c 336#define SVM_EXIT_MWAIT_COND 0x08c
337#define SVM_EXIT_XSETBV 0x08d
319#define SVM_EXIT_NPF 0x400 338#define SVM_EXIT_NPF 0x400
320 339
321#define SVM_EXIT_ERR -1 340#define SVM_EXIT_ERR -1
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index 8085277e1b8b..977f1761a25d 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -5,17 +5,26 @@
5 5
6#ifdef CONFIG_SWIOTLB 6#ifdef CONFIG_SWIOTLB
7extern int swiotlb; 7extern int swiotlb;
8extern int __init pci_swiotlb_detect(void); 8extern int __init pci_swiotlb_detect_override(void);
9extern int __init pci_swiotlb_detect_4gb(void);
9extern void __init pci_swiotlb_init(void); 10extern void __init pci_swiotlb_init(void);
11extern void __init pci_swiotlb_late_init(void);
10#else 12#else
11#define swiotlb 0 13#define swiotlb 0
12static inline int pci_swiotlb_detect(void) 14static inline int pci_swiotlb_detect_override(void)
15{
16 return 0;
17}
18static inline int pci_swiotlb_detect_4gb(void)
13{ 19{
14 return 0; 20 return 0;
15} 21}
16static inline void pci_swiotlb_init(void) 22static inline void pci_swiotlb_init(void)
17{ 23{
18} 24}
25static inline void pci_swiotlb_late_init(void)
26{
27}
19#endif 28#endif
20 29
21static inline void dma_mark_clean(void *addr, size_t size) {} 30static inline void dma_mark_clean(void *addr, size_t size) {}
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index 33ecc3ea8782..c2ff2a1d845e 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -98,8 +98,6 @@ do { \
98 */ 98 */
99#define HAVE_DISABLE_HLT 99#define HAVE_DISABLE_HLT
100#else 100#else
101#define __SAVE(reg, offset) "movq %%" #reg ",(14-" #offset ")*8(%%rsp)\n\t"
102#define __RESTORE(reg, offset) "movq (14-" #offset ")*8(%%rsp),%%" #reg "\n\t"
103 101
104/* frame pointer must be last for get_wchan */ 102/* frame pointer must be last for get_wchan */
105#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t" 103#define SAVE_CONTEXT "pushf ; pushq %%rbp ; movq %%rsi,%%rbp\n\t"
@@ -305,24 +303,81 @@ static inline void native_wbinvd(void)
305#ifdef CONFIG_PARAVIRT 303#ifdef CONFIG_PARAVIRT
306#include <asm/paravirt.h> 304#include <asm/paravirt.h>
307#else 305#else
308#define read_cr0() (native_read_cr0()) 306
309#define write_cr0(x) (native_write_cr0(x)) 307static inline unsigned long read_cr0(void)
310#define read_cr2() (native_read_cr2()) 308{
311#define write_cr2(x) (native_write_cr2(x)) 309 return native_read_cr0();
312#define read_cr3() (native_read_cr3()) 310}
313#define write_cr3(x) (native_write_cr3(x)) 311
314#define read_cr4() (native_read_cr4()) 312static inline void write_cr0(unsigned long x)
315#define read_cr4_safe() (native_read_cr4_safe()) 313{
316#define write_cr4(x) (native_write_cr4(x)) 314 native_write_cr0(x);
317#define wbinvd() (native_wbinvd()) 315}
316
317static inline unsigned long read_cr2(void)
318{
319 return native_read_cr2();
320}
321
322static inline void write_cr2(unsigned long x)
323{
324 native_write_cr2(x);
325}
326
327static inline unsigned long read_cr3(void)
328{
329 return native_read_cr3();
330}
331
332static inline void write_cr3(unsigned long x)
333{
334 native_write_cr3(x);
335}
336
337static inline unsigned long read_cr4(void)
338{
339 return native_read_cr4();
340}
341
342static inline unsigned long read_cr4_safe(void)
343{
344 return native_read_cr4_safe();
345}
346
347static inline void write_cr4(unsigned long x)
348{
349 native_write_cr4(x);
350}
351
352static inline void wbinvd(void)
353{
354 native_wbinvd();
355}
356
318#ifdef CONFIG_X86_64 357#ifdef CONFIG_X86_64
319#define read_cr8() (native_read_cr8()) 358
320#define write_cr8(x) (native_write_cr8(x)) 359static inline unsigned long read_cr8(void)
321#define load_gs_index native_load_gs_index 360{
361 return native_read_cr8();
362}
363
364static inline void write_cr8(unsigned long x)
365{
366 native_write_cr8(x);
367}
368
369static inline void load_gs_index(unsigned selector)
370{
371 native_load_gs_index(selector);
372}
373
322#endif 374#endif
323 375
324/* Clear the 'TS' bit */ 376/* Clear the 'TS' bit */
325#define clts() (native_clts()) 377static inline void clts(void)
378{
379 native_clts();
380}
326 381
327#endif/* CONFIG_PARAVIRT */ 382#endif/* CONFIG_PARAVIRT */
328 383
diff --git a/arch/x86/include/asm/system_64.h b/arch/x86/include/asm/system_64.h
deleted file mode 100644
index 1159e091ad09..000000000000
--- a/arch/x86/include/asm/system_64.h
+++ /dev/null
@@ -1,22 +0,0 @@
1#ifndef _ASM_X86_SYSTEM_64_H
2#define _ASM_X86_SYSTEM_64_H
3
4#include <asm/segment.h>
5#include <asm/cmpxchg.h>
6
7
8static inline unsigned long read_cr8(void)
9{
10 unsigned long cr8;
11 asm volatile("movq %%cr8,%0" : "=r" (cr8));
12 return cr8;
13}
14
15static inline void write_cr8(unsigned long val)
16{
17 asm volatile("movq %0,%%cr8" :: "r" (val) : "memory");
18}
19
20#include <linux/irqflags.h>
21
22#endif /* _ASM_X86_SYSTEM_64_H */
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index f0b6e5dbc5a0..1f2e61e28981 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -161,8 +161,14 @@ struct thread_info {
161 161
162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR 162#define __HAVE_ARCH_THREAD_INFO_ALLOCATOR
163 163
164#define alloc_thread_info(tsk) \ 164#define alloc_thread_info_node(tsk, node) \
165 ((struct thread_info *)__get_free_pages(THREAD_FLAGS, THREAD_ORDER)) 165({ \
166 struct page *page = alloc_pages_node(node, THREAD_FLAGS, \
167 THREAD_ORDER); \
168 struct thread_info *ret = page ? page_address(page) : NULL; \
169 \
170 ret; \
171})
166 172
167#ifdef CONFIG_X86_32 173#ifdef CONFIG_X86_32
168 174
diff --git a/arch/x86/include/asm/timer.h b/arch/x86/include/asm/timer.h
index 5469630b27f5..fa7b9176b76c 100644
--- a/arch/x86/include/asm/timer.h
+++ b/arch/x86/include/asm/timer.h
@@ -10,12 +10,6 @@
10unsigned long long native_sched_clock(void); 10unsigned long long native_sched_clock(void);
11extern int recalibrate_cpu_khz(void); 11extern int recalibrate_cpu_khz(void);
12 12
13#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
14extern int timer_ack;
15#else
16# define timer_ack (0)
17#endif
18
19extern int no_timer_check; 13extern int no_timer_check;
20 14
21/* Accelerators for sched_clock() 15/* Accelerators for sched_clock()
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 7f3eba08e7de..169be8938b96 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -172,6 +172,4 @@ static inline void flush_tlb_kernel_range(unsigned long start,
172 flush_tlb_all(); 172 flush_tlb_all();
173} 173}
174 174
175extern void zap_low_mappings(bool early);
176
177#endif /* _ASM_X86_TLBFLUSH_H */ 175#endif /* _ASM_X86_TLBFLUSH_H */
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 21899cc31e52..c00692476e9f 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -47,21 +47,6 @@
47 47
48#include <asm/mpspec.h> 48#include <asm/mpspec.h>
49 49
50#ifdef CONFIG_X86_32
51
52/* Mappings between logical cpu number and node number */
53extern int cpu_to_node_map[];
54
55/* Returns the number of the node containing CPU 'cpu' */
56static inline int __cpu_to_node(int cpu)
57{
58 return cpu_to_node_map[cpu];
59}
60#define early_cpu_to_node __cpu_to_node
61#define cpu_to_node __cpu_to_node
62
63#else /* CONFIG_X86_64 */
64
65/* Mappings between logical cpu number and node number */ 50/* Mappings between logical cpu number and node number */
66DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map); 51DECLARE_EARLY_PER_CPU(int, x86_cpu_to_node_map);
67 52
@@ -84,8 +69,6 @@ static inline int early_cpu_to_node(int cpu)
84 69
85#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */ 70#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
86 71
87#endif /* CONFIG_X86_64 */
88
89/* Mappings between node number and cpus on that node. */ 72/* Mappings between node number and cpus on that node. */
90extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 73extern cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
91 74
@@ -110,19 +93,11 @@ extern void setup_node_to_cpumask_map(void);
110#define pcibus_to_node(bus) __pcibus_to_node(bus) 93#define pcibus_to_node(bus) __pcibus_to_node(bus)
111 94
112#ifdef CONFIG_X86_32 95#ifdef CONFIG_X86_32
113extern unsigned long node_start_pfn[];
114extern unsigned long node_end_pfn[];
115extern unsigned long node_remap_size[];
116#define node_has_online_mem(nid) (node_start_pfn[nid] != node_end_pfn[nid])
117
118# define SD_CACHE_NICE_TRIES 1 96# define SD_CACHE_NICE_TRIES 1
119# define SD_IDLE_IDX 1 97# define SD_IDLE_IDX 1
120
121#else 98#else
122
123# define SD_CACHE_NICE_TRIES 2 99# define SD_CACHE_NICE_TRIES 2
124# define SD_IDLE_IDX 2 100# define SD_IDLE_IDX 2
125
126#endif 101#endif
127 102
128/* sched_domains SD_NODE_INIT for NUMA machines */ 103/* sched_domains SD_NODE_INIT for NUMA machines */
@@ -155,7 +130,7 @@ extern unsigned long node_remap_size[];
155 .balance_interval = 1, \ 130 .balance_interval = 1, \
156} 131}
157 132
158#ifdef CONFIG_X86_64_ACPI_NUMA 133#ifdef CONFIG_X86_64
159extern int __node_distance(int, int); 134extern int __node_distance(int, int);
160#define node_distance(a, b) __node_distance(a, b) 135#define node_distance(a, b) __node_distance(a, b)
161#endif 136#endif
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 4dde797c0578..feca3118a73b 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -3,28 +3,36 @@
3 3
4#ifndef __ASSEMBLY__ 4#ifndef __ASSEMBLY__
5 5
6#ifdef CONFIG_X86_TRAMPOLINE 6#include <linux/types.h>
7#include <asm/io.h>
8
7/* 9/*
8 * Trampoline 80x86 program as an array. 10 * Trampoline 80x86 program as an array. These are in the init rodata
11 * segment, but that's okay, because we only care about the relative
12 * addresses of the symbols.
9 */ 13 */
10extern const unsigned char trampoline_data []; 14extern const unsigned char x86_trampoline_start [];
11extern const unsigned char trampoline_end []; 15extern const unsigned char x86_trampoline_end [];
12extern unsigned char *trampoline_base; 16extern unsigned char *x86_trampoline_base;
13 17
14extern unsigned long init_rsp; 18extern unsigned long init_rsp;
15extern unsigned long initial_code; 19extern unsigned long initial_code;
16extern unsigned long initial_page_table;
17extern unsigned long initial_gs; 20extern unsigned long initial_gs;
18 21
19#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 22extern void __init setup_trampolines(void);
23
24extern const unsigned char trampoline_data[];
25extern const unsigned char trampoline_status[];
26
27#define TRAMPOLINE_SYM(x) \
28 ((void *)(x86_trampoline_base + \
29 ((const unsigned char *)(x) - x86_trampoline_start)))
20 30
21extern unsigned long setup_trampoline(void); 31/* Address of the SMP trampoline */
22extern void __init setup_trampoline_page_table(void); 32static inline unsigned long trampoline_address(void)
23extern void __init reserve_trampoline_memory(void); 33{
24#else 34 return virt_to_phys(TRAMPOLINE_SYM(trampoline_data));
25static inline void setup_trampoline_page_table(void) {} 35}
26static inline void reserve_trampoline_memory(void) {}
27#endif /* CONFIG_X86_TRAMPOLINE */
28 36
29#endif /* __ASSEMBLY__ */ 37#endif /* __ASSEMBLY__ */
30 38
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index f66cda56781d..0310da67307f 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -30,6 +30,7 @@ asmlinkage void segment_not_present(void);
30asmlinkage void stack_segment(void); 30asmlinkage void stack_segment(void);
31asmlinkage void general_protection(void); 31asmlinkage void general_protection(void);
32asmlinkage void page_fault(void); 32asmlinkage void page_fault(void);
33asmlinkage void async_page_fault(void);
33asmlinkage void spurious_interrupt_bug(void); 34asmlinkage void spurious_interrupt_bug(void);
34asmlinkage void coprocessor_error(void); 35asmlinkage void coprocessor_error(void);
35asmlinkage void alignment_check(void); 36asmlinkage void alignment_check(void);
diff --git a/arch/x86/include/asm/tsc.h b/arch/x86/include/asm/tsc.h
index 1ca132fc0d03..9db5583b6d38 100644
--- a/arch/x86/include/asm/tsc.h
+++ b/arch/x86/include/asm/tsc.h
@@ -35,7 +35,7 @@ static inline cycles_t get_cycles(void)
35static __always_inline cycles_t vget_cycles(void) 35static __always_inline cycles_t vget_cycles(void)
36{ 36{
37 /* 37 /*
38 * We only do VDSOs on TSC capable CPUs, so this shouldnt 38 * We only do VDSOs on TSC capable CPUs, so this shouldn't
39 * access boot_cpu_data (which is not VDSO-safe): 39 * access boot_cpu_data (which is not VDSO-safe):
40 */ 40 */
41#ifndef CONFIG_X86_TSC 41#ifndef CONFIG_X86_TSC
@@ -51,6 +51,10 @@ extern int unsynchronized_tsc(void);
51extern int check_tsc_unstable(void); 51extern int check_tsc_unstable(void);
52extern unsigned long native_calibrate_tsc(void); 52extern unsigned long native_calibrate_tsc(void);
53 53
54#ifdef CONFIG_X86_64
55extern cycles_t vread_tsc(void);
56#endif
57
54/* 58/*
55 * Boot-time check whether the TSCs are synchronized across 59 * Boot-time check whether the TSCs are synchronized across
56 * all CPUs/cores: 60 * all CPUs/cores:
diff --git a/arch/x86/include/asm/types.h b/arch/x86/include/asm/types.h
index df1da20f4534..8e8c23fef08c 100644
--- a/arch/x86/include/asm/types.h
+++ b/arch/x86/include/asm/types.h
@@ -1,22 +1,6 @@
1#ifndef _ASM_X86_TYPES_H 1#ifndef _ASM_X86_TYPES_H
2#define _ASM_X86_TYPES_H 2#define _ASM_X86_TYPES_H
3 3
4#define dma_addr_t dma_addr_t
5
6#include <asm-generic/types.h> 4#include <asm-generic/types.h>
7 5
8#ifdef __KERNEL__
9#ifndef __ASSEMBLY__
10
11typedef u64 dma64_addr_t;
12#if defined(CONFIG_X86_64) || defined(CONFIG_HIGHMEM64G)
13/* DMA addresses come in 32-bit and 64-bit flavours. */
14typedef u64 dma_addr_t;
15#else
16typedef u32 dma_addr_t;
17#endif
18
19#endif /* __ASSEMBLY__ */
20#endif /* __KERNEL__ */
21
22#endif /* _ASM_X86_TYPES_H */ 6#endif /* _ASM_X86_TYPES_H */
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index abd3e0ea762a..99ddd148a760 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -6,7 +6,6 @@
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
@@ -42,7 +41,7 @@
42 * Returns 0 if the range is valid, nonzero otherwise. 41 * Returns 0 if the range is valid, nonzero otherwise.
43 * 42 *
44 * This is equivalent to the following test: 43 * This is equivalent to the following test:
45 * (u33)addr + (u33)size >= (u33)current->addr_limit.seg (u65 for x86_64) 44 * (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
46 * 45 *
47 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry... 46 * This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
48 */ 47 */
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 088d09fb1615..566e803cc602 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/errno.h> 7#include <linux/errno.h>
8#include <linux/thread_info.h> 8#include <linux/thread_info.h>
9#include <linux/prefetch.h>
10#include <linux/string.h> 9#include <linux/string.h>
11#include <asm/asm.h> 10#include <asm/asm.h>
12#include <asm/page.h> 11#include <asm/page.h>
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index 316708d5af92..1c66d30971ad 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -6,7 +6,6 @@
6 */ 6 */
7#include <linux/compiler.h> 7#include <linux/compiler.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 9#include <linux/lockdep.h>
11#include <asm/alternative.h> 10#include <asm/alternative.h>
12#include <asm/cpufeature.h> 11#include <asm/cpufeature.h>
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index b7ba19acd3f8..2f6e127db30c 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -346,14 +346,20 @@
346#define __NR_fanotify_init 338 346#define __NR_fanotify_init 338
347#define __NR_fanotify_mark 339 347#define __NR_fanotify_mark 339
348#define __NR_prlimit64 340 348#define __NR_prlimit64 340
349#define __NR_name_to_handle_at 341
350#define __NR_open_by_handle_at 342
351#define __NR_clock_adjtime 343
352#define __NR_syncfs 344
353#define __NR_sendmmsg 345
354#define __NR_setns 346
349 355
350#define __NR_LITMUS 341 356#define __NR_LITMUS 347
351 357
352#include "litmus/unistd_32.h" 358#include "litmus/unistd_32.h"
353 359
354#ifdef __KERNEL__ 360#ifdef __KERNEL__
355 361
356#define NR_syscalls 341 + NR_litmus_syscalls 362#define NR_syscalls 347 + NR_litmus_syscalls
357 363
358#define __ARCH_WANT_IPC_PARSE_VERSION 364#define __ARCH_WANT_IPC_PARSE_VERSION
359#define __ARCH_WANT_OLD_READDIR 365#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 332bf3c9c84c..e347f0773788 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -669,8 +669,20 @@ __SYSCALL(__NR_fanotify_init, sys_fanotify_init)
669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark) 669__SYSCALL(__NR_fanotify_mark, sys_fanotify_mark)
670#define __NR_prlimit64 302 670#define __NR_prlimit64 302
671__SYSCALL(__NR_prlimit64, sys_prlimit64) 671__SYSCALL(__NR_prlimit64, sys_prlimit64)
672 672#define __NR_name_to_handle_at 303
673#define __NR_LITMUS 303 673__SYSCALL(__NR_name_to_handle_at, sys_name_to_handle_at)
674#define __NR_open_by_handle_at 304
675__SYSCALL(__NR_open_by_handle_at, sys_open_by_handle_at)
676#define __NR_clock_adjtime 305
677__SYSCALL(__NR_clock_adjtime, sys_clock_adjtime)
678#define __NR_syncfs 306
679__SYSCALL(__NR_syncfs, sys_syncfs)
680#define __NR_sendmmsg 307
681__SYSCALL(__NR_sendmmsg, sys_sendmmsg)
682#define __NR_setns 308
683__SYSCALL(__NR_setns, sys_setns)
684
685#define __NR_LITMUS 309
674 686
675#include "litmus/unistd_64.h" 687#include "litmus/unistd_64.h"
676 688
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 42d412fd8b02..a291c40efd43 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV Broadcast Assist Unit definitions 6 * SGI UV Broadcast Assist Unit definitions
7 * 7 *
8 * Copyright (C) 2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2008-2011 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_BAU_H 11#ifndef _ASM_X86_UV_UV_BAU_H
@@ -26,24 +26,29 @@
26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512, 26 * BAU_SB_DESCRIPTOR_BASE register, set 1 is located at BASE + 512,
27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on. 27 * set 2 is at BASE + 2*512, set 3 at BASE + 3*512, and so on.
28 * 28 *
29 * We will use 31 sets, one for sending BAU messages from each of the 32 29 * We will use one set for sending BAU messages from each of the
30 * cpu's on the uvhub. 30 * cpu's on the uvhub.
31 * 31 *
32 * TLB shootdown will use the first of the 8 descriptors of each set. 32 * TLB shootdown will use the first of the 8 descriptors of each set.
33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set). 33 * Each of the descriptors is 64 bytes in size (8*64 = 512 bytes in a set).
34 */ 34 */
35 35
36#define UV_ITEMS_PER_DESCRIPTOR 8 36#define MAX_CPUS_PER_UVHUB 64
37#define MAX_CPUS_PER_SOCKET 32
38#define ADP_SZ 64 /* hardware-provided max. */
39#define UV_CPUS_PER_AS 32 /* hardware-provided max. */
40#define ITEMS_PER_DESC 8
37/* the 'throttle' to prevent the hardware stay-busy bug */ 41/* the 'throttle' to prevent the hardware stay-busy bug */
38#define MAX_BAU_CONCURRENT 3 42#define MAX_BAU_CONCURRENT 3
39#define UV_CPUS_PER_ACT_STATUS 32
40#define UV_ACT_STATUS_MASK 0x3 43#define UV_ACT_STATUS_MASK 0x3
41#define UV_ACT_STATUS_SIZE 2 44#define UV_ACT_STATUS_SIZE 2
42#define UV_ADP_SIZE 32
43#define UV_DISTRIBUTION_SIZE 256 45#define UV_DISTRIBUTION_SIZE 256
44#define UV_SW_ACK_NPENDING 8 46#define UV_SW_ACK_NPENDING 8
45#define UV_NET_ENDPOINT_INTD 0x38 47#define UV1_NET_ENDPOINT_INTD 0x38
46#define UV_DESC_BASE_PNODE_SHIFT 49 48#define UV2_NET_ENDPOINT_INTD 0x28
49#define UV_NET_ENDPOINT_INTD (is_uv1_hub() ? \
50 UV1_NET_ENDPOINT_INTD : UV2_NET_ENDPOINT_INTD)
51#define UV_DESC_PSHIFT 49
47#define UV_PAYLOADQ_PNODE_SHIFT 49 52#define UV_PAYLOADQ_PNODE_SHIFT 49
48#define UV_PTC_BASENAME "sgi_uv/ptc_statistics" 53#define UV_PTC_BASENAME "sgi_uv/ptc_statistics"
49#define UV_BAU_BASENAME "sgi_uv/bau_tunables" 54#define UV_BAU_BASENAME "sgi_uv/bau_tunables"
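Editor's note: the comment at the top of this hunk fixes the descriptor-set layout: 8 descriptors of 64 bytes each per 512-byte set, one set per CPU on the uvhub, all starting at the BAU_SB_DESCRIPTOR_BASE register. A stand-alone check of the offset arithmetic (DESC_SIZE is derived from the comment, not a macro from this header):

#include <stdio.h>

#define ITEMS_PER_DESC  8       /* descriptors per activation set      */
#define DESC_SIZE       64      /* bytes per descriptor: 8 * 64 = 512  */

/* byte offset of descriptor 'item' in the set belonging to 'cpu' */
static unsigned long bau_desc_offset(unsigned int cpu, unsigned int item)
{
        return (unsigned long)cpu * ITEMS_PER_DESC * DESC_SIZE + item * DESC_SIZE;
}

int main(void)
{
        /* cpu 3, first descriptor (the one TLB shootdown uses): 3 * 512 = 1536 */
        printf("offset = %lu bytes\n", bau_desc_offset(3, 0));
        return 0;
}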
@@ -51,29 +56,64 @@
51#define UV_BAU_TUNABLES_FILE "bau_tunables" 56#define UV_BAU_TUNABLES_FILE "bau_tunables"
52#define WHITESPACE " \t\n" 57#define WHITESPACE " \t\n"
53#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask)) 58#define uv_physnodeaddr(x) ((__pa((unsigned long)(x)) & uv_mmask))
54#define UV_ENABLE_INTD_SOFT_ACK_MODE_SHIFT 15 59#define cpubit_isset(cpu, bau_local_cpumask) \
55#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHIFT 16 60 test_bit((cpu), (bau_local_cpumask).bits)
56#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x0000000009UL 61
57/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */ 62/* [19:16] SOFT_ACK timeout period 19: 1 is urgency 7 17:16 1 is multiplier */
58#define BAU_MISC_CONTROL_MULT_MASK 3 63/*
64 * UV2: Bit 19 selects between
65 * (0): 10 microsecond timebase and
66 * (1): 80 microseconds
67 * we're using 655us, similar to UV1: 65 units of 10us
68 */
69#define UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD (9UL)
70#define UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD (65*10UL)
71
72#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD (is_uv1_hub() ? \
73 UV1_INTD_SOFT_ACK_TIMEOUT_PERIOD : \
74 UV2_INTD_SOFT_ACK_TIMEOUT_PERIOD)
59 75
60#define UVH_AGING_PRESCALE_SEL 0x000000b000UL 76#define BAU_MISC_CONTROL_MULT_MASK 3
77
78#define UVH_AGING_PRESCALE_SEL 0x000000b000UL
61/* [30:28] URGENCY_7 an index into a table of times */ 79/* [30:28] URGENCY_7 an index into a table of times */
62#define BAU_URGENCY_7_SHIFT 28 80#define BAU_URGENCY_7_SHIFT 28
63#define BAU_URGENCY_7_MASK 7 81#define BAU_URGENCY_7_MASK 7
64 82
65#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL 83#define UVH_TRANSACTION_TIMEOUT 0x000000b200UL
66/* [45:40] BAU - BAU transaction timeout select - a multiplier */ 84/* [45:40] BAU - BAU transaction timeout select - a multiplier */
67#define BAU_TRANS_SHIFT 40 85#define BAU_TRANS_SHIFT 40
68#define BAU_TRANS_MASK 0x3f 86#define BAU_TRANS_MASK 0x3f
87
88/*
89 * shorten some awkward names
90 */
91#define AS_PUSH_SHIFT UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT
92#define SOFTACK_MSHIFT UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT
93#define SOFTACK_PSHIFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
94#define SOFTACK_TIMEOUT_PERIOD UV_INTD_SOFT_ACK_TIMEOUT_PERIOD
95#define write_gmmr uv_write_global_mmr64
96#define write_lmmr uv_write_local_mmr
97#define read_lmmr uv_read_local_mmr
98#define read_gmmr uv_read_global_mmr64
69 99
70/* 100/*
71 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1 101 * bits in UVH_LB_BAU_SB_ACTIVATION_STATUS_0/1
72 */ 102 */
73#define DESC_STATUS_IDLE 0 103#define DS_IDLE 0
74#define DESC_STATUS_ACTIVE 1 104#define DS_ACTIVE 1
75#define DESC_STATUS_DESTINATION_TIMEOUT 2 105#define DS_DESTINATION_TIMEOUT 2
76#define DESC_STATUS_SOURCE_TIMEOUT 3 106#define DS_SOURCE_TIMEOUT 3
107/*
108 * bits put together from HRP_LB_BAU_SB_ACTIVATION_STATUS_0/1/2
109 * values 1 and 5 will not occur
110 */
111#define UV2H_DESC_IDLE 0
112#define UV2H_DESC_DEST_TIMEOUT 2
113#define UV2H_DESC_DEST_STRONG_NACK 3
114#define UV2H_DESC_BUSY 4
115#define UV2H_DESC_SOURCE_TIMEOUT 6
116#define UV2H_DESC_DEST_PUT_ERR 7
77 117
78/* 118/*
79 * delay for 'plugged' timeout retries, in microseconds 119 * delay for 'plugged' timeout retries, in microseconds
@@ -84,13 +124,24 @@
84 * thresholds at which to use IPI to free resources 124 * thresholds at which to use IPI to free resources
85 */ 125 */
86/* after this # consecutive 'plugged' timeouts, use IPI to release resources */ 126/* after this # consecutive 'plugged' timeouts, use IPI to release resources */
87#define PLUGSB4RESET 100 127#define PLUGSB4RESET 100
88/* after this many consecutive timeouts, use IPI to release resources */ 128/* after this many consecutive timeouts, use IPI to release resources */
89#define TIMEOUTSB4RESET 1 129#define TIMEOUTSB4RESET 1
90/* at this number of uses of IPI to release resources, give up the request */ 130/* at this number of uses of IPI to release resources, give up the request */
91#define IPI_RESET_LIMIT 1 131#define IPI_RESET_LIMIT 1
92/* after this # consecutive successes, bump up the throttle if it was lowered */ 132/* after this # consecutive successes, bump up the throttle if it was lowered */
93#define COMPLETE_THRESHOLD 5 133#define COMPLETE_THRESHOLD 5
134
135#define UV_LB_SUBNODEID 0x10
136
137/* these two are the same for UV1 and UV2: */
138#define UV_SA_SHFT UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT
139#define UV_SA_MASK UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK
140/* 4 bits of software ack period */
141#define UV2_ACK_MASK 0x7UL
142#define UV2_ACK_UNITS_SHFT 3
143#define UV2_LEG_SHFT UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT
144#define UV2_EXT_SHFT UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT
94 145
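To make the SOFTACK_*/UV2_* shorthands above concrete, here is a minimal sketch (not part of this patch) of how a driver might switch software-acknowledge mode on for one pnode. It assumes read_gmmr()/write_gmmr() from the block above, UVH_LB_BAU_MISC_CONTROL from uv_mmrs.h, and is_uv2_hub() from uv_hub.h; the function name is purely illustrative.

static inline void example_enable_softack(int pnode)
{
        unsigned long mmr = read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);

        mmr |= 1UL << SOFTACK_MSHIFT;           /* turn soft-ack mode on */
        if (is_uv2_hub()) {
                mmr |= 1UL << UV2_LEG_SHFT;     /* keep UV1-style descriptors */
                mmr |= 1UL << UV2_EXT_SHFT;     /* enable extended SB status */
        }
        write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr);
}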
95/* 146/*
96 * number of entries in the destination side payload queue 147 * number of entries in the destination side payload queue
@@ -100,7 +151,6 @@
100 * number of destination side software ack resources 151 * number of destination side software ack resources
101 */ 152 */
102#define DEST_NUM_RESOURCES 8 153#define DEST_NUM_RESOURCES 8
103#define MAX_CPUS_PER_NODE 32
104/* 154/*
105 * completion statuses for sending a TLB flush message 155 * completion statuses for sending a TLB flush message
106 */ 156 */
@@ -112,9 +162,16 @@
112/* 162/*
113 * tuning the action when the numalink network is extremely delayed 163 * tuning the action when the numalink network is extremely delayed
114 */ 164 */
115#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in microseconds */ 165#define CONGESTED_RESPONSE_US 1000 /* 'long' response time, in
116#define CONGESTED_REPS 10 /* long delays averaged over this many broadcasts */ 166 microseconds */
117#define CONGESTED_PERIOD 30 /* time for the bau to be disabled, in seconds */ 167#define CONGESTED_REPS 10 /* long delays averaged over
168 this many broadcasts */
169#define CONGESTED_PERIOD 30 /* time for the bau to be
170 disabled, in seconds */
171/* see msg_type: */
172#define MSG_NOOP 0
173#define MSG_REGULAR 1
174#define MSG_RETRY 2
118 175
119/* 176/*
120 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor) 177 * Distribution: 32 bytes (256 bits) (bytes 0-0x1f of descriptor)
@@ -123,11 +180,11 @@
123 * The distribution specification (32 bytes) is interpreted as a 256-bit 180 * The distribution specification (32 bytes) is interpreted as a 256-bit
124 * distribution vector. Adjacent bits correspond to consecutive even numbered 181 * distribution vector. Adjacent bits correspond to consecutive even numbered
125 * nodeIDs. The result of adding the index of a given bit to the 15-bit 182 * nodeIDs. The result of adding the index of a given bit to the 15-bit
126 * 'base_dest_nodeid' field of the header corresponds to the 183 * 'base_dest_nasid' field of the header corresponds to the
127 * destination nodeID associated with that specified bit. 184 * destination nodeID associated with that specified bit.
128 */ 185 */
129struct bau_target_uvhubmask { 186struct bau_targ_hubmask {
130 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)]; 187 unsigned long bits[BITS_TO_LONGS(UV_DISTRIBUTION_SIZE)];
131}; 188};
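A minimal usage sketch (not from the patch): building a distribution vector with the bau_uvhub_*() helpers declared near the end of this header. The function name is illustrative only.

static inline void example_build_distribution(struct bau_targ_hubmask *dist,
                                              int pnode)
{
        bau_uvhubs_clear(dist, UV_DISTRIBUTION_SIZE);   /* no targets yet */
        bau_uvhub_set(pnode, dist);                     /* add one hub */
        /* bau_uvhub_weight(dist) now reports a single target */
}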
132 189
133/* 190/*
@@ -136,7 +193,7 @@ struct bau_target_uvhubmask {
136 * enough bits for max. cpu's per uvhub) 193 * enough bits for max. cpu's per uvhub)
137 */ 194 */
138struct bau_local_cpumask { 195struct bau_local_cpumask {
139 unsigned long bits; 196 unsigned long bits;
140}; 197};
141 198
142/* 199/*
@@ -157,14 +214,14 @@ struct bau_local_cpumask {
157 * The payload is software-defined for INTD transactions 214 * The payload is software-defined for INTD transactions
158 */ 215 */
159struct bau_msg_payload { 216struct bau_msg_payload {
160 unsigned long address; /* signifies a page or all TLB's 217 unsigned long address; /* signifies a page or all
161 of the cpu */ 218 TLB's of the cpu */
162 /* 64 bits */ 219 /* 64 bits */
163 unsigned short sending_cpu; /* filled in by sender */ 220 unsigned short sending_cpu; /* filled in by sender */
164 /* 16 bits */ 221 /* 16 bits */
165 unsigned short acknowledge_count;/* filled in by destination */ 222 unsigned short acknowledge_count; /* filled in by destination */
166 /* 16 bits */ 223 /* 16 bits */
167 unsigned int reserved1:32; /* not usable */ 224 unsigned int reserved1:32; /* not usable */
168}; 225};
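A sketch (not part of the patch) of how the sending side might fill the payload; using TLB_FLUSH_ALL from <asm/tlbflush.h> as the "all TLBs" sentinel is an assumption about the caller, not something this header mandates.

static inline void example_fill_payload(struct bau_msg_payload *payload,
                                        unsigned long va)
{
        payload->address = va;                  /* one page, or TLB_FLUSH_ALL */
        payload->sending_cpu = smp_processor_id();
        payload->acknowledge_count = 0;         /* destination fills this in */
}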
169 226
170 227
@@ -173,93 +230,96 @@ struct bau_msg_payload {
173 * see table 4.2.3.0.1 in broadcast_assist spec. 230 * see table 4.2.3.0.1 in broadcast_assist spec.
174 */ 231 */
175struct bau_msg_header { 232struct bau_msg_header {
176 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */ 233 unsigned int dest_subnodeid:6; /* must be 0x10, for the LB */
177 /* bits 5:0 */ 234 /* bits 5:0 */
178 unsigned int base_dest_nodeid:15; /* nasid (pnode<<1) of */ 235 unsigned int base_dest_nasid:15; /* nasid of the first bit */
179 /* bits 20:6 */ /* first bit in uvhub map */ 236 /* bits 20:6 */ /* in uvhub map */
180 unsigned int command:8; /* message type */ 237 unsigned int command:8; /* message type */
181 /* bits 28:21 */ 238 /* bits 28:21 */
182 /* 0x38: SN3net EndPoint Message */ 239 /* 0x38: SN3net EndPoint Message */
183 unsigned int rsvd_1:3; /* must be zero */ 240 unsigned int rsvd_1:3; /* must be zero */
184 /* bits 31:29 */ 241 /* bits 31:29 */
185 /* int will align on 32 bits */ 242 /* int will align on 32 bits */
186 unsigned int rsvd_2:9; /* must be zero */ 243 unsigned int rsvd_2:9; /* must be zero */
187 /* bits 40:32 */ 244 /* bits 40:32 */
188 /* Suppl_A is 56-41 */ 245 /* Suppl_A is 56-41 */
189 unsigned int sequence:16;/* message sequence number */ 246 unsigned int sequence:16; /* message sequence number */
190 /* bits 56:41 */ /* becomes bytes 16-17 of msg */ 247 /* bits 56:41 */ /* becomes bytes 16-17 of msg */
191 /* Address field (96:57) is never used as an 248 /* Address field (96:57) is
192 address (these are address bits 42:3) */ 249 never used as an address
193 250 (these are address bits
194 unsigned int rsvd_3:1; /* must be zero */ 251 42:3) */
252
253 unsigned int rsvd_3:1; /* must be zero */
195 /* bit 57 */ 254 /* bit 57 */
196 /* address bits 27:4 are payload */ 255 /* address bits 27:4 are payload */
197 /* these next 24 (58-81) bits become bytes 12-14 of msg */ 256 /* these next 24 (58-81) bits become bytes 12-14 of msg */
198
199 /* bits 65:58 land in byte 12 */ 257 /* bits 65:58 land in byte 12 */
200 unsigned int replied_to:1;/* sent as 0 by the source to byte 12 */ 258 unsigned int replied_to:1; /* sent as 0 by the source to
259 byte 12 */
201 /* bit 58 */ 260 /* bit 58 */
202 unsigned int msg_type:3; /* software type of the message*/ 261 unsigned int msg_type:3; /* software type of the
262 message */
203 /* bits 61:59 */ 263 /* bits 61:59 */
204 unsigned int canceled:1; /* message canceled, resource to be freed*/ 264 unsigned int canceled:1; /* message canceled, resource
265 is to be freed*/
205 /* bit 62 */ 266 /* bit 62 */
206 unsigned int payload_1a:1;/* not currently used */ 267 unsigned int payload_1a:1; /* not currently used */
207 /* bit 63 */ 268 /* bit 63 */
208 unsigned int payload_1b:2;/* not currently used */ 269 unsigned int payload_1b:2; /* not currently used */
209 /* bits 65:64 */ 270 /* bits 65:64 */
210 271
211 /* bits 73:66 land in byte 13 */ 272 /* bits 73:66 land in byte 13 */
212 unsigned int payload_1ca:6;/* not currently used */ 273 unsigned int payload_1ca:6; /* not currently used */
213 /* bits 71:66 */ 274 /* bits 71:66 */
214 unsigned int payload_1c:2;/* not currently used */ 275 unsigned int payload_1c:2; /* not currently used */
215 /* bits 73:72 */ 276 /* bits 73:72 */
216 277
217 /* bits 81:74 land in byte 14 */ 278 /* bits 81:74 land in byte 14 */
218 unsigned int payload_1d:6;/* not currently used */ 279 unsigned int payload_1d:6; /* not currently used */
219 /* bits 79:74 */ 280 /* bits 79:74 */
220 unsigned int payload_1e:2;/* not currently used */ 281 unsigned int payload_1e:2; /* not currently used */
221 /* bits 81:80 */ 282 /* bits 81:80 */
222 283
223 unsigned int rsvd_4:7; /* must be zero */ 284 unsigned int rsvd_4:7; /* must be zero */
224 /* bits 88:82 */ 285 /* bits 88:82 */
225 unsigned int sw_ack_flag:1;/* software acknowledge flag */ 286 unsigned int swack_flag:1; /* software acknowledge flag */
226 /* bit 89 */ 287 /* bit 89 */
227 /* INTD trasactions at destination are to 288 /* INTD trasactions at
228 wait for software acknowledge */ 289 destination are to wait for
229 unsigned int rsvd_5:6; /* must be zero */ 290 software acknowledge */
291 unsigned int rsvd_5:6; /* must be zero */
230 /* bits 95:90 */ 292 /* bits 95:90 */
231 unsigned int rsvd_6:5; /* must be zero */ 293 unsigned int rsvd_6:5; /* must be zero */
232 /* bits 100:96 */ 294 /* bits 100:96 */
233 unsigned int int_both:1;/* if 1, interrupt both sockets on the uvhub */ 295 unsigned int int_both:1; /* if 1, interrupt both sockets
296 on the uvhub */
234 /* bit 101*/ 297 /* bit 101*/
235 unsigned int fairness:3;/* usually zero */ 298 unsigned int fairness:3; /* usually zero */
236 /* bits 104:102 */ 299 /* bits 104:102 */
237 unsigned int multilevel:1; /* multi-level multicast format */ 300 unsigned int multilevel:1; /* multi-level multicast
301 format */
238 /* bit 105 */ 302 /* bit 105 */
239 /* 0 for TLB: endpoint multi-unicast messages */ 303 /* 0 for TLB: endpoint multi-unicast messages */
240 unsigned int chaining:1;/* next descriptor is part of this activation*/ 304 unsigned int chaining:1; /* next descriptor is part of
305 this activation*/
241 /* bit 106 */ 306 /* bit 106 */
242 unsigned int rsvd_7:21; /* must be zero */ 307 unsigned int rsvd_7:21; /* must be zero */
243 /* bits 127:107 */ 308 /* bits 127:107 */
244}; 309};
245 310
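To ground the bitfield comments above, a hedged sketch (not from the patch) of the kind of header initialization a sender would do. Fields marked "must be zero" are assumed to have been cleared already (e.g. by a memset() of the whole descriptor), and base_dest_nasid/sequence are left to the caller; the function name is illustrative.

static inline void example_init_header(struct bau_msg_header *hdr)
{
        hdr->dest_subnodeid = UV_LB_SUBNODEID;          /* 0x10: the hub's LB */
        hdr->command        = UV_NET_ENDPOINT_INTD;     /* SN3net endpoint msg */
        hdr->swack_flag     = 1;                        /* wait for software ack */
}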
246/* see msg_type: */
247#define MSG_NOOP 0
248#define MSG_REGULAR 1
249#define MSG_RETRY 2
250
251/* 311/*
252 * The activation descriptor: 312 * The activation descriptor:
253 * The format of the message to send, plus all accompanying control 313 * The format of the message to send, plus all accompanying control
254 * Should be 64 bytes 314 * Should be 64 bytes
255 */ 315 */
256struct bau_desc { 316struct bau_desc {
257 struct bau_target_uvhubmask distribution; 317 struct bau_targ_hubmask distribution;
258 /* 318 /*
259 * message template, consisting of header and payload: 319 * message template, consisting of header and payload:
260 */ 320 */
261 struct bau_msg_header header; 321 struct bau_msg_header header;
262 struct bau_msg_payload payload; 322 struct bau_msg_payload payload;
263}; 323};
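The size claims in the comments (a 32-byte distribution, a 128-bit header, a "should be 64 bytes" descriptor) can be pinned down at compile time; a sketch, assuming BUILD_BUG_ON() from <linux/kernel.h>:

static inline void example_check_bau_desc_layout(void)
{
        BUILD_BUG_ON(sizeof(struct bau_targ_hubmask) != 32);
        BUILD_BUG_ON(sizeof(struct bau_msg_header)   != 16);
        BUILD_BUG_ON(sizeof(struct bau_msg_payload)  != 16);
        BUILD_BUG_ON(sizeof(struct bau_desc)         != 64);
}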
264/* 324/*
265 * -payload-- ---------header------ 325 * -payload-- ---------header------
@@ -278,59 +338,51 @@ struct bau_desc {
278 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17 338 * are 32 bytes (2 micropackets) (256 bits) in length, but contain only 17
279 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120) 339 * bytes of usable data, including the sw ack vector in byte 15 (bits 127:120)
280 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from 340 * (12 bytes come from bau_msg_payload, 3 from payload_1, 2 from
281 * sw_ack_vector and payload_2) 341 * swack_vec and payload_2)
282 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software 342 * "Enabling Software Acknowledgment mode (see Section 4.3.3 Software
283 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload 343 * Acknowledge Processing) also selects 32 byte (17 bytes usable) payload
284 * operation." 344 * operation."
285 */ 345 */
286struct bau_payload_queue_entry { 346struct bau_pq_entry {
287 unsigned long address; /* signifies a page or all TLB's 347 unsigned long address; /* signifies a page or all TLB's
288 of the cpu */ 348 of the cpu */
289 /* 64 bits, bytes 0-7 */ 349 /* 64 bits, bytes 0-7 */
290 350 unsigned short sending_cpu; /* cpu that sent the message */
291 unsigned short sending_cpu; /* cpu that sent the message */
292 /* 16 bits, bytes 8-9 */ 351 /* 16 bits, bytes 8-9 */
293 352 unsigned short acknowledge_count; /* filled in by destination */
294 unsigned short acknowledge_count; /* filled in by destination */
295 /* 16 bits, bytes 10-11 */ 353 /* 16 bits, bytes 10-11 */
296
297 /* these next 3 bytes come from bits 58-81 of the message header */ 354 /* these next 3 bytes come from bits 58-81 of the message header */
298 unsigned short replied_to:1; /* sent as 0 by the source */ 355 unsigned short replied_to:1; /* sent as 0 by the source */
299 unsigned short msg_type:3; /* software message type */ 356 unsigned short msg_type:3; /* software message type */
300 unsigned short canceled:1; /* sent as 0 by the source */ 357 unsigned short canceled:1; /* sent as 0 by the source */
301 unsigned short unused1:3; /* not currently using */ 358 unsigned short unused1:3; /* not currently using */
302 /* byte 12 */ 359 /* byte 12 */
303 360 unsigned char unused2a; /* not currently using */
304 unsigned char unused2a; /* not currently using */
305 /* byte 13 */ 361 /* byte 13 */
306 unsigned char unused2; /* not currently using */ 362 unsigned char unused2; /* not currently using */
307 /* byte 14 */ 363 /* byte 14 */
308 364 unsigned char swack_vec; /* filled in by the hardware */
309 unsigned char sw_ack_vector; /* filled in by the hardware */
310 /* byte 15 (bits 127:120) */ 365 /* byte 15 (bits 127:120) */
311 366 unsigned short sequence; /* message sequence number */
312 unsigned short sequence; /* message sequence number */
313 /* bytes 16-17 */ 367 /* bytes 16-17 */
314 unsigned char unused4[2]; /* not currently using bytes 18-19 */ 368 unsigned char unused4[2]; /* not currently using bytes 18-19 */
315 /* bytes 18-19 */ 369 /* bytes 18-19 */
316 370 int number_of_cpus; /* filled in at destination */
317 int number_of_cpus; /* filled in at destination */
318 /* 32 bits, bytes 20-23 (aligned) */ 371 /* 32 bits, bytes 20-23 (aligned) */
319 372 unsigned char unused5[8]; /* not using */
320 unsigned char unused5[8]; /* not using */
321 /* bytes 24-31 */ 373 /* bytes 24-31 */
322}; 374};
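A sketch (not in the patch) of the destination side releasing the hardware resource tied to a queue entry: the swack_vec filled in by the hardware is written back through write_mmr_sw_ack(), the helper defined further down in this header.

static inline void example_release_resource(struct bau_pq_entry *msg)
{
        if (msg->swack_vec)
                write_mmr_sw_ack(msg->swack_vec);
}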
323 375
324struct msg_desc { 376struct msg_desc {
325 struct bau_payload_queue_entry *msg; 377 struct bau_pq_entry *msg;
326 int msg_slot; 378 int msg_slot;
327 int sw_ack_slot; 379 int swack_slot;
328 struct bau_payload_queue_entry *va_queue_first; 380 struct bau_pq_entry *queue_first;
329 struct bau_payload_queue_entry *va_queue_last; 381 struct bau_pq_entry *queue_last;
330}; 382};
331 383
332struct reset_args { 384struct reset_args {
333 int sender; 385 int sender;
334}; 386};
335 387
336/* 388/*
@@ -338,105 +390,226 @@ struct reset_args {
338 */ 390 */
339struct ptc_stats { 391struct ptc_stats {
340 /* sender statistics */ 392 /* sender statistics */
341 unsigned long s_giveup; /* number of fall backs to IPI-style flushes */ 393 unsigned long s_giveup; /* number of fall backs to
342 unsigned long s_requestor; /* number of shootdown requests */ 394 IPI-style flushes */
343 unsigned long s_stimeout; /* source side timeouts */ 395 unsigned long s_requestor; /* number of shootdown
344 unsigned long s_dtimeout; /* destination side timeouts */ 396 requests */
345 unsigned long s_time; /* time spent in sending side */ 397 unsigned long s_stimeout; /* source side timeouts */
346 unsigned long s_retriesok; /* successful retries */ 398 unsigned long s_dtimeout; /* destination side timeouts */
347 unsigned long s_ntargcpu; /* total number of cpu's targeted */ 399 unsigned long s_time; /* time spent in sending side */
348 unsigned long s_ntargself; /* times the sending cpu was targeted */ 400 unsigned long s_retriesok; /* successful retries */
349 unsigned long s_ntarglocals; /* targets of cpus on the local blade */ 401 unsigned long s_ntargcpu; /* total number of cpu's
350 unsigned long s_ntargremotes; /* targets of cpus on remote blades */ 402 targeted */
351 unsigned long s_ntarglocaluvhub; /* targets of the local hub */ 403 unsigned long s_ntargself; /* times the sending cpu was
352 unsigned long s_ntargremoteuvhub; /* remotes hubs targeted */ 404 targeted */
353 unsigned long s_ntarguvhub; /* total number of uvhubs targeted */ 405 unsigned long s_ntarglocals; /* targets of cpus on the local
354 unsigned long s_ntarguvhub16; /* number of times target hubs >= 16*/ 406 blade */
355 unsigned long s_ntarguvhub8; /* number of times target hubs >= 8 */ 407 unsigned long s_ntargremotes; /* targets of cpus on remote
356 unsigned long s_ntarguvhub4; /* number of times target hubs >= 4 */ 408 blades */
357 unsigned long s_ntarguvhub2; /* number of times target hubs >= 2 */ 409 unsigned long s_ntarglocaluvhub; /* targets of the local hub */
352 unsigned long s_ntargremoteuvhub; /* remote hubs targeted */ 410 unsigned long s_ntargremoteuvhub; /* remote hubs targeted */
359 unsigned long s_resets_plug; /* ipi-style resets from plug state */ 411 unsigned long s_ntarguvhub; /* total number of uvhubs
360 unsigned long s_resets_timeout; /* ipi-style resets from timeouts */ 412 targeted */
361 unsigned long s_busy; /* status stayed busy past s/w timer */ 413 unsigned long s_ntarguvhub16; /* number of times target
362 unsigned long s_throttles; /* waits in throttle */ 414 hubs >= 16*/
363 unsigned long s_retry_messages; /* retry broadcasts */ 415 unsigned long s_ntarguvhub8; /* number of times target
364 unsigned long s_bau_reenabled; /* for bau enable/disable */ 416 hubs >= 8 */
365 unsigned long s_bau_disabled; /* for bau enable/disable */ 417 unsigned long s_ntarguvhub4; /* number of times target
418 hubs >= 4 */
419 unsigned long s_ntarguvhub2; /* number of times target
420 hubs >= 2 */
421 unsigned long s_ntarguvhub1; /* number of times target
422 hubs == 1 */
423 unsigned long s_resets_plug; /* ipi-style resets from plug
424 state */
425 unsigned long s_resets_timeout; /* ipi-style resets from
426 timeouts */
427 unsigned long s_busy; /* status stayed busy past
428 s/w timer */
429 unsigned long s_throttles; /* waits in throttle */
430 unsigned long s_retry_messages; /* retry broadcasts */
431 unsigned long s_bau_reenabled; /* for bau enable/disable */
432 unsigned long s_bau_disabled; /* for bau enable/disable */
366 /* destination statistics */ 433 /* destination statistics */
367 unsigned long d_alltlb; /* times all tlb's on this cpu were flushed */ 434 unsigned long d_alltlb; /* times all tlb's on this
368 unsigned long d_onetlb; /* times just one tlb on this cpu was flushed */ 435 cpu were flushed */
369 unsigned long d_multmsg; /* interrupts with multiple messages */ 436 unsigned long d_onetlb; /* times just one tlb on this
370 unsigned long d_nomsg; /* interrupts with no message */ 437 cpu was flushed */
371 unsigned long d_time; /* time spent on destination side */ 438 unsigned long d_multmsg; /* interrupts with multiple
372 unsigned long d_requestee; /* number of messages processed */ 439 messages */
373 unsigned long d_retries; /* number of retry messages processed */ 440 unsigned long d_nomsg; /* interrupts with no message */
374 unsigned long d_canceled; /* number of messages canceled by retries */ 441 unsigned long d_time; /* time spent on destination
375 unsigned long d_nocanceled; /* retries that found nothing to cancel */ 442 side */
376 unsigned long d_resets; /* number of ipi-style requests processed */ 443 unsigned long d_requestee; /* number of messages
377 unsigned long d_rcanceled; /* number of messages canceled by resets */ 444 processed */
445 unsigned long d_retries; /* number of retry messages
446 processed */
447 unsigned long d_canceled; /* number of messages canceled
448 by retries */
449 unsigned long d_nocanceled; /* retries that found nothing
450 to cancel */
451 unsigned long d_resets; /* number of ipi-style requests
452 processed */
453 unsigned long d_rcanceled; /* number of messages canceled
454 by resets */
455};
456
457struct tunables {
458 int *tunp;
459 int deflt;
460};
461
462struct hub_and_pnode {
463 short uvhub;
464 short pnode;
465};
466
467struct socket_desc {
468 short num_cpus;
469 short cpu_number[MAX_CPUS_PER_SOCKET];
470};
471
472struct uvhub_desc {
473 unsigned short socket_mask;
474 short num_cpus;
475 short uvhub;
476 short pnode;
477 struct socket_desc socket[2];
378}; 478};
379 479
380/* 480/*
381 * one per-cpu; to locate the software tables 481 * one per-cpu; to locate the software tables
382 */ 482 */
383struct bau_control { 483struct bau_control {
384 struct bau_desc *descriptor_base; 484 struct bau_desc *descriptor_base;
385 struct bau_payload_queue_entry *va_queue_first; 485 struct bau_pq_entry *queue_first;
386 struct bau_payload_queue_entry *va_queue_last; 486 struct bau_pq_entry *queue_last;
387 struct bau_payload_queue_entry *bau_msg_head; 487 struct bau_pq_entry *bau_msg_head;
388 struct bau_control *uvhub_master; 488 struct bau_control *uvhub_master;
389 struct bau_control *socket_master; 489 struct bau_control *socket_master;
390 struct ptc_stats *statp; 490 struct ptc_stats *statp;
391 unsigned long timeout_interval; 491 unsigned long timeout_interval;
392 unsigned long set_bau_on_time; 492 unsigned long set_bau_on_time;
393 atomic_t active_descriptor_count; 493 atomic_t active_descriptor_count;
394 int plugged_tries; 494 int plugged_tries;
395 int timeout_tries; 495 int timeout_tries;
396 int ipi_attempts; 496 int ipi_attempts;
397 int conseccompletes; 497 int conseccompletes;
398 int baudisabled; 498 int baudisabled;
399 int set_bau_off; 499 int set_bau_off;
400 short cpu; 500 short cpu;
401 short uvhub_cpu; 501 short osnode;
402 short uvhub; 502 short uvhub_cpu;
403 short cpus_in_socket; 503 short uvhub;
404 short cpus_in_uvhub; 504 short cpus_in_socket;
405 unsigned short message_number; 505 short cpus_in_uvhub;
406 unsigned short uvhub_quiesce; 506 short partition_base_pnode;
407 short socket_acknowledge_count[DEST_Q_SIZE]; 507 unsigned short message_number;
408 cycles_t send_message; 508 unsigned short uvhub_quiesce;
409 spinlock_t uvhub_lock; 509 short socket_acknowledge_count[DEST_Q_SIZE];
410 spinlock_t queue_lock; 510 cycles_t send_message;
511 spinlock_t uvhub_lock;
512 spinlock_t queue_lock;
411 /* tunables */ 513 /* tunables */
412 int max_bau_concurrent; 514 int max_concurr;
413 int max_bau_concurrent_constant; 515 int max_concurr_const;
414 int plugged_delay; 516 int plugged_delay;
415 int plugsb4reset; 517 int plugsb4reset;
416 int timeoutsb4reset; 518 int timeoutsb4reset;
417 int ipi_reset_limit; 519 int ipi_reset_limit;
418 int complete_threshold; 520 int complete_threshold;
419 int congested_response_us; 521 int cong_response_us;
420 int congested_reps; 522 int cong_reps;
421 int congested_period; 523 int cong_period;
422 cycles_t period_time; 524 cycles_t period_time;
423 long period_requests; 525 long period_requests;
526 struct hub_and_pnode *thp;
424}; 527};
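The structure is instantiated once per cpu by the BAU driver. Assuming a DEFINE_PER_CPU(struct bau_control, bau_control) in the corresponding .c file (an assumption, not something this header declares), a cpu would locate its own copy and its hub master roughly like this:

static inline struct bau_control *example_hub_master(int cpu)
{
        struct bau_control *bcp = &per_cpu(bau_control, cpu);

        return bcp->uvhub_master ? bcp->uvhub_master : bcp;
}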
425 528
426static inline int bau_uvhub_isset(int uvhub, struct bau_target_uvhubmask *dstp) 529static unsigned long read_mmr_uv2_status(void)
530{
531 return read_lmmr(UV2H_LB_BAU_SB_ACTIVATION_STATUS_2);
532}
533
534static void write_mmr_data_broadcast(int pnode, unsigned long mmr_image)
535{
536 write_gmmr(pnode, UVH_BAU_DATA_BROADCAST, mmr_image);
537}
538
539static void write_mmr_descriptor_base(int pnode, unsigned long mmr_image)
540{
541 write_gmmr(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE, mmr_image);
542}
543
544static void write_mmr_activation(unsigned long index)
545{
546 write_lmmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
547}
548
549static void write_gmmr_activation(int pnode, unsigned long mmr_image)
550{
551 write_gmmr(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, mmr_image);
552}
553
554static void write_mmr_payload_first(int pnode, unsigned long mmr_image)
555{
556 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST, mmr_image);
557}
558
559static void write_mmr_payload_tail(int pnode, unsigned long mmr_image)
560{
561 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL, mmr_image);
562}
563
564static void write_mmr_payload_last(int pnode, unsigned long mmr_image)
565{
566 write_gmmr(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST, mmr_image);
567}
568
569static void write_mmr_misc_control(int pnode, unsigned long mmr_image)
570{
571 write_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
572}
573
574static unsigned long read_mmr_misc_control(int pnode)
575{
576 return read_gmmr(pnode, UVH_LB_BAU_MISC_CONTROL);
577}
578
579static void write_mmr_sw_ack(unsigned long mr)
580{
581 uv_write_local_mmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, mr);
582}
583
584static unsigned long read_mmr_sw_ack(void)
585{
586 return read_lmmr(UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
587}
588
589static unsigned long read_gmmr_sw_ack(int pnode)
590{
591 return read_gmmr(pnode, UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
592}
593
594static void write_mmr_data_config(int pnode, unsigned long mr)
595{
596 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG, mr);
597}
598
599static inline int bau_uvhub_isset(int uvhub, struct bau_targ_hubmask *dstp)
427{ 600{
428 return constant_test_bit(uvhub, &dstp->bits[0]); 601 return constant_test_bit(uvhub, &dstp->bits[0]);
429} 602}
430static inline void bau_uvhub_set(int uvhub, struct bau_target_uvhubmask *dstp) 603static inline void bau_uvhub_set(int pnode, struct bau_targ_hubmask *dstp)
431{ 604{
432 __set_bit(uvhub, &dstp->bits[0]); 605 __set_bit(pnode, &dstp->bits[0]);
433} 606}
434static inline void bau_uvhubs_clear(struct bau_target_uvhubmask *dstp, 607static inline void bau_uvhubs_clear(struct bau_targ_hubmask *dstp,
435 int nbits) 608 int nbits)
436{ 609{
437 bitmap_zero(&dstp->bits[0], nbits); 610 bitmap_zero(&dstp->bits[0], nbits);
438} 611}
439static inline int bau_uvhub_weight(struct bau_target_uvhubmask *dstp) 612static inline int bau_uvhub_weight(struct bau_targ_hubmask *dstp)
440{ 613{
441 return bitmap_weight((unsigned long *)&dstp->bits[0], 614 return bitmap_weight((unsigned long *)&dstp->bits[0],
442 UV_DISTRIBUTION_SIZE); 615 UV_DISTRIBUTION_SIZE);
@@ -447,9 +620,6 @@ static inline void bau_cpubits_clear(struct bau_local_cpumask *dstp, int nbits)
447 bitmap_zero(&dstp->bits, nbits); 620 bitmap_zero(&dstp->bits, nbits);
448} 621}
449 622
450#define cpubit_isset(cpu, bau_local_cpumask) \
451 test_bit((cpu), (bau_local_cpumask).bits)
452
453extern void uv_bau_message_intr1(void); 623extern void uv_bau_message_intr1(void);
454extern void uv_bau_timeout_intr1(void); 624extern void uv_bau_timeout_intr1(void);
455 625
@@ -457,7 +627,7 @@ struct atomic_short {
457 short counter; 627 short counter;
458}; 628};
459 629
460/** 630/*
461 * atomic_read_short - read a short atomic variable 631 * atomic_read_short - read a short atomic variable
462 * @v: pointer of type atomic_short 632 * @v: pointer of type atomic_short
463 * 633 *
@@ -468,14 +638,14 @@ static inline int atomic_read_short(const struct atomic_short *v)
468 return v->counter; 638 return v->counter;
469} 639}
470 640
471/** 641/*
472 * atomic_add_short_return - add and return a short int 642 * atom_asr - add and return a short int
473 * @i: short value to add 643 * @i: short value to add
474 * @v: pointer of type atomic_short 644 * @v: pointer of type atomic_short
475 * 645 *
476 * Atomically adds @i to @v and returns @i + @v 646 * Atomically adds @i to @v and returns @i + @v
477 */ 647 */
478static inline int atomic_add_short_return(short i, struct atomic_short *v) 648static inline int atom_asr(short i, struct atomic_short *v)
479{ 649{
480 short __i = i; 650 short __i = i;
481 asm volatile(LOCK_PREFIX "xaddw %0, %1" 651 asm volatile(LOCK_PREFIX "xaddw %0, %1"
@@ -484,4 +654,26 @@ static inline int atomic_add_short_return(short i, struct atomic_short *v)
484 return i + __i; 654 return i + __i;
485} 655}
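A usage sketch (not from the patch): atom_asr() is aimed at shorts embedded in larger structures, e.g. the uvhub_quiesce count in bau_control above; the cast shows how such an embedded short has to be presented to the helper.

static inline void example_quiesce(struct bau_control *hmaster)
{
        atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
        /* ... the quiesced section would go here ... */
        atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
}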
486 656
657/*
658 * conditionally add 1 to *v, unless *v is >= u
659 * return 0 if we cannot add 1 to *v because it is >= u
660 * return 1 if we can add 1 to *v because it is < u
661 * the add is atomic
662 *
663 * This is close to atomic_add_unless(), but this allows the 'u' value
664 * to be lowered below the current 'v'. atomic_add_unless can only stop
665 * on equal.
666 */
667static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
668{
669 spin_lock(lock);
670 if (atomic_read(v) >= u) {
671 spin_unlock(lock);
672 return 0;
673 }
674 atomic_inc(v);
675 spin_unlock(lock);
676 return 1;
677}
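A usage sketch (not from the patch), tying the helper back to the bau_control fields above: admit a new broadcast only while the count of active descriptors is below the max_concurr tunable.

static inline int example_try_admit(struct bau_control *hmaster)
{
        return atomic_inc_unless_ge(&hmaster->uvhub_lock,
                                    &hmaster->active_descriptor_count,
                                    hmaster->max_concurr);
}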
678
487#endif /* _ASM_X86_UV_UV_BAU_H */ 679#endif /* _ASM_X86_UV_UV_BAU_H */
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index bf6b88ef8eeb..f26544a15214 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV architectural definitions 6 * SGI UV architectural definitions
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_HUB_H 11#ifndef _ASM_X86_UV_UV_HUB_H
@@ -77,7 +77,9 @@
77 * 77 *
78 * 1111110000000000 78 * 1111110000000000
79 * 5432109876543210 79 * 5432109876543210
80 * pppppppppplc0cch 80 * pppppppppplc0cch Nehalem-EX (12 bits in hdw reg)
81 * ppppppppplcc0cch Westmere-EX (12 bits in hdw reg)
82 * pppppppppppcccch SandyBridge (15 bits in hdw reg)
81 * sssssssssss 83 * sssssssssss
82 * 84 *
83 * p = pnode bits 85 * p = pnode bits
@@ -86,7 +88,7 @@
86 * h = hyperthread 88 * h = hyperthread
87 * s = bits that are in the SOCKET_ID CSR 89 * s = bits that are in the SOCKET_ID CSR
88 * 90 *
89 * Note: Processor only supports 12 bits in the APICID register. The ACPI 91 * Note: Processor may support fewer bits in the APICID register. The ACPI
90 * tables hold all 16 bits. Software needs to be aware of this. 92 * tables hold all 16 bits. Software needs to be aware of this.
91 * 93 *
92 * Unless otherwise specified, all references to APICID refer to 94 * Unless otherwise specified, all references to APICID refer to
@@ -137,6 +139,8 @@ struct uv_hub_info_s {
137 unsigned long global_mmr_base; 139 unsigned long global_mmr_base;
138 unsigned long gpa_mask; 140 unsigned long gpa_mask;
139 unsigned int gnode_extra; 141 unsigned int gnode_extra;
142 unsigned char hub_revision;
143 unsigned char apic_pnode_shift;
140 unsigned long gnode_upper; 144 unsigned long gnode_upper;
141 unsigned long lowmem_remap_top; 145 unsigned long lowmem_remap_top;
142 unsigned long lowmem_remap_base; 146 unsigned long lowmem_remap_base;
@@ -155,6 +159,37 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 159#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
156 160
157/* 161/*
162 * Hub revisions less than UV2_HUB_REVISION_BASE are UV1 hubs. All UV2
163 * hubs have revision numbers greater than or equal to UV2_HUB_REVISION_BASE.
164 * This is a software convention - NOT the hardware revision numbers in
165 * the hub chip.
166 */
167#define UV1_HUB_REVISION_BASE 1
168#define UV2_HUB_REVISION_BASE 3
169
170static inline int is_uv1_hub(void)
171{
172 return uv_hub_info->hub_revision < UV2_HUB_REVISION_BASE;
173}
174
175static inline int is_uv2_hub(void)
176{
177 return uv_hub_info->hub_revision >= UV2_HUB_REVISION_BASE;
178}
179
180union uvh_apicid {
181 unsigned long v;
182 struct uvh_apicid_s {
183 unsigned long local_apic_mask : 24;
184 unsigned long local_apic_shift : 5;
185 unsigned long unused1 : 3;
186 unsigned long pnode_mask : 24;
187 unsigned long pnode_shift : 5;
188 unsigned long unused2 : 3;
189 } s;
190};
191
192/*
158 * Local & Global MMR space macros. 193 * Local & Global MMR space macros.
159 * Note: macros are intended to be used ONLY by inline functions 194 * Note: macros are intended to be used ONLY by inline functions
160 * in this file - not by other kernel code. 195 * in this file - not by other kernel code.
@@ -166,11 +201,25 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) 201#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
167#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1) 202#define UV_PNODE_TO_NASID(p) (UV_PNODE_TO_GNODE(p) << 1)
168 203
169#define UV_LOCAL_MMR_BASE 0xf4000000UL 204#define UV1_LOCAL_MMR_BASE 0xf4000000UL
170#define UV_GLOBAL_MMR32_BASE 0xf8000000UL 205#define UV1_GLOBAL_MMR32_BASE 0xf8000000UL
206#define UV1_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
207#define UV1_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
208
209#define UV2_LOCAL_MMR_BASE 0xfa000000UL
210#define UV2_GLOBAL_MMR32_BASE 0xfc000000UL
211#define UV2_LOCAL_MMR_SIZE (32UL * 1024 * 1024)
212#define UV2_GLOBAL_MMR32_SIZE (32UL * 1024 * 1024)
213
214#define UV_LOCAL_MMR_BASE (is_uv1_hub() ? UV1_LOCAL_MMR_BASE \
215 : UV2_LOCAL_MMR_BASE)
216#define UV_GLOBAL_MMR32_BASE (is_uv1_hub() ? UV1_GLOBAL_MMR32_BASE \
217 : UV2_GLOBAL_MMR32_BASE)
218#define UV_LOCAL_MMR_SIZE (is_uv1_hub() ? UV1_LOCAL_MMR_SIZE : \
219 UV2_LOCAL_MMR_SIZE)
220#define UV_GLOBAL_MMR32_SIZE (is_uv1_hub() ? UV1_GLOBAL_MMR32_SIZE :\
221 UV2_GLOBAL_MMR32_SIZE)
171#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base) 222#define UV_GLOBAL_MMR64_BASE (uv_hub_info->global_mmr_base)
172#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
173#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
174 223
175#define UV_GLOBAL_GRU_MMR_BASE 0x4000000 224#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
176 225
@@ -182,8 +231,11 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
182#define UV_GLOBAL_MMR64_PNODE_BITS(p) \ 231#define UV_GLOBAL_MMR64_PNODE_BITS(p) \
183 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT) 232 (((unsigned long)(p)) << UV_GLOBAL_MMR64_PNODE_SHIFT)
184 233
234#define UVH_APICID 0x002D0E00L
185#define UV_APIC_PNODE_SHIFT 6 235#define UV_APIC_PNODE_SHIFT 6
186 236
237#define UV_APICID_HIBIT_MASK 0xffff0000
238
187/* Local Bus from cpu's perspective */ 239/* Local Bus from cpu's perspective */
188#define LOCAL_BUS_BASE 0x1c00000 240#define LOCAL_BUS_BASE 0x1c00000
189#define LOCAL_BUS_SIZE (4 * 1024 * 1024) 241#define LOCAL_BUS_SIZE (4 * 1024 * 1024)
@@ -280,7 +332,18 @@ static inline void *uv_pnode_offset_to_vaddr(int pnode, unsigned long offset)
280 */ 332 */
281static inline int uv_apicid_to_pnode(int apicid) 333static inline int uv_apicid_to_pnode(int apicid)
282{ 334{
283 return (apicid >> UV_APIC_PNODE_SHIFT); 335 return (apicid >> uv_hub_info->apic_pnode_shift);
336}
337
338/*
339 * Convert an apicid to the socket number on the blade
340 */
341static inline int uv_apicid_to_socket(int apicid)
342{
343 if (is_uv1_hub())
344 return (apicid >> (uv_hub_info->apic_pnode_shift - 1)) & 1;
345 else
346 return 0;
284} 347}
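A small sketch (not in the patch) tying the bit-layout table above to these helpers; the concrete numbers are only illustrative, assuming the Nehalem-EX style layout with apic_pnode_shift == 6.

static inline void example_decode_apicid(int apicid)
{
        int pnode  = uv_apicid_to_pnode(apicid);   /* e.g. 0x1d2 >> 6 == 0x7 */
        int socket = uv_apicid_to_socket(apicid);  /* bit just below the pnode field */

        pr_debug("apicid 0x%x -> pnode 0x%x socket %d\n", apicid, pnode, socket);
}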
285 348
286/* 349/*
@@ -381,6 +444,8 @@ struct uv_blade_info {
381 unsigned short nr_online_cpus; 444 unsigned short nr_online_cpus;
382 unsigned short pnode; 445 unsigned short pnode;
383 short memory_nid; 446 short memory_nid;
447 spinlock_t nmi_lock;
448 unsigned long nmi_count;
384}; 449};
385extern struct uv_blade_info *uv_blade_info; 450extern struct uv_blade_info *uv_blade_info;
386extern short *uv_node_to_blade; 451extern short *uv_node_to_blade;
@@ -476,8 +541,10 @@ static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
476 } 541 }
477} 542}
478 543
544extern unsigned int uv_apicid_hibits;
479static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode) 545static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
480{ 546{
547 apicid |= uv_apicid_hibits;
481 return (1UL << UVH_IPI_INT_SEND_SHFT) | 548 return (1UL << UVH_IPI_INT_SEND_SHFT) |
482 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) | 549 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
483 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) | 550 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
@@ -498,14 +565,13 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
498 565
499/* 566/*
500 * Get the minimum revision number of the hub chips within the partition. 567 * Get the minimum revision number of the hub chips within the partition.
501 * 1 - initial rev 1.0 silicon 568 * 1 - UV1 rev 1.0 initial silicon
502 * 2 - rev 2.0 production silicon 569 * 2 - UV1 rev 2.0 production silicon
570 * 3 - UV2 rev 1.0 initial silicon
503 */ 571 */
504static inline int uv_get_min_hub_revision_id(void) 572static inline int uv_get_min_hub_revision_id(void)
505{ 573{
506 extern int uv_min_hub_revision_id; 574 return uv_hub_info->hub_revision;
507
508 return uv_min_hub_revision_id;
509} 575}
510 576
511#endif /* CONFIG_X86_64 */ 577#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/uv/uv_mmrs.h b/arch/x86/include/asm/uv/uv_mmrs.h
index b2f2d2e05cec..4be52c863448 100644
--- a/arch/x86/include/asm/uv/uv_mmrs.h
+++ b/arch/x86/include/asm/uv/uv_mmrs.h
@@ -5,19 +5,70 @@
5 * 5 *
6 * SGI UV MMR definitions 6 * SGI UV MMR definitions
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2011 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10 10
11#ifndef _ASM_X86_UV_UV_MMRS_H 11#ifndef _ASM_X86_UV_UV_MMRS_H
12#define _ASM_X86_UV_UV_MMRS_H 12#define _ASM_X86_UV_UV_MMRS_H
13 13
14/*
15 * This file contains MMR definitions for both UV1 & UV2 hubs.
16 *
17 * In general, MMR addresses and structures are identical on both hubs.
18 * These MMRs are identified as:
19 * #define UVH_xxx <address>
20 * union uvh_xxx {
21 * unsigned long v;
22 * struct uvh_int_cmpd_s {
23 * } s;
24 * };
25 *
26 * If the MMR exists on both hub types but has different addresses or
27 * contents, the MMR definition is similar to:
28 * #define UV1H_xxx <uv1 address>
29 * #define UV2H_xxx <uv2 address>
30 * #define UVH_xxx (is_uv1_hub() ? UV1H_xxx : UV2H_xxx)
31 * union uvh_xxx {
32 * unsigned long v;
33 * struct uvh_int_cmpd_s { (Common fields only)
34 * } s;
35 * struct uv1h_int_cmpd_s { (Full UV1 definition)
36 * } s1;
37 * struct uv2h_int_cmpd_s { (Full UV2 definition)
38 * } s2;
39 * };
40 *
41 * Only essential differences are enumerated. For example, if the address is
42 * the same for both UV1 & UV2, only a single #define is generated. Likewise,
43 * if the contents are the same for both hubs, only the "s" structure is
44 * generated.
45 *
46 * If the MMR exists on ONLY 1 type of hub, no generic definition is
47 * generated:
48 * #define UVnH_xxx <uvn address>
49 * union uvnh_xxx {
50 * unsigned long v;
51 * struct uvh_int_cmpd_s {
52 * } sn;
53 * };
54 */
55
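A concrete instance of the convention above, offered as an illustrative sketch rather than part of the generated file: GR0_HCERR sits at bit 1 on UV1 but bit 5 on UV2 (see the EVENT_OCCURRED0 definitions below), so code that needs the shift at run time selects it exactly the way a generated UVH_xxx define would.

static inline int example_gr0_hcerr_shift(void)
{
        return is_uv1_hub() ? UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT
                            : UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT;
}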
14#define UV_MMR_ENABLE (1UL << 63) 56#define UV_MMR_ENABLE (1UL << 63)
15 57
58#define UV1_HUB_PART_NUMBER 0x88a5
59#define UV2_HUB_PART_NUMBER 0x8eb8
60
61/* Compat: if this #define is present, UV headers support UV2 */
62#define UV2_HUB_IS_SUPPORTED 1
63
64/* KABI compat: if this #define is present, KABI hacks are present */
65#define UV2_HUB_KABI_HACKS 1
66
16/* ========================================================================= */ 67/* ========================================================================= */
17/* UVH_BAU_DATA_BROADCAST */ 68/* UVH_BAU_DATA_BROADCAST */
18/* ========================================================================= */ 69/* ========================================================================= */
19#define UVH_BAU_DATA_BROADCAST 0x61688UL 70#define UVH_BAU_DATA_BROADCAST 0x61688UL
20#define UVH_BAU_DATA_BROADCAST_32 0x0440 71#define UVH_BAU_DATA_BROADCAST_32 0x440
21 72
22#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0 73#define UVH_BAU_DATA_BROADCAST_ENABLE_SHFT 0
23#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL 74#define UVH_BAU_DATA_BROADCAST_ENABLE_MASK 0x0000000000000001UL
@@ -34,7 +85,7 @@ union uvh_bau_data_broadcast_u {
34/* UVH_BAU_DATA_CONFIG */ 85/* UVH_BAU_DATA_CONFIG */
35/* ========================================================================= */ 86/* ========================================================================= */
36#define UVH_BAU_DATA_CONFIG 0x61680UL 87#define UVH_BAU_DATA_CONFIG 0x61680UL
37#define UVH_BAU_DATA_CONFIG_32 0x0438 88#define UVH_BAU_DATA_CONFIG_32 0x438
38 89
39#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0 90#define UVH_BAU_DATA_CONFIG_VECTOR_SHFT 0
40#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL 91#define UVH_BAU_DATA_CONFIG_VECTOR_MASK 0x00000000000000ffUL
@@ -73,125 +124,245 @@ union uvh_bau_data_config_u {
73/* UVH_EVENT_OCCURRED0 */ 124/* UVH_EVENT_OCCURRED0 */
74/* ========================================================================= */ 125/* ========================================================================= */
75#define UVH_EVENT_OCCURRED0 0x70000UL 126#define UVH_EVENT_OCCURRED0 0x70000UL
76#define UVH_EVENT_OCCURRED0_32 0x005e8 127#define UVH_EVENT_OCCURRED0_32 0x5e8
77 128
78#define UVH_EVENT_OCCURRED0_LB_HCERR_SHFT 0 129#define UV1H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
79#define UVH_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL 130#define UV1H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
80#define UVH_EVENT_OCCURRED0_GR0_HCERR_SHFT 1 131#define UV1H_EVENT_OCCURRED0_GR0_HCERR_SHFT 1
81#define UVH_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL 132#define UV1H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000002UL
82#define UVH_EVENT_OCCURRED0_GR1_HCERR_SHFT 2 133#define UV1H_EVENT_OCCURRED0_GR1_HCERR_SHFT 2
83#define UVH_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL 134#define UV1H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000004UL
84#define UVH_EVENT_OCCURRED0_LH_HCERR_SHFT 3 135#define UV1H_EVENT_OCCURRED0_LH_HCERR_SHFT 3
85#define UVH_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL 136#define UV1H_EVENT_OCCURRED0_LH_HCERR_MASK 0x0000000000000008UL
86#define UVH_EVENT_OCCURRED0_RH_HCERR_SHFT 4 137#define UV1H_EVENT_OCCURRED0_RH_HCERR_SHFT 4
87#define UVH_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL 138#define UV1H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000010UL
88#define UVH_EVENT_OCCURRED0_XN_HCERR_SHFT 5 139#define UV1H_EVENT_OCCURRED0_XN_HCERR_SHFT 5
89#define UVH_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL 140#define UV1H_EVENT_OCCURRED0_XN_HCERR_MASK 0x0000000000000020UL
90#define UVH_EVENT_OCCURRED0_SI_HCERR_SHFT 6 141#define UV1H_EVENT_OCCURRED0_SI_HCERR_SHFT 6
91#define UVH_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL 142#define UV1H_EVENT_OCCURRED0_SI_HCERR_MASK 0x0000000000000040UL
92#define UVH_EVENT_OCCURRED0_LB_AOERR0_SHFT 7 143#define UV1H_EVENT_OCCURRED0_LB_AOERR0_SHFT 7
93#define UVH_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL 144#define UV1H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000080UL
94#define UVH_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8 145#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 8
95#define UVH_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL 146#define UV1H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000000100UL
96#define UVH_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9 147#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 9
97#define UVH_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL 148#define UV1H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000000200UL
98#define UVH_EVENT_OCCURRED0_LH_AOERR0_SHFT 10 149#define UV1H_EVENT_OCCURRED0_LH_AOERR0_SHFT 10
99#define UVH_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL 150#define UV1H_EVENT_OCCURRED0_LH_AOERR0_MASK 0x0000000000000400UL
100#define UVH_EVENT_OCCURRED0_RH_AOERR0_SHFT 11 151#define UV1H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
101#define UVH_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL 152#define UV1H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
102#define UVH_EVENT_OCCURRED0_XN_AOERR0_SHFT 12 153#define UV1H_EVENT_OCCURRED0_XN_AOERR0_SHFT 12
103#define UVH_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL 154#define UV1H_EVENT_OCCURRED0_XN_AOERR0_MASK 0x0000000000001000UL
104#define UVH_EVENT_OCCURRED0_SI_AOERR0_SHFT 13 155#define UV1H_EVENT_OCCURRED0_SI_AOERR0_SHFT 13
105#define UVH_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL 156#define UV1H_EVENT_OCCURRED0_SI_AOERR0_MASK 0x0000000000002000UL
106#define UVH_EVENT_OCCURRED0_LB_AOERR1_SHFT 14 157#define UV1H_EVENT_OCCURRED0_LB_AOERR1_SHFT 14
107#define UVH_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL 158#define UV1H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000004000UL
108#define UVH_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15 159#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 15
109#define UVH_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL 160#define UV1H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000000008000UL
110#define UVH_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16 161#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 16
111#define UVH_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL 162#define UV1H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000000010000UL
112#define UVH_EVENT_OCCURRED0_LH_AOERR1_SHFT 17 163#define UV1H_EVENT_OCCURRED0_LH_AOERR1_SHFT 17
113#define UVH_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL 164#define UV1H_EVENT_OCCURRED0_LH_AOERR1_MASK 0x0000000000020000UL
114#define UVH_EVENT_OCCURRED0_RH_AOERR1_SHFT 18 165#define UV1H_EVENT_OCCURRED0_RH_AOERR1_SHFT 18
115#define UVH_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL 166#define UV1H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000040000UL
116#define UVH_EVENT_OCCURRED0_XN_AOERR1_SHFT 19 167#define UV1H_EVENT_OCCURRED0_XN_AOERR1_SHFT 19
117#define UVH_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL 168#define UV1H_EVENT_OCCURRED0_XN_AOERR1_MASK 0x0000000000080000UL
118#define UVH_EVENT_OCCURRED0_SI_AOERR1_SHFT 20 169#define UV1H_EVENT_OCCURRED0_SI_AOERR1_SHFT 20
119#define UVH_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL 170#define UV1H_EVENT_OCCURRED0_SI_AOERR1_MASK 0x0000000000100000UL
120#define UVH_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21 171#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_SHFT 21
121#define UVH_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL 172#define UV1H_EVENT_OCCURRED0_RH_VPI_INT_MASK 0x0000000000200000UL
122#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22 173#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 22
123#define UVH_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL 174#define UV1H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000000400000UL
124#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23 175#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 23
125#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL 176#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000000800000UL
126#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24 177#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 24
127#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL 178#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000001000000UL
128#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25 179#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 25
129#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL 180#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000002000000UL
130#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26 181#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 26
131#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL 182#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000004000000UL
132#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27 183#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 27
133#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL 184#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000000008000000UL
134#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28 185#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 28
135#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL 186#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000000010000000UL
136#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29 187#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 29
137#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL 188#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000000020000000UL
138#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30 189#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 30
139#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL 190#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000000040000000UL
140#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31 191#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 31
141#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL 192#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000000080000000UL
142#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32 193#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 32
143#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL 194#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000000100000000UL
144#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33 195#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 33
145#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL 196#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000000200000000UL
146#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34 197#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 34
147#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL 198#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000000400000000UL
148#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35 199#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 35
149#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL 200#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000000800000000UL
150#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36 201#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 36
151#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL 202#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000001000000000UL
152#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37 203#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 37
153#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL 204#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000002000000000UL
154#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38 205#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 38
155#define UVH_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL 206#define UV1H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000004000000000UL
156#define UVH_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39 207#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 39
157#define UVH_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL 208#define UV1H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0000008000000000UL
158#define UVH_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40 209#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 40
159#define UVH_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL 210#define UV1H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0000010000000000UL
160#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41 211#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 41
161#define UVH_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL 212#define UV1H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0000020000000000UL
162#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42 213#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 42
163#define UVH_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL 214#define UV1H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0000040000000000UL
164#define UVH_EVENT_OCCURRED0_LTC_INT_SHFT 43 215#define UV1H_EVENT_OCCURRED0_LTC_INT_SHFT 43
165#define UVH_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL 216#define UV1H_EVENT_OCCURRED0_LTC_INT_MASK 0x0000080000000000UL
166#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44 217#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 44
167#define UVH_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL 218#define UV1H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0000100000000000UL
168#define UVH_EVENT_OCCURRED0_IPI_INT_SHFT 45 219#define UV1H_EVENT_OCCURRED0_IPI_INT_SHFT 45
169#define UVH_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL 220#define UV1H_EVENT_OCCURRED0_IPI_INT_MASK 0x0000200000000000UL
170#define UVH_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46 221#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 46
171#define UVH_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL 222#define UV1H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0000400000000000UL
172#define UVH_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47 223#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 47
173#define UVH_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL 224#define UV1H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0000800000000000UL
174#define UVH_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48 225#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 48
175#define UVH_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL 226#define UV1H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0001000000000000UL
176#define UVH_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49 227#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 49
177#define UVH_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL 228#define UV1H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0002000000000000UL
178#define UVH_EVENT_OCCURRED0_PROFILE_INT_SHFT 50 229#define UV1H_EVENT_OCCURRED0_PROFILE_INT_SHFT 50
179#define UVH_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL 230#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
180#define UVH_EVENT_OCCURRED0_RTC0_SHFT 51 231#define UV1H_EVENT_OCCURRED0_RTC0_SHFT 51
181#define UVH_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL 232#define UV1H_EVENT_OCCURRED0_RTC0_MASK 0x0008000000000000UL
182#define UVH_EVENT_OCCURRED0_RTC1_SHFT 52 233#define UV1H_EVENT_OCCURRED0_RTC1_SHFT 52
183#define UVH_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL 234#define UV1H_EVENT_OCCURRED0_RTC1_MASK 0x0010000000000000UL
184#define UVH_EVENT_OCCURRED0_RTC2_SHFT 53 235#define UV1H_EVENT_OCCURRED0_RTC2_SHFT 53
185#define UVH_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL 236#define UV1H_EVENT_OCCURRED0_RTC2_MASK 0x0020000000000000UL
186#define UVH_EVENT_OCCURRED0_RTC3_SHFT 54 237#define UV1H_EVENT_OCCURRED0_RTC3_SHFT 54
187#define UVH_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL 238#define UV1H_EVENT_OCCURRED0_RTC3_MASK 0x0040000000000000UL
188#define UVH_EVENT_OCCURRED0_BAU_DATA_SHFT 55 239#define UV1H_EVENT_OCCURRED0_BAU_DATA_SHFT 55
189#define UVH_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL 240#define UV1H_EVENT_OCCURRED0_BAU_DATA_MASK 0x0080000000000000UL
190#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56 241#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_SHFT 56
191#define UVH_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL 242#define UV1H_EVENT_OCCURRED0_POWER_MANAGEMENT_REQ_MASK 0x0100000000000000UL
243
244#define UV2H_EVENT_OCCURRED0_LB_HCERR_SHFT 0
245#define UV2H_EVENT_OCCURRED0_LB_HCERR_MASK 0x0000000000000001UL
246#define UV2H_EVENT_OCCURRED0_QP_HCERR_SHFT 1
247#define UV2H_EVENT_OCCURRED0_QP_HCERR_MASK 0x0000000000000002UL
248#define UV2H_EVENT_OCCURRED0_RH_HCERR_SHFT 2
249#define UV2H_EVENT_OCCURRED0_RH_HCERR_MASK 0x0000000000000004UL
250#define UV2H_EVENT_OCCURRED0_LH0_HCERR_SHFT 3
251#define UV2H_EVENT_OCCURRED0_LH0_HCERR_MASK 0x0000000000000008UL
252#define UV2H_EVENT_OCCURRED0_LH1_HCERR_SHFT 4
253#define UV2H_EVENT_OCCURRED0_LH1_HCERR_MASK 0x0000000000000010UL
254#define UV2H_EVENT_OCCURRED0_GR0_HCERR_SHFT 5
255#define UV2H_EVENT_OCCURRED0_GR0_HCERR_MASK 0x0000000000000020UL
256#define UV2H_EVENT_OCCURRED0_GR1_HCERR_SHFT 6
257#define UV2H_EVENT_OCCURRED0_GR1_HCERR_MASK 0x0000000000000040UL
258#define UV2H_EVENT_OCCURRED0_NI0_HCERR_SHFT 7
259#define UV2H_EVENT_OCCURRED0_NI0_HCERR_MASK 0x0000000000000080UL
260#define UV2H_EVENT_OCCURRED0_NI1_HCERR_SHFT 8
261#define UV2H_EVENT_OCCURRED0_NI1_HCERR_MASK 0x0000000000000100UL
262#define UV2H_EVENT_OCCURRED0_LB_AOERR0_SHFT 9
263#define UV2H_EVENT_OCCURRED0_LB_AOERR0_MASK 0x0000000000000200UL
264#define UV2H_EVENT_OCCURRED0_QP_AOERR0_SHFT 10
265#define UV2H_EVENT_OCCURRED0_QP_AOERR0_MASK 0x0000000000000400UL
266#define UV2H_EVENT_OCCURRED0_RH_AOERR0_SHFT 11
267#define UV2H_EVENT_OCCURRED0_RH_AOERR0_MASK 0x0000000000000800UL
268#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_SHFT 12
269#define UV2H_EVENT_OCCURRED0_LH0_AOERR0_MASK 0x0000000000001000UL
270#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_SHFT 13
271#define UV2H_EVENT_OCCURRED0_LH1_AOERR0_MASK 0x0000000000002000UL
272#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_SHFT 14
273#define UV2H_EVENT_OCCURRED0_GR0_AOERR0_MASK 0x0000000000004000UL
274#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_SHFT 15
275#define UV2H_EVENT_OCCURRED0_GR1_AOERR0_MASK 0x0000000000008000UL
276#define UV2H_EVENT_OCCURRED0_XB_AOERR0_SHFT 16
277#define UV2H_EVENT_OCCURRED0_XB_AOERR0_MASK 0x0000000000010000UL
278#define UV2H_EVENT_OCCURRED0_RT_AOERR0_SHFT 17
279#define UV2H_EVENT_OCCURRED0_RT_AOERR0_MASK 0x0000000000020000UL
280#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_SHFT 18
281#define UV2H_EVENT_OCCURRED0_NI0_AOERR0_MASK 0x0000000000040000UL
282#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_SHFT 19
283#define UV2H_EVENT_OCCURRED0_NI1_AOERR0_MASK 0x0000000000080000UL
284#define UV2H_EVENT_OCCURRED0_LB_AOERR1_SHFT 20
285#define UV2H_EVENT_OCCURRED0_LB_AOERR1_MASK 0x0000000000100000UL
286#define UV2H_EVENT_OCCURRED0_QP_AOERR1_SHFT 21
287#define UV2H_EVENT_OCCURRED0_QP_AOERR1_MASK 0x0000000000200000UL
288#define UV2H_EVENT_OCCURRED0_RH_AOERR1_SHFT 22
289#define UV2H_EVENT_OCCURRED0_RH_AOERR1_MASK 0x0000000000400000UL
290#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_SHFT 23
291#define UV2H_EVENT_OCCURRED0_LH0_AOERR1_MASK 0x0000000000800000UL
292#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_SHFT 24
293#define UV2H_EVENT_OCCURRED0_LH1_AOERR1_MASK 0x0000000001000000UL
294#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_SHFT 25
295#define UV2H_EVENT_OCCURRED0_GR0_AOERR1_MASK 0x0000000002000000UL
296#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_SHFT 26
297#define UV2H_EVENT_OCCURRED0_GR1_AOERR1_MASK 0x0000000004000000UL
298#define UV2H_EVENT_OCCURRED0_XB_AOERR1_SHFT 27
299#define UV2H_EVENT_OCCURRED0_XB_AOERR1_MASK 0x0000000008000000UL
300#define UV2H_EVENT_OCCURRED0_RT_AOERR1_SHFT 28
301#define UV2H_EVENT_OCCURRED0_RT_AOERR1_MASK 0x0000000010000000UL
302#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_SHFT 29
303#define UV2H_EVENT_OCCURRED0_NI0_AOERR1_MASK 0x0000000020000000UL
304#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_SHFT 30
305#define UV2H_EVENT_OCCURRED0_NI1_AOERR1_MASK 0x0000000040000000UL
306#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_SHFT 31
307#define UV2H_EVENT_OCCURRED0_SYSTEM_SHUTDOWN_INT_MASK 0x0000000080000000UL
308#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_SHFT 32
309#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_0_MASK 0x0000000100000000UL
310#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_SHFT 33
311#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_1_MASK 0x0000000200000000UL
312#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_SHFT 34
313#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_2_MASK 0x0000000400000000UL
314#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_SHFT 35
315#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_3_MASK 0x0000000800000000UL
316#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_SHFT 36
317#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_4_MASK 0x0000001000000000UL
318#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_SHFT 37
319#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_5_MASK 0x0000002000000000UL
320#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_SHFT 38
321#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_6_MASK 0x0000004000000000UL
322#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_SHFT 39
323#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_7_MASK 0x0000008000000000UL
324#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_SHFT 40
325#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_8_MASK 0x0000010000000000UL
326#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_SHFT 41
327#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_9_MASK 0x0000020000000000UL
328#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_SHFT 42
329#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_10_MASK 0x0000040000000000UL
330#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_SHFT 43
331#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_11_MASK 0x0000080000000000UL
332#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_SHFT 44
333#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_12_MASK 0x0000100000000000UL
334#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_SHFT 45
335#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_13_MASK 0x0000200000000000UL
336#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_SHFT 46
337#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_14_MASK 0x0000400000000000UL
338#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_SHFT 47
339#define UV2H_EVENT_OCCURRED0_LB_IRQ_INT_15_MASK 0x0000800000000000UL
340#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_SHFT 48
341#define UV2H_EVENT_OCCURRED0_L1_NMI_INT_MASK 0x0001000000000000UL
342#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_SHFT 49
343#define UV2H_EVENT_OCCURRED0_STOP_CLOCK_MASK 0x0002000000000000UL
344#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_SHFT 50
345#define UV2H_EVENT_OCCURRED0_ASIC_TO_L1_MASK 0x0004000000000000UL
346#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_SHFT 51
347#define UV2H_EVENT_OCCURRED0_L1_TO_ASIC_MASK 0x0008000000000000UL
348#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_SHFT 52
349#define UV2H_EVENT_OCCURRED0_LA_SEQ_TRIGGER_MASK 0x0010000000000000UL
350#define UV2H_EVENT_OCCURRED0_IPI_INT_SHFT 53
351#define UV2H_EVENT_OCCURRED0_IPI_INT_MASK 0x0020000000000000UL
352#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_SHFT 54
353#define UV2H_EVENT_OCCURRED0_EXTIO_INT0_MASK 0x0040000000000000UL
354#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_SHFT 55
355#define UV2H_EVENT_OCCURRED0_EXTIO_INT1_MASK 0x0080000000000000UL
356#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_SHFT 56
357#define UV2H_EVENT_OCCURRED0_EXTIO_INT2_MASK 0x0100000000000000UL
358#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_SHFT 57
359#define UV2H_EVENT_OCCURRED0_EXTIO_INT3_MASK 0x0200000000000000UL
360#define UV2H_EVENT_OCCURRED0_PROFILE_INT_SHFT 58
361#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL
362
192union uvh_event_occurred0_u { 363union uvh_event_occurred0_u {
193 unsigned long v; 364 unsigned long v;
194 struct uvh_event_occurred0_s { 365 struct uv1h_event_occurred0_s {
195 unsigned long lb_hcerr : 1; /* RW, W1C */ 366 unsigned long lb_hcerr : 1; /* RW, W1C */
196 unsigned long gr0_hcerr : 1; /* RW, W1C */ 367 unsigned long gr0_hcerr : 1; /* RW, W1C */
197 unsigned long gr1_hcerr : 1; /* RW, W1C */ 368 unsigned long gr1_hcerr : 1; /* RW, W1C */
@@ -250,14 +421,76 @@ union uvh_event_occurred0_u {
250 unsigned long bau_data : 1; /* RW, W1C */ 421 unsigned long bau_data : 1; /* RW, W1C */
251 unsigned long power_management_req : 1; /* RW, W1C */ 422 unsigned long power_management_req : 1; /* RW, W1C */
252 unsigned long rsvd_57_63 : 7; /* */ 423 unsigned long rsvd_57_63 : 7; /* */
253 } s; 424 } s1;
425 struct uv2h_event_occurred0_s {
426 unsigned long lb_hcerr : 1; /* RW */
427 unsigned long qp_hcerr : 1; /* RW */
428 unsigned long rh_hcerr : 1; /* RW */
429 unsigned long lh0_hcerr : 1; /* RW */
430 unsigned long lh1_hcerr : 1; /* RW */
431 unsigned long gr0_hcerr : 1; /* RW */
432 unsigned long gr1_hcerr : 1; /* RW */
433 unsigned long ni0_hcerr : 1; /* RW */
434 unsigned long ni1_hcerr : 1; /* RW */
435 unsigned long lb_aoerr0 : 1; /* RW */
436 unsigned long qp_aoerr0 : 1; /* RW */
437 unsigned long rh_aoerr0 : 1; /* RW */
438 unsigned long lh0_aoerr0 : 1; /* RW */
439 unsigned long lh1_aoerr0 : 1; /* RW */
440 unsigned long gr0_aoerr0 : 1; /* RW */
441 unsigned long gr1_aoerr0 : 1; /* RW */
442 unsigned long xb_aoerr0 : 1; /* RW */
443 unsigned long rt_aoerr0 : 1; /* RW */
444 unsigned long ni0_aoerr0 : 1; /* RW */
445 unsigned long ni1_aoerr0 : 1; /* RW */
446 unsigned long lb_aoerr1 : 1; /* RW */
447 unsigned long qp_aoerr1 : 1; /* RW */
448 unsigned long rh_aoerr1 : 1; /* RW */
449 unsigned long lh0_aoerr1 : 1; /* RW */
450 unsigned long lh1_aoerr1 : 1; /* RW */
451 unsigned long gr0_aoerr1 : 1; /* RW */
452 unsigned long gr1_aoerr1 : 1; /* RW */
453 unsigned long xb_aoerr1 : 1; /* RW */
454 unsigned long rt_aoerr1 : 1; /* RW */
455 unsigned long ni0_aoerr1 : 1; /* RW */
456 unsigned long ni1_aoerr1 : 1; /* RW */
457 unsigned long system_shutdown_int : 1; /* RW */
458 unsigned long lb_irq_int_0 : 1; /* RW */
459 unsigned long lb_irq_int_1 : 1; /* RW */
460 unsigned long lb_irq_int_2 : 1; /* RW */
461 unsigned long lb_irq_int_3 : 1; /* RW */
462 unsigned long lb_irq_int_4 : 1; /* RW */
463 unsigned long lb_irq_int_5 : 1; /* RW */
464 unsigned long lb_irq_int_6 : 1; /* RW */
465 unsigned long lb_irq_int_7 : 1; /* RW */
466 unsigned long lb_irq_int_8 : 1; /* RW */
467 unsigned long lb_irq_int_9 : 1; /* RW */
468 unsigned long lb_irq_int_10 : 1; /* RW */
469 unsigned long lb_irq_int_11 : 1; /* RW */
470 unsigned long lb_irq_int_12 : 1; /* RW */
471 unsigned long lb_irq_int_13 : 1; /* RW */
472 unsigned long lb_irq_int_14 : 1; /* RW */
473 unsigned long lb_irq_int_15 : 1; /* RW */
474 unsigned long l1_nmi_int : 1; /* RW */
475 unsigned long stop_clock : 1; /* RW */
476 unsigned long asic_to_l1 : 1; /* RW */
477 unsigned long l1_to_asic : 1; /* RW */
478 unsigned long la_seq_trigger : 1; /* RW */
479 unsigned long ipi_int : 1; /* RW */
480 unsigned long extio_int0 : 1; /* RW */
481 unsigned long extio_int1 : 1; /* RW */
482 unsigned long extio_int2 : 1; /* RW */
483 unsigned long extio_int3 : 1; /* RW */
484 unsigned long profile_int : 1; /* RW */
485 unsigned long rsvd_59_63 : 5; /* */
486 } s2;
254}; 487};
255 488
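The reason union uvh_event_occurred0_u gains separate s1/s2 views is that the same event can sit at a different bit on each hub revision; PROFILE_INT, for example, is bit 50 on UV1 but bit 58 on UV2. A minimal standalone sketch (not part of the patch) that tests the bit with the per-hub masks copied from the definitions above; the uv1 flag stands in for the kernel's hub-type check:

/* Standalone illustration only: the mask values are copied from the
 * UV1H_/UV2H_ definitions in this patch; nothing else is. */
#include <stdbool.h>
#include <stdio.h>

#define UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0004000000000000UL
#define UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK 0x0400000000000000UL

static bool profile_int_pending(unsigned long ev0, bool uv1)
{
	/* Same event name, different bit position per hub revision. */
	return ev0 & (uv1 ? UV1H_EVENT_OCCURRED0_PROFILE_INT_MASK
			  : UV2H_EVENT_OCCURRED0_PROFILE_INT_MASK);
}

int main(void)
{
	unsigned long ev0 = 0x0400000000000000UL;	/* fabricated MMR image */

	printf("UV1 pending: %d  UV2 pending: %d\n",
	       profile_int_pending(ev0, true), profile_int_pending(ev0, false));
	return 0;
}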
256/* ========================================================================= */ 489/* ========================================================================= */
257/* UVH_EVENT_OCCURRED0_ALIAS */ 490/* UVH_EVENT_OCCURRED0_ALIAS */
258/* ========================================================================= */ 491/* ========================================================================= */
259#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL 492#define UVH_EVENT_OCCURRED0_ALIAS 0x0000000000070008UL
260#define UVH_EVENT_OCCURRED0_ALIAS_32 0x005f0 493#define UVH_EVENT_OCCURRED0_ALIAS_32 0x5f0
261 494
262/* ========================================================================= */ 495/* ========================================================================= */
263/* UVH_GR0_TLB_INT0_CONFIG */ 496/* UVH_GR0_TLB_INT0_CONFIG */
@@ -432,8 +665,16 @@ union uvh_int_cmpb_u {
432/* ========================================================================= */ 665/* ========================================================================= */
433#define UVH_INT_CMPC 0x22100UL 666#define UVH_INT_CMPC 0x22100UL
434 667
435#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT 0 668#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
436#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK 0x00ffffffffffffffUL 669#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
670#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \
671 UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \
672 UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
673#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
674#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
675#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \
676 UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \
677 UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)
437 678
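The UVH_INT_CMPC_REAL_TIME_CMPC_* wrappers above now expand to a run-time is_uv1_hub() ternary rather than a literal, so they behave like constants in ordinary C expressions but cannot be used in #if or static initializers. A minimal standalone sketch of that dispatch pattern; is_uv1_hub() is stubbed here (in the kernel it comes from <asm/uv/uv_hub.h>, which this hunk does not show):

#include <stdio.h>

static int is_uv1_hub(void) { return 0; }	/* stub: pretend we run on UV2 */

#define UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
#define UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT 0
#define UVH_INT_CMPC_REAL_TIME_CMPC_SHFT (is_uv1_hub() ? \
		UV1H_INT_CMPC_REAL_TIME_CMPC_SHFT : \
		UV2H_INT_CMPC_REAL_TIME_CMPC_SHFT)
#define UV1H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
#define UV2H_INT_CMPC_REAL_TIME_CMPC_MASK 0xffffffffffffffUL
#define UVH_INT_CMPC_REAL_TIME_CMPC_MASK (is_uv1_hub() ? \
		UV1H_INT_CMPC_REAL_TIME_CMPC_MASK : \
		UV2H_INT_CMPC_REAL_TIME_CMPC_MASK)

int main(void)
{
	unsigned long mmr = 0x0123456789abcdefUL;	/* fabricated MMR image */

	/* The wrapper picks the UV1 or UV2 layout when this runs. */
	printf("cmpc = 0x%lx\n",
	       (mmr & UVH_INT_CMPC_REAL_TIME_CMPC_MASK)
			>> UVH_INT_CMPC_REAL_TIME_CMPC_SHFT);
	return 0;
}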
438union uvh_int_cmpc_u { 679union uvh_int_cmpc_u {
439 unsigned long v; 680 unsigned long v;
@@ -448,8 +689,16 @@ union uvh_int_cmpc_u {
448/* ========================================================================= */ 689/* ========================================================================= */
449#define UVH_INT_CMPD 0x22180UL 690#define UVH_INT_CMPD 0x22180UL
450 691
451#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT 0 692#define UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
452#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK 0x00ffffffffffffffUL 693#define UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT 0
694#define UVH_INT_CMPD_REAL_TIME_CMPD_SHFT (is_uv1_hub() ? \
695 UV1H_INT_CMPD_REAL_TIME_CMPD_SHFT : \
696 UV2H_INT_CMPD_REAL_TIME_CMPD_SHFT)
697#define UV1H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
698#define UV2H_INT_CMPD_REAL_TIME_CMPD_MASK 0xffffffffffffffUL
699#define UVH_INT_CMPD_REAL_TIME_CMPD_MASK (is_uv1_hub() ? \
700 UV1H_INT_CMPD_REAL_TIME_CMPD_MASK : \
701 UV2H_INT_CMPD_REAL_TIME_CMPD_MASK)
453 702
454union uvh_int_cmpd_u { 703union uvh_int_cmpd_u {
455 unsigned long v; 704 unsigned long v;
@@ -463,7 +712,7 @@ union uvh_int_cmpd_u {
463/* UVH_IPI_INT */ 712/* UVH_IPI_INT */
464/* ========================================================================= */ 713/* ========================================================================= */
465#define UVH_IPI_INT 0x60500UL 714#define UVH_IPI_INT 0x60500UL
466#define UVH_IPI_INT_32 0x0348 715#define UVH_IPI_INT_32 0x348
467 716
468#define UVH_IPI_INT_VECTOR_SHFT 0 717#define UVH_IPI_INT_VECTOR_SHFT 0
469#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL 718#define UVH_IPI_INT_VECTOR_MASK 0x00000000000000ffUL
@@ -493,7 +742,7 @@ union uvh_ipi_int_u {
493/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */ 742/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST */
494/* ========================================================================= */ 743/* ========================================================================= */
495#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL 744#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST 0x320050UL
496#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x009c0 745#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_32 0x9c0
497 746
498#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4 747#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_SHFT 4
499#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL 748#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -515,7 +764,7 @@ union uvh_lb_bau_intd_payload_queue_first_u {
515/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */ 764/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST */
516/* ========================================================================= */ 765/* ========================================================================= */
517#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL 766#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST 0x320060UL
518#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x009c8 767#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_32 0x9c8
519 768
520#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4 769#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_SHFT 4
521#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL 770#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST_ADDRESS_MASK 0x000007fffffffff0UL
@@ -533,7 +782,7 @@ union uvh_lb_bau_intd_payload_queue_last_u {
533/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */ 782/* UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL */
534/* ========================================================================= */ 783/* ========================================================================= */
535#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL 784#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL 0x320070UL
536#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x009d0 785#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_32 0x9d0
537 786
538#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4 787#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_SHFT 4
539#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL 788#define UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL_ADDRESS_MASK 0x000007fffffffff0UL
@@ -551,7 +800,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
551/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */ 800/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE */
552/* ========================================================================= */ 801/* ========================================================================= */
553#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL 802#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE 0x320080UL
554#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0x0a68 803#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_32 0xa68
555 804
556#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0 805#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_SHFT 0
557#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL 806#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_PENDING_0_MASK 0x0000000000000001UL
@@ -585,6 +834,7 @@ union uvh_lb_bau_intd_payload_queue_tail_u {
585#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL 834#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_6_MASK 0x0000000000004000UL
586#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15 835#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_SHFT 15
587#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL 836#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_TIMEOUT_7_MASK 0x0000000000008000UL
837
588union uvh_lb_bau_intd_software_acknowledge_u { 838union uvh_lb_bau_intd_software_acknowledge_u {
589 unsigned long v; 839 unsigned long v;
590 struct uvh_lb_bau_intd_software_acknowledge_s { 840 struct uvh_lb_bau_intd_software_acknowledge_s {
@@ -612,13 +862,13 @@ union uvh_lb_bau_intd_software_acknowledge_u {
612/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */ 862/* UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS */
613/* ========================================================================= */ 863/* ========================================================================= */
614#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL 864#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS 0x0000000000320088UL
615#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0x0a70 865#define UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS_32 0xa70
616 866
617/* ========================================================================= */ 867/* ========================================================================= */
618/* UVH_LB_BAU_MISC_CONTROL */ 868/* UVH_LB_BAU_MISC_CONTROL */
619/* ========================================================================= */ 869/* ========================================================================= */
620#define UVH_LB_BAU_MISC_CONTROL 0x320170UL 870#define UVH_LB_BAU_MISC_CONTROL 0x320170UL
621#define UVH_LB_BAU_MISC_CONTROL_32 0x00a10 871#define UVH_LB_BAU_MISC_CONTROL_32 0xa10
622 872
623#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0 873#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
624#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL 874#define UVH_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
@@ -628,8 +878,8 @@ union uvh_lb_bau_intd_software_acknowledge_u {
628#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL 878#define UVH_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
629#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10 879#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
630#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL 880#define UVH_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
631#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_SHFT 11 881#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
632#define UVH_LB_BAU_MISC_CONTROL_CSI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL 882#define UVH_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
633#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14 883#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
634#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL 884#define UVH_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
635#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15 885#define UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
@@ -650,8 +900,86 @@ union uvh_lb_bau_intd_software_acknowledge_u {
650#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL 900#define UVH_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
651#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28 901#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
652#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL 902#define UVH_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
653#define UVH_LB_BAU_MISC_CONTROL_FUN_SHFT 48 903
654#define UVH_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL 904#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
905#define UV1H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
906#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
907#define UV1H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
908#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
909#define UV1H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
910#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
911#define UV1H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
912#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
913#define UV1H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
914#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
915#define UV1H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
916#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
917#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
918#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
919#define UV1H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
920#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
921#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
922#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
923#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
924#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
925#define UV1H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
926#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
927#define UV1H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
928#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
929#define UV1H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
930#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
931#define UV1H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
932#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
933#define UV1H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
934#define UV1H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
935#define UV1H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
936
937#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_SHFT 0
938#define UV2H_LB_BAU_MISC_CONTROL_REJECTION_DELAY_MASK 0x00000000000000ffUL
939#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_SHFT 8
940#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_MASK 0x0000000000000100UL
941#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_SHFT 9
942#define UV2H_LB_BAU_MISC_CONTROL_FORCE_BROADCAST_MASK 0x0000000000000200UL
943#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_SHFT 10
944#define UV2H_LB_BAU_MISC_CONTROL_FORCE_LOCK_NOP_MASK 0x0000000000000400UL
945#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_SHFT 11
946#define UV2H_LB_BAU_MISC_CONTROL_QPI_AGENT_PRESENCE_VECTOR_MASK 0x0000000000003800UL
947#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_SHFT 14
948#define UV2H_LB_BAU_MISC_CONTROL_DESCRIPTOR_FETCH_MODE_MASK 0x0000000000004000UL
949#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT 15
950#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_MASK 0x0000000000008000UL
951#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT 16
952#define UV2H_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_MASK 0x00000000000f0000UL
953#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_SHFT 20
954#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_DUAL_MAPPING_MODE_MASK 0x0000000000100000UL
955#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_SHFT 21
956#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_DECODE_ENABLE_MASK 0x0000000000200000UL
957#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_SHFT 22
958#define UV2H_LB_BAU_MISC_CONTROL_VGA_IO_PORT_16_BIT_DECODE_MASK 0x0000000000400000UL
959#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_SHFT 23
960#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_DEST_REGISTRATION_MASK 0x0000000000800000UL
961#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_SHFT 24
962#define UV2H_LB_BAU_MISC_CONTROL_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000007000000UL
963#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_SHFT 27
964#define UV2H_LB_BAU_MISC_CONTROL_USE_INCOMING_PRIORITY_MASK 0x0000000008000000UL
965#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_SHFT 28
966#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_PROGRAMMED_INITIAL_PRIORITY_MASK 0x0000000010000000UL
967#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_SHFT 29
968#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_AUTOMATIC_APIC_MODE_SELECTION_MASK 0x0000000020000000UL
969#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_SHFT 30
970#define UV2H_LB_BAU_MISC_CONTROL_APIC_MODE_STATUS_MASK 0x0000000040000000UL
971#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_SHFT 31
972#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INTERRUPTS_TO_SELF_MASK 0x0000000080000000UL
973#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_SHFT 32
974#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_LOCK_BASED_SYSTEM_FLUSH_MASK 0x0000000100000000UL
975#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_SHFT 33
976#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
977#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_SHFT 34
978#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL
979#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_SHFT 35
980#define UV2H_LB_BAU_MISC_CONTROL_USE_LEGACY_DESCRIPTOR_FORMATS_MASK 0x0000000800000000UL
981#define UV2H_LB_BAU_MISC_CONTROL_FUN_SHFT 48
982#define UV2H_LB_BAU_MISC_CONTROL_FUN_MASK 0xffff000000000000UL
655 983
656union uvh_lb_bau_misc_control_u { 984union uvh_lb_bau_misc_control_u {
657 unsigned long v; 985 unsigned long v;
@@ -660,7 +988,25 @@ union uvh_lb_bau_misc_control_u {
660 unsigned long apic_mode : 1; /* RW */ 988 unsigned long apic_mode : 1; /* RW */
661 unsigned long force_broadcast : 1; /* RW */ 989 unsigned long force_broadcast : 1; /* RW */
662 unsigned long force_lock_nop : 1; /* RW */ 990 unsigned long force_lock_nop : 1; /* RW */
663 unsigned long csi_agent_presence_vector : 3; /* RW */ 991 unsigned long qpi_agent_presence_vector : 3; /* RW */
992 unsigned long descriptor_fetch_mode : 1; /* RW */
993 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
994 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
995 unsigned long enable_dual_mapping_mode : 1; /* RW */
996 unsigned long vga_io_port_decode_enable : 1; /* RW */
997 unsigned long vga_io_port_16_bit_decode : 1; /* RW */
998 unsigned long suppress_dest_registration : 1; /* RW */
999 unsigned long programmed_initial_priority : 3; /* RW */
1000 unsigned long use_incoming_priority : 1; /* RW */
1001 unsigned long enable_programmed_initial_priority : 1; /* RW */
1002 unsigned long rsvd_29_63 : 35;
1003 } s;
1004 struct uv1h_lb_bau_misc_control_s {
1005 unsigned long rejection_delay : 8; /* RW */
1006 unsigned long apic_mode : 1; /* RW */
1007 unsigned long force_broadcast : 1; /* RW */
1008 unsigned long force_lock_nop : 1; /* RW */
1009 unsigned long qpi_agent_presence_vector : 3; /* RW */
664 unsigned long descriptor_fetch_mode : 1; /* RW */ 1010 unsigned long descriptor_fetch_mode : 1; /* RW */
665 unsigned long enable_intd_soft_ack_mode : 1; /* RW */ 1011 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
666 unsigned long intd_soft_ack_timeout_period : 4; /* RW */ 1012 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
@@ -673,14 +1019,40 @@ union uvh_lb_bau_misc_control_u {
673 unsigned long enable_programmed_initial_priority : 1; /* RW */ 1019 unsigned long enable_programmed_initial_priority : 1; /* RW */
674 unsigned long rsvd_29_47 : 19; /* */ 1020 unsigned long rsvd_29_47 : 19; /* */
675 unsigned long fun : 16; /* RW */ 1021 unsigned long fun : 16; /* RW */
676 } s; 1022 } s1;
1023 struct uv2h_lb_bau_misc_control_s {
1024 unsigned long rejection_delay : 8; /* RW */
1025 unsigned long apic_mode : 1; /* RW */
1026 unsigned long force_broadcast : 1; /* RW */
1027 unsigned long force_lock_nop : 1; /* RW */
1028 unsigned long qpi_agent_presence_vector : 3; /* RW */
1029 unsigned long descriptor_fetch_mode : 1; /* RW */
1030 unsigned long enable_intd_soft_ack_mode : 1; /* RW */
1031 unsigned long intd_soft_ack_timeout_period : 4; /* RW */
1032 unsigned long enable_dual_mapping_mode : 1; /* RW */
1033 unsigned long vga_io_port_decode_enable : 1; /* RW */
1034 unsigned long vga_io_port_16_bit_decode : 1; /* RW */
1035 unsigned long suppress_dest_registration : 1; /* RW */
1036 unsigned long programmed_initial_priority : 3; /* RW */
1037 unsigned long use_incoming_priority : 1; /* RW */
1038 unsigned long enable_programmed_initial_priority : 1; /* RW */
1039 unsigned long enable_automatic_apic_mode_selection : 1; /* RW */
1040 unsigned long apic_mode_status : 1; /* RO */
1041 unsigned long suppress_interrupts_to_self : 1; /* RW */
1042 unsigned long enable_lock_based_system_flush : 1; /* RW */
1043 unsigned long enable_extended_sb_status : 1; /* RW */
1044 unsigned long suppress_int_prio_udt_to_self : 1; /* RW */
1045 unsigned long use_legacy_descriptor_formats : 1; /* RW */
1046 unsigned long rsvd_36_47 : 12; /* */
1047 unsigned long fun : 16; /* RW */
1048 } s2;
677}; 1049};
678 1050
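UV2 extends LB_BAU_MISC_CONTROL with bits 29-35 that are reserved on UV1, so anything built from the UV2H_* masks has to be gated on the hub type. A small standalone sketch (not part of the patch; the MMR read and write-back are deliberately omitted) that sets two of the UV2-only bits in a cached register image:

/* Standalone illustration only: mask values copied from the UV2H_
 * definitions above; uv2_bau_tunables() is a hypothetical helper. */
#include <stdio.h>

#define UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK 0x0000000200000000UL
#define UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK 0x0000000400000000UL

static unsigned long uv2_bau_tunables(unsigned long misc_control)
{
	/* Set two UV2-only control bits in the cached image; a real
	 * driver would write the result back to the MMR afterwards. */
	misc_control |= UV2H_LB_BAU_MISC_CONTROL_ENABLE_EXTENDED_SB_STATUS_MASK;
	misc_control |= UV2H_LB_BAU_MISC_CONTROL_SUPPRESS_INT_PRIO_UDT_TO_SELF_MASK;
	return misc_control;
}

int main(void)
{
	printf("misc_control = 0x%016lx\n", uv2_bau_tunables(0));
	return 0;
}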
679/* ========================================================================= */ 1051/* ========================================================================= */
680/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */ 1052/* UVH_LB_BAU_SB_ACTIVATION_CONTROL */
681/* ========================================================================= */ 1053/* ========================================================================= */
682#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL 1054#define UVH_LB_BAU_SB_ACTIVATION_CONTROL 0x320020UL
683#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x009a8 1055#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_32 0x9a8
684 1056
685#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0 1057#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_SHFT 0
686#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL 1058#define UVH_LB_BAU_SB_ACTIVATION_CONTROL_INDEX_MASK 0x000000000000003fUL
@@ -703,7 +1075,7 @@ union uvh_lb_bau_sb_activation_control_u {
703/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */ 1075/* UVH_LB_BAU_SB_ACTIVATION_STATUS_0 */
704/* ========================================================================= */ 1076/* ========================================================================= */
705#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL 1077#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0 0x320030UL
706#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x009b0 1078#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_32 0x9b0
707 1079
708#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0 1080#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_SHFT 0
709#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL 1081#define UVH_LB_BAU_SB_ACTIVATION_STATUS_0_STATUS_MASK 0xffffffffffffffffUL
@@ -719,7 +1091,7 @@ union uvh_lb_bau_sb_activation_status_0_u {
719/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */ 1091/* UVH_LB_BAU_SB_ACTIVATION_STATUS_1 */
720/* ========================================================================= */ 1092/* ========================================================================= */
721#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL 1093#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1 0x320040UL
722#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x009b8 1094#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_32 0x9b8
723 1095
724#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0 1096#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_SHFT 0
725#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL 1097#define UVH_LB_BAU_SB_ACTIVATION_STATUS_1_STATUS_MASK 0xffffffffffffffffUL
@@ -735,7 +1107,7 @@ union uvh_lb_bau_sb_activation_status_1_u {
735/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */ 1107/* UVH_LB_BAU_SB_DESCRIPTOR_BASE */
736/* ========================================================================= */ 1108/* ========================================================================= */
737#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL 1109#define UVH_LB_BAU_SB_DESCRIPTOR_BASE 0x320010UL
738#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x009a0 1110#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_32 0x9a0
739 1111
740#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12 1112#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_SHFT 12
741#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL 1113#define UVH_LB_BAU_SB_DESCRIPTOR_BASE_PAGE_ADDRESS_MASK 0x000007fffffff000UL
@@ -768,10 +1140,36 @@ union uvh_lb_bau_sb_descriptor_base_u {
768#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL 1140#define UVH_NODE_ID_REVISION_MASK 0x00000000f0000000UL
769#define UVH_NODE_ID_NODE_ID_SHFT 32 1141#define UVH_NODE_ID_NODE_ID_SHFT 32
770#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL 1142#define UVH_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
771#define UVH_NODE_ID_NODES_PER_BIT_SHFT 48 1143
772#define UVH_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL 1144#define UV1H_NODE_ID_FORCE1_SHFT 0
773#define UVH_NODE_ID_NI_PORT_SHFT 56 1145#define UV1H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
774#define UVH_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL 1146#define UV1H_NODE_ID_MANUFACTURER_SHFT 1
1147#define UV1H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1148#define UV1H_NODE_ID_PART_NUMBER_SHFT 12
1149#define UV1H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1150#define UV1H_NODE_ID_REVISION_SHFT 28
1151#define UV1H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1152#define UV1H_NODE_ID_NODE_ID_SHFT 32
1153#define UV1H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1154#define UV1H_NODE_ID_NODES_PER_BIT_SHFT 48
1155#define UV1H_NODE_ID_NODES_PER_BIT_MASK 0x007f000000000000UL
1156#define UV1H_NODE_ID_NI_PORT_SHFT 56
1157#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
1158
1159#define UV2H_NODE_ID_FORCE1_SHFT 0
1160#define UV2H_NODE_ID_FORCE1_MASK 0x0000000000000001UL
1161#define UV2H_NODE_ID_MANUFACTURER_SHFT 1
1162#define UV2H_NODE_ID_MANUFACTURER_MASK 0x0000000000000ffeUL
1163#define UV2H_NODE_ID_PART_NUMBER_SHFT 12
1164#define UV2H_NODE_ID_PART_NUMBER_MASK 0x000000000ffff000UL
1165#define UV2H_NODE_ID_REVISION_SHFT 28
1166#define UV2H_NODE_ID_REVISION_MASK 0x00000000f0000000UL
1167#define UV2H_NODE_ID_NODE_ID_SHFT 32
1168#define UV2H_NODE_ID_NODE_ID_MASK 0x00007fff00000000UL
1169#define UV2H_NODE_ID_NODES_PER_BIT_SHFT 50
1170#define UV2H_NODE_ID_NODES_PER_BIT_MASK 0x01fc000000000000UL
1171#define UV2H_NODE_ID_NI_PORT_SHFT 57
1172#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL
775 1173
776union uvh_node_id_u { 1174union uvh_node_id_u {
777 unsigned long v; 1175 unsigned long v;
@@ -781,12 +1179,31 @@ union uvh_node_id_u {
781 unsigned long part_number : 16; /* RO */ 1179 unsigned long part_number : 16; /* RO */
782 unsigned long revision : 4; /* RO */ 1180 unsigned long revision : 4; /* RO */
783 unsigned long node_id : 15; /* RW */ 1181 unsigned long node_id : 15; /* RW */
1182 unsigned long rsvd_47_63 : 17;
1183 } s;
1184 struct uv1h_node_id_s {
1185 unsigned long force1 : 1; /* RO */
1186 unsigned long manufacturer : 11; /* RO */
1187 unsigned long part_number : 16; /* RO */
1188 unsigned long revision : 4; /* RO */
1189 unsigned long node_id : 15; /* RW */
784 unsigned long rsvd_47 : 1; /* */ 1190 unsigned long rsvd_47 : 1; /* */
785 unsigned long nodes_per_bit : 7; /* RW */ 1191 unsigned long nodes_per_bit : 7; /* RW */
786 unsigned long rsvd_55 : 1; /* */ 1192 unsigned long rsvd_55 : 1; /* */
787 unsigned long ni_port : 4; /* RO */ 1193 unsigned long ni_port : 4; /* RO */
788 unsigned long rsvd_60_63 : 4; /* */ 1194 unsigned long rsvd_60_63 : 4; /* */
789 } s; 1195 } s1;
1196 struct uv2h_node_id_s {
1197 unsigned long force1 : 1; /* RO */
1198 unsigned long manufacturer : 11; /* RO */
1199 unsigned long part_number : 16; /* RO */
1200 unsigned long revision : 4; /* RO */
1201 unsigned long node_id : 15; /* RW */
1202 unsigned long rsvd_47_49 : 3; /* */
1203 unsigned long nodes_per_bit : 7; /* RO */
1204 unsigned long ni_port : 5; /* RO */
1205 unsigned long rsvd_62_63 : 2; /* */
1206 } s2;
790}; 1207};
791 1208
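NODE_ID shows why the per-hub views differ even when the low fields are shared: NODES_PER_BIT and NI_PORT move up on UV2, and NI_PORT widens from 4 to 5 bits. A standalone sketch (not part of the patch) decoding NI_PORT with the masks defined above; the is_uv1 flag stands in for the kernel's hub-type check:

/* Standalone illustration only: shift/mask values copied from the
 * UV1H_/UV2H_NODE_ID definitions in this patch. */
#include <stdio.h>

#define UV1H_NODE_ID_NI_PORT_SHFT 56
#define UV1H_NODE_ID_NI_PORT_MASK 0x0f00000000000000UL
#define UV2H_NODE_ID_NI_PORT_SHFT 57
#define UV2H_NODE_ID_NI_PORT_MASK 0x3e00000000000000UL

static unsigned long ni_port(unsigned long node_id_mmr, int is_uv1)
{
	return is_uv1 ?
		(node_id_mmr & UV1H_NODE_ID_NI_PORT_MASK) >> UV1H_NODE_ID_NI_PORT_SHFT :
		(node_id_mmr & UV2H_NODE_ID_NI_PORT_MASK) >> UV2H_NODE_ID_NI_PORT_SHFT;
}

int main(void)
{
	unsigned long mmr = 0x3e00000000000000UL;	/* fabricated: all UV2 NI_PORT bits */

	printf("UV1 ni_port %lu, UV2 ni_port %lu\n",
	       ni_port(mmr, 1), ni_port(mmr, 0));
	return 0;
}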
792/* ========================================================================= */ 1209/* ========================================================================= */
@@ -806,6 +1223,78 @@ union uvh_node_present_table_u {
806}; 1223};
807 1224
808/* ========================================================================= */ 1225/* ========================================================================= */
1226/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR */
1227/* ========================================================================= */
1228#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR 0x16000c8UL
1229
1230#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_SHFT 24
1231#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_BASE_MASK 0x00000000ff000000UL
1232#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_SHFT 48
1233#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_M_ALIAS_MASK 0x001f000000000000UL
1234#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_SHFT 63
1235#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR_ENABLE_MASK 0x8000000000000000UL
1236
1237union uvh_rh_gam_alias210_overlay_config_0_mmr_u {
1238 unsigned long v;
1239 struct uvh_rh_gam_alias210_overlay_config_0_mmr_s {
1240 unsigned long rsvd_0_23: 24; /* */
1241 unsigned long base : 8; /* RW */
1242 unsigned long rsvd_32_47: 16; /* */
1243 unsigned long m_alias : 5; /* RW */
1244 unsigned long rsvd_53_62: 10; /* */
1245 unsigned long enable : 1; /* RW */
1246 } s;
1247};
1248
1249/* ========================================================================= */
1250/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR */
1251/* ========================================================================= */
1252#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR 0x16000d8UL
1253
1254#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_SHFT 24
1255#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_BASE_MASK 0x00000000ff000000UL
1256#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_SHFT 48
1257#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_M_ALIAS_MASK 0x001f000000000000UL
1258#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_SHFT 63
1259#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR_ENABLE_MASK 0x8000000000000000UL
1260
1261union uvh_rh_gam_alias210_overlay_config_1_mmr_u {
1262 unsigned long v;
1263 struct uvh_rh_gam_alias210_overlay_config_1_mmr_s {
1264 unsigned long rsvd_0_23: 24; /* */
1265 unsigned long base : 8; /* RW */
1266 unsigned long rsvd_32_47: 16; /* */
1267 unsigned long m_alias : 5; /* RW */
1268 unsigned long rsvd_53_62: 10; /* */
1269 unsigned long enable : 1; /* RW */
1270 } s;
1271};
1272
1273/* ========================================================================= */
1274/* UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR */
1275/* ========================================================================= */
1276#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR 0x16000e8UL
1277
1278#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_SHFT 24
1279#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_BASE_MASK 0x00000000ff000000UL
1280#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_SHFT 48
1281#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_M_ALIAS_MASK 0x001f000000000000UL
1282#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_SHFT 63
1283#define UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR_ENABLE_MASK 0x8000000000000000UL
1284
1285union uvh_rh_gam_alias210_overlay_config_2_mmr_u {
1286 unsigned long v;
1287 struct uvh_rh_gam_alias210_overlay_config_2_mmr_s {
1288 unsigned long rsvd_0_23: 24; /* */
1289 unsigned long base : 8; /* RW */
1290 unsigned long rsvd_32_47: 16; /* */
1291 unsigned long m_alias : 5; /* RW */
1292 unsigned long rsvd_53_62: 10; /* */
1293 unsigned long enable : 1; /* RW */
1294 } s;
1295};
1296
1297/* ========================================================================= */
809/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */ 1298/* UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR */
810/* ========================================================================= */ 1299/* ========================================================================= */
811#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL 1300#define UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR 0x16000d0UL
@@ -857,31 +1346,98 @@ union uvh_rh_gam_alias210_redirect_config_2_mmr_u {
857}; 1346};
858 1347
859/* ========================================================================= */ 1348/* ========================================================================= */
1349/* UVH_RH_GAM_CONFIG_MMR */
1350/* ========================================================================= */
1351#define UVH_RH_GAM_CONFIG_MMR 0x1600000UL
1352
1353#define UVH_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1354#define UVH_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1355#define UVH_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1356#define UVH_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1357
1358#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1359#define UV1H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1360#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1361#define UV1H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1362#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_SHFT 12
1363#define UV1H_RH_GAM_CONFIG_MMR_MMIOL_CFG_MASK 0x0000000000001000UL
1364
1365#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_SHFT 0
1366#define UV2H_RH_GAM_CONFIG_MMR_M_SKT_MASK 0x000000000000003fUL
1367#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_SHFT 6
1368#define UV2H_RH_GAM_CONFIG_MMR_N_SKT_MASK 0x00000000000003c0UL
1369
1370union uvh_rh_gam_config_mmr_u {
1371 unsigned long v;
1372 struct uvh_rh_gam_config_mmr_s {
1373 unsigned long m_skt : 6; /* RW */
1374 unsigned long n_skt : 4; /* RW */
1375 unsigned long rsvd_10_63 : 54;
1376 } s;
1377 struct uv1h_rh_gam_config_mmr_s {
1378 unsigned long m_skt : 6; /* RW */
1379 unsigned long n_skt : 4; /* RW */
1380 unsigned long rsvd_10_11: 2; /* */
1381 unsigned long mmiol_cfg : 1; /* RW */
1382 unsigned long rsvd_13_63: 51; /* */
1383 } s1;
1384 struct uv2h_rh_gam_config_mmr_s {
1385 unsigned long m_skt : 6; /* RW */
1386 unsigned long n_skt : 4; /* RW */
1387 unsigned long rsvd_10_63: 54; /* */
1388 } s2;
1389};
1390
1391/* ========================================================================= */
860/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */ 1392/* UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR */
861/* ========================================================================= */ 1393/* ========================================================================= */
862#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL 1394#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR 0x1600010UL
863 1395
864#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28 1396#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
865#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL 1397#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
866#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48 1398
867#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL 1399#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
868#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52 1400#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
869#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL 1401#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_SHFT 48
870#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1402#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_GR4_MASK 0x0001000000000000UL
871#define UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1403#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1404#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1405#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1406#define UV1H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1407
1408#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT 28
1409#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff0000000UL
1410#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_SHFT 52
1411#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_N_GRU_MASK 0x00f0000000000000UL
1412#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1413#define UV2H_RH_GAM_GRU_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
872 1414
873union uvh_rh_gam_gru_overlay_config_mmr_u { 1415union uvh_rh_gam_gru_overlay_config_mmr_u {
874 unsigned long v; 1416 unsigned long v;
875 struct uvh_rh_gam_gru_overlay_config_mmr_s { 1417 struct uvh_rh_gam_gru_overlay_config_mmr_s {
876 unsigned long rsvd_0_27: 28; /* */ 1418 unsigned long rsvd_0_27: 28; /* */
877 unsigned long base : 18; /* RW */ 1419 unsigned long base : 18; /* RW */
1420 unsigned long rsvd_46_62 : 17;
1421 unsigned long enable : 1; /* RW */
1422 } s;
1423 struct uv1h_rh_gam_gru_overlay_config_mmr_s {
1424 unsigned long rsvd_0_27: 28; /* */
1425 unsigned long base : 18; /* RW */
878 unsigned long rsvd_46_47: 2; /* */ 1426 unsigned long rsvd_46_47: 2; /* */
879 unsigned long gr4 : 1; /* RW */ 1427 unsigned long gr4 : 1; /* RW */
880 unsigned long rsvd_49_51: 3; /* */ 1428 unsigned long rsvd_49_51: 3; /* */
881 unsigned long n_gru : 4; /* RW */ 1429 unsigned long n_gru : 4; /* RW */
882 unsigned long rsvd_56_62: 7; /* */ 1430 unsigned long rsvd_56_62: 7; /* */
883 unsigned long enable : 1; /* RW */ 1431 unsigned long enable : 1; /* RW */
884 } s; 1432 } s1;
1433 struct uv2h_rh_gam_gru_overlay_config_mmr_s {
1434 unsigned long rsvd_0_27: 28; /* */
1435 unsigned long base : 18; /* RW */
1436 unsigned long rsvd_46_51: 6; /* */
1437 unsigned long n_gru : 4; /* RW */
1438 unsigned long rsvd_56_62: 7; /* */
1439 unsigned long enable : 1; /* RW */
1440 } s2;
885}; 1441};
886 1442
887/* ========================================================================= */ 1443/* ========================================================================= */
@@ -889,25 +1445,42 @@ union uvh_rh_gam_gru_overlay_config_mmr_u {
889/* ========================================================================= */ 1445/* ========================================================================= */
890#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL 1446#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR 0x1600030UL
891 1447
892#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30 1448#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 30
893#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL 1449#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
894#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46 1450#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
895#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL 1451#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
896#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52 1452#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
897#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL 1453#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
898#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1454#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
899#define UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1455#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1456
1457#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT 27
1458#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL
1459#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_SHFT 46
1460#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_M_IO_MASK 0x000fc00000000000UL
1461#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_SHFT 52
1462#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_N_IO_MASK 0x00f0000000000000UL
1463#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1464#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
900 1465
901union uvh_rh_gam_mmioh_overlay_config_mmr_u { 1466union uvh_rh_gam_mmioh_overlay_config_mmr_u {
902 unsigned long v; 1467 unsigned long v;
903 struct uvh_rh_gam_mmioh_overlay_config_mmr_s { 1468 struct uv1h_rh_gam_mmioh_overlay_config_mmr_s {
904 unsigned long rsvd_0_29: 30; /* */ 1469 unsigned long rsvd_0_29: 30; /* */
905 unsigned long base : 16; /* RW */ 1470 unsigned long base : 16; /* RW */
906 unsigned long m_io : 6; /* RW */ 1471 unsigned long m_io : 6; /* RW */
907 unsigned long n_io : 4; /* RW */ 1472 unsigned long n_io : 4; /* RW */
908 unsigned long rsvd_56_62: 7; /* */ 1473 unsigned long rsvd_56_62: 7; /* */
909 unsigned long enable : 1; /* RW */ 1474 unsigned long enable : 1; /* RW */
910 } s; 1475 } s1;
1476 struct uv2h_rh_gam_mmioh_overlay_config_mmr_s {
1477 unsigned long rsvd_0_26: 27; /* */
1478 unsigned long base : 19; /* RW */
1479 unsigned long m_io : 6; /* RW */
1480 unsigned long n_io : 4; /* RW */
1481 unsigned long rsvd_56_62: 7; /* */
1482 unsigned long enable : 1; /* RW */
1483 } s2;
911}; 1484};
912 1485
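For the MMIOH overlay window the BASE field keeps its top at bit 45 but grows downward on UV2 (bits 45:27 instead of 45:30). Assuming the usual overlay-MMR convention that BASE carries the upper physical-address bits of the window start, a standalone sketch (not part of the patch) recovering the window base with the per-hub masks; masking in place is the same as extracting the field and shifting it back up:

/* Standalone illustration only: mask values copied from the
 * UV1H_/UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG definitions above. */
#include <stdio.h>

#define UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003fffc0000000UL
#define UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffff8000000UL

static unsigned long mmioh_base(unsigned long mmr, int is_uv1)
{
	/* Keep the BASE bits in place: UV1 uses bits 45:30, UV2 45:27. */
	return mmr & (is_uv1 ? UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK
			     : UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_MASK);
}

int main(void)
{
	unsigned long mmr = 0x00003ffff8000000UL;	/* fabricated: all UV2 BASE bits */

	printf("UV1 base 0x%lx, UV2 base 0x%lx\n",
	       mmioh_base(mmr, 1), mmioh_base(mmr, 0));
	return 0;
}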
913/* ========================================================================= */ 1486/* ========================================================================= */
@@ -917,20 +1490,40 @@ union uvh_rh_gam_mmioh_overlay_config_mmr_u {
917 1490
918#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26 1491#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
919#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL 1492#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
920#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46 1493
921#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL 1494#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
922#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63 1495#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
923#define UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL 1496#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_SHFT 46
1497#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_DUAL_HUB_MASK 0x0000400000000000UL
1498#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1499#define UV1H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
1500
1501#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_SHFT 26
1502#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_BASE_MASK 0x00003ffffc000000UL
1503#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_SHFT 63
1504#define UV2H_RH_GAM_MMR_OVERLAY_CONFIG_MMR_ENABLE_MASK 0x8000000000000000UL
924 1505
925union uvh_rh_gam_mmr_overlay_config_mmr_u { 1506union uvh_rh_gam_mmr_overlay_config_mmr_u {
926 unsigned long v; 1507 unsigned long v;
927 struct uvh_rh_gam_mmr_overlay_config_mmr_s { 1508 struct uvh_rh_gam_mmr_overlay_config_mmr_s {
928 unsigned long rsvd_0_25: 26; /* */ 1509 unsigned long rsvd_0_25: 26; /* */
929 unsigned long base : 20; /* RW */ 1510 unsigned long base : 20; /* RW */
1511 unsigned long rsvd_46_62 : 17;
1512 unsigned long enable : 1; /* RW */
1513 } s;
1514 struct uv1h_rh_gam_mmr_overlay_config_mmr_s {
1515 unsigned long rsvd_0_25: 26; /* */
1516 unsigned long base : 20; /* RW */
930 unsigned long dual_hub : 1; /* RW */ 1517 unsigned long dual_hub : 1; /* RW */
931 unsigned long rsvd_47_62: 16; /* */ 1518 unsigned long rsvd_47_62: 16; /* */
932 unsigned long enable : 1; /* RW */ 1519 unsigned long enable : 1; /* RW */
933 } s; 1520 } s1;
1521 struct uv2h_rh_gam_mmr_overlay_config_mmr_s {
1522 unsigned long rsvd_0_25: 26; /* */
1523 unsigned long base : 20; /* RW */
1524 unsigned long rsvd_46_62: 17; /* */
1525 unsigned long enable : 1; /* RW */
1526 } s2;
934}; 1527};
935 1528
936/* ========================================================================= */ 1529/* ========================================================================= */
@@ -988,96 +1581,169 @@ union uvh_rtc1_int_config_u {
988}; 1581};
989 1582
990/* ========================================================================= */ 1583/* ========================================================================= */
991/* UVH_SI_ADDR_MAP_CONFIG */ 1584/* UVH_SCRATCH5 */
992/* ========================================================================= */ 1585/* ========================================================================= */
993#define UVH_SI_ADDR_MAP_CONFIG 0xc80000UL 1586#define UVH_SCRATCH5 0x2d0200UL
1587#define UVH_SCRATCH5_32 0x778
994 1588
995#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_SHFT 0 1589#define UVH_SCRATCH5_SCRATCH5_SHFT 0
996#define UVH_SI_ADDR_MAP_CONFIG_M_SKT_MASK 0x000000000000003fUL 1590#define UVH_SCRATCH5_SCRATCH5_MASK 0xffffffffffffffffUL
997#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_SHFT 8
998#define UVH_SI_ADDR_MAP_CONFIG_N_SKT_MASK 0x0000000000000f00UL
999 1591
1000union uvh_si_addr_map_config_u { 1592union uvh_scratch5_u {
1001 unsigned long v; 1593 unsigned long v;
1002 struct uvh_si_addr_map_config_s { 1594 struct uvh_scratch5_s {
1003 unsigned long m_skt : 6; /* RW */ 1595 unsigned long scratch5 : 64; /* RW, W1CS */
1004 unsigned long rsvd_6_7: 2; /* */
1005 unsigned long n_skt : 4; /* RW */
1006 unsigned long rsvd_12_63: 52; /* */
1007 } s; 1596 } s;
1008}; 1597};
1009 1598
1010/* ========================================================================= */ 1599/* ========================================================================= */
1011/* UVH_SI_ALIAS0_OVERLAY_CONFIG */ 1600/* UV2H_EVENT_OCCURRED2 */
1012/* ========================================================================= */ 1601/* ========================================================================= */
1013#define UVH_SI_ALIAS0_OVERLAY_CONFIG 0xc80008UL 1602#define UV2H_EVENT_OCCURRED2 0x70100UL
1014 1603#define UV2H_EVENT_OCCURRED2_32 0xb68
1015#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_SHFT 24 1604
1016#define UVH_SI_ALIAS0_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL 1605#define UV2H_EVENT_OCCURRED2_RTC_0_SHFT 0
1017#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_SHFT 48 1606#define UV2H_EVENT_OCCURRED2_RTC_0_MASK 0x0000000000000001UL
1018#define UVH_SI_ALIAS0_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL 1607#define UV2H_EVENT_OCCURRED2_RTC_1_SHFT 1
1019#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_SHFT 63 1608#define UV2H_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL
1020#define UVH_SI_ALIAS0_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL 1609#define UV2H_EVENT_OCCURRED2_RTC_2_SHFT 2
1021 1610#define UV2H_EVENT_OCCURRED2_RTC_2_MASK 0x0000000000000004UL
1022union uvh_si_alias0_overlay_config_u { 1611#define UV2H_EVENT_OCCURRED2_RTC_3_SHFT 3
1612#define UV2H_EVENT_OCCURRED2_RTC_3_MASK 0x0000000000000008UL
1613#define UV2H_EVENT_OCCURRED2_RTC_4_SHFT 4
1614#define UV2H_EVENT_OCCURRED2_RTC_4_MASK 0x0000000000000010UL
1615#define UV2H_EVENT_OCCURRED2_RTC_5_SHFT 5
1616#define UV2H_EVENT_OCCURRED2_RTC_5_MASK 0x0000000000000020UL
1617#define UV2H_EVENT_OCCURRED2_RTC_6_SHFT 6
1618#define UV2H_EVENT_OCCURRED2_RTC_6_MASK 0x0000000000000040UL
1619#define UV2H_EVENT_OCCURRED2_RTC_7_SHFT 7
1620#define UV2H_EVENT_OCCURRED2_RTC_7_MASK 0x0000000000000080UL
1621#define UV2H_EVENT_OCCURRED2_RTC_8_SHFT 8
1622#define UV2H_EVENT_OCCURRED2_RTC_8_MASK 0x0000000000000100UL
1623#define UV2H_EVENT_OCCURRED2_RTC_9_SHFT 9
1624#define UV2H_EVENT_OCCURRED2_RTC_9_MASK 0x0000000000000200UL
1625#define UV2H_EVENT_OCCURRED2_RTC_10_SHFT 10
1626#define UV2H_EVENT_OCCURRED2_RTC_10_MASK 0x0000000000000400UL
1627#define UV2H_EVENT_OCCURRED2_RTC_11_SHFT 11
1628#define UV2H_EVENT_OCCURRED2_RTC_11_MASK 0x0000000000000800UL
1629#define UV2H_EVENT_OCCURRED2_RTC_12_SHFT 12
1630#define UV2H_EVENT_OCCURRED2_RTC_12_MASK 0x0000000000001000UL
1631#define UV2H_EVENT_OCCURRED2_RTC_13_SHFT 13
1632#define UV2H_EVENT_OCCURRED2_RTC_13_MASK 0x0000000000002000UL
1633#define UV2H_EVENT_OCCURRED2_RTC_14_SHFT 14
1634#define UV2H_EVENT_OCCURRED2_RTC_14_MASK 0x0000000000004000UL
1635#define UV2H_EVENT_OCCURRED2_RTC_15_SHFT 15
1636#define UV2H_EVENT_OCCURRED2_RTC_15_MASK 0x0000000000008000UL
1637#define UV2H_EVENT_OCCURRED2_RTC_16_SHFT 16
1638#define UV2H_EVENT_OCCURRED2_RTC_16_MASK 0x0000000000010000UL
1639#define UV2H_EVENT_OCCURRED2_RTC_17_SHFT 17
1640#define UV2H_EVENT_OCCURRED2_RTC_17_MASK 0x0000000000020000UL
1641#define UV2H_EVENT_OCCURRED2_RTC_18_SHFT 18
1642#define UV2H_EVENT_OCCURRED2_RTC_18_MASK 0x0000000000040000UL
1643#define UV2H_EVENT_OCCURRED2_RTC_19_SHFT 19
1644#define UV2H_EVENT_OCCURRED2_RTC_19_MASK 0x0000000000080000UL
1645#define UV2H_EVENT_OCCURRED2_RTC_20_SHFT 20
1646#define UV2H_EVENT_OCCURRED2_RTC_20_MASK 0x0000000000100000UL
1647#define UV2H_EVENT_OCCURRED2_RTC_21_SHFT 21
1648#define UV2H_EVENT_OCCURRED2_RTC_21_MASK 0x0000000000200000UL
1649#define UV2H_EVENT_OCCURRED2_RTC_22_SHFT 22
1650#define UV2H_EVENT_OCCURRED2_RTC_22_MASK 0x0000000000400000UL
1651#define UV2H_EVENT_OCCURRED2_RTC_23_SHFT 23
1652#define UV2H_EVENT_OCCURRED2_RTC_23_MASK 0x0000000000800000UL
1653#define UV2H_EVENT_OCCURRED2_RTC_24_SHFT 24
1654#define UV2H_EVENT_OCCURRED2_RTC_24_MASK 0x0000000001000000UL
1655#define UV2H_EVENT_OCCURRED2_RTC_25_SHFT 25
1656#define UV2H_EVENT_OCCURRED2_RTC_25_MASK 0x0000000002000000UL
1657#define UV2H_EVENT_OCCURRED2_RTC_26_SHFT 26
1658#define UV2H_EVENT_OCCURRED2_RTC_26_MASK 0x0000000004000000UL
1659#define UV2H_EVENT_OCCURRED2_RTC_27_SHFT 27
1660#define UV2H_EVENT_OCCURRED2_RTC_27_MASK 0x0000000008000000UL
1661#define UV2H_EVENT_OCCURRED2_RTC_28_SHFT 28
1662#define UV2H_EVENT_OCCURRED2_RTC_28_MASK 0x0000000010000000UL
1663#define UV2H_EVENT_OCCURRED2_RTC_29_SHFT 29
1664#define UV2H_EVENT_OCCURRED2_RTC_29_MASK 0x0000000020000000UL
1665#define UV2H_EVENT_OCCURRED2_RTC_30_SHFT 30
1666#define UV2H_EVENT_OCCURRED2_RTC_30_MASK 0x0000000040000000UL
1667#define UV2H_EVENT_OCCURRED2_RTC_31_SHFT 31
1668#define UV2H_EVENT_OCCURRED2_RTC_31_MASK 0x0000000080000000UL
1669
1670union uv2h_event_occurred2_u {
1023 unsigned long v; 1671 unsigned long v;
1024 struct uvh_si_alias0_overlay_config_s { 1672 struct uv2h_event_occurred2_s {
1025 unsigned long rsvd_0_23: 24; /* */ 1673 unsigned long rtc_0 : 1; /* RW */
1026 unsigned long base : 8; /* RW */ 1674 unsigned long rtc_1 : 1; /* RW */
1027 unsigned long rsvd_32_47: 16; /* */ 1675 unsigned long rtc_2 : 1; /* RW */
1028 unsigned long m_alias : 5; /* RW */ 1676 unsigned long rtc_3 : 1; /* RW */
1029 unsigned long rsvd_53_62: 10; /* */ 1677 unsigned long rtc_4 : 1; /* RW */
1030 unsigned long enable : 1; /* RW */ 1678 unsigned long rtc_5 : 1; /* RW */
1031 } s; 1679 unsigned long rtc_6 : 1; /* RW */
1680 unsigned long rtc_7 : 1; /* RW */
1681 unsigned long rtc_8 : 1; /* RW */
1682 unsigned long rtc_9 : 1; /* RW */
1683 unsigned long rtc_10 : 1; /* RW */
1684 unsigned long rtc_11 : 1; /* RW */
1685 unsigned long rtc_12 : 1; /* RW */
1686 unsigned long rtc_13 : 1; /* RW */
1687 unsigned long rtc_14 : 1; /* RW */
1688 unsigned long rtc_15 : 1; /* RW */
1689 unsigned long rtc_16 : 1; /* RW */
1690 unsigned long rtc_17 : 1; /* RW */
1691 unsigned long rtc_18 : 1; /* RW */
1692 unsigned long rtc_19 : 1; /* RW */
1693 unsigned long rtc_20 : 1; /* RW */
1694 unsigned long rtc_21 : 1; /* RW */
1695 unsigned long rtc_22 : 1; /* RW */
1696 unsigned long rtc_23 : 1; /* RW */
1697 unsigned long rtc_24 : 1; /* RW */
1698 unsigned long rtc_25 : 1; /* RW */
1699 unsigned long rtc_26 : 1; /* RW */
1700 unsigned long rtc_27 : 1; /* RW */
1701 unsigned long rtc_28 : 1; /* RW */
1702 unsigned long rtc_29 : 1; /* RW */
1703 unsigned long rtc_30 : 1; /* RW */
1704 unsigned long rtc_31 : 1; /* RW */
1705 unsigned long rsvd_32_63: 32; /* */
1706 } s1;
1032}; 1707};
1033 1708
1034/* ========================================================================= */ 1709/* ========================================================================= */
1035/* UVH_SI_ALIAS1_OVERLAY_CONFIG */ 1710/* UV2H_EVENT_OCCURRED2_ALIAS */
1036/* ========================================================================= */ 1711/* ========================================================================= */
1037#define UVH_SI_ALIAS1_OVERLAY_CONFIG 0xc80010UL 1712#define UV2H_EVENT_OCCURRED2_ALIAS 0x70108UL
1713#define UV2H_EVENT_OCCURRED2_ALIAS_32 0xb70
1038 1714
1039#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_SHFT 24 1715/* ========================================================================= */
1040#define UVH_SI_ALIAS1_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL 1716/* UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 */
1041#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_SHFT 48 1717/* ========================================================================= */
1042#define UVH_SI_ALIAS1_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL 1718#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2 0x320130UL
1043#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_SHFT 63 1719#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_32 0x9f0
1044#define UVH_SI_ALIAS1_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1045 1720
1046union uvh_si_alias1_overlay_config_u { 1721#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_SHFT 0
1722#define UV2H_LB_BAU_SB_ACTIVATION_STATUS_2_AUX_ERROR_MASK 0xffffffffffffffffUL
1723
1724union uv2h_lb_bau_sb_activation_status_2_u {
1047 unsigned long v; 1725 unsigned long v;
1048 struct uvh_si_alias1_overlay_config_s { 1726 struct uv2h_lb_bau_sb_activation_status_2_s {
1049 unsigned long rsvd_0_23: 24; /* */ 1727 unsigned long aux_error : 64; /* RW */
1050 unsigned long base : 8; /* RW */ 1728 } s1;
1051 unsigned long rsvd_32_47: 16; /* */
1052 unsigned long m_alias : 5; /* RW */
1053 unsigned long rsvd_53_62: 10; /* */
1054 unsigned long enable : 1; /* RW */
1055 } s;
1056}; 1729};
1057 1730
1058/* ========================================================================= */ 1731/* ========================================================================= */
1059/* UVH_SI_ALIAS2_OVERLAY_CONFIG */ 1732/* UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK */
1060/* ========================================================================= */ 1733/* ========================================================================= */
1061#define UVH_SI_ALIAS2_OVERLAY_CONFIG 0xc80018UL 1734#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK 0x320130UL
1735#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_32 0x9f0
1062 1736
1063#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_SHFT 24 1737#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_SHFT 0
1064#define UVH_SI_ALIAS2_OVERLAY_CONFIG_BASE_MASK 0x00000000ff000000UL 1738#define UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK_BIT_ENABLES_MASK 0x00000000ffffffffUL
1065#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_SHFT 48
1066#define UVH_SI_ALIAS2_OVERLAY_CONFIG_M_ALIAS_MASK 0x001f000000000000UL
1067#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_SHFT 63
1068#define UVH_SI_ALIAS2_OVERLAY_CONFIG_ENABLE_MASK 0x8000000000000000UL
1069 1739
1070union uvh_si_alias2_overlay_config_u { 1740union uv1h_lb_target_physical_apic_id_mask_u {
1071 unsigned long v; 1741 unsigned long v;
1072 struct uvh_si_alias2_overlay_config_s { 1742 struct uv1h_lb_target_physical_apic_id_mask_s {
1073 unsigned long rsvd_0_23: 24; /* */ 1743 unsigned long bit_enables : 32; /* RW */
1074 unsigned long base : 8; /* RW */ 1744 unsigned long rsvd_32_63 : 32; /* */
1075 unsigned long rsvd_32_47: 16; /* */ 1745 } s1;
1076 unsigned long m_alias : 5; /* RW */
1077 unsigned long rsvd_53_62: 10; /* */
1078 unsigned long enable : 1; /* RW */
1079 } s;
1080}; 1746};
1081 1747
1082 1748
1083#endif /* _ASM_X86_UV_UV_MMRS_H */ 1749#endif /* __ASM_UV_MMRS_X86_H__ */
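
The file that replaces the old UVH_SI_* definitions above keeps one access pattern throughout: every MMR gets a union with a raw 64-bit view plus a bitfield view, alongside explicit _SHFT/_MASK macros for callers that avoid bitfields. A minimal standalone sketch of how such a definition is consumed; the demo_* union and the register value are made-up stand-ins, and it assumes a 64-bit unsigned long as the real header does:

#include <stdio.h>

/* Stand-in for a UV-style MMR union: raw 64-bit view plus bitfields. */
union demo_event_occurred2_u {
	unsigned long v;                        /* raw MMR value */
	struct {
		unsigned long rtc_0     : 1;    /* RW */
		unsigned long rtc_1     : 1;    /* RW */
		unsigned long rsvd_2_63 : 62;   /*    */
	} s1;
};

#define DEMO_EVENT_OCCURRED2_RTC_1_SHFT 1
#define DEMO_EVENT_OCCURRED2_RTC_1_MASK 0x0000000000000002UL

int main(void)
{
	union demo_event_occurred2_u ev = { .v = 0x2UL };  /* pretend MMR read */

	/* Decode through the bitfield view ... */
	printf("rtc_1 via bitfield: %lu\n", (unsigned long)ev.s1.rtc_1);

	/* ... or through the shift/mask macros, which do not depend on the
	 * compiler's bitfield layout. */
	printf("rtc_1 via mask:     %lu\n",
	       (ev.v & DEMO_EVENT_OCCURRED2_RTC_1_MASK) >>
	       DEMO_EVENT_OCCURRED2_RTC_1_SHFT);
	return 0;
}
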
diff --git a/arch/x86/include/asm/vdso.h b/arch/x86/include/asm/vdso.h
index 9064052b73de..bb0522850b74 100644
--- a/arch/x86/include/asm/vdso.h
+++ b/arch/x86/include/asm/vdso.h
@@ -1,20 +1,6 @@
1#ifndef _ASM_X86_VDSO_H 1#ifndef _ASM_X86_VDSO_H
2#define _ASM_X86_VDSO_H 2#define _ASM_X86_VDSO_H
3 3
4#ifdef CONFIG_X86_64
5extern const char VDSO64_PRELINK[];
6
7/*
8 * Given a pointer to the vDSO image, find the pointer to VDSO64_name
9 * as that symbol is defined in the vDSO sources or linker script.
10 */
11#define VDSO64_SYMBOL(base, name) \
12({ \
13 extern const char VDSO64_##name[]; \
14 (void *)(VDSO64_##name - VDSO64_PRELINK + (unsigned long)(base)); \
15})
16#endif
17
18#if defined CONFIG_X86_32 || defined CONFIG_COMPAT 4#if defined CONFIG_X86_32 || defined CONFIG_COMPAT
19extern const char VDSO32_PRELINK[]; 5extern const char VDSO32_PRELINK[];
20 6
diff --git a/arch/x86/include/asm/vgtod.h b/arch/x86/include/asm/vgtod.h
index 3d61e204826f..646b4c1ca695 100644
--- a/arch/x86/include/asm/vgtod.h
+++ b/arch/x86/include/asm/vgtod.h
@@ -23,8 +23,6 @@ struct vsyscall_gtod_data {
23 struct timespec wall_to_monotonic; 23 struct timespec wall_to_monotonic;
24 struct timespec wall_time_coarse; 24 struct timespec wall_time_coarse;
25}; 25};
26extern struct vsyscall_gtod_data __vsyscall_gtod_data
27__section_vsyscall_gtod_data;
28extern struct vsyscall_gtod_data vsyscall_gtod_data; 26extern struct vsyscall_gtod_data vsyscall_gtod_data;
29 27
30#endif /* _ASM_X86_VGTOD_H */ 28#endif /* _ASM_X86_VGTOD_H */
diff --git a/arch/x86/include/asm/vmi.h b/arch/x86/include/asm/vmi.h
deleted file mode 100644
index 61e08c0a2907..000000000000
--- a/arch/x86/include/asm/vmi.h
+++ /dev/null
@@ -1,269 +0,0 @@
1/*
2 * VMI interface definition
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Maintained by: Zachary Amsden zach@vmware.com
22 *
23 */
24#include <linux/types.h>
25
26/*
27 *---------------------------------------------------------------------
28 *
29 * VMI Option ROM API
30 *
31 *---------------------------------------------------------------------
32 */
33#define VMI_SIGNATURE 0x696d5663 /* "cVmi" */
34
35#define PCI_VENDOR_ID_VMWARE 0x15AD
36#define PCI_DEVICE_ID_VMWARE_VMI 0x0801
37
38/*
39 * We use two version numbers for compatibility, with the major
40 * number signifying interface breakages, and the minor number
41 * interface extensions.
42 */
43#define VMI_API_REV_MAJOR 3
44#define VMI_API_REV_MINOR 0
45
46#define VMI_CALL_CPUID 0
47#define VMI_CALL_WRMSR 1
48#define VMI_CALL_RDMSR 2
49#define VMI_CALL_SetGDT 3
50#define VMI_CALL_SetLDT 4
51#define VMI_CALL_SetIDT 5
52#define VMI_CALL_SetTR 6
53#define VMI_CALL_GetGDT 7
54#define VMI_CALL_GetLDT 8
55#define VMI_CALL_GetIDT 9
56#define VMI_CALL_GetTR 10
57#define VMI_CALL_WriteGDTEntry 11
58#define VMI_CALL_WriteLDTEntry 12
59#define VMI_CALL_WriteIDTEntry 13
60#define VMI_CALL_UpdateKernelStack 14
61#define VMI_CALL_SetCR0 15
62#define VMI_CALL_SetCR2 16
63#define VMI_CALL_SetCR3 17
64#define VMI_CALL_SetCR4 18
65#define VMI_CALL_GetCR0 19
66#define VMI_CALL_GetCR2 20
67#define VMI_CALL_GetCR3 21
68#define VMI_CALL_GetCR4 22
69#define VMI_CALL_WBINVD 23
70#define VMI_CALL_SetDR 24
71#define VMI_CALL_GetDR 25
72#define VMI_CALL_RDPMC 26
73#define VMI_CALL_RDTSC 27
74#define VMI_CALL_CLTS 28
75#define VMI_CALL_EnableInterrupts 29
76#define VMI_CALL_DisableInterrupts 30
77#define VMI_CALL_GetInterruptMask 31
78#define VMI_CALL_SetInterruptMask 32
79#define VMI_CALL_IRET 33
80#define VMI_CALL_SYSEXIT 34
81#define VMI_CALL_Halt 35
82#define VMI_CALL_Reboot 36
83#define VMI_CALL_Shutdown 37
84#define VMI_CALL_SetPxE 38
85#define VMI_CALL_SetPxELong 39
86#define VMI_CALL_UpdatePxE 40
87#define VMI_CALL_UpdatePxELong 41
88#define VMI_CALL_MachineToPhysical 42
89#define VMI_CALL_PhysicalToMachine 43
90#define VMI_CALL_AllocatePage 44
91#define VMI_CALL_ReleasePage 45
92#define VMI_CALL_InvalPage 46
93#define VMI_CALL_FlushTLB 47
94#define VMI_CALL_SetLinearMapping 48
95
96#define VMI_CALL_SetIOPLMask 61
97#define VMI_CALL_SetInitialAPState 62
98#define VMI_CALL_APICWrite 63
99#define VMI_CALL_APICRead 64
100#define VMI_CALL_IODelay 65
101#define VMI_CALL_SetLazyMode 73
102
103/*
104 *---------------------------------------------------------------------
105 *
106 * MMU operation flags
107 *
108 *---------------------------------------------------------------------
109 */
110
111/* Flags used by VMI_{Allocate|Release}Page call */
112#define VMI_PAGE_PAE 0x10 /* Allocate PAE shadow */
113#define VMI_PAGE_CLONE 0x20 /* Clone from another shadow */
114#define VMI_PAGE_ZEROED 0x40 /* Page is pre-zeroed */
115
116
117/* Flags shared by Allocate|Release Page and PTE updates */
118#define VMI_PAGE_PT 0x01
119#define VMI_PAGE_PD 0x02
120#define VMI_PAGE_PDP 0x04
121#define VMI_PAGE_PML4 0x08
122
123#define VMI_PAGE_NORMAL 0x00 /* for debugging */
124
125/* Flags used by PTE updates */
126#define VMI_PAGE_CURRENT_AS 0x10 /* implies VMI_PAGE_VA_MASK is valid */
127#define VMI_PAGE_DEFER 0x20 /* may queue update until TLB inval */
128#define VMI_PAGE_VA_MASK 0xfffff000
129
130#ifdef CONFIG_X86_PAE
131#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
132#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_PAE | VMI_PAGE_ZEROED)
133#else
134#define VMI_PAGE_L1 (VMI_PAGE_PT | VMI_PAGE_ZEROED)
135#define VMI_PAGE_L2 (VMI_PAGE_PD | VMI_PAGE_ZEROED)
136#endif
137
138/* Flags used by VMI_FlushTLB call */
139#define VMI_FLUSH_TLB 0x01
140#define VMI_FLUSH_GLOBAL 0x02
141
142/*
143 *---------------------------------------------------------------------
144 *
145 * VMI relocation definitions for ROM call get_reloc
146 *
147 *---------------------------------------------------------------------
148 */
149
150/* VMI Relocation types */
151#define VMI_RELOCATION_NONE 0
152#define VMI_RELOCATION_CALL_REL 1
153#define VMI_RELOCATION_JUMP_REL 2
154#define VMI_RELOCATION_NOP 3
155
156#ifndef __ASSEMBLY__
157struct vmi_relocation_info {
158 unsigned char *eip;
159 unsigned char type;
160 unsigned char reserved[3];
161};
162#endif
163
164
165/*
166 *---------------------------------------------------------------------
167 *
168 * Generic ROM structures and definitions
169 *
170 *---------------------------------------------------------------------
171 */
172
173#ifndef __ASSEMBLY__
174
175struct vrom_header {
176 u16 rom_signature; /* option ROM signature */
177 u8 rom_length; /* ROM length in 512 byte chunks */
178 u8 rom_entry[4]; /* 16-bit code entry point */
179 u8 rom_pad0; /* 4-byte align pad */
180 u32 vrom_signature; /* VROM identification signature */
181 u8 api_version_min;/* Minor version of API */
182 u8 api_version_maj;/* Major version of API */
183 u8 jump_slots; /* Number of jump slots */
184 u8 reserved1; /* Reserved for expansion */
185 u32 virtual_top; /* Hypervisor virtual address start */
186 u16 reserved2; /* Reserved for expansion */
187 u16 license_offs; /* Offset to License string */
188 u16 pci_header_offs;/* Offset to PCI OPROM header */
189 u16 pnp_header_offs;/* Offset to PnP OPROM header */
 190 u32 rom_pad3; /* PnP reserved / VMI reserved */
191 u8 reserved[96]; /* Reserved for headers */
192 char vmi_init[8]; /* VMI_Init jump point */
193 char get_reloc[8]; /* VMI_GetRelocationInfo jump point */
194} __attribute__((packed));
195
196struct pnp_header {
197 char sig[4];
198 char rev;
199 char size;
200 short next;
201 short res;
202 long devID;
203 unsigned short manufacturer_offset;
204 unsigned short product_offset;
205} __attribute__((packed));
206
207struct pci_header {
208 char sig[4];
209 short vendorID;
210 short deviceID;
211 short vpdData;
212 short size;
213 char rev;
214 char class;
215 char subclass;
216 char interface;
217 short chunks;
218 char rom_version_min;
219 char rom_version_maj;
220 char codetype;
221 char lastRom;
222 short reserved;
223} __attribute__((packed));
224
225/* Function prototypes for bootstrapping */
226#ifdef CONFIG_VMI
227extern void vmi_init(void);
228extern void vmi_activate(void);
229extern void vmi_bringup(void);
230#else
231static inline void vmi_init(void) {}
232static inline void vmi_activate(void) {}
233static inline void vmi_bringup(void) {}
234#endif
235
236/* State needed to start an application processor in an SMP system. */
237struct vmi_ap_state {
238 u32 cr0;
239 u32 cr2;
240 u32 cr3;
241 u32 cr4;
242
243 u64 efer;
244
245 u32 eip;
246 u32 eflags;
247 u32 eax;
248 u32 ebx;
249 u32 ecx;
250 u32 edx;
251 u32 esp;
252 u32 ebp;
253 u32 esi;
254 u32 edi;
255 u16 cs;
256 u16 ss;
257 u16 ds;
258 u16 es;
259 u16 fs;
260 u16 gs;
261 u16 ldtr;
262
263 u16 gdtr_limit;
264 u32 gdtr_base;
265 u32 idtr_base;
266 u16 idtr_limit;
267};
268
269#endif
diff --git a/arch/x86/include/asm/vmi_time.h b/arch/x86/include/asm/vmi_time.h
deleted file mode 100644
index c6e0bee93e3c..000000000000
--- a/arch/x86/include/asm/vmi_time.h
+++ /dev/null
@@ -1,98 +0,0 @@
1/*
2 * VMI Time wrappers
3 *
4 * Copyright (C) 2006, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to dhecht@vmware.com
22 *
23 */
24
25#ifndef _ASM_X86_VMI_TIME_H
26#define _ASM_X86_VMI_TIME_H
27
28/*
29 * Raw VMI call indices for timer functions
30 */
31#define VMI_CALL_GetCycleFrequency 66
32#define VMI_CALL_GetCycleCounter 67
33#define VMI_CALL_SetAlarm 68
34#define VMI_CALL_CancelAlarm 69
35#define VMI_CALL_GetWallclockTime 70
36#define VMI_CALL_WallclockUpdated 71
37
38/* Cached VMI timer operations */
39extern struct vmi_timer_ops {
40 u64 (*get_cycle_frequency)(void);
41 u64 (*get_cycle_counter)(int);
42 u64 (*get_wallclock)(void);
43 int (*wallclock_updated)(void);
44 void (*set_alarm)(u32 flags, u64 expiry, u64 period);
45 void (*cancel_alarm)(u32 flags);
46} vmi_timer_ops;
47
48/* Prototypes */
49extern void __init vmi_time_init(void);
50extern unsigned long vmi_get_wallclock(void);
51extern int vmi_set_wallclock(unsigned long now);
52extern unsigned long long vmi_sched_clock(void);
53extern unsigned long vmi_tsc_khz(void);
54
55#ifdef CONFIG_X86_LOCAL_APIC
56extern void __devinit vmi_time_bsp_init(void);
57extern void __devinit vmi_time_ap_init(void);
58#endif
59
60/*
61 * When run under a hypervisor, a vcpu is always in one of three states:
62 * running, halted, or ready. The vcpu is in the 'running' state if it
63 * is executing. When the vcpu executes the halt interface, the vcpu
64 * enters the 'halted' state and remains halted until there is some work
65 * pending for the vcpu (e.g. an alarm expires, host I/O completes on
66 * behalf of virtual I/O). At this point, the vcpu enters the 'ready'
67 * state (waiting for the hypervisor to reschedule it). Finally, at any
68 * time when the vcpu is not in the 'running' state nor the 'halted'
69 * state, it is in the 'ready' state.
70 *
 71 * Real time advances while the vcpu is 'running', 'ready', or
72 * 'halted'. Stolen time is the time in which the vcpu is in the
73 * 'ready' state. Available time is the remaining time -- the vcpu is
74 * either 'running' or 'halted'.
75 *
76 * All three views of time are accessible through the VMI cycle
77 * counters.
78 */
79
80/* The cycle counters. */
81#define VMI_CYCLES_REAL 0
82#define VMI_CYCLES_AVAILABLE 1
83#define VMI_CYCLES_STOLEN 2
84
85/* The alarm interface 'flags' bits */
86#define VMI_ALARM_COUNTERS 2
87
88#define VMI_ALARM_COUNTER_MASK 0x000000ff
89
90#define VMI_ALARM_WIRED_IRQ0 0x00000000
91#define VMI_ALARM_WIRED_LVTT 0x00010000
92
93#define VMI_ALARM_IS_ONESHOT 0x00000000
94#define VMI_ALARM_IS_PERIODIC 0x00000100
95
96#define CONFIG_VMI_ALARM_HZ 100
97
98#endif /* _ASM_X86_VMI_TIME_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 9f0cbd987d50..84471b810460 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -66,15 +66,23 @@
66#define PIN_BASED_NMI_EXITING 0x00000008 66#define PIN_BASED_NMI_EXITING 0x00000008
67#define PIN_BASED_VIRTUAL_NMIS 0x00000020 67#define PIN_BASED_VIRTUAL_NMIS 0x00000020
68 68
69#define VM_EXIT_SAVE_DEBUG_CONTROLS 0x00000002
69#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200 70#define VM_EXIT_HOST_ADDR_SPACE_SIZE 0x00000200
71#define VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL 0x00001000
70#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000 72#define VM_EXIT_ACK_INTR_ON_EXIT 0x00008000
71#define VM_EXIT_SAVE_IA32_PAT 0x00040000 73#define VM_EXIT_SAVE_IA32_PAT 0x00040000
72#define VM_EXIT_LOAD_IA32_PAT 0x00080000 74#define VM_EXIT_LOAD_IA32_PAT 0x00080000
75#define VM_EXIT_SAVE_IA32_EFER 0x00100000
76#define VM_EXIT_LOAD_IA32_EFER 0x00200000
77#define VM_EXIT_SAVE_VMX_PREEMPTION_TIMER 0x00400000
73 78
79#define VM_ENTRY_LOAD_DEBUG_CONTROLS 0x00000002
74#define VM_ENTRY_IA32E_MODE 0x00000200 80#define VM_ENTRY_IA32E_MODE 0x00000200
75#define VM_ENTRY_SMM 0x00000400 81#define VM_ENTRY_SMM 0x00000400
76#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800 82#define VM_ENTRY_DEACT_DUAL_MONITOR 0x00000800
83#define VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL 0x00002000
77#define VM_ENTRY_LOAD_IA32_PAT 0x00004000 84#define VM_ENTRY_LOAD_IA32_PAT 0x00004000
85#define VM_ENTRY_LOAD_IA32_EFER 0x00008000
78 86
79/* VMCS Encodings */ 87/* VMCS Encodings */
80enum vmcs_field { 88enum vmcs_field {
@@ -239,6 +247,7 @@ enum vmcs_field {
239#define EXIT_REASON_TASK_SWITCH 9 247#define EXIT_REASON_TASK_SWITCH 9
240#define EXIT_REASON_CPUID 10 248#define EXIT_REASON_CPUID 10
241#define EXIT_REASON_HLT 12 249#define EXIT_REASON_HLT 12
250#define EXIT_REASON_INVD 13
242#define EXIT_REASON_INVLPG 14 251#define EXIT_REASON_INVLPG 14
243#define EXIT_REASON_RDPMC 15 252#define EXIT_REASON_RDPMC 15
244#define EXIT_REASON_RDTSC 16 253#define EXIT_REASON_RDTSC 16
@@ -296,6 +305,12 @@ enum vmcs_field {
296#define GUEST_INTR_STATE_SMI 0x00000004 305#define GUEST_INTR_STATE_SMI 0x00000004
297#define GUEST_INTR_STATE_NMI 0x00000008 306#define GUEST_INTR_STATE_NMI 0x00000008
298 307
308/* GUEST_ACTIVITY_STATE flags */
309#define GUEST_ACTIVITY_ACTIVE 0
310#define GUEST_ACTIVITY_HLT 1
311#define GUEST_ACTIVITY_SHUTDOWN 2
312#define GUEST_ACTIVITY_WAIT_SIPI 3
313
299/* 314/*
300 * Exit Qualifications for MOV for Control Register Access 315 * Exit Qualifications for MOV for Control Register Access
301 */ 316 */
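
The new VM-exit/VM-entry control bits added above (for example VM_EXIT_SAVE_IA32_EFER and VM_EXIT_LOAD_IA32_EFER) are plain flags tested against the corresponding VMCS controls word. A small standalone sketch of that decode; only the two constants are taken from the header, the controls value itself is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define VM_EXIT_SAVE_IA32_EFER 0x00100000u
#define VM_EXIT_LOAD_IA32_EFER 0x00200000u

int main(void)
{
	uint32_t vm_exit_controls = 0x00300200u;  /* hypothetical VMCS value */

	if (vm_exit_controls & VM_EXIT_SAVE_IA32_EFER)
		puts("guest EFER is saved on VM exit");
	if (vm_exit_controls & VM_EXIT_LOAD_IA32_EFER)
		puts("host EFER is loaded on VM exit");
	return 0;
}
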
diff --git a/arch/x86/include/asm/vsyscall.h b/arch/x86/include/asm/vsyscall.h
index d0983d255fbd..d55597351f6a 100644
--- a/arch/x86/include/asm/vsyscall.h
+++ b/arch/x86/include/asm/vsyscall.h
@@ -16,27 +16,19 @@ enum vsyscall_num {
16#ifdef __KERNEL__ 16#ifdef __KERNEL__
17#include <linux/seqlock.h> 17#include <linux/seqlock.h>
18 18
19#define __section_vgetcpu_mode __attribute__ ((unused, __section__ (".vgetcpu_mode"), aligned(16)))
20#define __section_jiffies __attribute__ ((unused, __section__ (".jiffies"), aligned(16)))
21
22/* Definitions for CONFIG_GENERIC_TIME definitions */ 19/* Definitions for CONFIG_GENERIC_TIME definitions */
23#define __section_vsyscall_gtod_data __attribute__ \
24 ((unused, __section__ (".vsyscall_gtod_data"),aligned(16)))
25#define __section_vsyscall_clock __attribute__ \
26 ((unused, __section__ (".vsyscall_clock"),aligned(16)))
27#define __vsyscall_fn \ 20#define __vsyscall_fn \
28 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace 21 __attribute__ ((unused, __section__(".vsyscall_fn"))) notrace
29 22
30#define VGETCPU_RDTSCP 1 23#define VGETCPU_RDTSCP 1
31#define VGETCPU_LSL 2 24#define VGETCPU_LSL 2
32 25
33extern int __vgetcpu_mode;
34extern volatile unsigned long __jiffies;
35
36/* kernel space (writeable) */ 26/* kernel space (writeable) */
37extern int vgetcpu_mode; 27extern int vgetcpu_mode;
38extern struct timezone sys_tz; 28extern struct timezone sys_tz;
39 29
30#include <asm/vvar.h>
31
40extern void map_vsyscall(void); 32extern void map_vsyscall(void);
41 33
42#endif /* __KERNEL__ */ 34#endif /* __KERNEL__ */
diff --git a/arch/x86/include/asm/vvar.h b/arch/x86/include/asm/vvar.h
new file mode 100644
index 000000000000..341b3559452b
--- /dev/null
+++ b/arch/x86/include/asm/vvar.h
@@ -0,0 +1,52 @@
1/*
2 * vvar.h: Shared vDSO/kernel variable declarations
3 * Copyright (c) 2011 Andy Lutomirski
4 * Subject to the GNU General Public License, version 2
5 *
6 * A handful of variables are accessible (read-only) from userspace
7 * code in the vsyscall page and the vdso. They are declared here.
8 * Some other file must define them with DEFINE_VVAR.
9 *
10 * In normal kernel code, they are used like any other variable.
11 * In user code, they are accessed through the VVAR macro.
12 *
13 * Each of these variables lives in the vsyscall page, and each
14 * one needs a unique offset within the little piece of the page
15 * reserved for vvars. Specify that offset in DECLARE_VVAR.
16 * (There are 896 bytes available. If you mess up, the linker will
17 * catch it.)
18 */
19
20/* Offset of vars within vsyscall page */
21#define VSYSCALL_VARS_OFFSET (3072 + 128)
22
23#if defined(__VVAR_KERNEL_LDS)
24
25/* The kernel linker script defines its own magic to put vvars in the
26 * right place.
27 */
28#define DECLARE_VVAR(offset, type, name) \
29 EMIT_VVAR(name, VSYSCALL_VARS_OFFSET + offset)
30
31#else
32
33#define DECLARE_VVAR(offset, type, name) \
34 static type const * const vvaraddr_ ## name = \
35 (void *)(VSYSCALL_START + VSYSCALL_VARS_OFFSET + (offset));
36
37#define DEFINE_VVAR(type, name) \
38 type __vvar_ ## name \
39 __attribute__((section(".vsyscall_var_" #name), aligned(16)))
40
41#define VVAR(name) (*vvaraddr_ ## name)
42
43#endif
44
45/* DECLARE_VVAR(offset, type, name) */
46
47DECLARE_VVAR(0, volatile unsigned long, jiffies)
48DECLARE_VVAR(8, int, vgetcpu_mode)
49DECLARE_VVAR(128, struct vsyscall_gtod_data, vsyscall_gtod_data)
50
51#undef DECLARE_VVAR
52#undef VSYSCALL_VARS_OFFSET
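
The DECLARE_VVAR/DEFINE_VVAR/VVAR machinery above pins each shared variable at a fixed offset inside the vsyscall page so user code can read it without a syscall. A rough user-space imitation of the idea, with a plain array standing in for the vsyscall page and demo_* names for everything that is not in the header:

#include <stdio.h>

/* Stand-in for the vsyscall page; the real one is placed at a fixed,
 * user-readable address by the kernel linker script. */
static unsigned long demo_vsyscall_page[512];             /* 4096 bytes */
#define DEMO_VARS_OFFSET (3072 + 128)

/* User-side view: a named, read-only pointer at a fixed offset. */
#define DECLARE_VVAR(offset, type, name)                                \
	static type const * const vvaraddr_ ## name =                   \
		(const type *)((const char *)demo_vsyscall_page +       \
			       DEMO_VARS_OFFSET + (offset));

#define VVAR(name) (*vvaraddr_ ## name)

DECLARE_VVAR(0, volatile unsigned long, jiffies)

int main(void)
{
	/* "Kernel" side: write through its own writable alias of the page. */
	*(unsigned long *)((char *)demo_vsyscall_page + DEMO_VARS_OFFSET) = 100000;

	/* "User" side: read through the VVAR() accessor. */
	printf("jiffies = %lu\n", VVAR(jiffies));
	return 0;
}
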
diff --git a/arch/x86/include/asm/x2apic.h b/arch/x86/include/asm/x2apic.h
new file mode 100644
index 000000000000..6bf5b8e478c0
--- /dev/null
+++ b/arch/x86/include/asm/x2apic.h
@@ -0,0 +1,62 @@
1/*
2 * Common bits for X2APIC cluster/physical modes.
3 */
4
5#ifndef _ASM_X86_X2APIC_H
6#define _ASM_X86_X2APIC_H
7
8#include <asm/apic.h>
9#include <asm/ipi.h>
10#include <linux/cpumask.h>
11
12/*
13 * Need to use more than cpu 0, because we need more vectors
14 * when MSI-X are used.
15 */
16static const struct cpumask *x2apic_target_cpus(void)
17{
18 return cpu_online_mask;
19}
20
21static int x2apic_apic_id_registered(void)
22{
23 return 1;
24}
25
26/*
27 * For now each logical cpu is in its own vector allocation domain.
28 */
29static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
30{
31 cpumask_clear(retmask);
32 cpumask_set_cpu(cpu, retmask);
33}
34
35static void
36__x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest)
37{
38 unsigned long cfg = __prepare_ICR(0, vector, dest);
39 native_x2apic_icr_write(cfg, apicid);
40}
41
42static unsigned int x2apic_get_apic_id(unsigned long id)
43{
44 return id;
45}
46
47static unsigned long x2apic_set_apic_id(unsigned int id)
48{
49 return id;
50}
51
52static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
53{
54 return initial_apicid >> index_msb;
55}
56
57static void x2apic_send_IPI_self(int vector)
58{
59 apic_write(APIC_SELF_IPI, vector);
60}
61
62#endif /* _ASM_X86_X2APIC_H */
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index baa579c8e038..d3d859035af9 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -68,6 +68,17 @@ struct x86_init_oem {
68}; 68};
69 69
70/** 70/**
71 * struct x86_init_mapping - platform specific initial kernel pagetable setup
72 * @pagetable_reserve: reserve a range of addresses for kernel pagetable usage
73 *
74 * For more details on the purpose of this hook, look in
75 * init_memory_mapping and the commit that added it.
76 */
77struct x86_init_mapping {
78 void (*pagetable_reserve)(u64 start, u64 end);
79};
80
81/**
71 * struct x86_init_paging - platform specific paging functions 82 * struct x86_init_paging - platform specific paging functions
72 * @pagetable_setup_start: platform specific pre paging_init() call 83 * @pagetable_setup_start: platform specific pre paging_init() call
73 * @pagetable_setup_done: platform specific post paging_init() call 84 * @pagetable_setup_done: platform specific post paging_init() call
@@ -83,11 +94,13 @@ struct x86_init_paging {
83 * boot cpu 94 * boot cpu
84 * @tsc_pre_init: platform function called before TSC init 95 * @tsc_pre_init: platform function called before TSC init
85 * @timer_init: initialize the platform timer (default PIT/HPET) 96 * @timer_init: initialize the platform timer (default PIT/HPET)
97 * @wallclock_init: init the wallclock device
86 */ 98 */
87struct x86_init_timers { 99struct x86_init_timers {
88 void (*setup_percpu_clockev)(void); 100 void (*setup_percpu_clockev)(void);
89 void (*tsc_pre_init)(void); 101 void (*tsc_pre_init)(void);
90 void (*timer_init)(void); 102 void (*timer_init)(void);
103 void (*wallclock_init)(void);
91}; 104};
92 105
93/** 106/**
@@ -121,6 +134,7 @@ struct x86_init_ops {
121 struct x86_init_mpparse mpparse; 134 struct x86_init_mpparse mpparse;
122 struct x86_init_irqs irqs; 135 struct x86_init_irqs irqs;
123 struct x86_init_oem oem; 136 struct x86_init_oem oem;
137 struct x86_init_mapping mapping;
124 struct x86_init_paging paging; 138 struct x86_init_paging paging;
125 struct x86_init_timers timers; 139 struct x86_init_timers timers;
126 struct x86_init_iommu iommu; 140 struct x86_init_iommu iommu;
@@ -154,9 +168,18 @@ struct x86_platform_ops {
154 int (*i8042_detect)(void); 168 int (*i8042_detect)(void);
155}; 169};
156 170
171struct pci_dev;
172
173struct x86_msi_ops {
174 int (*setup_msi_irqs)(struct pci_dev *dev, int nvec, int type);
175 void (*teardown_msi_irq)(unsigned int irq);
176 void (*teardown_msi_irqs)(struct pci_dev *dev);
177};
178
157extern struct x86_init_ops x86_init; 179extern struct x86_init_ops x86_init;
158extern struct x86_cpuinit_ops x86_cpuinit; 180extern struct x86_cpuinit_ops x86_cpuinit;
159extern struct x86_platform_ops x86_platform; 181extern struct x86_platform_ops x86_platform;
182extern struct x86_msi_ops x86_msi;
160 183
161extern void x86_init_noop(void); 184extern void x86_init_noop(void);
162extern void x86_init_uint_noop(unsigned int unused); 185extern void x86_init_uint_noop(unsigned int unused);
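
The new x86_init.mapping and x86_msi hooks extend the existing pattern of this header: a struct of function pointers initialized to safe defaults, which a platform (Xen, for instance) overrides early in boot. A compact standalone sketch of that pattern; the demo_* names and the xen_pagetable_reserve stand-in are illustrative only:

#include <stdio.h>

struct demo_init_mapping {
	void (*pagetable_reserve)(unsigned long long start,
				  unsigned long long end);
};

static void default_pagetable_reserve(unsigned long long start,
				      unsigned long long end)
{
	printf("default reserve: %#llx-%#llx\n", start, end);
}

static void xen_pagetable_reserve(unsigned long long start,
				  unsigned long long end)
{
	printf("xen reserve:     %#llx-%#llx\n", start, end);
}

/* Safe default, compiled in unconditionally. */
static struct demo_init_mapping demo_mapping = {
	.pagetable_reserve = default_pagetable_reserve,
};

int main(void)
{
	demo_mapping.pagetable_reserve(0x100000, 0x200000);

	/* A paravirt platform swaps in its own hook during early boot. */
	demo_mapping.pagetable_reserve = xen_pagetable_reserve;
	demo_mapping.pagetable_reserve(0x100000, 0x200000);
	return 0;
}
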
diff --git a/arch/x86/include/asm/xen/hypercall.h b/arch/x86/include/asm/xen/hypercall.h
index 7fda040a76cd..d240ea950519 100644
--- a/arch/x86/include/asm/xen/hypercall.h
+++ b/arch/x86/include/asm/xen/hypercall.h
@@ -200,6 +200,23 @@ extern struct { char _entry[32]; } hypercall_page[];
200 (type)__res; \ 200 (type)__res; \
201}) 201})
202 202
203static inline long
204privcmd_call(unsigned call,
205 unsigned long a1, unsigned long a2,
206 unsigned long a3, unsigned long a4,
207 unsigned long a5)
208{
209 __HYPERCALL_DECLS;
210 __HYPERCALL_5ARG(a1, a2, a3, a4, a5);
211
212 asm volatile("call *%[call]"
213 : __HYPERCALL_5PARAM
214 : [call] "a" (&hypercall_page[call])
215 : __HYPERCALL_CLOBBER5);
216
217 return (long)__res;
218}
219
203static inline int 220static inline int
204HYPERVISOR_set_trap_table(struct trap_info *table) 221HYPERVISOR_set_trap_table(struct trap_info *table)
205{ 222{
@@ -270,7 +287,7 @@ HYPERVISOR_fpu_taskswitch(int set)
270static inline int 287static inline int
271HYPERVISOR_sched_op(int cmd, void *arg) 288HYPERVISOR_sched_op(int cmd, void *arg)
272{ 289{
273 return _hypercall2(int, sched_op_new, cmd, arg); 290 return _hypercall2(int, sched_op, cmd, arg);
274} 291}
275 292
276static inline long 293static inline long
@@ -405,10 +422,17 @@ HYPERVISOR_set_segment_base(int reg, unsigned long value)
405#endif 422#endif
406 423
407static inline int 424static inline int
408HYPERVISOR_suspend(unsigned long srec) 425HYPERVISOR_suspend(unsigned long start_info_mfn)
409{ 426{
410 return _hypercall3(int, sched_op, SCHEDOP_shutdown, 427 struct sched_shutdown r = { .reason = SHUTDOWN_suspend };
411 SHUTDOWN_suspend, srec); 428
429 /*
430 * For a PV guest the tools require that the start_info mfn be
431 * present in rdx/edx when the hypercall is made. Per the
432 * hypercall calling convention this is the third hypercall
433 * argument, which is start_info_mfn here.
434 */
435 return _hypercall3(int, sched_op, SCHEDOP_shutdown, &r, start_info_mfn);
412} 436}
413 437
414static inline int 438static inline int
@@ -423,6 +447,13 @@ HYPERVISOR_hvm_op(int op, void *arg)
423 return _hypercall2(unsigned long, hvm_op, op, arg); 447 return _hypercall2(unsigned long, hvm_op, op, arg);
424} 448}
425 449
450static inline int
451HYPERVISOR_tmem_op(
452 struct tmem_op *op)
453{
454 return _hypercall1(int, tmem_op, op);
455}
456
426static inline void 457static inline void
427MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set) 458MULTI_fpu_taskswitch(struct multicall_entry *mcl, int set)
428{ 459{
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index 396ff4cc8ed4..66d0fff1ee84 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,4 +37,39 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40#include <asm/processor.h>
41
42static inline uint32_t xen_cpuid_base(void)
43{
44 uint32_t base, eax, ebx, ecx, edx;
45 char signature[13];
46
47 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
48 cpuid(base, &eax, &ebx, &ecx, &edx);
49 *(uint32_t *)(signature + 0) = ebx;
50 *(uint32_t *)(signature + 4) = ecx;
51 *(uint32_t *)(signature + 8) = edx;
52 signature[12] = 0;
53
54 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
55 return base;
56 }
57
58 return 0;
59}
60
61#ifdef CONFIG_XEN
62extern bool xen_hvm_need_lapic(void);
63
64static inline bool xen_x2apic_para_available(void)
65{
66 return xen_hvm_need_lapic();
67}
68#else
69static inline bool xen_x2apic_para_available(void)
70{
71 return (xen_cpuid_base() != 0);
72}
73#endif
74
40#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 75#endif /* _ASM_X86_XEN_HYPERVISOR_H */
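
xen_cpuid_base() above scans the hypervisor CPUID leaves 0x40000000-0x4000ff00 for the "XenVMMXenVMM" signature. The same probe can be reproduced from user space; the sketch below assumes GCC/clang's <cpuid.h> and is not part of the patch:

#include <cpuid.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t demo_xen_cpuid_base(void)
{
	uint32_t base, eax, ebx, ecx, edx;
	char signature[13];

	for (base = 0x40000000; base < 0x40010000; base += 0x100) {
		__cpuid(base, eax, ebx, ecx, edx);
		memcpy(signature + 0, &ebx, 4);
		memcpy(signature + 4, &ecx, 4);
		memcpy(signature + 8, &edx, 4);
		signature[12] = 0;

		if (!strcmp("XenVMMXenVMM", signature) && (eax - base) >= 2)
			return base;
	}
	return 0;
}

int main(void)
{
	/* Prints 0 when not running under Xen. */
	printf("xen cpuid base: %#x\n", demo_xen_cpuid_base());
	return 0;
}
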
diff --git a/arch/x86/include/asm/xen/interface.h b/arch/x86/include/asm/xen/interface.h
index e8506c1f0c55..5d4922ad4b9b 100644
--- a/arch/x86/include/asm/xen/interface.h
+++ b/arch/x86/include/asm/xen/interface.h
@@ -61,9 +61,9 @@ DEFINE_GUEST_HANDLE(void);
61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START) 61#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
62#endif 62#endif
63 63
64#ifndef machine_to_phys_mapping 64#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
65#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START) 65#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
66#endif 66#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>__MACH2PHYS_SHIFT)
67 67
68/* Maximum number of virtual CPUs in multi-processor guests. */ 68/* Maximum number of virtual CPUs in multi-processor guests. */
69#define MAX_VIRT_CPUS 32 69#define MAX_VIRT_CPUS 32
@@ -86,7 +86,7 @@ DEFINE_GUEST_HANDLE(void);
86 * The privilege level specifies which modes may enter a trap via a software 86 * The privilege level specifies which modes may enter a trap via a software
87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate 87 * interrupt. On x86/64, since rings 1 and 2 are unavailable, we allocate
88 * privilege levels as follows: 88 * privilege levels as follows:
89 * Level == 0: Noone may enter 89 * Level == 0: No one may enter
90 * Level == 1: Kernel may enter 90 * Level == 1: Kernel may enter
91 * Level == 2: Kernel may enter 91 * Level == 2: Kernel may enter
92 * Level == 3: Everyone may enter 92 * Level == 3: Everyone may enter
diff --git a/arch/x86/include/asm/xen/interface_32.h b/arch/x86/include/asm/xen/interface_32.h
index 42a7e004ae5c..8413688b2571 100644
--- a/arch/x86/include/asm/xen/interface_32.h
+++ b/arch/x86/include/asm/xen/interface_32.h
@@ -32,6 +32,11 @@
32/* And the trap vector is... */ 32/* And the trap vector is... */
33#define TRAP_INSTR "int $0x82" 33#define TRAP_INSTR "int $0x82"
34 34
35#define __MACH2PHYS_VIRT_START 0xF5800000
36#define __MACH2PHYS_VIRT_END 0xF6800000
37
38#define __MACH2PHYS_SHIFT 2
39
35/* 40/*
36 * Virtual addresses beyond this are not modifiable by guest OSes. The 41 * Virtual addresses beyond this are not modifiable by guest OSes. The
37 * machine->physical mapping table starts at this address, read-only. 42 * machine->physical mapping table starts at this address, read-only.
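
MACH2PHYS_NR_ENTRIES is derived from the window size and the per-entry size: (MACH2PHYS_VIRT_END - MACH2PHYS_VIRT_START) >> __MACH2PHYS_SHIFT, where the shift is 2 on 32-bit (4-byte entries) and 3 on 64-bit. A quick standalone check with the 32-bit constants just added:

#include <stdio.h>

#define DEMO_M2P_VIRT_START 0xF5800000UL
#define DEMO_M2P_VIRT_END   0xF6800000UL
#define DEMO_M2P_SHIFT      2          /* 4-byte entries on 32-bit */

int main(void)
{
	unsigned long entries =
		(DEMO_M2P_VIRT_END - DEMO_M2P_VIRT_START) >> DEMO_M2P_SHIFT;

	printf("m2p entries: %#lx\n", entries);   /* prints 0x400000 */
	return 0;
}
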
diff --git a/arch/x86/include/asm/xen/interface_64.h b/arch/x86/include/asm/xen/interface_64.h
index 100d2662b97c..839a4811cf98 100644
--- a/arch/x86/include/asm/xen/interface_64.h
+++ b/arch/x86/include/asm/xen/interface_64.h
@@ -39,18 +39,7 @@
39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000 39#define __HYPERVISOR_VIRT_END 0xFFFF880000000000
40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000 40#define __MACH2PHYS_VIRT_START 0xFFFF800000000000
41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000 41#define __MACH2PHYS_VIRT_END 0xFFFF804000000000
42 42#define __MACH2PHYS_SHIFT 3
43#ifndef HYPERVISOR_VIRT_START
44#define HYPERVISOR_VIRT_START mk_unsigned_long(__HYPERVISOR_VIRT_START)
45#define HYPERVISOR_VIRT_END mk_unsigned_long(__HYPERVISOR_VIRT_END)
46#endif
47
48#define MACH2PHYS_VIRT_START mk_unsigned_long(__MACH2PHYS_VIRT_START)
49#define MACH2PHYS_VIRT_END mk_unsigned_long(__MACH2PHYS_VIRT_END)
50#define MACH2PHYS_NR_ENTRIES ((MACH2PHYS_VIRT_END-MACH2PHYS_VIRT_START)>>3)
51#ifndef machine_to_phys_mapping
52#define machine_to_phys_mapping ((unsigned long *)HYPERVISOR_VIRT_START)
53#endif
54 43
55/* 44/*
56 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base) 45 * int HYPERVISOR_set_segment_base(unsigned int which, unsigned long base)
diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h
index bf5f7d32bd08..64a619d47d34 100644
--- a/arch/x86/include/asm/xen/page.h
+++ b/arch/x86/include/asm/xen/page.h
@@ -5,6 +5,7 @@
5#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/pfn.h> 7#include <linux/pfn.h>
8#include <linux/mm.h>
8 9
9#include <asm/uaccess.h> 10#include <asm/uaccess.h>
10#include <asm/page.h> 11#include <asm/page.h>
@@ -28,23 +29,46 @@ typedef struct xpaddr {
28 29
29/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/ 30/**** MACHINE <-> PHYSICAL CONVERSION MACROS ****/
30#define INVALID_P2M_ENTRY (~0UL) 31#define INVALID_P2M_ENTRY (~0UL)
31#define FOREIGN_FRAME_BIT (1UL<<31) 32#define FOREIGN_FRAME_BIT (1UL<<(BITS_PER_LONG-1))
33#define IDENTITY_FRAME_BIT (1UL<<(BITS_PER_LONG-2))
32#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT) 34#define FOREIGN_FRAME(m) ((m) | FOREIGN_FRAME_BIT)
35#define IDENTITY_FRAME(m) ((m) | IDENTITY_FRAME_BIT)
33 36
34/* Maximum amount of memory we can handle in a domain in pages */ 37/* Maximum amount of memory we can handle in a domain in pages */
35#define MAX_DOMAIN_PAGES \ 38#define MAX_DOMAIN_PAGES \
36 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE)) 39 ((unsigned long)((u64)CONFIG_XEN_MAX_DOMAIN_MEMORY * 1024 * 1024 * 1024 / PAGE_SIZE))
37 40
41extern unsigned long *machine_to_phys_mapping;
42extern unsigned int machine_to_phys_order;
38 43
39extern unsigned long get_phys_to_machine(unsigned long pfn); 44extern unsigned long get_phys_to_machine(unsigned long pfn);
40extern void set_phys_to_machine(unsigned long pfn, unsigned long mfn); 45extern bool set_phys_to_machine(unsigned long pfn, unsigned long mfn);
41 46extern bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
47extern unsigned long set_phys_range_identity(unsigned long pfn_s,
48 unsigned long pfn_e);
49
50extern int m2p_add_override(unsigned long mfn, struct page *page,
51 bool clear_pte);
52extern int m2p_remove_override(struct page *page, bool clear_pte);
53extern struct page *m2p_find_override(unsigned long mfn);
54extern unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn);
55
56#ifdef CONFIG_XEN_DEBUG_FS
57extern int p2m_dump_show(struct seq_file *m, void *v);
58#endif
42static inline unsigned long pfn_to_mfn(unsigned long pfn) 59static inline unsigned long pfn_to_mfn(unsigned long pfn)
43{ 60{
61 unsigned long mfn;
62
44 if (xen_feature(XENFEAT_auto_translated_physmap)) 63 if (xen_feature(XENFEAT_auto_translated_physmap))
45 return pfn; 64 return pfn;
46 65
47 return get_phys_to_machine(pfn) & ~FOREIGN_FRAME_BIT; 66 mfn = get_phys_to_machine(pfn);
67
68 if (mfn != INVALID_P2M_ENTRY)
69 mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
70
71 return mfn;
48} 72}
49 73
50static inline int phys_to_machine_mapping_valid(unsigned long pfn) 74static inline int phys_to_machine_mapping_valid(unsigned long pfn)
@@ -58,22 +82,44 @@ static inline int phys_to_machine_mapping_valid(unsigned long pfn)
58static inline unsigned long mfn_to_pfn(unsigned long mfn) 82static inline unsigned long mfn_to_pfn(unsigned long mfn)
59{ 83{
60 unsigned long pfn; 84 unsigned long pfn;
85 int ret = 0;
61 86
62 if (xen_feature(XENFEAT_auto_translated_physmap)) 87 if (xen_feature(XENFEAT_auto_translated_physmap))
63 return mfn; 88 return mfn;
64 89
65#if 0 90 if (unlikely((mfn >> machine_to_phys_order) != 0)) {
66 if (unlikely((mfn >> machine_to_phys_order) != 0)) 91 pfn = ~0;
67 return max_mapnr; 92 goto try_override;
68#endif 93 }
69
70 pfn = 0; 94 pfn = 0;
71 /* 95 /*
72 * The array access can fail (e.g., device space beyond end of RAM). 96 * The array access can fail (e.g., device space beyond end of RAM).
73 * In such cases it doesn't matter what we return (we return garbage), 97 * In such cases it doesn't matter what we return (we return garbage),
74 * but we must handle the fault without crashing! 98 * but we must handle the fault without crashing!
75 */ 99 */
76 __get_user(pfn, &machine_to_phys_mapping[mfn]); 100 ret = __get_user(pfn, &machine_to_phys_mapping[mfn]);
101try_override:
102 /* ret might be < 0 if there are no entries in the m2p for mfn */
103 if (ret < 0)
104 pfn = ~0;
105 else if (get_phys_to_machine(pfn) != mfn)
106 /*
107 * If this appears to be a foreign mfn (because the pfn
108 * doesn't map back to the mfn), then check the local override
109 * table to see if there's a better pfn to use.
110 *
111 * m2p_find_override_pfn returns ~0 if it doesn't find anything.
112 */
113 pfn = m2p_find_override_pfn(mfn, ~0);
114
115 /*
116 * pfn is ~0 if there are no entries in the m2p for mfn or if the
117 * entry doesn't map back to the mfn and m2p_override doesn't have a
118 * valid entry for it.
119 */
120 if (pfn == ~0 &&
121 get_phys_to_machine(mfn) == IDENTITY_FRAME(mfn))
122 pfn = mfn;
77 123
78 return pfn; 124 return pfn;
79} 125}
@@ -159,6 +205,7 @@ static inline pte_t __pte_ma(pteval_t x)
159 205
160#define pgd_val_ma(x) ((x).pgd) 206#define pgd_val_ma(x) ((x).pgd)
161 207
208void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid);
162 209
163xmaddr_t arbitrary_virt_to_machine(void *address); 210xmaddr_t arbitrary_virt_to_machine(void *address);
164unsigned long arbitrary_virt_to_mfn(void *vaddr); 211unsigned long arbitrary_virt_to_mfn(void *vaddr);
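
The reworked p2m accessors above tag entries with FOREIGN_FRAME_BIT and the new IDENTITY_FRAME_BIT in the top two bits, and pfn_to_mfn() strips both before returning an MFN. A standalone sketch of that tagging, with a plain array standing in for get_phys_to_machine():

#include <limits.h>
#include <stdio.h>

#define BITS_PER_LONG      (sizeof(unsigned long) * CHAR_BIT)
#define INVALID_P2M_ENTRY  (~0UL)
#define FOREIGN_FRAME_BIT  (1UL << (BITS_PER_LONG - 1))
#define IDENTITY_FRAME_BIT (1UL << (BITS_PER_LONG - 2))
#define IDENTITY_FRAME(m)  ((m) | IDENTITY_FRAME_BIT)

static unsigned long p2m[16];   /* stand-in for the real p2m table */

static unsigned long demo_pfn_to_mfn(unsigned long pfn)
{
	unsigned long mfn = p2m[pfn];

	/* Valid entries may carry flag bits; strip them before use. */
	if (mfn != INVALID_P2M_ENTRY)
		mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
	return mfn;
}

int main(void)
{
	p2m[3] = IDENTITY_FRAME(3);     /* identity-mapped region, e.g. MMIO */
	p2m[4] = INVALID_P2M_ENTRY;     /* no mapping for this pfn */

	printf("pfn 3 -> mfn %lu\n", demo_pfn_to_mfn(3));
	printf("pfn 4 -> mfn %#lx\n", demo_pfn_to_mfn(4));
	return 0;
}
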
diff --git a/arch/x86/include/asm/xen/pci.h b/arch/x86/include/asm/xen/pci.h
new file mode 100644
index 000000000000..4fbda9a3f339
--- /dev/null
+++ b/arch/x86/include/asm/xen/pci.h
@@ -0,0 +1,81 @@
1#ifndef _ASM_X86_XEN_PCI_H
2#define _ASM_X86_XEN_PCI_H
3
4#if defined(CONFIG_PCI_XEN)
5extern int __init pci_xen_init(void);
6extern int __init pci_xen_hvm_init(void);
7#define pci_xen 1
8#else
9#define pci_xen 0
10#define pci_xen_init (0)
11static inline int pci_xen_hvm_init(void)
12{
13 return -1;
14}
15#endif
16#if defined(CONFIG_XEN_DOM0)
17void __init xen_setup_pirqs(void);
18int xen_find_device_domain_owner(struct pci_dev *dev);
19int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain);
20int xen_unregister_device_domain_owner(struct pci_dev *dev);
21#else
22static inline void __init xen_setup_pirqs(void)
23{
24}
25static inline int xen_find_device_domain_owner(struct pci_dev *dev)
26{
27 return -1;
28}
29static inline int xen_register_device_domain_owner(struct pci_dev *dev,
30 uint16_t domain)
31{
32 return -1;
33}
34static inline int xen_unregister_device_domain_owner(struct pci_dev *dev)
35{
36 return -1;
37}
38#endif
39
40#if defined(CONFIG_PCI_MSI)
41#if defined(CONFIG_PCI_XEN)
 42/* The xen-pcifront driver (drivers/pci/xen-pcifront.c) fills in this
 43 * structure with its own functions.
44 */
45struct xen_pci_frontend_ops {
46 int (*enable_msi)(struct pci_dev *dev, int vectors[]);
47 void (*disable_msi)(struct pci_dev *dev);
48 int (*enable_msix)(struct pci_dev *dev, int vectors[], int nvec);
49 void (*disable_msix)(struct pci_dev *dev);
50};
51
52extern struct xen_pci_frontend_ops *xen_pci_frontend;
53
54static inline int xen_pci_frontend_enable_msi(struct pci_dev *dev,
55 int vectors[])
56{
57 if (xen_pci_frontend && xen_pci_frontend->enable_msi)
58 return xen_pci_frontend->enable_msi(dev, vectors);
59 return -ENODEV;
60}
61static inline void xen_pci_frontend_disable_msi(struct pci_dev *dev)
62{
63 if (xen_pci_frontend && xen_pci_frontend->disable_msi)
64 xen_pci_frontend->disable_msi(dev);
65}
66static inline int xen_pci_frontend_enable_msix(struct pci_dev *dev,
67 int vectors[], int nvec)
68{
69 if (xen_pci_frontend && xen_pci_frontend->enable_msix)
70 return xen_pci_frontend->enable_msix(dev, vectors, nvec);
71 return -ENODEV;
72}
73static inline void xen_pci_frontend_disable_msix(struct pci_dev *dev)
74{
75 if (xen_pci_frontend && xen_pci_frontend->disable_msix)
76 xen_pci_frontend->disable_msix(dev);
77}
78#endif /* CONFIG_PCI_XEN */
79#endif /* CONFIG_PCI_MSI */
80
81#endif /* _ASM_X86_XEN_PCI_H */
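
xen_pci_frontend_enable_msi() and friends above only delegate when a frontend has registered its ops, and fall back to -ENODEV otherwise. A minimal standalone sketch of that guarded delegation; the demo_* types and the fake frontend are invented for illustration:

#include <errno.h>
#include <stdio.h>

struct demo_pci_frontend_ops {
	int (*enable_msi)(int dev_id, int vectors[]);
};

static struct demo_pci_frontend_ops *demo_frontend;   /* NULL until registered */

static int demo_enable_msi(int dev_id, int vectors[])
{
	if (demo_frontend && demo_frontend->enable_msi)
		return demo_frontend->enable_msi(dev_id, vectors);
	return -ENODEV;                 /* no frontend: fail gracefully */
}

static int fake_frontend_enable_msi(int dev_id, int vectors[])
{
	vectors[0] = 42;                /* pretend the backend granted IRQ 42 */
	return 0;
}

static struct demo_pci_frontend_ops fake_frontend_ops = {
	.enable_msi = fake_frontend_enable_msi,
};

int main(void)
{
	int vectors[1] = { 0 };

	printf("before registration: %d\n", demo_enable_msi(1, vectors));

	demo_frontend = &fake_frontend_ops;   /* module load registers the ops */
	printf("after registration:  %d (irq %d)\n",
	       demo_enable_msi(1, vectors), vectors[0]);
	return 0;
}
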
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 6890dbb9ac15..d727f8f94333 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -24,28 +24,34 @@ endif
24nostackp := $(call cc-option, -fno-stack-protector) 24nostackp := $(call cc-option, -fno-stack-protector)
25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp) 25CFLAGS_vsyscall_64.o := $(PROFILING) -g0 $(nostackp)
26CFLAGS_hpet.o := $(nostackp) 26CFLAGS_hpet.o := $(nostackp)
27CFLAGS_tsc.o := $(nostackp) 27CFLAGS_vread_tsc_64.o := $(nostackp)
28CFLAGS_paravirt.o := $(nostackp) 28CFLAGS_paravirt.o := $(nostackp)
29GCOV_PROFILE_vsyscall_64.o := n 29GCOV_PROFILE_vsyscall_64.o := n
30GCOV_PROFILE_hpet.o := n 30GCOV_PROFILE_hpet.o := n
31GCOV_PROFILE_tsc.o := n 31GCOV_PROFILE_tsc.o := n
32GCOV_PROFILE_vread_tsc_64.o := n
32GCOV_PROFILE_paravirt.o := n 33GCOV_PROFILE_paravirt.o := n
33 34
35# vread_tsc_64 is hot and should be fully optimized:
36CFLAGS_REMOVE_vread_tsc_64.o = -pg -fno-optimize-sibling-calls
37
34obj-y := process_$(BITS).o signal.o entry_$(BITS).o 38obj-y := process_$(BITS).o signal.o entry_$(BITS).o
35obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o 39obj-y += traps.o irq.o irq_$(BITS).o dumpstack_$(BITS).o
36obj-y += time.o ioport.o ldt.o dumpstack.o 40obj-y += time.o ioport.o ldt.o dumpstack.o
37obj-y += setup.o x86_init.o i8259.o irqinit.o 41obj-y += setup.o x86_init.o i8259.o irqinit.o jump_label.o
38obj-$(CONFIG_X86_VISWS) += visws_quirks.o 42obj-$(CONFIG_IRQ_WORK) += irq_work.o
39obj-$(CONFIG_X86_32) += probe_roms_32.o 43obj-y += probe_roms.o
40obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o 44obj-$(CONFIG_X86_32) += sys_i386_32.o i386_ksyms_32.o
41obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o 45obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
42obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 46obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o vread_tsc_64.o
43obj-y += bootflag.o e820.o 47obj-y += bootflag.o e820.o
44obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 48obj-y += pci-dma.o quirks.o topology.o kdebugfs.o
45obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o 49obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
46obj-y += tsc.o io_delay.o rtc.o 50obj-y += tsc.o io_delay.o rtc.o
51obj-y += pci-iommu_table.o
52obj-y += resource.o
47 53
48obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 54obj-y += trampoline.o trampoline_$(BITS).o
49obj-y += process.o 55obj-y += process.o
50obj-y += i387.o xsave.o 56obj-y += i387.o xsave.o
51obj-y += ptrace.o 57obj-y += ptrace.o
@@ -53,11 +59,12 @@ obj-$(CONFIG_X86_32) += tls.o
53obj-$(CONFIG_IA32_EMULATION) += tls.o 59obj-$(CONFIG_IA32_EMULATION) += tls.o
54obj-y += step.o 60obj-y += step.o
55obj-$(CONFIG_INTEL_TXT) += tboot.o 61obj-$(CONFIG_INTEL_TXT) += tboot.o
62obj-$(CONFIG_ISA_DMA_API) += i8237.o
56obj-$(CONFIG_STACKTRACE) += stacktrace.o 63obj-$(CONFIG_STACKTRACE) += stacktrace.o
57obj-y += cpu/ 64obj-y += cpu/
58obj-y += acpi/ 65obj-y += acpi/
59obj-$(CONFIG_SFI) += sfi.o
60obj-y += reboot.o 66obj-y += reboot.o
67obj-$(CONFIG_X86_32) += reboot_32.o
61obj-$(CONFIG_MCA) += mca_32.o 68obj-$(CONFIG_MCA) += mca_32.o
62obj-$(CONFIG_X86_MSR) += msr.o 69obj-$(CONFIG_X86_MSR) += msr.o
63obj-$(CONFIG_X86_CPUID) += cpuid.o 70obj-$(CONFIG_X86_CPUID) += cpuid.o
@@ -65,10 +72,9 @@ obj-$(CONFIG_PCI) += early-quirks.o
65apm-y := apm_32.o 72apm-y := apm_32.o
66obj-$(CONFIG_APM) += apm.o 73obj-$(CONFIG_APM) += apm.o
67obj-$(CONFIG_SMP) += smp.o 74obj-$(CONFIG_SMP) += smp.o
68obj-$(CONFIG_SMP) += smpboot.o tsc_sync.o 75obj-$(CONFIG_SMP) += smpboot.o
76obj-$(CONFIG_SMP) += tsc_sync.o
69obj-$(CONFIG_SMP) += setup_percpu.o 77obj-$(CONFIG_SMP) += setup_percpu.o
70obj-$(CONFIG_X86_64_SMP) += tsc_sync.o
71obj-$(CONFIG_X86_TRAMPOLINE) += trampoline_$(BITS).o
72obj-$(CONFIG_X86_MPPARSE) += mpparse.o 78obj-$(CONFIG_X86_MPPARSE) += mpparse.o
73obj-y += apic/ 79obj-y += apic/
74obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o 80obj-$(CONFIG_X86_REBOOTFIXUPS) += reboot_fixups_32.o
@@ -80,7 +86,6 @@ obj-$(CONFIG_KEXEC) += relocate_kernel_$(BITS).o crash.o
80obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o 86obj-$(CONFIG_CRASH_DUMP) += crash_dump_$(BITS).o
81obj-$(CONFIG_KPROBES) += kprobes.o 87obj-$(CONFIG_KPROBES) += kprobes.o
82obj-$(CONFIG_MODULES) += module.o 88obj-$(CONFIG_MODULES) += module.o
83obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
84obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o 89obj-$(CONFIG_DOUBLEFAULT) += doublefault_32.o
85obj-$(CONFIG_KGDB) += kgdb.o 90obj-$(CONFIG_KGDB) += kgdb.o
86obj-$(CONFIG_VM86) += vm86_32.o 91obj-$(CONFIG_VM86) += vm86_32.o
@@ -89,11 +94,10 @@ obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
89obj-$(CONFIG_HPET_TIMER) += hpet.o 94obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o 95obj-$(CONFIG_APB_TIMER) += apb_timer.o
91 96
92obj-$(CONFIG_K8_NB) += k8.o 97obj-$(CONFIG_AMD_NB) += amd_nb.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 98obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 99obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 100
96obj-$(CONFIG_VMI) += vmi_32.o vmiclock_32.o
97obj-$(CONFIG_KVM_GUEST) += kvm.o 101obj-$(CONFIG_KVM_GUEST) += kvm.o
98obj-$(CONFIG_KVM_CLOCK) += kvmclock.o 102obj-$(CONFIG_KVM_CLOCK) += kvmclock.o
99obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o 103obj-$(CONFIG_PARAVIRT) += paravirt.o paravirt_patch_$(BITS).o
@@ -102,13 +106,6 @@ obj-$(CONFIG_PARAVIRT_CLOCK) += pvclock.o
102 106
103obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o 107obj-$(CONFIG_PCSPKR_PLATFORM) += pcspeaker.o
104 108
105obj-$(CONFIG_SCx200) += scx200.o
106scx200-y += scx200_32.o
107
108obj-$(CONFIG_OLPC) += olpc.o
109obj-$(CONFIG_OLPC_OPENFIRMWARE) += olpc_ofw.o
110obj-$(CONFIG_X86_MRST) += mrst.o
111
112microcode-y := microcode_core.o 109microcode-y := microcode_core.o
113microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o 110microcode-$(CONFIG_MICROCODE_INTEL) += microcode_intel.o
114microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o 111microcode-$(CONFIG_MICROCODE_AMD) += microcode_amd.o
@@ -117,17 +114,16 @@ obj-$(CONFIG_MICROCODE) += microcode.o
117obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o 114obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
118 115
119obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o 116obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
117obj-$(CONFIG_OF) += devicetree.o
120 118
121obj-$(CONFIG_FEATHER_TRACE) += ft_event.o 119obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
122 120
123### 121###
124# 64 bit specific files 122# 64 bit specific files
125ifeq ($(CONFIG_X86_64),y) 123ifeq ($(CONFIG_X86_64),y)
126 obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
127 obj-$(CONFIG_X86_PM_TIMER) += pmtimer_64.o
128 obj-$(CONFIG_AUDIT) += audit_64.o 124 obj-$(CONFIG_AUDIT) += audit_64.o
129 125
130 obj-$(CONFIG_GART_IOMMU) += pci-gart_64.o aperture_64.o 126 obj-$(CONFIG_GART_IOMMU) += amd_gart_64.o aperture_64.o
131 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o 127 obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary_64.o tce_64.o
132 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o 128 obj-$(CONFIG_AMD_IOMMU) += amd_iommu_init.o amd_iommu.o
133 129
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index c05872aa3ce0..4558f0d0822d 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -72,6 +72,7 @@ u8 acpi_sci_flags __initdata;
72int acpi_sci_override_gsi __initdata; 72int acpi_sci_override_gsi __initdata;
73int acpi_skip_timer_override __initdata; 73int acpi_skip_timer_override __initdata;
74int acpi_use_timer_override __initdata; 74int acpi_use_timer_override __initdata;
75int acpi_fix_pin2_polarity __initdata;
75 76
76#ifdef CONFIG_X86_LOCAL_APIC 77#ifdef CONFIG_X86_LOCAL_APIC
77static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE; 78static u64 acpi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
@@ -198,6 +199,11 @@ static void __cpuinit acpi_register_lapic(int id, u8 enabled)
198{ 199{
199 unsigned int ver = 0; 200 unsigned int ver = 0;
200 201
202 if (id >= (MAX_LOCAL_APIC-1)) {
203 printk(KERN_INFO PREFIX "skipped apicid that is too big\n");
204 return;
205 }
206
201 if (!enabled) { 207 if (!enabled) {
202 ++disabled_cpus; 208 ++disabled_cpus;
203 return; 209 return;
@@ -410,10 +416,15 @@ acpi_parse_int_src_ovr(struct acpi_subtable_header * header,
410 return 0; 416 return 0;
411 } 417 }
412 418
413 if (acpi_skip_timer_override && 419 if (intsrc->source_irq == 0 && intsrc->global_irq == 2) {
414 intsrc->source_irq == 0 && intsrc->global_irq == 2) { 420 if (acpi_skip_timer_override) {
415 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n"); 421 printk(PREFIX "BIOS IRQ0 pin2 override ignored.\n");
416 return 0; 422 return 0;
423 }
424 if (acpi_fix_pin2_polarity && (intsrc->inti_flags & ACPI_MADT_POLARITY_MASK)) {
425 intsrc->inti_flags &= ~ACPI_MADT_POLARITY_MASK;
426 printk(PREFIX "BIOS IRQ0 pin2 override: forcing polarity to high active.\n");
427 }
417 } 428 }
418 429
419 mp_override_legacy_irq(intsrc->source_irq, 430 mp_override_legacy_irq(intsrc->source_irq,
@@ -504,6 +515,7 @@ int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
504 515
505 return 0; 516 return 0;
506} 517}
518EXPORT_SYMBOL_GPL(acpi_gsi_to_irq);
507 519
508int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi) 520int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
509{ 521{
@@ -513,35 +525,62 @@ int acpi_isa_irq_to_gsi(unsigned isa_irq, u32 *gsi)
 	return 0;
 }
 
-/*
- * success: return IRQ number (>=0)
- * failure: return < 0
- */
-int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+static int acpi_register_gsi_pic(struct device *dev, u32 gsi,
+				 int trigger, int polarity)
 {
-	unsigned int irq;
-	unsigned int plat_gsi = gsi;
-
 #ifdef CONFIG_PCI
 	/*
 	 * Make sure all (legacy) PCI IRQs are set as level-triggered.
 	 */
-	if (acpi_irq_model == ACPI_IRQ_MODEL_PIC) {
-		if (trigger == ACPI_LEVEL_SENSITIVE)
-			eisa_set_level_irq(gsi);
-	}
+	if (trigger == ACPI_LEVEL_SENSITIVE)
+		eisa_set_level_irq(gsi);
 #endif
 
+	return gsi;
+}
+
+static int acpi_register_gsi_ioapic(struct device *dev, u32 gsi,
+				    int trigger, int polarity)
+{
 #ifdef CONFIG_X86_IO_APIC
-	if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC) {
-		plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
-	}
+	gsi = mp_register_gsi(dev, gsi, trigger, polarity);
 #endif
+
+	return gsi;
+}
+
+int (*__acpi_register_gsi)(struct device *dev, u32 gsi,
+			   int trigger, int polarity) = acpi_register_gsi_pic;
+
+/*
+ * success: return IRQ number (>=0)
+ * failure: return < 0
+ */
+int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
+{
+	unsigned int irq;
+	unsigned int plat_gsi = gsi;
+
+	plat_gsi = (*__acpi_register_gsi)(dev, gsi, trigger, polarity);
 	irq = gsi_to_irq(plat_gsi);
 
 	return irq;
 }
 
+void __init acpi_set_irq_model_pic(void)
+{
+	acpi_irq_model = ACPI_IRQ_MODEL_PIC;
+	__acpi_register_gsi = acpi_register_gsi_pic;
+	acpi_ioapic = 0;
+}
+
+void __init acpi_set_irq_model_ioapic(void)
+{
+	acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
+	__acpi_register_gsi = acpi_register_gsi_ioapic;
+	acpi_ioapic = 1;
+}
+
 /*
  * ACPI based hotplug support for CPU
  */
@@ -556,14 +595,8 @@ static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
 	nid = acpi_get_node(handle);
 	if (nid == -1 || !node_online(nid))
 		return;
-#ifdef CONFIG_X86_64
-	apicid_to_node[physid] = nid;
+	set_apicid_to_node(physid, nid);
 	numa_set_node(cpu, nid);
-#else /* CONFIG_X86_32 */
-	apicid_2_node[physid] = nid;
-	cpu_to_node_map[cpu] = nid;
-#endif
-
 #endif
 }
 
@@ -820,18 +853,6 @@ static int __init acpi_parse_fadt(struct acpi_table_header *table)
  * returns 0 on success, < 0 on error
  */
 
-static void __init acpi_register_lapic_address(unsigned long address)
-{
-	mp_lapic_addr = address;
-
-	set_fixmap_nocache(FIX_APIC_BASE, address);
-	if (boot_cpu_physical_apicid == -1U) {
-		boot_cpu_physical_apicid = read_apic_id();
-		apic_version[boot_cpu_physical_apicid] =
-			 GET_APIC_VERSION(apic_read(APIC_LVR));
-	}
-}
-
 static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 {
 	int count;
@@ -853,7 +874,7 @@ static int __init early_acpi_parse_madt_lapic_addr_ovr(void)
 		return count;
 	}
 
-	acpi_register_lapic_address(acpi_lapic_addr);
+	register_lapic_address(acpi_lapic_addr);
 
 	return count;
 }
@@ -880,16 +901,16 @@ static int __init acpi_parse_madt_lapic_entries(void)
 		return count;
 	}
 
-	acpi_register_lapic_address(acpi_lapic_addr);
+	register_lapic_address(acpi_lapic_addr);
 
 	count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_SAPIC,
-				      acpi_parse_sapic, MAX_APICS);
+				      acpi_parse_sapic, MAX_LOCAL_APIC);
 
 	if (!count) {
 		x2count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_X2APIC,
-					acpi_parse_x2apic, MAX_APICS);
+					acpi_parse_x2apic, MAX_LOCAL_APIC);
 		count = acpi_table_parse_madt(ACPI_MADT_TYPE_LOCAL_APIC,
-					acpi_parse_lapic, MAX_APICS);
+					acpi_parse_lapic, MAX_LOCAL_APIC);
 	}
 	if (!count && !x2count) {
 		printk(KERN_ERR PREFIX "No LAPIC entries present\n");
@@ -922,32 +943,6 @@ static int __init acpi_parse_madt_lapic_entries(void)
 extern int es7000_plat;
 #endif
 
-static void assign_to_mp_irq(struct mpc_intsrc *m,
-				    struct mpc_intsrc *mp_irq)
-{
-	memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static int mp_irq_cmp(struct mpc_intsrc *mp_irq,
-				struct mpc_intsrc *m)
-{
-	return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
-}
-
-static void save_mp_irq(struct mpc_intsrc *m)
-{
-	int i;
-
-	for (i = 0; i < mp_irq_entries; i++) {
-		if (!mp_irq_cmp(&mp_irqs[i], m))
-			return;
-	}
-
-	assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
-	if (++mp_irq_entries == MAX_IRQ_SOURCES)
-		panic("Max # of irq sources exceeded!!\n");
-}
-
 void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 {
 	int ioapic;
@@ -975,10 +970,10 @@ void __init mp_override_legacy_irq(u8 bus_irq, u8 polarity, u8 trigger, u32 gsi)
 	mp_irq.irqflag = (trigger << 2) | polarity;
 	mp_irq.srcbus = MP_ISA_BUS;
 	mp_irq.srcbusirq = bus_irq;	/* IRQ */
-	mp_irq.dstapic = mp_ioapics[ioapic].apicid; /* APIC ID */
+	mp_irq.dstapic = mpc_ioapic_id(ioapic); /* APIC ID */
 	mp_irq.dstirq = pin;	/* INTIN# */
 
-	save_mp_irq(&mp_irq);
+	mp_save_irq(&mp_irq);
 
 	isa_irq_to_gsi[bus_irq] = gsi;
 }
@@ -1026,7 +1021,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		if (ioapic < 0)
 			continue;
 		pin = mp_find_ioapic_pin(ioapic, gsi);
-		dstapic = mp_ioapics[ioapic].apicid;
+		dstapic = mpc_ioapic_id(ioapic);
 
 		for (idx = 0; idx < mp_irq_entries; idx++) {
 			struct mpc_intsrc *irq = mp_irqs + idx;
@@ -1053,7 +1048,7 @@ void __init mp_config_acpi_legacy_irqs(void)
 		mp_irq.srcbusirq = i; /* Identity mapped */
 		mp_irq.dstirq = pin;
 
-		save_mp_irq(&mp_irq);
+		mp_save_irq(&mp_irq);
 	}
 }
 
@@ -1087,10 +1082,10 @@ static int mp_config_acpi_gsi(struct device *dev, u32 gsi, int trigger,
 	mp_irq.srcbus = number;
 	mp_irq.srcbusirq = (((devfn >> 3) & 0x1f) << 2) | ((pin - 1) & 3);
 	ioapic = mp_find_ioapic(gsi);
-	mp_irq.dstapic = mp_ioapics[ioapic].apicid;
+	mp_irq.dstapic = mpc_ioapic_id(ioapic);
 	mp_irq.dstirq = mp_find_ioapic_pin(ioapic, gsi);
 
-	save_mp_irq(&mp_irq);
+	mp_save_irq(&mp_irq);
 #endif
 	return 0;
 }
@@ -1118,7 +1113,7 @@ int mp_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
 
 	if (ioapic_pin > MP_MAX_IOAPIC_PIN) {
 		printk(KERN_ERR "Invalid reference to IOAPIC pin "
-			"%d-%d\n", mp_ioapics[ioapic].apicid,
+			"%d-%d\n", mpc_ioapic_id(ioapic),
 			ioapic_pin);
 		return gsi;
 	}
@@ -1259,8 +1254,7 @@ static void __init acpi_process_madt(void)
 		 */
 		error = acpi_parse_madt_ioapic_entries();
 		if (!error) {
-			acpi_irq_model = ACPI_IRQ_MODEL_IOAPIC;
-			acpi_ioapic = 1;
+			acpi_set_irq_model_ioapic();
 
 			smp_found_config = 1;
 		}
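
The boot.c hunks above replace repeated run-time checks of acpi_irq_model with a __acpi_register_gsi function pointer that acpi_set_irq_model_pic()/acpi_set_irq_model_ioapic() select once during boot. A minimal standalone C sketch of the same dispatch pattern follows; the helper names and the fake GSI-to-IRQ offset are invented for illustration only and are not kernel APIs.

/* Standalone sketch (not kernel code) of the function-pointer dispatch
 * introduced above: the handler is chosen once, then every registration
 * call goes through the pointer instead of testing an irq-model flag. */
#include <stdio.h>

static int register_gsi_pic(unsigned int gsi) { return gsi; }          /* PIC: GSI maps 1:1 */
static int register_gsi_ioapic(unsigned int gsi) { return gsi + 16; }  /* invented IOAPIC remap */

/* selected once at boot, defaults to the PIC variant */
static int (*register_gsi)(unsigned int gsi) = register_gsi_pic;

static void set_irq_model_ioapic(void) { register_gsi = register_gsi_ioapic; }

int main(void)
{
	printf("PIC model:    gsi 9 -> irq %d\n", register_gsi(9));
	set_irq_model_ioapic();
	printf("IOAPIC model: gsi 9 -> irq %d\n", register_gsi(9));
	return 0;
}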
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index fb16f17e59be..5812404a0d4c 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -13,6 +13,7 @@
 
 #include <acpi/processor.h>
 #include <asm/acpi.h>
+#include <asm/mwait.h>
 
 /*
  * Initialize bm_flags based on the CPU cache properties
@@ -65,16 +66,6 @@ static struct cstate_entry __percpu *cpu_cstate_entry;	/* per CPU ptr */
 
 static short mwait_supported[ACPI_PROCESSOR_MAX_POWER];
 
-#define MWAIT_SUBSTATE_MASK	(0xf)
-#define MWAIT_CSTATE_MASK	(0xf)
-#define MWAIT_SUBSTATE_SIZE	(4)
-
-#define CPUID_MWAIT_LEAF (5)
-#define CPUID5_ECX_EXTENSIONS_SUPPORTED (0x1)
-#define CPUID5_ECX_INTERRUPT_BREAK (0x2)
-
-#define MWAIT_ECX_INTERRUPT_BREAK	(0x1)
-
 #define NATIVE_CSTATE_BEYOND_HALT	(2)
 
 static long acpi_processor_ffh_cstate_probe_cpu(void *_cx)
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.S b/arch/x86/kernel/acpi/realmode/wakeup.S
index 28595d6df47c..b4fd836e4053 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.S
@@ -6,11 +6,17 @@
6#include <asm/page_types.h> 6#include <asm/page_types.h>
7#include <asm/pgtable_types.h> 7#include <asm/pgtable_types.h>
8#include <asm/processor-flags.h> 8#include <asm/processor-flags.h>
9#include "wakeup.h"
9 10
10 .code16 11 .code16
11 .section ".header", "a" 12 .section ".jump", "ax"
13 .globl _start
14_start:
15 cli
16 jmp wakeup_code
12 17
13/* This should match the structure in wakeup.h */ 18/* This should match the structure in wakeup.h */
19 .section ".header", "a"
14 .globl wakeup_header 20 .globl wakeup_header
15wakeup_header: 21wakeup_header:
16video_mode: .short 0 /* Video mode number */ 22video_mode: .short 0 /* Video mode number */
@@ -22,6 +28,8 @@ pmode_cr3: .long 0 /* Saved %cr3 */
22pmode_cr4: .long 0 /* Saved %cr4 */ 28pmode_cr4: .long 0 /* Saved %cr4 */
23pmode_efer: .quad 0 /* Saved EFER */ 29pmode_efer: .quad 0 /* Saved EFER */
24pmode_gdt: .quad 0 30pmode_gdt: .quad 0
31pmode_misc_en: .quad 0 /* Saved MISC_ENABLE MSR */
32pmode_behavior: .long 0 /* Wakeup behavior flags */
25realmode_flags: .long 0 33realmode_flags: .long 0
26real_magic: .long 0 34real_magic: .long 0
27trampoline_segment: .word 0 35trampoline_segment: .word 0
@@ -30,14 +38,11 @@ wakeup_jmp: .byte 0xea /* ljmpw */
30wakeup_jmp_off: .word 3f 38wakeup_jmp_off: .word 3f
31wakeup_jmp_seg: .word 0 39wakeup_jmp_seg: .word 0
32wakeup_gdt: .quad 0, 0, 0 40wakeup_gdt: .quad 0, 0, 0
33signature: .long 0x51ee1111 41signature: .long WAKEUP_HEADER_SIGNATURE
34 42
35 .text 43 .text
36 .globl _start
37 .code16 44 .code16
38wakeup_code: 45wakeup_code:
39_start:
40 cli
41 cld 46 cld
42 47
43 /* Apparently some dimwit BIOS programmers don't know how to 48 /* Apparently some dimwit BIOS programmers don't know how to
@@ -77,17 +82,29 @@ _start:
77 82
78 /* Check header signature... */ 83 /* Check header signature... */
79 movl signature, %eax 84 movl signature, %eax
80 cmpl $0x51ee1111, %eax 85 cmpl $WAKEUP_HEADER_SIGNATURE, %eax
81 jne bogus_real_magic 86 jne bogus_real_magic
82 87
83 /* Check we really have everything... */ 88 /* Check we really have everything... */
84 movl end_signature, %eax 89 movl end_signature, %eax
85 cmpl $0x65a22c82, %eax 90 cmpl $WAKEUP_END_SIGNATURE, %eax
86 jne bogus_real_magic 91 jne bogus_real_magic
87 92
88 /* Call the C code */ 93 /* Call the C code */
89 calll main 94 calll main
90 95
96 /* Restore MISC_ENABLE before entering protected mode, in case
97 BIOS decided to clear XD_DISABLE during S3. */
98 movl pmode_behavior, %eax
99 btl $WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE, %eax
100 jnc 1f
101
102 movl pmode_misc_en, %eax
103 movl pmode_misc_en + 4, %edx
104 movl $MSR_IA32_MISC_ENABLE, %ecx
105 wrmsr
1061:
107
91 /* Do any other stuff... */ 108 /* Do any other stuff... */
92 109
93#ifndef CONFIG_64BIT 110#ifndef CONFIG_64BIT
@@ -147,3 +164,7 @@ wakeup_heap:
147wakeup_stack: 164wakeup_stack:
148 .space 2048 165 .space 2048
149wakeup_stack_end: 166wakeup_stack_end:
167
168 .section ".signature","a"
169end_signature:
170 .long WAKEUP_END_SIGNATURE
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.h b/arch/x86/kernel/acpi/realmode/wakeup.h
index 69d38d0b2b64..97a29e1430e3 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.h
+++ b/arch/x86/kernel/acpi/realmode/wakeup.h
@@ -21,6 +21,9 @@ struct wakeup_header {
 	u32 pmode_efer_low;	/* Protected mode EFER */
 	u32 pmode_efer_high;
 	u64 pmode_gdt;
+	u32 pmode_misc_en_low;	/* Protected mode MISC_ENABLE */
+	u32 pmode_misc_en_high;
+	u32 pmode_behavior;	/* Wakeup routine behavior flags */
 	u32 realmode_flags;
 	u32 real_magic;
 	u16 trampoline_segment;	/* segment with trampoline code, 64-bit only */
@@ -35,7 +38,11 @@ struct wakeup_header {
 extern struct wakeup_header wakeup_header;
 #endif
 
-#define HEADER_OFFSET 0x3f00
-#define WAKEUP_SIZE 0x4000
+#define WAKEUP_HEADER_OFFSET	8
+#define WAKEUP_HEADER_SIGNATURE 0x51ee1111
+#define WAKEUP_END_SIGNATURE	0x65a22c82
+
+/* Wakeup behavior bits */
+#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0
 
 #endif /* ARCH_X86_KERNEL_ACPI_RM_WAKEUP_H */
diff --git a/arch/x86/kernel/acpi/realmode/wakeup.lds.S b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
index 060fff8f5c5b..d4f8010a5b1b 100644
--- a/arch/x86/kernel/acpi/realmode/wakeup.lds.S
+++ b/arch/x86/kernel/acpi/realmode/wakeup.lds.S
@@ -13,9 +13,19 @@ ENTRY(_start)
13SECTIONS 13SECTIONS
14{ 14{
15 . = 0; 15 . = 0;
16 .jump : {
17 *(.jump)
18 } = 0x90909090
19
20 . = WAKEUP_HEADER_OFFSET;
21 .header : {
22 *(.header)
23 }
24
25 . = ALIGN(16);
16 .text : { 26 .text : {
17 *(.text*) 27 *(.text*)
18 } 28 } = 0x90909090
19 29
20 . = ALIGN(16); 30 . = ALIGN(16);
21 .rodata : { 31 .rodata : {
@@ -33,11 +43,6 @@ SECTIONS
33 *(.data*) 43 *(.data*)
34 } 44 }
35 45
36 .signature : {
37 end_signature = .;
38 LONG(0x65a22c82)
39 }
40
41 . = ALIGN(16); 46 . = ALIGN(16);
42 .bss : { 47 .bss : {
43 __bss_start = .; 48 __bss_start = .;
@@ -45,20 +50,13 @@ SECTIONS
45 __bss_end = .; 50 __bss_end = .;
46 } 51 }
47 52
48 . = HEADER_OFFSET; 53 .signature : {
49 .header : { 54 *(.signature)
50 *(.header)
51 } 55 }
52 56
53 . = ALIGN(16);
54 _end = .; 57 _end = .;
55 58
56 /DISCARD/ : { 59 /DISCARD/ : {
57 *(.note*) 60 *(.note*)
58 } 61 }
59
60 /*
61 * The ASSERT() sink to . is intentional, for binutils 2.14 compatibility:
62 */
63 . = ASSERT(_end <= WAKEUP_SIZE, "Wakeup too big!");
64} 62}
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index 33cec152070d..103b6ab368d3 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -7,45 +7,39 @@
7 7
8#include <linux/acpi.h> 8#include <linux/acpi.h>
9#include <linux/bootmem.h> 9#include <linux/bootmem.h>
10#include <linux/memblock.h>
10#include <linux/dmi.h> 11#include <linux/dmi.h>
11#include <linux/cpumask.h> 12#include <linux/cpumask.h>
12#include <asm/segment.h> 13#include <asm/segment.h>
13#include <asm/desc.h> 14#include <asm/desc.h>
15#include <asm/pgtable.h>
16#include <asm/cacheflush.h>
14 17
15#include "realmode/wakeup.h" 18#include "realmode/wakeup.h"
16#include "sleep.h" 19#include "sleep.h"
17 20
18unsigned long acpi_wakeup_address;
19unsigned long acpi_realmode_flags; 21unsigned long acpi_realmode_flags;
20 22
21/* address in low memory of the wakeup routine. */
22static unsigned long acpi_realmode;
23
24#if defined(CONFIG_SMP) && defined(CONFIG_64BIT) 23#if defined(CONFIG_SMP) && defined(CONFIG_64BIT)
25static char temp_stack[4096]; 24static char temp_stack[4096];
26#endif 25#endif
27 26
28/** 27/**
29 * acpi_save_state_mem - save kernel state 28 * acpi_suspend_lowlevel - save kernel state
30 * 29 *
31 * Create an identity mapped page table and copy the wakeup routine to 30 * Create an identity mapped page table and copy the wakeup routine to
32 * low memory. 31 * low memory.
33 *
34 * Note that this is too late to change acpi_wakeup_address.
35 */ 32 */
36int acpi_save_state_mem(void) 33int acpi_suspend_lowlevel(void)
37{ 34{
38 struct wakeup_header *header; 35 struct wakeup_header *header;
36 /* address in low memory of the wakeup routine. */
37 char *acpi_realmode;
39 38
40 if (!acpi_realmode) { 39 acpi_realmode = TRAMPOLINE_SYM(acpi_wakeup_code);
41 printk(KERN_ERR "Could not allocate memory during boot, "
42 "S3 disabled\n");
43 return -ENOMEM;
44 }
45 memcpy((void *)acpi_realmode, &wakeup_code_start, WAKEUP_SIZE);
46 40
47 header = (struct wakeup_header *)(acpi_realmode + HEADER_OFFSET); 41 header = (struct wakeup_header *)(acpi_realmode + WAKEUP_HEADER_OFFSET);
48 if (header->signature != 0x51ee1111) { 42 if (header->signature != WAKEUP_HEADER_SIGNATURE) {
49 printk(KERN_ERR "wakeup header does not match\n"); 43 printk(KERN_ERR "wakeup header does not match\n");
50 return -EINVAL; 44 return -EINVAL;
51 } 45 }
@@ -65,9 +59,7 @@ int acpi_save_state_mem(void)
65 /* GDT[0]: GDT self-pointer */ 59 /* GDT[0]: GDT self-pointer */
66 header->wakeup_gdt[0] = 60 header->wakeup_gdt[0] =
67 (u64)(sizeof(header->wakeup_gdt) - 1) + 61 (u64)(sizeof(header->wakeup_gdt) - 1) +
68 ((u64)(acpi_wakeup_address + 62 ((u64)__pa(&header->wakeup_gdt) << 16);
69 ((char *)&header->wakeup_gdt - (char *)acpi_realmode))
70 << 16);
71 /* GDT[1]: big real mode-like code segment */ 63 /* GDT[1]: big real mode-like code segment */
72 header->wakeup_gdt[1] = 64 header->wakeup_gdt[1] =
73 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff); 65 GDT_ENTRY(0x809b, acpi_wakeup_address, 0xfffff);
@@ -85,17 +77,23 @@ int acpi_save_state_mem(void)
85 77
86 header->pmode_cr0 = read_cr0(); 78 header->pmode_cr0 = read_cr0();
87 header->pmode_cr4 = read_cr4_safe(); 79 header->pmode_cr4 = read_cr4_safe();
80 header->pmode_behavior = 0;
81 if (!rdmsr_safe(MSR_IA32_MISC_ENABLE,
82 &header->pmode_misc_en_low,
83 &header->pmode_misc_en_high))
84 header->pmode_behavior |=
85 (1 << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE);
88 header->realmode_flags = acpi_realmode_flags; 86 header->realmode_flags = acpi_realmode_flags;
89 header->real_magic = 0x12345678; 87 header->real_magic = 0x12345678;
90 88
91#ifndef CONFIG_64BIT 89#ifndef CONFIG_64BIT
92 header->pmode_entry = (u32)&wakeup_pmode_return; 90 header->pmode_entry = (u32)&wakeup_pmode_return;
93 header->pmode_cr3 = (u32)(swsusp_pg_dir - __PAGE_OFFSET); 91 header->pmode_cr3 = (u32)__pa(&initial_page_table);
94 saved_magic = 0x12345678; 92 saved_magic = 0x12345678;
95#else /* CONFIG_64BIT */ 93#else /* CONFIG_64BIT */
96 header->trampoline_segment = setup_trampoline() >> 4; 94 header->trampoline_segment = trampoline_address() >> 4;
97#ifdef CONFIG_SMP 95#ifdef CONFIG_SMP
98 stack_start.sp = temp_stack + sizeof(temp_stack); 96 stack_start = (unsigned long)temp_stack + sizeof(temp_stack);
99 early_gdt_descr.address = 97 early_gdt_descr.address =
100 (unsigned long)get_cpu_gdt_table(smp_processor_id()); 98 (unsigned long)get_cpu_gdt_table(smp_processor_id());
101 initial_gs = per_cpu_offset(smp_processor_id()); 99 initial_gs = per_cpu_offset(smp_processor_id());
@@ -104,47 +102,10 @@ int acpi_save_state_mem(void)
104 saved_magic = 0x123456789abcdef0L; 102 saved_magic = 0x123456789abcdef0L;
105#endif /* CONFIG_64BIT */ 103#endif /* CONFIG_64BIT */
106 104
105 do_suspend_lowlevel();
107 return 0; 106 return 0;
108} 107}
109 108
110/*
111 * acpi_restore_state - undo effects of acpi_save_state_mem
112 */
113void acpi_restore_state_mem(void)
114{
115}
116
117
118/**
119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
120 *
121 * We allocate a page from the first 1MB of memory for the wakeup
122 * routine for when we come back from a sleep state. The
123 * runtime allocator allows specification of <16MB pages, but not
124 * <1MB pages.
125 */
126void __init acpi_reserve_wakeup_memory(void)
127{
128 unsigned long mem;
129
130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
131 printk(KERN_ERR
132 "ACPI: Wakeup code way too big, S3 disabled.\n");
133 return;
134 }
135
136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
137
138 if (mem == -1L) {
139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
140 return;
141 }
142 acpi_realmode = (unsigned long) phys_to_virt(mem);
143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145}
146
147
148static int __init acpi_sleep_setup(char *str) 109static int __init acpi_sleep_setup(char *str)
149{ 110{
150 while ((str != NULL) && (*str != '\0')) { 111 while ((str != NULL) && (*str != '\0')) {
@@ -157,11 +118,6 @@ static int __init acpi_sleep_setup(char *str)
157#ifdef CONFIG_HIBERNATION 118#ifdef CONFIG_HIBERNATION
158 if (strncmp(str, "s4_nohwsig", 10) == 0) 119 if (strncmp(str, "s4_nohwsig", 10) == 0)
159 acpi_no_s4_hw_signature(); 120 acpi_no_s4_hw_signature();
160 if (strncmp(str, "s4_nonvs", 8) == 0) {
161 pr_warning("ACPI: acpi_sleep=s4_nonvs is deprecated, "
162 "please use acpi_sleep=nonvs instead");
163 acpi_nvs_nosave();
164 }
165#endif 121#endif
166 if (strncmp(str, "nonvs", 5) == 0) 122 if (strncmp(str, "nonvs", 5) == 0)
167 acpi_nvs_nosave(); 123 acpi_nvs_nosave();
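
The sleep.c hunk above saves MSR_IA32_MISC_ENABLE with rdmsr_safe() and, only if that read succeeds, sets the WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE bit in pmode_behavior; the realmode wakeup stub then issues the wrmsr only when that bit is set. The standalone sketch below illustrates just this flag protocol; fake_rdmsr_safe() and struct wakeup_hdr are invented stand-ins, not kernel symbols.

/* Sketch (not kernel code) of the save-side protocol added above. */
#include <stdio.h>
#include <stdint.h>

#define WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE	0	/* bit number, as in wakeup.h */

struct wakeup_hdr {
	uint32_t misc_en_low, misc_en_high;
	uint32_t behavior;
};

/* stand-in for rdmsr_safe(): returns 0 on success */
static int fake_rdmsr_safe(uint32_t *lo, uint32_t *hi)
{
	*lo = 0x850089;
	*hi = 0;
	return 0;
}

int main(void)
{
	struct wakeup_hdr hdr = { 0 };

	if (!fake_rdmsr_safe(&hdr.misc_en_low, &hdr.misc_en_high))
		hdr.behavior |= 1u << WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE;

	printf("restore MISC_ENABLE on wakeup: %s\n",
	       (hdr.behavior >> WAKEUP_BEHAVIOR_RESTORE_MISC_ENABLE) & 1 ? "yes" : "no");
	return 0;
}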
diff --git a/arch/x86/kernel/acpi/sleep.h b/arch/x86/kernel/acpi/sleep.h
index adbcbaa6f1df..416d4be13fef 100644
--- a/arch/x86/kernel/acpi/sleep.h
+++ b/arch/x86/kernel/acpi/sleep.h
@@ -4,13 +4,12 @@
 
 #include <asm/trampoline.h>
 
-extern char wakeup_code_start, wakeup_code_end;
-
 extern unsigned long saved_video_mode;
 extern long saved_magic;
 
 extern int wakeup_pmode_return;
-extern char swsusp_pg_dir[PAGE_SIZE];
 
 extern unsigned long acpi_copy_wakeup_routine(unsigned long);
 extern void wakeup_long64(void);
+
+extern void do_suspend_lowlevel(void);
diff --git a/arch/x86/kernel/acpi/wakeup_rm.S b/arch/x86/kernel/acpi/wakeup_rm.S
index 6ff3b5730575..63b8ab524f2c 100644
--- a/arch/x86/kernel/acpi/wakeup_rm.S
+++ b/arch/x86/kernel/acpi/wakeup_rm.S
@@ -2,9 +2,11 @@
  * Wrapper script for the realmode binary as a transport object
  * before copying to low memory.
  */
-	.section ".rodata","a"
-	.globl	wakeup_code_start, wakeup_code_end
-wakeup_code_start:
+#include <asm/page_types.h>
+
+	.section ".x86_trampoline","a"
+	.balign PAGE_SIZE
+	.globl	acpi_wakeup_code
+acpi_wakeup_code:
 	.incbin	"arch/x86/kernel/acpi/realmode/wakeup.bin"
-wakeup_code_end:
-	.size	wakeup_code_start, .-wakeup_code_start
+	.size	acpi_wakeup_code, .-acpi_wakeup_code
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index f65ab8b014c4..a81f2d52f869 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -67,17 +67,30 @@ __setup("noreplace-paravirt", setup_noreplace_paravirt);
67#define DPRINTK(fmt, args...) if (debug_alternative) \ 67#define DPRINTK(fmt, args...) if (debug_alternative) \
68 printk(KERN_DEBUG fmt, args) 68 printk(KERN_DEBUG fmt, args)
69 69
70/*
71 * Each GENERIC_NOPX is of X bytes, and defined as an array of bytes
72 * that correspond to that nop. Getting from one nop to the next, we
73 * add to the array the offset that is equal to the sum of all sizes of
74 * nops preceding the one we are after.
75 *
76 * Note: The GENERIC_NOP5_ATOMIC is at the end, as it breaks the
77 * nice symmetry of sizes of the previous nops.
78 */
70#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64) 79#if defined(GENERIC_NOP1) && !defined(CONFIG_X86_64)
71/* Use inline assembly to define this because the nops are defined 80static const unsigned char intelnops[] =
72 as inline assembly strings in the include files and we cannot 81{
73 get them easily into strings. */ 82 GENERIC_NOP1,
74asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nintelnops: " 83 GENERIC_NOP2,
75 GENERIC_NOP1 GENERIC_NOP2 GENERIC_NOP3 GENERIC_NOP4 GENERIC_NOP5 GENERIC_NOP6 84 GENERIC_NOP3,
76 GENERIC_NOP7 GENERIC_NOP8 85 GENERIC_NOP4,
77 "\t.previous"); 86 GENERIC_NOP5,
78extern const unsigned char intelnops[]; 87 GENERIC_NOP6,
79static const unsigned char *const __initconst_or_module 88 GENERIC_NOP7,
80intel_nops[ASM_NOP_MAX+1] = { 89 GENERIC_NOP8,
90 GENERIC_NOP5_ATOMIC
91};
92static const unsigned char * const intel_nops[ASM_NOP_MAX+2] =
93{
81 NULL, 94 NULL,
82 intelnops, 95 intelnops,
83 intelnops + 1, 96 intelnops + 1,
@@ -87,17 +100,25 @@ intel_nops[ASM_NOP_MAX+1] = {
87 intelnops + 1 + 2 + 3 + 4 + 5, 100 intelnops + 1 + 2 + 3 + 4 + 5,
88 intelnops + 1 + 2 + 3 + 4 + 5 + 6, 101 intelnops + 1 + 2 + 3 + 4 + 5 + 6,
89 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 102 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
103 intelnops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
90}; 104};
91#endif 105#endif
92 106
93#ifdef K8_NOP1 107#ifdef K8_NOP1
94asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk8nops: " 108static const unsigned char k8nops[] =
95 K8_NOP1 K8_NOP2 K8_NOP3 K8_NOP4 K8_NOP5 K8_NOP6 109{
96 K8_NOP7 K8_NOP8 110 K8_NOP1,
97 "\t.previous"); 111 K8_NOP2,
98extern const unsigned char k8nops[]; 112 K8_NOP3,
99static const unsigned char *const __initconst_or_module 113 K8_NOP4,
100k8_nops[ASM_NOP_MAX+1] = { 114 K8_NOP5,
115 K8_NOP6,
116 K8_NOP7,
117 K8_NOP8,
118 K8_NOP5_ATOMIC
119};
120static const unsigned char * const k8_nops[ASM_NOP_MAX+2] =
121{
101 NULL, 122 NULL,
102 k8nops, 123 k8nops,
103 k8nops + 1, 124 k8nops + 1,
@@ -107,17 +128,25 @@ k8_nops[ASM_NOP_MAX+1] = {
107 k8nops + 1 + 2 + 3 + 4 + 5, 128 k8nops + 1 + 2 + 3 + 4 + 5,
108 k8nops + 1 + 2 + 3 + 4 + 5 + 6, 129 k8nops + 1 + 2 + 3 + 4 + 5 + 6,
109 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 130 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
131 k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
110}; 132};
111#endif 133#endif
112 134
113#if defined(K7_NOP1) && !defined(CONFIG_X86_64) 135#if defined(K7_NOP1) && !defined(CONFIG_X86_64)
114asm("\t" __stringify(__INITRODATA_OR_MODULE) "\nk7nops: " 136static const unsigned char k7nops[] =
115 K7_NOP1 K7_NOP2 K7_NOP3 K7_NOP4 K7_NOP5 K7_NOP6 137{
116 K7_NOP7 K7_NOP8 138 K7_NOP1,
117 "\t.previous"); 139 K7_NOP2,
118extern const unsigned char k7nops[]; 140 K7_NOP3,
119static const unsigned char *const __initconst_or_module 141 K7_NOP4,
120k7_nops[ASM_NOP_MAX+1] = { 142 K7_NOP5,
143 K7_NOP6,
144 K7_NOP7,
145 K7_NOP8,
146 K7_NOP5_ATOMIC
147};
148static const unsigned char * const k7_nops[ASM_NOP_MAX+2] =
149{
121 NULL, 150 NULL,
122 k7nops, 151 k7nops,
123 k7nops + 1, 152 k7nops + 1,
@@ -127,17 +156,25 @@ k7_nops[ASM_NOP_MAX+1] = {
127 k7nops + 1 + 2 + 3 + 4 + 5, 156 k7nops + 1 + 2 + 3 + 4 + 5,
128 k7nops + 1 + 2 + 3 + 4 + 5 + 6, 157 k7nops + 1 + 2 + 3 + 4 + 5 + 6,
129 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 158 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
159 k7nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
130}; 160};
131#endif 161#endif
132 162
133#ifdef P6_NOP1 163#ifdef P6_NOP1
134asm("\t" __stringify(__INITRODATA_OR_MODULE) "\np6nops: " 164static const unsigned char __initconst_or_module p6nops[] =
135 P6_NOP1 P6_NOP2 P6_NOP3 P6_NOP4 P6_NOP5 P6_NOP6 165{
136 P6_NOP7 P6_NOP8 166 P6_NOP1,
137 "\t.previous"); 167 P6_NOP2,
138extern const unsigned char p6nops[]; 168 P6_NOP3,
139static const unsigned char *const __initconst_or_module 169 P6_NOP4,
140p6_nops[ASM_NOP_MAX+1] = { 170 P6_NOP5,
171 P6_NOP6,
172 P6_NOP7,
173 P6_NOP8,
174 P6_NOP5_ATOMIC
175};
176static const unsigned char * const p6_nops[ASM_NOP_MAX+2] =
177{
141 NULL, 178 NULL,
142 p6nops, 179 p6nops,
143 p6nops + 1, 180 p6nops + 1,
@@ -147,47 +184,65 @@ p6_nops[ASM_NOP_MAX+1] = {
147 p6nops + 1 + 2 + 3 + 4 + 5, 184 p6nops + 1 + 2 + 3 + 4 + 5,
148 p6nops + 1 + 2 + 3 + 4 + 5 + 6, 185 p6nops + 1 + 2 + 3 + 4 + 5 + 6,
149 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7, 186 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
187 p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
150}; 188};
151#endif 189#endif
152 190
191/* Initialize these to a safe default */
153#ifdef CONFIG_X86_64 192#ifdef CONFIG_X86_64
193const unsigned char * const *ideal_nops = p6_nops;
194#else
195const unsigned char * const *ideal_nops = intel_nops;
196#endif
154 197
155extern char __vsyscall_0; 198void __init arch_init_ideal_nops(void)
156static const unsigned char *const *__init_or_module find_nop_table(void)
157{ 199{
158 if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && 200 switch (boot_cpu_data.x86_vendor) {
159 boot_cpu_has(X86_FEATURE_NOPL)) 201 case X86_VENDOR_INTEL:
160 return p6_nops; 202 /*
161 else 203 * Due to a decoder implementation quirk, some
162 return k8_nops; 204 * specific Intel CPUs actually perform better with
163} 205 * the "k8_nops" than with the SDM-recommended NOPs.
164 206 */
165#else /* CONFIG_X86_64 */ 207 if (boot_cpu_data.x86 == 6 &&
208 boot_cpu_data.x86_model >= 0x0f &&
209 boot_cpu_data.x86_model != 0x1c &&
210 boot_cpu_data.x86_model != 0x26 &&
211 boot_cpu_data.x86_model != 0x27 &&
212 boot_cpu_data.x86_model < 0x30) {
213 ideal_nops = k8_nops;
214 } else if (boot_cpu_has(X86_FEATURE_NOPL)) {
215 ideal_nops = p6_nops;
216 } else {
217#ifdef CONFIG_X86_64
218 ideal_nops = k8_nops;
219#else
220 ideal_nops = intel_nops;
221#endif
222 }
166 223
167static const unsigned char *const *__init_or_module find_nop_table(void) 224 default:
168{ 225#ifdef CONFIG_X86_64
169 if (boot_cpu_has(X86_FEATURE_K8)) 226 ideal_nops = k8_nops;
170 return k8_nops; 227#else
171 else if (boot_cpu_has(X86_FEATURE_K7)) 228 if (boot_cpu_has(X86_FEATURE_K8))
172 return k7_nops; 229 ideal_nops = k8_nops;
173 else if (boot_cpu_has(X86_FEATURE_NOPL)) 230 else if (boot_cpu_has(X86_FEATURE_K7))
174 return p6_nops; 231 ideal_nops = k7_nops;
175 else 232 else
176 return intel_nops; 233 ideal_nops = intel_nops;
234#endif
235 }
177} 236}
178 237
179#endif /* CONFIG_X86_64 */
180
181/* Use this to add nops to a buffer, then text_poke the whole buffer. */ 238/* Use this to add nops to a buffer, then text_poke the whole buffer. */
182static void __init_or_module add_nops(void *insns, unsigned int len) 239static void __init_or_module add_nops(void *insns, unsigned int len)
183{ 240{
184 const unsigned char *const *noptable = find_nop_table();
185
186 while (len > 0) { 241 while (len > 0) {
187 unsigned int noplen = len; 242 unsigned int noplen = len;
188 if (noplen > ASM_NOP_MAX) 243 if (noplen > ASM_NOP_MAX)
189 noplen = ASM_NOP_MAX; 244 noplen = ASM_NOP_MAX;
190 memcpy(insns, noptable[noplen], noplen); 245 memcpy(insns, ideal_nops[noplen], noplen);
191 insns += noplen; 246 insns += noplen;
192 len -= noplen; 247 len -= noplen;
193 } 248 }
@@ -195,11 +250,12 @@ static void __init_or_module add_nops(void *insns, unsigned int len)
195 250
196extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; 251extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
197extern s32 __smp_locks[], __smp_locks_end[]; 252extern s32 __smp_locks[], __smp_locks_end[];
198static void *text_poke_early(void *addr, const void *opcode, size_t len); 253extern char __vsyscall_0;
254void *text_poke_early(void *addr, const void *opcode, size_t len);
199 255
200/* Replace instructions with better alternatives for this CPU type. 256/* Replace instructions with better alternatives for this CPU type.
201 This runs before SMP is initialized to avoid SMP problems with 257 This runs before SMP is initialized to avoid SMP problems with
202 self modifying code. This implies that assymetric systems where 258 self modifying code. This implies that asymmetric systems where
203 APs have less capabilities than the boot processor are not handled. 259 APs have less capabilities than the boot processor are not handled.
204 Tough. Make sure you disable such features by hand. */ 260 Tough. Make sure you disable such features by hand. */
205 261
@@ -210,6 +266,15 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
210 u8 insnbuf[MAX_PATCH_LEN]; 266 u8 insnbuf[MAX_PATCH_LEN];
211 267
212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 268 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
269 /*
270 * The scan order should be from start to end. A later scanned
271 * alternative code can overwrite a previous scanned alternative code.
272 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
273 * patch code.
274 *
275 * So be careful if you want to change the scan order to any other
276 * order.
277 */
213 for (a = start; a < end; a++) { 278 for (a = start; a < end; a++) {
214 u8 *instr = a->instr; 279 u8 *instr = a->instr;
215 BUG_ON(a->replacementlen > a->instrlen); 280 BUG_ON(a->replacementlen > a->instrlen);
@@ -353,6 +418,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
353 mutex_unlock(&smp_alt); 418 mutex_unlock(&smp_alt);
354} 419}
355 420
421bool skip_smp_alternatives;
356void alternatives_smp_switch(int smp) 422void alternatives_smp_switch(int smp)
357{ 423{
358 struct smp_alt_module *mod; 424 struct smp_alt_module *mod;
@@ -368,7 +434,7 @@ void alternatives_smp_switch(int smp)
368 printk("lockdep: fixing up alternatives.\n"); 434 printk("lockdep: fixing up alternatives.\n");
369#endif 435#endif
370 436
371 if (noreplace_smp || smp_alt_once) 437 if (noreplace_smp || smp_alt_once || skip_smp_alternatives)
372 return; 438 return;
373 BUG_ON(!smp && (num_online_cpus() > 1)); 439 BUG_ON(!smp && (num_online_cpus() > 1));
374 440
@@ -522,7 +588,7 @@ void __init alternative_instructions(void)
522 * instructions. And on the local CPU you need to be protected again NMI or MCE 588 * instructions. And on the local CPU you need to be protected again NMI or MCE
523 * handlers seeing an inconsistent instruction while you patch. 589 * handlers seeing an inconsistent instruction while you patch.
524 */ 590 */
525static void *__init_or_module text_poke_early(void *addr, const void *opcode, 591void *__init_or_module text_poke_early(void *addr, const void *opcode,
526 size_t len) 592 size_t len)
527{ 593{
528 unsigned long flags; 594 unsigned long flags;
@@ -591,17 +657,21 @@ static atomic_t stop_machine_first;
591static int wrote_text; 657static int wrote_text;
592 658
593struct text_poke_params { 659struct text_poke_params {
594 void *addr; 660 struct text_poke_param *params;
595 const void *opcode; 661 int nparams;
596 size_t len;
597}; 662};
598 663
599static int __kprobes stop_machine_text_poke(void *data) 664static int __kprobes stop_machine_text_poke(void *data)
600{ 665{
601 struct text_poke_params *tpp = data; 666 struct text_poke_params *tpp = data;
667 struct text_poke_param *p;
668 int i;
602 669
603 if (atomic_dec_and_test(&stop_machine_first)) { 670 if (atomic_dec_and_test(&stop_machine_first)) {
604 text_poke(tpp->addr, tpp->opcode, tpp->len); 671 for (i = 0; i < tpp->nparams; i++) {
672 p = &tpp->params[i];
673 text_poke(p->addr, p->opcode, p->len);
674 }
605 smp_wmb(); /* Make sure other cpus see that this has run */ 675 smp_wmb(); /* Make sure other cpus see that this has run */
606 wrote_text = 1; 676 wrote_text = 1;
607 } else { 677 } else {
@@ -610,8 +680,17 @@ static int __kprobes stop_machine_text_poke(void *data)
610 smp_mb(); /* Load wrote_text before following execution */ 680 smp_mb(); /* Load wrote_text before following execution */
611 } 681 }
612 682
613 flush_icache_range((unsigned long)tpp->addr, 683 for (i = 0; i < tpp->nparams; i++) {
614 (unsigned long)tpp->addr + tpp->len); 684 p = &tpp->params[i];
685 flush_icache_range((unsigned long)p->addr,
686 (unsigned long)p->addr + p->len);
687 }
688 /*
 689	 * Intel Architecture Software Developer's Manual section 7.1.3 specifies
690 * that a core serializing instruction such as "cpuid" should be
691 * executed on _each_ core before the new instruction is made visible.
692 */
693 sync_core();
615 return 0; 694 return 0;
616} 695}
617 696
@@ -631,13 +710,36 @@ static int __kprobes stop_machine_text_poke(void *data)
631void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len) 710void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
632{ 711{
633 struct text_poke_params tpp; 712 struct text_poke_params tpp;
713 struct text_poke_param p;
634 714
635 tpp.addr = addr; 715 p.addr = addr;
636 tpp.opcode = opcode; 716 p.opcode = opcode;
637 tpp.len = len; 717 p.len = len;
718 tpp.params = &p;
719 tpp.nparams = 1;
638 atomic_set(&stop_machine_first, 1); 720 atomic_set(&stop_machine_first, 1);
639 wrote_text = 0; 721 wrote_text = 0;
640 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL); 722 /* Use __stop_machine() because the caller already got online_cpus. */
723 __stop_machine(stop_machine_text_poke, (void *)&tpp, cpu_online_mask);
641 return addr; 724 return addr;
642} 725}
643 726
727/**
728 * text_poke_smp_batch - Update instructions on a live kernel on SMP
729 * @params: an array of text_poke parameters
730 * @n: the number of elements in params.
731 *
732 * Modify multi-byte instruction by using stop_machine() on SMP. Since the
733 * stop_machine() is heavy task, it is better to aggregate text_poke requests
734 * and do it once if possible.
735 *
736 * Note: Must be called under get_online_cpus() and text_mutex.
737 */
738void __kprobes text_poke_smp_batch(struct text_poke_param *params, int n)
739{
740 struct text_poke_params tpp = {.params = params, .nparams = n};
741
742 atomic_set(&stop_machine_first, 1);
743 wrote_text = 0;
744 __stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
745}
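
The alternative.c changes above turn stop_machine_text_poke() into a loop over an array of text_poke_param entries so that text_poke_smp_batch() can apply many patch sites in one stop_machine()-style pass instead of one pass per site. The standalone sketch below illustrates only that batching idea; struct poke_param and memcpy() are invented stand-ins for text_poke_param and the cross-CPU text_poke(), and nothing here is kernel API.

/* Illustrative userspace sketch (not kernel code) of batching patch sites. */
#include <stdio.h>
#include <string.h>

struct poke_param {
	void *addr;
	const void *opcode;
	size_t len;
};

static void poke_batch(struct poke_param *params, int n)
{
	/* one "stop everything"-like section covers the whole batch */
	for (int i = 0; i < n; i++)
		memcpy(params[i].addr, params[i].opcode, params[i].len);
}

int main(void)
{
	char text[] = "aaaa bbbb";
	struct poke_param batch[] = {
		{ text,     "XX", 2 },
		{ text + 5, "YY", 2 },
	};

	poke_batch(batch, 2);
	printf("%s\n", text);	/* prints "XXaa YYbb" */
	return 0;
}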
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/amd_gart_64.c
index 0f7f130caa67..b117efd24f71 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/amd_gart_64.c
@@ -27,7 +27,7 @@
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/syscore_ops.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <asm/atomic.h> 33#include <asm/atomic.h>
@@ -39,8 +39,9 @@
39#include <asm/cacheflush.h> 39#include <asm/cacheflush.h>
40#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
41#include <asm/dma.h> 41#include <asm/dma.h>
42#include <asm/k8.h> 42#include <asm/amd_nb.h>
43#include <asm/x86_init.h> 43#include <asm/x86_init.h>
44#include <asm/iommu_table.h>
44 45
45static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 46static unsigned long iommu_bus_base; /* GART remapping area (physical) */
46static unsigned long iommu_size; /* size of remapping area bytes */ 47static unsigned long iommu_size; /* size of remapping area bytes */
@@ -80,6 +81,9 @@ static u32 gart_unmapped_entry;
80#define AGPEXTERN 81#define AGPEXTERN
81#endif 82#endif
82 83
84/* GART can only remap to physical addresses < 1TB */
85#define GART_MAX_PHYS_ADDR (1ULL << 40)
86
83/* backdoor interface to AGP driver */ 87/* backdoor interface to AGP driver */
84AGPEXTERN int agp_memory_reserved; 88AGPEXTERN int agp_memory_reserved;
85AGPEXTERN __u32 *agp_gatt_table; 89AGPEXTERN __u32 *agp_gatt_table;
@@ -142,7 +146,7 @@ static void flush_gart(void)
142 146
143 spin_lock_irqsave(&iommu_bitmap_lock, flags); 147 spin_lock_irqsave(&iommu_bitmap_lock, flags);
144 if (need_flush) { 148 if (need_flush) {
145 k8_flush_garts(); 149 amd_flush_garts();
146 need_flush = false; 150 need_flush = false;
147 } 151 }
148 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 152 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -211,9 +215,13 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
211 size_t size, int dir, unsigned long align_mask) 215 size_t size, int dir, unsigned long align_mask)
212{ 216{
213 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE); 217 unsigned long npages = iommu_num_pages(phys_mem, size, PAGE_SIZE);
214 unsigned long iommu_page = alloc_iommu(dev, npages, align_mask); 218 unsigned long iommu_page;
215 int i; 219 int i;
216 220
221 if (unlikely(phys_mem + size > GART_MAX_PHYS_ADDR))
222 return bad_dma_addr;
223
224 iommu_page = alloc_iommu(dev, npages, align_mask);
217 if (iommu_page == -1) { 225 if (iommu_page == -1) {
218 if (!nonforced_iommu(dev, phys_mem, size)) 226 if (!nonforced_iommu(dev, phys_mem, size))
219 return phys_mem; 227 return phys_mem;
@@ -560,14 +568,17 @@ static void enable_gart_translations(void)
560{ 568{
561 int i; 569 int i;
562 570
563 for (i = 0; i < num_k8_northbridges; i++) { 571 if (!amd_nb_has_feature(AMD_NB_GART))
564 struct pci_dev *dev = k8_northbridges[i]; 572 return;
573
574 for (i = 0; i < amd_nb_num(); i++) {
575 struct pci_dev *dev = node_to_amd_nb(i)->misc;
565 576
566 enable_gart_translation(dev, __pa(agp_gatt_table)); 577 enable_gart_translation(dev, __pa(agp_gatt_table));
567 } 578 }
568 579
569 /* Flush the GART-TLB to remove stale entries */ 580 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts(); 581 amd_flush_garts();
571} 582}
572 583
573/* 584/*
@@ -585,72 +596,62 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
585 aperture_alloc = aper_alloc; 596 aperture_alloc = aper_alloc;
586} 597}
587 598
588static void gart_fixup_northbridges(struct sys_device *dev) 599static void gart_fixup_northbridges(void)
589{ 600{
590 int i; 601 int i;
591 602
592 if (!fix_up_north_bridges) 603 if (!fix_up_north_bridges)
593 return; 604 return;
594 605
606 if (!amd_nb_has_feature(AMD_NB_GART))
607 return;
608
595 pr_info("PCI-DMA: Restoring GART aperture settings\n"); 609 pr_info("PCI-DMA: Restoring GART aperture settings\n");
596 610
597 for (i = 0; i < num_k8_northbridges; i++) { 611 for (i = 0; i < amd_nb_num(); i++) {
598 struct pci_dev *dev = k8_northbridges[i]; 612 struct pci_dev *dev = node_to_amd_nb(i)->misc;
599 613
600 /* 614 /*
601 * Don't enable translations just yet. That is the next 615 * Don't enable translations just yet. That is the next
602 * step. Restore the pre-suspend aperture settings. 616 * step. Restore the pre-suspend aperture settings.
603 */ 617 */
604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1); 618 gart_set_size_and_enable(dev, aperture_order);
605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25); 619 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
606 } 620 }
607} 621}
608 622
609static int gart_resume(struct sys_device *dev) 623static void gart_resume(void)
610{ 624{
611 pr_info("PCI-DMA: Resuming GART IOMMU\n"); 625 pr_info("PCI-DMA: Resuming GART IOMMU\n");
612 626
613 gart_fixup_northbridges(dev); 627 gart_fixup_northbridges();
614 628
615 enable_gart_translations(); 629 enable_gart_translations();
616
617 return 0;
618} 630}
619 631
620static int gart_suspend(struct sys_device *dev, pm_message_t state) 632static struct syscore_ops gart_syscore_ops = {
621{
622 return 0;
623}
624
625static struct sysdev_class gart_sysdev_class = {
626 .name = "gart",
627 .suspend = gart_suspend,
628 .resume = gart_resume, 633 .resume = gart_resume,
629 634
630}; 635};
631 636
632static struct sys_device device_gart = {
633 .cls = &gart_sysdev_class,
634};
635
636/* 637/*
637 * Private Northbridge GATT initialization in case we cannot use the 638 * Private Northbridge GATT initialization in case we cannot use the
638 * AGP driver for some reason. 639 * AGP driver for some reason.
639 */ 640 */
640static __init int init_k8_gatt(struct agp_kern_info *info) 641static __init int init_amd_gatt(struct agp_kern_info *info)
641{ 642{
642 unsigned aper_size, gatt_size, new_aper_size; 643 unsigned aper_size, gatt_size, new_aper_size;
643 unsigned aper_base, new_aper_base; 644 unsigned aper_base, new_aper_base;
644 struct pci_dev *dev; 645 struct pci_dev *dev;
645 void *gatt; 646 void *gatt;
646 int i, error; 647 int i;
647 648
648 pr_info("PCI-DMA: Disabling AGP.\n"); 649 pr_info("PCI-DMA: Disabling AGP.\n");
649 650
650 aper_size = aper_base = info->aper_size = 0; 651 aper_size = aper_base = info->aper_size = 0;
651 dev = NULL; 652 dev = NULL;
652 for (i = 0; i < num_k8_northbridges; i++) { 653 for (i = 0; i < amd_nb_num(); i++) {
653 dev = k8_northbridges[i]; 654 dev = node_to_amd_nb(i)->misc;
654 new_aper_base = read_aperture(dev, &new_aper_size); 655 new_aper_base = read_aperture(dev, &new_aper_size);
655 if (!new_aper_base) 656 if (!new_aper_base)
656 goto nommu; 657 goto nommu;
@@ -678,12 +679,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
678 679
679 agp_gatt_table = gatt; 680 agp_gatt_table = gatt;
680 681
681 error = sysdev_class_register(&gart_sysdev_class); 682 register_syscore_ops(&gart_syscore_ops);
682 if (!error)
683 error = sysdev_register(&device_gart);
684 if (error)
685 panic("Could not register gart_sysdev -- "
686 "would corrupt data on next suspend");
687 683
688 flush_gart(); 684 flush_gart();
689 685
@@ -718,10 +714,13 @@ static void gart_iommu_shutdown(void)
718 if (!no_agp) 714 if (!no_agp)
719 return; 715 return;
720 716
721 for (i = 0; i < num_k8_northbridges; i++) { 717 if (!amd_nb_has_feature(AMD_NB_GART))
718 return;
719
720 for (i = 0; i < amd_nb_num(); i++) {
722 u32 ctl; 721 u32 ctl;
723 722
724 dev = k8_northbridges[i]; 723 dev = node_to_amd_nb(i)->misc;
725 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl); 724 pci_read_config_dword(dev, AMD64_GARTAPERTURECTL, &ctl);
726 725
727 ctl &= ~GARTEN; 726 ctl &= ~GARTEN;
@@ -739,14 +738,14 @@ int __init gart_iommu_init(void)
739 unsigned long scratch; 738 unsigned long scratch;
740 long i; 739 long i;
741 740
742 if (num_k8_northbridges == 0) 741 if (!amd_nb_has_feature(AMD_NB_GART))
743 return 0; 742 return 0;
744 743
745#ifndef CONFIG_AGP_AMD64 744#ifndef CONFIG_AGP_AMD64
746 no_agp = 1; 745 no_agp = 1;
747#else 746#else
748 /* Makefile puts PCI initialization via subsys_initcall first. */ 747 /* Makefile puts PCI initialization via subsys_initcall first. */
749 /* Add other K8 AGP bridge drivers here */ 748 /* Add other AMD AGP bridge drivers here */
750 no_agp = no_agp || 749 no_agp = no_agp ||
751 (agp_amd64_init() < 0) || 750 (agp_amd64_init() < 0) ||
752 (agp_copy_info(agp_bridge, &info) < 0); 751 (agp_copy_info(agp_bridge, &info) < 0);
@@ -755,7 +754,7 @@ int __init gart_iommu_init(void)
755 if (no_iommu || 754 if (no_iommu ||
756 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 755 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
757 !gart_iommu_aperture || 756 !gart_iommu_aperture ||
758 (no_agp && init_k8_gatt(&info) < 0)) { 757 (no_agp && init_amd_gatt(&info) < 0)) {
759 if (max_pfn > MAX_DMA32_PFN) { 758 if (max_pfn > MAX_DMA32_PFN) {
760 pr_warning("More than 4GB of memory but GART IOMMU not available.\n"); 759 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
761 pr_warning("falling back to iommu=soft.\n"); 760 pr_warning("falling back to iommu=soft.\n");
@@ -896,3 +895,4 @@ void __init gart_parse_options(char *p)
896 } 895 }
897 } 896 }
898} 897}
898IOMMU_INIT_POST(gart_iommu_hole_init);
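
The amd_gart_64.c hunk above adds an up-front bounds check in dma_map_area(): the GART can only remap physical addresses below 1TB, so a mapping whose end would cross GART_MAX_PHYS_ADDR is rejected with bad_dma_addr before any aperture pages are allocated. A minimal standalone sketch of that check follows; BAD_DMA_ADDR and map_area() are invented stand-ins, not kernel symbols.

/* Sketch (not kernel code) of the 1TB GART bounds check. */
#include <stdio.h>
#include <stdint.h>

#define GART_MAX_PHYS_ADDR	(1ULL << 40)	/* 1TB, as in the patch */
#define BAD_DMA_ADDR		(~0ULL)		/* stand-in for bad_dma_addr */

static uint64_t map_area(uint64_t phys, uint64_t size)
{
	if (phys + size > GART_MAX_PHYS_ADDR)
		return BAD_DMA_ADDR;
	return phys;	/* real code would allocate GART pages here */
}

int main(void)
{
	printf("%llx\n", (unsigned long long)map_area(1ULL << 30, 4096));	/* mapped */
	printf("%llx\n", (unsigned long long)map_area(1ULL << 40, 4096));	/* rejected */
	return 0;
}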
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 679b6450382b..7c3a95e54ec5 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/pci-ats.h>
21#include <linux/bitmap.h> 22#include <linux/bitmap.h>
22#include <linux/slab.h> 23#include <linux/slab.h>
23#include <linux/debugfs.h> 24#include <linux/debugfs.h>
@@ -25,16 +26,18 @@
25#include <linux/dma-mapping.h> 26#include <linux/dma-mapping.h>
26#include <linux/iommu-helper.h> 27#include <linux/iommu-helper.h>
27#include <linux/iommu.h> 28#include <linux/iommu.h>
29#include <linux/delay.h>
28#include <asm/proto.h> 30#include <asm/proto.h>
29#include <asm/iommu.h> 31#include <asm/iommu.h>
30#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/dma.h>
31#include <asm/amd_iommu_proto.h> 34#include <asm/amd_iommu_proto.h>
32#include <asm/amd_iommu_types.h> 35#include <asm/amd_iommu_types.h>
33#include <asm/amd_iommu.h> 36#include <asm/amd_iommu.h>
34 37
35#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28)) 38#define CMD_SET_TYPE(cmd, t) ((cmd)->data[1] |= ((t) << 28))
36 39
37#define EXIT_LOOP_COUNT 10000000 40#define LOOP_TIMEOUT 100000
38 41
39static DEFINE_RWLOCK(amd_iommu_devtable_lock); 42static DEFINE_RWLOCK(amd_iommu_devtable_lock);
40 43
@@ -57,7 +60,6 @@ struct iommu_cmd {
57 u32 data[4]; 60 u32 data[4];
58}; 61};
59 62
60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
61static void update_domain(struct protection_domain *domain); 63static void update_domain(struct protection_domain *domain);
62 64
63/**************************************************************************** 65/****************************************************************************
@@ -153,6 +155,10 @@ static int iommu_init_device(struct device *dev)
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff); 155 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev) 156 if (pdev)
155 dev_data->alias = &pdev->dev; 157 dev_data->alias = &pdev->dev;
158 else {
159 kfree(dev_data);
160 return -ENOTSUPP;
161 }
156 162
157 atomic_set(&dev_data->bind, 0); 163 atomic_set(&dev_data->bind, 0);
158 164
@@ -162,6 +168,20 @@ static int iommu_init_device(struct device *dev)
162 return 0; 168 return 0;
163} 169}
164 170
171static void iommu_ignore_device(struct device *dev)
172{
173 u16 devid, alias;
174
175 devid = get_device_id(dev);
176 alias = amd_iommu_alias_table[devid];
177
178 memset(&amd_iommu_dev_table[devid], 0, sizeof(struct dev_table_entry));
179 memset(&amd_iommu_dev_table[alias], 0, sizeof(struct dev_table_entry));
180
181 amd_iommu_rlookup_table[devid] = NULL;
182 amd_iommu_rlookup_table[alias] = NULL;
183}
184
165static void iommu_uninit_device(struct device *dev) 185static void iommu_uninit_device(struct device *dev)
166{ 186{
167 kfree(dev->archdata.iommu); 187 kfree(dev->archdata.iommu);
@@ -191,7 +211,9 @@ int __init amd_iommu_init_devices(void)
191 continue; 211 continue;
192 212
193 ret = iommu_init_device(&pdev->dev); 213 ret = iommu_init_device(&pdev->dev);
194 if (ret) 214 if (ret == -ENOTSUPP)
215 iommu_ignore_device(&pdev->dev);
216 else if (ret)
195 goto out_free; 217 goto out_free;
196 } 218 }
197 219
@@ -322,8 +344,6 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
322 break; 344 break;
323 case EVENT_TYPE_ILL_CMD: 345 case EVENT_TYPE_ILL_CMD:
324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 346 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 dump_command(address); 347 dump_command(address);
328 break; 348 break;
329 case EVENT_TYPE_CMD_HARD_ERR: 349 case EVENT_TYPE_CMD_HARD_ERR:
@@ -367,7 +387,7 @@ static void iommu_poll_events(struct amd_iommu *iommu)
367 spin_unlock_irqrestore(&iommu->lock, flags); 387 spin_unlock_irqrestore(&iommu->lock, flags);
368} 388}
369 389
370irqreturn_t amd_iommu_int_handler(int irq, void *data) 390irqreturn_t amd_iommu_int_thread(int irq, void *data)
371{ 391{
372 struct amd_iommu *iommu; 392 struct amd_iommu *iommu;
373 393
@@ -377,192 +397,300 @@ irqreturn_t amd_iommu_int_handler(int irq, void *data)
377 return IRQ_HANDLED; 397 return IRQ_HANDLED;
378} 398}
379 399
400irqreturn_t amd_iommu_int_handler(int irq, void *data)
401{
402 return IRQ_WAKE_THREAD;
403}
404
380/**************************************************************************** 405/****************************************************************************
381 * 406 *
382 * IOMMU command queuing functions 407 * IOMMU command queuing functions
383 * 408 *
384 ****************************************************************************/ 409 ****************************************************************************/
385 410
386/* 411static int wait_on_sem(volatile u64 *sem)
387 * Writes the command to the IOMMUs command buffer and informs the 412{
388 * hardware about the new command. Must be called with iommu->lock held. 413 int i = 0;
389 */ 414
390static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 415 while (*sem == 0 && i < LOOP_TIMEOUT) {
416 udelay(1);
417 i += 1;
418 }
419
420 if (i == LOOP_TIMEOUT) {
421 pr_alert("AMD-Vi: Completion-Wait loop timed out\n");
422 return -EIO;
423 }
424
425 return 0;
426}
427
428static void copy_cmd_to_buffer(struct amd_iommu *iommu,
429 struct iommu_cmd *cmd,
430 u32 tail)
391{ 431{
392 u32 tail, head;
393 u8 *target; 432 u8 *target;
394 433
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
397 target = iommu->cmd_buf + tail; 434 target = iommu->cmd_buf + tail;
398 memcpy_toio(target, cmd, sizeof(*cmd)); 435 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
399 tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size; 436
400 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET); 437 /* Copy command to buffer */
401 if (tail == head) 438 memcpy(target, cmd, sizeof(*cmd));
402 return -ENOMEM; 439
440 /* Tell the IOMMU about it */
403 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 441 writel(tail, iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
442}
404 443
405 return 0; 444static void build_completion_wait(struct iommu_cmd *cmd, u64 address)
445{
446 WARN_ON(address & 0x7ULL);
447
448 memset(cmd, 0, sizeof(*cmd));
449 cmd->data[0] = lower_32_bits(__pa(address)) | CMD_COMPL_WAIT_STORE_MASK;
450 cmd->data[1] = upper_32_bits(__pa(address));
451 cmd->data[2] = 1;
452 CMD_SET_TYPE(cmd, CMD_COMPL_WAIT);
453}
454
455static void build_inv_dte(struct iommu_cmd *cmd, u16 devid)
456{
457 memset(cmd, 0, sizeof(*cmd));
458 cmd->data[0] = devid;
459 CMD_SET_TYPE(cmd, CMD_INV_DEV_ENTRY);
460}
461
462static void build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
463 size_t size, u16 domid, int pde)
464{
465 u64 pages;
466 int s;
467
468 pages = iommu_num_pages(address, size, PAGE_SIZE);
469 s = 0;
470
471 if (pages > 1) {
472 /*
473 * If we have to flush more than one page, flush all
474 * TLB entries for this domain
475 */
476 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
477 s = 1;
478 }
479
480 address &= PAGE_MASK;
481
482 memset(cmd, 0, sizeof(*cmd));
483 cmd->data[1] |= domid;
484 cmd->data[2] = lower_32_bits(address);
485 cmd->data[3] = upper_32_bits(address);
486 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
487 if (s) /* size bit - we flush more than one 4kb page */
488 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
 489	if (pde) /* PDE bit - we want to flush everything not only the PTEs */
490 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
491}
492
493static void build_inv_iotlb_pages(struct iommu_cmd *cmd, u16 devid, int qdep,
494 u64 address, size_t size)
495{
496 u64 pages;
497 int s;
498
499 pages = iommu_num_pages(address, size, PAGE_SIZE);
500 s = 0;
501
502 if (pages > 1) {
503 /*
504 * If we have to flush more than one page, flush all
505 * TLB entries for this domain
506 */
507 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
508 s = 1;
509 }
510
511 address &= PAGE_MASK;
512
513 memset(cmd, 0, sizeof(*cmd));
514 cmd->data[0] = devid;
515 cmd->data[0] |= (qdep & 0xff) << 24;
516 cmd->data[1] = devid;
517 cmd->data[2] = lower_32_bits(address);
518 cmd->data[3] = upper_32_bits(address);
519 CMD_SET_TYPE(cmd, CMD_INV_IOTLB_PAGES);
520 if (s)
521 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
522}
523
524static void build_inv_all(struct iommu_cmd *cmd)
525{
526 memset(cmd, 0, sizeof(*cmd));
527 CMD_SET_TYPE(cmd, CMD_INV_ALL);
406} 528}
407 529
408/* 530/*
409 * General queuing function for commands. Takes iommu->lock and calls 531 * Writes the command to the IOMMUs command buffer and informs the
410 * __iommu_queue_command(). 532 * hardware about the new command.
411 */ 533 */
412static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd) 534static int iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
413{ 535{
536 u32 left, tail, head, next_tail;
414 unsigned long flags; 537 unsigned long flags;
415 int ret;
416 538
539 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
540
541again:
417 spin_lock_irqsave(&iommu->lock, flags); 542 spin_lock_irqsave(&iommu->lock, flags);
418 ret = __iommu_queue_command(iommu, cmd);
419 if (!ret)
420 iommu->need_sync = true;
421 spin_unlock_irqrestore(&iommu->lock, flags);
422 543
423 return ret; 544 head = readl(iommu->mmio_base + MMIO_CMD_HEAD_OFFSET);
424} 545 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
546 next_tail = (tail + sizeof(*cmd)) % iommu->cmd_buf_size;
547 left = (head - next_tail) % iommu->cmd_buf_size;
425 548
426/* 549 if (left <= 2) {
427 * This function waits until an IOMMU has completed a completion 550 struct iommu_cmd sync_cmd;
428 * wait command 551 volatile u64 sem = 0;
429 */ 552 int ret;
430static void __iommu_wait_for_completion(struct amd_iommu *iommu)
431{
432 int ready = 0;
433 unsigned status = 0;
434 unsigned long i = 0;
435 553
436 INC_STATS_COUNTER(compl_wait); 554 build_completion_wait(&sync_cmd, (u64)&sem);
555 copy_cmd_to_buffer(iommu, &sync_cmd, tail);
437 556
438 while (!ready && (i < EXIT_LOOP_COUNT)) { 557 spin_unlock_irqrestore(&iommu->lock, flags);
439 ++i; 558
440 /* wait for the bit to become one */ 559 if ((ret = wait_on_sem(&sem)) != 0)
441 status = readl(iommu->mmio_base + MMIO_STATUS_OFFSET); 560 return ret;
442 ready = status & MMIO_STATUS_COM_WAIT_INT_MASK; 561
562 goto again;
443 } 563 }
444 564
445 /* set bit back to zero */ 565 copy_cmd_to_buffer(iommu, cmd, tail);
446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 566
447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 567 /* We need to sync now to make sure all commands are processed */
568 iommu->need_sync = true;
448 569
449 if (unlikely(i == EXIT_LOOP_COUNT)) 570 spin_unlock_irqrestore(&iommu->lock, flags);
450 iommu->reset_in_progress = true; 571
572 return 0;
451} 573}
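The free-space check in the new iommu_queue_command() relies on unsigned wrap-around: next_tail is where the tail will be after this command, and (head - next_tail) modulo the buffer size is how much room remains before the driver catches up with the hardware's read pointer. A user-space sketch of the same ring arithmetic (buffer and command sizes here are illustrative, not read from the hardware):

#include <stdio.h>
#include <stdint.h>

#define CMD_BUF_SIZE 4096u	/* illustrative ring size */
#define CMD_SIZE       16u	/* one command entry */

static uint32_t bytes_left(uint32_t head, uint32_t tail)
{
	uint32_t next_tail = (tail + CMD_SIZE) % CMD_BUF_SIZE;

	/* unsigned subtraction wraps, so this also works when head < next_tail */
	return (head - next_tail) % CMD_BUF_SIZE;
}

int main(void)
{
	/* read pointer just ahead of the new tail: one 16-byte slot of room */
	printf("left=%u\n", bytes_left(64, 32));

	/* read pointer far ahead of the tail: 2000 bytes of room */
	printf("left=%u\n", bytes_left(2048, 32));
	return 0;
}

When the result drops to 2 or below, the function queues a completion wait and retries, which is exactly the "left <= 2" branch above.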
452 574
453/* 575/*
454 * This function queues a completion wait command into the command 576 * This function queues a completion wait command into the command
455 * buffer of an IOMMU 577 * buffer of an IOMMU
456 */ 578 */
457static int __iommu_completion_wait(struct amd_iommu *iommu) 579static int iommu_completion_wait(struct amd_iommu *iommu)
458{ 580{
459 struct iommu_cmd cmd; 581 struct iommu_cmd cmd;
582 volatile u64 sem = 0;
583 int ret;
460 584
461 memset(&cmd, 0, sizeof(cmd)); 585 if (!iommu->need_sync)
462 cmd.data[0] = CMD_COMPL_WAIT_INT_MASK; 586 return 0;
463 CMD_SET_TYPE(&cmd, CMD_COMPL_WAIT);
464 587
465 return __iommu_queue_command(iommu, &cmd); 588 build_completion_wait(&cmd, (u64)&sem);
589
590 ret = iommu_queue_command(iommu, &cmd);
591 if (ret)
592 return ret;
593
594 return wait_on_sem(&sem);
466} 595}
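The completion-wait pattern is: build a COMPLETION_WAIT command that points at a local semaphore, queue it, then spin until the IOMMU performs the store. wait_on_sem() itself is not part of this hunk, so the sketch below is only an assumed shape for such a bounded poll loop, not the driver's actual implementation (the loop limit is invented):

#include <stdio.h>
#include <errno.h>
#include <stdint.h>

#define LOOP_TIMEOUT 1000000UL	/* arbitrary bound, for illustration only */

/* Spin until *sem becomes non-zero or the bounded loop expires. */
static int wait_on_sem_sketch(volatile uint64_t *sem)
{
	unsigned long i;

	for (i = 0; i < LOOP_TIMEOUT; i++) {
		if (*sem != 0)
			return 0;
		/* the real driver would likely relax the CPU here */
	}

	return -EIO;	/* hardware never signalled completion */
}

int main(void)
{
	volatile uint64_t sem = 0;

	sem = 1;	/* pretend the IOMMU performed the COMPL_WAIT store */
	printf("wait_on_sem_sketch() = %d\n", wait_on_sem_sketch(&sem));
	return 0;
}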
467 596
468/* 597static int iommu_flush_dte(struct amd_iommu *iommu, u16 devid)
469 * This function is called whenever we need to ensure that the IOMMU has
470 * completed execution of all commands we sent. It sends a
471 * COMPLETION_WAIT command and waits for it to finish. The IOMMU informs
472 * us about that by writing a value to a physical address we pass with
473 * the command.
474 */
475static int iommu_completion_wait(struct amd_iommu *iommu)
476{ 598{
477 int ret = 0; 599 struct iommu_cmd cmd;
478 unsigned long flags;
479
480 spin_lock_irqsave(&iommu->lock, flags);
481 600
482 if (!iommu->need_sync) 601 build_inv_dte(&cmd, devid);
483 goto out;
484 602
485 ret = __iommu_completion_wait(iommu); 603 return iommu_queue_command(iommu, &cmd);
604}
486 605
487 iommu->need_sync = false; 606static void iommu_flush_dte_all(struct amd_iommu *iommu)
607{
608 u32 devid;
488 609
489 if (ret) 610 for (devid = 0; devid <= 0xffff; ++devid)
490 goto out; 611 iommu_flush_dte(iommu, devid);
491 612
492 __iommu_wait_for_completion(iommu); 613 iommu_completion_wait(iommu);
614}
493 615
494out: 616/*
495 spin_unlock_irqrestore(&iommu->lock, flags); 617 * This function uses heavy locking and may disable irqs for some time. But
618 * this is no issue because it is only called during resume.
619 */
620static void iommu_flush_tlb_all(struct amd_iommu *iommu)
621{
622 u32 dom_id;
496 623
497 if (iommu->reset_in_progress) 624 for (dom_id = 0; dom_id <= 0xffff; ++dom_id) {
498 reset_iommu_command_buffer(iommu); 625 struct iommu_cmd cmd;
626 build_inv_iommu_pages(&cmd, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS,
627 dom_id, 1);
628 iommu_queue_command(iommu, &cmd);
629 }
499 630
500 return 0; 631 iommu_completion_wait(iommu);
501} 632}
502 633
503static void iommu_flush_complete(struct protection_domain *domain) 634static void iommu_flush_all(struct amd_iommu *iommu)
504{ 635{
505 int i; 636 struct iommu_cmd cmd;
506 637
507 for (i = 0; i < amd_iommus_present; ++i) { 638 build_inv_all(&cmd);
508 if (!domain->dev_iommu[i])
509 continue;
510 639
511 /* 640 iommu_queue_command(iommu, &cmd);
512 * Devices of this domain are behind this IOMMU 641 iommu_completion_wait(iommu);
513 * We need to wait for completion of all commands. 642}
514 */ 643
515 iommu_completion_wait(amd_iommus[i]); 644void iommu_flush_all_caches(struct amd_iommu *iommu)
645{
646 if (iommu_feature(iommu, FEATURE_IA)) {
647 iommu_flush_all(iommu);
648 } else {
649 iommu_flush_dte_all(iommu);
650 iommu_flush_tlb_all(iommu);
516 } 651 }
517} 652}
518 653
519/* 654/*
520 * Command send function for invalidating a device table entry 655 * Command send function for flushing on-device TLB
521 */ 656 */
522static int iommu_flush_device(struct device *dev) 657static int device_flush_iotlb(struct device *dev, u64 address, size_t size)
523{ 658{
659 struct pci_dev *pdev = to_pci_dev(dev);
524 struct amd_iommu *iommu; 660 struct amd_iommu *iommu;
525 struct iommu_cmd cmd; 661 struct iommu_cmd cmd;
526 u16 devid; 662 u16 devid;
663 int qdep;
527 664
665 qdep = pci_ats_queue_depth(pdev);
528 devid = get_device_id(dev); 666 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid]; 667 iommu = amd_iommu_rlookup_table[devid];
530 668
531 /* Build command */ 669 build_inv_iotlb_pages(&cmd, devid, qdep, address, size);
532 memset(&cmd, 0, sizeof(cmd));
533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
534 cmd.data[0] = devid;
535 670
536 return iommu_queue_command(iommu, &cmd); 671 return iommu_queue_command(iommu, &cmd);
537} 672}
538 673
539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
540 u16 domid, int pde, int s)
541{
542 memset(cmd, 0, sizeof(*cmd));
543 address &= PAGE_MASK;
544 CMD_SET_TYPE(cmd, CMD_INV_IOMMU_PAGES);
545 cmd->data[1] |= domid;
546 cmd->data[2] = lower_32_bits(address);
547 cmd->data[3] = upper_32_bits(address);
548 if (s) /* size bit - we flush more than one 4kb page */
549 cmd->data[2] |= CMD_INV_IOMMU_PAGES_SIZE_MASK;
550 if (pde) /* PDE bit - we wan't flush everything not only the PTEs */
551 cmd->data[2] |= CMD_INV_IOMMU_PAGES_PDE_MASK;
552}
553
554/* 674/*
555 * Generic command send function for invalidaing TLB entries 675 * Command send function for invalidating a device table entry
556 */ 676 */
557static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu, 677static int device_flush_dte(struct device *dev)
558 u64 address, u16 domid, int pde, int s)
559{ 678{
560 struct iommu_cmd cmd; 679 struct amd_iommu *iommu;
680 struct pci_dev *pdev;
681 u16 devid;
561 int ret; 682 int ret;
562 683
563 __iommu_build_inv_iommu_pages(&cmd, address, domid, pde, s); 684 pdev = to_pci_dev(dev);
685 devid = get_device_id(dev);
686 iommu = amd_iommu_rlookup_table[devid];
564 687
565 ret = iommu_queue_command(iommu, &cmd); 688 ret = iommu_flush_dte(iommu, devid);
689 if (ret)
690 return ret;
691
692 if (pci_ats_enabled(pdev))
693 ret = device_flush_iotlb(dev, 0, ~0UL);
566 694
567 return ret; 695 return ret;
568} 696}
@@ -572,23 +700,14 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
572 * It invalidates a single PTE if the range to flush is within a single 700 * It invalidates a single PTE if the range to flush is within a single
573 * page. Otherwise it flushes the whole TLB of the IOMMU. 701 * page. Otherwise it flushes the whole TLB of the IOMMU.
574 */ 702 */
575static void __iommu_flush_pages(struct protection_domain *domain, 703static void __domain_flush_pages(struct protection_domain *domain,
576 u64 address, size_t size, int pde) 704 u64 address, size_t size, int pde)
577{ 705{
578 int s = 0, i; 706 struct iommu_dev_data *dev_data;
579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE); 707 struct iommu_cmd cmd;
580 708 int ret = 0, i;
581 address &= PAGE_MASK;
582
583 if (pages > 1) {
584 /*
585 * If we have to flush more than one page, flush all
586 * TLB entries for this domain
587 */
588 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS;
589 s = 1;
590 }
591 709
710 build_inv_iommu_pages(&cmd, address, size, domain->id, pde);
592 711
593 for (i = 0; i < amd_iommus_present; ++i) { 712 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i]) 713 if (!domain->dev_iommu[i])
@@ -598,101 +717,70 @@ static void __iommu_flush_pages(struct protection_domain *domain,
598 * Devices of this domain are behind this IOMMU 717 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush 718 * We need a TLB flush
600 */ 719 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address, 720 ret |= iommu_queue_command(amd_iommus[i], &cmd);
602 domain->id, pde, s); 721 }
722
723 list_for_each_entry(dev_data, &domain->dev_list, list) {
724 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
725
726 if (!pci_ats_enabled(pdev))
727 continue;
728
729 ret |= device_flush_iotlb(dev_data->dev, address, size);
603 } 730 }
604 731
605 return; 732 WARN_ON(ret);
606} 733}
607 734
608static void iommu_flush_pages(struct protection_domain *domain, 735static void domain_flush_pages(struct protection_domain *domain,
609 u64 address, size_t size) 736 u64 address, size_t size)
610{ 737{
611 __iommu_flush_pages(domain, address, size, 0); 738 __domain_flush_pages(domain, address, size, 0);
612} 739}
613 740
614/* Flush the whole IO/TLB for a given protection domain */ 741/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain) 742static void domain_flush_tlb(struct protection_domain *domain)
616{ 743{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0); 744 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
618} 745}
619 746
620/* Flush the whole IO/TLB for a given protection domain - including PDE */ 747/* Flush the whole IO/TLB for a given protection domain - including PDE */
621static void iommu_flush_tlb_pde(struct protection_domain *domain) 748static void domain_flush_tlb_pde(struct protection_domain *domain)
622{
623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
624}
625
626
627/*
628 * This function flushes the DTEs for all devices in domain
629 */
630static void iommu_flush_domain_devices(struct protection_domain *domain)
631{ 749{
632 struct iommu_dev_data *dev_data; 750 __domain_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
633 unsigned long flags;
634
635 spin_lock_irqsave(&domain->lock, flags);
636
637 list_for_each_entry(dev_data, &domain->dev_list, list)
638 iommu_flush_device(dev_data->dev);
639
640 spin_unlock_irqrestore(&domain->lock, flags);
641} 751}
642 752
643static void iommu_flush_all_domain_devices(void) 753static void domain_flush_complete(struct protection_domain *domain)
644{ 754{
645 struct protection_domain *domain; 755 int i;
646 unsigned long flags;
647 756
648 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 757 for (i = 0; i < amd_iommus_present; ++i) {
758 if (!domain->dev_iommu[i])
759 continue;
649 760
650 list_for_each_entry(domain, &amd_iommu_pd_list, list) { 761 /*
651 iommu_flush_domain_devices(domain); 762 * Devices of this domain are behind this IOMMU
652 iommu_flush_complete(domain); 763 * We need to wait for completion of all commands.
764 */
765 iommu_completion_wait(amd_iommus[i]);
653 } 766 }
654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656} 767}
657 768
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
661}
662 769
663/* 770/*
664 * This function uses heavy locking and may disable irqs for some time. But 771 * This function flushes the DTEs for all devices in domain
665 * this is no issue because it is only called during resume.
666 */ 772 */
667void amd_iommu_flush_all_domains(void) 773static void domain_flush_devices(struct protection_domain *domain)
668{ 774{
669 struct protection_domain *domain; 775 struct iommu_dev_data *dev_data;
670 unsigned long flags; 776 unsigned long flags;
671 777
672 spin_lock_irqsave(&amd_iommu_pd_lock, flags); 778 spin_lock_irqsave(&domain->lock, flags);
673
674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
682}
683
684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
685{
686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
687
688 if (iommu->reset_in_progress)
689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690 779
691 amd_iommu_reset_cmd_buffer(iommu); 780 list_for_each_entry(dev_data, &domain->dev_list, list)
692 amd_iommu_flush_all_devices(); 781 device_flush_dte(dev_data->dev);
693 amd_iommu_flush_all_domains();
694 782
695 iommu->reset_in_progress = false; 783 spin_unlock_irqrestore(&domain->lock, flags);
696} 784}
697 785
698/**************************************************************************** 786/****************************************************************************
@@ -1086,7 +1174,7 @@ static int alloc_new_range(struct dma_ops_domain *dma_dom,
1086 1174
1087 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1175 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
1088 1176
1089 /* Intialize the exclusion range if necessary */ 1177 /* Initialize the exclusion range if necessary */
1090 for_each_iommu(iommu) { 1178 for_each_iommu(iommu) {
1091 if (iommu->exclusion_start && 1179 if (iommu->exclusion_start &&
1092 iommu->exclusion_start >= dma_dom->aperture[index]->offset 1180 iommu->exclusion_start >= dma_dom->aperture[index]->offset
@@ -1353,7 +1441,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1353 1441
1354/* 1442/*
1355 * Allocates a new protection domain usable for the dma_ops functions. 1443 * Allocates a new protection domain usable for the dma_ops functions.
1356 * It also intializes the page table and the address allocator data 1444 * It also initializes the page table and the address allocator data
1357 * structures required for the dma_ops interface 1445 * structures required for the dma_ops interface
1358 */ 1446 */
1359static struct dma_ops_domain *dma_ops_domain_alloc(void) 1447static struct dma_ops_domain *dma_ops_domain_alloc(void)
@@ -1410,17 +1498,22 @@ static bool dma_ops_domain(struct protection_domain *domain)
1410 return domain->flags & PD_DMA_OPS_MASK; 1498 return domain->flags & PD_DMA_OPS_MASK;
1411} 1499}
1412 1500
1413static void set_dte_entry(u16 devid, struct protection_domain *domain) 1501static void set_dte_entry(u16 devid, struct protection_domain *domain, bool ats)
1414{ 1502{
1415 u64 pte_root = virt_to_phys(domain->pt_root); 1503 u64 pte_root = virt_to_phys(domain->pt_root);
1504 u32 flags = 0;
1416 1505
1417 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK) 1506 pte_root |= (domain->mode & DEV_ENTRY_MODE_MASK)
1418 << DEV_ENTRY_MODE_SHIFT; 1507 << DEV_ENTRY_MODE_SHIFT;
1419 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV; 1508 pte_root |= IOMMU_PTE_IR | IOMMU_PTE_IW | IOMMU_PTE_P | IOMMU_PTE_TV;
1420 1509
1421 amd_iommu_dev_table[devid].data[2] = domain->id; 1510 if (ats)
1422 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1511 flags |= DTE_FLAG_IOTLB;
1423 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1512
1513 amd_iommu_dev_table[devid].data[3] |= flags;
1514 amd_iommu_dev_table[devid].data[2] = domain->id;
1515 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1516 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1424} 1517}
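set_dte_entry() composes the device table entry from the page-table root pointer, the paging mode and the permission bits, and data[3] now additionally carries the IOTLB flag when ATS is enabled. The sketch below only illustrates that style of bit composition; every shift value and flag position in it is invented and does not reflect the real AMD device table layout:

#include <stdio.h>
#include <stdint.h>

/* illustrative field positions only -- not the real device table layout */
#define MODE_SHIFT	9
#define FLAG_VALID	(1ULL << 0)
#define FLAG_TRANS	(1ULL << 1)

static uint64_t make_entry(uint64_t pt_root_phys, unsigned int mode)
{
	uint64_t e = pt_root_phys & ~0xfffULL;	/* page-aligned root pointer */

	e |= (uint64_t)(mode & 0x7) << MODE_SHIFT;
	e |= FLAG_VALID | FLAG_TRANS;
	return e;
}

int main(void)
{
	uint64_t e = make_entry(0x123456000ULL, 3);

	printf("low  32 bits: 0x%08x\n", (unsigned int)(e & 0xffffffff));
	printf("high 32 bits: 0x%08x\n", (unsigned int)(e >> 32));
	return 0;
}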
1425 1518
1426static void clear_dte_entry(u16 devid) 1519static void clear_dte_entry(u16 devid)
@@ -1437,23 +1530,29 @@ static void do_attach(struct device *dev, struct protection_domain *domain)
1437{ 1530{
1438 struct iommu_dev_data *dev_data; 1531 struct iommu_dev_data *dev_data;
1439 struct amd_iommu *iommu; 1532 struct amd_iommu *iommu;
1533 struct pci_dev *pdev;
1534 bool ats = false;
1440 u16 devid; 1535 u16 devid;
1441 1536
1442 devid = get_device_id(dev); 1537 devid = get_device_id(dev);
1443 iommu = amd_iommu_rlookup_table[devid]; 1538 iommu = amd_iommu_rlookup_table[devid];
1444 dev_data = get_dev_data(dev); 1539 dev_data = get_dev_data(dev);
1540 pdev = to_pci_dev(dev);
1541
1542 if (amd_iommu_iotlb_sup)
1543 ats = pci_ats_enabled(pdev);
1445 1544
1446 /* Update data structures */ 1545 /* Update data structures */
1447 dev_data->domain = domain; 1546 dev_data->domain = domain;
1448 list_add(&dev_data->list, &domain->dev_list); 1547 list_add(&dev_data->list, &domain->dev_list);
1449 set_dte_entry(devid, domain); 1548 set_dte_entry(devid, domain, ats);
1450 1549
1451 /* Do reference counting */ 1550 /* Do reference counting */
1452 domain->dev_iommu[iommu->index] += 1; 1551 domain->dev_iommu[iommu->index] += 1;
1453 domain->dev_cnt += 1; 1552 domain->dev_cnt += 1;
1454 1553
1455 /* Flush the DTE entry */ 1554 /* Flush the DTE entry */
1456 iommu_flush_device(dev); 1555 device_flush_dte(dev);
1457} 1556}
1458 1557
1459static void do_detach(struct device *dev) 1558static void do_detach(struct device *dev)
@@ -1476,7 +1575,7 @@ static void do_detach(struct device *dev)
1476 clear_dte_entry(devid); 1575 clear_dte_entry(devid);
1477 1576
1478 /* Flush the DTE entry */ 1577 /* Flush the DTE entry */
1479 iommu_flush_device(dev); 1578 device_flush_dte(dev);
1480} 1579}
1481 1580
1482/* 1581/*
@@ -1539,9 +1638,13 @@ out_unlock:
1539static int attach_device(struct device *dev, 1638static int attach_device(struct device *dev,
1540 struct protection_domain *domain) 1639 struct protection_domain *domain)
1541{ 1640{
1641 struct pci_dev *pdev = to_pci_dev(dev);
1542 unsigned long flags; 1642 unsigned long flags;
1543 int ret; 1643 int ret;
1544 1644
1645 if (amd_iommu_iotlb_sup)
1646 pci_enable_ats(pdev, PAGE_SHIFT);
1647
1545 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1648 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1546 ret = __attach_device(dev, domain); 1649 ret = __attach_device(dev, domain);
1547 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1650 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
@@ -1551,7 +1654,7 @@ static int attach_device(struct device *dev,
1551 * left the caches in the IOMMU dirty. So we have to flush 1654 * left the caches in the IOMMU dirty. So we have to flush
1552 * here to evict all dirty stuff. 1655 * here to evict all dirty stuff.
1553 */ 1656 */
1554 iommu_flush_tlb_pde(domain); 1657 domain_flush_tlb_pde(domain);
1555 1658
1556 return ret; 1659 return ret;
1557} 1660}
@@ -1598,12 +1701,16 @@ static void __detach_device(struct device *dev)
1598 */ 1701 */
1599static void detach_device(struct device *dev) 1702static void detach_device(struct device *dev)
1600{ 1703{
1704 struct pci_dev *pdev = to_pci_dev(dev);
1601 unsigned long flags; 1705 unsigned long flags;
1602 1706
1603 /* lock device table */ 1707 /* lock device table */
1604 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1708 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1605 __detach_device(dev); 1709 __detach_device(dev);
1606 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1710 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1711
1712 if (amd_iommu_iotlb_sup && pci_ats_enabled(pdev))
1713 pci_disable_ats(pdev);
1607} 1714}
1608 1715
1609/* 1716/*
@@ -1615,10 +1722,9 @@ static struct protection_domain *domain_for_device(struct device *dev)
1615 struct protection_domain *dom; 1722 struct protection_domain *dom;
1616 struct iommu_dev_data *dev_data, *alias_data; 1723 struct iommu_dev_data *dev_data, *alias_data;
1617 unsigned long flags; 1724 unsigned long flags;
1618 u16 devid, alias; 1725 u16 devid;
1619 1726
1620 devid = get_device_id(dev); 1727 devid = get_device_id(dev);
1621 alias = amd_iommu_alias_table[devid];
1622 dev_data = get_dev_data(dev); 1728 dev_data = get_dev_data(dev);
1623 alias_data = get_dev_data(dev_data->alias); 1729 alias_data = get_dev_data(dev_data->alias);
1624 if (!alias_data) 1730 if (!alias_data)
@@ -1692,7 +1798,7 @@ static int device_change_notifier(struct notifier_block *nb,
1692 goto out; 1798 goto out;
1693 } 1799 }
1694 1800
1695 iommu_flush_device(dev); 1801 device_flush_dte(dev);
1696 iommu_completion_wait(iommu); 1802 iommu_completion_wait(iommu);
1697 1803
1698out: 1804out:
@@ -1753,8 +1859,9 @@ static void update_device_table(struct protection_domain *domain)
1753 struct iommu_dev_data *dev_data; 1859 struct iommu_dev_data *dev_data;
1754 1860
1755 list_for_each_entry(dev_data, &domain->dev_list, list) { 1861 list_for_each_entry(dev_data, &domain->dev_list, list) {
1862 struct pci_dev *pdev = to_pci_dev(dev_data->dev);
1756 u16 devid = get_device_id(dev_data->dev); 1863 u16 devid = get_device_id(dev_data->dev);
1757 set_dte_entry(devid, domain); 1864 set_dte_entry(devid, domain, pci_ats_enabled(pdev));
1758 } 1865 }
1759} 1866}
1760 1867
@@ -1764,8 +1871,9 @@ static void update_domain(struct protection_domain *domain)
1764 return; 1871 return;
1765 1872
1766 update_device_table(domain); 1873 update_device_table(domain);
1767 iommu_flush_domain_devices(domain); 1874
1768 iommu_flush_tlb_pde(domain); 1875 domain_flush_devices(domain);
1876 domain_flush_tlb_pde(domain);
1769 1877
1770 domain->updated = false; 1878 domain->updated = false;
1771} 1879}
@@ -1924,10 +2032,10 @@ retry:
1924 ADD_STATS_COUNTER(alloced_io_mem, size); 2032 ADD_STATS_COUNTER(alloced_io_mem, size);
1925 2033
1926 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 2034 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1927 iommu_flush_tlb(&dma_dom->domain); 2035 domain_flush_tlb(&dma_dom->domain);
1928 dma_dom->need_flush = false; 2036 dma_dom->need_flush = false;
1929 } else if (unlikely(amd_iommu_np_cache)) 2037 } else if (unlikely(amd_iommu_np_cache))
1930 iommu_flush_pages(&dma_dom->domain, address, size); 2038 domain_flush_pages(&dma_dom->domain, address, size);
1931 2039
1932out: 2040out:
1933 return address; 2041 return address;
@@ -1976,7 +2084,7 @@ static void __unmap_single(struct dma_ops_domain *dma_dom,
1976 dma_ops_free_addresses(dma_dom, dma_addr, pages); 2084 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1977 2085
1978 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 2086 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1979 iommu_flush_pages(&dma_dom->domain, flush_addr, size); 2087 domain_flush_pages(&dma_dom->domain, flush_addr, size);
1980 dma_dom->need_flush = false; 2088 dma_dom->need_flush = false;
1981 } 2089 }
1982} 2090}
@@ -2012,7 +2120,7 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
2012 if (addr == DMA_ERROR_CODE) 2120 if (addr == DMA_ERROR_CODE)
2013 goto out; 2121 goto out;
2014 2122
2015 iommu_flush_complete(domain); 2123 domain_flush_complete(domain);
2016 2124
2017out: 2125out:
2018 spin_unlock_irqrestore(&domain->lock, flags); 2126 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2039,7 +2147,7 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
2039 2147
2040 __unmap_single(domain->priv, dma_addr, size, dir); 2148 __unmap_single(domain->priv, dma_addr, size, dir);
2041 2149
2042 iommu_flush_complete(domain); 2150 domain_flush_complete(domain);
2043 2151
2044 spin_unlock_irqrestore(&domain->lock, flags); 2152 spin_unlock_irqrestore(&domain->lock, flags);
2045} 2153}
@@ -2104,7 +2212,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
2104 goto unmap; 2212 goto unmap;
2105 } 2213 }
2106 2214
2107 iommu_flush_complete(domain); 2215 domain_flush_complete(domain);
2108 2216
2109out: 2217out:
2110 spin_unlock_irqrestore(&domain->lock, flags); 2218 spin_unlock_irqrestore(&domain->lock, flags);
@@ -2150,7 +2258,7 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
2150 s->dma_address = s->dma_length = 0; 2258 s->dma_address = s->dma_length = 0;
2151 } 2259 }
2152 2260
2153 iommu_flush_complete(domain); 2261 domain_flush_complete(domain);
2154 2262
2155 spin_unlock_irqrestore(&domain->lock, flags); 2263 spin_unlock_irqrestore(&domain->lock, flags);
2156} 2264}
@@ -2200,7 +2308,7 @@ static void *alloc_coherent(struct device *dev, size_t size,
2200 goto out_free; 2308 goto out_free;
2201 } 2309 }
2202 2310
2203 iommu_flush_complete(domain); 2311 domain_flush_complete(domain);
2204 2312
2205 spin_unlock_irqrestore(&domain->lock, flags); 2313 spin_unlock_irqrestore(&domain->lock, flags);
2206 2314
@@ -2232,7 +2340,7 @@ static void free_coherent(struct device *dev, size_t size,
2232 2340
2233 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2341 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2234 2342
2235 iommu_flush_complete(domain); 2343 domain_flush_complete(domain);
2236 2344
2237 spin_unlock_irqrestore(&domain->lock, flags); 2345 spin_unlock_irqrestore(&domain->lock, flags);
2238 2346
@@ -2296,6 +2404,23 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2296 .dma_supported = amd_iommu_dma_supported, 2404 .dma_supported = amd_iommu_dma_supported,
2297}; 2405};
2298 2406
2407static unsigned device_dma_ops_init(void)
2408{
2409 struct pci_dev *pdev = NULL;
2410 unsigned unhandled = 0;
2411
2412 for_each_pci_dev(pdev) {
2413 if (!check_device(&pdev->dev)) {
2414 unhandled += 1;
2415 continue;
2416 }
2417
2418 pdev->dev.archdata.dma_ops = &amd_iommu_dma_ops;
2419 }
2420
2421 return unhandled;
2422}
2423
2299/* 2424/*
2300 * The function which clues the AMD IOMMU driver into dma_ops. 2425 * The function which clues the AMD IOMMU driver into dma_ops.
2301 */ 2426 */
@@ -2308,7 +2433,7 @@ void __init amd_iommu_init_api(void)
2308int __init amd_iommu_init_dma_ops(void) 2433int __init amd_iommu_init_dma_ops(void)
2309{ 2434{
2310 struct amd_iommu *iommu; 2435 struct amd_iommu *iommu;
2311 int ret; 2436 int ret, unhandled;
2312 2437
2313 /* 2438 /*
2314 * first allocate a default protection domain for every IOMMU we 2439 * first allocate a default protection domain for every IOMMU we
@@ -2334,7 +2459,11 @@ int __init amd_iommu_init_dma_ops(void)
2334 swiotlb = 0; 2459 swiotlb = 0;
2335 2460
2336 /* Make the driver finally visible to the drivers */ 2461 /* Make the driver finally visible to the drivers */
2337 dma_ops = &amd_iommu_dma_ops; 2462 unhandled = device_dma_ops_init();
2463 if (unhandled && max_pfn > MAX_DMA32_PFN) {
2464 /* There are unhandled devices - initialize swiotlb for them */
2465 swiotlb = 1;
2466 }
2338 2467
2339 amd_iommu_stats_init(); 2468 amd_iommu_stats_init();
2340 2469
@@ -2476,7 +2605,7 @@ static void amd_iommu_detach_device(struct iommu_domain *dom,
2476 if (!iommu) 2605 if (!iommu)
2477 return; 2606 return;
2478 2607
2479 iommu_flush_device(dev); 2608 device_flush_dte(dev);
2480 iommu_completion_wait(iommu); 2609 iommu_completion_wait(iommu);
2481} 2610}
2482 2611
@@ -2542,7 +2671,7 @@ static int amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova,
2542 unmap_size = iommu_unmap_page(domain, iova, page_size); 2671 unmap_size = iommu_unmap_page(domain, iova, page_size);
2543 mutex_unlock(&domain->api_lock); 2672 mutex_unlock(&domain->api_lock);
2544 2673
2545 iommu_flush_tlb_pde(domain); 2674 domain_flush_tlb_pde(domain);
2546 2675
2547 return get_order(unmap_size); 2676 return get_order(unmap_size);
2548} 2677}
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index 5a170cbbbed8..bfc8453bd98d 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2010 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -21,7 +21,7 @@
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h> 23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/syscore_ops.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
@@ -31,7 +31,7 @@
31#include <asm/iommu.h> 31#include <asm/iommu.h>
32#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h> 33#include <asm/x86_init.h>
34 34#include <asm/iommu_table.h>
35/* 35/*
36 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
37 */ 37 */
@@ -137,6 +137,7 @@ int amd_iommus_present;
137 137
138/* IOMMUs have a non-present cache? */ 138/* IOMMUs have a non-present cache? */
139bool amd_iommu_np_cache __read_mostly; 139bool amd_iommu_np_cache __read_mostly;
140bool amd_iommu_iotlb_sup __read_mostly = true;
140 141
141/* 142/*
142 * The ACPI table parsing functions set this variable on an error 143 * The ACPI table parsing functions set this variable on an error
@@ -180,6 +181,12 @@ static u32 dev_table_size; /* size of the device table */
180static u32 alias_table_size; /* size of the alias table */ 181static u32 alias_table_size; /* size of the alias table */
181static u32 rlookup_table_size; /* size of the rlookup table */ 182
182 183
184/*
185 * This function flushes all internal caches of
186 * the IOMMU used by this driver.
187 */
188extern void iommu_flush_all_caches(struct amd_iommu *iommu);
189
183static inline void update_last_devid(u16 devid) 190static inline void update_last_devid(u16 devid)
184{ 191{
185 if (devid > amd_iommu_last_bdf) 192 if (devid > amd_iommu_last_bdf)
@@ -194,6 +201,39 @@ static inline unsigned long tbl_size(int entry_size)
194 return 1UL << shift; 201 return 1UL << shift;
195} 202}
196 203
204/* Access to l1 and l2 indexed register spaces */
205
206static u32 iommu_read_l1(struct amd_iommu *iommu, u16 l1, u8 address)
207{
208 u32 val;
209
210 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
211 pci_read_config_dword(iommu->dev, 0xfc, &val);
212 return val;
213}
214
215static void iommu_write_l1(struct amd_iommu *iommu, u16 l1, u8 address, u32 val)
216{
217 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16 | 1 << 31));
218 pci_write_config_dword(iommu->dev, 0xfc, val);
219 pci_write_config_dword(iommu->dev, 0xf8, (address | l1 << 16));
220}
221
222static u32 iommu_read_l2(struct amd_iommu *iommu, u8 address)
223{
224 u32 val;
225
226 pci_write_config_dword(iommu->dev, 0xf0, address);
227 pci_read_config_dword(iommu->dev, 0xf4, &val);
228 return val;
229}
230
231static void iommu_write_l2(struct amd_iommu *iommu, u8 address, u32 val)
232{
233 pci_write_config_dword(iommu->dev, 0xf0, (address | 1 << 8));
234 pci_write_config_dword(iommu->dev, 0xf4, val);
235}
236
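iommu_read_l1()/iommu_write_l1() and their l2 counterparts use the classic index/data register pair: one config-space dword selects the internal register (with a write-enable bit for stores), and the neighbouring dword carries the value. The toy user-space model below mimics only that access pattern; the register file, the 0xff index mask and the position of the enable bit are invented for illustration, not taken from RD890 documentation:

#include <stdio.h>
#include <stdint.h>

static uint32_t index_reg;		/* stand-in for the index dword */
static uint32_t backing[256];		/* stand-in for the indexed registers */

#define WRITE_EN (1u << 31)		/* illustrative write-enable bit */

static void cfg_write_index(uint32_t v)	{ index_reg = v; }

static uint32_t cfg_read_data(void)
{
	return backing[index_reg & 0xff];
}

static void cfg_write_data(uint32_t v)
{
	if (index_reg & WRITE_EN)	/* stores only take effect when enabled */
		backing[index_reg & 0xff] = v;
}

int main(void)
{
	cfg_write_index(0x12 | WRITE_EN);	/* select register 0x12, enable writes */
	cfg_write_data(0xdeadbeef);
	cfg_write_index(0x12);			/* reselect with writes locked again */
	printf("0x%x\n", (unsigned int)cfg_read_data());
	return 0;
}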
197/**************************************************************************** 237/****************************************************************************
198 * 238 *
199 * AMD IOMMU MMIO register space handling functions 239 * AMD IOMMU MMIO register space handling functions
@@ -260,9 +300,23 @@ static void iommu_feature_disable(struct amd_iommu *iommu, u8 bit)
260/* Function to enable the hardware */ 300/* Function to enable the hardware */
261static void iommu_enable(struct amd_iommu *iommu) 301static void iommu_enable(struct amd_iommu *iommu)
262{ 302{
263 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx\n", 303 static const char * const feat_str[] = {
304 "PreF", "PPR", "X2APIC", "NX", "GT", "[5]",
305 "IA", "GA", "HE", "PC", NULL
306 };
307 int i;
308
309 printk(KERN_INFO "AMD-Vi: Enabling IOMMU at %s cap 0x%hx",
264 dev_name(&iommu->dev->dev), iommu->cap_ptr); 310 dev_name(&iommu->dev->dev), iommu->cap_ptr);
265 311
312 if (iommu->cap & (1 << IOMMU_CAP_EFR)) {
313 printk(KERN_CONT " extended features: ");
314 for (i = 0; feat_str[i]; ++i)
315 if (iommu_feature(iommu, (1ULL << i)))
316 printk(KERN_CONT " %s", feat_str[i]);
317 }
318 printk(KERN_CONT "\n");
319
266 iommu_feature_enable(iommu, CONTROL_IOMMU_EN); 320 iommu_feature_enable(iommu, CONTROL_IOMMU_EN);
267} 321}
268 322
@@ -618,7 +672,8 @@ static void __init set_device_exclusion_range(u16 devid, struct ivmd_header *m)
618static void __init init_iommu_from_pci(struct amd_iommu *iommu) 672static void __init init_iommu_from_pci(struct amd_iommu *iommu)
619{ 673{
620 int cap_ptr = iommu->cap_ptr; 674 int cap_ptr = iommu->cap_ptr;
621 u32 range, misc; 675 u32 range, misc, low, high;
676 int i, j;
622 677
623 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET, 678 pci_read_config_dword(iommu->dev, cap_ptr + MMIO_CAP_HDR_OFFSET,
624 &iommu->cap); 679 &iommu->cap);
@@ -633,12 +688,38 @@ static void __init init_iommu_from_pci(struct amd_iommu *iommu)
633 MMIO_GET_LD(range)); 688 MMIO_GET_LD(range));
634 iommu->evt_msi_num = MMIO_MSI_NUM(misc); 689 iommu->evt_msi_num = MMIO_MSI_NUM(misc);
635 690
636 if (is_rd890_iommu(iommu->dev)) { 691 if (!(iommu->cap & (1 << IOMMU_CAP_IOTLB)))
637 pci_read_config_dword(iommu->dev, 0xf0, &iommu->cache_cfg[0]); 692 amd_iommu_iotlb_sup = false;
638 pci_read_config_dword(iommu->dev, 0xf4, &iommu->cache_cfg[1]); 693
639 pci_read_config_dword(iommu->dev, 0xf8, &iommu->cache_cfg[2]); 694 /* read extended feature bits */
640 pci_read_config_dword(iommu->dev, 0xfc, &iommu->cache_cfg[3]); 695 low = readl(iommu->mmio_base + MMIO_EXT_FEATURES);
641 } 696 high = readl(iommu->mmio_base + MMIO_EXT_FEATURES + 4);
697
698 iommu->features = ((u64)high << 32) | low;
699
700 if (!is_rd890_iommu(iommu->dev))
701 return;
702
703 /*
704 * Some rd890 systems may not be fully reconfigured by the BIOS, so
705 * it's necessary for us to store this information so it can be
706 * reprogrammed on resume
707 */
708
709 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 4,
710 &iommu->stored_addr_lo);
711 pci_read_config_dword(iommu->dev, iommu->cap_ptr + 8,
712 &iommu->stored_addr_hi);
713
714 /* Low bit locks writes to configuration space */
715 iommu->stored_addr_lo &= ~1;
716
717 for (i = 0; i < 6; i++)
718 for (j = 0; j < 0x12; j++)
719 iommu->stored_l1[i][j] = iommu_read_l1(iommu, i, j);
720
721 for (i = 0; i < 0x83; i++)
722 iommu->stored_l2[i] = iommu_read_l2(iommu, i);
642} 723}
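The extended feature register is 64 bits wide but is read as two 32-bit MMIO accesses and recombined; helpers such as iommu_feature(iommu, FEATURE_IA) then test single bits of that word. A minimal sketch of the combine-and-test step follows; the register halves are made up, and the bit position is only inferred from the index of "IA" in the feat_str[] table printed by iommu_enable() above:

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

#define FEATURE_IA_BIT (1ULL << 6)	/* assumed from feat_str[] ordering */

static bool has_feature(uint64_t features, uint64_t mask)
{
	return (features & mask) != 0;
}

int main(void)
{
	uint32_t low = 0x00000051, high = 0x00000000;	/* made-up register halves */
	uint64_t features = ((uint64_t)high << 32) | low;

	printf("INVALIDATE_ALL supported: %s\n",
	       has_feature(features, FEATURE_IA_BIT) ? "yes" : "no");
	return 0;
}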
643 724
644/* 725/*
@@ -650,8 +731,8 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
650{ 731{
651 u8 *p = (u8 *)h; 732 u8 *p = (u8 *)h;
652 u8 *end = p, flags = 0; 733 u8 *end = p, flags = 0;
653 u16 dev_i, devid = 0, devid_start = 0, devid_to = 0; 734 u16 devid = 0, devid_start = 0, devid_to = 0;
654 u32 ext_flags = 0; 735 u32 dev_i, ext_flags = 0;
655 bool alias = false; 736 bool alias = false;
656 struct ivhd_entry *e; 737 struct ivhd_entry *e;
657 738
@@ -806,7 +887,7 @@ static void __init init_iommu_from_acpi(struct amd_iommu *iommu,
806/* Initializes the device->iommu mapping for the driver */ 887/* Initializes the device->iommu mapping for the driver */
807static int __init init_iommu_devices(struct amd_iommu *iommu) 888static int __init init_iommu_devices(struct amd_iommu *iommu)
808{ 889{
809 u16 i; 890 u32 i;
810 891
811 for (i = iommu->first_device; i <= iommu->last_device; ++i) 892 for (i = iommu->first_device; i <= iommu->last_device; ++i)
812 set_iommu_for_device(iommu, i); 893 set_iommu_for_device(iommu, i);
@@ -953,10 +1034,11 @@ static int iommu_setup_msi(struct amd_iommu *iommu)
953 if (pci_enable_msi(iommu->dev)) 1034 if (pci_enable_msi(iommu->dev))
954 return 1; 1035 return 1;
955 1036
956 r = request_irq(iommu->dev->irq, amd_iommu_int_handler, 1037 r = request_threaded_irq(iommu->dev->irq,
957 IRQF_SAMPLE_RANDOM, 1038 amd_iommu_int_handler,
958 "AMD-Vi", 1039 amd_iommu_int_thread,
959 NULL); 1040 0, "AMD-Vi",
1041 iommu->dev);
960 1042
961 if (r) { 1043 if (r) {
962 pci_disable_msi(iommu->dev); 1044 pci_disable_msi(iommu->dev);
@@ -1095,7 +1177,7 @@ static int __init init_memory_definitions(struct acpi_table_header *table)
1095 */ 1177 */
1096static void init_device_table(void) 1178static void init_device_table(void)
1097{ 1179{
1098 u16 devid; 1180 u32 devid;
1099 1181
1100 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) { 1182 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) {
1101 set_dev_entry_bit(devid, DEV_ENTRY_VALID); 1183 set_dev_entry_bit(devid, DEV_ENTRY_VALID);
@@ -1127,14 +1209,53 @@ static void iommu_init_flags(struct amd_iommu *iommu)
1127 iommu_feature_enable(iommu, CONTROL_COHERENT_EN); 1209 iommu_feature_enable(iommu, CONTROL_COHERENT_EN);
1128} 1210}
1129 1211
1130static void iommu_apply_quirks(struct amd_iommu *iommu) 1212static void iommu_apply_resume_quirks(struct amd_iommu *iommu)
1131{ 1213{
1132 if (is_rd890_iommu(iommu->dev)) { 1214 int i, j;
1133 pci_write_config_dword(iommu->dev, 0xf0, iommu->cache_cfg[0]); 1215 u32 ioc_feature_control;
1134 pci_write_config_dword(iommu->dev, 0xf4, iommu->cache_cfg[1]); 1216 struct pci_dev *pdev = NULL;
1135 pci_write_config_dword(iommu->dev, 0xf8, iommu->cache_cfg[2]); 1217
1136 pci_write_config_dword(iommu->dev, 0xfc, iommu->cache_cfg[3]); 1218 /* RD890 BIOSes may not have completely reconfigured the iommu */
1137 } 1219 if (!is_rd890_iommu(iommu->dev))
1220 return;
1221
1222 /*
1223 * First, we need to ensure that the iommu is enabled. This is
1224 * controlled by a register in the northbridge
1225 */
1226 pdev = pci_get_bus_and_slot(iommu->dev->bus->number, PCI_DEVFN(0, 0));
1227
1228 if (!pdev)
1229 return;
1230
1231 /* Select Northbridge indirect register 0x75 and enable writing */
1232 pci_write_config_dword(pdev, 0x60, 0x75 | (1 << 7));
1233 pci_read_config_dword(pdev, 0x64, &ioc_feature_control);
1234
1235 /* Enable the iommu */
1236 if (!(ioc_feature_control & 0x1))
1237 pci_write_config_dword(pdev, 0x64, ioc_feature_control | 1);
1238
1239 pci_dev_put(pdev);
1240
1241 /* Restore the iommu BAR */
1242 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1243 iommu->stored_addr_lo);
1244 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 8,
1245 iommu->stored_addr_hi);
1246
1247 /* Restore the l1 indirect regs for each of the 6 l1s */
1248 for (i = 0; i < 6; i++)
1249 for (j = 0; j < 0x12; j++)
1250 iommu_write_l1(iommu, i, j, iommu->stored_l1[i][j]);
1251
1252 /* Restore the l2 indirect regs */
1253 for (i = 0; i < 0x83; i++)
1254 iommu_write_l2(iommu, i, iommu->stored_l2[i]);
1255
1256 /* Lock PCI setup registers */
1257 pci_write_config_dword(iommu->dev, iommu->cap_ptr + 4,
1258 iommu->stored_addr_lo | 1);
1138} 1259}
1139 1260
1140/* 1261/*
@@ -1147,7 +1268,6 @@ static void enable_iommus(void)
1147 1268
1148 for_each_iommu(iommu) { 1269 for_each_iommu(iommu) {
1149 iommu_disable(iommu); 1270 iommu_disable(iommu);
1150 iommu_apply_quirks(iommu);
1151 iommu_init_flags(iommu); 1271 iommu_init_flags(iommu);
1152 iommu_set_device_table(iommu); 1272 iommu_set_device_table(iommu);
1153 iommu_enable_command_buffer(iommu); 1273 iommu_enable_command_buffer(iommu);
@@ -1155,6 +1275,7 @@ static void enable_iommus(void)
1155 iommu_set_exclusion_range(iommu); 1275 iommu_set_exclusion_range(iommu);
1156 iommu_init_msi(iommu); 1276 iommu_init_msi(iommu);
1157 iommu_enable(iommu); 1277 iommu_enable(iommu);
1278 iommu_flush_all_caches(iommu);
1158 } 1279 }
1159} 1280}
1160 1281
@@ -1171,8 +1292,13 @@ static void disable_iommus(void)
1171 * disable suspend until real resume implemented 1292 * disable suspend until real resume implemented
1172 */ 1293 */
1173 1294
1174static int amd_iommu_resume(struct sys_device *dev) 1295static void amd_iommu_resume(void)
1175{ 1296{
1297 struct amd_iommu *iommu;
1298
1299 for_each_iommu(iommu)
1300 iommu_apply_resume_quirks(iommu);
1301
1176 /* re-load the hardware */ 1302 /* re-load the hardware */
1177 enable_iommus(); 1303 enable_iommus();
1178 1304
@@ -1180,13 +1306,11 @@ static int amd_iommu_resume(struct sys_device *dev)
1180 * we have to flush after the IOMMUs are enabled because a 1306 * we have to flush after the IOMMUs are enabled because a
1181 * disabled IOMMU will never execute the commands we send 1307 * disabled IOMMU will never execute the commands we send
1182 */ 1308 */
1183 amd_iommu_flush_all_devices(); 1309 for_each_iommu(iommu)
1184 amd_iommu_flush_all_domains(); 1310 iommu_flush_all_caches(iommu);
1185
1186 return 0;
1187} 1311}
1188 1312
1189static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state) 1313static int amd_iommu_suspend(void)
1190{ 1314{
1191 /* disable IOMMUs to go out of the way for BIOS */ 1315 /* disable IOMMUs to go out of the way for BIOS */
1192 disable_iommus(); 1316 disable_iommus();
@@ -1194,17 +1318,11 @@ static int amd_iommu_suspend(struct sys_device *dev, pm_message_t state)
1194 return 0; 1318 return 0;
1195} 1319}
1196 1320
1197static struct sysdev_class amd_iommu_sysdev_class = { 1321static struct syscore_ops amd_iommu_syscore_ops = {
1198 .name = "amd_iommu",
1199 .suspend = amd_iommu_suspend, 1322 .suspend = amd_iommu_suspend,
1200 .resume = amd_iommu_resume, 1323 .resume = amd_iommu_resume,
1201}; 1324};
1202 1325
1203static struct sys_device device_amd_iommu = {
1204 .id = 0,
1205 .cls = &amd_iommu_sysdev_class,
1206};
1207
1208/* 1326/*
1209 * This is the core init function for AMD IOMMU hardware in the system. 1327 * This is the core init function for AMD IOMMU hardware in the system.
1210 * This function is called from the generic x86 DMA layer initialization 1328 * This function is called from the generic x86 DMA layer initialization
@@ -1321,14 +1439,6 @@ static int __init amd_iommu_init(void)
1321 goto free; 1439 goto free;
1322 } 1440 }
1323 1441
1324 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1325 if (ret)
1326 goto free;
1327
1328 ret = sysdev_register(&device_amd_iommu);
1329 if (ret)
1330 goto free;
1331
1332 ret = amd_iommu_init_devices(); 1442 ret = amd_iommu_init_devices();
1333 if (ret) 1443 if (ret)
1334 goto free; 1444 goto free;
@@ -1347,6 +1457,8 @@ static int __init amd_iommu_init(void)
1347 1457
1348 amd_iommu_init_notifier(); 1458 amd_iommu_init_notifier();
1349 1459
1460 register_syscore_ops(&amd_iommu_syscore_ops);
1461
1350 if (iommu_pass_through) 1462 if (iommu_pass_through)
1351 goto out; 1463 goto out;
1352 1464
@@ -1405,13 +1517,13 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1405 return 0; 1517 return 0;
1406} 1518}
1407 1519
1408void __init amd_iommu_detect(void) 1520int __init amd_iommu_detect(void)
1409{ 1521{
1410 if (no_iommu || (iommu_detected && !gart_iommu_aperture)) 1522 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1411 return; 1523 return -ENODEV;
1412 1524
1413 if (amd_iommu_disabled) 1525 if (amd_iommu_disabled)
1414 return; 1526 return -ENODEV;
1415 1527
1416 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1528 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1417 iommu_detected = 1; 1529 iommu_detected = 1;
@@ -1420,7 +1532,9 @@ void __init amd_iommu_detect(void)
1420 1532
1421 /* Make sure ACS will be enabled */ 1533 /* Make sure ACS will be enabled */
1422 pci_request_acs(); 1534 pci_request_acs();
1535 return 1;
1423 } 1536 }
1537 return -ENODEV;
1424} 1538}
1425 1539
1426/**************************************************************************** 1540/****************************************************************************
@@ -1451,3 +1565,8 @@ static int __init parse_amd_iommu_options(char *str)
1451 1565
1452__setup("amd_iommu_dump", parse_amd_iommu_dump); 1566__setup("amd_iommu_dump", parse_amd_iommu_dump);
1453__setup("amd_iommu=", parse_amd_iommu_options); 1567__setup("amd_iommu=", parse_amd_iommu_options);
1568
1569IOMMU_INIT_FINISH(amd_iommu_detect,
1570 gart_iommu_hole_init,
1571 0,
1572 0);
diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c
new file mode 100644
index 000000000000..4c39baa8facc
--- /dev/null
+++ b/arch/x86/kernel/amd_nb.c
@@ -0,0 +1,255 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivatives.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/amd_nb.h>
12
13static u32 *flush_words;
14
15const struct pci_device_id amd_nb_misc_ids[] = {
16 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
17 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
18 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3) },
19 {}
20};
21EXPORT_SYMBOL(amd_nb_misc_ids);
22
23static struct pci_device_id amd_nb_link_ids[] = {
24 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4) },
25 {}
26};
27
28const struct amd_nb_bus_dev_range amd_nb_bus_dev_ranges[] __initconst = {
29 { 0x00, 0x18, 0x20 },
30 { 0xff, 0x00, 0x20 },
31 { 0xfe, 0x00, 0x20 },
32 { }
33};
34
35struct amd_northbridge_info amd_northbridges;
36EXPORT_SYMBOL(amd_northbridges);
37
38static struct pci_dev *next_northbridge(struct pci_dev *dev,
39 const struct pci_device_id *ids)
40{
41 do {
42 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
43 if (!dev)
44 break;
45 } while (!pci_match_id(ids, dev));
46 return dev;
47}
48
49int amd_cache_northbridges(void)
50{
51 u16 i = 0;
52 struct amd_northbridge *nb;
53 struct pci_dev *misc, *link;
54
55 if (amd_nb_num())
56 return 0;
57
58 misc = NULL;
59 while ((misc = next_northbridge(misc, amd_nb_misc_ids)) != NULL)
60 i++;
61
62 if (i == 0)
63 return 0;
64
65 nb = kzalloc(i * sizeof(struct amd_northbridge), GFP_KERNEL);
66 if (!nb)
67 return -ENOMEM;
68
69 amd_northbridges.nb = nb;
70 amd_northbridges.num = i;
71
72 link = misc = NULL;
73 for (i = 0; i != amd_nb_num(); i++) {
74 node_to_amd_nb(i)->misc = misc =
75 next_northbridge(misc, amd_nb_misc_ids);
76 node_to_amd_nb(i)->link = link =
77 next_northbridge(link, amd_nb_link_ids);
78 }
79
80 /* some CPU families (e.g. family 0x11) do not support GART */
81 if (boot_cpu_data.x86 == 0xf || boot_cpu_data.x86 == 0x10 ||
82 boot_cpu_data.x86 == 0x15)
83 amd_northbridges.flags |= AMD_NB_GART;
84
85 /*
86 * Some CPU families support L3 Cache Index Disable. There are some
87 * limitations because of E382 and E388 on family 0x10.
88 */
89 if (boot_cpu_data.x86 == 0x10 &&
90 boot_cpu_data.x86_model >= 0x8 &&
91 (boot_cpu_data.x86_model > 0x9 ||
92 boot_cpu_data.x86_mask >= 0x1))
93 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
94
95 if (boot_cpu_data.x86 == 0x15)
96 amd_northbridges.flags |= AMD_NB_L3_INDEX_DISABLE;
97
98 /* L3 cache partitioning is supported on family 0x15 */
99 if (boot_cpu_data.x86 == 0x15)
100 amd_northbridges.flags |= AMD_NB_L3_PARTITIONING;
101
102 return 0;
103}
104EXPORT_SYMBOL_GPL(amd_cache_northbridges);
105
106/*
107 * Ignores subdevice/subvendor but as far as I can figure out
108 * they're useless anyways
109 */
110bool __init early_is_amd_nb(u32 device)
111{
112 const struct pci_device_id *id;
113 u32 vendor = device & 0xffff;
114
115 device >>= 16;
116 for (id = amd_nb_misc_ids; id->vendor; id++)
117 if (vendor == id->vendor && device == id->device)
118 return true;
119 return false;
120}
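early_is_amd_nb() receives the vendor and device IDs packed into one dword, with the vendor in the low 16 bits and the device in the upper 16, as in the first dword of PCI config space. A two-line illustration of that split (the device ID is made up; 0x1022 is AMD's PCI vendor ID):

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t packed = (0x1600u << 16) | 0x1022u;	/* device (example) | vendor (AMD) */
	uint16_t vendor = packed & 0xffff;
	uint16_t device = packed >> 16;

	printf("vendor 0x%04x device 0x%04x\n", vendor, device);
	return 0;
}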
121
122int amd_get_subcaches(int cpu)
123{
124 struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link;
125 unsigned int mask;
126 int cuid = 0;
127
128 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
129 return 0;
130
131 pci_read_config_dword(link, 0x1d4, &mask);
132
133#ifdef CONFIG_SMP
134 cuid = cpu_data(cpu).compute_unit_id;
135#endif
136 return (mask >> (4 * cuid)) & 0xf;
137}
138
139int amd_set_subcaches(int cpu, int mask)
140{
141 static unsigned int reset, ban;
142 struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu));
143 unsigned int reg;
144 int cuid = 0;
145
146 if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf)
147 return -EINVAL;
148
149 /* if necessary, collect reset state of L3 partitioning and BAN mode */
150 if (reset == 0) {
151 pci_read_config_dword(nb->link, 0x1d4, &reset);
152 pci_read_config_dword(nb->misc, 0x1b8, &ban);
153 ban &= 0x180000;
154 }
155
156 /* deactivate BAN mode if any subcaches are to be disabled */
157 if (mask != 0xf) {
158 pci_read_config_dword(nb->misc, 0x1b8, &reg);
159 pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000);
160 }
161
162#ifdef CONFIG_SMP
163 cuid = cpu_data(cpu).compute_unit_id;
164#endif
165 mask <<= 4 * cuid;
166 mask |= (0xf ^ (1 << cuid)) << 26;
167
168 pci_write_config_dword(nb->link, 0x1d4, mask);
169
170 /* reset BAN mode if L3 partitioning returned to reset state */
171 pci_read_config_dword(nb->link, 0x1d4, &reg);
172 if (reg == reset) {
173 pci_read_config_dword(nb->misc, 0x1b8, &reg);
174 reg &= ~0x180000;
175 pci_write_config_dword(nb->misc, 0x1b8, reg | ban);
176 }
177
178 return 0;
179}
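amd_get_subcaches() reads register 0x1d4 from the link function and extracts a 4-bit subcache-enable field per compute unit; amd_set_subcaches() writes the same field back. The field selection itself is plain shifting and masking; a short stand-alone version (the register contents are made up):

#include <stdio.h>
#include <stdint.h>

/* the 4-bit subcache mask of one compute unit */
static unsigned int subcache_mask(uint32_t reg, int cuid)
{
	return (reg >> (4 * cuid)) & 0xf;
}

int main(void)
{
	uint32_t reg = 0x0000f3c5;	/* made-up contents of register 0x1d4 */
	int cu;

	for (cu = 0; cu < 4; cu++)
		printf("compute unit %d: subcaches 0x%x\n", cu, subcache_mask(reg, cu));
	return 0;
}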
180
181static int amd_cache_gart(void)
182{
183 u16 i;
184
185 if (!amd_nb_has_feature(AMD_NB_GART))
186 return 0;
187
188 flush_words = kmalloc(amd_nb_num() * sizeof(u32), GFP_KERNEL);
189 if (!flush_words) {
190 amd_northbridges.flags &= ~AMD_NB_GART;
191 return -ENOMEM;
192 }
193
194 for (i = 0; i != amd_nb_num(); i++)
195 pci_read_config_dword(node_to_amd_nb(i)->misc, 0x9c,
196 &flush_words[i]);
197
198 return 0;
199}
200
201void amd_flush_garts(void)
202{
203 int flushed, i;
204 unsigned long flags;
205 static DEFINE_SPINLOCK(gart_lock);
206
207 if (!amd_nb_has_feature(AMD_NB_GART))
208 return;
209
210 /* Avoid races between AGP and IOMMU. In theory it's not needed,
211 but I'm not sure whether the hardware might lose flush requests
212 when another one is pending. This whole thing is so expensive
213 anyway that a little extra serialization doesn't matter. -AK */
214 spin_lock_irqsave(&gart_lock, flags);
215 flushed = 0;
216 for (i = 0; i < amd_nb_num(); i++) {
217 pci_write_config_dword(node_to_amd_nb(i)->misc, 0x9c,
218 flush_words[i] | 1);
219 flushed++;
220 }
221 for (i = 0; i < amd_nb_num(); i++) {
222 u32 w;
223 /* Make sure the hardware actually executed the flush */
224 for (;;) {
225 pci_read_config_dword(node_to_amd_nb(i)->misc,
226 0x9c, &w);
227 if (!(w & 1))
228 break;
229 cpu_relax();
230 }
231 }
232 spin_unlock_irqrestore(&gart_lock, flags);
233 if (!flushed)
234 printk("nothing to flush?\n");
235}
236EXPORT_SYMBOL_GPL(amd_flush_garts);
237
238static __init int init_amd_nbs(void)
239{
240 int err = 0;
241
242 err = amd_cache_northbridges();
243
244 if (err < 0)
245 printk(KERN_NOTICE "AMD NB: Cannot enumerate AMD northbridges.\n");
246
247 if (amd_cache_gart() < 0)
248 printk(KERN_NOTICE "AMD NB: Cannot initialize GART flush words, "
249 "GART support disabled.\n");
250
251 return err;
252}
253
254/* This has to go after the PCI subsystem */
255fs_initcall(init_amd_nbs);
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
index 8dd77800ff5d..289e92862fd9 100644
--- a/arch/x86/kernel/apb_timer.c
+++ b/arch/x86/kernel/apb_timer.c
@@ -177,7 +177,6 @@ static struct clocksource clocksource_apbt = {
177 .rating = APBT_CLOCKSOURCE_RATING, 177 .rating = APBT_CLOCKSOURCE_RATING,
178 .read = apbt_read_clocksource, 178 .read = apbt_read_clocksource,
179 .mask = APBT_MASK, 179 .mask = APBT_MASK,
180 .shift = APBT_SHIFT,
181 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 180 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
182 .resume = apbt_restart_clocksource, 181 .resume = apbt_restart_clocksource,
183}; 182};
@@ -231,34 +230,6 @@ static void apbt_restart_clocksource(struct clocksource *cs)
231 apbt_start_counter(phy_cs_timer_id); 230 apbt_start_counter(phy_cs_timer_id);
232} 231}
233 232
234/* Setup IRQ routing via IOAPIC */
235#ifdef CONFIG_SMP
236static void apbt_setup_irq(struct apbt_dev *adev)
237{
238 struct irq_chip *chip;
239 struct irq_desc *desc;
240
241 /* timer0 irq has been setup early */
242 if (adev->irq == 0)
243 return;
244 desc = irq_to_desc(adev->irq);
245 chip = get_irq_chip(adev->irq);
246 disable_irq(adev->irq);
247 desc->status |= IRQ_MOVE_PCNTXT;
248 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
249 /* APB timer irqs are set up as mp_irqs, timer is edge triggerred */
250 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
251 enable_irq(adev->irq);
252 if (system_state == SYSTEM_BOOTING)
253 if (request_irq(adev->irq, apbt_interrupt_handler,
254 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
255 adev->name, adev)) {
256 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
257 adev->num);
258 }
259}
260#endif
261
262static void apbt_enable_int(int n) 233static void apbt_enable_int(int n)
263{ 234{
264 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL); 235 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
@@ -312,7 +283,7 @@ static int __init apbt_clockevent_register(void)
312 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device)); 283 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
313 284
314 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) { 285 if (mrst_timer_options == MRST_TIMER_LAPIC_APBT) {
315 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100; 286 adev->evt.rating = APBT_CLOCKEVENT_RATING - 100;
316 global_clock_event = &adev->evt; 287 global_clock_event = &adev->evt;
317 printk(KERN_DEBUG "%s clockevent registered as global\n", 288 printk(KERN_DEBUG "%s clockevent registered as global\n",
318 global_clock_event->name); 289 global_clock_event->name);
@@ -334,6 +305,30 @@ static int __init apbt_clockevent_register(void)
334} 305}
335 306
336#ifdef CONFIG_SMP 307#ifdef CONFIG_SMP
308
309static void apbt_setup_irq(struct apbt_dev *adev)
310{
311 /* timer0 irq has been setup early */
312 if (adev->irq == 0)
313 return;
314
315 irq_modify_status(adev->irq, 0, IRQ_MOVE_PCNTXT);
316 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
317 /* APB timer irqs are set up as mp_irqs, timer is edge type */
318 __irq_set_handler(adev->irq, handle_edge_irq, 0, "edge");
319
320 if (system_state == SYSTEM_BOOTING) {
321 if (request_irq(adev->irq, apbt_interrupt_handler,
322 IRQF_TIMER | IRQF_DISABLED |
323 IRQF_NOBALANCING,
324 adev->name, adev)) {
325 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
326 adev->num);
327 }
328 } else
329 enable_irq(adev->irq);
330}
331
337/* Should be called with per cpu */ 332/* Should be called with per cpu */
338void apbt_setup_secondary_clock(void) 333void apbt_setup_secondary_clock(void)
339{ 334{
@@ -343,7 +338,7 @@ void apbt_setup_secondary_clock(void)
343 338
344 /* Don't register boot CPU clockevent */ 339 /* Don't register boot CPU clockevent */
345 cpu = smp_processor_id(); 340 cpu = smp_processor_id();
346 if (cpu == boot_cpu_id) 341 if (!cpu)
347 return; 342 return;
348 /* 343 /*
349 * We need to calculate the scaled math multiplication factor for 344 * We need to calculate the scaled math multiplication factor for
@@ -389,16 +384,17 @@ static int apbt_cpuhp_notify(struct notifier_block *n,
389 384
390 switch (action & 0xf) { 385 switch (action & 0xf) {
391 case CPU_DEAD: 386 case CPU_DEAD:
387 disable_irq(adev->irq);
392 apbt_disable_int(cpu); 388 apbt_disable_int(cpu);
393 if (system_state == SYSTEM_RUNNING) 389 if (system_state == SYSTEM_RUNNING) {
394 pr_debug("skipping APBT CPU %lu offline\n", cpu); 390 pr_debug("skipping APBT CPU %lu offline\n", cpu);
395 else if (adev) { 391 } else if (adev) {
396 pr_debug("APBT clockevent for cpu %lu offline\n", cpu); 392 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
397 free_irq(adev->irq, adev); 393 free_irq(adev->irq, adev);
398 } 394 }
399 break; 395 break;
400 default: 396 default:
401 pr_debug(KERN_INFO "APBT notified %lu, no action\n", action); 397 pr_debug("APBT notified %lu, no action\n", action);
402 } 398 }
403 return NOTIFY_OK; 399 return NOTIFY_OK;
404} 400}
@@ -511,64 +507,12 @@ static int apbt_next_event(unsigned long delta,
511 return 0; 507 return 0;
512} 508}
513 509
514/*
515 * APB timer clock is not in sync with pclk on Langwell, which translates to
516 * unreliable read value caused by sampling error. the error does not add up
517 * overtime and only happens when sampling a 0 as a 1 by mistake. so the time
518 * would go backwards. the following code is trying to prevent time traveling
519 * backwards. little bit paranoid.
520 */
521static cycle_t apbt_read_clocksource(struct clocksource *cs) 510static cycle_t apbt_read_clocksource(struct clocksource *cs)
522{ 511{
523 unsigned long t0, t1, t2; 512 unsigned long current_count;
524 static unsigned long last_read; 513
525 514 current_count = apbt_readl(phy_cs_timer_id, APBTMR_N_CURRENT_VALUE);
526bad_count: 515 return (cycle_t)~current_count;
527 t1 = apbt_readl(phy_cs_timer_id,
528 APBTMR_N_CURRENT_VALUE);
529 t2 = apbt_readl(phy_cs_timer_id,
530 APBTMR_N_CURRENT_VALUE);
531 if (unlikely(t1 < t2)) {
532 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
533 t1, t2, t2 - t1);
534 goto bad_count;
535 }
536 /*
537 * check against cached last read, makes sure time does not go back.
538 * it could be a normal rollover but we will do tripple check anyway
539 */
540 if (unlikely(t2 > last_read)) {
541 /* check if we have a normal rollover */
542 unsigned long raw_intr_status =
543 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
544 /*
545 * cs timer interrupt is masked but raw intr bit is set if
546 * rollover occurs. then we read EOI reg to clear it.
547 */
548 if (raw_intr_status & (1 << phy_cs_timer_id)) {
549 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
550 goto out;
551 }
552 pr_debug("APB CS going back %lx:%lx:%lx ",
553 t2, last_read, t2 - last_read);
554bad_count_x3:
555 pr_debug(KERN_INFO "tripple check enforced\n");
556 t0 = apbt_readl(phy_cs_timer_id,
557 APBTMR_N_CURRENT_VALUE);
558 udelay(1);
559 t1 = apbt_readl(phy_cs_timer_id,
560 APBTMR_N_CURRENT_VALUE);
561 udelay(1);
562 t2 = apbt_readl(phy_cs_timer_id,
563 APBTMR_N_CURRENT_VALUE);
564 if ((t2 > t1) || (t1 > t0)) {
565 printk(KERN_ERR "Error: APB CS tripple check failed\n");
566 goto bad_count_x3;
567 }
568 }
569out:
570 last_read = t2;
571 return (cycle_t)~t2;
572} 516}
573 517
574static int apbt_clocksource_register(void) 518static int apbt_clocksource_register(void)
@@ -598,14 +542,7 @@ static int apbt_clocksource_register(void)
598 if (t1 == apbt_read_clocksource(&clocksource_apbt)) 542 if (t1 == apbt_read_clocksource(&clocksource_apbt))
599 panic("APBT counter not counting. APBT disabled\n"); 543 panic("APBT counter not counting. APBT disabled\n");
600 544
601 /* 545 clocksource_register_khz(&clocksource_apbt, (u32)apbt_freq*1000);
602 * initialize and register APBT clocksource
603 * convert that to ns/clock cycle
604 * mult = (ns/c) * 2^APBT_SHIFT
605 */
606 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
607 (unsigned long) apbt_freq, APBT_SHIFT);
608 clocksource_register(&clocksource_apbt);
609 546
610 return 0; 547 return 0;
611} 548}
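The apb_timer.c hunks above simplify the clocksource path: the paranoid triple read of the down-counting timer is dropped, and the hand-computed mult/shift pair (div_sc() plus clocksource_register()) is replaced by clocksource_register_khz(), which derives the same fixed-point factor internally. As a rough illustration of that cycles-to-nanoseconds arithmetic, here is a small standalone C sketch; the shift, frequency and count values are invented and this is not kernel code.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t shift = 22;                  /* stands in for APBT_SHIFT */
        uint64_t freq_hz = 14318000;          /* hypothetical APB timer clock */

        /* mult is chosen so that ns = (cycles * mult) >> shift */
        uint32_t mult = (uint32_t)((1000000000ULL << shift) / freq_hz);

        /*
         * The APB timer counts down; apbt_read_clocksource() turns that
         * into an increasing cycle value by complementing the count.
         */
        uint32_t current_count = 0xfff00000u;
        uint64_t cycles = (uint32_t)~current_count;

        uint64_t ns = (cycles * mult) >> shift;

        printf("mult=%u: %llu cycles -> %llu ns\n", (unsigned)mult,
               (unsigned long long)cycles, (unsigned long long)ns);
        return 0;
}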
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index a2e0caf26e17..3d2661ca6542 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -13,7 +13,7 @@
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/types.h> 14#include <linux/types.h>
15#include <linux/init.h> 15#include <linux/init.h>
16#include <linux/bootmem.h> 16#include <linux/memblock.h>
17#include <linux/mmzone.h> 17#include <linux/mmzone.h>
18#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
19#include <linux/pci.h> 19#include <linux/pci.h>
@@ -27,9 +27,25 @@
27#include <asm/gart.h> 27#include <asm/gart.h>
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/amd_nb.h>
31#include <asm/x86_init.h> 31#include <asm/x86_init.h>
32 32
33/*
34 * Using 512M as the goal, in case kexec will load kernel_big
35 * that will do the in-place decompress, and could overlap with
36 * the gart aperture that is used.
37 * Sequence:
38 * kernel_small
39 * ==> kexec (with kdump trigger path or gart still enabled)
40 * ==> kernel_small (gart area becomes e820_reserved)
41 * ==> kexec (with kdump trigger path or gart still enabled)
42 * ==> kernel_big (uncompressed size will be bigger than 64M or 128M)
43 * So don't use the area below 512M for the gart iommu; leave the
44 * space for kernel code, to be safe.
45 */
46#define GART_MIN_ADDR (512ULL << 20)
47#define GART_MAX_ADDR (1ULL << 32)
48
33int gart_iommu_aperture; 49int gart_iommu_aperture;
34int gart_iommu_aperture_disabled __initdata; 50int gart_iommu_aperture_disabled __initdata;
35int gart_iommu_aperture_allowed __initdata; 51int gart_iommu_aperture_allowed __initdata;
@@ -39,18 +55,6 @@ int fallback_aper_force __initdata;
39 55
40int fix_aperture __initdata = 1; 56int fix_aperture __initdata = 1;
41 57
42struct bus_dev_range {
43 int bus;
44 int dev_base;
45 int dev_limit;
46};
47
48static struct bus_dev_range bus_dev_ranges[] __initdata = {
49 { 0x00, 0x18, 0x20},
50 { 0xff, 0x00, 0x20},
51 { 0xfe, 0x00, 0x20}
52};
53
54static struct resource gart_resource = { 58static struct resource gart_resource = {
55 .name = "GART", 59 .name = "GART",
56 .flags = IORESOURCE_MEM, 60 .flags = IORESOURCE_MEM,
@@ -69,7 +73,7 @@ static void __init insert_aperture_resource(u32 aper_base, u32 aper_size)
69static u32 __init allocate_aperture(void) 73static u32 __init allocate_aperture(void)
70{ 74{
71 u32 aper_size; 75 u32 aper_size;
72 void *p; 76 unsigned long addr;
73 77
74 /* aper_size should <= 1G */ 78 /* aper_size should <= 1G */
75 if (fallback_aper_order > 5) 79 if (fallback_aper_order > 5)
@@ -82,40 +86,27 @@ static u32 __init allocate_aperture(void)
82 * memory. Unfortunately we cannot move it up because that would 86 * memory. Unfortunately we cannot move it up because that would
83 * make the IOMMU useless. 87 * make the IOMMU useless.
84 */ 88 */
85 /* 89 addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR,
86 * using 512M as goal, in case kexec will load kernel_big 90 aper_size, aper_size);
87 * that will do the on position decompress, and could overlap with 91 if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) {
88 * that positon with gart that is used. 92 printk(KERN_ERR
89 * sequende: 93 "Cannot allocate aperture memory hole (%lx,%uK)\n",
90 * kernel_small 94 addr, aper_size>>10);
91 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) 95 return 0;
92 * ==> kernel_small(gart area become e820_reserved) 96 }
93 * ==> kexec (with kdump trigger path or previous doesn't shutdown gart) 97 memblock_x86_reserve_range(addr, addr + aper_size, "aperture64");
94 * ==> kerne_big (uncompressed size will be big than 64M or 128M)
95 * so don't use 512M below as gart iommu, leave the space for kernel
96 * code for safe
97 */
98 p = __alloc_bootmem_nopanic(aper_size, aper_size, 512ULL<<20);
99 /* 98 /*
100 * Kmemleak should not scan this block as it may not be mapped via the 99 * Kmemleak should not scan this block as it may not be mapped via the
101 * kernel direct mapping. 100 * kernel direct mapping.
102 */ 101 */
103 kmemleak_ignore(p); 102 kmemleak_ignore(phys_to_virt(addr));
104 if (!p || __pa(p)+aper_size > 0xffffffff) {
105 printk(KERN_ERR
106 "Cannot allocate aperture memory hole (%p,%uK)\n",
107 p, aper_size>>10);
108 if (p)
109 free_bootmem(__pa(p), aper_size);
110 return 0;
111 }
112 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n", 103 printk(KERN_INFO "Mapping aperture over %d KB of RAM @ %lx\n",
113 aper_size >> 10, __pa(p)); 104 aper_size >> 10, addr);
114 insert_aperture_resource((u32)__pa(p), aper_size); 105 insert_aperture_resource((u32)addr, aper_size);
115 register_nosave_region((u32)__pa(p) >> PAGE_SHIFT, 106 register_nosave_region(addr >> PAGE_SHIFT,
116 (u32)__pa(p+aper_size) >> PAGE_SHIFT); 107 (addr+aper_size) >> PAGE_SHIFT);
117 108
118 return (u32)__pa(p); 109 return (u32)addr;
119} 110}
120 111
121 112
@@ -206,7 +197,7 @@ static u32 __init read_agp(int bus, int slot, int func, int cap, u32 *order)
206 * Do an PCI bus scan by hand because we're running before the PCI 197 * Do an PCI bus scan by hand because we're running before the PCI
207 * subsystem. 198 * subsystem.
208 * 199 *
209 * All K8 AGP bridges are AGPv3 compliant, so we can do this scan 200 * All AMD AGP bridges are AGPv3 compliant, so we can do this scan
210 * generically. It's probably overkill to always scan all slots because 201 * generically. It's probably overkill to always scan all slots because
211 * the AGP bridges should be always an own bus on the HT hierarchy, 202 * the AGP bridges should be always an own bus on the HT hierarchy,
212 * but do it here for future safety. 203 * but do it here for future safety.
@@ -294,20 +285,20 @@ void __init early_gart_iommu_check(void)
294 search_agp_bridge(&agp_aper_order, &valid_agp); 285 search_agp_bridge(&agp_aper_order, &valid_agp);
295 286
296 fix = 0; 287 fix = 0;
297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 288 for (i = 0; amd_nb_bus_dev_ranges[i].dev_limit; i++) {
298 int bus; 289 int bus;
299 int dev_base, dev_limit; 290 int dev_base, dev_limit;
300 291
301 bus = bus_dev_ranges[i].bus; 292 bus = amd_nb_bus_dev_ranges[i].bus;
302 dev_base = bus_dev_ranges[i].dev_base; 293 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
303 dev_limit = bus_dev_ranges[i].dev_limit; 294 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
304 295
305 for (slot = dev_base; slot < dev_limit; slot++) { 296 for (slot = dev_base; slot < dev_limit; slot++) {
306 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 297 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
307 continue; 298 continue;
308 299
309 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 300 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
310 aper_enabled = ctl & AMD64_GARTEN; 301 aper_enabled = ctl & GARTEN;
311 aper_order = (ctl >> 1) & 7; 302 aper_order = (ctl >> 1) & 7;
312 aper_size = (32 * 1024 * 1024) << aper_order; 303 aper_size = (32 * 1024 * 1024) << aper_order;
313 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 304 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
@@ -349,20 +340,20 @@ void __init early_gart_iommu_check(void)
349 return; 340 return;
350 341
351 /* disable them all at first */ 342 /* disable them all at first */
352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 343 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
353 int bus; 344 int bus;
354 int dev_base, dev_limit; 345 int dev_base, dev_limit;
355 346
356 bus = bus_dev_ranges[i].bus; 347 bus = amd_nb_bus_dev_ranges[i].bus;
357 dev_base = bus_dev_ranges[i].dev_base; 348 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
358 dev_limit = bus_dev_ranges[i].dev_limit; 349 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
359 350
360 for (slot = dev_base; slot < dev_limit; slot++) { 351 for (slot = dev_base; slot < dev_limit; slot++) {
361 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 352 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
362 continue; 353 continue;
363 354
364 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL); 355 ctl = read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL);
365 ctl &= ~AMD64_GARTEN; 356 ctl &= ~GARTEN;
366 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl); 357 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
367 } 358 }
368 } 359 }
@@ -371,7 +362,7 @@ void __init early_gart_iommu_check(void)
371 362
372static int __initdata printed_gart_size_msg; 363static int __initdata printed_gart_size_msg;
373 364
374void __init gart_iommu_hole_init(void) 365int __init gart_iommu_hole_init(void)
375{ 366{
376 u32 agp_aper_base = 0, agp_aper_order = 0; 367 u32 agp_aper_base = 0, agp_aper_order = 0;
377 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0; 368 u32 aper_size, aper_alloc = 0, aper_order = 0, last_aper_order = 0;
@@ -381,7 +372,7 @@ void __init gart_iommu_hole_init(void)
381 372
382 if (gart_iommu_aperture_disabled || !fix_aperture || 373 if (gart_iommu_aperture_disabled || !fix_aperture ||
383 !early_pci_allowed()) 374 !early_pci_allowed())
384 return; 375 return -ENODEV;
385 376
386 printk(KERN_INFO "Checking aperture...\n"); 377 printk(KERN_INFO "Checking aperture...\n");
387 378
@@ -390,17 +381,17 @@ void __init gart_iommu_hole_init(void)
390 381
391 fix = 0; 382 fix = 0;
392 node = 0; 383 node = 0;
393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 384 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
394 int bus; 385 int bus;
395 int dev_base, dev_limit; 386 int dev_base, dev_limit;
396 u32 ctl; 387 u32 ctl;
397 388
398 bus = bus_dev_ranges[i].bus; 389 bus = amd_nb_bus_dev_ranges[i].bus;
399 dev_base = bus_dev_ranges[i].dev_base; 390 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
400 dev_limit = bus_dev_ranges[i].dev_limit; 391 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
401 392
402 for (slot = dev_base; slot < dev_limit; slot++) { 393 for (slot = dev_base; slot < dev_limit; slot++) {
403 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 394 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
404 continue; 395 continue;
405 396
406 iommu_detected = 1; 397 iommu_detected = 1;
@@ -463,8 +454,9 @@ out:
463 unsigned long n = (32 * 1024 * 1024) << last_aper_order; 454 unsigned long n = (32 * 1024 * 1024) << last_aper_order;
464 455
465 insert_aperture_resource((u32)last_aper_base, n); 456 insert_aperture_resource((u32)last_aper_base, n);
457 return 1;
466 } 458 }
467 return; 459 return 0;
468 } 460 }
469 461
470 if (!fallback_aper_force) { 462 if (!fallback_aper_force) {
@@ -500,28 +492,32 @@ out:
500 panic("Not enough memory for aperture"); 492 panic("Not enough memory for aperture");
501 } 493 }
502 } else { 494 } else {
503 return; 495 return 0;
504 } 496 }
505 497
506 /* Fix up the north bridges */ 498 /* Fix up the north bridges */
507 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 499 for (i = 0; i < amd_nb_bus_dev_ranges[i].dev_limit; i++) {
508 int bus; 500 int bus, dev_base, dev_limit;
509 int dev_base, dev_limit; 501
510 502 /*
511 bus = bus_dev_ranges[i].bus; 503 * Don't enable translation yet but enable GART IO and CPU
512 dev_base = bus_dev_ranges[i].dev_base; 504 * accesses and set DISTLBWALKPRB since GART table memory is UC.
513 dev_limit = bus_dev_ranges[i].dev_limit; 505 */
506 u32 ctl = aper_order << 1;
507
508 bus = amd_nb_bus_dev_ranges[i].bus;
509 dev_base = amd_nb_bus_dev_ranges[i].dev_base;
510 dev_limit = amd_nb_bus_dev_ranges[i].dev_limit;
514 for (slot = dev_base; slot < dev_limit; slot++) { 511 for (slot = dev_base; slot < dev_limit; slot++) {
515 if (!early_is_k8_nb(read_pci_config(bus, slot, 3, 0x00))) 512 if (!early_is_amd_nb(read_pci_config(bus, slot, 3, 0x00)))
516 continue; 513 continue;
517 514
518 /* Don't enable translation yet. That is done later. 515 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
519 Assume this BIOS didn't initialise the GART so
520 just overwrite all previous bits */
521 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, aper_order << 1);
522 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25); 516 write_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE, aper_alloc >> 25);
523 } 517 }
524 } 518 }
525 519
526 set_up_gart_resume(aper_order, aper_alloc); 520 set_up_gart_resume(aper_order, aper_alloc);
521
522 return 1;
527} 523}
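The aperture_64.c changes above move the hole allocation from bootmem to memblock and reuse the shared amd_nb_bus_dev_ranges[] table, but the register encoding being programmed is unchanged: the aperture size is 32MB << aper_order (kept in bits 3:1 of the control register), and the base register takes the physical address shifted right by 25, matching the & 0x7fff mask used when reading it back. A standalone sketch of that encoding, with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint32_t aper_order = 3;                        /* 32MB << 3 = 256MB */
        uint32_t aper_size  = (32u * 1024 * 1024) << aper_order;

        uint64_t aper_alloc = 512ULL << 20;             /* hole placed at 512MB */
        uint32_t ctl  = aper_order << 1;                /* GARTEN deliberately clear */
        uint32_t base = (uint32_t)(aper_alloc >> 25);   /* value for the base register */

        /* decoding, as early_gart_iommu_check() does when reading back */
        uint32_t order_rd = (ctl >> 1) & 7;
        uint64_t base_rd  = ((uint64_t)base & 0x7fff) << 25;

        printf("size=%uMB ctl=0x%x base_reg=0x%x -> order=%u base=0x%llx\n",
               aper_size >> 20, ctl, base, order_rd,
               (unsigned long long)base_rd);
        return 0;
}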
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index 910f20b457c4..767fd04f2843 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,23 +2,25 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o ipi.o
6ifneq ($(CONFIG_HARDLOCKUP_DETECTOR),y) 6obj-y += hw_nmi.o
7obj-$(CONFIG_X86_LOCAL_APIC) += nmi.o
8endif
9obj-$(CONFIG_HARDLOCKUP_DETECTOR) += hw_nmi.o
10 7
11obj-$(CONFIG_X86_IO_APIC) += io_apic.o 8obj-$(CONFIG_X86_IO_APIC) += io_apic.o
12obj-$(CONFIG_SMP) += ipi.o 9obj-$(CONFIG_SMP) += ipi.o
13 10
14ifeq ($(CONFIG_X86_64),y) 11ifeq ($(CONFIG_X86_64),y)
15obj-y += apic_flat_64.o 12# APIC probe will depend on the listing order here
16obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
17obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
18obj-$(CONFIG_X86_UV) += x2apic_uv_x.o 13obj-$(CONFIG_X86_UV) += x2apic_uv_x.o
14obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o
15obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o
16obj-y += apic_flat_64.o
19endif 17endif
20 18
21obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o 19# APIC probe will depend on the listing order here
22obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 20obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24obj-$(CONFIG_X86_SUMMIT) += summit_32.o 21obj-$(CONFIG_X86_SUMMIT) += summit_32.o
22obj-$(CONFIG_X86_BIGSMP) += bigsmp_32.o
23obj-$(CONFIG_X86_ES7000) += es7000_32.o
24
25# For 32bit, probe_32 needs to be listed last
26obj-$(CONFIG_X86_LOCAL_APIC) += probe_$(BITS).o
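The new "# APIC probe will depend on the listing order here" comments encode the idea that the generic probe code walks the compiled-in APIC drivers in link order and takes the first one whose probe accepts the machine, which is why probe_$(BITS).o has to come last. The standalone sketch below only models that first-match walk; the driver names and probe results are invented, and the real kernel assembles its table differently.

#include <stdio.h>

struct fake_apic {
        const char *name;
        int (*probe)(void);
};

static int probe_bigsmp(void)  { return 0; }    /* pretend: not a bigsmp box */
static int probe_default(void) { return 1; }    /* generic fallback always matches */

static const struct fake_apic drivers[] = {     /* mirrors link order */
        { "bigsmp",  probe_bigsmp  },
        { "default", probe_default },           /* must stay last */
};

int main(void)
{
        for (unsigned int i = 0; i < sizeof(drivers) / sizeof(drivers[0]); i++) {
                if (drivers[i].probe()) {
                        printf("using APIC driver '%s'\n", drivers[i].name);
                        break;
                }
        }
        return 0;
}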
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index e3b534cda49a..b9338b8cf420 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -24,14 +24,13 @@
24#include <linux/ftrace.h> 24#include <linux/ftrace.h>
25#include <linux/ioport.h> 25#include <linux/ioport.h>
26#include <linux/module.h> 26#include <linux/module.h>
27#include <linux/sysdev.h> 27#include <linux/syscore_ops.h>
28#include <linux/delay.h> 28#include <linux/delay.h>
29#include <linux/timex.h> 29#include <linux/timex.h>
30#include <linux/dmar.h> 30#include <linux/dmar.h>
31#include <linux/init.h> 31#include <linux/init.h>
32#include <linux/cpu.h> 32#include <linux/cpu.h>
33#include <linux/dmi.h> 33#include <linux/dmi.h>
34#include <linux/nmi.h>
35#include <linux/smp.h> 34#include <linux/smp.h>
36#include <linux/mm.h> 35#include <linux/mm.h>
37 36
@@ -44,14 +43,15 @@
44#include <asm/i8259.h> 43#include <asm/i8259.h>
45#include <asm/proto.h> 44#include <asm/proto.h>
46#include <asm/apic.h> 45#include <asm/apic.h>
46#include <asm/io_apic.h>
47#include <asm/desc.h> 47#include <asm/desc.h>
48#include <asm/hpet.h> 48#include <asm/hpet.h>
49#include <asm/idle.h> 49#include <asm/idle.h>
50#include <asm/mtrr.h> 50#include <asm/mtrr.h>
51#include <asm/smp.h> 51#include <asm/smp.h>
52#include <asm/mce.h> 52#include <asm/mce.h>
53#include <asm/kvm_para.h>
54#include <asm/tsc.h> 53#include <asm/tsc.h>
54#include <asm/hypervisor.h>
55 55
56unsigned int num_processors; 56unsigned int num_processors;
57 57
@@ -79,12 +79,21 @@ EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); 79EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid);
80 80
81#ifdef CONFIG_X86_32 81#ifdef CONFIG_X86_32
82
83/*
84 * On x86_32, the mapping between cpu and logical apicid may vary
85 * depending on apic in use. The following early percpu variable is
86 * used for the mapping. This is where the behaviors of x86_64 and 32
87 * actually diverge. Let's keep it ugly for now.
88 */
89DEFINE_EARLY_PER_CPU(int, x86_cpu_to_logical_apicid, BAD_APICID);
90
82/* 91/*
83 * Knob to control our willingness to enable the local APIC. 92 * Knob to control our willingness to enable the local APIC.
84 * 93 *
85 * +1=force-enable 94 * +1=force-enable
86 */ 95 */
87static int force_enable_local_apic; 96static int force_enable_local_apic __initdata;
88/* 97/*
89 * APIC command line parameters 98 * APIC command line parameters
90 */ 99 */
@@ -154,7 +163,7 @@ early_param("nox2apic", setup_nox2apic);
154unsigned long mp_lapic_addr; 163unsigned long mp_lapic_addr;
155int disable_apic; 164int disable_apic;
156/* Disable local APIC timer from the kernel commandline or via dmi quirk */ 165/* Disable local APIC timer from the kernel commandline or via dmi quirk */
157static int disable_apic_timer __cpuinitdata; 166static int disable_apic_timer __initdata;
158/* Local APIC timer works in C2 */ 167/* Local APIC timer works in C2 */
159int local_apic_timer_c2_ok; 168int local_apic_timer_c2_ok;
160EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok); 169EXPORT_SYMBOL_GPL(local_apic_timer_c2_ok);
@@ -178,29 +187,8 @@ static struct resource lapic_resource = {
178 187
179static unsigned int calibration_result; 188static unsigned int calibration_result;
180 189
181static int lapic_next_event(unsigned long delta,
182 struct clock_event_device *evt);
183static void lapic_timer_setup(enum clock_event_mode mode,
184 struct clock_event_device *evt);
185static void lapic_timer_broadcast(const struct cpumask *mask);
186static void apic_pm_activate(void); 190static void apic_pm_activate(void);
187 191
188/*
189 * The local apic timer can be used for any function which is CPU local.
190 */
191static struct clock_event_device lapic_clockevent = {
192 .name = "lapic",
193 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
194 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
195 .shift = 32,
196 .set_mode = lapic_timer_setup,
197 .set_next_event = lapic_next_event,
198 .broadcast = lapic_timer_broadcast,
199 .rating = 100,
200 .irq = -1,
201};
202static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
203
204static unsigned long apic_phys; 192static unsigned long apic_phys;
205 193
206/* 194/*
@@ -239,7 +227,7 @@ static int modern_apic(void)
239 * right after this call apic become NOOP driven 227 * right after this call apic become NOOP driven
240 * so apic->write/read doesn't do anything 228 * so apic->write/read doesn't do anything
241 */ 229 */
242void apic_disable(void) 230static void __init apic_disable(void)
243{ 231{
244 pr_info("APIC: switched to apic NOOP\n"); 232 pr_info("APIC: switched to apic NOOP\n");
245 apic = &apic_noop; 233 apic = &apic_noop;
@@ -283,23 +271,6 @@ u64 native_apic_icr_read(void)
283 return icr1 | ((u64)icr2 << 32); 271 return icr1 | ((u64)icr2 << 32);
284} 272}
285 273
286/**
287 * enable_NMI_through_LVT0 - enable NMI through local vector table 0
288 */
289void __cpuinit enable_NMI_through_LVT0(void)
290{
291 unsigned int v;
292
293 /* unmask and set to NMI */
294 v = APIC_DM_NMI;
295
296 /* Level triggered for 82489DX (32bit mode) */
297 if (!lapic_is_integrated())
298 v |= APIC_LVT_LEVEL_TRIGGER;
299
300 apic_write(APIC_LVT0, v);
301}
302
303#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
304/** 275/**
305 * get_physical_broadcast - Get number of physical broadcast IDs 276 * get_physical_broadcast - Get number of physical broadcast IDs
@@ -370,38 +341,89 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen)
370} 341}
371 342
372/* 343/*
373 * Setup extended LVT, AMD specific (K8, family 10h) 344 * Setup extended LVT, AMD specific
374 * 345 *
375 * Vector mappings are hard coded. On K8 only offset 0 (APIC500) and 346 * Software should use the LVT offsets the BIOS provides. The offsets
376 * MCE interrupts are supported. Thus MCE offset must be set to 0. 347 * are determined by the subsystems using them, such as those for MCE
348 * threshold or IBS. On K8 only offset 0 (APIC500) and MCE interrupts
349 * are supported. Beginning with family 10h at least 4 offsets are
350 * available.
377 * 351 *
378 * If mask=1, the LVT entry does not generate interrupts while mask=0 352 * Since the offsets must be consistent for all cores, we keep track
379 * enables the vector. See also the BKDGs. 353 * of the LVT offsets in software and reserve the offset for the same
354 * vector also to be used on other cores. An offset is freed by
355 * setting the entry to APIC_EILVT_MASKED.
356 *
357 * If the BIOS is right, there should be no conflicts. Otherwise a
358 * "[Firmware Bug]: ..." error message is generated. However, if
359 * software does not properly determine the offsets, it is not
360 * necessarily a BIOS bug.
380 */ 361 */
381 362
382#define APIC_EILVT_LVTOFF_MCE 0 363static atomic_t eilvt_offsets[APIC_EILVT_NR_MAX];
383#define APIC_EILVT_LVTOFF_IBS 1
384 364
385static void setup_APIC_eilvt(u8 lvt_off, u8 vector, u8 msg_type, u8 mask) 365static inline int eilvt_entry_is_changeable(unsigned int old, unsigned int new)
386{ 366{
387 unsigned long reg = (lvt_off << 4) + APIC_EILVTn(0); 367 return (old & APIC_EILVT_MASKED)
388 unsigned int v = (mask << 16) | (msg_type << 8) | vector; 368 || (new == APIC_EILVT_MASKED)
389 369 || ((new & ~APIC_EILVT_MASKED) == old);
390 apic_write(reg, v);
391} 370}
392 371
393u8 setup_APIC_eilvt_mce(u8 vector, u8 msg_type, u8 mask) 372static unsigned int reserve_eilvt_offset(int offset, unsigned int new)
394{ 373{
395 setup_APIC_eilvt(APIC_EILVT_LVTOFF_MCE, vector, msg_type, mask); 374 unsigned int rsvd; /* 0: uninitialized */
396 return APIC_EILVT_LVTOFF_MCE; 375
376 if (offset >= APIC_EILVT_NR_MAX)
377 return ~0;
378
379 rsvd = atomic_read(&eilvt_offsets[offset]) & ~APIC_EILVT_MASKED;
380 do {
381 if (rsvd &&
382 !eilvt_entry_is_changeable(rsvd, new))
383 /* may not change if vectors are different */
384 return rsvd;
385 rsvd = atomic_cmpxchg(&eilvt_offsets[offset], rsvd, new);
386 } while (rsvd != new);
387
388 return new;
397} 389}
398 390
399u8 setup_APIC_eilvt_ibs(u8 vector, u8 msg_type, u8 mask) 391/*
392 * If mask=1, the LVT entry does not generate interrupts while mask=0
393 * enables the vector. See also the BKDGs. Must be called with
394 * preemption disabled.
395 */
396
397int setup_APIC_eilvt(u8 offset, u8 vector, u8 msg_type, u8 mask)
400{ 398{
401 setup_APIC_eilvt(APIC_EILVT_LVTOFF_IBS, vector, msg_type, mask); 399 unsigned long reg = APIC_EILVTn(offset);
402 return APIC_EILVT_LVTOFF_IBS; 400 unsigned int new, old, reserved;
401
402 new = (mask << 16) | (msg_type << 8) | vector;
403 old = apic_read(reg);
404 reserved = reserve_eilvt_offset(offset, new);
405
406 if (reserved != new) {
407		pr_err(FW_BUG "cpu %d, trying to use APIC%lX (LVT offset %d) for "
408 "vector 0x%x, but the register is already in use for "
409 "vector 0x%x on another cpu\n",
410 smp_processor_id(), reg, offset, new, reserved);
411 return -EINVAL;
412 }
413
414 if (!eilvt_entry_is_changeable(old, new)) {
415		pr_err(FW_BUG "cpu %d, trying to use APIC%lX (LVT offset %d) for "
416 "vector 0x%x, but the register is already in use for "
417 "vector 0x%x on this cpu\n",
418 smp_processor_id(), reg, offset, new, old);
419 return -EBUSY;
420 }
421
422 apic_write(reg, new);
423
424 return 0;
403} 425}
404EXPORT_SYMBOL_GPL(setup_APIC_eilvt_ibs); 426EXPORT_SYMBOL_GPL(setup_APIC_eilvt);
405 427
406/* 428/*
407 * Program the next event, relative to now 429 * Program the next event, relative to now
@@ -459,6 +481,23 @@ static void lapic_timer_broadcast(const struct cpumask *mask)
459#endif 481#endif
460} 482}
461 483
484
485/*
486 * The local apic timer can be used for any function which is CPU local.
487 */
488static struct clock_event_device lapic_clockevent = {
489 .name = "lapic",
490 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT
491 | CLOCK_EVT_FEAT_C3STOP | CLOCK_EVT_FEAT_DUMMY,
492 .shift = 32,
493 .set_mode = lapic_timer_setup,
494 .set_next_event = lapic_next_event,
495 .broadcast = lapic_timer_broadcast,
496 .rating = 100,
497 .irq = -1,
498};
499static DEFINE_PER_CPU(struct clock_event_device, lapic_events);
500
462/* 501/*
463 * Setup the local APIC timer for this CPU. Copy the initialized values 502 * Setup the local APIC timer for this CPU. Copy the initialized values
464 * of the boot CPU and register the clock event in the framework. 503 * of the boot CPU and register the clock event in the framework.
@@ -467,7 +506,7 @@ static void __cpuinit setup_APIC_timer(void)
467{ 506{
468 struct clock_event_device *levt = &__get_cpu_var(lapic_events); 507 struct clock_event_device *levt = &__get_cpu_var(lapic_events);
469 508
470 if (cpu_has(&current_cpu_data, X86_FEATURE_ARAT)) { 509 if (this_cpu_has(X86_FEATURE_ARAT)) {
471 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP; 510 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_C3STOP;
472 /* Make LAPIC timer preferable over percpu HPET */ 511 /* Make LAPIC timer preferable over percpu HPET */
473 lapic_clockevent.rating = 150; 512 lapic_clockevent.rating = 150;
@@ -635,7 +674,7 @@ static int __init calibrate_APIC_clock(void)
635 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS, 674 lapic_clockevent.mult = div_sc(delta, TICK_NSEC * LAPIC_CAL_LOOPS,
636 lapic_clockevent.shift); 675 lapic_clockevent.shift);
637 lapic_clockevent.max_delta_ns = 676 lapic_clockevent.max_delta_ns =
638 clockevent_delta2ns(0x7FFFFF, &lapic_clockevent); 677 clockevent_delta2ns(0x7FFFFFFF, &lapic_clockevent);
639 lapic_clockevent.min_delta_ns = 678 lapic_clockevent.min_delta_ns =
640 clockevent_delta2ns(0xF, &lapic_clockevent); 679 clockevent_delta2ns(0xF, &lapic_clockevent);
641 680
@@ -750,11 +789,7 @@ void __init setup_boot_APIC_clock(void)
750 * PIT/HPET going. Otherwise register lapic as a dummy 789 * PIT/HPET going. Otherwise register lapic as a dummy
751 * device. 790 * device.
752 */ 791 */
753 if (nmi_watchdog != NMI_IO_APIC) 792 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
754 lapic_clockevent.features &= ~CLOCK_EVT_FEAT_DUMMY;
755 else
756 pr_warning("APIC timer registered as dummy,"
757 " due to nmi_watchdog=%d!\n", nmi_watchdog);
758 793
759 /* Setup the lapic or request the broadcast */ 794 /* Setup the lapic or request the broadcast */
760 setup_APIC_timer(); 795 setup_APIC_timer();
@@ -1146,12 +1181,15 @@ static void __cpuinit lapic_setup_esr(void)
1146 oldvalue, value); 1181 oldvalue, value);
1147} 1182}
1148 1183
1149
1150/** 1184/**
1151 * setup_local_APIC - setup the local APIC 1185 * setup_local_APIC - setup the local APIC
1186 *
1187 * Used to set up the local APIC while initializing the BSP or bringing up APs.
1188 * Always called with preemption disabled.
1152 */ 1189 */
1153void __cpuinit setup_local_APIC(void) 1190void __cpuinit setup_local_APIC(void)
1154{ 1191{
1192 int cpu = smp_processor_id();
1155 unsigned int value, queued; 1193 unsigned int value, queued;
1156 int i, j, acked = 0; 1194 int i, j, acked = 0;
1157 unsigned long long tsc = 0, ntsc; 1195 unsigned long long tsc = 0, ntsc;
@@ -1161,7 +1199,7 @@ void __cpuinit setup_local_APIC(void)
1161 rdtscll(tsc); 1199 rdtscll(tsc);
1162 1200
1163 if (disable_apic) { 1201 if (disable_apic) {
1164 arch_disable_smp_support(); 1202 disable_ioapic_support();
1165 return; 1203 return;
1166 } 1204 }
1167 1205
@@ -1176,8 +1214,6 @@ void __cpuinit setup_local_APIC(void)
1176#endif 1214#endif
1177 perf_events_lapic_init(); 1215 perf_events_lapic_init();
1178 1216
1179 preempt_disable();
1180
1181 /* 1217 /*
1182 * Double-check whether this APIC is really registered. 1218 * Double-check whether this APIC is really registered.
1183 * This is meaningless in clustered apic mode, so we skip it. 1219 * This is meaningless in clustered apic mode, so we skip it.
@@ -1191,6 +1227,30 @@ void __cpuinit setup_local_APIC(void)
1191 */ 1227 */
1192 apic->init_apic_ldr(); 1228 apic->init_apic_ldr();
1193 1229
1230#ifdef CONFIG_X86_32
1231 /*
1232 * APIC LDR is initialized. If logical_apicid mapping was
1233 * initialized during get_smp_config(), make sure it matches the
1234 * actual value.
1235 */
1236 i = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
1237 WARN_ON(i != BAD_APICID && i != logical_smp_processor_id());
1238 /* always use the value from LDR */
1239 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1240 logical_smp_processor_id();
1241
1242 /*
1243 * Some NUMA implementations (NUMAQ) don't initialize apicid to
1244 * node mapping during NUMA init. Now that logical apicid is
1245 * guaranteed to be known, give it another chance. This is already
1246 * a bit too late - percpu allocation has already happened without
1247 * proper NUMA affinity.
1248 */
1249 if (apic->x86_32_numa_cpu_node)
1250 set_apicid_to_node(early_per_cpu(x86_cpu_to_apicid, cpu),
1251 apic->x86_32_numa_cpu_node(cpu));
1252#endif
1253
1194 /* 1254 /*
1195 * Set Task Priority to 'accept all'. We never change this 1255 * Set Task Priority to 'accept all'. We never change this
1196 * later on. 1256 * later on.
@@ -1293,21 +1353,19 @@ void __cpuinit setup_local_APIC(void)
1293 * TODO: set up through-local-APIC from through-I/O-APIC? --macro 1353 * TODO: set up through-local-APIC from through-I/O-APIC? --macro
1294 */ 1354 */
1295 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED; 1355 value = apic_read(APIC_LVT0) & APIC_LVT_MASKED;
1296 if (!smp_processor_id() && (pic_mode || !value)) { 1356 if (!cpu && (pic_mode || !value)) {
1297 value = APIC_DM_EXTINT; 1357 value = APIC_DM_EXTINT;
1298 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", 1358 apic_printk(APIC_VERBOSE, "enabled ExtINT on CPU#%d\n", cpu);
1299 smp_processor_id());
1300 } else { 1359 } else {
1301 value = APIC_DM_EXTINT | APIC_LVT_MASKED; 1360 value = APIC_DM_EXTINT | APIC_LVT_MASKED;
1302 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", 1361 apic_printk(APIC_VERBOSE, "masked ExtINT on CPU#%d\n", cpu);
1303 smp_processor_id());
1304 } 1362 }
1305 apic_write(APIC_LVT0, value); 1363 apic_write(APIC_LVT0, value);
1306 1364
1307 /* 1365 /*
1308 * only the BP should see the LINT1 NMI signal, obviously. 1366 * only the BP should see the LINT1 NMI signal, obviously.
1309 */ 1367 */
1310 if (!smp_processor_id()) 1368 if (!cpu)
1311 value = APIC_DM_NMI; 1369 value = APIC_DM_NMI;
1312 else 1370 else
1313 value = APIC_DM_NMI | APIC_LVT_MASKED; 1371 value = APIC_DM_NMI | APIC_LVT_MASKED;
@@ -1315,11 +1373,9 @@ void __cpuinit setup_local_APIC(void)
1315 value |= APIC_LVT_LEVEL_TRIGGER; 1373 value |= APIC_LVT_LEVEL_TRIGGER;
1316 apic_write(APIC_LVT1, value); 1374 apic_write(APIC_LVT1, value);
1317 1375
1318 preempt_enable();
1319
1320#ifdef CONFIG_X86_MCE_INTEL 1376#ifdef CONFIG_X86_MCE_INTEL
1321 /* Recheck CMCI information after local APIC is up on CPU #0 */ 1377 /* Recheck CMCI information after local APIC is up on CPU #0 */
1322 if (smp_processor_id() == 0) 1378 if (!cpu)
1323 cmci_recheck(); 1379 cmci_recheck();
1324#endif 1380#endif
1325} 1381}
@@ -1338,10 +1394,22 @@ void __cpuinit end_local_APIC_setup(void)
1338 } 1394 }
1339#endif 1395#endif
1340 1396
1341 setup_apic_nmi_watchdog(NULL);
1342 apic_pm_activate(); 1397 apic_pm_activate();
1343} 1398}
1344 1399
1400void __init bsp_end_local_APIC_setup(void)
1401{
1402 end_local_APIC_setup();
1403
1404 /*
1405 * Now that local APIC setup is completed for BP, configure the fault
1406 * handling for interrupt remapping.
1407 */
1408 if (intr_remapping_enabled)
1409 enable_drhd_fault_handling();
1410
1411}
1412
1345#ifdef CONFIG_X86_X2APIC 1413#ifdef CONFIG_X86_X2APIC
1346void check_x2apic(void) 1414void check_x2apic(void)
1347{ 1415{
@@ -1394,7 +1462,6 @@ int __init enable_IR(void)
1394void __init enable_IR_x2apic(void) 1462void __init enable_IR_x2apic(void)
1395{ 1463{
1396 unsigned long flags; 1464 unsigned long flags;
1397 struct IO_APIC_route_entry **ioapic_entries = NULL;
1398 int ret, x2apic_enabled = 0; 1465 int ret, x2apic_enabled = 0;
1399 int dmar_table_init_ret; 1466 int dmar_table_init_ret;
1400 1467
@@ -1402,13 +1469,7 @@ void __init enable_IR_x2apic(void)
1402 if (dmar_table_init_ret && !x2apic_supported()) 1469 if (dmar_table_init_ret && !x2apic_supported())
1403 return; 1470 return;
1404 1471
1405 ioapic_entries = alloc_ioapic_entries(); 1472 ret = save_ioapic_entries();
1406 if (!ioapic_entries) {
1407 pr_err("Allocate ioapic_entries failed\n");
1408 goto out;
1409 }
1410
1411 ret = save_IO_APIC_setup(ioapic_entries);
1412 if (ret) { 1473 if (ret) {
1413 pr_info("Saving IO-APIC state failed: %d\n", ret); 1474 pr_info("Saving IO-APIC state failed: %d\n", ret);
1414 goto out; 1475 goto out;
@@ -1416,7 +1477,7 @@ void __init enable_IR_x2apic(void)
1416 1477
1417 local_irq_save(flags); 1478 local_irq_save(flags);
1418 legacy_pic->mask_all(); 1479 legacy_pic->mask_all();
1419 mask_IO_APIC_setup(ioapic_entries); 1480 mask_ioapic_entries();
1420 1481
1421 if (dmar_table_init_ret) 1482 if (dmar_table_init_ret)
1422 ret = 0; 1483 ret = 0;
@@ -1427,7 +1488,8 @@ void __init enable_IR_x2apic(void)
1427 /* IR is required if there is APIC ID > 255 even when running 1488 /* IR is required if there is APIC ID > 255 even when running
1428 * under KVM 1489 * under KVM
1429 */ 1490 */
1430 if (max_physical_apicid > 255 || !kvm_para_available()) 1491 if (max_physical_apicid > 255 ||
1492 !hypervisor_x2apic_available())
1431 goto nox2apic; 1493 goto nox2apic;
1432 /* 1494 /*
1433 * without IR all CPUs can be addressed by IOAPIC/MSI 1495 * without IR all CPUs can be addressed by IOAPIC/MSI
@@ -1446,14 +1508,11 @@ void __init enable_IR_x2apic(void)
1446 1508
1447nox2apic: 1509nox2apic:
1448 if (!ret) /* IR enabling failed */ 1510 if (!ret) /* IR enabling failed */
1449 restore_IO_APIC_setup(ioapic_entries); 1511 restore_ioapic_entries();
1450 legacy_pic->restore_mask(); 1512 legacy_pic->restore_mask();
1451 local_irq_restore(flags); 1513 local_irq_restore(flags);
1452 1514
1453out: 1515out:
1454 if (ioapic_entries)
1455 free_ioapic_entries(ioapic_entries);
1456
1457 if (x2apic_enabled) 1516 if (x2apic_enabled)
1458 return; 1517 return;
1459 1518
@@ -1481,13 +1540,60 @@ static int __init detect_init_APIC(void)
1481 return 0; 1540 return 0;
1482} 1541}
1483#else 1542#else
1543
1544static int __init apic_verify(void)
1545{
1546 u32 features, h, l;
1547
1548 /*
1549 * The APIC feature bit should now be enabled
1550 * in `cpuid'
1551 */
1552 features = cpuid_edx(1);
1553 if (!(features & (1 << X86_FEATURE_APIC))) {
1554 pr_warning("Could not enable APIC!\n");
1555 return -1;
1556 }
1557 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1558 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1559
1560 /* The BIOS may have set up the APIC at some other address */
1561 rdmsr(MSR_IA32_APICBASE, l, h);
1562 if (l & MSR_IA32_APICBASE_ENABLE)
1563 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1564
1565 pr_info("Found and enabled local APIC!\n");
1566 return 0;
1567}
1568
1569int __init apic_force_enable(unsigned long addr)
1570{
1571 u32 h, l;
1572
1573 if (disable_apic)
1574 return -1;
1575
1576 /*
1577 * Some BIOSes disable the local APIC in the APIC_BASE
1578 * MSR. This can only be done in software for Intel P6 or later
1579 * and AMD K7 (Model > 1) or later.
1580 */
1581 rdmsr(MSR_IA32_APICBASE, l, h);
1582 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1583 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1584 l &= ~MSR_IA32_APICBASE_BASE;
1585 l |= MSR_IA32_APICBASE_ENABLE | addr;
1586 wrmsr(MSR_IA32_APICBASE, l, h);
1587 enabled_via_apicbase = 1;
1588 }
1589 return apic_verify();
1590}
1591
1484/* 1592/*
1485 * Detect and initialize APIC 1593 * Detect and initialize APIC
1486 */ 1594 */
1487static int __init detect_init_APIC(void) 1595static int __init detect_init_APIC(void)
1488{ 1596{
1489 u32 h, l, features;
1490
1491 /* Disabled by kernel option? */ 1597 /* Disabled by kernel option? */
1492 if (disable_apic) 1598 if (disable_apic)
1493 return -1; 1599 return -1;
@@ -1517,38 +1623,12 @@ static int __init detect_init_APIC(void)
1517 "you can enable it with \"lapic\"\n"); 1623 "you can enable it with \"lapic\"\n");
1518 return -1; 1624 return -1;
1519 } 1625 }
1520 /* 1626 if (apic_force_enable(APIC_DEFAULT_PHYS_BASE))
1521 * Some BIOSes disable the local APIC in the APIC_BASE 1627 return -1;
1522 * MSR. This can only be done in software for Intel P6 or later 1628 } else {
1523 * and AMD K7 (Model > 1) or later. 1629 if (apic_verify())
1524 */ 1630 return -1;
1525 rdmsr(MSR_IA32_APICBASE, l, h);
1526 if (!(l & MSR_IA32_APICBASE_ENABLE)) {
1527 pr_info("Local APIC disabled by BIOS -- reenabling.\n");
1528 l &= ~MSR_IA32_APICBASE_BASE;
1529 l |= MSR_IA32_APICBASE_ENABLE | APIC_DEFAULT_PHYS_BASE;
1530 wrmsr(MSR_IA32_APICBASE, l, h);
1531 enabled_via_apicbase = 1;
1532 }
1533 }
1534 /*
1535 * The APIC feature bit should now be enabled
1536 * in `cpuid'
1537 */
1538 features = cpuid_edx(1);
1539 if (!(features & (1 << X86_FEATURE_APIC))) {
1540 pr_warning("Could not enable APIC!\n");
1541 return -1;
1542 } 1631 }
1543 set_cpu_cap(&boot_cpu_data, X86_FEATURE_APIC);
1544 mp_lapic_addr = APIC_DEFAULT_PHYS_BASE;
1545
1546 /* The BIOS may have set up the APIC at some other address */
1547 rdmsr(MSR_IA32_APICBASE, l, h);
1548 if (l & MSR_IA32_APICBASE_ENABLE)
1549 mp_lapic_addr = l & MSR_IA32_APICBASE_BASE;
1550
1551 pr_info("Found and enabled local APIC!\n");
1552 1632
1553 apic_pm_activate(); 1633 apic_pm_activate();
1554 1634
@@ -1560,28 +1640,6 @@ no_apic:
1560} 1640}
1561#endif 1641#endif
1562 1642
1563#ifdef CONFIG_X86_64
1564void __init early_init_lapic_mapping(void)
1565{
1566 /*
1567 * If no local APIC can be found then go out
1568 * : it means there is no mpatable and MADT
1569 */
1570 if (!smp_found_config)
1571 return;
1572
1573 set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
1574 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1575 APIC_BASE, mp_lapic_addr);
1576
1577 /*
1578 * Fetch the APIC ID of the BSP in case we have a
1579 * default configuration (or the MP table is broken).
1580 */
1581 boot_cpu_physical_apicid = read_apic_id();
1582}
1583#endif
1584
1585/** 1643/**
1586 * init_apic_mappings - initialize APIC mappings 1644 * init_apic_mappings - initialize APIC mappings
1587 */ 1645 */
@@ -1607,10 +1665,7 @@ void __init init_apic_mappings(void)
1607 * acpi_register_lapic_address() 1665 * acpi_register_lapic_address()
1608 */ 1666 */
1609 if (!acpi_lapic && !smp_found_config) 1667 if (!acpi_lapic && !smp_found_config)
1610 set_fixmap_nocache(FIX_APIC_BASE, apic_phys); 1668 register_lapic_address(apic_phys);
1611
1612 apic_printk(APIC_VERBOSE, "mapped APIC to %08lx (%08lx)\n",
1613 APIC_BASE, apic_phys);
1614 } 1669 }
1615 1670
1616 /* 1671 /*
@@ -1632,11 +1687,27 @@ void __init init_apic_mappings(void)
1632 } 1687 }
1633} 1688}
1634 1689
1690void __init register_lapic_address(unsigned long address)
1691{
1692 mp_lapic_addr = address;
1693
1694 if (!x2apic_mode) {
1695 set_fixmap_nocache(FIX_APIC_BASE, address);
1696 apic_printk(APIC_VERBOSE, "mapped APIC to %16lx (%16lx)\n",
1697 APIC_BASE, mp_lapic_addr);
1698 }
1699 if (boot_cpu_physical_apicid == -1U) {
1700 boot_cpu_physical_apicid = read_apic_id();
1701 apic_version[boot_cpu_physical_apicid] =
1702 GET_APIC_VERSION(apic_read(APIC_LVR));
1703 }
1704}
1705
1635/* 1706/*
1636 * This initializes the IO-APIC and APIC hardware if this is 1707 * This initializes the IO-APIC and APIC hardware if this is
1637 * a UP kernel. 1708 * a UP kernel.
1638 */ 1709 */
1639int apic_version[MAX_APICS]; 1710int apic_version[MAX_LOCAL_APIC];
1640 1711
1641int __init APIC_init_uniprocessor(void) 1712int __init APIC_init_uniprocessor(void)
1642{ 1713{
@@ -1665,10 +1736,7 @@ int __init APIC_init_uniprocessor(void)
1665 } 1736 }
1666#endif 1737#endif
1667 1738
1668#ifndef CONFIG_SMP
1669 enable_IR_x2apic();
1670 default_setup_apic_routing(); 1739 default_setup_apic_routing();
1671#endif
1672 1740
1673 verify_local_APIC(); 1741 verify_local_APIC();
1674 connect_bsp_APIC(); 1742 connect_bsp_APIC();
@@ -1697,24 +1765,17 @@ int __init APIC_init_uniprocessor(void)
1697 enable_IO_APIC(); 1765 enable_IO_APIC();
1698#endif 1766#endif
1699 1767
1700 end_local_APIC_setup(); 1768 bsp_end_local_APIC_setup();
1701 1769
1702#ifdef CONFIG_X86_IO_APIC 1770#ifdef CONFIG_X86_IO_APIC
1703 if (smp_found_config && !skip_ioapic_setup && nr_ioapics) 1771 if (smp_found_config && !skip_ioapic_setup && nr_ioapics)
1704 setup_IO_APIC(); 1772 setup_IO_APIC();
1705 else { 1773 else {
1706 nr_ioapics = 0; 1774 nr_ioapics = 0;
1707 localise_nmi_watchdog();
1708 } 1775 }
1709#else
1710 localise_nmi_watchdog();
1711#endif 1776#endif
1712 1777
1713 x86_init.timers.setup_percpu_clockev(); 1778 x86_init.timers.setup_percpu_clockev();
1714#ifdef CONFIG_X86_64
1715 check_nmi_watchdog();
1716#endif
1717
1718 return 0; 1779 return 0;
1719} 1780}
1720 1781
@@ -1753,30 +1814,41 @@ void smp_spurious_interrupt(struct pt_regs *regs)
1753 */ 1814 */
1754void smp_error_interrupt(struct pt_regs *regs) 1815void smp_error_interrupt(struct pt_regs *regs)
1755{ 1816{
1756 u32 v, v1; 1817 u32 v0, v1;
1818 u32 i = 0;
1819 static const char * const error_interrupt_reason[] = {
1820 "Send CS error", /* APIC Error Bit 0 */
1821 "Receive CS error", /* APIC Error Bit 1 */
1822 "Send accept error", /* APIC Error Bit 2 */
1823 "Receive accept error", /* APIC Error Bit 3 */
1824 "Redirectable IPI", /* APIC Error Bit 4 */
1825 "Send illegal vector", /* APIC Error Bit 5 */
1826 "Received illegal vector", /* APIC Error Bit 6 */
1827 "Illegal register address", /* APIC Error Bit 7 */
1828 };
1757 1829
1758 exit_idle(); 1830 exit_idle();
1759 irq_enter(); 1831 irq_enter();
1760 /* First tickle the hardware, only then report what went on. -- REW */ 1832 /* First tickle the hardware, only then report what went on. -- REW */
1761 v = apic_read(APIC_ESR); 1833 v0 = apic_read(APIC_ESR);
1762 apic_write(APIC_ESR, 0); 1834 apic_write(APIC_ESR, 0);
1763 v1 = apic_read(APIC_ESR); 1835 v1 = apic_read(APIC_ESR);
1764 ack_APIC_irq(); 1836 ack_APIC_irq();
1765 atomic_inc(&irq_err_count); 1837 atomic_inc(&irq_err_count);
1766 1838
1767 /* 1839 apic_printk(APIC_DEBUG, KERN_DEBUG "APIC error on CPU%d: %02x(%02x)",
1768 * Here is what the APIC error bits mean: 1840 smp_processor_id(), v0, v1);
1769 * 0: Send CS error 1841
1770 * 1: Receive CS error 1842 v1 = v1 & 0xff;
1771 * 2: Send accept error 1843 while (v1) {
1772 * 3: Receive accept error 1844 if (v1 & 0x1)
1773 * 4: Reserved 1845 apic_printk(APIC_DEBUG, KERN_CONT " : %s", error_interrupt_reason[i]);
1774 * 5: Send illegal vector 1846 i++;
1775 * 6: Received illegal vector 1847 v1 >>= 1;
1776 * 7: Illegal register address 1848 };
1777 */ 1849
1778 pr_debug("APIC error on CPU%d: %02x(%02x)\n", 1850 apic_printk(APIC_DEBUG, KERN_CONT "\n");
1779 smp_processor_id(), v , v1); 1851
1780 irq_exit(); 1852 irq_exit();
1781} 1853}
1782 1854
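The rewritten smp_error_interrupt() above replaces the block comment listing the ESR bit meanings with a string table and a loop that walks the latched low byte bit by bit. The same decoding in a standalone C sketch, using an invented ESR value:

#include <stdio.h>

int main(void)
{
        static const char * const reason[] = {
                "Send CS error", "Receive CS error",
                "Send accept error", "Receive accept error",
                "Redirectable IPI", "Send illegal vector",
                "Received illegal vector", "Illegal register address",
        };
        unsigned int esr = 0x44;        /* invented: bits 2 and 6 latched */
        unsigned int v = esr & 0xff;

        printf("APIC error 0x%02x:", esr);
        for (int i = 0; v; v >>= 1, i++)
                if (v & 1)
                        printf(" : %s", reason[i]);
        printf("\n");
        return 0;
}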
@@ -1873,17 +1945,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1873{ 1945{
1874 int cpu; 1946 int cpu;
1875 1947
1876 /*
1877 * Validate version
1878 */
1879 if (version == 0x0) {
1880 pr_warning("BIOS bug, APIC version is 0 for CPU#%d! "
1881 "fixing up to 0x10. (tell your hw vendor)\n",
1882 version);
1883 version = 0x10;
1884 }
1885 apic_version[apicid] = version;
1886
1887 if (num_processors >= nr_cpu_ids) { 1948 if (num_processors >= nr_cpu_ids) {
1888 int max = nr_cpu_ids; 1949 int max = nr_cpu_ids;
1889 int thiscpu = max + disabled_cpus; 1950 int thiscpu = max + disabled_cpus;
@@ -1897,22 +1958,34 @@ void __cpuinit generic_processor_info(int apicid, int version)
1897 } 1958 }
1898 1959
1899 num_processors++; 1960 num_processors++;
1900 cpu = cpumask_next_zero(-1, cpu_present_mask);
1901
1902 if (version != apic_version[boot_cpu_physical_apicid])
1903 WARN_ONCE(1,
1904 "ACPI: apic version mismatch, bootcpu: %x cpu %d: %x\n",
1905 apic_version[boot_cpu_physical_apicid], cpu, version);
1906
1907 physid_set(apicid, phys_cpu_present_map);
1908 if (apicid == boot_cpu_physical_apicid) { 1961 if (apicid == boot_cpu_physical_apicid) {
1909 /* 1962 /*
1910 * x86_bios_cpu_apicid is required to have processors listed 1963 * x86_bios_cpu_apicid is required to have processors listed
1911 * in same order as logical cpu numbers. Hence the first 1964 * in same order as logical cpu numbers. Hence the first
1912 * entry is BSP, and so on. 1965 * entry is BSP, and so on.
1966 * boot_cpu_init() already holds bit 0 in cpu_present_mask
1967 * for BSP.
1913 */ 1968 */
1914 cpu = 0; 1969 cpu = 0;
1970 } else
1971 cpu = cpumask_next_zero(-1, cpu_present_mask);
1972
1973 /*
1974 * Validate version
1975 */
1976 if (version == 0x0) {
1977 pr_warning("BIOS bug: APIC version is 0 for CPU %d/0x%x, fixing up to 0x10\n",
1978 cpu, apicid);
1979 version = 0x10;
1915 } 1980 }
1981 apic_version[apicid] = version;
1982
1983 if (version != apic_version[boot_cpu_physical_apicid]) {
1984 pr_warning("BIOS bug: APIC version mismatch, boot CPU: %x, CPU %d: version %x\n",
1985 apic_version[boot_cpu_physical_apicid], cpu, version);
1986 }
1987
1988 physid_set(apicid, phys_cpu_present_map);
1916 if (apicid > max_physical_apicid) 1989 if (apicid > max_physical_apicid)
1917 max_physical_apicid = apicid; 1990 max_physical_apicid = apicid;
1918 1991
@@ -1920,7 +1993,10 @@ void __cpuinit generic_processor_info(int apicid, int version)
1920 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1993 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1921 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1994 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
1922#endif 1995#endif
1923 1996#ifdef CONFIG_X86_32
1997 early_per_cpu(x86_cpu_to_logical_apicid, cpu) =
1998 apic->x86_32_early_logical_apicid(cpu);
1999#endif
1924 set_cpu_possible(cpu, true); 2000 set_cpu_possible(cpu, true);
1925 set_cpu_present(cpu, true); 2001 set_cpu_present(cpu, true);
1926} 2002}
@@ -1940,17 +2016,6 @@ void default_init_apic_ldr(void)
1940 apic_write(APIC_LDR, val); 2016 apic_write(APIC_LDR, val);
1941} 2017}
1942 2018
1943#ifdef CONFIG_X86_32
1944int default_apicid_to_node(int logical_apicid)
1945{
1946#ifdef CONFIG_SMP
1947 return apicid_2_node[hard_smp_processor_id()];
1948#else
1949 return 0;
1950#endif
1951}
1952#endif
1953
1954/* 2019/*
1955 * Power management 2020 * Power management
1956 */ 2021 */
@@ -1979,7 +2044,7 @@ static struct {
1979 unsigned int apic_thmr; 2044 unsigned int apic_thmr;
1980} apic_pm_state; 2045} apic_pm_state;
1981 2046
1982static int lapic_suspend(struct sys_device *dev, pm_message_t state) 2047static int lapic_suspend(void)
1983{ 2048{
1984 unsigned long flags; 2049 unsigned long flags;
1985 int maxlvt; 2050 int maxlvt;
@@ -2017,34 +2082,24 @@ static int lapic_suspend(struct sys_device *dev, pm_message_t state)
2017 return 0; 2082 return 0;
2018} 2083}
2019 2084
2020static int lapic_resume(struct sys_device *dev) 2085static void lapic_resume(void)
2021{ 2086{
2022 unsigned int l, h; 2087 unsigned int l, h;
2023 unsigned long flags; 2088 unsigned long flags;
2024 int maxlvt; 2089 int maxlvt;
2025 int ret = 0;
2026 struct IO_APIC_route_entry **ioapic_entries = NULL;
2027 2090
2028 if (!apic_pm_state.active) 2091 if (!apic_pm_state.active)
2029 return 0; 2092 return;
2030 2093
2031 local_irq_save(flags); 2094 local_irq_save(flags);
2032 if (intr_remapping_enabled) { 2095 if (intr_remapping_enabled) {
2033 ioapic_entries = alloc_ioapic_entries(); 2096 /*
2034 if (!ioapic_entries) { 2097 * IO-APIC and PIC have their own resume routines.
2035 WARN(1, "Alloc ioapic_entries in lapic resume failed."); 2098 * We just mask them here to make sure the interrupt
2036 ret = -ENOMEM; 2099 * subsystem is completely quiet while we enable x2apic
2037 goto restore; 2100 * and interrupt-remapping.
2038 } 2101 */
2039 2102 mask_ioapic_entries();
2040 ret = save_IO_APIC_setup(ioapic_entries);
2041 if (ret) {
2042 WARN(1, "Saving IO-APIC state failed: %d\n", ret);
2043 free_ioapic_entries(ioapic_entries);
2044 goto restore;
2045 }
2046
2047 mask_IO_APIC_setup(ioapic_entries);
2048 legacy_pic->mask_all(); 2103 legacy_pic->mask_all();
2049 } 2104 }
2050 2105
@@ -2087,16 +2142,10 @@ static int lapic_resume(struct sys_device *dev)
2087 apic_write(APIC_ESR, 0); 2142 apic_write(APIC_ESR, 0);
2088 apic_read(APIC_ESR); 2143 apic_read(APIC_ESR);
2089 2144
2090 if (intr_remapping_enabled) { 2145 if (intr_remapping_enabled)
2091 reenable_intr_remapping(x2apic_mode); 2146 reenable_intr_remapping(x2apic_mode);
2092 legacy_pic->restore_mask();
2093 restore_IO_APIC_setup(ioapic_entries);
2094 free_ioapic_entries(ioapic_entries);
2095 }
2096restore:
2097 local_irq_restore(flags);
2098 2147
2099 return ret; 2148 local_irq_restore(flags);
2100} 2149}
2101 2150
2102/* 2151/*
@@ -2104,17 +2153,11 @@ restore:
2104 * are needed on every CPU up until machine_halt/restart/poweroff. 2153 * are needed on every CPU up until machine_halt/restart/poweroff.
2105 */ 2154 */
2106 2155
2107static struct sysdev_class lapic_sysclass = { 2156static struct syscore_ops lapic_syscore_ops = {
2108 .name = "lapic",
2109 .resume = lapic_resume, 2157 .resume = lapic_resume,
2110 .suspend = lapic_suspend, 2158 .suspend = lapic_suspend,
2111}; 2159};
2112 2160
2113static struct sys_device device_lapic = {
2114 .id = 0,
2115 .cls = &lapic_sysclass,
2116};
2117
2118static void __cpuinit apic_pm_activate(void) 2161static void __cpuinit apic_pm_activate(void)
2119{ 2162{
2120 apic_pm_state.active = 1; 2163 apic_pm_state.active = 1;
@@ -2122,16 +2165,11 @@ static void __cpuinit apic_pm_activate(void)
2122 2165
2123static int __init init_lapic_sysfs(void) 2166static int __init init_lapic_sysfs(void)
2124{ 2167{
2125 int error;
2126
2127 if (!cpu_has_apic)
2128 return 0;
2129 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */ 2168 /* XXX: remove suspend/resume procs if !apic_pm_state.active? */
2169 if (cpu_has_apic)
2170 register_syscore_ops(&lapic_syscore_ops);
2130 2171
2131 error = sysdev_class_register(&lapic_sysclass); 2172 return 0;
2132 if (!error)
2133 error = sysdev_register(&device_lapic);
2134 return error;
2135} 2173}
2136 2174
2137/* local apic needs to resume before other devices access its registers. */ 2175/* local apic needs to resume before other devices access its registers. */
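Among the apic.c changes above, the reworked setup_APIC_eilvt()/reserve_eilvt_offset() pair keeps the extended-LVT offsets consistent across cores with a lock-free reservation: an offset is claimed for one vector via atomic cmpxchg, a masked entry counts as free, and a conflicting claim is reported instead of silently overwritten. A standalone sketch of that reservation pattern, using C11 atomics in place of the kernel's atomic_t and simplified constants:

#include <stdatomic.h>
#include <stdio.h>

#define EILVT_MASKED    (1u << 16)      /* plays the role of APIC_EILVT_MASKED */
#define NR_OFFSETS      4

static _Atomic unsigned int offsets[NR_OFFSETS];        /* 0 means uninitialized */

static int entry_is_changeable(unsigned int old, unsigned int want)
{
        return (old & EILVT_MASKED) || want == EILVT_MASKED ||
               (want & ~EILVT_MASKED) == old;
}

/* Returns the value now owning the offset; equals 'want' on success. */
static unsigned int reserve_offset(int offset, unsigned int want)
{
        unsigned int rsvd = atomic_load(&offsets[offset]) & ~EILVT_MASKED;

        do {
                if (rsvd && !entry_is_changeable(rsvd, want))
                        return rsvd;    /* already taken by a different vector */
        } while (!atomic_compare_exchange_weak(&offsets[offset], &rsvd, want));

        return want;
}

int main(void)
{
        unsigned int a = reserve_offset(1, 0xf9);       /* first claim wins */
        unsigned int b = reserve_offset(1, 0xfa);       /* conflicting vector */

        printf("first claim: 0x%x, second claim sees: 0x%x (%s)\n",
               a, b, b == 0xfa ? "no conflict" : "conflict reported");
        return 0;
}

The point of the cmpxchg loop is that CPUs racing to claim the same offset converge on a single owner without a lock; the loser sees the winner's vector and can report the conflict, which is what the FW_BUG messages in the hunk above do.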
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index 09d3b17ce0c2..f7a41e4cae47 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -16,6 +16,7 @@
16#include <linux/ctype.h> 16#include <linux/ctype.h>
17#include <linux/init.h> 17#include <linux/init.h>
18#include <linux/hardirq.h> 18#include <linux/hardirq.h>
19#include <linux/module.h>
19#include <asm/smp.h> 20#include <asm/smp.h>
20#include <asm/apic.h> 21#include <asm/apic.h>
21#include <asm/ipi.h> 22#include <asm/ipi.h>
@@ -24,6 +25,12 @@
24#include <acpi/acpi_bus.h> 25#include <acpi/acpi_bus.h>
25#endif 26#endif
26 27
28static struct apic apic_physflat;
29static struct apic apic_flat;
30
31struct apic __read_mostly *apic = &apic_flat;
32EXPORT_SYMBOL_GPL(apic);
33
27static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 34static int flat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
28{ 35{
29 return 1; 36 return 1;
@@ -164,7 +171,7 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb)
164 return initial_apic_id >> index_msb; 171 return initial_apic_id >> index_msb;
165} 172}
166 173
167struct apic apic_flat = { 174static struct apic apic_flat = {
168 .name = "flat", 175 .name = "flat",
169 .probe = NULL, 176 .probe = NULL,
170 .acpi_madt_oem_check = flat_acpi_madt_oem_check, 177 .acpi_madt_oem_check = flat_acpi_madt_oem_check,
@@ -185,8 +192,6 @@ struct apic apic_flat = {
185 .ioapic_phys_id_map = NULL, 192 .ioapic_phys_id_map = NULL,
186 .setup_apic_routing = NULL, 193 .setup_apic_routing = NULL,
187 .multi_timer_check = NULL, 194 .multi_timer_check = NULL,
188 .apicid_to_node = NULL,
189 .cpu_to_logical_apicid = NULL,
190 .cpu_present_to_apicid = default_cpu_present_to_apicid, 195 .cpu_present_to_apicid = default_cpu_present_to_apicid,
191 .apicid_to_cpu_present = NULL, 196 .apicid_to_cpu_present = NULL,
192 .setup_portio_remap = NULL, 197 .setup_portio_remap = NULL,
@@ -314,10 +319,18 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
314 return per_cpu(x86_cpu_to_apicid, cpu); 319 return per_cpu(x86_cpu_to_apicid, cpu);
315} 320}
316 321
317struct apic apic_physflat = { 322static int physflat_probe(void)
323{
324 if (apic == &apic_physflat || num_possible_cpus() > 8)
325 return 1;
326
327 return 0;
328}
329
330static struct apic apic_physflat = {
318 331
319 .name = "physical flat", 332 .name = "physical flat",
320 .probe = NULL, 333 .probe = physflat_probe,
321 .acpi_madt_oem_check = physflat_acpi_madt_oem_check, 334 .acpi_madt_oem_check = physflat_acpi_madt_oem_check,
322 .apic_id_registered = flat_apic_id_registered, 335 .apic_id_registered = flat_apic_id_registered,
323 336
@@ -337,8 +350,6 @@ struct apic apic_physflat = {
337 .ioapic_phys_id_map = NULL, 350 .ioapic_phys_id_map = NULL,
338 .setup_apic_routing = NULL, 351 .setup_apic_routing = NULL,
339 .multi_timer_check = NULL, 352 .multi_timer_check = NULL,
340 .apicid_to_node = NULL,
341 .cpu_to_logical_apicid = NULL,
342 .cpu_present_to_apicid = default_cpu_present_to_apicid, 353 .cpu_present_to_apicid = default_cpu_present_to_apicid,
343 .apicid_to_cpu_present = NULL, 354 .apicid_to_cpu_present = NULL,
344 .setup_portio_remap = NULL, 355 .setup_portio_remap = NULL,
@@ -373,3 +384,8 @@ struct apic apic_physflat = {
373 .wait_icr_idle = native_apic_wait_icr_idle, 384 .wait_icr_idle = native_apic_wait_icr_idle,
374 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 385 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
375}; 386};
387
388/*
389 * We need to check for physflat first, so this order is important.
390 */
391apic_drivers(apic_physflat, apic_flat);
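The new physflat_probe() above selects "physical flat" whenever more than 8 CPUs are possible, and the apic_drivers(apic_physflat, apic_flat) ordering makes sure that check runs before the plain flat driver. The limit comes from flat logical mode giving each CPU a single bit in the 8-bit logical destination field, so a multicast IPI is just the OR of the target bits. A standalone sketch of that distinction; the CPU count is arbitrary:

#include <stdio.h>

int main(void)
{
        unsigned int num_possible_cpus = 4;     /* try 12 to see the fallback */

        if (num_possible_cpus > 8) {
                printf("physflat: one IPI per target, addressed by physical APIC ID\n");
                return 0;
        }

        /* flat mode: the logical APIC ID of cpu N is the single bit 1 << N */
        unsigned int dest = 0;
        for (unsigned int cpu = 1; cpu < num_possible_cpus; cpu++)
                dest |= 1u << cpu;              /* everyone but the boot CPU */

        printf("flat: one IPI to logical destination mask 0x%02x\n", dest);
        return 0;
}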
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
index e31b9ffe25f5..775b82bc655c 100644
--- a/arch/x86/kernel/apic/apic_noop.c
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -54,11 +54,6 @@ static u64 noop_apic_icr_read(void)
54 return 0; 54 return 0;
55} 55}
56 56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb) 57static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{ 58{
64 return 0; 59 return 0;
@@ -113,12 +108,6 @@ static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
113 cpumask_set_cpu(cpu, retmask); 108 cpumask_set_cpu(cpu, retmask);
114} 109}
115 110
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg) 111static u32 noop_apic_read(u32 reg)
123{ 112{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic)); 113 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
@@ -153,9 +142,7 @@ struct apic apic_noop = {
153 .ioapic_phys_id_map = default_ioapic_phys_id_map, 142 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL, 143 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL, 144 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157 145
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid, 146 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid, 147 .apicid_to_cpu_present = physid_set_mask_of_physid,
161 148
@@ -197,4 +184,8 @@ struct apic apic_noop = {
197 .icr_write = noop_apic_icr_write, 184 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle, 185 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle, 186 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
187
188#ifdef CONFIG_X86_32
189 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
190#endif
200}; 191};
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index cb804c5091b9..efd737e827f4 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -45,6 +45,12 @@ static unsigned long bigsmp_check_apicid_present(int bit)
45 return 1; 45 return 1;
46} 46}
47 47
48static int bigsmp_early_logical_apicid(int cpu)
49{
50 /* on bigsmp, logical apicid is the same as physical */
51 return early_per_cpu(x86_cpu_to_apicid, cpu);
52}
53
48static inline unsigned long calculate_ldr(int cpu) 54static inline unsigned long calculate_ldr(int cpu)
49{ 55{
50 unsigned long val, id; 56 unsigned long val, id;
@@ -80,11 +86,6 @@ static void bigsmp_setup_apic_routing(void)
80 nr_ioapics); 86 nr_ioapics);
81} 87}
82 88
83static int bigsmp_apicid_to_node(int logical_apicid)
84{
85 return apicid_2_node[hard_smp_processor_id()];
86}
87
88static int bigsmp_cpu_present_to_apicid(int mps_cpu) 89static int bigsmp_cpu_present_to_apicid(int mps_cpu)
89{ 90{
90 if (mps_cpu < nr_cpu_ids) 91 if (mps_cpu < nr_cpu_ids)
@@ -93,14 +94,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 94 return BAD_APICID;
94} 95}
95 96
96/* Mapping from cpu number to logical apicid */
97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
98{
99 if (cpu >= nr_cpu_ids)
100 return BAD_APICID;
101 return cpu_physical_id(cpu);
102}
103
104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 97static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
105{ 98{
106 /* For clustered we don't have a good way to do this yet - hack */ 99 /* For clustered we don't have a good way to do this yet - hack */
@@ -115,7 +108,11 @@ static int bigsmp_check_phys_apicid_present(int phys_apicid)
115/* As we are using single CPU as destination, pick only one CPU here */ 108/* As we are using single CPU as destination, pick only one CPU here */
116static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask) 109static unsigned int bigsmp_cpu_mask_to_apicid(const struct cpumask *cpumask)
117{ 110{
118 return bigsmp_cpu_to_logical_apicid(cpumask_first(cpumask)); 111 int cpu = cpumask_first(cpumask);
112
113 if (cpu < nr_cpu_ids)
114 return cpu_physical_id(cpu);
115 return BAD_APICID;
119} 116}
120 117
121static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask, 118static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
@@ -129,9 +126,9 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
129 */ 126 */
130 for_each_cpu_and(cpu, cpumask, andmask) { 127 for_each_cpu_and(cpu, cpumask, andmask) {
131 if (cpumask_test_cpu(cpu, cpu_online_mask)) 128 if (cpumask_test_cpu(cpu, cpu_online_mask))
132 break; 129 return cpu_physical_id(cpu);
133 } 130 }
134 return bigsmp_cpu_to_logical_apicid(cpu); 131 return BAD_APICID;
135} 132}
136 133
137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 134static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -196,7 +193,7 @@ static int probe_bigsmp(void)
196 return dmi_bigsmp; 193 return dmi_bigsmp;
197} 194}
198 195
199struct apic apic_bigsmp = { 196static struct apic apic_bigsmp = {
200 197
201 .name = "bigsmp", 198 .name = "bigsmp",
202 .probe = probe_bigsmp, 199 .probe = probe_bigsmp,
@@ -219,8 +216,6 @@ struct apic apic_bigsmp = {
219 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map, 216 .ioapic_phys_id_map = bigsmp_ioapic_phys_id_map,
220 .setup_apic_routing = bigsmp_setup_apic_routing, 217 .setup_apic_routing = bigsmp_setup_apic_routing,
221 .multi_timer_check = NULL, 218 .multi_timer_check = NULL,
222 .apicid_to_node = bigsmp_apicid_to_node,
223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 219 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
225 .apicid_to_cpu_present = physid_set_mask_of_physid, 220 .apicid_to_cpu_present = physid_set_mask_of_physid,
226 .setup_portio_remap = NULL, 221 .setup_portio_remap = NULL,
@@ -256,4 +251,16 @@ struct apic apic_bigsmp = {
256 .icr_write = native_apic_icr_write, 251 .icr_write = native_apic_icr_write,
257 .wait_icr_idle = native_apic_wait_icr_idle, 252 .wait_icr_idle = native_apic_wait_icr_idle,
258 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 253 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
254
255 .x86_32_early_logical_apicid = bigsmp_early_logical_apicid,
259}; 256};
257
258struct apic * __init generic_bigsmp_probe(void)
259{
260 if (probe_bigsmp())
261 return &apic_bigsmp;
262
263 return NULL;
264}
265
266apic_driver(apic_bigsmp);
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 8593582d8022..9536b3fe43f8 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -460,6 +460,12 @@ static unsigned long es7000_check_apicid_present(int bit)
460 return physid_isset(bit, phys_cpu_present_map); 460 return physid_isset(bit, phys_cpu_present_map);
461} 461}
462 462
463static int es7000_early_logical_apicid(int cpu)
464{
465 /* on es7000, logical apicid is the same as physical */
466 return early_per_cpu(x86_bios_cpu_apicid, cpu);
467}
468
463static unsigned long calculate_ldr(int cpu) 469static unsigned long calculate_ldr(int cpu)
464{ 470{
465 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu); 471 unsigned long id = per_cpu(x86_bios_cpu_apicid, cpu);
@@ -504,12 +510,6 @@ static void es7000_setup_apic_routing(void)
504 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 510 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
505} 511}
506 512
507static int es7000_apicid_to_node(int logical_apicid)
508{
509 return 0;
510}
511
512
513static int es7000_cpu_present_to_apicid(int mps_cpu) 513static int es7000_cpu_present_to_apicid(int mps_cpu)
514{ 514{
515 if (!mps_cpu) 515 if (!mps_cpu)
@@ -528,18 +528,6 @@ static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
528 ++cpu_id; 528 ++cpu_id;
529} 529}
530 530
531/* Mapping from cpu number to logical apicid */
532static int es7000_cpu_to_logical_apicid(int cpu)
533{
534#ifdef CONFIG_SMP
535 if (cpu >= nr_cpu_ids)
536 return BAD_APICID;
537 return cpu_2_logical_apicid[cpu];
538#else
539 return logical_smp_processor_id();
540#endif
541}
542
543static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap) 531static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
544{ 532{
545 /* For clustered we don't have a good way to do this yet - hack */ 533 /* For clustered we don't have a good way to do this yet - hack */
@@ -561,7 +549,7 @@ static unsigned int es7000_cpu_mask_to_apicid(const struct cpumask *cpumask)
561 * The cpus in the mask must all be on the apic cluster. 549 * The cpus in the mask must all be on the apic cluster.
562 */ 550 */
563 for_each_cpu(cpu, cpumask) { 551 for_each_cpu(cpu, cpumask) {
564 int new_apicid = es7000_cpu_to_logical_apicid(cpu); 552 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
565 553
566 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 554 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
567 WARN(1, "Not a valid mask!"); 555 WARN(1, "Not a valid mask!");
@@ -578,7 +566,7 @@ static unsigned int
578es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask, 566es7000_cpu_mask_to_apicid_and(const struct cpumask *inmask,
579 const struct cpumask *andmask) 567 const struct cpumask *andmask)
580{ 568{
581 int apicid = es7000_cpu_to_logical_apicid(0); 569 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
582 cpumask_var_t cpumask; 570 cpumask_var_t cpumask;
583 571
584 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 572 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -632,7 +620,7 @@ static int es7000_mps_oem_check_cluster(struct mpc_table *mpc, char *oem,
632} 620}
633 621
634/* We've been warned by a false positive warning.Use __refdata to keep calm. */ 622/* We've been warned by a false positive warning.Use __refdata to keep calm. */
635struct apic __refdata apic_es7000_cluster = { 623static struct apic __refdata apic_es7000_cluster = {
636 624
637 .name = "es7000", 625 .name = "es7000",
638 .probe = probe_es7000, 626 .probe = probe_es7000,
@@ -655,8 +643,6 @@ struct apic __refdata apic_es7000_cluster = {
655 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 643 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
656 .setup_apic_routing = es7000_setup_apic_routing, 644 .setup_apic_routing = es7000_setup_apic_routing,
657 .multi_timer_check = NULL, 645 .multi_timer_check = NULL,
658 .apicid_to_node = es7000_apicid_to_node,
659 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
660 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 646 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
661 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 647 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
662 .setup_portio_remap = NULL, 648 .setup_portio_remap = NULL,
@@ -695,9 +681,11 @@ struct apic __refdata apic_es7000_cluster = {
695 .icr_write = native_apic_icr_write, 681 .icr_write = native_apic_icr_write,
696 .wait_icr_idle = native_apic_wait_icr_idle, 682 .wait_icr_idle = native_apic_wait_icr_idle,
697 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 683 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
684
685 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
698}; 686};
699 687
700struct apic __refdata apic_es7000 = { 688static struct apic __refdata apic_es7000 = {
701 689
702 .name = "es7000", 690 .name = "es7000",
703 .probe = probe_es7000, 691 .probe = probe_es7000,
@@ -720,8 +708,6 @@ struct apic __refdata apic_es7000 = {
720 .ioapic_phys_id_map = es7000_ioapic_phys_id_map, 708 .ioapic_phys_id_map = es7000_ioapic_phys_id_map,
721 .setup_apic_routing = es7000_setup_apic_routing, 709 .setup_apic_routing = es7000_setup_apic_routing,
722 .multi_timer_check = NULL, 710 .multi_timer_check = NULL,
723 .apicid_to_node = es7000_apicid_to_node,
724 .cpu_to_logical_apicid = es7000_cpu_to_logical_apicid,
725 .cpu_present_to_apicid = es7000_cpu_present_to_apicid, 711 .cpu_present_to_apicid = es7000_cpu_present_to_apicid,
726 .apicid_to_cpu_present = es7000_apicid_to_cpu_present, 712 .apicid_to_cpu_present = es7000_apicid_to_cpu_present,
727 .setup_portio_remap = NULL, 713 .setup_portio_remap = NULL,
@@ -758,4 +744,12 @@ struct apic __refdata apic_es7000 = {
758 .icr_write = native_apic_icr_write, 744 .icr_write = native_apic_icr_write,
759 .wait_icr_idle = native_apic_wait_icr_idle, 745 .wait_icr_idle = native_apic_wait_icr_idle,
760 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 746 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
747
748 .x86_32_early_logical_apicid = es7000_early_logical_apicid,
761}; 749};
750
751/*
752 * Need to check for es7000 followed by es7000_cluster, so this order
753 * in apic_drivers is important.
754 */
755apic_drivers(apic_es7000, apic_es7000_cluster);
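Note: as in the bigsmp driver above, the per-driver cpu_to_logical_apicid() callback is dropped in favour of a logical APIC id captured early into a per-cpu variable. A short sketch of the lookup that callers switch to, assuming x86_cpu_to_logical_apicid is filled in during early boot (the helper name is illustrative):

/* Illustrative helper: read the logical APIC id captured at early boot. */
static int example_cpu_to_logical_apicid(int cpu)
{
        if (cpu >= nr_cpu_ids)
                return BAD_APICID;
        return early_per_cpu(x86_cpu_to_logical_apicid, cpu);
}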
diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c
index cefd6942f0e9..d5e57db0f7be 100644
--- a/arch/x86/kernel/apic/hw_nmi.c
+++ b/arch/x86/kernel/apic/hw_nmi.c
@@ -16,20 +16,33 @@
16#include <linux/kprobes.h> 16#include <linux/kprobes.h>
17#include <linux/nmi.h> 17#include <linux/nmi.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/delay.h>
19 20
21#ifdef CONFIG_HARDLOCKUP_DETECTOR
22u64 hw_nmi_get_sample_period(int watchdog_thresh)
23{
24 return (u64)(cpu_khz) * 1000 * watchdog_thresh;
25}
26#endif
27
28#ifdef arch_trigger_all_cpu_backtrace
20/* For reliability, we're prepared to waste bits here. */ 29/* For reliability, we're prepared to waste bits here. */
21static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; 30static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
22 31
23u64 hw_nmi_get_sample_period(void) 32/* "in progress" flag of arch_trigger_all_cpu_backtrace */
24{ 33static unsigned long backtrace_flag;
25 return (u64)(cpu_khz) * 1000 * 60;
26}
27 34
28#ifdef ARCH_HAS_NMI_WATCHDOG
29void arch_trigger_all_cpu_backtrace(void) 35void arch_trigger_all_cpu_backtrace(void)
30{ 36{
31 int i; 37 int i;
32 38
39 if (test_and_set_bit(0, &backtrace_flag))
40 /*
41 * If there is already a trigger_all_cpu_backtrace() in progress
42 * (backtrace_flag == 1), don't output double cpu dump infos.
43 */
44 return;
45
33 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); 46 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
34 47
35 printk(KERN_INFO "sending NMI to all CPUs:\n"); 48 printk(KERN_INFO "sending NMI to all CPUs:\n");
@@ -41,6 +54,9 @@ void arch_trigger_all_cpu_backtrace(void)
41 break; 54 break;
42 mdelay(1); 55 mdelay(1);
43 } 56 }
57
58 clear_bit(0, &backtrace_flag);
59 smp_mb__after_clear_bit();
44} 60}
45 61
46static int __kprobes 62static int __kprobes
@@ -49,11 +65,10 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
49{ 65{
50 struct die_args *args = __args; 66 struct die_args *args = __args;
51 struct pt_regs *regs; 67 struct pt_regs *regs;
52 int cpu = smp_processor_id(); 68 int cpu;
53 69
54 switch (cmd) { 70 switch (cmd) {
55 case DIE_NMI: 71 case DIE_NMI:
56 case DIE_NMI_IPI:
57 break; 72 break;
58 73
59 default: 74 default:
@@ -61,6 +76,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
61 } 76 }
62 77
63 regs = args->regs; 78 regs = args->regs;
79 cpu = smp_processor_id();
64 80
65 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) { 81 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
66 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; 82 static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED;
@@ -68,7 +84,6 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
68 arch_spin_lock(&lock); 84 arch_spin_lock(&lock);
69 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 85 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
70 show_regs(regs); 86 show_regs(regs);
71 dump_stack();
72 arch_spin_unlock(&lock); 87 arch_spin_unlock(&lock);
73 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); 88 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
74 return NOTIFY_STOP; 89 return NOTIFY_STOP;
@@ -80,7 +95,7 @@ arch_trigger_all_cpu_backtrace_handler(struct notifier_block *self,
80static __read_mostly struct notifier_block backtrace_notifier = { 95static __read_mostly struct notifier_block backtrace_notifier = {
81 .notifier_call = arch_trigger_all_cpu_backtrace_handler, 96 .notifier_call = arch_trigger_all_cpu_backtrace_handler,
82 .next = NULL, 97 .next = NULL,
83 .priority = 1 98 .priority = NMI_LOCAL_LOW_PRIOR,
84}; 99};
85 100
86static int __init register_trigger_all_cpu_backtrace(void) 101static int __init register_trigger_all_cpu_backtrace(void)
@@ -90,18 +105,3 @@ static int __init register_trigger_all_cpu_backtrace(void)
90} 105}
91early_initcall(register_trigger_all_cpu_backtrace); 106early_initcall(register_trigger_all_cpu_backtrace);
92#endif 107#endif
93
94/* STUB calls to mimic old nmi_watchdog behaviour */
95#if defined(CONFIG_X86_LOCAL_APIC)
96unsigned int nmi_watchdog = NMI_NONE;
97EXPORT_SYMBOL(nmi_watchdog);
98void acpi_nmi_enable(void) { return; }
99void acpi_nmi_disable(void) { return; }
100#endif
101atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
102EXPORT_SYMBOL(nmi_active);
103int unknown_nmi_panic;
104void cpu_nmi_set_wd_enabled(void) { return; }
105void stop_apic_nmi_watchdog(void *unused) { return; }
106void setup_apic_nmi_watchdog(void *unused) { return; }
107int __init check_nmi_watchdog(void) { return 0; }
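Note: the backtrace path above gains a single "in progress" bit so overlapping NMI backtrace requests do not interleave their output. The same guard idiom in isolation, with a hypothetical do_backtrace_work() as the payload:

static unsigned long op_in_flight;      /* bit 0 set while a dump is running */

static void trigger_backtrace_once(void)
{
        if (test_and_set_bit(0, &op_in_flight))
                return;                 /* another caller already owns the dump */

        do_backtrace_work();            /* hypothetical payload */

        clear_bit(0, &op_in_flight);
        smp_mb__after_clear_bit();      /* publish completion before returning */
}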
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index 5c5b8f3dddb5..e5293394b548 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -30,7 +30,7 @@
30#include <linux/compiler.h> 30#include <linux/compiler.h>
31#include <linux/acpi.h> 31#include <linux/acpi.h>
32#include <linux/module.h> 32#include <linux/module.h>
33#include <linux/sysdev.h> 33#include <linux/syscore_ops.h>
34#include <linux/msi.h> 34#include <linux/msi.h>
35#include <linux/htirq.h> 35#include <linux/htirq.h>
36#include <linux/freezer.h> 36#include <linux/freezer.h>
@@ -54,7 +54,6 @@
54#include <asm/dma.h> 54#include <asm/dma.h>
55#include <asm/timer.h> 55#include <asm/timer.h>
56#include <asm/i8259.h> 56#include <asm/i8259.h>
57#include <asm/nmi.h>
58#include <asm/msidef.h> 57#include <asm/msidef.h>
59#include <asm/hypertransport.h> 58#include <asm/hypertransport.h>
60#include <asm/setup.h> 59#include <asm/setup.h>
@@ -77,17 +76,40 @@ int sis_apic_bug = -1;
77static DEFINE_RAW_SPINLOCK(ioapic_lock); 76static DEFINE_RAW_SPINLOCK(ioapic_lock);
78static DEFINE_RAW_SPINLOCK(vector_lock); 77static DEFINE_RAW_SPINLOCK(vector_lock);
79 78
80/* 79static struct ioapic {
81 * # of IRQ routing registers 80 /*
82 */ 81 * # of IRQ routing registers
83int nr_ioapic_registers[MAX_IO_APICS]; 82 */
83 int nr_registers;
84 /*
85 * Saved state during suspend/resume, or while enabling intr-remap.
86 */
87 struct IO_APIC_route_entry *saved_registers;
88 /* I/O APIC config */
89 struct mpc_ioapic mp_config;
90 /* IO APIC gsi routing info */
91 struct mp_ioapic_gsi gsi_config;
92 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
93} ioapics[MAX_IO_APICS];
84 94
85/* I/O APIC entries */ 95#define mpc_ioapic_ver(id) ioapics[id].mp_config.apicver
86struct mpc_ioapic mp_ioapics[MAX_IO_APICS];
87int nr_ioapics;
88 96
89/* IO APIC gsi routing info */ 97int mpc_ioapic_id(int id)
90struct mp_ioapic_gsi mp_gsi_routing[MAX_IO_APICS]; 98{
99 return ioapics[id].mp_config.apicid;
100}
101
102unsigned int mpc_ioapic_addr(int id)
103{
104 return ioapics[id].mp_config.apicaddr;
105}
106
107struct mp_ioapic_gsi *mp_ioapic_gsi_routing(int id)
108{
109 return &ioapics[id].gsi_config;
110}
111
112int nr_ioapics;
91 113
92/* The one past the highest gsi number used */ 114/* The one past the highest gsi number used */
93u32 gsi_top; 115u32 gsi_top;
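Note: the three parallel arrays (nr_ioapic_registers[], mp_ioapics[], mp_gsi_routing[]) are folded into the single ioapics[] array above, with small accessors so callers never touch the struct layout directly. A hypothetical in-file user of the new accessors could look like this:

/* Illustrative dump routine using the consolidated per-IOAPIC state. */
static void dump_one_ioapic(int idx)
{
        printk(KERN_DEBUG "IOAPIC %d at 0x%x has %d routing registers\n",
               mpc_ioapic_id(idx), mpc_ioapic_addr(idx),
               ioapics[idx].nr_registers);
}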
@@ -109,7 +131,10 @@ DECLARE_BITMAP(mp_bus_not_pci, MAX_MP_BUSSES);
109 131
110int skip_ioapic_setup; 132int skip_ioapic_setup;
111 133
112void arch_disable_smp_support(void) 134/**
135 * disable_ioapic_support() - disables ioapic support at runtime
136 */
137void disable_ioapic_support(void)
113{ 138{
114#ifdef CONFIG_PCI 139#ifdef CONFIG_PCI
115 noioapicquirk = 1; 140 noioapicquirk = 1;
@@ -121,25 +146,45 @@ void arch_disable_smp_support(void)
121static int __init parse_noapic(char *str) 146static int __init parse_noapic(char *str)
122{ 147{
123 /* disable IO-APIC */ 148 /* disable IO-APIC */
124 arch_disable_smp_support(); 149 disable_ioapic_support();
125 return 0; 150 return 0;
126} 151}
127early_param("noapic", parse_noapic); 152early_param("noapic", parse_noapic);
128 153
154static int io_apic_setup_irq_pin(unsigned int irq, int node,
155 struct io_apic_irq_attr *attr);
156
157/* Will be called in mpparse/acpi/sfi codes for saving IRQ info */
158void mp_save_irq(struct mpc_intsrc *m)
159{
160 int i;
161
162 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
163 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
164 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
165 m->srcbusirq, m->dstapic, m->dstirq);
166
167 for (i = 0; i < mp_irq_entries; i++) {
168 if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
169 return;
170 }
171
172 memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
173 if (++mp_irq_entries == MAX_IRQ_SOURCES)
174 panic("Max # of irq sources exceeded!!\n");
175}
176
129struct irq_pin_list { 177struct irq_pin_list {
130 int apic, pin; 178 int apic, pin;
131 struct irq_pin_list *next; 179 struct irq_pin_list *next;
132}; 180};
133 181
134static struct irq_pin_list *get_one_free_irq_2_pin(int node) 182static struct irq_pin_list *alloc_irq_pin_list(int node)
135{ 183{
136 struct irq_pin_list *pin; 184 return kzalloc_node(sizeof(struct irq_pin_list), GFP_KERNEL, node);
137
138 pin = kzalloc_node(sizeof(*pin), GFP_ATOMIC, node);
139
140 return pin;
141} 185}
142 186
187
143/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 188/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
144#ifdef CONFIG_SPARSE_IRQ 189#ifdef CONFIG_SPARSE_IRQ
145static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY]; 190static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
@@ -150,25 +195,32 @@ static struct irq_cfg irq_cfgx[NR_IRQS];
150int __init arch_early_irq_init(void) 195int __init arch_early_irq_init(void)
151{ 196{
152 struct irq_cfg *cfg; 197 struct irq_cfg *cfg;
153 struct irq_desc *desc; 198 int count, node, i;
154 int count;
155 int node;
156 int i;
157 199
158 if (!legacy_pic->nr_legacy_irqs) { 200 if (!legacy_pic->nr_legacy_irqs) {
159 nr_irqs_gsi = 0; 201 nr_irqs_gsi = 0;
160 io_apic_irqs = ~0UL; 202 io_apic_irqs = ~0UL;
161 } 203 }
162 204
205 for (i = 0; i < nr_ioapics; i++) {
206 ioapics[i].saved_registers =
207 kzalloc(sizeof(struct IO_APIC_route_entry) *
208 ioapics[i].nr_registers, GFP_KERNEL);
209 if (!ioapics[i].saved_registers)
210 pr_err("IOAPIC %d: suspend/resume impossible!\n", i);
211 }
212
163 cfg = irq_cfgx; 213 cfg = irq_cfgx;
164 count = ARRAY_SIZE(irq_cfgx); 214 count = ARRAY_SIZE(irq_cfgx);
165 node= cpu_to_node(boot_cpu_id); 215 node = cpu_to_node(0);
216
217 /* Make sure the legacy interrupts are marked in the bitmap */
218 irq_reserve_irqs(0, legacy_pic->nr_legacy_irqs);
166 219
167 for (i = 0; i < count; i++) { 220 for (i = 0; i < count; i++) {
168 desc = irq_to_desc(i); 221 irq_set_chip_data(i, &cfg[i]);
169 desc->chip_data = &cfg[i]; 222 zalloc_cpumask_var_node(&cfg[i].domain, GFP_KERNEL, node);
170 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 223 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_KERNEL, node);
171 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
172 /* 224 /*
173 * For legacy IRQ's, start with assigning irq0 to irq15 to 225 * For legacy IRQ's, start with assigning irq0 to irq15 to
174 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0. 226 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
@@ -183,170 +235,88 @@ int __init arch_early_irq_init(void)
183} 235}
184 236
185#ifdef CONFIG_SPARSE_IRQ 237#ifdef CONFIG_SPARSE_IRQ
186struct irq_cfg *irq_cfg(unsigned int irq) 238static struct irq_cfg *irq_cfg(unsigned int irq)
187{ 239{
188 struct irq_cfg *cfg = NULL; 240 return irq_get_chip_data(irq);
189 struct irq_desc *desc;
190
191 desc = irq_to_desc(irq);
192 if (desc)
193 cfg = desc->chip_data;
194
195 return cfg;
196} 241}
197 242
198static struct irq_cfg *get_one_free_irq_cfg(int node) 243static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
199{ 244{
200 struct irq_cfg *cfg; 245 struct irq_cfg *cfg;
201 246
202 cfg = kzalloc_node(sizeof(*cfg), GFP_ATOMIC, node); 247 cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);
203 if (cfg) { 248 if (!cfg)
204 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_ATOMIC, node)) { 249 return NULL;
205 kfree(cfg); 250 if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
206 cfg = NULL; 251 goto out_cfg;
207 } else if (!zalloc_cpumask_var_node(&cfg->old_domain, 252 if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
208 GFP_ATOMIC, node)) { 253 goto out_domain;
209 free_cpumask_var(cfg->domain);
210 kfree(cfg);
211 cfg = NULL;
212 }
213 }
214
215 return cfg; 254 return cfg;
255out_domain:
256 free_cpumask_var(cfg->domain);
257out_cfg:
258 kfree(cfg);
259 return NULL;
216} 260}
217 261
218int arch_init_chip_data(struct irq_desc *desc, int node) 262static void free_irq_cfg(unsigned int at, struct irq_cfg *cfg)
219{
220 struct irq_cfg *cfg;
221
222 cfg = desc->chip_data;
223 if (!cfg) {
224 desc->chip_data = get_one_free_irq_cfg(node);
225 if (!desc->chip_data) {
226 printk(KERN_ERR "can not alloc irq_cfg\n");
227 BUG_ON(1);
228 }
229 }
230
231 return 0;
232}
233
234/* for move_irq_desc */
235static void
236init_copy_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg, int node)
237{ 263{
238 struct irq_pin_list *old_entry, *head, *tail, *entry; 264 if (!cfg)
239
240 cfg->irq_2_pin = NULL;
241 old_entry = old_cfg->irq_2_pin;
242 if (!old_entry)
243 return;
244
245 entry = get_one_free_irq_2_pin(node);
246 if (!entry)
247 return; 265 return;
266 irq_set_chip_data(at, NULL);
267 free_cpumask_var(cfg->domain);
268 free_cpumask_var(cfg->old_domain);
269 kfree(cfg);
270}
248 271
249 entry->apic = old_entry->apic; 272#else
250 entry->pin = old_entry->pin;
251 head = entry;
252 tail = entry;
253 old_entry = old_entry->next;
254 while (old_entry) {
255 entry = get_one_free_irq_2_pin(node);
256 if (!entry) {
257 entry = head;
258 while (entry) {
259 head = entry->next;
260 kfree(entry);
261 entry = head;
262 }
263 /* still use the old one */
264 return;
265 }
266 entry->apic = old_entry->apic;
267 entry->pin = old_entry->pin;
268 tail->next = entry;
269 tail = entry;
270 old_entry = old_entry->next;
271 }
272 273
273 tail->next = NULL; 274struct irq_cfg *irq_cfg(unsigned int irq)
274 cfg->irq_2_pin = head; 275{
276 return irq < nr_irqs ? irq_cfgx + irq : NULL;
275} 277}
276 278
277static void free_irq_2_pin(struct irq_cfg *old_cfg, struct irq_cfg *cfg) 279static struct irq_cfg *alloc_irq_cfg(unsigned int irq, int node)
278{ 280{
279 struct irq_pin_list *entry, *next; 281 return irq_cfgx + irq;
280 282}
281 if (old_cfg->irq_2_pin == cfg->irq_2_pin)
282 return;
283 283
284 entry = old_cfg->irq_2_pin; 284static inline void free_irq_cfg(unsigned int at, struct irq_cfg *cfg) { }
285 285
286 while (entry) { 286#endif
287 next = entry->next;
288 kfree(entry);
289 entry = next;
290 }
291 old_cfg->irq_2_pin = NULL;
292}
293 287
294void arch_init_copy_chip_data(struct irq_desc *old_desc, 288static struct irq_cfg *alloc_irq_and_cfg_at(unsigned int at, int node)
295 struct irq_desc *desc, int node)
296{ 289{
290 int res = irq_alloc_desc_at(at, node);
297 struct irq_cfg *cfg; 291 struct irq_cfg *cfg;
298 struct irq_cfg *old_cfg;
299 292
300 cfg = get_one_free_irq_cfg(node); 293 if (res < 0) {
301 294 if (res != -EEXIST)
302 if (!cfg) 295 return NULL;
303 return; 296 cfg = irq_get_chip_data(at);
304 297 if (cfg)
305 desc->chip_data = cfg; 298 return cfg;
306 299 }
307 old_cfg = old_desc->chip_data;
308
309 cfg->vector = old_cfg->vector;
310 cfg->move_in_progress = old_cfg->move_in_progress;
311 cpumask_copy(cfg->domain, old_cfg->domain);
312 cpumask_copy(cfg->old_domain, old_cfg->old_domain);
313
314 init_copy_irq_2_pin(old_cfg, cfg, node);
315}
316 300
317static void free_irq_cfg(struct irq_cfg *cfg) 301 cfg = alloc_irq_cfg(at, node);
318{ 302 if (cfg)
319 free_cpumask_var(cfg->domain); 303 irq_set_chip_data(at, cfg);
320 free_cpumask_var(cfg->old_domain); 304 else
321 kfree(cfg); 305 irq_free_desc(at);
306 return cfg;
322} 307}
323 308
324void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc) 309static int alloc_irq_from(unsigned int from, int node)
325{ 310{
326 struct irq_cfg *old_cfg, *cfg; 311 return irq_alloc_desc_from(from, node);
327
328 old_cfg = old_desc->chip_data;
329 cfg = desc->chip_data;
330
331 if (old_cfg == cfg)
332 return;
333
334 if (old_cfg) {
335 free_irq_2_pin(old_cfg, cfg);
336 free_irq_cfg(old_cfg);
337 old_desc->chip_data = NULL;
338 }
339} 312}
340/* end for move_irq_desc */
341 313
342#else 314static void free_irq_at(unsigned int at, struct irq_cfg *cfg)
343struct irq_cfg *irq_cfg(unsigned int irq)
344{ 315{
345 return irq < nr_irqs ? irq_cfgx + irq : NULL; 316 free_irq_cfg(at, cfg);
317 irq_free_desc(at);
346} 318}
347 319
348#endif
349
350struct io_apic { 320struct io_apic {
351 unsigned int index; 321 unsigned int index;
352 unsigned int unused[3]; 322 unsigned int unused[3];
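Note: alloc_irq_cfg() above also moves from nested GFP_ATOMIC error handling to GFP_KERNEL allocations with goto-based unwinding, where each label frees exactly what was acquired before the failing step. The same idiom on a made-up structure, for reference:

/* Sketch of the unwind style, using a made-up 'example_cfg' type. */
struct example_cfg {
        cpumask_var_t domain;
        cpumask_var_t old_domain;
};

static struct example_cfg *alloc_example_cfg(int node)
{
        struct example_cfg *cfg = kzalloc_node(sizeof(*cfg), GFP_KERNEL, node);

        if (!cfg)
                return NULL;
        if (!zalloc_cpumask_var_node(&cfg->domain, GFP_KERNEL, node))
                goto out_cfg;
        if (!zalloc_cpumask_var_node(&cfg->old_domain, GFP_KERNEL, node))
                goto out_domain;
        return cfg;

out_domain:
        free_cpumask_var(cfg->domain);
out_cfg:
        kfree(cfg);
        return NULL;
}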
@@ -358,7 +328,7 @@ struct io_apic {
358static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx) 328static __attribute_const__ struct io_apic __iomem *io_apic_base(int idx)
359{ 329{
360 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx) 330 return (void __iomem *) __fix_to_virt(FIX_IO_APIC_BASE_0 + idx)
361 + (mp_ioapics[idx].apicaddr & ~PAGE_MASK); 331 + (mpc_ioapic_addr(idx) & ~PAGE_MASK);
362} 332}
363 333
364static inline void io_apic_eoi(unsigned int apic, unsigned int vector) 334static inline void io_apic_eoi(unsigned int apic, unsigned int vector)
@@ -451,7 +421,7 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
451 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 421 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
452} 422}
453 423
454void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 424static void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
455{ 425{
456 unsigned long flags; 426 unsigned long flags;
457 raw_spin_lock_irqsave(&ioapic_lock, flags); 427 raw_spin_lock_irqsave(&ioapic_lock, flags);
@@ -481,7 +451,7 @@ static void ioapic_mask_entry(int apic, int pin)
481 * fast in the common case, and fast for shared ISA-space IRQs. 451 * fast in the common case, and fast for shared ISA-space IRQs.
482 */ 452 */
483static int 453static int
484add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin) 454__add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
485{ 455{
486 struct irq_pin_list **last, *entry; 456 struct irq_pin_list **last, *entry;
487 457
@@ -493,7 +463,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
493 last = &entry->next; 463 last = &entry->next;
494 } 464 }
495 465
496 entry = get_one_free_irq_2_pin(node); 466 entry = alloc_irq_pin_list(node);
497 if (!entry) { 467 if (!entry) {
498 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n", 468 printk(KERN_ERR "can not alloc irq_pin_list (%d,%d,%d)\n",
499 node, apic, pin); 469 node, apic, pin);
@@ -508,7 +478,7 @@ add_pin_to_irq_node_nopanic(struct irq_cfg *cfg, int node, int apic, int pin)
508 478
509static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin) 479static void add_pin_to_irq_node(struct irq_cfg *cfg, int node, int apic, int pin)
510{ 480{
511 if (add_pin_to_irq_node_nopanic(cfg, node, apic, pin)) 481 if (__add_pin_to_irq_node(cfg, node, apic, pin))
512 panic("IO-APIC: failed to add irq-pin. Can not proceed\n"); 482 panic("IO-APIC: failed to add irq-pin. Can not proceed\n");
513} 483}
514 484
@@ -571,11 +541,6 @@ static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
571 IO_APIC_REDIR_LEVEL_TRIGGER, NULL); 541 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
572} 542}
573 543
574static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
575{
576 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
577}
578
579static void io_apic_sync(struct irq_pin_list *entry) 544static void io_apic_sync(struct irq_pin_list *entry)
580{ 545{
581 /* 546 /*
@@ -587,44 +552,37 @@ static void io_apic_sync(struct irq_pin_list *entry)
587 readl(&io_apic->data); 552 readl(&io_apic->data);
588} 553}
589 554
590static void __mask_IO_APIC_irq(struct irq_cfg *cfg) 555static void mask_ioapic(struct irq_cfg *cfg)
591{ 556{
557 unsigned long flags;
558
559 raw_spin_lock_irqsave(&ioapic_lock, flags);
592 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 560 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
561 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
593} 562}
594 563
595static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 564static void mask_ioapic_irq(struct irq_data *data)
596{ 565{
597 struct irq_cfg *cfg = desc->chip_data; 566 mask_ioapic(data->chip_data);
598 unsigned long flags; 567}
599
600 BUG_ON(!cfg);
601 568
602 raw_spin_lock_irqsave(&ioapic_lock, flags); 569static void __unmask_ioapic(struct irq_cfg *cfg)
603 __mask_IO_APIC_irq(cfg); 570{
604 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 571 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED, 0, NULL);
605} 572}
606 573
607static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 574static void unmask_ioapic(struct irq_cfg *cfg)
608{ 575{
609 struct irq_cfg *cfg = desc->chip_data;
610 unsigned long flags; 576 unsigned long flags;
611 577
612 raw_spin_lock_irqsave(&ioapic_lock, flags); 578 raw_spin_lock_irqsave(&ioapic_lock, flags);
613 __unmask_IO_APIC_irq(cfg); 579 __unmask_ioapic(cfg);
614 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 580 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
615} 581}
616 582
617static void mask_IO_APIC_irq(unsigned int irq) 583static void unmask_ioapic_irq(struct irq_data *data)
618{ 584{
619 struct irq_desc *desc = irq_to_desc(irq); 585 unmask_ioapic(data->chip_data);
620
621 mask_IO_APIC_irq_desc(desc);
622}
623static void unmask_IO_APIC_irq(unsigned int irq)
624{
625 struct irq_desc *desc = irq_to_desc(irq);
626
627 unmask_IO_APIC_irq_desc(desc);
628} 586}
629 587
630static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin) 588static void clear_IO_APIC_pin(unsigned int apic, unsigned int pin)
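Note: the mask/unmask helpers above now take struct irq_data and fetch their irq_cfg from data->chip_data, which is what lets them plug straight into the irq_data-based irq_chip callbacks. A sketch of how such a chip would reference them (the initializer shown is illustrative, not the file's actual ioapic_chip):

/* Illustrative irq_chip wiring the irq_data-based mask/unmask methods. */
static struct irq_chip example_ioapic_chip = {
        .name           = "EXAMPLE-IO-APIC",
        .irq_mask       = mask_ioapic_irq,
        .irq_unmask     = unmask_ioapic_irq,
};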
@@ -646,7 +604,7 @@ static void clear_IO_APIC (void)
646 int apic, pin; 604 int apic, pin;
647 605
648 for (apic = 0; apic < nr_ioapics; apic++) 606 for (apic = 0; apic < nr_ioapics; apic++)
649 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 607 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
650 clear_IO_APIC_pin(apic, pin); 608 clear_IO_APIC_pin(apic, pin);
651} 609}
652 610
@@ -688,74 +646,43 @@ static int __init ioapic_pirq_setup(char *str)
688__setup("pirq=", ioapic_pirq_setup); 646__setup("pirq=", ioapic_pirq_setup);
689#endif /* CONFIG_X86_32 */ 647#endif /* CONFIG_X86_32 */
690 648
691struct IO_APIC_route_entry **alloc_ioapic_entries(void)
692{
693 int apic;
694 struct IO_APIC_route_entry **ioapic_entries;
695
696 ioapic_entries = kzalloc(sizeof(*ioapic_entries) * nr_ioapics,
697 GFP_ATOMIC);
698 if (!ioapic_entries)
699 return 0;
700
701 for (apic = 0; apic < nr_ioapics; apic++) {
702 ioapic_entries[apic] =
703 kzalloc(sizeof(struct IO_APIC_route_entry) *
704 nr_ioapic_registers[apic], GFP_ATOMIC);
705 if (!ioapic_entries[apic])
706 goto nomem;
707 }
708
709 return ioapic_entries;
710
711nomem:
712 while (--apic >= 0)
713 kfree(ioapic_entries[apic]);
714 kfree(ioapic_entries);
715
716 return 0;
717}
718
719/* 649/*
720 * Saves all the IO-APIC RTE's 650 * Saves all the IO-APIC RTE's
721 */ 651 */
722int save_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 652int save_ioapic_entries(void)
723{ 653{
724 int apic, pin; 654 int apic, pin;
725 655 int err = 0;
726 if (!ioapic_entries)
727 return -ENOMEM;
728 656
729 for (apic = 0; apic < nr_ioapics; apic++) { 657 for (apic = 0; apic < nr_ioapics; apic++) {
730 if (!ioapic_entries[apic]) 658 if (!ioapics[apic].saved_registers) {
731 return -ENOMEM; 659 err = -ENOMEM;
660 continue;
661 }
732 662
733 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 663 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
734 ioapic_entries[apic][pin] = 664 ioapics[apic].saved_registers[pin] =
735 ioapic_read_entry(apic, pin); 665 ioapic_read_entry(apic, pin);
736 } 666 }
737 667
738 return 0; 668 return err;
739} 669}
740 670
741/* 671/*
742 * Mask all IO APIC entries. 672 * Mask all IO APIC entries.
743 */ 673 */
744void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 674void mask_ioapic_entries(void)
745{ 675{
746 int apic, pin; 676 int apic, pin;
747 677
748 if (!ioapic_entries)
749 return;
750
751 for (apic = 0; apic < nr_ioapics; apic++) { 678 for (apic = 0; apic < nr_ioapics; apic++) {
752 if (!ioapic_entries[apic]) 679 if (!ioapics[apic].saved_registers)
753 break; 680 continue;
754 681
755 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 682 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
756 struct IO_APIC_route_entry entry; 683 struct IO_APIC_route_entry entry;
757 684
758 entry = ioapic_entries[apic][pin]; 685 entry = ioapics[apic].saved_registers[pin];
759 if (!entry.mask) { 686 if (!entry.mask) {
760 entry.mask = 1; 687 entry.mask = 1;
761 ioapic_write_entry(apic, pin, entry); 688 ioapic_write_entry(apic, pin, entry);
@@ -765,36 +692,23 @@ void mask_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries)
765} 692}
766 693
767/* 694/*
768 * Restore IO APIC entries which was saved in ioapic_entries. 695 * Restore IO APIC entries which was saved in the ioapic structure.
769 */ 696 */
770int restore_IO_APIC_setup(struct IO_APIC_route_entry **ioapic_entries) 697int restore_ioapic_entries(void)
771{ 698{
772 int apic, pin; 699 int apic, pin;
773 700
774 if (!ioapic_entries)
775 return -ENOMEM;
776
777 for (apic = 0; apic < nr_ioapics; apic++) { 701 for (apic = 0; apic < nr_ioapics; apic++) {
778 if (!ioapic_entries[apic]) 702 if (!ioapics[apic].saved_registers)
779 return -ENOMEM; 703 continue;
780 704
781 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) 705 for (pin = 0; pin < ioapics[apic].nr_registers; pin++)
782 ioapic_write_entry(apic, pin, 706 ioapic_write_entry(apic, pin,
783 ioapic_entries[apic][pin]); 707 ioapics[apic].saved_registers[pin]);
784 } 708 }
785 return 0; 709 return 0;
786} 710}
787 711
788void free_ioapic_entries(struct IO_APIC_route_entry **ioapic_entries)
789{
790 int apic;
791
792 for (apic = 0; apic < nr_ioapics; apic++)
793 kfree(ioapic_entries[apic]);
794
795 kfree(ioapic_entries);
796}
797
798/* 712/*
799 * Find the IRQ entry number of a certain pin. 713 * Find the IRQ entry number of a certain pin.
800 */ 714 */
@@ -804,7 +718,7 @@ static int find_irq_entry(int apic, int pin, int type)
804 718
805 for (i = 0; i < mp_irq_entries; i++) 719 for (i = 0; i < mp_irq_entries; i++)
806 if (mp_irqs[i].irqtype == type && 720 if (mp_irqs[i].irqtype == type &&
807 (mp_irqs[i].dstapic == mp_ioapics[apic].apicid || 721 (mp_irqs[i].dstapic == mpc_ioapic_id(apic) ||
808 mp_irqs[i].dstapic == MP_APIC_ALL) && 722 mp_irqs[i].dstapic == MP_APIC_ALL) &&
809 mp_irqs[i].dstirq == pin) 723 mp_irqs[i].dstirq == pin)
810 return i; 724 return i;
@@ -846,7 +760,7 @@ static int __init find_isa_irq_apic(int irq, int type)
846 if (i < mp_irq_entries) { 760 if (i < mp_irq_entries) {
847 int apic; 761 int apic;
848 for(apic = 0; apic < nr_ioapics; apic++) { 762 for(apic = 0; apic < nr_ioapics; apic++) {
849 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic) 763 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic)
850 return apic; 764 return apic;
851 } 765 }
852 } 766 }
@@ -897,7 +811,7 @@ static int EISA_ELCR(unsigned int irq)
897#define default_MCA_trigger(idx) (1) 811#define default_MCA_trigger(idx) (1)
898#define default_MCA_polarity(idx) default_ISA_polarity(idx) 812#define default_MCA_polarity(idx) default_ISA_polarity(idx)
899 813
900static int MPBIOS_polarity(int idx) 814static int irq_polarity(int idx)
901{ 815{
902 int bus = mp_irqs[idx].srcbus; 816 int bus = mp_irqs[idx].srcbus;
903 int polarity; 817 int polarity;
@@ -939,7 +853,7 @@ static int MPBIOS_polarity(int idx)
939 return polarity; 853 return polarity;
940} 854}
941 855
942static int MPBIOS_trigger(int idx) 856static int irq_trigger(int idx)
943{ 857{
944 int bus = mp_irqs[idx].srcbus; 858 int bus = mp_irqs[idx].srcbus;
945 int trigger; 859 int trigger;
@@ -1011,20 +925,11 @@ static int MPBIOS_trigger(int idx)
1011 return trigger; 925 return trigger;
1012} 926}
1013 927
1014static inline int irq_polarity(int idx)
1015{
1016 return MPBIOS_polarity(idx);
1017}
1018
1019static inline int irq_trigger(int idx)
1020{
1021 return MPBIOS_trigger(idx);
1022}
1023
1024static int pin_2_irq(int idx, int apic, int pin) 928static int pin_2_irq(int idx, int apic, int pin)
1025{ 929{
1026 int irq; 930 int irq;
1027 int bus = mp_irqs[idx].srcbus; 931 int bus = mp_irqs[idx].srcbus;
932 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(apic);
1028 933
1029 /* 934 /*
1030 * Debugging check, we are in big trouble if this message pops up! 935 * Debugging check, we are in big trouble if this message pops up!
@@ -1035,7 +940,7 @@ static int pin_2_irq(int idx, int apic, int pin)
1035 if (test_bit(bus, mp_bus_not_pci)) { 940 if (test_bit(bus, mp_bus_not_pci)) {
1036 irq = mp_irqs[idx].srcbusirq; 941 irq = mp_irqs[idx].srcbusirq;
1037 } else { 942 } else {
1038 u32 gsi = mp_gsi_routing[apic].gsi_base + pin; 943 u32 gsi = gsi_cfg->gsi_base + pin;
1039 944
1040 if (gsi >= NR_IRQS_LEGACY) 945 if (gsi >= NR_IRQS_LEGACY)
1041 irq = gsi; 946 irq = gsi;
@@ -1086,7 +991,7 @@ int IO_APIC_get_PCI_irq_vector(int bus, int slot, int pin,
1086 int lbus = mp_irqs[i].srcbus; 991 int lbus = mp_irqs[i].srcbus;
1087 992
1088 for (apic = 0; apic < nr_ioapics; apic++) 993 for (apic = 0; apic < nr_ioapics; apic++)
1089 if (mp_ioapics[apic].apicid == mp_irqs[i].dstapic || 994 if (mpc_ioapic_id(apic) == mp_irqs[i].dstapic ||
1090 mp_irqs[i].dstapic == MP_APIC_ALL) 995 mp_irqs[i].dstapic == MP_APIC_ALL)
1091 break; 996 break;
1092 997
@@ -1259,7 +1164,6 @@ void __setup_vector_irq(int cpu)
1259 /* Initialize vector_irq on a new cpu */ 1164 /* Initialize vector_irq on a new cpu */
1260 int irq, vector; 1165 int irq, vector;
1261 struct irq_cfg *cfg; 1166 struct irq_cfg *cfg;
1262 struct irq_desc *desc;
1263 1167
1264 /* 1168 /*
1265 * vector_lock will make sure that we don't run into irq vector 1169 * vector_lock will make sure that we don't run into irq vector
@@ -1268,9 +1172,10 @@ void __setup_vector_irq(int cpu)
1268 */ 1172 */
1269 raw_spin_lock(&vector_lock); 1173 raw_spin_lock(&vector_lock);
1270 /* Mark the inuse vectors */ 1174 /* Mark the inuse vectors */
1271 for_each_irq_desc(irq, desc) { 1175 for_each_active_irq(irq) {
1272 cfg = desc->chip_data; 1176 cfg = irq_get_chip_data(irq);
1273 1177 if (!cfg)
1178 continue;
1274 /* 1179 /*
1275 * If it is a legacy IRQ handled by the legacy PIC, this cpu 1180 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1276 * will be part of the irq_cfg's domain. 1181 * will be part of the irq_cfg's domain.
@@ -1299,17 +1204,13 @@ void __setup_vector_irq(int cpu)
1299static struct irq_chip ioapic_chip; 1204static struct irq_chip ioapic_chip;
1300static struct irq_chip ir_ioapic_chip; 1205static struct irq_chip ir_ioapic_chip;
1301 1206
1302#define IOAPIC_AUTO -1
1303#define IOAPIC_EDGE 0
1304#define IOAPIC_LEVEL 1
1305
1306#ifdef CONFIG_X86_32 1207#ifdef CONFIG_X86_32
1307static inline int IO_APIC_irq_trigger(int irq) 1208static inline int IO_APIC_irq_trigger(int irq)
1308{ 1209{
1309 int apic, idx, pin; 1210 int apic, idx, pin;
1310 1211
1311 for (apic = 0; apic < nr_ioapics; apic++) { 1212 for (apic = 0; apic < nr_ioapics; apic++) {
1312 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1213 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1313 idx = find_irq_entry(apic, pin, mp_INT); 1214 idx = find_irq_entry(apic, pin, mp_INT);
1314 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin))) 1215 if ((idx != -1) && (irq == pin_2_irq(idx, apic, pin)))
1315 return irq_trigger(idx); 1216 return irq_trigger(idx);
@@ -1327,41 +1228,37 @@ static inline int IO_APIC_irq_trigger(int irq)
1327} 1228}
1328#endif 1229#endif
1329 1230
1330static void ioapic_register_intr(int irq, struct irq_desc *desc, unsigned long trigger) 1231static void ioapic_register_intr(unsigned int irq, struct irq_cfg *cfg,
1232 unsigned long trigger)
1331{ 1233{
1234 struct irq_chip *chip = &ioapic_chip;
1235 irq_flow_handler_t hdl;
1236 bool fasteoi;
1332 1237
1333 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1238 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) ||
1334 trigger == IOAPIC_LEVEL) 1239 trigger == IOAPIC_LEVEL) {
1335 desc->status |= IRQ_LEVEL; 1240 irq_set_status_flags(irq, IRQ_LEVEL);
1336 else 1241 fasteoi = true;
1337 desc->status &= ~IRQ_LEVEL; 1242 } else {
1338 1243 irq_clear_status_flags(irq, IRQ_LEVEL);
1339 if (irq_remapped(irq)) { 1244 fasteoi = false;
1340 desc->status |= IRQ_MOVE_PCNTXT;
1341 if (trigger)
1342 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1343 handle_fasteoi_irq,
1344 "fasteoi");
1345 else
1346 set_irq_chip_and_handler_name(irq, &ir_ioapic_chip,
1347 handle_edge_irq, "edge");
1348 return;
1349 } 1245 }
1350 1246
1351 if ((trigger == IOAPIC_AUTO && IO_APIC_irq_trigger(irq)) || 1247 if (irq_remapped(cfg)) {
1352 trigger == IOAPIC_LEVEL) 1248 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
1353 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1249 chip = &ir_ioapic_chip;
1354 handle_fasteoi_irq, 1250 fasteoi = trigger != 0;
1355 "fasteoi"); 1251 }
1356 else 1252
1357 set_irq_chip_and_handler_name(irq, &ioapic_chip, 1253 hdl = fasteoi ? handle_fasteoi_irq : handle_edge_irq;
1358 handle_edge_irq, "edge"); 1254 irq_set_chip_and_handler_name(irq, chip, hdl,
1255 fasteoi ? "fasteoi" : "edge");
1359} 1256}
1360 1257
1361int setup_ioapic_entry(int apic_id, int irq, 1258static int setup_ioapic_entry(int apic_id, int irq,
1362 struct IO_APIC_route_entry *entry, 1259 struct IO_APIC_route_entry *entry,
1363 unsigned int destination, int trigger, 1260 unsigned int destination, int trigger,
1364 int polarity, int vector, int pin) 1261 int polarity, int vector, int pin)
1365{ 1262{
1366 /* 1263 /*
1367 * add it to the IO-APIC irq-routing table: 1264 * add it to the IO-APIC irq-routing table:
@@ -1382,21 +1279,7 @@ int setup_ioapic_entry(int apic_id, int irq,
1382 if (index < 0) 1279 if (index < 0)
1383 panic("Failed to allocate IRTE for ioapic %d\n", apic_id); 1280 panic("Failed to allocate IRTE for ioapic %d\n", apic_id);
1384 1281
1385 memset(&irte, 0, sizeof(irte)); 1282 prepare_irte(&irte, vector, destination);
1386
1387 irte.present = 1;
1388 irte.dst_mode = apic->irq_dest_mode;
1389 /*
1390 * Trigger mode in the IRTE will always be edge, and the
1391 * actual level or edge trigger will be setup in the IO-APIC
1392 * RTE. This will help simplify level triggered irq migration.
1393 * For more details, see the comments above explainig IO-APIC
1394 * irq migration in the presence of interrupt-remapping.
1395 */
1396 irte.trigger_mode = 0;
1397 irte.dlvry_mode = apic->irq_delivery_mode;
1398 irte.vector = vector;
1399 irte.dest_id = IRTE_DEST(destination);
1400 1283
1401 /* Set source-id of interrupt request */ 1284 /* Set source-id of interrupt request */
1402 set_ioapic_sid(&irte, apic_id); 1285 set_ioapic_sid(&irte, apic_id);
@@ -1431,18 +1314,14 @@ int setup_ioapic_entry(int apic_id, int irq,
1431 return 0; 1314 return 0;
1432} 1315}
1433 1316
1434static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq_desc *desc, 1317static void setup_ioapic_irq(int apic_id, int pin, unsigned int irq,
1435 int trigger, int polarity) 1318 struct irq_cfg *cfg, int trigger, int polarity)
1436{ 1319{
1437 struct irq_cfg *cfg;
1438 struct IO_APIC_route_entry entry; 1320 struct IO_APIC_route_entry entry;
1439 unsigned int dest; 1321 unsigned int dest;
1440 1322
1441 if (!IO_APIC_IRQ(irq)) 1323 if (!IO_APIC_IRQ(irq))
1442 return; 1324 return;
1443
1444 cfg = desc->chip_data;
1445
1446 /* 1325 /*
1447 * For legacy irqs, cfg->domain starts with cpu 0 for legacy 1326 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1448 * controllers like 8259. Now that IO-APIC can handle this irq, update 1327 * controllers like 8259. Now that IO-APIC can handle this irq, update
@@ -1459,58 +1338,45 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1459 apic_printk(APIC_VERBOSE,KERN_DEBUG 1338 apic_printk(APIC_VERBOSE,KERN_DEBUG
1460 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> " 1339 "IOAPIC[%d]: Set routing entry (%d-%d -> 0x%x -> "
1461 "IRQ %d Mode:%i Active:%i)\n", 1340 "IRQ %d Mode:%i Active:%i)\n",
1462 apic_id, mp_ioapics[apic_id].apicid, pin, cfg->vector, 1341 apic_id, mpc_ioapic_id(apic_id), pin, cfg->vector,
1463 irq, trigger, polarity); 1342 irq, trigger, polarity);
1464 1343
1465 1344
1466 if (setup_ioapic_entry(mp_ioapics[apic_id].apicid, irq, &entry, 1345 if (setup_ioapic_entry(mpc_ioapic_id(apic_id), irq, &entry,
1467 dest, trigger, polarity, cfg->vector, pin)) { 1346 dest, trigger, polarity, cfg->vector, pin)) {
1468 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n", 1347 printk("Failed to setup ioapic entry for ioapic %d, pin %d\n",
1469 mp_ioapics[apic_id].apicid, pin); 1348 mpc_ioapic_id(apic_id), pin);
1470 __clear_irq_vector(irq, cfg); 1349 __clear_irq_vector(irq, cfg);
1471 return; 1350 return;
1472 } 1351 }
1473 1352
1474 ioapic_register_intr(irq, desc, trigger); 1353 ioapic_register_intr(irq, cfg, trigger);
1475 if (irq < legacy_pic->nr_legacy_irqs) 1354 if (irq < legacy_pic->nr_legacy_irqs)
1476 legacy_pic->chip->mask(irq); 1355 legacy_pic->mask(irq);
1477 1356
1478 ioapic_write_entry(apic_id, pin, entry); 1357 ioapic_write_entry(apic_id, pin, entry);
1479} 1358}
1480 1359
1481static struct { 1360static bool __init io_apic_pin_not_connected(int idx, int apic_id, int pin)
1482 DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
1483} mp_ioapic_routing[MAX_IO_APICS];
1484
1485static void __init setup_IO_APIC_irqs(void)
1486{ 1361{
1487 int apic_id, pin, idx, irq; 1362 if (idx != -1)
1488 int notcon = 0; 1363 return false;
1489 struct irq_desc *desc;
1490 struct irq_cfg *cfg;
1491 int node = cpu_to_node(boot_cpu_id);
1492 1364
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1365 apic_printk(APIC_VERBOSE, KERN_DEBUG " apic %d pin %d not connected\n",
1366 mpc_ioapic_id(apic_id), pin);
1367 return true;
1368}
1494 1369
1495 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) 1370static void __init __io_apic_setup_irqs(unsigned int apic_id)
1496 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1371{
1372 int idx, node = cpu_to_node(0);
1373 struct io_apic_irq_attr attr;
1374 unsigned int pin, irq;
1375
1376 for (pin = 0; pin < ioapics[apic_id].nr_registers; pin++) {
1497 idx = find_irq_entry(apic_id, pin, mp_INT); 1377 idx = find_irq_entry(apic_id, pin, mp_INT);
1498 if (idx == -1) { 1378 if (io_apic_pin_not_connected(idx, apic_id, pin))
1499 if (!notcon) {
1500 notcon = 1;
1501 apic_printk(APIC_VERBOSE,
1502 KERN_DEBUG " %d-%d",
1503 mp_ioapics[apic_id].apicid, pin);
1504 } else
1505 apic_printk(APIC_VERBOSE, " %d-%d",
1506 mp_ioapics[apic_id].apicid, pin);
1507 continue; 1379 continue;
1508 }
1509 if (notcon) {
1510 apic_printk(APIC_VERBOSE,
1511 " (apicid-pin) not connected\n");
1512 notcon = 0;
1513 }
1514 1380
1515 irq = pin_2_irq(idx, apic_id, pin); 1381 irq = pin_2_irq(idx, apic_id, pin);
1516 1382
@@ -1522,27 +1388,24 @@ static void __init setup_IO_APIC_irqs(void)
1522 * installed and if it returns 1: 1388 * installed and if it returns 1:
1523 */ 1389 */
1524 if (apic->multi_timer_check && 1390 if (apic->multi_timer_check &&
1525 apic->multi_timer_check(apic_id, irq)) 1391 apic->multi_timer_check(apic_id, irq))
1526 continue; 1392 continue;
1527 1393
1528 desc = irq_to_desc_alloc_node(irq, node); 1394 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1529 if (!desc) { 1395 irq_polarity(idx));
1530 printk(KERN_INFO "can not get irq_desc for %d\n", irq); 1396
1531 continue; 1397 io_apic_setup_irq_pin(irq, node, &attr);
1532 }
1533 cfg = desc->chip_data;
1534 add_pin_to_irq_node(cfg, node, apic_id, pin);
1535 /*
1536 * don't mark it in pin_programmed, so later acpi could
1537 * set it correctly when irq < 16
1538 */
1539 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1540 irq_trigger(idx), irq_polarity(idx));
1541 } 1398 }
1399}
1542 1400
1543 if (notcon) 1401static void __init setup_IO_APIC_irqs(void)
1544 apic_printk(APIC_VERBOSE, 1402{
1545 " (apicid-pin) not connected\n"); 1403 unsigned int apic_id;
1404
1405 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1406
1407 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1408 __io_apic_setup_irqs(apic_id);
1546} 1409}
1547 1410
1548/* 1411/*
@@ -1552,10 +1415,8 @@ static void __init setup_IO_APIC_irqs(void)
1552 */ 1415 */
1553void setup_IO_APIC_irq_extra(u32 gsi) 1416void setup_IO_APIC_irq_extra(u32 gsi)
1554{ 1417{
1555 int apic_id = 0, pin, idx, irq; 1418 int apic_id = 0, pin, idx, irq, node = cpu_to_node(0);
1556 int node = cpu_to_node(boot_cpu_id); 1419 struct io_apic_irq_attr attr;
1557 struct irq_desc *desc;
1558 struct irq_cfg *cfg;
1559 1420
1560 /* 1421 /*
1561 * Convert 'gsi' to 'ioapic.pin'. 1422 * Convert 'gsi' to 'ioapic.pin'.
@@ -1570,29 +1431,15 @@ void setup_IO_APIC_irq_extra(u32 gsi)
1570 return; 1431 return;
1571 1432
1572 irq = pin_2_irq(idx, apic_id, pin); 1433 irq = pin_2_irq(idx, apic_id, pin);
1573#ifdef CONFIG_SPARSE_IRQ
1574 desc = irq_to_desc(irq);
1575 if (desc)
1576 return;
1577#endif
1578 desc = irq_to_desc_alloc_node(irq, node);
1579 if (!desc) {
1580 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1581 return;
1582 }
1583 1434
1584 cfg = desc->chip_data; 1435 /* Only handle the non legacy irqs on secondary ioapics */
1585 add_pin_to_irq_node(cfg, node, apic_id, pin); 1436 if (apic_id == 0 || irq < NR_IRQS_LEGACY)
1586
1587 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1588 pr_debug("Pin %d-%d already programmed\n",
1589 mp_ioapics[apic_id].apicid, pin);
1590 return; 1437 return;
1591 }
1592 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1593 1438
1594 setup_IO_APIC_irq(apic_id, pin, irq, desc, 1439 set_io_apic_irq_attr(&attr, apic_id, pin, irq_trigger(idx),
1595 irq_trigger(idx), irq_polarity(idx)); 1440 irq_polarity(idx));
1441
1442 io_apic_setup_irq_pin_once(irq, node, &attr);
1596} 1443}
1597 1444
1598/* 1445/*
@@ -1624,7 +1471,8 @@ static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
1624 * The timer IRQ doesn't have to know that behind the 1471 * The timer IRQ doesn't have to know that behind the
1625 * scene we may have a 8259A-master in AEOI mode ... 1472 * scene we may have a 8259A-master in AEOI mode ...
1626 */ 1473 */
1627 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 1474 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
1475 "edge");
1628 1476
1629 /* 1477 /*
1630 * Add it to the IO-APIC irq-routing table: 1478 * Add it to the IO-APIC irq-routing table:
@@ -1642,13 +1490,12 @@ __apicdebuginit(void) print_IO_APIC(void)
1642 union IO_APIC_reg_03 reg_03; 1490 union IO_APIC_reg_03 reg_03;
1643 unsigned long flags; 1491 unsigned long flags;
1644 struct irq_cfg *cfg; 1492 struct irq_cfg *cfg;
1645 struct irq_desc *desc;
1646 unsigned int irq; 1493 unsigned int irq;
1647 1494
1648 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1495 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1649 for (i = 0; i < nr_ioapics; i++) 1496 for (i = 0; i < nr_ioapics; i++)
1650 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1497 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
1651 mp_ioapics[i].apicid, nr_ioapic_registers[i]); 1498 mpc_ioapic_id(i), ioapics[i].nr_registers);
1652 1499
1653 /* 1500 /*
1654 * We are a bit conservative about what we expect. We have to 1501 * We are a bit conservative about what we expect. We have to
@@ -1668,7 +1515,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1668 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1515 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1669 1516
1670 printk("\n"); 1517 printk("\n");
1671 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1518 printk(KERN_DEBUG "IO APIC #%d......\n", mpc_ioapic_id(apic));
1672 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw); 1519 printk(KERN_DEBUG ".... register #00: %08X\n", reg_00.raw);
1673 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID); 1520 printk(KERN_DEBUG "....... : physical APIC id: %02X\n", reg_00.bits.ID);
1674 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type); 1521 printk(KERN_DEBUG "....... : Delivery Type: %X\n", reg_00.bits.delivery_type);
@@ -1729,10 +1576,10 @@ __apicdebuginit(void) print_IO_APIC(void)
1729 } 1576 }
1730 } 1577 }
1731 printk(KERN_DEBUG "IRQ to pin mappings:\n"); 1578 printk(KERN_DEBUG "IRQ to pin mappings:\n");
1732 for_each_irq_desc(irq, desc) { 1579 for_each_active_irq(irq) {
1733 struct irq_pin_list *entry; 1580 struct irq_pin_list *entry;
1734 1581
1735 cfg = desc->chip_data; 1582 cfg = irq_get_chip_data(irq);
1736 if (!cfg) 1583 if (!cfg)
1737 continue; 1584 continue;
1738 entry = cfg->irq_2_pin; 1585 entry = cfg->irq_2_pin;
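This hunk starts swapping direct array accesses (mp_ioapics[i].apicid, nr_ioapic_registers[i], and later mp_ioapic_routing[i].pin_programmed) for a consolidated per-IO-APIC structure with small accessors. The declaration itself is not part of this diff; the uses below imply something along these lines (field names inferred from the call sites, so treat this as an assumption rather than the authoritative definition):

	static struct ioapic {
		int nr_registers;		/* replaces nr_ioapic_registers[i] */
		struct mpc_ioapic mp_config;	/* replaces mp_ioapics[i] */
		DECLARE_BITMAP(pin_programmed, MP_MAX_IOAPIC_PIN + 1);
	} ioapics[MAX_IO_APICS];

	static inline int mpc_ioapic_id(int ioapic_idx)
	{
		return ioapics[ioapic_idx].mp_config.apicid;
	}

	static inline int mpc_ioapic_ver(int ioapic_idx)
	{
		return ioapics[ioapic_idx].mp_config.apicver;
	}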
@@ -1962,7 +1809,7 @@ void __init enable_IO_APIC(void)
1962 for(apic = 0; apic < nr_ioapics; apic++) { 1809 for(apic = 0; apic < nr_ioapics; apic++) {
1963 int pin; 1810 int pin;
1964 /* See if any of the pins is in ExtINT mode */ 1811 /* See if any of the pins is in ExtINT mode */
1965 for (pin = 0; pin < nr_ioapic_registers[apic]; pin++) { 1812 for (pin = 0; pin < ioapics[apic].nr_registers; pin++) {
1966 struct IO_APIC_route_entry entry; 1813 struct IO_APIC_route_entry entry;
1967 entry = ioapic_read_entry(apic, pin); 1814 entry = ioapic_read_entry(apic, pin);
1968 1815
@@ -2023,7 +1870,7 @@ void disable_IO_APIC(void)
2023 * 1870 *
2024 * With interrupt-remapping, for now we will use virtual wire A mode, 1871 * With interrupt-remapping, for now we will use virtual wire A mode,
2025 * as virtual wire B is little complex (need to configure both 1872 * as virtual wire B is little complex (need to configure both
2026 * IOAPIC RTE aswell as interrupt-remapping table entry). 1873 * IOAPIC RTE as well as interrupt-remapping table entry).
2027 * As this gets called during crash dump, keep this simple for now. 1874 * As this gets called during crash dump, keep this simple for now.
2028 */ 1875 */
2029 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) { 1876 if (ioapic_i8259.pin != -1 && !intr_remapping_enabled) {
@@ -2061,8 +1908,7 @@ void disable_IO_APIC(void)
2061 * 1908 *
2062 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999 1909 * by Matt Domsch <Matt_Domsch@dell.com> Tue Dec 21 12:25:05 CST 1999
2063 */ 1910 */
2064 1911void __init setup_ioapic_ids_from_mpc_nocheck(void)
2065void __init setup_ioapic_ids_from_mpc(void)
2066{ 1912{
2067 union IO_APIC_reg_00 reg_00; 1913 union IO_APIC_reg_00 reg_00;
2068 physid_mask_t phys_id_present_map; 1914 physid_mask_t phys_id_present_map;
@@ -2071,15 +1917,6 @@ void __init setup_ioapic_ids_from_mpc(void)
2071 unsigned char old_id; 1917 unsigned char old_id;
2072 unsigned long flags; 1918 unsigned long flags;
2073 1919
2074 if (acpi_ioapic)
2075 return;
2076 /*
2077 * Don't check I/O APIC IDs for xAPIC systems. They have
2078 * no meaning without the serial APIC bus.
2079 */
2080 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2081 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2082 return;
2083 /* 1920 /*
2084 * This is broken; anything with a real cpu count has to 1921 * This is broken; anything with a real cpu count has to
2085 * circumvent this idiocy regardless. 1922 * circumvent this idiocy regardless.
@@ -2096,14 +1933,14 @@ void __init setup_ioapic_ids_from_mpc(void)
2096 reg_00.raw = io_apic_read(apic_id, 0); 1933 reg_00.raw = io_apic_read(apic_id, 0);
2097 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1934 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2098 1935
2099 old_id = mp_ioapics[apic_id].apicid; 1936 old_id = mpc_ioapic_id(apic_id);
2100 1937
2101 if (mp_ioapics[apic_id].apicid >= get_physical_broadcast()) { 1938 if (mpc_ioapic_id(apic_id) >= get_physical_broadcast()) {
2102 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n", 1939 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID is %d in the MPC table!...\n",
2103 apic_id, mp_ioapics[apic_id].apicid); 1940 apic_id, mpc_ioapic_id(apic_id));
2104 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1941 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2105 reg_00.bits.ID); 1942 reg_00.bits.ID);
2106 mp_ioapics[apic_id].apicid = reg_00.bits.ID; 1943 ioapics[apic_id].mp_config.apicid = reg_00.bits.ID;
2107 } 1944 }
2108 1945
2109 /* 1946 /*
@@ -2112,9 +1949,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2112 * 'stuck on smp_invalidate_needed IPI wait' messages. 1949 * 'stuck on smp_invalidate_needed IPI wait' messages.
2113 */ 1950 */
2114 if (apic->check_apicid_used(&phys_id_present_map, 1951 if (apic->check_apicid_used(&phys_id_present_map,
2115 mp_ioapics[apic_id].apicid)) { 1952 mpc_ioapic_id(apic_id))) {
2116 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 1953 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2117 apic_id, mp_ioapics[apic_id].apicid); 1954 apic_id, mpc_ioapic_id(apic_id));
2118 for (i = 0; i < get_physical_broadcast(); i++) 1955 for (i = 0; i < get_physical_broadcast(); i++)
2119 if (!physid_isset(i, phys_id_present_map)) 1956 if (!physid_isset(i, phys_id_present_map))
2120 break; 1957 break;
@@ -2123,36 +1960,39 @@ void __init setup_ioapic_ids_from_mpc(void)
2123 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n", 1960 printk(KERN_ERR "... fixing up to %d. (tell your hw vendor)\n",
2124 i); 1961 i);
2125 physid_set(i, phys_id_present_map); 1962 physid_set(i, phys_id_present_map);
2126 mp_ioapics[apic_id].apicid = i; 1963 ioapics[apic_id].mp_config.apicid = i;
2127 } else { 1964 } else {
2128 physid_mask_t tmp; 1965 physid_mask_t tmp;
2129 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp); 1966 apic->apicid_to_cpu_present(mpc_ioapic_id(apic_id),
1967 &tmp);
2130 apic_printk(APIC_VERBOSE, "Setting %d in the " 1968 apic_printk(APIC_VERBOSE, "Setting %d in the "
2131 "phys_id_present_map\n", 1969 "phys_id_present_map\n",
2132 mp_ioapics[apic_id].apicid); 1970 mpc_ioapic_id(apic_id));
2133 physids_or(phys_id_present_map, phys_id_present_map, tmp); 1971 physids_or(phys_id_present_map, phys_id_present_map, tmp);
2134 } 1972 }
2135 1973
2136
2137 /* 1974 /*
2138 * We need to adjust the IRQ routing table 1975 * We need to adjust the IRQ routing table
2139 * if the ID changed. 1976 * if the ID changed.
2140 */ 1977 */
2141 if (old_id != mp_ioapics[apic_id].apicid) 1978 if (old_id != mpc_ioapic_id(apic_id))
2142 for (i = 0; i < mp_irq_entries; i++) 1979 for (i = 0; i < mp_irq_entries; i++)
2143 if (mp_irqs[i].dstapic == old_id) 1980 if (mp_irqs[i].dstapic == old_id)
2144 mp_irqs[i].dstapic 1981 mp_irqs[i].dstapic
2145 = mp_ioapics[apic_id].apicid; 1982 = mpc_ioapic_id(apic_id);
2146 1983
2147 /* 1984 /*
2148 * Read the right value from the MPC table and 1985 * Update the ID register according to the right value
2149 * write it into the ID register. 1986 * from the MPC table if they are different.
2150 */ 1987 */
1988 if (mpc_ioapic_id(apic_id) == reg_00.bits.ID)
1989 continue;
1990
2151 apic_printk(APIC_VERBOSE, KERN_INFO 1991 apic_printk(APIC_VERBOSE, KERN_INFO
2152 "...changing IO-APIC physical APIC ID to %d ...", 1992 "...changing IO-APIC physical APIC ID to %d ...",
2153 mp_ioapics[apic_id].apicid); 1993 mpc_ioapic_id(apic_id));
2154 1994
2155 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 1995 reg_00.bits.ID = mpc_ioapic_id(apic_id);
2156 raw_spin_lock_irqsave(&ioapic_lock, flags); 1996 raw_spin_lock_irqsave(&ioapic_lock, flags);
2157 io_apic_write(apic_id, 0, reg_00.raw); 1997 io_apic_write(apic_id, 0, reg_00.raw);
2158 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 1998 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
@@ -2163,12 +2003,27 @@ void __init setup_ioapic_ids_from_mpc(void)
2163 raw_spin_lock_irqsave(&ioapic_lock, flags); 2003 raw_spin_lock_irqsave(&ioapic_lock, flags);
2164 reg_00.raw = io_apic_read(apic_id, 0); 2004 reg_00.raw = io_apic_read(apic_id, 0);
2165 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2005 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2166 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2006 if (reg_00.bits.ID != mpc_ioapic_id(apic_id))
2167 printk("could not set ID!\n"); 2007 printk("could not set ID!\n");
2168 else 2008 else
2169 apic_printk(APIC_VERBOSE, " ok.\n"); 2009 apic_printk(APIC_VERBOSE, " ok.\n");
2170 } 2010 }
2171} 2011}
2012
2013void __init setup_ioapic_ids_from_mpc(void)
2014{
2015
2016 if (acpi_ioapic)
2017 return;
2018 /*
2019 * Don't check I/O APIC IDs for xAPIC systems. They have
2020 * no meaning without the serial APIC bus.
2021 */
2022 if (!(boot_cpu_data.x86_vendor == X86_VENDOR_INTEL)
2023 || APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
2024 return;
2025 setup_ioapic_ids_from_mpc_nocheck();
2026}
2172#endif 2027#endif
2173 2028
2174int no_timer_check __initdata; 2029int no_timer_check __initdata;
@@ -2239,29 +2094,26 @@ static int __init timer_irq_works(void)
2239 * an edge even if it isn't on the 8259A... 2094 * an edge even if it isn't on the 8259A...
2240 */ 2095 */
2241 2096
2242static unsigned int startup_ioapic_irq(unsigned int irq) 2097static unsigned int startup_ioapic_irq(struct irq_data *data)
2243{ 2098{
2244 int was_pending = 0; 2099 int was_pending = 0, irq = data->irq;
2245 unsigned long flags; 2100 unsigned long flags;
2246 struct irq_cfg *cfg;
2247 2101
2248 raw_spin_lock_irqsave(&ioapic_lock, flags); 2102 raw_spin_lock_irqsave(&ioapic_lock, flags);
2249 if (irq < legacy_pic->nr_legacy_irqs) { 2103 if (irq < legacy_pic->nr_legacy_irqs) {
2250 legacy_pic->chip->mask(irq); 2104 legacy_pic->mask(irq);
2251 if (legacy_pic->irq_pending(irq)) 2105 if (legacy_pic->irq_pending(irq))
2252 was_pending = 1; 2106 was_pending = 1;
2253 } 2107 }
2254 cfg = irq_cfg(irq); 2108 __unmask_ioapic(data->chip_data);
2255 __unmask_IO_APIC_irq(cfg);
2256 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2109 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2257 2110
2258 return was_pending; 2111 return was_pending;
2259} 2112}
2260 2113
2261static int ioapic_retrigger_irq(unsigned int irq) 2114static int ioapic_retrigger_irq(struct irq_data *data)
2262{ 2115{
2263 2116 struct irq_cfg *cfg = data->chip_data;
2264 struct irq_cfg *cfg = irq_cfg(irq);
2265 unsigned long flags; 2117 unsigned long flags;
2266 2118
2267 raw_spin_lock_irqsave(&vector_lock, flags); 2119 raw_spin_lock_irqsave(&vector_lock, flags);
@@ -2312,7 +2164,7 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2312 * With interrupt-remapping, destination information comes 2164 * With interrupt-remapping, destination information comes
2313 * from interrupt-remapping table entry. 2165 * from interrupt-remapping table entry.
2314 */ 2166 */
2315 if (!irq_remapped(irq)) 2167 if (!irq_remapped(cfg))
2316 io_apic_write(apic, 0x11 + pin*2, dest); 2168 io_apic_write(apic, 0x11 + pin*2, dest);
2317 reg = io_apic_read(apic, 0x10 + pin*2); 2169 reg = io_apic_read(apic, 0x10 + pin*2);
2318 reg &= ~IO_APIC_REDIR_VECTOR_MASK; 2170 reg &= ~IO_APIC_REDIR_VECTOR_MASK;
@@ -2322,65 +2174,46 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2322} 2174}
2323 2175
2324/* 2176/*
2325 * Either sets desc->affinity to a valid value, and returns 2177 * Either sets data->affinity to a valid value, and returns
2326 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and 2178 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2327 * leaves desc->affinity untouched. 2179 * leaves data->affinity untouched.
2328 */ 2180 */
2329unsigned int 2181int __ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2330set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask, 2182 unsigned int *dest_id)
2331 unsigned int *dest_id)
2332{ 2183{
2333 struct irq_cfg *cfg; 2184 struct irq_cfg *cfg = data->chip_data;
2334 unsigned int irq;
2335 2185
2336 if (!cpumask_intersects(mask, cpu_online_mask)) 2186 if (!cpumask_intersects(mask, cpu_online_mask))
2337 return -1; 2187 return -1;
2338 2188
2339 irq = desc->irq; 2189 if (assign_irq_vector(data->irq, data->chip_data, mask))
2340 cfg = desc->chip_data;
2341 if (assign_irq_vector(irq, cfg, mask))
2342 return -1; 2190 return -1;
2343 2191
2344 cpumask_copy(desc->affinity, mask); 2192 cpumask_copy(data->affinity, mask);
2345 2193
2346 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2194 *dest_id = apic->cpu_mask_to_apicid_and(mask, cfg->domain);
2347 return 0; 2195 return 0;
2348} 2196}
2349 2197
2350static int 2198static int
2351set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2199ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2200 bool force)
2352{ 2201{
2353 struct irq_cfg *cfg; 2202 unsigned int dest, irq = data->irq;
2354 unsigned long flags; 2203 unsigned long flags;
2355 unsigned int dest; 2204 int ret;
2356 unsigned int irq;
2357 int ret = -1;
2358
2359 irq = desc->irq;
2360 cfg = desc->chip_data;
2361 2205
2362 raw_spin_lock_irqsave(&ioapic_lock, flags); 2206 raw_spin_lock_irqsave(&ioapic_lock, flags);
2363 ret = set_desc_affinity(desc, mask, &dest); 2207 ret = __ioapic_set_affinity(data, mask, &dest);
2364 if (!ret) { 2208 if (!ret) {
2365 /* Only the high 8 bits are valid. */ 2209 /* Only the high 8 bits are valid. */
2366 dest = SET_APIC_LOGICAL_ID(dest); 2210 dest = SET_APIC_LOGICAL_ID(dest);
2367 __target_IO_APIC_irq(irq, dest, cfg); 2211 __target_IO_APIC_irq(irq, dest, data->chip_data);
2368 } 2212 }
2369 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2213 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2370
2371 return ret; 2214 return ret;
2372} 2215}
2373 2216
2374static int
2375set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2376{
2377 struct irq_desc *desc;
2378
2379 desc = irq_to_desc(irq);
2380
2381 return set_ioapic_affinity_irq_desc(desc, mask);
2382}
2383
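set_desc_affinity() becomes __ioapic_set_affinity() and the affinity setter now uses the irq_data-based prototype, so the extra irq_to_desc() wrapper above can be dropped. A minimal sketch of an irq_set_affinity callback in the new style used throughout the rest of this patch (the demo_* name is illustrative only):

	static int demo_set_affinity(struct irq_data *data,
				     const struct cpumask *mask, bool force)
	{
		struct irq_cfg *cfg = data->chip_data;	/* no irq_to_desc() needed */
		unsigned int dest;

		if (__ioapic_set_affinity(data, mask, &dest))
			return -1;

		/* reprogram the hardware destination from cfg->vector and dest */
		return 0;
	}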
2384#ifdef CONFIG_INTR_REMAP 2217#ifdef CONFIG_INTR_REMAP
2385 2218
2386/* 2219/*
@@ -2395,24 +2228,21 @@ set_ioapic_affinity_irq(unsigned int irq, const struct cpumask *mask)
2395 * the interrupt-remapping table entry. 2228 * the interrupt-remapping table entry.
2396 */ 2229 */
2397static int 2230static int
2398migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask) 2231ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2232 bool force)
2399{ 2233{
2400 struct irq_cfg *cfg; 2234 struct irq_cfg *cfg = data->chip_data;
2235 unsigned int dest, irq = data->irq;
2401 struct irte irte; 2236 struct irte irte;
2402 unsigned int dest;
2403 unsigned int irq;
2404 int ret = -1;
2405 2237
2406 if (!cpumask_intersects(mask, cpu_online_mask)) 2238 if (!cpumask_intersects(mask, cpu_online_mask))
2407 return ret; 2239 return -EINVAL;
2408 2240
2409 irq = desc->irq;
2410 if (get_irte(irq, &irte)) 2241 if (get_irte(irq, &irte))
2411 return ret; 2242 return -EBUSY;
2412 2243
2413 cfg = desc->chip_data;
2414 if (assign_irq_vector(irq, cfg, mask)) 2244 if (assign_irq_vector(irq, cfg, mask))
2415 return ret; 2245 return -EBUSY;
2416 2246
2417 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask); 2247 dest = apic->cpu_mask_to_apicid_and(cfg->domain, mask);
2418 2248
@@ -2427,29 +2257,14 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2427 if (cfg->move_in_progress) 2257 if (cfg->move_in_progress)
2428 send_cleanup_vector(cfg); 2258 send_cleanup_vector(cfg);
2429 2259
2430 cpumask_copy(desc->affinity, mask); 2260 cpumask_copy(data->affinity, mask);
2431
2432 return 0; 2261 return 0;
2433} 2262}
2434 2263
2435/*
2436 * Migrates the IRQ destination in the process context.
2437 */
2438static int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
2439 const struct cpumask *mask)
2440{
2441 return migrate_ioapic_irq_desc(desc, mask);
2442}
2443static int set_ir_ioapic_affinity_irq(unsigned int irq,
2444 const struct cpumask *mask)
2445{
2446 struct irq_desc *desc = irq_to_desc(irq);
2447
2448 return set_ir_ioapic_affinity_irq_desc(desc, mask);
2449}
2450#else 2264#else
2451static inline int set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc, 2265static inline int
2452 const struct cpumask *mask) 2266ir_ioapic_set_affinity(struct irq_data *data, const struct cpumask *mask,
2267 bool force)
2453{ 2268{
2454 return 0; 2269 return 0;
2455} 2270}
@@ -2469,7 +2284,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2469 unsigned int irr; 2284 unsigned int irr;
2470 struct irq_desc *desc; 2285 struct irq_desc *desc;
2471 struct irq_cfg *cfg; 2286 struct irq_cfg *cfg;
2472 irq = __get_cpu_var(vector_irq)[vector]; 2287 irq = __this_cpu_read(vector_irq[vector]);
2473 2288
2474 if (irq == -1) 2289 if (irq == -1)
2475 continue; 2290 continue;
@@ -2503,7 +2318,7 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2503 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR); 2318 apic->send_IPI_self(IRQ_MOVE_CLEANUP_VECTOR);
2504 goto unlock; 2319 goto unlock;
2505 } 2320 }
2506 __get_cpu_var(vector_irq)[vector] = -1; 2321 __this_cpu_write(vector_irq[vector], -1);
2507unlock: 2322unlock:
2508 raw_spin_unlock(&desc->lock); 2323 raw_spin_unlock(&desc->lock);
2509 } 2324 }
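__get_cpu_var(vector_irq)[vector] and the matching store become __this_cpu_read()/__this_cpu_write(), which on x86 typically compile to a single segment-prefixed access instead of first computing the per-CPU base address. A tiny illustration with a hypothetical per-CPU array (not part of this file; it mirrors how vector_irq itself is declared):

	typedef int demo_vector_irq_t[NR_VECTORS];
	static DEFINE_PER_CPU(demo_vector_irq_t, demo_vector_irq);

	static int demo_lookup(unsigned int vector)
	{
		return __this_cpu_read(demo_vector_irq[vector]);
	}

	static void demo_clear(unsigned int vector)
	{
		__this_cpu_write(demo_vector_irq[vector], -1);
	}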
@@ -2511,10 +2326,8 @@ unlock:
2511 irq_exit(); 2326 irq_exit();
2512} 2327}
2513 2328
2514static void __irq_complete_move(struct irq_desc **descp, unsigned vector) 2329static void __irq_complete_move(struct irq_cfg *cfg, unsigned vector)
2515{ 2330{
2516 struct irq_desc *desc = *descp;
2517 struct irq_cfg *cfg = desc->chip_data;
2518 unsigned me; 2331 unsigned me;
2519 2332
2520 if (likely(!cfg->move_in_progress)) 2333 if (likely(!cfg->move_in_progress))
@@ -2526,31 +2339,28 @@ static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2526 send_cleanup_vector(cfg); 2339 send_cleanup_vector(cfg);
2527} 2340}
2528 2341
2529static void irq_complete_move(struct irq_desc **descp) 2342static void irq_complete_move(struct irq_cfg *cfg)
2530{ 2343{
2531 __irq_complete_move(descp, ~get_irq_regs()->orig_ax); 2344 __irq_complete_move(cfg, ~get_irq_regs()->orig_ax);
2532} 2345}
2533 2346
2534void irq_force_complete_move(int irq) 2347void irq_force_complete_move(int irq)
2535{ 2348{
2536 struct irq_desc *desc = irq_to_desc(irq); 2349 struct irq_cfg *cfg = irq_get_chip_data(irq);
2537 struct irq_cfg *cfg = desc->chip_data;
2538 2350
2539 if (!cfg) 2351 if (!cfg)
2540 return; 2352 return;
2541 2353
2542 __irq_complete_move(&desc, cfg->vector); 2354 __irq_complete_move(cfg, cfg->vector);
2543} 2355}
2544#else 2356#else
2545static inline void irq_complete_move(struct irq_desc **descp) {} 2357static inline void irq_complete_move(struct irq_cfg *cfg) { }
2546#endif 2358#endif
2547 2359
2548static void ack_apic_edge(unsigned int irq) 2360static void ack_apic_edge(struct irq_data *data)
2549{ 2361{
2550 struct irq_desc *desc = irq_to_desc(irq); 2362 irq_complete_move(data->chip_data);
2551 2363 irq_move_irq(data);
2552 irq_complete_move(&desc);
2553 move_native_irq(irq);
2554 ack_APIC_irq(); 2364 ack_APIC_irq();
2555} 2365}
2556 2366
@@ -2572,19 +2382,21 @@ atomic_t irq_mis_count;
2572 * Otherwise, we simulate the EOI message manually by changing the trigger 2382 * Otherwise, we simulate the EOI message manually by changing the trigger
2573 * mode to edge and then back to level, with RTE being masked during this. 2383 * mode to edge and then back to level, with RTE being masked during this.
2574*/ 2384*/
2575static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg) 2385static void eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2576{ 2386{
2577 struct irq_pin_list *entry; 2387 struct irq_pin_list *entry;
2388 unsigned long flags;
2578 2389
2390 raw_spin_lock_irqsave(&ioapic_lock, flags);
2579 for_each_irq_pin(entry, cfg->irq_2_pin) { 2391 for_each_irq_pin(entry, cfg->irq_2_pin) {
2580 if (mp_ioapics[entry->apic].apicver >= 0x20) { 2392 if (mpc_ioapic_ver(entry->apic) >= 0x20) {
2581 /* 2393 /*
2582 * Intr-remapping uses pin number as the virtual vector 2394 * Intr-remapping uses pin number as the virtual vector
2583 * in the RTE. Actual vector is programmed in 2395 * in the RTE. Actual vector is programmed in
2584 * intr-remapping table entry. Hence for the io-apic 2396 * intr-remapping table entry. Hence for the io-apic
2585 * EOI we use the pin number. 2397 * EOI we use the pin number.
2586 */ 2398 */
2587 if (irq_remapped(irq)) 2399 if (irq_remapped(cfg))
2588 io_apic_eoi(entry->apic, entry->pin); 2400 io_apic_eoi(entry->apic, entry->pin);
2589 else 2401 else
2590 io_apic_eoi(entry->apic, cfg->vector); 2402 io_apic_eoi(entry->apic, cfg->vector);
@@ -2593,36 +2405,21 @@ static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2593 __unmask_and_level_IO_APIC_irq(entry); 2405 __unmask_and_level_IO_APIC_irq(entry);
2594 } 2406 }
2595 } 2407 }
2596}
2597
2598static void eoi_ioapic_irq(struct irq_desc *desc)
2599{
2600 struct irq_cfg *cfg;
2601 unsigned long flags;
2602 unsigned int irq;
2603
2604 irq = desc->irq;
2605 cfg = desc->chip_data;
2606
2607 raw_spin_lock_irqsave(&ioapic_lock, flags);
2608 __eoi_ioapic_irq(irq, cfg);
2609 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2408 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2610} 2409}
2611 2410
2612static void ack_apic_level(unsigned int irq) 2411static void ack_apic_level(struct irq_data *data)
2613{ 2412{
2614 struct irq_desc *desc = irq_to_desc(irq); 2413 struct irq_cfg *cfg = data->chip_data;
2414 int i, do_unmask_irq = 0, irq = data->irq;
2615 unsigned long v; 2415 unsigned long v;
2616 int i;
2617 struct irq_cfg *cfg;
2618 int do_unmask_irq = 0;
2619 2416
2620 irq_complete_move(&desc); 2417 irq_complete_move(cfg);
2621#ifdef CONFIG_GENERIC_PENDING_IRQ 2418#ifdef CONFIG_GENERIC_PENDING_IRQ
2622 /* If we are moving the irq we need to mask it */ 2419 /* If we are moving the irq we need to mask it */
2623 if (unlikely(desc->status & IRQ_MOVE_PENDING)) { 2420 if (unlikely(irqd_is_setaffinity_pending(data))) {
2624 do_unmask_irq = 1; 2421 do_unmask_irq = 1;
2625 mask_IO_APIC_irq_desc(desc); 2422 mask_ioapic(cfg);
2626 } 2423 }
2627#endif 2424#endif
2628 2425
@@ -2658,7 +2455,6 @@ static void ack_apic_level(unsigned int irq)
2658 * we use the above logic (mask+edge followed by unmask+level) from 2455 * we use the above logic (mask+edge followed by unmask+level) from
2659 * Manfred Spraul to clear the remote IRR. 2456 * Manfred Spraul to clear the remote IRR.
2660 */ 2457 */
2661 cfg = desc->chip_data;
2662 i = cfg->vector; 2458 i = cfg->vector;
2663 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1)); 2459 v = apic_read(APIC_TMR + ((i & ~0x1f) >> 1));
2664 2460
@@ -2678,7 +2474,7 @@ static void ack_apic_level(unsigned int irq)
2678 if (!(v & (1 << (i & 0x1f)))) { 2474 if (!(v & (1 << (i & 0x1f)))) {
2679 atomic_inc(&irq_mis_count); 2475 atomic_inc(&irq_mis_count);
2680 2476
2681 eoi_ioapic_irq(desc); 2477 eoi_ioapic_irq(irq, cfg);
2682 } 2478 }
2683 2479
2684 /* Now we can move and renable the irq */ 2480 /* Now we can move and renable the irq */
@@ -2709,61 +2505,57 @@ static void ack_apic_level(unsigned int irq)
2709 * accurate and is causing problems then it is a hardware bug 2505 * accurate and is causing problems then it is a hardware bug
2710 * and you can go talk to the chipset vendor about it. 2506 * and you can go talk to the chipset vendor about it.
2711 */ 2507 */
2712 cfg = desc->chip_data;
2713 if (!io_apic_level_ack_pending(cfg)) 2508 if (!io_apic_level_ack_pending(cfg))
2714 move_masked_irq(irq); 2509 irq_move_masked_irq(data);
2715 unmask_IO_APIC_irq_desc(desc); 2510 unmask_ioapic(cfg);
2716 } 2511 }
2717} 2512}
2718 2513
2719#ifdef CONFIG_INTR_REMAP 2514#ifdef CONFIG_INTR_REMAP
2720static void ir_ack_apic_edge(unsigned int irq) 2515static void ir_ack_apic_edge(struct irq_data *data)
2721{ 2516{
2722 ack_APIC_irq(); 2517 ack_APIC_irq();
2723} 2518}
2724 2519
2725static void ir_ack_apic_level(unsigned int irq) 2520static void ir_ack_apic_level(struct irq_data *data)
2726{ 2521{
2727 struct irq_desc *desc = irq_to_desc(irq);
2728
2729 ack_APIC_irq(); 2522 ack_APIC_irq();
2730 eoi_ioapic_irq(desc); 2523 eoi_ioapic_irq(data->irq, data->chip_data);
2731} 2524}
2732#endif /* CONFIG_INTR_REMAP */ 2525#endif /* CONFIG_INTR_REMAP */
2733 2526
2734static struct irq_chip ioapic_chip __read_mostly = { 2527static struct irq_chip ioapic_chip __read_mostly = {
2735 .name = "IO-APIC", 2528 .name = "IO-APIC",
2736 .startup = startup_ioapic_irq, 2529 .irq_startup = startup_ioapic_irq,
2737 .mask = mask_IO_APIC_irq, 2530 .irq_mask = mask_ioapic_irq,
2738 .unmask = unmask_IO_APIC_irq, 2531 .irq_unmask = unmask_ioapic_irq,
2739 .ack = ack_apic_edge, 2532 .irq_ack = ack_apic_edge,
2740 .eoi = ack_apic_level, 2533 .irq_eoi = ack_apic_level,
2741#ifdef CONFIG_SMP 2534#ifdef CONFIG_SMP
2742 .set_affinity = set_ioapic_affinity_irq, 2535 .irq_set_affinity = ioapic_set_affinity,
2743#endif 2536#endif
2744 .retrigger = ioapic_retrigger_irq, 2537 .irq_retrigger = ioapic_retrigger_irq,
2745}; 2538};
2746 2539
2747static struct irq_chip ir_ioapic_chip __read_mostly = { 2540static struct irq_chip ir_ioapic_chip __read_mostly = {
2748 .name = "IR-IO-APIC", 2541 .name = "IR-IO-APIC",
2749 .startup = startup_ioapic_irq, 2542 .irq_startup = startup_ioapic_irq,
2750 .mask = mask_IO_APIC_irq, 2543 .irq_mask = mask_ioapic_irq,
2751 .unmask = unmask_IO_APIC_irq, 2544 .irq_unmask = unmask_ioapic_irq,
2752#ifdef CONFIG_INTR_REMAP 2545#ifdef CONFIG_INTR_REMAP
2753 .ack = ir_ack_apic_edge, 2546 .irq_ack = ir_ack_apic_edge,
2754 .eoi = ir_ack_apic_level, 2547 .irq_eoi = ir_ack_apic_level,
2755#ifdef CONFIG_SMP 2548#ifdef CONFIG_SMP
2756 .set_affinity = set_ir_ioapic_affinity_irq, 2549 .irq_set_affinity = ir_ioapic_set_affinity,
2757#endif 2550#endif
2758#endif 2551#endif
2759 .retrigger = ioapic_retrigger_irq, 2552 .irq_retrigger = ioapic_retrigger_irq,
2760}; 2553};
2761 2554
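Both chips now populate only the irq_data-based methods (.irq_mask, .irq_ack, ...), matching the callbacks converted above, and the per-IRQ irq_cfg travels as data->chip_data rather than desc->chip_data. A short sketch of the wiring a setup path is expected to do for such a chip (demo name only; level-triggered pins would use the fasteoi flow instead of "edge"):

	static void demo_register(unsigned int irq, struct irq_cfg *cfg)
	{
		irq_set_chip_data(irq, cfg);	/* what the callbacks see as data->chip_data */
		irq_set_chip_and_handler_name(irq, &ioapic_chip,
					      handle_edge_irq, "edge");
	}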
2762static inline void init_IO_APIC_traps(void) 2555static inline void init_IO_APIC_traps(void)
2763{ 2556{
2764 int irq;
2765 struct irq_desc *desc;
2766 struct irq_cfg *cfg; 2557 struct irq_cfg *cfg;
2558 unsigned int irq;
2767 2559
2768 /* 2560 /*
2769 * NOTE! The local APIC isn't very good at handling 2561 * NOTE! The local APIC isn't very good at handling
@@ -2776,8 +2568,8 @@ static inline void init_IO_APIC_traps(void)
2776 * Also, we've got to be careful not to trash gate 2568 * Also, we've got to be careful not to trash gate
2777 * 0x80, because int 0x80 is hm, kind of importantish. ;) 2569 * 0x80, because int 0x80 is hm, kind of importantish. ;)
2778 */ 2570 */
2779 for_each_irq_desc(irq, desc) { 2571 for_each_active_irq(irq) {
2780 cfg = desc->chip_data; 2572 cfg = irq_get_chip_data(irq);
2781 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) { 2573 if (IO_APIC_IRQ(irq) && cfg && !cfg->vector) {
2782 /* 2574 /*
2783 * Hmm.. We don't have an entry for this, 2575 * Hmm.. We don't have an entry for this,
@@ -2788,7 +2580,7 @@ static inline void init_IO_APIC_traps(void)
2788 legacy_pic->make_irq(irq); 2580 legacy_pic->make_irq(irq);
2789 else 2581 else
2790 /* Strange. Oh, well.. */ 2582 /* Strange. Oh, well.. */
2791 desc->chip = &no_irq_chip; 2583 irq_set_chip(irq, &no_irq_chip);
2792 } 2584 }
2793 } 2585 }
2794} 2586}
@@ -2797,7 +2589,7 @@ static inline void init_IO_APIC_traps(void)
2797 * The local APIC irq-chip implementation: 2589 * The local APIC irq-chip implementation:
2798 */ 2590 */
2799 2591
2800static void mask_lapic_irq(unsigned int irq) 2592static void mask_lapic_irq(struct irq_data *data)
2801{ 2593{
2802 unsigned long v; 2594 unsigned long v;
2803 2595
@@ -2805,7 +2597,7 @@ static void mask_lapic_irq(unsigned int irq)
2805 apic_write(APIC_LVT0, v | APIC_LVT_MASKED); 2597 apic_write(APIC_LVT0, v | APIC_LVT_MASKED);
2806} 2598}
2807 2599
2808static void unmask_lapic_irq(unsigned int irq) 2600static void unmask_lapic_irq(struct irq_data *data)
2809{ 2601{
2810 unsigned long v; 2602 unsigned long v;
2811 2603
@@ -2813,43 +2605,25 @@ static void unmask_lapic_irq(unsigned int irq)
2813 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED); 2605 apic_write(APIC_LVT0, v & ~APIC_LVT_MASKED);
2814} 2606}
2815 2607
2816static void ack_lapic_irq(unsigned int irq) 2608static void ack_lapic_irq(struct irq_data *data)
2817{ 2609{
2818 ack_APIC_irq(); 2610 ack_APIC_irq();
2819} 2611}
2820 2612
2821static struct irq_chip lapic_chip __read_mostly = { 2613static struct irq_chip lapic_chip __read_mostly = {
2822 .name = "local-APIC", 2614 .name = "local-APIC",
2823 .mask = mask_lapic_irq, 2615 .irq_mask = mask_lapic_irq,
2824 .unmask = unmask_lapic_irq, 2616 .irq_unmask = unmask_lapic_irq,
2825 .ack = ack_lapic_irq, 2617 .irq_ack = ack_lapic_irq,
2826}; 2618};
2827 2619
2828static void lapic_register_intr(int irq, struct irq_desc *desc) 2620static void lapic_register_intr(int irq)
2829{ 2621{
2830 desc->status &= ~IRQ_LEVEL; 2622 irq_clear_status_flags(irq, IRQ_LEVEL);
2831 set_irq_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq, 2623 irq_set_chip_and_handler_name(irq, &lapic_chip, handle_edge_irq,
2832 "edge"); 2624 "edge");
2833} 2625}
2834 2626
2835static void __init setup_nmi(void)
2836{
2837 /*
2838 * Dirty trick to enable the NMI watchdog ...
2839 * We put the 8259A master into AEOI mode and
2840 * unmask on all local APICs LVT0 as NMI.
2841 *
2842 * The idea to use the 8259A in AEOI mode ('8259A Virtual Wire')
2843 * is from Maciej W. Rozycki - so we do not have to EOI from
2844 * the NMI handler or the timer interrupt.
2845 */
2846 apic_printk(APIC_VERBOSE, KERN_INFO "activating NMI Watchdog ...");
2847
2848 enable_NMI_through_LVT0();
2849
2850 apic_printk(APIC_VERBOSE, " done.\n");
2851}
2852
2853/* 2627/*
2854 * This looks a bit hackish but it's about the only one way of sending 2628 * This looks a bit hackish but it's about the only one way of sending
2855 * a few INTA cycles to 8259As and any associated glue logic. ICR does 2629 * a few INTA cycles to 8259As and any associated glue logic. ICR does
@@ -2930,9 +2704,8 @@ int timer_through_8259 __initdata;
2930 */ 2704 */
2931static inline void __init check_timer(void) 2705static inline void __init check_timer(void)
2932{ 2706{
2933 struct irq_desc *desc = irq_to_desc(0); 2707 struct irq_cfg *cfg = irq_get_chip_data(0);
2934 struct irq_cfg *cfg = desc->chip_data; 2708 int node = cpu_to_node(0);
2935 int node = cpu_to_node(boot_cpu_id);
2936 int apic1, pin1, apic2, pin2; 2709 int apic1, pin1, apic2, pin2;
2937 unsigned long flags; 2710 unsigned long flags;
2938 int no_pin1 = 0; 2711 int no_pin1 = 0;
@@ -2942,7 +2715,7 @@ static inline void __init check_timer(void)
2942 /* 2715 /*
2943 * get/set the timer IRQ vector: 2716 * get/set the timer IRQ vector:
2944 */ 2717 */
2945 legacy_pic->chip->mask(0); 2718 legacy_pic->mask(0);
2946 assign_irq_vector(0, cfg, apic->target_cpus()); 2719 assign_irq_vector(0, cfg, apic->target_cpus());
2947 2720
2948 /* 2721 /*
@@ -2956,15 +2729,6 @@ static inline void __init check_timer(void)
2956 */ 2729 */
2957 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2730 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2958 legacy_pic->init(1); 2731 legacy_pic->init(1);
2959#ifdef CONFIG_X86_32
2960 {
2961 unsigned int ver;
2962
2963 ver = apic_read(APIC_LVR);
2964 ver = GET_APIC_VERSION(ver);
2965 timer_ack = (nmi_watchdog == NMI_IO_APIC && !APIC_INTEGRATED(ver));
2966 }
2967#endif
2968 2732
2969 pin1 = find_isa_irq_pin(0, mp_INT); 2733 pin1 = find_isa_irq_pin(0, mp_INT);
2970 apic1 = find_isa_irq_apic(0, mp_INT); 2734 apic1 = find_isa_irq_apic(0, mp_INT);
@@ -3001,7 +2765,7 @@ static inline void __init check_timer(void)
3001 add_pin_to_irq_node(cfg, node, apic1, pin1); 2765 add_pin_to_irq_node(cfg, node, apic1, pin1);
3002 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector); 2766 setup_timer_IRQ0_pin(apic1, pin1, cfg->vector);
3003 } else { 2767 } else {
3004 /* for edge trigger, setup_IO_APIC_irq already 2768 /* for edge trigger, setup_ioapic_irq already
3005 * leave it unmasked. 2769 * leave it unmasked.
3006 * so only need to unmask if it is level-trigger 2770 * so only need to unmask if it is level-trigger
3007 * do we really have level trigger timer? 2771 * do we really have level trigger timer?
@@ -3009,13 +2773,9 @@ static inline void __init check_timer(void)
3009 int idx; 2773 int idx;
3010 idx = find_irq_entry(apic1, pin1, mp_INT); 2774 idx = find_irq_entry(apic1, pin1, mp_INT);
3011 if (idx != -1 && irq_trigger(idx)) 2775 if (idx != -1 && irq_trigger(idx))
3012 unmask_IO_APIC_irq_desc(desc); 2776 unmask_ioapic(cfg);
3013 } 2777 }
3014 if (timer_irq_works()) { 2778 if (timer_irq_works()) {
3015 if (nmi_watchdog == NMI_IO_APIC) {
3016 setup_nmi();
3017 legacy_pic->chip->unmask(0);
3018 }
3019 if (disable_timer_pin_1 > 0) 2779 if (disable_timer_pin_1 > 0)
3020 clear_IO_APIC_pin(0, pin1); 2780 clear_IO_APIC_pin(0, pin1);
3021 goto out; 2781 goto out;
@@ -3037,48 +2797,34 @@ static inline void __init check_timer(void)
3037 */ 2797 */
3038 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 2798 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
3039 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 2799 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
3040 legacy_pic->chip->unmask(0); 2800 legacy_pic->unmask(0);
3041 if (timer_irq_works()) { 2801 if (timer_irq_works()) {
3042 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 2802 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
3043 timer_through_8259 = 1; 2803 timer_through_8259 = 1;
3044 if (nmi_watchdog == NMI_IO_APIC) {
3045 legacy_pic->chip->mask(0);
3046 setup_nmi();
3047 legacy_pic->chip->unmask(0);
3048 }
3049 goto out; 2804 goto out;
3050 } 2805 }
3051 /* 2806 /*
3052 * Cleanup, just in case ... 2807 * Cleanup, just in case ...
3053 */ 2808 */
3054 local_irq_disable(); 2809 local_irq_disable();
3055 legacy_pic->chip->mask(0); 2810 legacy_pic->mask(0);
3056 clear_IO_APIC_pin(apic2, pin2); 2811 clear_IO_APIC_pin(apic2, pin2);
3057 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 2812 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
3058 } 2813 }
3059 2814
3060 if (nmi_watchdog == NMI_IO_APIC) {
3061 apic_printk(APIC_QUIET, KERN_WARNING "timer doesn't work "
3062 "through the IO-APIC - disabling NMI Watchdog!\n");
3063 nmi_watchdog = NMI_NONE;
3064 }
3065#ifdef CONFIG_X86_32
3066 timer_ack = 0;
3067#endif
3068
3069 apic_printk(APIC_QUIET, KERN_INFO 2815 apic_printk(APIC_QUIET, KERN_INFO
3070 "...trying to set up timer as Virtual Wire IRQ...\n"); 2816 "...trying to set up timer as Virtual Wire IRQ...\n");
3071 2817
3072 lapic_register_intr(0, desc); 2818 lapic_register_intr(0);
3073 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 2819 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
3074 legacy_pic->chip->unmask(0); 2820 legacy_pic->unmask(0);
3075 2821
3076 if (timer_irq_works()) { 2822 if (timer_irq_works()) {
3077 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 2823 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
3078 goto out; 2824 goto out;
3079 } 2825 }
3080 local_irq_disable(); 2826 local_irq_disable();
3081 legacy_pic->chip->mask(0); 2827 legacy_pic->mask(0);
3082 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 2828 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
3083 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 2829 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
3084 2830
@@ -3144,7 +2890,7 @@ void __init setup_IO_APIC(void)
3144} 2890}
3145 2891
3146/* 2892/*
3147 * Called after all the initialization is done. If we didnt find any 2893 * Called after all the initialization is done. If we didn't find any
3148 * APIC bugs then we can allow the modify fast path 2894 * APIC bugs then we can allow the modify fast path
3149 */ 2895 */
3150 2896
@@ -3157,136 +2903,84 @@ static int __init io_apic_bug_finalize(void)
3157 2903
3158late_initcall(io_apic_bug_finalize); 2904late_initcall(io_apic_bug_finalize);
3159 2905
3160struct sysfs_ioapic_data { 2906static void resume_ioapic_id(int ioapic_id)
3161 struct sys_device dev;
3162 struct IO_APIC_route_entry entry[0];
3163};
3164static struct sysfs_ioapic_data * mp_ioapic_data[MAX_IO_APICS];
3165
3166static int ioapic_suspend(struct sys_device *dev, pm_message_t state)
3167{ 2907{
3168 struct IO_APIC_route_entry *entry;
3169 struct sysfs_ioapic_data *data;
3170 int i;
3171
3172 data = container_of(dev, struct sysfs_ioapic_data, dev);
3173 entry = data->entry;
3174 for (i = 0; i < nr_ioapic_registers[dev->id]; i ++, entry ++ )
3175 *entry = ioapic_read_entry(dev->id, i);
3176
3177 return 0;
3178}
3179
3180static int ioapic_resume(struct sys_device *dev)
3181{
3182 struct IO_APIC_route_entry *entry;
3183 struct sysfs_ioapic_data *data;
3184 unsigned long flags; 2908 unsigned long flags;
3185 union IO_APIC_reg_00 reg_00; 2909 union IO_APIC_reg_00 reg_00;
3186 int i;
3187 2910
3188 data = container_of(dev, struct sysfs_ioapic_data, dev);
3189 entry = data->entry;
3190 2911
3191 raw_spin_lock_irqsave(&ioapic_lock, flags); 2912 raw_spin_lock_irqsave(&ioapic_lock, flags);
3192 reg_00.raw = io_apic_read(dev->id, 0); 2913 reg_00.raw = io_apic_read(ioapic_id, 0);
3193 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 2914 if (reg_00.bits.ID != mpc_ioapic_id(ioapic_id)) {
3194 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 2915 reg_00.bits.ID = mpc_ioapic_id(ioapic_id);
3195 io_apic_write(dev->id, 0, reg_00.raw); 2916 io_apic_write(ioapic_id, 0, reg_00.raw);
3196 } 2917 }
3197 raw_spin_unlock_irqrestore(&ioapic_lock, flags); 2918 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3198 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 2919}
3199 ioapic_write_entry(dev->id, i, entry[i]);
3200 2920
3201 return 0; 2921static void ioapic_resume(void)
2922{
2923 int ioapic_id;
2924
2925 for (ioapic_id = nr_ioapics - 1; ioapic_id >= 0; ioapic_id--)
2926 resume_ioapic_id(ioapic_id);
2927
2928 restore_ioapic_entries();
3202} 2929}
3203 2930
3204static struct sysdev_class ioapic_sysdev_class = { 2931static struct syscore_ops ioapic_syscore_ops = {
3205 .name = "ioapic", 2932 .suspend = save_ioapic_entries,
3206 .suspend = ioapic_suspend,
3207 .resume = ioapic_resume, 2933 .resume = ioapic_resume,
3208}; 2934};
3209 2935
3210static int __init ioapic_init_sysfs(void) 2936static int __init ioapic_init_ops(void)
3211{ 2937{
3212 struct sys_device * dev; 2938 register_syscore_ops(&ioapic_syscore_ops);
3213 int i, size, error;
3214
3215 error = sysdev_class_register(&ioapic_sysdev_class);
3216 if (error)
3217 return error;
3218
3219 for (i = 0; i < nr_ioapics; i++ ) {
3220 size = sizeof(struct sys_device) + nr_ioapic_registers[i]
3221 * sizeof(struct IO_APIC_route_entry);
3222 mp_ioapic_data[i] = kzalloc(size, GFP_KERNEL);
3223 if (!mp_ioapic_data[i]) {
3224 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3225 continue;
3226 }
3227 dev = &mp_ioapic_data[i]->dev;
3228 dev->id = i;
3229 dev->cls = &ioapic_sysdev_class;
3230 error = sysdev_register(dev);
3231 if (error) {
3232 kfree(mp_ioapic_data[i]);
3233 mp_ioapic_data[i] = NULL;
3234 printk(KERN_ERR "Can't suspend/resume IOAPIC %d\n", i);
3235 continue;
3236 }
3237 }
3238 2939
3239 return 0; 2940 return 0;
3240} 2941}
3241 2942
3242device_initcall(ioapic_init_sysfs); 2943device_initcall(ioapic_init_ops);
3243 2944
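The sysdev-based suspend/resume code is replaced by syscore_ops, which run late in the suspend path with interrupts disabled and need no per-device sysfs objects. A self-contained sketch of the same registration pattern with placeholder callbacks (names are illustrative, not from this patch):

	#include <linux/syscore_ops.h>

	static int demo_suspend(void)
	{
		/* save hardware state; called on one CPU with IRQs disabled */
		return 0;
	}

	static void demo_resume(void)
	{
		/* restore the state saved in demo_suspend() */
	}

	static struct syscore_ops demo_syscore_ops = {
		.suspend = demo_suspend,
		.resume  = demo_resume,
	};

	static int __init demo_init(void)
	{
		register_syscore_ops(&demo_syscore_ops);
		return 0;
	}
	device_initcall(demo_init);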
3244/* 2945/*
3245 * Dynamic irq allocate and deallocation 2946 * Dynamic irq allocate and deallocation
3246 */ 2947 */
3247unsigned int create_irq_nr(unsigned int irq_want, int node) 2948unsigned int create_irq_nr(unsigned int from, int node)
3248{ 2949{
3249 /* Allocate an unused irq */ 2950 struct irq_cfg *cfg;
3250 unsigned int irq;
3251 unsigned int new;
3252 unsigned long flags; 2951 unsigned long flags;
3253 struct irq_cfg *cfg_new = NULL; 2952 unsigned int ret = 0;
3254 struct irq_desc *desc_new = NULL; 2953 int irq;
3255
3256 irq = 0;
3257 if (irq_want < nr_irqs_gsi)
3258 irq_want = nr_irqs_gsi;
3259
3260 raw_spin_lock_irqsave(&vector_lock, flags);
3261 for (new = irq_want; new < nr_irqs; new++) {
3262 desc_new = irq_to_desc_alloc_node(new, node);
3263 if (!desc_new) {
3264 printk(KERN_INFO "can not get irq_desc for %d\n", new);
3265 continue;
3266 }
3267 cfg_new = desc_new->chip_data;
3268
3269 if (cfg_new->vector != 0)
3270 continue;
3271 2954
3272 desc_new = move_irq_desc(desc_new, node); 2955 if (from < nr_irqs_gsi)
3273 cfg_new = desc_new->chip_data; 2956 from = nr_irqs_gsi;
3274 2957
3275 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 2958 irq = alloc_irq_from(from, node);
3276 irq = new; 2959 if (irq < 0)
3277 break; 2960 return 0;
2961 cfg = alloc_irq_cfg(irq, node);
2962 if (!cfg) {
2963 free_irq_at(irq, NULL);
2964 return 0;
3278 } 2965 }
3279 raw_spin_unlock_irqrestore(&vector_lock, flags);
3280 2966
3281 if (irq > 0) 2967 raw_spin_lock_irqsave(&vector_lock, flags);
3282 dynamic_irq_init_keep_chip_data(irq); 2968 if (!__assign_irq_vector(irq, cfg, apic->target_cpus()))
2969 ret = irq;
2970 raw_spin_unlock_irqrestore(&vector_lock, flags);
3283 2971
3284 return irq; 2972 if (ret) {
2973 irq_set_chip_data(irq, cfg);
2974 irq_clear_status_flags(irq, IRQ_NOREQUEST);
2975 } else {
2976 free_irq_at(irq, cfg);
2977 }
2978 return ret;
3285} 2979}
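create_irq_nr() now leans on the generic allocator (alloc_irq_from(), alloc_irq_cfg() and free_irq_at() come from an earlier part of this series) instead of walking irq_desc slots under vector_lock. Typical use, sketched from the create_irq()/destroy_irq() pair below (failure convention assumed, check the real callers):

	static int demo_alloc_dynamic_irq(void)
	{
		int irq = create_irq();		/* any free irq above nr_irqs_gsi */

		if (irq <= 0)			/* assumed failure convention */
			return -ENOSPC;

		/* ... irq_set_chip_and_handler_name(), request_irq(), ... */

		destroy_irq(irq);		/* clears the vector, frees the descriptor */
		return 0;
	}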
3286 2980
3287int create_irq(void) 2981int create_irq(void)
3288{ 2982{
3289 int node = cpu_to_node(boot_cpu_id); 2983 int node = cpu_to_node(0);
3290 unsigned int irq_want; 2984 unsigned int irq_want;
3291 int irq; 2985 int irq;
3292 2986
@@ -3301,14 +2995,17 @@ int create_irq(void)
3301 2995
3302void destroy_irq(unsigned int irq) 2996void destroy_irq(unsigned int irq)
3303{ 2997{
2998 struct irq_cfg *cfg = irq_get_chip_data(irq);
3304 unsigned long flags; 2999 unsigned long flags;
3305 3000
3306 dynamic_irq_cleanup_keep_chip_data(irq); 3001 irq_set_status_flags(irq, IRQ_NOREQUEST|IRQ_NOPROBE);
3307 3002
3308 free_irte(irq); 3003 if (irq_remapped(cfg))
3004 free_irte(irq);
3309 raw_spin_lock_irqsave(&vector_lock, flags); 3005 raw_spin_lock_irqsave(&vector_lock, flags);
3310 __clear_irq_vector(irq, get_irq_chip_data(irq)); 3006 __clear_irq_vector(irq, cfg);
3311 raw_spin_unlock_irqrestore(&vector_lock, flags); 3007 raw_spin_unlock_irqrestore(&vector_lock, flags);
3008 free_irq_at(irq, cfg);
3312} 3009}
3313 3010
3314/* 3011/*
@@ -3332,7 +3029,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3332 3029
3333 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus()); 3030 dest = apic->cpu_mask_to_apicid_and(cfg->domain, apic->target_cpus());
3334 3031
3335 if (irq_remapped(irq)) { 3032 if (irq_remapped(cfg)) {
3336 struct irte irte; 3033 struct irte irte;
3337 int ir_index; 3034 int ir_index;
3338 u16 sub_handle; 3035 u16 sub_handle;
@@ -3340,14 +3037,7 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3340 ir_index = map_irq_to_irte_handle(irq, &sub_handle); 3037 ir_index = map_irq_to_irte_handle(irq, &sub_handle);
3341 BUG_ON(ir_index == -1); 3038 BUG_ON(ir_index == -1);
3342 3039
3343 memset (&irte, 0, sizeof(irte)); 3040 prepare_irte(&irte, cfg->vector, dest);
3344
3345 irte.present = 1;
3346 irte.dst_mode = apic->irq_dest_mode;
3347 irte.trigger_mode = 0; /* edge */
3348 irte.dlvry_mode = apic->irq_delivery_mode;
3349 irte.vector = cfg->vector;
3350 irte.dest_id = IRTE_DEST(dest);
3351 3041
3352 /* Set source-id of interrupt request */ 3042 /* Set source-id of interrupt request */
3353 if (pdev) 3043 if (pdev)
@@ -3392,26 +3082,24 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3392} 3082}
3393 3083
3394#ifdef CONFIG_SMP 3084#ifdef CONFIG_SMP
3395static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3085static int
3086msi_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3396{ 3087{
3397 struct irq_desc *desc = irq_to_desc(irq); 3088 struct irq_cfg *cfg = data->chip_data;
3398 struct irq_cfg *cfg;
3399 struct msi_msg msg; 3089 struct msi_msg msg;
3400 unsigned int dest; 3090 unsigned int dest;
3401 3091
3402 if (set_desc_affinity(desc, mask, &dest)) 3092 if (__ioapic_set_affinity(data, mask, &dest))
3403 return -1; 3093 return -1;
3404 3094
3405 cfg = desc->chip_data; 3095 __get_cached_msi_msg(data->msi_desc, &msg);
3406
3407 get_cached_msi_msg_desc(desc, &msg);
3408 3096
3409 msg.data &= ~MSI_DATA_VECTOR_MASK; 3097 msg.data &= ~MSI_DATA_VECTOR_MASK;
3410 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3098 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3411 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3099 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3412 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3100 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3413 3101
3414 write_msi_msg_desc(desc, &msg); 3102 __write_msi_msg(data->msi_desc, &msg);
3415 3103
3416 return 0; 3104 return 0;
3417} 3105}
@@ -3421,17 +3109,17 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3421 * done in the process context using interrupt-remapping hardware. 3109 * done in the process context using interrupt-remapping hardware.
3422 */ 3110 */
3423static int 3111static int
3424ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask) 3112ir_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3113 bool force)
3425{ 3114{
3426 struct irq_desc *desc = irq_to_desc(irq); 3115 struct irq_cfg *cfg = data->chip_data;
3427 struct irq_cfg *cfg = desc->chip_data; 3116 unsigned int dest, irq = data->irq;
3428 unsigned int dest;
3429 struct irte irte; 3117 struct irte irte;
3430 3118
3431 if (get_irte(irq, &irte)) 3119 if (get_irte(irq, &irte))
3432 return -1; 3120 return -1;
3433 3121
3434 if (set_desc_affinity(desc, mask, &dest)) 3122 if (__ioapic_set_affinity(data, mask, &dest))
3435 return -1; 3123 return -1;
3436 3124
3437 irte.vector = cfg->vector; 3125 irte.vector = cfg->vector;
@@ -3461,27 +3149,27 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3461 * which implement the MSI or MSI-X Capability Structure. 3149 * which implement the MSI or MSI-X Capability Structure.
3462 */ 3150 */
3463static struct irq_chip msi_chip = { 3151static struct irq_chip msi_chip = {
3464 .name = "PCI-MSI", 3152 .name = "PCI-MSI",
3465 .unmask = unmask_msi_irq, 3153 .irq_unmask = unmask_msi_irq,
3466 .mask = mask_msi_irq, 3154 .irq_mask = mask_msi_irq,
3467 .ack = ack_apic_edge, 3155 .irq_ack = ack_apic_edge,
3468#ifdef CONFIG_SMP 3156#ifdef CONFIG_SMP
3469 .set_affinity = set_msi_irq_affinity, 3157 .irq_set_affinity = msi_set_affinity,
3470#endif 3158#endif
3471 .retrigger = ioapic_retrigger_irq, 3159 .irq_retrigger = ioapic_retrigger_irq,
3472}; 3160};
3473 3161
3474static struct irq_chip msi_ir_chip = { 3162static struct irq_chip msi_ir_chip = {
3475 .name = "IR-PCI-MSI", 3163 .name = "IR-PCI-MSI",
3476 .unmask = unmask_msi_irq, 3164 .irq_unmask = unmask_msi_irq,
3477 .mask = mask_msi_irq, 3165 .irq_mask = mask_msi_irq,
3478#ifdef CONFIG_INTR_REMAP 3166#ifdef CONFIG_INTR_REMAP
3479 .ack = ir_ack_apic_edge, 3167 .irq_ack = ir_ack_apic_edge,
3480#ifdef CONFIG_SMP 3168#ifdef CONFIG_SMP
3481 .set_affinity = ir_set_msi_irq_affinity, 3169 .irq_set_affinity = ir_msi_set_affinity,
3482#endif 3170#endif
3483#endif 3171#endif
3484 .retrigger = ioapic_retrigger_irq, 3172 .irq_retrigger = ioapic_retrigger_irq,
3485}; 3173};
3486 3174
3487/* 3175/*
@@ -3513,40 +3201,35 @@ static int msi_alloc_irte(struct pci_dev *dev, int irq, int nvec)
3513 3201
3514static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq) 3202static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3515{ 3203{
3516 int ret; 3204 struct irq_chip *chip = &msi_chip;
3517 struct msi_msg msg; 3205 struct msi_msg msg;
3206 int ret;
3518 3207
3519 ret = msi_compose_msg(dev, irq, &msg, -1); 3208 ret = msi_compose_msg(dev, irq, &msg, -1);
3520 if (ret < 0) 3209 if (ret < 0)
3521 return ret; 3210 return ret;
3522 3211
3523 set_irq_msi(irq, msidesc); 3212 irq_set_msi_desc(irq, msidesc);
3524 write_msi_msg(irq, &msg); 3213 write_msi_msg(irq, &msg);
3525 3214
3526 if (irq_remapped(irq)) { 3215 if (irq_remapped(irq_get_chip_data(irq))) {
3527 struct irq_desc *desc = irq_to_desc(irq); 3216 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3528 /* 3217 chip = &msi_ir_chip;
3529 * irq migration in process context 3218 }
3530 */ 3219
3531 desc->status |= IRQ_MOVE_PCNTXT; 3220 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3532 set_irq_chip_and_handler_name(irq, &msi_ir_chip, handle_edge_irq, "edge");
3533 } else
3534 set_irq_chip_and_handler_name(irq, &msi_chip, handle_edge_irq, "edge");
3535 3221
3536 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq); 3222 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for MSI/MSI-X\n", irq);
3537 3223
3538 return 0; 3224 return 0;
3539} 3225}
3540 3226
3541int arch_setup_msi_irqs(struct pci_dev *dev, int nvec, int type) 3227int native_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
3542{ 3228{
3543 unsigned int irq; 3229 int node, ret, sub_handle, index = 0;
3544 int ret, sub_handle; 3230 unsigned int irq, irq_want;
3545 struct msi_desc *msidesc; 3231 struct msi_desc *msidesc;
3546 unsigned int irq_want;
3547 struct intel_iommu *iommu = NULL; 3232 struct intel_iommu *iommu = NULL;
3548 int index = 0;
3549 int node;
3550 3233
3551 /* x86 doesn't support multiple MSI yet */ 3234 /* x86 doesn't support multiple MSI yet */
3552 if (type == PCI_CAP_ID_MSI && nvec > 1) 3235 if (type == PCI_CAP_ID_MSI && nvec > 1)
@@ -3599,31 +3282,31 @@ error:
3599 return ret; 3282 return ret;
3600} 3283}
3601 3284
3602void arch_teardown_msi_irq(unsigned int irq) 3285void native_teardown_msi_irq(unsigned int irq)
3603{ 3286{
3604 destroy_irq(irq); 3287 destroy_irq(irq);
3605} 3288}
3606 3289
3607#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP) 3290#if defined (CONFIG_DMAR) || defined (CONFIG_INTR_REMAP)
3608#ifdef CONFIG_SMP 3291#ifdef CONFIG_SMP
3609static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3292static int
3293dmar_msi_set_affinity(struct irq_data *data, const struct cpumask *mask,
3294 bool force)
3610{ 3295{
3611 struct irq_desc *desc = irq_to_desc(irq); 3296 struct irq_cfg *cfg = data->chip_data;
3612 struct irq_cfg *cfg; 3297 unsigned int dest, irq = data->irq;
3613 struct msi_msg msg; 3298 struct msi_msg msg;
3614 unsigned int dest;
3615 3299
3616 if (set_desc_affinity(desc, mask, &dest)) 3300 if (__ioapic_set_affinity(data, mask, &dest))
3617 return -1; 3301 return -1;
3618 3302
3619 cfg = desc->chip_data;
3620
3621 dmar_msi_read(irq, &msg); 3303 dmar_msi_read(irq, &msg);
3622 3304
3623 msg.data &= ~MSI_DATA_VECTOR_MASK; 3305 msg.data &= ~MSI_DATA_VECTOR_MASK;
3624 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3306 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3625 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3307 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3626 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3308 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3309 msg.address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(dest);
3627 3310
3628 dmar_msi_write(irq, &msg); 3311 dmar_msi_write(irq, &msg);
3629 3312
@@ -3633,14 +3316,14 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3633#endif /* CONFIG_SMP */ 3316#endif /* CONFIG_SMP */
3634 3317
3635static struct irq_chip dmar_msi_type = { 3318static struct irq_chip dmar_msi_type = {
3636 .name = "DMAR_MSI", 3319 .name = "DMAR_MSI",
3637 .unmask = dmar_msi_unmask, 3320 .irq_unmask = dmar_msi_unmask,
3638 .mask = dmar_msi_mask, 3321 .irq_mask = dmar_msi_mask,
3639 .ack = ack_apic_edge, 3322 .irq_ack = ack_apic_edge,
3640#ifdef CONFIG_SMP 3323#ifdef CONFIG_SMP
3641 .set_affinity = dmar_msi_set_affinity, 3324 .irq_set_affinity = dmar_msi_set_affinity,
3642#endif 3325#endif
3643 .retrigger = ioapic_retrigger_irq, 3326 .irq_retrigger = ioapic_retrigger_irq,
3644}; 3327};
3645 3328
3646int arch_setup_dmar_msi(unsigned int irq) 3329int arch_setup_dmar_msi(unsigned int irq)
@@ -3652,8 +3335,8 @@ int arch_setup_dmar_msi(unsigned int irq)
3652 if (ret < 0) 3335 if (ret < 0)
3653 return ret; 3336 return ret;
3654 dmar_msi_write(irq, &msg); 3337 dmar_msi_write(irq, &msg);
3655 set_irq_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq, 3338 irq_set_chip_and_handler_name(irq, &dmar_msi_type, handle_edge_irq,
3656 "edge"); 3339 "edge");
3657 return 0; 3340 return 0;
3658} 3341}
3659#endif 3342#endif
@@ -3661,26 +3344,24 @@ int arch_setup_dmar_msi(unsigned int irq)
3661#ifdef CONFIG_HPET_TIMER 3344#ifdef CONFIG_HPET_TIMER
3662 3345
3663#ifdef CONFIG_SMP 3346#ifdef CONFIG_SMP
3664static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask) 3347static int hpet_msi_set_affinity(struct irq_data *data,
3348 const struct cpumask *mask, bool force)
3665{ 3349{
3666 struct irq_desc *desc = irq_to_desc(irq); 3350 struct irq_cfg *cfg = data->chip_data;
3667 struct irq_cfg *cfg;
3668 struct msi_msg msg; 3351 struct msi_msg msg;
3669 unsigned int dest; 3352 unsigned int dest;
3670 3353
3671 if (set_desc_affinity(desc, mask, &dest)) 3354 if (__ioapic_set_affinity(data, mask, &dest))
3672 return -1; 3355 return -1;
3673 3356
3674 cfg = desc->chip_data; 3357 hpet_msi_read(data->handler_data, &msg);
3675
3676 hpet_msi_read(irq, &msg);
3677 3358
3678 msg.data &= ~MSI_DATA_VECTOR_MASK; 3359 msg.data &= ~MSI_DATA_VECTOR_MASK;
3679 msg.data |= MSI_DATA_VECTOR(cfg->vector); 3360 msg.data |= MSI_DATA_VECTOR(cfg->vector);
3680 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK; 3361 msg.address_lo &= ~MSI_ADDR_DEST_ID_MASK;
3681 msg.address_lo |= MSI_ADDR_DEST_ID(dest); 3362 msg.address_lo |= MSI_ADDR_DEST_ID(dest);
3682 3363
3683 hpet_msi_write(irq, &msg); 3364 hpet_msi_write(data->handler_data, &msg);
3684 3365
3685 return 0; 3366 return 0;
3686} 3367}
@@ -3688,34 +3369,34 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3688#endif /* CONFIG_SMP */ 3369#endif /* CONFIG_SMP */
3689 3370
3690static struct irq_chip ir_hpet_msi_type = { 3371static struct irq_chip ir_hpet_msi_type = {
3691 .name = "IR-HPET_MSI", 3372 .name = "IR-HPET_MSI",
3692 .unmask = hpet_msi_unmask, 3373 .irq_unmask = hpet_msi_unmask,
3693 .mask = hpet_msi_mask, 3374 .irq_mask = hpet_msi_mask,
3694#ifdef CONFIG_INTR_REMAP 3375#ifdef CONFIG_INTR_REMAP
3695 .ack = ir_ack_apic_edge, 3376 .irq_ack = ir_ack_apic_edge,
3696#ifdef CONFIG_SMP 3377#ifdef CONFIG_SMP
3697 .set_affinity = ir_set_msi_irq_affinity, 3378 .irq_set_affinity = ir_msi_set_affinity,
3698#endif 3379#endif
3699#endif 3380#endif
3700 .retrigger = ioapic_retrigger_irq, 3381 .irq_retrigger = ioapic_retrigger_irq,
3701}; 3382};
3702 3383
3703static struct irq_chip hpet_msi_type = { 3384static struct irq_chip hpet_msi_type = {
3704 .name = "HPET_MSI", 3385 .name = "HPET_MSI",
3705 .unmask = hpet_msi_unmask, 3386 .irq_unmask = hpet_msi_unmask,
3706 .mask = hpet_msi_mask, 3387 .irq_mask = hpet_msi_mask,
3707 .ack = ack_apic_edge, 3388 .irq_ack = ack_apic_edge,
3708#ifdef CONFIG_SMP 3389#ifdef CONFIG_SMP
3709 .set_affinity = hpet_msi_set_affinity, 3390 .irq_set_affinity = hpet_msi_set_affinity,
3710#endif 3391#endif
3711 .retrigger = ioapic_retrigger_irq, 3392 .irq_retrigger = ioapic_retrigger_irq,
3712}; 3393};
3713 3394
3714int arch_setup_hpet_msi(unsigned int irq, unsigned int id) 3395int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3715{ 3396{
3716 int ret; 3397 struct irq_chip *chip = &hpet_msi_type;
3717 struct msi_msg msg; 3398 struct msi_msg msg;
3718 struct irq_desc *desc = irq_to_desc(irq); 3399 int ret;
3719 3400
3720 if (intr_remapping_enabled) { 3401 if (intr_remapping_enabled) {
3721 struct intel_iommu *iommu = map_hpet_to_ir(id); 3402 struct intel_iommu *iommu = map_hpet_to_ir(id);
@@ -3733,15 +3414,12 @@ int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3733 if (ret < 0) 3414 if (ret < 0)
3734 return ret; 3415 return ret;
3735 3416
3736 hpet_msi_write(irq, &msg); 3417 hpet_msi_write(irq_get_handler_data(irq), &msg);
3737 desc->status |= IRQ_MOVE_PCNTXT; 3418 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
3738 if (irq_remapped(irq)) 3419 if (irq_remapped(irq_get_chip_data(irq)))
3739 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type, 3420 chip = &ir_hpet_msi_type;
3740 handle_edge_irq, "edge");
3741 else
3742 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3743 handle_edge_irq, "edge");
3744 3421
3422 irq_set_chip_and_handler_name(irq, chip, handle_edge_irq, "edge");
3745 return 0; 3423 return 0;
3746} 3424}
3747#endif 3425#endif
@@ -3768,33 +3446,30 @@ static void target_ht_irq(unsigned int irq, unsigned int dest, u8 vector)
3768 write_ht_irq_msg(irq, &msg); 3446 write_ht_irq_msg(irq, &msg);
3769} 3447}
3770 3448
3771static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask) 3449static int
3450ht_set_affinity(struct irq_data *data, const struct cpumask *mask, bool force)
3772{ 3451{
3773 struct irq_desc *desc = irq_to_desc(irq); 3452 struct irq_cfg *cfg = data->chip_data;
3774 struct irq_cfg *cfg;
3775 unsigned int dest; 3453 unsigned int dest;
3776 3454
3777 if (set_desc_affinity(desc, mask, &dest)) 3455 if (__ioapic_set_affinity(data, mask, &dest))
3778 return -1; 3456 return -1;
3779 3457
3780 cfg = desc->chip_data; 3458 target_ht_irq(data->irq, dest, cfg->vector);
3781
3782 target_ht_irq(irq, dest, cfg->vector);
3783
3784 return 0; 3459 return 0;
3785} 3460}
3786 3461
3787#endif 3462#endif
3788 3463
3789static struct irq_chip ht_irq_chip = { 3464static struct irq_chip ht_irq_chip = {
3790 .name = "PCI-HT", 3465 .name = "PCI-HT",
3791 .mask = mask_ht_irq, 3466 .irq_mask = mask_ht_irq,
3792 .unmask = unmask_ht_irq, 3467 .irq_unmask = unmask_ht_irq,
3793 .ack = ack_apic_edge, 3468 .irq_ack = ack_apic_edge,
3794#ifdef CONFIG_SMP 3469#ifdef CONFIG_SMP
3795 .set_affinity = set_ht_irq_affinity, 3470 .irq_set_affinity = ht_set_affinity,
3796#endif 3471#endif
3797 .retrigger = ioapic_retrigger_irq, 3472 .irq_retrigger = ioapic_retrigger_irq,
3798}; 3473};
3799 3474
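For illustration only (not part of the patch): the hunks above convert the chips from the old irq_desc-based callbacks (.mask, .set_affinity, keyed by IRQ number) to the irq_data-based ones (.irq_mask, .irq_set_affinity), whose argument already carries the per-IRQ chip_data, so the handlers no longer need the irq_to_desc()/desc->chip_data lookups. A rough userspace model of that shape; all names here are illustrative, not kernel APIs.

#include <stdbool.h>
#include <stdio.h>

struct my_irq_cfg  { unsigned int vector; };
struct my_irq_data { unsigned int irq; void *chip_data; };

struct my_chip {
    const char *name;
    int (*irq_set_affinity)(struct my_irq_data *data, unsigned int cpu, bool force);
};

static int demo_set_affinity(struct my_irq_data *data, unsigned int cpu, bool force)
{
    struct my_irq_cfg *cfg = data->chip_data;   /* no lookup by IRQ number needed */

    (void)force;                                /* unused in this toy model */
    printf("irq %u -> cpu %u (vector 0x%x)\n", data->irq, cpu, cfg->vector);
    return 0;
}

static struct my_chip demo_chip = {
    .name             = "DEMO",
    .irq_set_affinity = demo_set_affinity,
};

int main(void)
{
    struct my_irq_cfg  cfg  = { .vector = 0x31 };
    struct my_irq_data data = { .irq = 24, .chip_data = &cfg };

    return demo_chip.irq_set_affinity(&data, 3, false);
}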
3800int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev) 3475int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
@@ -3831,7 +3506,7 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3831 3506
3832 write_ht_irq_msg(irq, &msg); 3507 write_ht_irq_msg(irq, &msg);
3833 3508
3834 set_irq_chip_and_handler_name(irq, &ht_irq_chip, 3509 irq_set_chip_and_handler_name(irq, &ht_irq_chip,
3835 handle_edge_irq, "edge"); 3510 handle_edge_irq, "edge");
3836 3511
3837 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq); 3512 dev_printk(KERN_DEBUG, &dev->dev, "irq %d for HT\n", irq);
@@ -3840,7 +3515,40 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3840} 3515}
3841#endif /* CONFIG_HT_IRQ */ 3516#endif /* CONFIG_HT_IRQ */
3842 3517
3843int __init io_apic_get_redir_entries (int ioapic) 3518static int
3519io_apic_setup_irq_pin(unsigned int irq, int node, struct io_apic_irq_attr *attr)
3520{
3521 struct irq_cfg *cfg = alloc_irq_and_cfg_at(irq, node);
3522 int ret;
3523
3524 if (!cfg)
3525 return -EINVAL;
3526 ret = __add_pin_to_irq_node(cfg, node, attr->ioapic, attr->ioapic_pin);
3527 if (!ret)
3528 setup_ioapic_irq(attr->ioapic, attr->ioapic_pin, irq, cfg,
3529 attr->trigger, attr->polarity);
3530 return ret;
3531}
3532
3533int io_apic_setup_irq_pin_once(unsigned int irq, int node,
3534 struct io_apic_irq_attr *attr)
3535{
3536 unsigned int id = attr->ioapic, pin = attr->ioapic_pin;
3537 int ret;
3538
3539 /* Avoid redundant programming */
3540 if (test_bit(pin, ioapics[id].pin_programmed)) {
3541 pr_debug("Pin %d-%d already programmed\n",
3542 mpc_ioapic_id(id), pin);
3543 return 0;
3544 }
3545 ret = io_apic_setup_irq_pin(irq, node, attr);
3546 if (!ret)
3547 set_bit(pin, ioapics[id].pin_programmed);
3548 return ret;
3549}
3550
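For illustration only: a minimal userspace sketch of the "program each pin only once" guard that io_apic_setup_irq_pin_once() implements above with the per-IOAPIC pin_programmed bitmap. program_pin(), MAX_PINS and the plain bool array are made-up stand-ins for the kernel's setup path and bitops.

#include <stdbool.h>
#include <stdio.h>

#define MAX_PINS 128

static bool pin_programmed[MAX_PINS];

static int program_pin(unsigned int pin)
{
    printf("programming pin %u\n", pin);
    return 0;                       /* 0 == success, mirroring the kernel convention */
}

static int setup_pin_once(unsigned int pin)
{
    int ret;

    if (pin_programmed[pin]) {
        printf("pin %u already programmed, skipping\n", pin);
        return 0;                   /* redundant PRT entry: nothing to do */
    }
    ret = program_pin(pin);
    if (!ret)
        pin_programmed[pin] = true; /* remember only pins that were actually set up */
    return ret;
}

int main(void)
{
    setup_pin_once(9);
    setup_pin_once(9);              /* second GSI mapping to the same pin: no-op */
    return 0;
}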
3551static int __init io_apic_get_redir_entries(int ioapic)
3844{ 3552{
3845 union IO_APIC_reg_01 reg_01; 3553 union IO_APIC_reg_01 reg_01;
3846 unsigned long flags; 3554 unsigned long flags;
@@ -3856,7 +3564,7 @@ int __init io_apic_get_redir_entries (int ioapic)
3856 return reg_01.bits.entries + 1; 3564 return reg_01.bits.entries + 1;
3857} 3565}
3858 3566
3859void __init probe_nr_irqs_gsi(void) 3567static void __init probe_nr_irqs_gsi(void)
3860{ 3568{
3861 int nr; 3569 int nr;
3862 3570
@@ -3867,6 +3575,11 @@ void __init probe_nr_irqs_gsi(void)
3867 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi); 3575 printk(KERN_DEBUG "nr_irqs_gsi: %d\n", nr_irqs_gsi);
3868} 3576}
3869 3577
3578int get_nr_irqs_gsi(void)
3579{
3580 return nr_irqs_gsi;
3581}
3582
3870#ifdef CONFIG_SPARSE_IRQ 3583#ifdef CONFIG_SPARSE_IRQ
3871int __init arch_probe_nr_irqs(void) 3584int __init arch_probe_nr_irqs(void)
3872{ 3585{
@@ -3885,104 +3598,28 @@ int __init arch_probe_nr_irqs(void)
3885 if (nr < nr_irqs) 3598 if (nr < nr_irqs)
3886 nr_irqs = nr; 3599 nr_irqs = nr;
3887 3600
3888 return 0; 3601 return NR_IRQS_LEGACY;
3889} 3602}
3890#endif 3603#endif
3891 3604
3892static int __io_apic_set_pci_routing(struct device *dev, int irq, 3605int io_apic_set_pci_routing(struct device *dev, int irq,
3893 struct io_apic_irq_attr *irq_attr) 3606 struct io_apic_irq_attr *irq_attr)
3894{ 3607{
3895 struct irq_desc *desc;
3896 struct irq_cfg *cfg;
3897 int node; 3608 int node;
3898 int ioapic, pin;
3899 int trigger, polarity;
3900 3609
3901 ioapic = irq_attr->ioapic;
3902 if (!IO_APIC_IRQ(irq)) { 3610 if (!IO_APIC_IRQ(irq)) {
3903 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n", 3611 apic_printk(APIC_QUIET,KERN_ERR "IOAPIC[%d]: Invalid reference to IRQ 0\n",
3904 ioapic); 3612 irq_attr->ioapic);
3905 return -EINVAL; 3613 return -EINVAL;
3906 } 3614 }
3907 3615
3908 if (dev) 3616 node = dev ? dev_to_node(dev) : cpu_to_node(0);
3909 node = dev_to_node(dev);
3910 else
3911 node = cpu_to_node(boot_cpu_id);
3912
3913 desc = irq_to_desc_alloc_node(irq, node);
3914 if (!desc) {
3915 printk(KERN_INFO "can not get irq_desc %d\n", irq);
3916 return 0;
3917 }
3918
3919 pin = irq_attr->ioapic_pin;
3920 trigger = irq_attr->trigger;
3921 polarity = irq_attr->polarity;
3922 3617
3923 /* 3618 return io_apic_setup_irq_pin_once(irq, node, irq_attr);
3924 * IRQs < 16 are already in the irq_2_pin[] map
3925 */
3926 if (irq >= legacy_pic->nr_legacy_irqs) {
3927 cfg = desc->chip_data;
3928 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3929 printk(KERN_INFO "can not add pin %d for irq %d\n",
3930 pin, irq);
3931 return 0;
3932 }
3933 }
3934
3935 setup_IO_APIC_irq(ioapic, pin, irq, desc, trigger, polarity);
3936
3937 return 0;
3938}
3939
3940int io_apic_set_pci_routing(struct device *dev, int irq,
3941 struct io_apic_irq_attr *irq_attr)
3942{
3943 int ioapic, pin;
3944 /*
3945 * Avoid pin reprogramming. PRTs typically include entries
3946 * with redundant pin->gsi mappings (but unique PCI devices);
3947 * we only program the IOAPIC on the first.
3948 */
3949 ioapic = irq_attr->ioapic;
3950 pin = irq_attr->ioapic_pin;
3951 if (test_bit(pin, mp_ioapic_routing[ioapic].pin_programmed)) {
3952 pr_debug("Pin %d-%d already programmed\n",
3953 mp_ioapics[ioapic].apicid, pin);
3954 return 0;
3955 }
3956 set_bit(pin, mp_ioapic_routing[ioapic].pin_programmed);
3957
3958 return __io_apic_set_pci_routing(dev, irq, irq_attr);
3959}
3960
3961u8 __init io_apic_unique_id(u8 id)
3962{
3963#ifdef CONFIG_X86_32
3964 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3965 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3966 return io_apic_get_unique_id(nr_ioapics, id);
3967 else
3968 return id;
3969#else
3970 int i;
3971 DECLARE_BITMAP(used, 256);
3972
3973 bitmap_zero(used, 256);
3974 for (i = 0; i < nr_ioapics; i++) {
3975 struct mpc_ioapic *ia = &mp_ioapics[i];
3976 __set_bit(ia->apicid, used);
3977 }
3978 if (!test_bit(id, used))
3979 return id;
3980 return find_first_zero_bit(used, 256);
3981#endif
3982} 3619}
3983 3620
3984#ifdef CONFIG_X86_32 3621#ifdef CONFIG_X86_32
3985int __init io_apic_get_unique_id(int ioapic, int apic_id) 3622static int __init io_apic_get_unique_id(int ioapic, int apic_id)
3986{ 3623{
3987 union IO_APIC_reg_00 reg_00; 3624 union IO_APIC_reg_00 reg_00;
3988 static physid_mask_t apic_id_map = PHYSID_MASK_NONE; 3625 static physid_mask_t apic_id_map = PHYSID_MASK_NONE;
@@ -4055,9 +3692,32 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
4055 3692
4056 return apic_id; 3693 return apic_id;
4057} 3694}
3695
3696static u8 __init io_apic_unique_id(u8 id)
3697{
3698 if ((boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
3699 !APIC_XAPIC(apic_version[boot_cpu_physical_apicid]))
3700 return io_apic_get_unique_id(nr_ioapics, id);
3701 else
3702 return id;
3703}
3704#else
3705static u8 __init io_apic_unique_id(u8 id)
3706{
3707 int i;
3708 DECLARE_BITMAP(used, 256);
3709
3710 bitmap_zero(used, 256);
3711 for (i = 0; i < nr_ioapics; i++) {
3712 __set_bit(mpc_ioapic_id(i), used);
3713 }
3714 if (!test_bit(id, used))
3715 return id;
3716 return find_first_zero_bit(used, 256);
3717}
4058#endif 3718#endif
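For illustration only: the 64-bit io_apic_unique_id() above keeps the BIOS-provided APIC id when no registered IOAPIC already claims it, and otherwise hands out the lowest unused id. A userspace sketch of that policy, with a plain bool array standing in for the kernel bitmap and find_first_zero_bit().

#include <stdbool.h>
#include <stdio.h>

#define NR_IDS 256
static bool used[NR_IDS];

static unsigned int unique_id(unsigned int wanted)
{
    unsigned int i;

    if (!used[wanted])
        return wanted;              /* requested id does not collide, keep it */
    for (i = 0; i < NR_IDS; i++)    /* find_first_zero_bit() equivalent */
        if (!used[i])
            return i;
    return NR_IDS;                  /* table full; cannot happen with < 256 IOAPICs */
}

int main(void)
{
    used[2] = true;                 /* pretend IOAPIC id 2 is already taken */
    printf("%u\n", unique_id(5));   /* 5: free, kept as-is */
    printf("%u\n", unique_id(2));   /* 0: collision, first free id returned */
    return 0;
}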
4059 3719
4060int __init io_apic_get_version(int ioapic) 3720static int __init io_apic_get_version(int ioapic)
4061{ 3721{
4062 union IO_APIC_reg_01 reg_01; 3722 union IO_APIC_reg_01 reg_01;
4063 unsigned long flags; 3723 unsigned long flags;
@@ -4102,14 +3762,14 @@ int acpi_get_override_irq(u32 gsi, int *trigger, int *polarity)
4102void __init setup_ioapic_dest(void) 3762void __init setup_ioapic_dest(void)
4103{ 3763{
4104 int pin, ioapic, irq, irq_entry; 3764 int pin, ioapic, irq, irq_entry;
4105 struct irq_desc *desc;
4106 const struct cpumask *mask; 3765 const struct cpumask *mask;
3766 struct irq_data *idata;
4107 3767
4108 if (skip_ioapic_setup == 1) 3768 if (skip_ioapic_setup == 1)
4109 return; 3769 return;
4110 3770
4111 for (ioapic = 0; ioapic < nr_ioapics; ioapic++) 3771 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4112 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 3772 for (pin = 0; pin < ioapics[ioapic].nr_registers; pin++) {
4113 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 3773 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4114 if (irq_entry == -1) 3774 if (irq_entry == -1)
4115 continue; 3775 continue;
@@ -4118,21 +3778,20 @@ void __init setup_ioapic_dest(void)
4118 if ((ioapic > 0) && (irq > 16)) 3778 if ((ioapic > 0) && (irq > 16))
4119 continue; 3779 continue;
4120 3780
4121 desc = irq_to_desc(irq); 3781 idata = irq_get_irq_data(irq);
4122 3782
4123 /* 3783 /*
4124 * Honour affinities which have been set in early boot 3784 * Honour affinities which have been set in early boot
4125 */ 3785 */
4126 if (desc->status & 3786 if (!irqd_can_balance(idata) || irqd_affinity_was_set(idata))
4127 (IRQ_NO_BALANCING | IRQ_AFFINITY_SET)) 3787 mask = idata->affinity;
4128 mask = desc->affinity;
4129 else 3788 else
4130 mask = apic->target_cpus(); 3789 mask = apic->target_cpus();
4131 3790
4132 if (intr_remapping_enabled) 3791 if (intr_remapping_enabled)
4133 set_ir_ioapic_affinity_irq_desc(desc, mask); 3792 ir_ioapic_set_affinity(idata, mask, false);
4134 else 3793 else
4135 set_ioapic_affinity_irq_desc(desc, mask); 3794 ioapic_set_affinity(idata, mask, false);
4136 } 3795 }
4137 3796
4138} 3797}
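For illustration only: a userspace sketch of the affinity decision in setup_ioapic_dest() above. An IRQ whose affinity was pinned in early boot, or which is marked non-balanceable, keeps its mask; every other IRQ is pointed at the default target mask. The flags, masks and struct below are illustrative stand-ins for irqd_can_balance()/irqd_affinity_was_set() and apic->target_cpus().

#include <stdbool.h>
#include <stdio.h>

#define DEFAULT_MASK 0x0f           /* e.g. all online CPUs */

struct demo_irq {
    bool can_balance;
    bool affinity_was_set;
    unsigned int affinity;          /* CPU bitmask */
};

static unsigned int pick_dest_mask(const struct demo_irq *irq)
{
    if (!irq->can_balance || irq->affinity_was_set)
        return irq->affinity;       /* honour what early boot decided */
    return DEFAULT_MASK;
}

int main(void)
{
    struct demo_irq pinned = { .can_balance = true, .affinity_was_set = true,  .affinity = 0x2 };
    struct demo_irq plain  = { .can_balance = true, .affinity_was_set = false, .affinity = 0x1 };

    printf("pinned irq -> 0x%x\n", pick_dest_mask(&pinned));    /* 0x2 */
    printf("plain  irq -> 0x%x\n", pick_dest_mask(&plain));     /* 0xf */
    return 0;
}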
@@ -4172,7 +3831,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4172 return res; 3831 return res;
4173} 3832}
4174 3833
4175void __init ioapic_init_mappings(void) 3834void __init ioapic_and_gsi_init(void)
4176{ 3835{
4177 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0; 3836 unsigned long ioapic_phys, idx = FIX_IO_APIC_BASE_0;
4178 struct resource *ioapic_res; 3837 struct resource *ioapic_res;
@@ -4181,7 +3840,7 @@ void __init ioapic_init_mappings(void)
4181 ioapic_res = ioapic_setup_resources(nr_ioapics); 3840 ioapic_res = ioapic_setup_resources(nr_ioapics);
4182 for (i = 0; i < nr_ioapics; i++) { 3841 for (i = 0; i < nr_ioapics; i++) {
4183 if (smp_found_config) { 3842 if (smp_found_config) {
4184 ioapic_phys = mp_ioapics[i].apicaddr; 3843 ioapic_phys = mpc_ioapic_addr(i);
4185#ifdef CONFIG_X86_32 3844#ifdef CONFIG_X86_32
4186 if (!ioapic_phys) { 3845 if (!ioapic_phys) {
4187 printk(KERN_ERR 3846 printk(KERN_ERR
@@ -4210,6 +3869,8 @@ fake_ioapic_page:
4210 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1; 3869 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4211 ioapic_res++; 3870 ioapic_res++;
4212 } 3871 }
3872
3873 probe_nr_irqs_gsi();
4213} 3874}
4214 3875
4215void __init ioapic_insert_resources(void) 3876void __init ioapic_insert_resources(void)
@@ -4234,10 +3895,14 @@ int mp_find_ioapic(u32 gsi)
4234{ 3895{
4235 int i = 0; 3896 int i = 0;
4236 3897
3898 if (nr_ioapics == 0)
3899 return -1;
3900
4237 /* Find the IOAPIC that manages this GSI. */ 3901 /* Find the IOAPIC that manages this GSI. */
4238 for (i = 0; i < nr_ioapics; i++) { 3902 for (i = 0; i < nr_ioapics; i++) {
4239 if ((gsi >= mp_gsi_routing[i].gsi_base) 3903 struct mp_ioapic_gsi *gsi_cfg = mp_ioapic_gsi_routing(i);
4240 && (gsi <= mp_gsi_routing[i].gsi_end)) 3904 if ((gsi >= gsi_cfg->gsi_base)
3905 && (gsi <= gsi_cfg->gsi_end))
4241 return i; 3906 return i;
4242 } 3907 }
4243 3908
@@ -4247,18 +3912,22 @@ int mp_find_ioapic(u32 gsi)
4247 3912
4248int mp_find_ioapic_pin(int ioapic, u32 gsi) 3913int mp_find_ioapic_pin(int ioapic, u32 gsi)
4249{ 3914{
3915 struct mp_ioapic_gsi *gsi_cfg;
3916
4250 if (WARN_ON(ioapic == -1)) 3917 if (WARN_ON(ioapic == -1))
4251 return -1; 3918 return -1;
4252 if (WARN_ON(gsi > mp_gsi_routing[ioapic].gsi_end)) 3919
3920 gsi_cfg = mp_ioapic_gsi_routing(ioapic);
3921 if (WARN_ON(gsi > gsi_cfg->gsi_end))
4253 return -1; 3922 return -1;
4254 3923
4255 return gsi - mp_gsi_routing[ioapic].gsi_base; 3924 return gsi - gsi_cfg->gsi_base;
4256} 3925}
4257 3926
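For illustration only: mp_register_ioapic() further down records each IOAPIC's GSI window as [gsi_base, gsi_base + entries - 1], and mp_find_ioapic()/mp_find_ioapic_pin() above resolve a GSI by scanning those windows. A small userspace sketch of that lookup; the range table here is made up.

#include <stdio.h>

struct gsi_range { unsigned int base, end; };

static const struct gsi_range ranges[] = {
    { 0, 23 },          /* IOAPIC 0 with 24 redirection entries */
    { 24, 55 },         /* IOAPIC 1 with 32 redirection entries */
};
#define NR_RANGES (sizeof(ranges) / sizeof(ranges[0]))

static int find_ioapic(unsigned int gsi)
{
    unsigned int i;

    for (i = 0; i < NR_RANGES; i++)
        if (gsi >= ranges[i].base && gsi <= ranges[i].end)
            return (int)i;
    return -1;          /* GSI not handled by any registered IOAPIC */
}

int main(void)
{
    unsigned int gsi = 40;
    int ioapic = find_ioapic(gsi);

    if (ioapic >= 0)
        printf("GSI %u -> IOAPIC %d pin %u\n",
               gsi, ioapic, gsi - ranges[ioapic].base);
    return 0;
}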
4258static int bad_ioapic(unsigned long address) 3927static __init int bad_ioapic(unsigned long address)
4259{ 3928{
4260 if (nr_ioapics >= MAX_IO_APICS) { 3929 if (nr_ioapics >= MAX_IO_APICS) {
4261 printk(KERN_WARNING "WARING: Max # of I/O APICs (%d) exceeded " 3930 printk(KERN_WARNING "WARNING: Max # of I/O APICs (%d) exceeded "
4262 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics); 3931 "(found %d), skipping\n", MAX_IO_APICS, nr_ioapics);
4263 return 1; 3932 return 1;
4264 } 3933 }
@@ -4274,40 +3943,42 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4274{ 3943{
4275 int idx = 0; 3944 int idx = 0;
4276 int entries; 3945 int entries;
3946 struct mp_ioapic_gsi *gsi_cfg;
4277 3947
4278 if (bad_ioapic(address)) 3948 if (bad_ioapic(address))
4279 return; 3949 return;
4280 3950
4281 idx = nr_ioapics; 3951 idx = nr_ioapics;
4282 3952
4283 mp_ioapics[idx].type = MP_IOAPIC; 3953 ioapics[idx].mp_config.type = MP_IOAPIC;
4284 mp_ioapics[idx].flags = MPC_APIC_USABLE; 3954 ioapics[idx].mp_config.flags = MPC_APIC_USABLE;
4285 mp_ioapics[idx].apicaddr = address; 3955 ioapics[idx].mp_config.apicaddr = address;
4286 3956
4287 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address); 3957 set_fixmap_nocache(FIX_IO_APIC_BASE_0 + idx, address);
4288 mp_ioapics[idx].apicid = io_apic_unique_id(id); 3958 ioapics[idx].mp_config.apicid = io_apic_unique_id(id);
4289 mp_ioapics[idx].apicver = io_apic_get_version(idx); 3959 ioapics[idx].mp_config.apicver = io_apic_get_version(idx);
4290 3960
4291 /* 3961 /*
4292 * Build basic GSI lookup table to facilitate gsi->io_apic lookups 3962 * Build basic GSI lookup table to facilitate gsi->io_apic lookups
4293 * and to prevent reprogramming of IOAPIC pins (PCI GSIs). 3963 * and to prevent reprogramming of IOAPIC pins (PCI GSIs).
4294 */ 3964 */
4295 entries = io_apic_get_redir_entries(idx); 3965 entries = io_apic_get_redir_entries(idx);
4296 mp_gsi_routing[idx].gsi_base = gsi_base; 3966 gsi_cfg = mp_ioapic_gsi_routing(idx);
4297 mp_gsi_routing[idx].gsi_end = gsi_base + entries - 1; 3967 gsi_cfg->gsi_base = gsi_base;
3968 gsi_cfg->gsi_end = gsi_base + entries - 1;
4298 3969
4299 /* 3970 /*
4300 * The number of IO-APIC IRQ registers (== #pins): 3971 * The number of IO-APIC IRQ registers (== #pins):
4301 */ 3972 */
4302 nr_ioapic_registers[idx] = entries; 3973 ioapics[idx].nr_registers = entries;
4303 3974
4304 if (mp_gsi_routing[idx].gsi_end >= gsi_top) 3975 if (gsi_cfg->gsi_end >= gsi_top)
4305 gsi_top = mp_gsi_routing[idx].gsi_end + 1; 3976 gsi_top = gsi_cfg->gsi_end + 1;
4306 3977
4307 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, " 3978 printk(KERN_INFO "IOAPIC[%d]: apic_id %d, version %d, address 0x%x, "
4308 "GSI %d-%d\n", idx, mp_ioapics[idx].apicid, 3979 "GSI %d-%d\n", idx, mpc_ioapic_id(idx),
4309 mp_ioapics[idx].apicver, mp_ioapics[idx].apicaddr, 3980 mpc_ioapic_ver(idx), mpc_ioapic_addr(idx),
4310 mp_gsi_routing[idx].gsi_base, mp_gsi_routing[idx].gsi_end); 3981 gsi_cfg->gsi_base, gsi_cfg->gsi_end);
4311 3982
4312 nr_ioapics++; 3983 nr_ioapics++;
4313} 3984}
@@ -4315,20 +3986,16 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4315/* Enable IOAPIC early just for system timer */ 3986/* Enable IOAPIC early just for system timer */
4316void __init pre_init_apic_IRQ0(void) 3987void __init pre_init_apic_IRQ0(void)
4317{ 3988{
4318 struct irq_cfg *cfg; 3989 struct io_apic_irq_attr attr = { 0, 0, 0, 0 };
4319 struct irq_desc *desc;
4320 3990
4321 printk(KERN_INFO "Early APIC setup for system timer0\n"); 3991 printk(KERN_INFO "Early APIC setup for system timer0\n");
4322#ifndef CONFIG_SMP 3992#ifndef CONFIG_SMP
4323 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid); 3993 physid_set_mask_of_physid(boot_cpu_physical_apicid,
3994 &phys_cpu_present_map);
4324#endif 3995#endif
4325 desc = irq_to_desc_alloc_node(0, 0);
4326
4327 setup_local_APIC(); 3996 setup_local_APIC();
4328 3997
4329 cfg = irq_cfg(0); 3998 io_apic_setup_irq_pin(0, 0, &attr);
4330 add_pin_to_irq_node(cfg, 0, 0, 0); 3999 irq_set_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq,
4331 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge"); 4000 "edge");
4332
4333 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4334} 4001}
diff --git a/arch/x86/kernel/apic/ipi.c b/arch/x86/kernel/apic/ipi.c
index 08385e090a6f..cce91bf26676 100644
--- a/arch/x86/kernel/apic/ipi.c
+++ b/arch/x86/kernel/apic/ipi.c
@@ -56,6 +56,8 @@ void default_send_IPI_mask_allbutself_phys(const struct cpumask *mask,
56 local_irq_restore(flags); 56 local_irq_restore(flags);
57} 57}
58 58
59#ifdef CONFIG_X86_32
60
59void default_send_IPI_mask_sequence_logical(const struct cpumask *mask, 61void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
60 int vector) 62 int vector)
61{ 63{
@@ -71,8 +73,8 @@ void default_send_IPI_mask_sequence_logical(const struct cpumask *mask,
71 local_irq_save(flags); 73 local_irq_save(flags);
72 for_each_cpu(query_cpu, mask) 74 for_each_cpu(query_cpu, mask)
73 __default_send_IPI_dest_field( 75 __default_send_IPI_dest_field(
74 apic->cpu_to_logical_apicid(query_cpu), vector, 76 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
75 apic->dest_logical); 77 vector, apic->dest_logical);
76 local_irq_restore(flags); 78 local_irq_restore(flags);
77} 79}
78 80
@@ -90,14 +92,12 @@ void default_send_IPI_mask_allbutself_logical(const struct cpumask *mask,
90 if (query_cpu == this_cpu) 92 if (query_cpu == this_cpu)
91 continue; 93 continue;
92 __default_send_IPI_dest_field( 94 __default_send_IPI_dest_field(
93 apic->cpu_to_logical_apicid(query_cpu), vector, 95 early_per_cpu(x86_cpu_to_logical_apicid, query_cpu),
94 apic->dest_logical); 96 vector, apic->dest_logical);
95 } 97 }
96 local_irq_restore(flags); 98 local_irq_restore(flags);
97} 99}
98 100
99#ifdef CONFIG_X86_32
100
101/* 101/*
102 * This is only used on smaller machines. 102 * This is only used on smaller machines.
103 */ 103 */
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
deleted file mode 100644
index a43f71cb30f8..000000000000
--- a/arch/x86/kernel/apic/nmi.c
+++ /dev/null
@@ -1,567 +0,0 @@
1/*
2 * NMI watchdog support on APIC systems
3 *
4 * Started by Ingo Molnar <mingo@redhat.com>
5 *
6 * Fixes:
7 * Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
8 * Mikael Pettersson : Power Management for local APIC NMI watchdog.
9 * Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
10 * Pavel Machek and
11 * Mikael Pettersson : PM converted to driver model. Disable/enable API.
12 */
13
14#include <asm/apic.h>
15
16#include <linux/nmi.h>
17#include <linux/mm.h>
18#include <linux/delay.h>
19#include <linux/interrupt.h>
20#include <linux/module.h>
21#include <linux/slab.h>
22#include <linux/sysdev.h>
23#include <linux/sysctl.h>
24#include <linux/percpu.h>
25#include <linux/kprobes.h>
26#include <linux/cpumask.h>
27#include <linux/kernel_stat.h>
28#include <linux/kdebug.h>
29#include <linux/smp.h>
30
31#include <asm/i8259.h>
32#include <asm/io_apic.h>
33#include <asm/proto.h>
34#include <asm/timer.h>
35
36#include <asm/mce.h>
37
38#include <asm/mach_traps.h>
39
40int unknown_nmi_panic;
41int nmi_watchdog_enabled;
42
43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
45
46/* nmi_active:
47 * >0: the lapic NMI watchdog is active, but can be disabled
48 * <0: the lapic NMI watchdog has not been set up, and cannot
49 * be enabled
50 * 0: the lapic NMI watchdog is disabled, but can be enabled
51 */
52atomic_t nmi_active = ATOMIC_INIT(0); /* oprofile uses this */
53EXPORT_SYMBOL(nmi_active);
54
55unsigned int nmi_watchdog = NMI_NONE;
56EXPORT_SYMBOL(nmi_watchdog);
57
58static int panic_on_timeout;
59
60static unsigned int nmi_hz = HZ;
61static DEFINE_PER_CPU(short, wd_enabled);
62static int endflag __initdata;
63
64static inline unsigned int get_nmi_count(int cpu)
65{
66 return per_cpu(irq_stat, cpu).__nmi_count;
67}
68
69static inline int mce_in_progress(void)
70{
71#if defined(CONFIG_X86_MCE)
72 return atomic_read(&mce_entry) > 0;
73#endif
74 return 0;
75}
76
77/*
78 * Take the local apic timer and PIT/HPET into account. We don't
79 * know which one is active, when we have highres/dyntick on
80 */
81static inline unsigned int get_timer_irqs(int cpu)
82{
83 return per_cpu(irq_stat, cpu).apic_timer_irqs +
84 per_cpu(irq_stat, cpu).irq0_irqs;
85}
86
87#ifdef CONFIG_SMP
88/*
89 * The performance counters used by NMI_LOCAL_APIC don't trigger when
90 * the CPU is idle. To make sure the NMI watchdog really ticks on all
91 * CPUs during the test make them busy.
92 */
93static __init void nmi_cpu_busy(void *data)
94{
95 local_irq_enable_in_hardirq();
96 /*
97 * Intentionally don't use cpu_relax here. This is
98 * to make sure that the performance counter really ticks,
99 * even if there is a simulator or similar that catches the
100 * pause instruction. On a real HT machine this is fine because
101 * all other CPUs are busy with "useless" delay loops and don't
102 * care if they get somewhat less cycles.
103 */
104 while (endflag == 0)
105 mb();
106}
107#endif
108
109static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count)
110{
111 printk(KERN_CONT "\n");
112
113 printk(KERN_WARNING
114 "WARNING: CPU#%d: NMI appears to be stuck (%d->%d)!\n",
115 cpu, prev_nmi_count[cpu], get_nmi_count(cpu));
116
117 printk(KERN_WARNING
118 "Please report this to bugzilla.kernel.org,\n");
119 printk(KERN_WARNING
120 "and attach the output of the 'dmesg' command.\n");
121
122 per_cpu(wd_enabled, cpu) = 0;
123 atomic_dec(&nmi_active);
124}
125
126static void __acpi_nmi_disable(void *__unused)
127{
128 apic_write(APIC_LVT0, APIC_DM_NMI | APIC_LVT_MASKED);
129}
130
131int __init check_nmi_watchdog(void)
132{
133 unsigned int *prev_nmi_count;
134 int cpu;
135
136 if (!nmi_watchdog_active() || !atomic_read(&nmi_active))
137 return 0;
138
139 prev_nmi_count = kmalloc(nr_cpu_ids * sizeof(int), GFP_KERNEL);
140 if (!prev_nmi_count)
141 goto error;
142
143 printk(KERN_INFO "Testing NMI watchdog ... ");
144
145#ifdef CONFIG_SMP
146 if (nmi_watchdog == NMI_LOCAL_APIC)
147 smp_call_function(nmi_cpu_busy, (void *)&endflag, 0);
148#endif
149
150 for_each_possible_cpu(cpu)
151 prev_nmi_count[cpu] = get_nmi_count(cpu);
152 local_irq_enable();
153 mdelay((20 * 1000) / nmi_hz); /* wait 20 ticks */
154
155 for_each_online_cpu(cpu) {
156 if (!per_cpu(wd_enabled, cpu))
157 continue;
158 if (get_nmi_count(cpu) - prev_nmi_count[cpu] <= 5)
159 report_broken_nmi(cpu, prev_nmi_count);
160 }
161 endflag = 1;
162 if (!atomic_read(&nmi_active)) {
163 kfree(prev_nmi_count);
164 atomic_set(&nmi_active, -1);
165 goto error;
166 }
167 printk("OK.\n");
168
169 /*
170 * now that we know it works we can reduce NMI frequency to
171 * something more reasonable; makes a difference in some configs
172 */
173 if (nmi_watchdog == NMI_LOCAL_APIC)
174 nmi_hz = lapic_adjust_nmi_hz(1);
175
176 kfree(prev_nmi_count);
177 return 0;
178error:
179 if (nmi_watchdog == NMI_IO_APIC) {
180 if (!timer_through_8259)
181 legacy_pic->chip->mask(0);
182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
183 }
184
185#ifdef CONFIG_X86_32
186 timer_ack = 0;
187#endif
188 return -1;
189}
190
191static int __init setup_nmi_watchdog(char *str)
192{
193 unsigned int nmi;
194
195 if (!strncmp(str, "panic", 5)) {
196 panic_on_timeout = 1;
197 str = strchr(str, ',');
198 if (!str)
199 return 1;
200 ++str;
201 }
202
203 if (!strncmp(str, "lapic", 5))
204 nmi_watchdog = NMI_LOCAL_APIC;
205 else if (!strncmp(str, "ioapic", 6))
206 nmi_watchdog = NMI_IO_APIC;
207 else {
208 get_option(&str, &nmi);
209 if (nmi >= NMI_INVALID)
210 return 0;
211 nmi_watchdog = nmi;
212 }
213
214 return 1;
215}
216__setup("nmi_watchdog=", setup_nmi_watchdog);
217
218/*
219 * Suspend/resume support
220 */
221#ifdef CONFIG_PM
222
223static int nmi_pm_active; /* nmi_active before suspend */
224
225static int lapic_nmi_suspend(struct sys_device *dev, pm_message_t state)
226{
227 /* only CPU0 goes here, other CPUs should be offline */
228 nmi_pm_active = atomic_read(&nmi_active);
229 stop_apic_nmi_watchdog(NULL);
230 BUG_ON(atomic_read(&nmi_active) != 0);
231 return 0;
232}
233
234static int lapic_nmi_resume(struct sys_device *dev)
235{
236 /* only CPU0 goes here, other CPUs should be offline */
237 if (nmi_pm_active > 0) {
238 setup_apic_nmi_watchdog(NULL);
239 touch_nmi_watchdog();
240 }
241 return 0;
242}
243
244static struct sysdev_class nmi_sysclass = {
245 .name = "lapic_nmi",
246 .resume = lapic_nmi_resume,
247 .suspend = lapic_nmi_suspend,
248};
249
250static struct sys_device device_lapic_nmi = {
251 .id = 0,
252 .cls = &nmi_sysclass,
253};
254
255static int __init init_lapic_nmi_sysfs(void)
256{
257 int error;
258
259 /*
260 * should really be a BUG_ON but b/c this is an
261 * init call, it just doesn't work. -dcz
262 */
263 if (nmi_watchdog != NMI_LOCAL_APIC)
264 return 0;
265
266 if (atomic_read(&nmi_active) < 0)
267 return 0;
268
269 error = sysdev_class_register(&nmi_sysclass);
270 if (!error)
271 error = sysdev_register(&device_lapic_nmi);
272 return error;
273}
274
275/* must come after the local APIC's device_initcall() */
276late_initcall(init_lapic_nmi_sysfs);
277
278#endif /* CONFIG_PM */
279
280static void __acpi_nmi_enable(void *__unused)
281{
282 apic_write(APIC_LVT0, APIC_DM_NMI);
283}
284
285/*
286 * Enable timer based NMIs on all CPUs:
287 */
288void acpi_nmi_enable(void)
289{
290 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
291 on_each_cpu(__acpi_nmi_enable, NULL, 1);
292}
293
294/*
295 * Disable timer based NMIs on all CPUs:
296 */
297void acpi_nmi_disable(void)
298{
299 if (atomic_read(&nmi_active) && nmi_watchdog == NMI_IO_APIC)
300 on_each_cpu(__acpi_nmi_disable, NULL, 1);
301}
302
303/*
304 * This function is called as soon the LAPIC NMI watchdog driver has everything
305 * in place and it's ready to check if the NMIs belong to the NMI watchdog
306 */
307void cpu_nmi_set_wd_enabled(void)
308{
309 __get_cpu_var(wd_enabled) = 1;
310}
311
312void setup_apic_nmi_watchdog(void *unused)
313{
314 if (__get_cpu_var(wd_enabled))
315 return;
316
317 /* cheap hack to support suspend/resume */
318 /* if cpu0 is not active neither should the other cpus */
319 if (smp_processor_id() != 0 && atomic_read(&nmi_active) <= 0)
320 return;
321
322 switch (nmi_watchdog) {
323 case NMI_LOCAL_APIC:
324 if (lapic_watchdog_init(nmi_hz) < 0) {
325 __get_cpu_var(wd_enabled) = 0;
326 return;
327 }
328 /* FALL THROUGH */
329 case NMI_IO_APIC:
330 __get_cpu_var(wd_enabled) = 1;
331 atomic_inc(&nmi_active);
332 }
333}
334
335void stop_apic_nmi_watchdog(void *unused)
336{
337 /* only support LOCAL and IO APICs for now */
338 if (!nmi_watchdog_active())
339 return;
340 if (__get_cpu_var(wd_enabled) == 0)
341 return;
342 if (nmi_watchdog == NMI_LOCAL_APIC)
343 lapic_watchdog_stop();
344 else
345 __acpi_nmi_disable(NULL);
346 __get_cpu_var(wd_enabled) = 0;
347 atomic_dec(&nmi_active);
348}
349
350/*
351 * the best way to detect whether a CPU has a 'hard lockup' problem
 352 * is to check its local APIC timer IRQ counts. If they are not
353 * changing then that CPU has some problem.
354 *
355 * as these watchdog NMI IRQs are generated on every CPU, we only
356 * have to check the current processor.
357 *
358 * since NMIs don't listen to _any_ locks, we have to be extremely
359 * careful not to rely on unsafe variables. The printk might lock
360 * up though, so we have to break up any console locks first ...
361 * [when there will be more tty-related locks, break them up here too!]
362 */
363
364static DEFINE_PER_CPU(unsigned, last_irq_sum);
365static DEFINE_PER_CPU(long, alert_counter);
366static DEFINE_PER_CPU(int, nmi_touch);
367
368void touch_nmi_watchdog(void)
369{
370 if (nmi_watchdog_active()) {
371 unsigned cpu;
372
373 /*
374 * Tell other CPUs to reset their alert counters. We cannot
375 * do it ourselves because the alert count increase is not
376 * atomic.
377 */
378 for_each_present_cpu(cpu) {
379 if (per_cpu(nmi_touch, cpu) != 1)
380 per_cpu(nmi_touch, cpu) = 1;
381 }
382 }
383
384 /*
385 * Tickle the softlockup detector too:
386 */
387 touch_softlockup_watchdog();
388}
389EXPORT_SYMBOL(touch_nmi_watchdog);
390
391notrace __kprobes int
392nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
393{
394 /*
395 * Since current_thread_info()-> is always on the stack, and we
396 * always switch the stack NMI-atomically, it's safe to use
397 * smp_processor_id().
398 */
399 unsigned int sum;
400 int touched = 0;
401 int cpu = smp_processor_id();
402 int rc = 0;
403
404 sum = get_timer_irqs(cpu);
405
406 if (__get_cpu_var(nmi_touch)) {
407 __get_cpu_var(nmi_touch) = 0;
408 touched = 1;
409 }
410
411 /* We can be called before check_nmi_watchdog, hence NULL check. */
412 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
413 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
414
415 raw_spin_lock(&lock);
416 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
417 show_regs(regs);
418 dump_stack();
419 raw_spin_unlock(&lock);
420 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
421
422 rc = 1;
423 }
424
425 /* Could check oops_in_progress here too, but it's safer not to */
426 if (mce_in_progress())
427 touched = 1;
428
 429 /* if none of the timers is firing, this cpu isn't doing much */
430 if (!touched && __get_cpu_var(last_irq_sum) == sum) {
431 /*
432 * Ayiee, looks like this CPU is stuck ...
433 * wait a few IRQs (5 seconds) before doing the oops ...
434 */
435 __this_cpu_inc(alert_counter);
436 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
437 /*
438 * die_nmi will return ONLY if NOTIFY_STOP happens..
439 */
440 die_nmi("BUG: NMI Watchdog detected LOCKUP",
441 regs, panic_on_timeout);
442 } else {
443 __get_cpu_var(last_irq_sum) = sum;
444 __this_cpu_write(alert_counter, 0);
445 }
446
447 /* see if the nmi watchdog went off */
448 if (!__get_cpu_var(wd_enabled))
449 return rc;
450 switch (nmi_watchdog) {
451 case NMI_LOCAL_APIC:
452 rc |= lapic_wd_event(nmi_hz);
453 break;
454 case NMI_IO_APIC:
455 /*
456 * don't know how to accurately check for this.
457 * just assume it was a watchdog timer interrupt
458 * This matches the old behaviour.
459 */
460 rc = 1;
461 break;
462 }
463 return rc;
464}
465
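For illustration only: a userspace model of the per-CPU lockup heuristic that nmi_watchdog_tick() in the removed nmi.c implemented. If a CPU's timer-interrupt count stops changing for roughly five seconds' worth of NMI ticks, the CPU is considered stuck. NMI_HZ, the fake IRQ feed and the return code are illustrative stand-ins for the kernel's nmi_hz, get_timer_irqs() and die_nmi().

#include <stdio.h>

#define NMI_HZ 2                    /* watchdog ticks per second in this model */

static unsigned int last_sum;
static unsigned int alert_counter;

static int watchdog_tick(unsigned int timer_irq_sum)
{
    if (timer_irq_sum == last_sum) {
        /* no timer interrupts since the previous NMI: looks stuck */
        if (++alert_counter >= 5 * NMI_HZ)
            return 1;               /* the kernel would call die_nmi() here */
    } else {
        last_sum = timer_irq_sum;
        alert_counter = 0;          /* progress seen, start counting again */
    }
    return 0;
}

int main(void)
{
    unsigned int irqs = 100, tick;

    for (tick = 0; tick < 20; tick++) {
        if (tick < 5)
            irqs += 10;             /* healthy phase: the timer keeps firing */
        if (watchdog_tick(irqs)) {
            printf("lockup detected at tick %u\n", tick);
            return 0;
        }
    }
    return 0;
}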
466#ifdef CONFIG_SYSCTL
467
468static void enable_ioapic_nmi_watchdog_single(void *unused)
469{
470 __get_cpu_var(wd_enabled) = 1;
471 atomic_inc(&nmi_active);
472 __acpi_nmi_enable(NULL);
473}
474
475static void enable_ioapic_nmi_watchdog(void)
476{
477 on_each_cpu(enable_ioapic_nmi_watchdog_single, NULL, 1);
478 touch_nmi_watchdog();
479}
480
481static void disable_ioapic_nmi_watchdog(void)
482{
483 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
484}
485
486static int __init setup_unknown_nmi_panic(char *str)
487{
488 unknown_nmi_panic = 1;
489 return 1;
490}
491__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
492
493static int unknown_nmi_panic_callback(struct pt_regs *regs, int cpu)
494{
495 unsigned char reason = get_nmi_reason();
496 char buf[64];
497
498 sprintf(buf, "NMI received for unknown reason %02x\n", reason);
499 die_nmi(buf, regs, 1); /* Always panic here */
500 return 0;
501}
502
503/*
504 * proc handler for /proc/sys/kernel/nmi
505 */
506int proc_nmi_enabled(struct ctl_table *table, int write,
507 void __user *buffer, size_t *length, loff_t *ppos)
508{
509 int old_state;
510
511 nmi_watchdog_enabled = (atomic_read(&nmi_active) > 0) ? 1 : 0;
512 old_state = nmi_watchdog_enabled;
513 proc_dointvec(table, write, buffer, length, ppos);
514 if (!!old_state == !!nmi_watchdog_enabled)
515 return 0;
516
517 if (atomic_read(&nmi_active) < 0 || !nmi_watchdog_active()) {
518 printk(KERN_WARNING
519 "NMI watchdog is permanently disabled\n");
520 return -EIO;
521 }
522
523 if (nmi_watchdog == NMI_LOCAL_APIC) {
524 if (nmi_watchdog_enabled)
525 enable_lapic_nmi_watchdog();
526 else
527 disable_lapic_nmi_watchdog();
528 } else if (nmi_watchdog == NMI_IO_APIC) {
529 if (nmi_watchdog_enabled)
530 enable_ioapic_nmi_watchdog();
531 else
532 disable_ioapic_nmi_watchdog();
533 } else {
534 printk(KERN_WARNING
535 "NMI watchdog doesn't know what hardware to touch\n");
536 return -EIO;
537 }
538 return 0;
539}
540
541#endif /* CONFIG_SYSCTL */
542
543int do_nmi_callback(struct pt_regs *regs, int cpu)
544{
545#ifdef CONFIG_SYSCTL
546 if (unknown_nmi_panic)
547 return unknown_nmi_panic_callback(regs, cpu);
548#endif
549 return 0;
550}
551
552void arch_trigger_all_cpu_backtrace(void)
553{
554 int i;
555
556 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
557
558 printk(KERN_INFO "sending NMI to all CPUs:\n");
559 apic->send_IPI_all(NMI_VECTOR);
560
561 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
562 for (i = 0; i < 10 * 1000; i++) {
563 if (cpumask_empty(to_cpumask(backtrace_mask)))
564 break;
565 mdelay(1);
566 }
567}
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index 3e28401f161c..c4a61ca1349a 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -26,6 +26,7 @@
26#include <linux/nodemask.h> 26#include <linux/nodemask.h>
27#include <linux/topology.h> 27#include <linux/topology.h>
28#include <linux/bootmem.h> 28#include <linux/bootmem.h>
29#include <linux/memblock.h>
29#include <linux/threads.h> 30#include <linux/threads.h>
30#include <linux/cpumask.h> 31#include <linux/cpumask.h>
31#include <linux/kernel.h> 32#include <linux/kernel.h>
@@ -47,8 +48,6 @@
47#include <asm/e820.h> 48#include <asm/e820.h>
48#include <asm/ipi.h> 49#include <asm/ipi.h>
49 50
50#define MB_TO_PAGES(addr) ((addr) << (20 - PAGE_SHIFT))
51
52int found_numaq; 51int found_numaq;
53 52
54/* 53/*
@@ -78,31 +77,20 @@ int quad_local_to_mp_bus_id[NR_CPUS/4][4];
78static inline void numaq_register_node(int node, struct sys_cfg_data *scd) 77static inline void numaq_register_node(int node, struct sys_cfg_data *scd)
79{ 78{
80 struct eachquadmem *eq = scd->eq + node; 79 struct eachquadmem *eq = scd->eq + node;
80 u64 start = (u64)(eq->hi_shrd_mem_start - eq->priv_mem_size) << 20;
81 u64 end = (u64)(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size) << 20;
82 int ret;
81 83
82 node_set_online(node); 84 node_set(node, numa_nodes_parsed);
83 85 ret = numa_add_memblk(node, start, end);
84 /* Convert to pages */ 86 BUG_ON(ret < 0);
85 node_start_pfn[node] =
86 MB_TO_PAGES(eq->hi_shrd_mem_start - eq->priv_mem_size);
87
88 node_end_pfn[node] =
89 MB_TO_PAGES(eq->hi_shrd_mem_start + eq->hi_shrd_mem_size);
90
91 e820_register_active_regions(node, node_start_pfn[node],
92 node_end_pfn[node]);
93
94 memory_present(node, node_start_pfn[node], node_end_pfn[node]);
95
96 node_remap_size[node] = node_memmap_size_bytes(node,
97 node_start_pfn[node],
98 node_end_pfn[node]);
99} 87}
100 88
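For illustration only: the quad config table read above stores node memory boundaries in megabytes; the new code converts them to byte addresses with "<< 20", while the removed MB_TO_PAGES() produced page frame numbers with "<< (20 - PAGE_SHIFT)". A tiny userspace sketch of that arithmetic with made-up values.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12               /* 4 KiB pages on x86 */

int main(void)
{
    uint64_t hi_shrd_mem_start = 1024;   /* MB, hypothetical config-table values */
    uint64_t hi_shrd_mem_size  = 2048;   /* MB */
    uint64_t priv_mem_size     = 256;    /* MB */

    uint64_t start = (hi_shrd_mem_start - priv_mem_size) << 20;        /* bytes */
    uint64_t end   = (hi_shrd_mem_start + hi_shrd_mem_size) << 20;     /* bytes */

    printf("node range: 0x%" PRIx64 " - 0x%" PRIx64 "\n", start, end);
    printf("pfn range : %" PRIu64 " - %" PRIu64 "\n",
           start >> PAGE_SHIFT, end >> PAGE_SHIFT);
    return 0;
}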
101/* 89/*
102 * Function: smp_dump_qct() 90 * Function: smp_dump_qct()
103 * 91 *
104 * Description: gets memory layout from the quad config table. This 92 * Description: gets memory layout from the quad config table. This
105 * function also updates node_online_map with the nodes (quads) present. 93 * function also updates numa_nodes_parsed with the nodes (quads) present.
106 */ 94 */
107static void __init smp_dump_qct(void) 95static void __init smp_dump_qct(void)
108{ 96{
@@ -111,7 +99,6 @@ static void __init smp_dump_qct(void)
111 99
112 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR); 100 scd = (void *)__va(SYS_CFG_DATA_PRIV_ADDR);
113 101
114 nodes_clear(node_online_map);
115 for_each_node(node) { 102 for_each_node(node) {
116 if (scd->quads_present31_0 & (1 << node)) 103 if (scd->quads_present31_0 & (1 << node))
117 numaq_register_node(node, scd); 104 numaq_register_node(node, scd);
@@ -281,14 +268,14 @@ static __init void early_check_numaq(void)
281 } 268 }
282} 269}
283 270
284int __init get_memcfg_numaq(void) 271int __init numaq_numa_init(void)
285{ 272{
286 early_check_numaq(); 273 early_check_numaq();
287 if (!found_numaq) 274 if (!found_numaq)
288 return 0; 275 return -ENOENT;
289 smp_dump_qct(); 276 smp_dump_qct();
290 277
291 return 1; 278 return 0;
292} 279}
293 280
294#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER) 281#define NUMAQ_APIC_DFR_VALUE (APIC_DFR_CLUSTER)
@@ -372,13 +359,6 @@ static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask
372 return physids_promote(0xFUL, retmap); 359 return physids_promote(0xFUL, retmap);
373} 360}
374 361
375static inline int numaq_cpu_to_logical_apicid(int cpu)
376{
377 if (cpu >= nr_cpu_ids)
378 return BAD_APICID;
379 return cpu_2_logical_apicid[cpu];
380}
381
382/* 362/*
383 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent 363 * Supporting over 60 cpus on NUMA-Q requires a locality-dependent
384 * cpu to APIC ID relation to properly interact with the intelligent 364 * cpu to APIC ID relation to properly interact with the intelligent
@@ -397,6 +377,15 @@ static inline int numaq_apicid_to_node(int logical_apicid)
397 return logical_apicid >> 4; 377 return logical_apicid >> 4;
398} 378}
399 379
380static int numaq_numa_cpu_node(int cpu)
381{
382 int logical_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
383
384 if (logical_apicid != BAD_APICID)
385 return numaq_apicid_to_node(logical_apicid);
386 return NUMA_NO_NODE;
387}
388
400static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap) 389static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
401{ 390{
402 int node = numaq_apicid_to_node(logical_apicid); 391 int node = numaq_apicid_to_node(logical_apicid);
@@ -483,8 +472,8 @@ static void numaq_setup_portio_remap(void)
483 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD); 472 (u_long) xquad_portio, (u_long) num_quads*XQUAD_PORTIO_QUAD);
484} 473}
485 474
486/* Use __refdata to keep false positive warning calm. */ 475/* Use __refdata to keep false positive warning calm. */
487struct apic __refdata apic_numaq = { 476static struct apic __refdata apic_numaq = {
488 477
489 .name = "NUMAQ", 478 .name = "NUMAQ",
490 .probe = probe_numaq, 479 .probe = probe_numaq,
@@ -507,8 +496,6 @@ struct apic __refdata apic_numaq = {
507 .ioapic_phys_id_map = numaq_ioapic_phys_id_map, 496 .ioapic_phys_id_map = numaq_ioapic_phys_id_map,
508 .setup_apic_routing = numaq_setup_apic_routing, 497 .setup_apic_routing = numaq_setup_apic_routing,
509 .multi_timer_check = numaq_multi_timer_check, 498 .multi_timer_check = numaq_multi_timer_check,
510 .apicid_to_node = numaq_apicid_to_node,
511 .cpu_to_logical_apicid = numaq_cpu_to_logical_apicid,
512 .cpu_present_to_apicid = numaq_cpu_present_to_apicid, 499 .cpu_present_to_apicid = numaq_cpu_present_to_apicid,
513 .apicid_to_cpu_present = numaq_apicid_to_cpu_present, 500 .apicid_to_cpu_present = numaq_apicid_to_cpu_present,
514 .setup_portio_remap = numaq_setup_portio_remap, 501 .setup_portio_remap = numaq_setup_portio_remap,
@@ -546,4 +533,9 @@ struct apic __refdata apic_numaq = {
546 .icr_write = native_apic_icr_write, 533 .icr_write = native_apic_icr_write,
547 .wait_icr_idle = native_apic_wait_icr_idle, 534 .wait_icr_idle = native_apic_wait_icr_idle,
548 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 535 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
536
537 .x86_32_early_logical_apicid = noop_x86_32_early_logical_apicid,
538 .x86_32_numa_cpu_node = numaq_numa_cpu_node,
549}; 539};
540
541apic_driver(apic_numaq);
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 99d2fe016084..b5254ad044ab 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,29 +52,9 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void __init default_setup_apic_routing(void) 55static int default_x86_32_early_logical_apicid(int cpu)
56{ 56{
57 int version = apic_version[boot_cpu_physical_apicid]; 57 return 1 << cpu;
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78} 58}
79 59
80static void setup_apic_flat_routing(void) 60static void setup_apic_flat_routing(void)
@@ -107,7 +87,7 @@ static int probe_default(void)
107 return 1; 87 return 1;
108} 88}
109 89
110struct apic apic_default = { 90static struct apic apic_default = {
111 91
112 .name = "default", 92 .name = "default",
113 .probe = probe_default, 93 .probe = probe_default,
@@ -130,8 +110,6 @@ struct apic apic_default = {
130 .ioapic_phys_id_map = default_ioapic_phys_id_map, 110 .ioapic_phys_id_map = default_ioapic_phys_id_map,
131 .setup_apic_routing = setup_apic_flat_routing, 111 .setup_apic_routing = setup_apic_flat_routing,
132 .multi_timer_check = NULL, 112 .multi_timer_check = NULL,
133 .apicid_to_node = default_apicid_to_node,
134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
135 .cpu_present_to_apicid = default_cpu_present_to_apicid, 113 .cpu_present_to_apicid = default_cpu_present_to_apicid,
136 .apicid_to_cpu_present = physid_set_mask_of_physid, 114 .apicid_to_cpu_present = physid_set_mask_of_physid,
137 .setup_portio_remap = NULL, 115 .setup_portio_remap = NULL,
@@ -167,46 +145,26 @@ struct apic apic_default = {
167 .icr_write = native_apic_icr_write, 145 .icr_write = native_apic_icr_write,
168 .wait_icr_idle = native_apic_wait_icr_idle, 146 .wait_icr_idle = native_apic_wait_icr_idle,
169 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 147 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
148
149 .x86_32_early_logical_apicid = default_x86_32_early_logical_apicid,
170}; 150};
171 151
172extern struct apic apic_numaq; 152apic_driver(apic_default);
173extern struct apic apic_summit;
174extern struct apic apic_bigsmp;
175extern struct apic apic_es7000;
176extern struct apic apic_es7000_cluster;
177 153
178struct apic *apic = &apic_default; 154struct apic *apic = &apic_default;
179EXPORT_SYMBOL_GPL(apic); 155EXPORT_SYMBOL_GPL(apic);
180 156
181static struct apic *apic_probe[] __initdata = {
182#ifdef CONFIG_X86_NUMAQ
183 &apic_numaq,
184#endif
185#ifdef CONFIG_X86_SUMMIT
186 &apic_summit,
187#endif
188#ifdef CONFIG_X86_BIGSMP
189 &apic_bigsmp,
190#endif
191#ifdef CONFIG_X86_ES7000
192 &apic_es7000,
193 &apic_es7000_cluster,
194#endif
195 &apic_default, /* must be last */
196 NULL,
197};
198
199static int cmdline_apic __initdata; 157static int cmdline_apic __initdata;
200static int __init parse_apic(char *arg) 158static int __init parse_apic(char *arg)
201{ 159{
202 int i; 160 struct apic **drv;
203 161
204 if (!arg) 162 if (!arg)
205 return -EINVAL; 163 return -EINVAL;
206 164
207 for (i = 0; apic_probe[i]; i++) { 165 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
208 if (!strcmp(apic_probe[i]->name, arg)) { 166 if (!strcmp((*drv)->name, arg)) {
209 apic = apic_probe[i]; 167 apic = *drv;
210 cmdline_apic = 1; 168 cmdline_apic = 1;
211 return 0; 169 return 0;
212 } 170 }
@@ -217,38 +175,58 @@ static int __init parse_apic(char *arg)
217} 175}
218early_param("apic", parse_apic); 176early_param("apic", parse_apic);
219 177
220void __init generic_bigsmp_probe(void) 178void __init default_setup_apic_routing(void)
221{ 179{
180 int version = apic_version[boot_cpu_physical_apicid];
181
182 if (num_possible_cpus() > 8) {
183 switch (boot_cpu_data.x86_vendor) {
184 case X86_VENDOR_INTEL:
185 if (!APIC_XAPIC(version)) {
186 def_to_bigsmp = 0;
187 break;
188 }
189 /* If P4 and above fall through */
190 case X86_VENDOR_AMD:
191 def_to_bigsmp = 1;
192 }
193 }
194
222#ifdef CONFIG_X86_BIGSMP 195#ifdef CONFIG_X86_BIGSMP
223 /* 196 /*
224 * This routine is used to switch to bigsmp mode when 197 * This is used to switch to bigsmp mode when
225 * - There is no apic= option specified by the user 198 * - There is no apic= option specified by the user
226 * - generic_apic_probe() has chosen apic_default as the sub_arch 199 * - generic_apic_probe() has chosen apic_default as the sub_arch
227 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support 200 * - we find more than 8 CPUs in acpi LAPIC listing with xAPIC support
228 */ 201 */
229 202
230 if (!cmdline_apic && apic == &apic_default) { 203 if (!cmdline_apic && apic == &apic_default) {
231 if (apic_bigsmp.probe()) { 204 struct apic *bigsmp = generic_bigsmp_probe();
232 apic = &apic_bigsmp; 205 if (bigsmp) {
206 apic = bigsmp;
233 printk(KERN_INFO "Overriding APIC driver with %s\n", 207 printk(KERN_INFO "Overriding APIC driver with %s\n",
234 apic->name); 208 apic->name);
235 } 209 }
236 } 210 }
237#endif 211#endif
212
213 if (apic->setup_apic_routing)
214 apic->setup_apic_routing();
238} 215}
239 216
240void __init generic_apic_probe(void) 217void __init generic_apic_probe(void)
241{ 218{
242 if (!cmdline_apic) { 219 if (!cmdline_apic) {
243 int i; 220 struct apic **drv;
244 for (i = 0; apic_probe[i]; i++) { 221
245 if (apic_probe[i]->probe()) { 222 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
246 apic = apic_probe[i]; 223 if ((*drv)->probe()) {
224 apic = *drv;
247 break; 225 break;
248 } 226 }
249 } 227 }
250 /* Not visible without early console */ 228 /* Not visible without early console */
251 if (!apic_probe[i]) 229 if (drv == __apicdrivers_end)
252 panic("Didn't find an APIC driver"); 230 panic("Didn't find an APIC driver");
253 } 231 }
254 printk(KERN_INFO "Using APIC driver %s\n", apic->name); 232 printk(KERN_INFO "Using APIC driver %s\n", apic->name);
@@ -259,16 +237,16 @@ void __init generic_apic_probe(void)
259int __init 237int __init
260generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid) 238generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
261{ 239{
262 int i; 240 struct apic **drv;
263 241
264 for (i = 0; apic_probe[i]; ++i) { 242 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
265 if (!apic_probe[i]->mps_oem_check) 243 if (!((*drv)->mps_oem_check))
266 continue; 244 continue;
267 if (!apic_probe[i]->mps_oem_check(mpc, oem, productid)) 245 if (!(*drv)->mps_oem_check(mpc, oem, productid))
268 continue; 246 continue;
269 247
270 if (!cmdline_apic) { 248 if (!cmdline_apic) {
271 apic = apic_probe[i]; 249 apic = *drv;
272 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 250 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
273 apic->name); 251 apic->name);
274 } 252 }
@@ -279,16 +257,16 @@ generic_mps_oem_check(struct mpc_table *mpc, char *oem, char *productid)
279 257
280int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 258int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
281{ 259{
282 int i; 260 struct apic **drv;
283 261
284 for (i = 0; apic_probe[i]; ++i) { 262 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
285 if (!apic_probe[i]->acpi_madt_oem_check) 263 if (!(*drv)->acpi_madt_oem_check)
286 continue; 264 continue;
287 if (!apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) 265 if (!(*drv)->acpi_madt_oem_check(oem_id, oem_table_id))
288 continue; 266 continue;
289 267
290 if (!cmdline_apic) { 268 if (!cmdline_apic) {
291 apic = apic_probe[i]; 269 apic = *drv;
292 printk(KERN_INFO "Switched to APIC driver `%s'.\n", 270 printk(KERN_INFO "Switched to APIC driver `%s'.\n",
293 apic->name); 271 apic->name);
294 } 272 }
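For illustration only: the probe loops above now walk the __apicdrivers table (which, in the kernel, apic_driver() populates through a dedicated linker section), selecting the first driver whose probe() or OEM check succeeds. A userspace sketch of that selection pattern; drivers[], probe_fancy() and probe_default() are made up.

#include <stdio.h>

struct demo_apic {
    const char *name;
    int (*probe)(void);
};

static int probe_fancy(void)   { return 0; }   /* hardware not present */
static int probe_default(void) { return 1; }   /* always matches, placed last */

static const struct demo_apic drivers[] = {
    { "fancy",   probe_fancy   },
    { "default", probe_default },
};
#define NR_DRIVERS (sizeof(drivers) / sizeof(drivers[0]))

int main(void)
{
    const struct demo_apic *chosen = NULL;
    unsigned int i;

    for (i = 0; i < NR_DRIVERS; i++)
        if (drivers[i].probe()) {
            chosen = &drivers[i];
            break;
        }

    if (!chosen) {
        fprintf(stderr, "Didn't find an APIC driver\n");
        return 1;
    }
    printf("Using APIC driver %s\n", chosen->name);
    return 0;
}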
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index 83e9be4778e2..3fe986698929 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -23,27 +23,6 @@
23#include <asm/ipi.h> 23#include <asm/ipi.h>
24#include <asm/setup.h> 24#include <asm/setup.h>
25 25
26extern struct apic apic_flat;
27extern struct apic apic_physflat;
28extern struct apic apic_x2xpic_uv_x;
29extern struct apic apic_x2apic_phys;
30extern struct apic apic_x2apic_cluster;
31
32struct apic __read_mostly *apic = &apic_flat;
33EXPORT_SYMBOL_GPL(apic);
34
35static struct apic *apic_probe[] __initdata = {
36#ifdef CONFIG_X86_UV
37 &apic_x2apic_uv_x,
38#endif
39#ifdef CONFIG_X86_X2APIC
40 &apic_x2apic_phys,
41 &apic_x2apic_cluster,
42#endif
43 &apic_physflat,
44 NULL,
45};
46
47static int apicid_phys_pkg_id(int initial_apic_id, int index_msb) 26static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
48{ 27{
49 return hard_smp_processor_id() >> index_msb; 28 return hard_smp_processor_id() >> index_msb;
@@ -54,35 +33,25 @@ static int apicid_phys_pkg_id(int initial_apic_id, int index_msb)
54 */ 33 */
55void __init default_setup_apic_routing(void) 34void __init default_setup_apic_routing(void)
56{ 35{
57#ifdef CONFIG_X86_X2APIC 36 struct apic **drv;
58 if (x2apic_mode
59#ifdef CONFIG_X86_UV
60 && apic != &apic_x2apic_uv_x
61#endif
62 ) {
63 if (x2apic_phys)
64 apic = &apic_x2apic_phys;
65 else
66 apic = &apic_x2apic_cluster;
67 }
68#endif
69 37
70 if (apic == &apic_flat && num_possible_cpus() > 8) 38 enable_IR_x2apic();
71 apic = &apic_physflat;
72 39
73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 40 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
41 if ((*drv)->probe && (*drv)->probe()) {
42 if (apic != *drv) {
43 apic = *drv;
44 pr_info("Switched APIC routing to %s.\n",
45 apic->name);
46 }
47 break;
48 }
49 }
74 50
75 if (is_vsmp_box()) { 51 if (is_vsmp_box()) {
76 /* need to update phys_pkg_id */ 52 /* need to update phys_pkg_id */
77 apic->phys_pkg_id = apicid_phys_pkg_id; 53 apic->phys_pkg_id = apicid_phys_pkg_id;
78 } 54 }
79
80 /*
81 * Now that apic routing model is selected, configure the
82 * fault handling for intr remapping.
83 */
84 if (intr_remapping_enabled)
85 enable_drhd_fault_handling();
86} 55}
87 56
88/* Same for both flat and physical. */ 57/* Same for both flat and physical. */
@@ -94,13 +63,15 @@ void apic_send_IPI_self(int vector)
94 63
95int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 64int __init default_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
96{ 65{
97 int i; 66 struct apic **drv;
98 67
99 for (i = 0; apic_probe[i]; ++i) { 68 for (drv = __apicdrivers; drv < __apicdrivers_end; drv++) {
100 if (apic_probe[i]->acpi_madt_oem_check(oem_id, oem_table_id)) { 69 if ((*drv)->acpi_madt_oem_check(oem_id, oem_table_id)) {
101 apic = apic_probe[i]; 70 if (apic != *drv) {
102 printk(KERN_INFO "Setting APIC routing to %s.\n", 71 apic = *drv;
103 apic->name); 72 pr_info("Setting APIC routing to %s.\n",
73 apic->name);
74 }
104 return 1; 75 return 1;
105 } 76 }
106 } 77 }
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 9b419263d90d..19114423c58c 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -194,11 +194,10 @@ static unsigned long summit_check_apicid_present(int bit)
194 return 1; 194 return 1;
195} 195}
196 196
197static void summit_init_apic_ldr(void) 197static int summit_early_logical_apicid(int cpu)
198{ 198{
199 unsigned long val, id;
200 int count = 0; 199 int count = 0;
201 u8 my_id = (u8)hard_smp_processor_id(); 200 u8 my_id = early_per_cpu(x86_cpu_to_apicid, cpu);
202 u8 my_cluster = APIC_CLUSTER(my_id); 201 u8 my_cluster = APIC_CLUSTER(my_id);
203#ifdef CONFIG_SMP 202#ifdef CONFIG_SMP
204 u8 lid; 203 u8 lid;
@@ -206,7 +205,7 @@ static void summit_init_apic_ldr(void)
206 205
207 /* Create logical APIC IDs by counting CPUs already in cluster. */ 206 /* Create logical APIC IDs by counting CPUs already in cluster. */
208 for (count = 0, i = nr_cpu_ids; --i >= 0; ) { 207 for (count = 0, i = nr_cpu_ids; --i >= 0; ) {
209 lid = cpu_2_logical_apicid[i]; 208 lid = early_per_cpu(x86_cpu_to_logical_apicid, i);
210 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster) 209 if (lid != BAD_APICID && APIC_CLUSTER(lid) == my_cluster)
211 ++count; 210 ++count;
212 } 211 }
@@ -214,7 +213,15 @@ static void summit_init_apic_ldr(void)
214 /* We only have a 4 wide bitmap in cluster mode. If a deranged 213 /* We only have a 4 wide bitmap in cluster mode. If a deranged
215 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */ 214 * BIOS puts 5 CPUs in one APIC cluster, we're hosed. */
216 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT); 215 BUG_ON(count >= XAPIC_DEST_CPUS_SHIFT);
217 id = my_cluster | (1UL << count); 216 return my_cluster | (1UL << count);
217}
218
219static void summit_init_apic_ldr(void)
220{
221 int cpu = smp_processor_id();
222 unsigned long id = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
223 unsigned long val;
224
218 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE); 225 apic_write(APIC_DFR, SUMMIT_APIC_DFR_VALUE);
219 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK; 226 val = apic_read(APIC_LDR) & ~APIC_LDR_MASK;
220 val |= SET_APIC_LOGICAL_ID(id); 227 val |= SET_APIC_LOGICAL_ID(id);
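For illustration only: summit_early_logical_apicid() above gives each CPU the next free bit of its cluster's 4-wide logical-ID bitmap by counting how many CPUs of that cluster were numbered before it. A userspace sketch of that assignment; APIC_CLUSTER(), the CPU table and the limit of 4 (the kernel checks XAPIC_DEST_CPUS_SHIFT) are simplified stand-ins.

#include <assert.h>
#include <stdio.h>

#define APIC_CLUSTER(id)   ((id) & 0xf0)   /* high nibble selects the cluster */
#define NR_DEMO_CPUS       4

static unsigned int logical_id[NR_DEMO_CPUS];

static unsigned int assign_logical_id(unsigned int cpu, unsigned int phys_id)
{
    unsigned int cluster = APIC_CLUSTER(phys_id);
    unsigned int count = 0, i;

    for (i = 0; i < cpu; i++)               /* CPUs assigned before this one */
        if (APIC_CLUSTER(logical_id[i]) == cluster)
            count++;

    assert(count < 4);                      /* only 4 bits per cluster in the LDR */
    logical_id[cpu] = cluster | (1u << count);
    return logical_id[cpu];
}

int main(void)
{
    unsigned int phys[NR_DEMO_CPUS] = { 0x10, 0x11, 0x12, 0x20 };
    unsigned int cpu;

    for (cpu = 0; cpu < NR_DEMO_CPUS; cpu++)
        printf("cpu %u -> logical 0x%02x\n", cpu, assign_logical_id(cpu, phys[cpu]));
    return 0;
}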
@@ -232,27 +239,6 @@ static void summit_setup_apic_routing(void)
232 nr_ioapics); 239 nr_ioapics);
233} 240}
234 241
235static int summit_apicid_to_node(int logical_apicid)
236{
237#ifdef CONFIG_SMP
238 return apicid_2_node[hard_smp_processor_id()];
239#else
240 return 0;
241#endif
242}
243
244/* Mapping from cpu number to logical apicid */
245static inline int summit_cpu_to_logical_apicid(int cpu)
246{
247#ifdef CONFIG_SMP
248 if (cpu >= nr_cpu_ids)
249 return BAD_APICID;
250 return cpu_2_logical_apicid[cpu];
251#else
252 return logical_smp_processor_id();
253#endif
254}
255
256static int summit_cpu_present_to_apicid(int mps_cpu) 242static int summit_cpu_present_to_apicid(int mps_cpu)
257{ 243{
258 if (mps_cpu < nr_cpu_ids) 244 if (mps_cpu < nr_cpu_ids)
@@ -286,7 +272,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
286 * The cpus in the mask must all be on the apic cluster. 272 * The cpus in the mask must all be on the apic cluster.
287 */ 273 */
288 for_each_cpu(cpu, cpumask) { 274 for_each_cpu(cpu, cpumask) {
289 int new_apicid = summit_cpu_to_logical_apicid(cpu); 275 int new_apicid = early_per_cpu(x86_cpu_to_logical_apicid, cpu);
290 276
291 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) { 277 if (round && APIC_CLUSTER(apicid) != APIC_CLUSTER(new_apicid)) {
292 printk("%s: Not a valid mask!\n", __func__); 278 printk("%s: Not a valid mask!\n", __func__);
@@ -301,7 +287,7 @@ static unsigned int summit_cpu_mask_to_apicid(const struct cpumask *cpumask)
301static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask, 287static unsigned int summit_cpu_mask_to_apicid_and(const struct cpumask *inmask,
302 const struct cpumask *andmask) 288 const struct cpumask *andmask)
303{ 289{
304 int apicid = summit_cpu_to_logical_apicid(0); 290 int apicid = early_per_cpu(x86_cpu_to_logical_apicid, 0);
305 cpumask_var_t cpumask; 291 cpumask_var_t cpumask;
306 292
307 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC)) 293 if (!alloc_cpumask_var(&cpumask, GFP_ATOMIC))
@@ -505,7 +491,7 @@ void setup_summit(void)
505} 491}
506#endif 492#endif
507 493
508struct apic apic_summit = { 494static struct apic apic_summit = {
509 495
510 .name = "summit", 496 .name = "summit",
511 .probe = probe_summit, 497 .probe = probe_summit,
@@ -528,8 +514,6 @@ struct apic apic_summit = {
528 .ioapic_phys_id_map = summit_ioapic_phys_id_map, 514 .ioapic_phys_id_map = summit_ioapic_phys_id_map,
529 .setup_apic_routing = summit_setup_apic_routing, 515 .setup_apic_routing = summit_setup_apic_routing,
530 .multi_timer_check = NULL, 516 .multi_timer_check = NULL,
531 .apicid_to_node = summit_apicid_to_node,
532 .cpu_to_logical_apicid = summit_cpu_to_logical_apicid,
533 .cpu_present_to_apicid = summit_cpu_present_to_apicid, 517 .cpu_present_to_apicid = summit_cpu_present_to_apicid,
534 .apicid_to_cpu_present = summit_apicid_to_cpu_present, 518 .apicid_to_cpu_present = summit_apicid_to_cpu_present,
535 .setup_portio_remap = NULL, 519 .setup_portio_remap = NULL,
@@ -565,4 +549,8 @@ struct apic apic_summit = {
565 .icr_write = native_apic_icr_write, 549 .icr_write = native_apic_icr_write,
566 .wait_icr_idle = native_apic_wait_icr_idle, 550 .wait_icr_idle = native_apic_wait_icr_idle,
567 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, 551 .safe_wait_icr_idle = native_safe_apic_wait_icr_idle,
552
553 .x86_32_early_logical_apicid = summit_early_logical_apicid,
568}; 554};
555
556apic_driver(apic_summit);
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index cf69c59f4910..500795875827 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -5,118 +5,95 @@
5#include <linux/ctype.h> 5#include <linux/ctype.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8#include <linux/cpu.h>
8 9
9#include <asm/smp.h> 10#include <asm/smp.h>
10#include <asm/apic.h> 11#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 12
13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); 13static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid);
14static DEFINE_PER_CPU(cpumask_var_t, cpus_in_cluster);
15static DEFINE_PER_CPU(cpumask_var_t, ipi_mask);
14 16
15static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 17static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
16{ 18{
17 return x2apic_enabled(); 19 return x2apic_enabled();
18} 20}
19 21
20/* 22static inline u32 x2apic_cluster(int cpu)
21 * need to use more than cpu 0, because we need more vectors when
22 * MSI-X are used.
23 */
24static const struct cpumask *x2apic_target_cpus(void)
25{ 23{
26 return cpu_online_mask; 24 return per_cpu(x86_cpu_to_logical_apicid, cpu) >> 16;
27}
28
29/*
30 * for now each logical cpu is in its own vector allocation domain.
31 */
32static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
33{
34 cpumask_clear(retmask);
35 cpumask_set_cpu(cpu, retmask);
36} 25}
37 26
38static void 27static void
39 __x2apic_send_IPI_dest(unsigned int apicid, int vector, unsigned int dest) 28__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
40{ 29{
41 unsigned long cfg; 30 struct cpumask *cpus_in_cluster_ptr;
31 struct cpumask *ipi_mask_ptr;
32 unsigned int cpu, this_cpu;
33 unsigned long flags;
34 u32 dest;
35
36 x2apic_wrmsr_fence();
37
38 local_irq_save(flags);
42 39
43 cfg = __prepare_ICR(0, vector, dest); 40 this_cpu = smp_processor_id();
44 41
45 /* 42 /*
 46 * send the IPI. 43 * We are to modify mask, so we need our own copy
 44 * and be sure it is manipulated with irqs off.
47 */ 45 */
48 native_x2apic_icr_write(cfg, apicid); 46 ipi_mask_ptr = __raw_get_cpu_var(ipi_mask);
49} 47 cpumask_copy(ipi_mask_ptr, mask);
50 48
51/* 49 /*
52 * for now, we send the IPI's one by one in the cpumask. 50 * The idea is to send one IPI per cluster.
53 * TBD: Based on the cpu mask, we can send the IPI's to the cluster group 51 */
54 * at once. We have 16 cpu's in a cluster. This will minimize IPI register 52 for_each_cpu(cpu, ipi_mask_ptr) {
55 * writes. 53 unsigned long i;
56 */
57static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
58{
59 unsigned long query_cpu;
60 unsigned long flags;
61 54
62 x2apic_wrmsr_fence(); 55 cpus_in_cluster_ptr = per_cpu(cpus_in_cluster, cpu);
56 dest = 0;
63 57
64 local_irq_save(flags); 58 /* Collect cpus in cluster. */
65 for_each_cpu(query_cpu, mask) { 59 for_each_cpu_and(i, ipi_mask_ptr, cpus_in_cluster_ptr) {
66 __x2apic_send_IPI_dest( 60 if (apic_dest == APIC_DEST_ALLINC || i != this_cpu)
67 per_cpu(x86_cpu_to_logical_apicid, query_cpu), 61 dest |= per_cpu(x86_cpu_to_logical_apicid, i);
68 vector, apic->dest_logical); 62 }
63
64 if (!dest)
65 continue;
66
67 __x2apic_send_IPI_dest(dest, vector, apic->dest_logical);
68 /*
 69 * Cluster sibling cpus should be discarded now so
 70 * we do not send an IPI to them a second time.
71 */
72 cpumask_andnot(ipi_mask_ptr, ipi_mask_ptr, cpus_in_cluster_ptr);
69 } 73 }
74
70 local_irq_restore(flags); 75 local_irq_restore(flags);
71} 76}
72 77
78static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
79{
80 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
81}
82
73static void 83static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 84 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 85{
76 unsigned long this_cpu = smp_processor_id(); 86 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu == this_cpu)
85 continue;
86 __x2apic_send_IPI_dest(
87 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
88 vector, apic->dest_logical);
89 }
90 local_irq_restore(flags);
91} 87}
92 88
93static void x2apic_send_IPI_allbutself(int vector) 89static void x2apic_send_IPI_allbutself(int vector)
94{ 90{
95 unsigned long this_cpu = smp_processor_id(); 91 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
96 unsigned long query_cpu;
97 unsigned long flags;
98
99 x2apic_wrmsr_fence();
100
101 local_irq_save(flags);
102 for_each_online_cpu(query_cpu) {
103 if (query_cpu == this_cpu)
104 continue;
105 __x2apic_send_IPI_dest(
106 per_cpu(x86_cpu_to_logical_apicid, query_cpu),
107 vector, apic->dest_logical);
108 }
109 local_irq_restore(flags);
110} 92}
111 93
112static void x2apic_send_IPI_all(int vector) 94static void x2apic_send_IPI_all(int vector)
113{ 95{
114 x2apic_send_IPI_mask(cpu_online_mask, vector); 96 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
115}
116
117static int x2apic_apic_id_registered(void)
118{
119 return 1;
120} 97}
121 98
122static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 99static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
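The hunk above replaces the per-CPU IPI loop with cluster-wide coalescing: for each cluster represented in the target mask, the logical APIC IDs of that cluster's CPUs are OR-ed into a single destination, one IPI is sent, and the serviced siblings are removed from the working mask. A small stand-alone sketch of that coalescing, assuming the x2apic logical-ID layout (cluster in bits 31:16, CPU bit in bits 15:0) and with send_ipi() standing in for the ICR write:

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 8

static const uint32_t logical_id[NR_CPUS] = {
        /* cluster 0 */ 0x00000001, 0x00000002, 0x00000004, 0x00000008,
        /* cluster 1 */ 0x00010001, 0x00010002, 0x00010004, 0x00010008,
};

static uint32_t cluster_of(int cpu) { return logical_id[cpu] >> 16; }

static void send_ipi(uint32_t dest, int vector)
{
        printf("IPI vector 0x%x -> logical dest 0x%08x\n", vector, dest);
}

/* Send one IPI per cluster covering every CPU set in 'mask'. */
static void send_ipi_mask(uint32_t mask, int vector)
{
        while (mask) {
                int first = __builtin_ctz(mask);   /* GCC/Clang builtin: lowest set bit */
                uint32_t dest = 0;

                for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                        if (!(mask & (1u << cpu)))
                                continue;
                        if (cluster_of(cpu) != cluster_of(first))
                                continue;
                        dest |= logical_id[cpu];   /* collect cluster siblings   */
                        mask &= ~(1u << cpu);      /* do not IPI them twice      */
                }
                send_ipi(dest, vector);
        }
}

int main(void)
{
        send_ipi_mask(0x36, 0xfd);   /* cpus 1,2,4,5: two clusters, two IPIs */
        return 0;
}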
@@ -151,43 +128,90 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
151 return per_cpu(x86_cpu_to_logical_apicid, cpu); 128 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152} 129}
153 130
154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 131static void init_x2apic_ldr(void)
155{ 132{
156 unsigned int id; 133 unsigned int this_cpu = smp_processor_id();
134 unsigned int cpu;
157 135
158 id = x; 136 per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
159 return id; 137
138 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
139 for_each_online_cpu(cpu) {
140 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
141 continue;
142 __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
143 __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
144 }
160} 145}
161 146
162static unsigned long set_apic_id(unsigned int id) 147 /*
148 * At CPU state changes, update the x2apic cluster sibling info.
149 */
150static int __cpuinit
151update_clusterinfo(struct notifier_block *nfb, unsigned long action, void *hcpu)
163{ 152{
164 unsigned long x; 153 unsigned int this_cpu = (unsigned long)hcpu;
154 unsigned int cpu;
155 int err = 0;
156
157 switch (action) {
158 case CPU_UP_PREPARE:
159 if (!zalloc_cpumask_var(&per_cpu(cpus_in_cluster, this_cpu),
160 GFP_KERNEL)) {
161 err = -ENOMEM;
162 } else if (!zalloc_cpumask_var(&per_cpu(ipi_mask, this_cpu),
163 GFP_KERNEL)) {
164 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
165 err = -ENOMEM;
166 }
167 break;
168 case CPU_UP_CANCELED:
169 case CPU_UP_CANCELED_FROZEN:
170 case CPU_DEAD:
171 for_each_online_cpu(cpu) {
172 if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
173 continue;
174 __cpu_clear(this_cpu, per_cpu(cpus_in_cluster, cpu));
175 __cpu_clear(cpu, per_cpu(cpus_in_cluster, this_cpu));
176 }
177 free_cpumask_var(per_cpu(cpus_in_cluster, this_cpu));
178 free_cpumask_var(per_cpu(ipi_mask, this_cpu));
179 break;
180 }
165 181
166 x = id; 182 return notifier_from_errno(err);
167 return x;
168} 183}
169 184
170static int x2apic_cluster_phys_pkg_id(int initial_apicid, int index_msb) 185static struct notifier_block __refdata x2apic_cpu_notifier = {
171{ 186 .notifier_call = update_clusterinfo,
172 return initial_apicid >> index_msb; 187};
173}
174 188
175static void x2apic_send_IPI_self(int vector) 189static int x2apic_init_cpu_notifier(void)
176{ 190{
177 apic_write(APIC_SELF_IPI, vector); 191 int cpu = smp_processor_id();
192
193 zalloc_cpumask_var(&per_cpu(cpus_in_cluster, cpu), GFP_KERNEL);
194 zalloc_cpumask_var(&per_cpu(ipi_mask, cpu), GFP_KERNEL);
195
196 BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
197
198 __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
199 register_hotcpu_notifier(&x2apic_cpu_notifier);
200 return 1;
178} 201}
179 202
180static void init_x2apic_ldr(void) 203static int x2apic_cluster_probe(void)
181{ 204{
182 int cpu = smp_processor_id(); 205 if (x2apic_mode)
183 206 return x2apic_init_cpu_notifier();
184 per_cpu(x86_cpu_to_logical_apicid, cpu) = apic_read(APIC_LDR); 207 else
208 return 0;
185} 209}
186 210
187struct apic apic_x2apic_cluster = { 211static struct apic apic_x2apic_cluster = {
188 212
189 .name = "cluster x2apic", 213 .name = "cluster x2apic",
190 .probe = NULL, 214 .probe = x2apic_cluster_probe,
191 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 215 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
192 .apic_id_registered = x2apic_apic_id_registered, 216 .apic_id_registered = x2apic_apic_id_registered,
193 217
@@ -206,18 +230,16 @@ struct apic apic_x2apic_cluster = {
206 .ioapic_phys_id_map = NULL, 230 .ioapic_phys_id_map = NULL,
207 .setup_apic_routing = NULL, 231 .setup_apic_routing = NULL,
208 .multi_timer_check = NULL, 232 .multi_timer_check = NULL,
209 .apicid_to_node = NULL,
210 .cpu_to_logical_apicid = NULL,
211 .cpu_present_to_apicid = default_cpu_present_to_apicid, 233 .cpu_present_to_apicid = default_cpu_present_to_apicid,
212 .apicid_to_cpu_present = NULL, 234 .apicid_to_cpu_present = NULL,
213 .setup_portio_remap = NULL, 235 .setup_portio_remap = NULL,
214 .check_phys_apicid_present = default_check_phys_apicid_present, 236 .check_phys_apicid_present = default_check_phys_apicid_present,
215 .enable_apic_mode = NULL, 237 .enable_apic_mode = NULL,
216 .phys_pkg_id = x2apic_cluster_phys_pkg_id, 238 .phys_pkg_id = x2apic_phys_pkg_id,
217 .mps_oem_check = NULL, 239 .mps_oem_check = NULL,
218 240
219 .get_apic_id = x2apic_cluster_phys_get_apic_id, 241 .get_apic_id = x2apic_get_apic_id,
220 .set_apic_id = set_apic_id, 242 .set_apic_id = x2apic_set_apic_id,
221 .apic_id_mask = 0xFFFFFFFFu, 243 .apic_id_mask = 0xFFFFFFFFu,
222 244
223 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 245 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -242,3 +264,5 @@ struct apic apic_x2apic_cluster = {
242 .wait_icr_idle = native_x2apic_wait_icr_idle, 264 .wait_icr_idle = native_x2apic_wait_icr_idle,
243 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 265 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
244}; 266};
267
268apic_driver(apic_x2apic_cluster);
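The new init_x2apic_ldr() and update_clusterinfo() notifier above keep a symmetric per-CPU sibling mask: a CPU coming online adds itself to the cluster mask of every online sibling and vice versa, and a CPU going down is cleared in both directions before its masks are freed. A plain-C model of that bookkeeping, with fixed-width bitmasks standing in for cpumask_var_t and a made-up cluster[] table:

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 8

static uint32_t cpus_in_cluster[NR_CPUS];   /* per-cpu sibling mask */
static uint32_t online_mask;
static const int cluster[NR_CPUS] = { 0, 0, 0, 0, 1, 1, 1, 1 };

static void cpu_online(int cpu)
{
        cpus_in_cluster[cpu] |= 1u << cpu;
        for (int other = 0; other < NR_CPUS; other++) {
                if (!(online_mask & (1u << other)) || cluster[other] != cluster[cpu])
                        continue;
                cpus_in_cluster[cpu]   |= 1u << other;   /* add sibling to me   */
                cpus_in_cluster[other] |= 1u << cpu;     /* add me to sibling   */
        }
        online_mask |= 1u << cpu;
}

static void cpu_offline(int cpu)
{
        online_mask &= ~(1u << cpu);
        for (int other = 0; other < NR_CPUS; other++) {
                if (cluster[other] != cluster[cpu])
                        continue;
                cpus_in_cluster[other] &= ~(1u << cpu);
                cpus_in_cluster[cpu]   &= ~(1u << other);
        }
}

int main(void)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++)
                cpu_online(cpu);
        cpu_offline(5);
        printf("cpu4 siblings: 0x%02x\n", cpus_in_cluster[4]);  /* 0xd0: cpus 4,6,7 */
        return 0;
}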
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index 8972f38c5ced..f5373dfde21e 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -7,11 +7,12 @@
7#include <linux/dmar.h> 7#include <linux/dmar.h>
8 8
9#include <asm/smp.h> 9#include <asm/smp.h>
10#include <asm/apic.h> 10#include <asm/x2apic.h>
11#include <asm/ipi.h>
12 11
13int x2apic_phys; 12int x2apic_phys;
14 13
14static struct apic apic_x2apic_phys;
15
15static int set_x2apic_phys_mode(char *arg) 16static int set_x2apic_phys_mode(char *arg)
16{ 17{
17 x2apic_phys = 1; 18 x2apic_phys = 1;
@@ -27,94 +28,46 @@ static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
27 return 0; 28 return 0;
28} 29}
29 30
30/* 31static void
31 * need to use more than cpu 0, because we need more vectors when 32__x2apic_send_IPI_mask(const struct cpumask *mask, int vector, int apic_dest)
32 * MSI-X are used.
33 */
34static const struct cpumask *x2apic_target_cpus(void)
35{
36 return cpu_online_mask;
37}
38
39static void x2apic_vector_allocation_domain(int cpu, struct cpumask *retmask)
40{
41 cpumask_clear(retmask);
42 cpumask_set_cpu(cpu, retmask);
43}
44
45static void __x2apic_send_IPI_dest(unsigned int apicid, int vector,
46 unsigned int dest)
47{
48 unsigned long cfg;
49
50 cfg = __prepare_ICR(0, vector, dest);
51
52 /*
53 * send the IPI.
54 */
55 native_x2apic_icr_write(cfg, apicid);
56}
57
58static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
59{ 33{
60 unsigned long query_cpu; 34 unsigned long query_cpu;
35 unsigned long this_cpu;
61 unsigned long flags; 36 unsigned long flags;
62 37
63 x2apic_wrmsr_fence(); 38 x2apic_wrmsr_fence();
64 39
65 local_irq_save(flags); 40 local_irq_save(flags);
41
42 this_cpu = smp_processor_id();
66 for_each_cpu(query_cpu, mask) { 43 for_each_cpu(query_cpu, mask) {
44 if (apic_dest == APIC_DEST_ALLBUT && this_cpu == query_cpu)
45 continue;
67 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu), 46 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
68 vector, APIC_DEST_PHYSICAL); 47 vector, APIC_DEST_PHYSICAL);
69 } 48 }
70 local_irq_restore(flags); 49 local_irq_restore(flags);
71} 50}
72 51
52static void x2apic_send_IPI_mask(const struct cpumask *mask, int vector)
53{
54 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLINC);
55}
56
73static void 57static void
74 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector) 58 x2apic_send_IPI_mask_allbutself(const struct cpumask *mask, int vector)
75{ 59{
76 unsigned long this_cpu = smp_processor_id(); 60 __x2apic_send_IPI_mask(mask, vector, APIC_DEST_ALLBUT);
77 unsigned long query_cpu;
78 unsigned long flags;
79
80 x2apic_wrmsr_fence();
81
82 local_irq_save(flags);
83 for_each_cpu(query_cpu, mask) {
84 if (query_cpu != this_cpu)
85 __x2apic_send_IPI_dest(
86 per_cpu(x86_cpu_to_apicid, query_cpu),
87 vector, APIC_DEST_PHYSICAL);
88 }
89 local_irq_restore(flags);
90} 61}
91 62
92static void x2apic_send_IPI_allbutself(int vector) 63static void x2apic_send_IPI_allbutself(int vector)
93{ 64{
94 unsigned long this_cpu = smp_processor_id(); 65 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLBUT);
95 unsigned long query_cpu;
96 unsigned long flags;
97
98 x2apic_wrmsr_fence();
99
100 local_irq_save(flags);
101 for_each_online_cpu(query_cpu) {
102 if (query_cpu == this_cpu)
103 continue;
104 __x2apic_send_IPI_dest(per_cpu(x86_cpu_to_apicid, query_cpu),
105 vector, APIC_DEST_PHYSICAL);
106 }
107 local_irq_restore(flags);
108} 66}
109 67
110static void x2apic_send_IPI_all(int vector) 68static void x2apic_send_IPI_all(int vector)
111{ 69{
112 x2apic_send_IPI_mask(cpu_online_mask, vector); 70 __x2apic_send_IPI_mask(cpu_online_mask, vector, APIC_DEST_ALLINC);
113}
114
115static int x2apic_apic_id_registered(void)
116{
117 return 1;
118} 71}
119 72
120static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask) 73static unsigned int x2apic_cpu_mask_to_apicid(const struct cpumask *cpumask)
@@ -149,34 +102,22 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
149 return per_cpu(x86_cpu_to_apicid, cpu); 102 return per_cpu(x86_cpu_to_apicid, cpu);
150} 103}
151 104
152static unsigned int x2apic_phys_get_apic_id(unsigned long x) 105static void init_x2apic_ldr(void)
153{
154 return x;
155}
156
157static unsigned long set_apic_id(unsigned int id)
158{
159 return id;
160}
161
162static int x2apic_phys_pkg_id(int initial_apicid, int index_msb)
163{ 106{
164 return initial_apicid >> index_msb;
165} 107}
166 108
167static void x2apic_send_IPI_self(int vector) 109static int x2apic_phys_probe(void)
168{ 110{
169 apic_write(APIC_SELF_IPI, vector); 111 if (x2apic_mode && x2apic_phys)
170} 112 return 1;
171 113
172static void init_x2apic_ldr(void) 114 return apic == &apic_x2apic_phys;
173{
174} 115}
175 116
176struct apic apic_x2apic_phys = { 117static struct apic apic_x2apic_phys = {
177 118
178 .name = "physical x2apic", 119 .name = "physical x2apic",
179 .probe = NULL, 120 .probe = x2apic_phys_probe,
180 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check, 121 .acpi_madt_oem_check = x2apic_acpi_madt_oem_check,
181 .apic_id_registered = x2apic_apic_id_registered, 122 .apic_id_registered = x2apic_apic_id_registered,
182 123
@@ -195,8 +136,6 @@ struct apic apic_x2apic_phys = {
195 .ioapic_phys_id_map = NULL, 136 .ioapic_phys_id_map = NULL,
196 .setup_apic_routing = NULL, 137 .setup_apic_routing = NULL,
197 .multi_timer_check = NULL, 138 .multi_timer_check = NULL,
198 .apicid_to_node = NULL,
199 .cpu_to_logical_apicid = NULL,
200 .cpu_present_to_apicid = default_cpu_present_to_apicid, 139 .cpu_present_to_apicid = default_cpu_present_to_apicid,
201 .apicid_to_cpu_present = NULL, 140 .apicid_to_cpu_present = NULL,
202 .setup_portio_remap = NULL, 141 .setup_portio_remap = NULL,
@@ -205,8 +144,8 @@ struct apic apic_x2apic_phys = {
205 .phys_pkg_id = x2apic_phys_pkg_id, 144 .phys_pkg_id = x2apic_phys_pkg_id,
206 .mps_oem_check = NULL, 145 .mps_oem_check = NULL,
207 146
208 .get_apic_id = x2apic_phys_get_apic_id, 147 .get_apic_id = x2apic_get_apic_id,
209 .set_apic_id = set_apic_id, 148 .set_apic_id = x2apic_set_apic_id,
210 .apic_id_mask = 0xFFFFFFFFu, 149 .apic_id_mask = 0xFFFFFFFFu,
211 150
212 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid, 151 .cpu_mask_to_apicid = x2apic_cpu_mask_to_apicid,
@@ -231,3 +170,5 @@ struct apic apic_x2apic_phys = {
231 .wait_icr_idle = native_x2apic_wait_icr_idle, 170 .wait_icr_idle = native_x2apic_wait_icr_idle,
232 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle, 171 .safe_wait_icr_idle = native_safe_x2apic_wait_icr_idle,
233}; 172};
173
174apic_driver(apic_x2apic_phys);
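As in the cluster driver, the three near-identical send loops in x2apic_phys.c are collapsed above into one helper parameterized by the destination mode, which only differs in whether the sending CPU is skipped. A compact sketch of that refactor; DEST_ALLINC/DEST_ALLBUT and send_ipi() are illustrative stand-ins, not the kernel's constants:

#include <stdio.h>
#include <stdint.h>

enum { DEST_ALLINC, DEST_ALLBUT };

#define NR_CPUS 4

static void send_ipi(int cpu, int vector)
{
        printf("IPI vector 0x%x -> cpu %d\n", vector, cpu);
}

static void send_ipi_mask_dest(uint32_t mask, int vector, int dest, int this_cpu)
{
        for (int cpu = 0; cpu < NR_CPUS; cpu++) {
                if (!(mask & (1u << cpu)))
                        continue;
                if (dest == DEST_ALLBUT && cpu == this_cpu)
                        continue;       /* "all but self": skip the sender */
                send_ipi(cpu, vector);
        }
}

int main(void)
{
        uint32_t online = 0xf;
        /* The former entry points become thin wrappers around one helper: */
        send_ipi_mask_dest(online, 0xfd, DEST_ALLINC, 0);   /* send_IPI_all         */
        send_ipi_mask_dest(online, 0xfd, DEST_ALLBUT, 0);   /* send_IPI_allbutself  */
        return 0;
}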
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index f744f54cb248..adc66c3a1fef 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2010 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -23,6 +23,8 @@
23#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h> 24#include <linux/pci.h>
25#include <linux/kdebug.h> 25#include <linux/kdebug.h>
26#include <linux/delay.h>
27#include <linux/crash_dump.h>
26 28
27#include <asm/uv/uv_mmrs.h> 29#include <asm/uv/uv_mmrs.h>
28#include <asm/uv/uv_hub.h> 30#include <asm/uv/uv_hub.h>
@@ -34,6 +36,14 @@
34#include <asm/ipi.h> 36#include <asm/ipi.h>
35#include <asm/smp.h> 37#include <asm/smp.h>
36#include <asm/x86_init.h> 38#include <asm/x86_init.h>
39#include <asm/emergency-restart.h>
40#include <asm/nmi.h>
41
42/* BMC sets a bit this MMR non-zero before sending an NMI */
43#define UVH_NMI_MMR UVH_SCRATCH5
44#define UVH_NMI_MMR_CLEAR (UVH_NMI_MMR + 8)
45#define UV_NMI_PENDING_MASK (1UL << 63)
46DEFINE_PER_CPU(unsigned long, cpu_last_nmi_count);
37 47
38DEFINE_PER_CPU(int, x2apic_extra_bits); 48DEFINE_PER_CPU(int, x2apic_extra_bits);
39 49
@@ -41,10 +51,25 @@ DEFINE_PER_CPU(int, x2apic_extra_bits);
41 51
42static enum uv_system_type uv_system_type; 52static enum uv_system_type uv_system_type;
43static u64 gru_start_paddr, gru_end_paddr; 53static u64 gru_start_paddr, gru_end_paddr;
54static union uvh_apicid uvh_apicid;
44int uv_min_hub_revision_id; 55int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id); 56EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
57unsigned int uv_apicid_hibits;
58EXPORT_SYMBOL_GPL(uv_apicid_hibits);
46static DEFINE_SPINLOCK(uv_nmi_lock); 59static DEFINE_SPINLOCK(uv_nmi_lock);
47 60
61static struct apic apic_x2apic_uv_x;
62
63static unsigned long __init uv_early_read_mmr(unsigned long addr)
64{
65 unsigned long val, *mmr;
66
67 mmr = early_ioremap(UV_LOCAL_MMR_BASE | addr, sizeof(*mmr));
68 val = *mmr;
69 early_iounmap(mmr, sizeof(*mmr));
70 return val;
71}
72
48static inline bool is_GRU_range(u64 start, u64 end) 73static inline bool is_GRU_range(u64 start, u64 end)
49{ 74{
50 return start >= gru_start_paddr && end <= gru_end_paddr; 75 return start >= gru_start_paddr && end <= gru_end_paddr;
@@ -55,27 +80,63 @@ static bool uv_is_untracked_pat_range(u64 start, u64 end)
55 return is_ISA_range(start, end) || is_GRU_range(start, end); 80 return is_ISA_range(start, end) || is_GRU_range(start, end);
56} 81}
57 82
58static int early_get_nodeid(void) 83static int __init early_get_pnodeid(void)
59{ 84{
60 union uvh_node_id_u node_id; 85 union uvh_node_id_u node_id;
61 unsigned long *mmr; 86 union uvh_rh_gam_config_mmr_u m_n_config;
62 87 int pnode;
63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
64 node_id.v = *mmr;
65 early_iounmap(mmr, sizeof(*mmr));
66 88
67 /* Currently, all blades have same revision number */ 89 /* Currently, all blades have same revision number */
90 node_id.v = uv_early_read_mmr(UVH_NODE_ID);
91 m_n_config.v = uv_early_read_mmr(UVH_RH_GAM_CONFIG_MMR);
68 uv_min_hub_revision_id = node_id.s.revision; 92 uv_min_hub_revision_id = node_id.s.revision;
69 93
70 return node_id.s.node_id; 94 if (node_id.s.part_number == UV2_HUB_PART_NUMBER)
95 uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1;
96
97 uv_hub_info->hub_revision = uv_min_hub_revision_id;
98 pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1);
99 return pnode;
100}
101
102static void __init early_get_apic_pnode_shift(void)
103{
104 uvh_apicid.v = uv_early_read_mmr(UVH_APICID);
105 if (!uvh_apicid.v)
106 /*
107 * Old bios, use default value
108 */
109 uvh_apicid.s.pnode_shift = UV_APIC_PNODE_SHIFT;
110}
111
112/*
113 * Add an extra bit as dictated by bios to the destination apicid of
114 * interrupts potentially passing through the UV HUB. This prevents
115 * a deadlock between interrupts and IO port operations.
116 */
117static void __init uv_set_apicid_hibit(void)
118{
119 union uv1h_lb_target_physical_apic_id_mask_u apicid_mask;
120
121 if (is_uv1_hub()) {
122 apicid_mask.v =
123 uv_early_read_mmr(UV1H_LB_TARGET_PHYSICAL_APIC_ID_MASK);
124 uv_apicid_hibits =
125 apicid_mask.s1.bit_enables & UV_APICID_HIBIT_MASK;
126 }
71} 127}
72 128
73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 129static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
74{ 130{
75 int nodeid; 131 int pnodeid, is_uv1, is_uv2;
76 132
77 if (!strcmp(oem_id, "SGI")) { 133 is_uv1 = !strcmp(oem_id, "SGI");
78 nodeid = early_get_nodeid(); 134 is_uv2 = !strcmp(oem_id, "SGI2");
135 if (is_uv1 || is_uv2) {
136 uv_hub_info->hub_revision =
137 is_uv1 ? UV1_HUB_REVISION_BASE : UV2_HUB_REVISION_BASE;
138 pnodeid = early_get_pnodeid();
139 early_get_apic_pnode_shift();
79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; 140 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init; 141 x86_platform.nmi_init = uv_nmi_init;
81 if (!strcmp(oem_table_id, "UVL")) 142 if (!strcmp(oem_table_id, "UVL"))
@@ -83,9 +144,10 @@ static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
83 else if (!strcmp(oem_table_id, "UVX")) 144 else if (!strcmp(oem_table_id, "UVX"))
84 uv_system_type = UV_X2APIC; 145 uv_system_type = UV_X2APIC;
85 else if (!strcmp(oem_table_id, "UVH")) { 146 else if (!strcmp(oem_table_id, "UVH")) {
86 __get_cpu_var(x2apic_extra_bits) = 147 __this_cpu_write(x2apic_extra_bits,
87 nodeid << (UV_APIC_PNODE_SHIFT - 1); 148 pnodeid << uvh_apicid.s.pnode_shift);
88 uv_system_type = UV_NON_UNIQUE_APIC; 149 uv_system_type = UV_NON_UNIQUE_APIC;
150 uv_set_apicid_hibit();
89 return 1; 151 return 1;
90 } 152 }
91 } 153 }
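early_get_pnodeid() above derives the physical node id by shifting the NODE_ID value right by one and masking it with the n_skt width read from the GAM config MMR. A worked example of that arithmetic with made-up register values:

#include <stdio.h>
#include <stdint.h>

static unsigned int pnode_from_node_id(uint64_t node_id, unsigned int n_skt)
{
        return (node_id >> 1) & ((1u << n_skt) - 1);
}

int main(void)
{
        uint64_t node_id = 0x46;    /* hypothetical NODE_ID field value */
        unsigned int n_skt = 5;     /* hypothetical n_skt width         */

        /* 0x46 >> 1 = 0x23, masked to 5 bits -> pnode 0x3 */
        printf("pnode = 0x%x\n", pnode_from_node_id(node_id, n_skt));
        return 0;
}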
@@ -139,6 +201,7 @@ static int __cpuinit uv_wakeup_secondary(int phys_apicid, unsigned long start_ri
139 int pnode; 201 int pnode;
140 202
141 pnode = uv_apicid_to_pnode(phys_apicid); 203 pnode = uv_apicid_to_pnode(phys_apicid);
204 phys_apicid |= uv_apicid_hibits;
142 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 205 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
143 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) | 206 (phys_apicid << UVH_IPI_INT_APIC_ID_SHFT) |
144 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) | 207 ((start_rip << UVH_IPI_INT_VECTOR_SHFT) >> 12) |
@@ -220,7 +283,7 @@ static unsigned int uv_cpu_mask_to_apicid(const struct cpumask *cpumask)
220 int cpu = cpumask_first(cpumask); 283 int cpu = cpumask_first(cpumask);
221 284
222 if ((unsigned)cpu < nr_cpu_ids) 285 if ((unsigned)cpu < nr_cpu_ids)
223 return per_cpu(x86_cpu_to_apicid, cpu); 286 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
224 else 287 else
225 return BAD_APICID; 288 return BAD_APICID;
226} 289}
@@ -239,7 +302,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
239 if (cpumask_test_cpu(cpu, cpu_online_mask)) 302 if (cpumask_test_cpu(cpu, cpu_online_mask))
240 break; 303 break;
241 } 304 }
242 return per_cpu(x86_cpu_to_apicid, cpu); 305 return per_cpu(x86_cpu_to_apicid, cpu) | uv_apicid_hibits;
243} 306}
244 307
245static unsigned int x2apic_get_apic_id(unsigned long x) 308static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -247,7 +310,7 @@ static unsigned int x2apic_get_apic_id(unsigned long x)
247 unsigned int id; 310 unsigned int id;
248 311
249 WARN_ON(preemptible() && num_online_cpus() > 1); 312 WARN_ON(preemptible() && num_online_cpus() > 1);
250 id = x | __get_cpu_var(x2apic_extra_bits); 313 id = x | __this_cpu_read(x2apic_extra_bits);
251 314
252 return id; 315 return id;
253} 316}
@@ -277,10 +340,15 @@ static void uv_send_IPI_self(int vector)
277 apic_write(APIC_SELF_IPI, vector); 340 apic_write(APIC_SELF_IPI, vector);
278} 341}
279 342
280struct apic __refdata apic_x2apic_uv_x = { 343static int uv_probe(void)
344{
345 return apic == &apic_x2apic_uv_x;
346}
347
348static struct apic __refdata apic_x2apic_uv_x = {
281 349
282 .name = "UV large system", 350 .name = "UV large system",
283 .probe = NULL, 351 .probe = uv_probe,
284 .acpi_madt_oem_check = uv_acpi_madt_oem_check, 352 .acpi_madt_oem_check = uv_acpi_madt_oem_check,
285 .apic_id_registered = uv_apic_id_registered, 353 .apic_id_registered = uv_apic_id_registered,
286 354
@@ -299,8 +367,6 @@ struct apic __refdata apic_x2apic_uv_x = {
299 .ioapic_phys_id_map = NULL, 367 .ioapic_phys_id_map = NULL,
300 .setup_apic_routing = NULL, 368 .setup_apic_routing = NULL,
301 .multi_timer_check = NULL, 369 .multi_timer_check = NULL,
302 .apicid_to_node = NULL,
303 .cpu_to_logical_apicid = NULL,
304 .cpu_present_to_apicid = default_cpu_present_to_apicid, 370 .cpu_present_to_apicid = default_cpu_present_to_apicid,
305 .apicid_to_cpu_present = NULL, 371 .apicid_to_cpu_present = NULL,
306 .setup_portio_remap = NULL, 372 .setup_portio_remap = NULL,
@@ -339,7 +405,7 @@ struct apic __refdata apic_x2apic_uv_x = {
339 405
340static __cpuinit void set_x2apic_extra_bits(int pnode) 406static __cpuinit void set_x2apic_extra_bits(int pnode)
341{ 407{
342 __get_cpu_var(x2apic_extra_bits) = (pnode << 6); 408 __this_cpu_write(x2apic_extra_bits, pnode << uvh_apicid.s.pnode_shift);
343} 409}
344 410
345/* 411/*
@@ -363,14 +429,14 @@ struct redir_addr {
363#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT 429#define DEST_SHIFT UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR_DEST_BASE_SHFT
364 430
365static __initdata struct redir_addr redir_addrs[] = { 431static __initdata struct redir_addr redir_addrs[] = {
366 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_SI_ALIAS0_OVERLAY_CONFIG}, 432 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_0_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_0_MMR},
367 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_SI_ALIAS1_OVERLAY_CONFIG}, 433 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_1_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_1_MMR},
368 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_SI_ALIAS2_OVERLAY_CONFIG}, 434 {UVH_RH_GAM_ALIAS210_REDIRECT_CONFIG_2_MMR, UVH_RH_GAM_ALIAS210_OVERLAY_CONFIG_2_MMR},
369}; 435};
370 436
371static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size) 437static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
372{ 438{
373 union uvh_si_alias0_overlay_config_u alias; 439 union uvh_rh_gam_alias210_overlay_config_2_mmr_u alias;
374 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect; 440 union uvh_rh_gam_alias210_redirect_config_2_mmr_u redirect;
375 int i; 441 int i;
376 442
@@ -430,12 +496,19 @@ static __init void map_mmr_high(int max_pnode)
430static __init void map_mmioh_high(int max_pnode) 496static __init void map_mmioh_high(int max_pnode)
431{ 497{
432 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh; 498 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
433 int shift = UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT; 499 int shift;
434 500
435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 501 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
436 if (mmioh.s.enable) 502 if (is_uv1_hub() && mmioh.s1.enable) {
437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io, 503 shift = UV1H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
504 map_high("MMIOH", mmioh.s1.base, shift, mmioh.s1.m_io,
438 max_pnode, map_uc); 505 max_pnode, map_uc);
506 }
507 if (is_uv2_hub() && mmioh.s2.enable) {
508 shift = UV2H_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR_BASE_SHFT;
509 map_high("MMIOH", mmioh.s2.base, shift, mmioh.s2.m_io,
510 max_pnode, map_uc);
511 }
439} 512}
440 513
441static __init void map_low_mmrs(void) 514static __init void map_low_mmrs(void)
@@ -559,14 +632,14 @@ late_initcall(uv_init_heartbeat);
559 632
560/* Direct Legacy VGA I/O traffic to designated IOH */ 633/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode, 634int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge) 635 unsigned int command_bits, u32 flags)
563{ 636{
564 int domain, bus, rc; 637 int domain, bus, rc;
565 638
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n", 639 PR_DEVEL("devfn %x decode %d cmd %x flags %d\n",
567 pdev->devfn, decode, command_bits, change_bridge); 640 pdev->devfn, decode, command_bits, flags);
568 641
569 if (!change_bridge) 642 if (!(flags & PCI_VGA_STATE_CHANGE_BRIDGE))
570 return 0; 643 return 0;
571 644
572 if ((command_bits & PCI_COMMAND_IO) == 0) 645 if ((command_bits & PCI_COMMAND_IO) == 0)
@@ -602,18 +675,46 @@ void __cpuinit uv_cpu_init(void)
602 */ 675 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data) 676int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{ 677{
605 if (reason != DIE_NMI_IPI) 678 unsigned long real_uv_nmi;
679 int bid;
680
681 if (reason != DIE_NMIUNKNOWN)
606 return NOTIFY_OK; 682 return NOTIFY_OK;
607 683
608 if (in_crash_kexec) 684 if (in_crash_kexec)
609 /* do nothing if entering the crash kernel */ 685 /* do nothing if entering the crash kernel */
610 return NOTIFY_OK; 686 return NOTIFY_OK;
687
688 /*
689 * Each blade has an MMR that indicates when an NMI has been sent
690 * to cpus on the blade. If an NMI is detected, atomically
691 * clear the MMR and update a per-blade NMI count used to
692 * cause each cpu on the blade to notice a new NMI.
693 */
694 bid = uv_numa_blade_id();
695 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
696
697 if (unlikely(real_uv_nmi)) {
698 spin_lock(&uv_blade_info[bid].nmi_lock);
699 real_uv_nmi = (uv_read_local_mmr(UVH_NMI_MMR) & UV_NMI_PENDING_MASK);
700 if (real_uv_nmi) {
701 uv_blade_info[bid].nmi_count++;
702 uv_write_local_mmr(UVH_NMI_MMR_CLEAR, UV_NMI_PENDING_MASK);
703 }
704 spin_unlock(&uv_blade_info[bid].nmi_lock);
705 }
706
707 if (likely(__get_cpu_var(cpu_last_nmi_count) == uv_blade_info[bid].nmi_count))
708 return NOTIFY_DONE;
709
710 __get_cpu_var(cpu_last_nmi_count) = uv_blade_info[bid].nmi_count;
711
611 /* 712 /*
612 * Use a lock so only one cpu prints at a time 713 * Use a lock so only one cpu prints at a time.
613 * to prevent intermixed output. 714 * This prevents intermixed output.
614 */ 715 */
615 spin_lock(&uv_nmi_lock); 716 spin_lock(&uv_nmi_lock);
616 pr_info("NMI stack dump cpu %u:\n", smp_processor_id()); 717 pr_info("UV NMI stack dump cpu %u:\n", smp_processor_id());
617 dump_stack(); 718 dump_stack();
618 spin_unlock(&uv_nmi_lock); 719 spin_unlock(&uv_nmi_lock);
619 720
@@ -621,7 +722,8 @@ int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
621} 722}
622 723
623static struct notifier_block uv_dump_stack_nmi_nb = { 724static struct notifier_block uv_dump_stack_nmi_nb = {
624 .notifier_call = uv_handle_nmi 725 .notifier_call = uv_handle_nmi,
726 .priority = NMI_LOCAL_LOW_PRIOR - 1,
625}; 727};
626 728
627void uv_register_nmi_notifier(void) 729void uv_register_nmi_notifier(void)
@@ -644,28 +746,34 @@ void uv_nmi_init(void)
644 746
645void __init uv_system_init(void) 747void __init uv_system_init(void)
646{ 748{
647 union uvh_si_addr_map_config_u m_n_config; 749 union uvh_rh_gam_config_mmr_u m_n_config;
750 union uvh_rh_gam_mmioh_overlay_config_mmr_u mmioh;
648 union uvh_node_id_u node_id; 751 union uvh_node_id_u node_id;
649 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size; 752 unsigned long gnode_upper, lowmem_redir_base, lowmem_redir_size;
650 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val; 753 int bytes, nid, cpu, lcpu, pnode, blade, i, j, m_val, n_val, n_io;
651 int gnode_extra, max_pnode = 0; 754 int gnode_extra, max_pnode = 0;
652 unsigned long mmr_base, present, paddr; 755 unsigned long mmr_base, present, paddr;
653 unsigned short pnode_mask; 756 unsigned short pnode_mask, pnode_io_mask;
654 757
758 printk(KERN_INFO "UV: Found %s hub\n", is_uv1_hub() ? "UV1" : "UV2");
655 map_low_mmrs(); 759 map_low_mmrs();
656 760
657 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 761 m_n_config.v = uv_read_local_mmr(UVH_RH_GAM_CONFIG_MMR );
658 m_val = m_n_config.s.m_skt; 762 m_val = m_n_config.s.m_skt;
659 n_val = m_n_config.s.n_skt; 763 n_val = m_n_config.s.n_skt;
764 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
765 n_io = is_uv1_hub() ? mmioh.s1.n_io : mmioh.s2.n_io;
660 mmr_base = 766 mmr_base =
661 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) & 767 uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR) &
662 ~UV_MMR_ENABLE; 768 ~UV_MMR_ENABLE;
663 pnode_mask = (1 << n_val) - 1; 769 pnode_mask = (1 << n_val) - 1;
770 pnode_io_mask = (1 << n_io) - 1;
771
664 node_id.v = uv_read_local_mmr(UVH_NODE_ID); 772 node_id.v = uv_read_local_mmr(UVH_NODE_ID);
665 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1; 773 gnode_extra = (node_id.s.node_id & ~((1 << n_val) - 1)) >> 1;
666 gnode_upper = ((unsigned long)gnode_extra << m_val); 774 gnode_upper = ((unsigned long)gnode_extra << m_val);
667 printk(KERN_DEBUG "UV: N %d, M %d, gnode_upper 0x%lx, gnode_extra 0x%x\n", 775 printk(KERN_INFO "UV: N %d, M %d, N_IO: %d, gnode_upper 0x%lx, gnode_extra 0x%x, pnode_mask 0x%x, pnode_io_mask 0x%x\n",
668 n_val, m_val, gnode_upper, gnode_extra); 776 n_val, m_val, n_io, gnode_upper, gnode_extra, pnode_mask, pnode_io_mask);
669 777
670 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base); 778 printk(KERN_DEBUG "UV: global MMR base 0x%lx\n", mmr_base);
671 779
@@ -675,8 +783,9 @@ void __init uv_system_init(void)
675 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades()); 783 printk(KERN_DEBUG "UV: Found %d blades\n", uv_num_possible_blades());
676 784
677 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades(); 785 bytes = sizeof(struct uv_blade_info) * uv_num_possible_blades();
678 uv_blade_info = kmalloc(bytes, GFP_KERNEL); 786 uv_blade_info = kzalloc(bytes, GFP_KERNEL);
679 BUG_ON(!uv_blade_info); 787 BUG_ON(!uv_blade_info);
788
680 for (blade = 0; blade < uv_num_possible_blades(); blade++) 789 for (blade = 0; blade < uv_num_possible_blades(); blade++)
681 uv_blade_info[blade].memory_nid = -1; 790 uv_blade_info[blade].memory_nid = -1;
682 791
@@ -698,10 +807,11 @@ void __init uv_system_init(void)
698 for (j = 0; j < 64; j++) { 807 for (j = 0; j < 64; j++) {
699 if (!test_bit(j, &present)) 808 if (!test_bit(j, &present))
700 continue; 809 continue;
701 pnode = (i * 64 + j); 810 pnode = (i * 64 + j) & pnode_mask;
702 uv_blade_info[blade].pnode = pnode; 811 uv_blade_info[blade].pnode = pnode;
703 uv_blade_info[blade].nr_possible_cpus = 0; 812 uv_blade_info[blade].nr_possible_cpus = 0;
704 uv_blade_info[blade].nr_online_cpus = 0; 813 uv_blade_info[blade].nr_online_cpus = 0;
814 spin_lock_init(&uv_blade_info[blade].nmi_lock);
705 max_pnode = max(pnode, max_pnode); 815 max_pnode = max(pnode, max_pnode);
706 blade++; 816 blade++;
707 } 817 }
@@ -716,6 +826,13 @@ void __init uv_system_init(void)
716 int apicid = per_cpu(x86_cpu_to_apicid, cpu); 826 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
717 827
718 nid = cpu_to_node(cpu); 828 nid = cpu_to_node(cpu);
829 /*
830 * apic_pnode_shift must be set before calling uv_apicid_to_pnode();
831 */
832 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
833 uv_cpu_hub_info(cpu)->apic_pnode_shift = uvh_apicid.s.pnode_shift;
834 uv_cpu_hub_info(cpu)->hub_revision = uv_hub_info->hub_revision;
835
719 pnode = uv_apicid_to_pnode(apicid); 836 pnode = uv_apicid_to_pnode(apicid);
720 blade = boot_pnode_to_blade(pnode); 837 blade = boot_pnode_to_blade(pnode);
721 lcpu = uv_blade_info[blade].nr_possible_cpus; 838 lcpu = uv_blade_info[blade].nr_possible_cpus;
@@ -731,7 +848,6 @@ void __init uv_system_init(void)
731 uv_cpu_hub_info(cpu)->numa_blade_id = blade; 848 uv_cpu_hub_info(cpu)->numa_blade_id = blade;
732 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu; 849 uv_cpu_hub_info(cpu)->blade_processor_id = lcpu;
733 uv_cpu_hub_info(cpu)->pnode = pnode; 850 uv_cpu_hub_info(cpu)->pnode = pnode;
734 uv_cpu_hub_info(cpu)->pnode_mask = pnode_mask;
735 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1; 851 uv_cpu_hub_info(cpu)->gpa_mask = (1UL << (m_val + n_val)) - 1;
736 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper; 852 uv_cpu_hub_info(cpu)->gnode_upper = gnode_upper;
737 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 853 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
@@ -755,7 +871,7 @@ void __init uv_system_init(void)
755 871
756 map_gru_high(max_pnode); 872 map_gru_high(max_pnode);
757 map_mmr_high(max_pnode); 873 map_mmr_high(max_pnode);
758 map_mmioh_high(max_pnode); 874 map_mmioh_high(max_pnode & pnode_io_mask);
759 875
760 uv_cpu_init(); 876 uv_cpu_init();
761 uv_scir_register_cpu_notifier(); 877 uv_scir_register_cpu_notifier();
@@ -764,4 +880,13 @@ void __init uv_system_init(void)
764 880
765 /* register Legacy VGA I/O redirection handler */ 881 /* register Legacy VGA I/O redirection handler */
766 pci_register_set_vga_state(uv_set_vga_state); 882 pci_register_set_vga_state(uv_set_vga_state);
883
884 /*
885 * For a kdump kernel the reset must be BOOT_ACPI, not BOOT_EFI, as
886 * EFI is not enabled in the kdump kernel.
887 */
888 if (is_kdump_kernel())
889 reboot_type = BOOT_ACPI;
767} 890}
891
892apic_driver(apic_x2apic_uv_x);
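The rewritten UV NMI path above detects a BMC-initiated NMI through a per-blade MMR bit, clears it under a per-blade lock while bumping a shared per-blade count, and lets every CPU on the blade compare a private last-seen count so each dumps its stack exactly once per event. A single-threaded sketch of that counter handshake; mmr_pending and the counters are stand-ins, and the real code re-reads the MMR under the lock before claiming it:

#include <stdio.h>
#include <stdbool.h>

#define NR_CPUS 4

static unsigned long blade_nmi_count;            /* shared, per blade */
static unsigned long last_nmi_count[NR_CPUS];    /* private, per cpu  */

static bool handle_nmi(int cpu, bool mmr_pending)
{
        if (mmr_pending)                 /* first CPU to notice claims the event */
                blade_nmi_count++;       /* real code also clears the MMR here   */

        if (last_nmi_count[cpu] == blade_nmi_count)
                return false;            /* nothing new for this CPU             */

        last_nmi_count[cpu] = blade_nmi_count;
        printf("cpu %d: dumping stack for NMI #%lu\n", cpu, blade_nmi_count);
        return true;
}

int main(void)
{
        handle_nmi(0, true);     /* cpu 0 sees the pending bit and claims it */
        handle_nmi(1, false);    /* cpu 1 still notices via the counter      */
        handle_nmi(1, false);    /* second pass: already handled, no dump    */
        return 0;
}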
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 4c9c67bf09b7..965a7666c283 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -66,7 +66,7 @@
66 * 1.5: Fix segment register reloading (in case of bad segments saved 66 * 1.5: Fix segment register reloading (in case of bad segments saved
67 * across BIOS call). 67 * across BIOS call).
68 * Stephen Rothwell 68 * Stephen Rothwell
69 * 1.6: Cope with complier/assembler differences. 69 * 1.6: Cope with compiler/assembler differences.
70 * Only try to turn off the first display device. 70 * Only try to turn off the first display device.
71 * Fix OOPS at power off with no APM BIOS by Jan Echternach 71 * Fix OOPS at power off with no APM BIOS by Jan Echternach
72 * <echter@informatik.uni-rostock.de> 72 * <echter@informatik.uni-rostock.de>
@@ -189,8 +189,8 @@
189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01. 189 * Intel Order Number 241704-001. Microsoft Part Number 781-110-X01.
190 * 190 *
191 * [This document is available free from Intel by calling 800.628.8686 (fax 191 * [This document is available free from Intel by calling 800.628.8686 (fax
192 * 916.356.6100) or 800.548.4725; or via anonymous ftp from 192 * 916.356.6100) or 800.548.4725; or from
 193 * ftp://ftp.intel.com/pub/IAL/software_specs/apmv11.doc. It is also 193 * http://www.microsoft.com/whdc/archive/amp_12.mspx. It is also
194 * available from Microsoft by calling 206.882.8080.] 194 * available from Microsoft by calling 206.882.8080.]
195 * 195 *
196 * APM 1.2 Reference: 196 * APM 1.2 Reference:
@@ -227,6 +227,8 @@
227#include <linux/suspend.h> 227#include <linux/suspend.h>
228#include <linux/kthread.h> 228#include <linux/kthread.h>
229#include <linux/jiffies.h> 229#include <linux/jiffies.h>
230#include <linux/acpi.h>
231#include <linux/syscore_ops.h>
230 232
231#include <asm/system.h> 233#include <asm/system.h>
232#include <asm/uaccess.h> 234#include <asm/uaccess.h>
@@ -359,6 +361,7 @@ struct apm_user {
359 * idle percentage above which bios idle calls are done 361 * idle percentage above which bios idle calls are done
360 */ 362 */
361#ifdef CONFIG_APM_CPU_IDLE 363#ifdef CONFIG_APM_CPU_IDLE
364#warning deprecated CONFIG_APM_CPU_IDLE will be deleted in 2012
362#define DEFAULT_IDLE_THRESHOLD 95 365#define DEFAULT_IDLE_THRESHOLD 95
363#else 366#else
364#define DEFAULT_IDLE_THRESHOLD 100 367#define DEFAULT_IDLE_THRESHOLD 100
@@ -902,6 +905,7 @@ static void apm_cpu_idle(void)
902 unsigned int jiffies_since_last_check = jiffies - last_jiffies; 905 unsigned int jiffies_since_last_check = jiffies - last_jiffies;
903 unsigned int bucket; 906 unsigned int bucket;
904 907
908 WARN_ONCE(1, "deprecated apm_cpu_idle will be deleted in 2012");
905recalc: 909recalc:
906 if (jiffies_since_last_check > IDLE_CALC_LIMIT) { 910 if (jiffies_since_last_check > IDLE_CALC_LIMIT) {
907 use_apm_idle = 0; 911 use_apm_idle = 0;
@@ -975,20 +979,10 @@ recalc:
975 979
976static void apm_power_off(void) 980static void apm_power_off(void)
977{ 981{
978 unsigned char po_bios_call[] = {
979 0xb8, 0x00, 0x10, /* movw $0x1000,ax */
980 0x8e, 0xd0, /* movw ax,ss */
981 0xbc, 0x00, 0xf0, /* movw $0xf000,sp */
982 0xb8, 0x07, 0x53, /* movw $0x5307,ax */
983 0xbb, 0x01, 0x00, /* movw $0x0001,bx */
984 0xb9, 0x03, 0x00, /* movw $0x0003,cx */
985 0xcd, 0x15 /* int $0x15 */
986 };
987
988 /* Some bioses don't like being called from CPU != 0 */ 982 /* Some bioses don't like being called from CPU != 0 */
989 if (apm_info.realmode_power_off) { 983 if (apm_info.realmode_power_off) {
990 set_cpus_allowed_ptr(current, cpumask_of(0)); 984 set_cpus_allowed_ptr(current, cpumask_of(0));
991 machine_real_restart(po_bios_call, sizeof(po_bios_call)); 985 machine_real_restart(MRR_APM);
992 } else { 986 } else {
993 (void)set_system_power_state(APM_STATE_OFF); 987 (void)set_system_power_state(APM_STATE_OFF);
994 } 988 }
@@ -1246,7 +1240,7 @@ static int suspend(int vetoable)
1246 dpm_suspend_noirq(PMSG_SUSPEND); 1240 dpm_suspend_noirq(PMSG_SUSPEND);
1247 1241
1248 local_irq_disable(); 1242 local_irq_disable();
1249 sysdev_suspend(PMSG_SUSPEND); 1243 syscore_suspend();
1250 1244
1251 local_irq_enable(); 1245 local_irq_enable();
1252 1246
@@ -1264,7 +1258,7 @@ static int suspend(int vetoable)
1264 apm_error("suspend", err); 1258 apm_error("suspend", err);
1265 err = (err == APM_SUCCESS) ? 0 : -EIO; 1259 err = (err == APM_SUCCESS) ? 0 : -EIO;
1266 1260
1267 sysdev_resume(); 1261 syscore_resume();
1268 local_irq_enable(); 1262 local_irq_enable();
1269 1263
1270 dpm_resume_noirq(PMSG_RESUME); 1264 dpm_resume_noirq(PMSG_RESUME);
@@ -1288,7 +1282,7 @@ static void standby(void)
1288 dpm_suspend_noirq(PMSG_SUSPEND); 1282 dpm_suspend_noirq(PMSG_SUSPEND);
1289 1283
1290 local_irq_disable(); 1284 local_irq_disable();
1291 sysdev_suspend(PMSG_SUSPEND); 1285 syscore_suspend();
1292 local_irq_enable(); 1286 local_irq_enable();
1293 1287
1294 err = set_system_power_state(APM_STATE_STANDBY); 1288 err = set_system_power_state(APM_STATE_STANDBY);
@@ -1296,7 +1290,7 @@ static void standby(void)
1296 apm_error("standby", err); 1290 apm_error("standby", err);
1297 1291
1298 local_irq_disable(); 1292 local_irq_disable();
1299 sysdev_resume(); 1293 syscore_resume();
1300 local_irq_enable(); 1294 local_irq_enable();
1301 1295
1302 dpm_resume_noirq(PMSG_RESUME); 1296 dpm_resume_noirq(PMSG_RESUME);
@@ -1926,6 +1920,7 @@ static const struct file_operations apm_bios_fops = {
1926 .unlocked_ioctl = do_ioctl, 1920 .unlocked_ioctl = do_ioctl,
1927 .open = do_open, 1921 .open = do_open,
1928 .release = do_release, 1922 .release = do_release,
1923 .llseek = noop_llseek,
1929}; 1924};
1930 1925
1931static struct miscdevice apm_device = { 1926static struct miscdevice apm_device = {
@@ -2330,12 +2325,11 @@ static int __init apm_init(void)
2330 apm_info.disabled = 1; 2325 apm_info.disabled = 1;
2331 return -ENODEV; 2326 return -ENODEV;
2332 } 2327 }
2333 if (pm_flags & PM_ACPI) { 2328 if (!acpi_disabled) {
2334 printk(KERN_NOTICE "apm: overridden by ACPI.\n"); 2329 printk(KERN_NOTICE "apm: overridden by ACPI.\n");
2335 apm_info.disabled = 1; 2330 apm_info.disabled = 1;
2336 return -ENODEV; 2331 return -ENODEV;
2337 } 2332 }
2338 pm_flags |= PM_APM;
2339 2333
2340 /* 2334 /*
2341 * Set up the long jump entry point to the APM BIOS, which is called 2335 * Set up the long jump entry point to the APM BIOS, which is called
@@ -2427,7 +2421,6 @@ static void __exit apm_exit(void)
2427 kthread_stop(kapmd_task); 2421 kthread_stop(kapmd_task);
2428 kapmd_task = NULL; 2422 kapmd_task = NULL;
2429 } 2423 }
2430 pm_flags &= ~PM_APM;
2431} 2424}
2432 2425
2433module_init(apm_init); 2426module_init(apm_init);
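The suspend() and standby() hunks above swap sysdev_suspend()/sysdev_resume() for syscore_suspend()/syscore_resume() while keeping the surrounding ordering: noirq device suspend, interrupts off around the syscore suspend, the APM BIOS power-state call, interrupts off again around the syscore resume, then noirq device resume. The stub sequence below only illustrates that ordering as shown in the standby() path; none of these functions are the real PM API.

#include <stdio.h>

/* printf stubs that only demonstrate the sequence */
static void dpm_suspend_noirq_(void)    { puts("devices: noirq suspend"); }
static void irq_disable_(void)          { puts("local irqs off"); }
static void syscore_suspend_(void)      { puts("syscore: suspend"); }
static int  bios_set_power_state(void)  { puts("APM BIOS: enter standby"); return 0; }
static void syscore_resume_(void)       { puts("syscore: resume"); }
static void irq_enable_(void)           { puts("local irqs on"); }
static void dpm_resume_noirq_(void)     { puts("devices: noirq resume"); }

int main(void)
{
        dpm_suspend_noirq_();
        irq_disable_();
        syscore_suspend_();
        irq_enable_();
        int err = bios_set_power_state();   /* firmware does the actual transition */
        irq_disable_();
        syscore_resume_();
        irq_enable_();
        dpm_resume_noirq_();
        return err;
}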
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index cfa82c899f47..4f13fafc5264 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -1,5 +1,70 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/hardirq.h>
12#include <linux/suspend.h>
13#include <linux/kbuild.h>
14#include <asm/processor.h>
15#include <asm/thread_info.h>
16#include <asm/sigframe.h>
17#include <asm/bootparam.h>
18#include <asm/suspend.h>
19
20#ifdef CONFIG_XEN
21#include <xen/interface/xen.h>
22#endif
23
1#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
2# include "asm-offsets_32.c" 25# include "asm-offsets_32.c"
3#else 26#else
4# include "asm-offsets_64.c" 27# include "asm-offsets_64.c"
5#endif 28#endif
29
30void common(void) {
31 BLANK();
32 OFFSET(TI_flags, thread_info, flags);
33 OFFSET(TI_status, thread_info, status);
34 OFFSET(TI_addr_limit, thread_info, addr_limit);
35 OFFSET(TI_preempt_count, thread_info, preempt_count);
36
37 BLANK();
38 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
39
40 BLANK();
41 OFFSET(pbe_address, pbe, address);
42 OFFSET(pbe_orig_address, pbe, orig_address);
43 OFFSET(pbe_next, pbe, next);
44
45#ifdef CONFIG_PARAVIRT
46 BLANK();
47 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
48 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
49 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
50 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
51 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
52 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
53 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
54 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
55 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2);
56#endif
57
58#ifdef CONFIG_XEN
59 BLANK();
60 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
61 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
62#endif
63
64 BLANK();
65 OFFSET(BP_scratch, boot_params, scratch);
66 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
67 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
68 OFFSET(BP_version, boot_params, hdr.version);
69 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
70}
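The new common asm-offsets file above centralizes the OFFSET()/DEFINE() entries shared by the 32-bit and 64-bit builds. The point of the file is that assembly code cannot evaluate offsetof(), so the build compiles it to assembly and scrapes marker lines into generated constants; the runnable sketch below only demonstrates the offsetof-to-constant idea with a stand-in structure, not the kernel's kbuild machinery.

#include <stdio.h>
#include <stddef.h>

struct thread_info_demo {        /* stand-in layout, not the kernel's */
        unsigned long flags;
        unsigned long status;
        unsigned long addr_limit;
        int preempt_count;
};

#define OFFSET(sym, type, member) \
        printf("#define %-20s %zu\n", #sym, offsetof(struct type, member))

int main(void)
{
        /* Each line becomes a constant that entry code can use as "base + TI_xxx". */
        OFFSET(TI_flags,         thread_info_demo, flags);
        OFFSET(TI_status,        thread_info_demo, status);
        OFFSET(TI_addr_limit,    thread_info_demo, addr_limit);
        OFFSET(TI_preempt_count, thread_info_demo, preempt_count);
        return 0;
}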
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index dfdbf6403895..c29d631af6fc 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -1,26 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed
4 * to extract and format the required data.
5 */
6
7#include <linux/crypto.h>
8#include <linux/sched.h>
9#include <linux/signal.h>
10#include <linux/personality.h>
11#include <linux/suspend.h>
12#include <linux/kbuild.h>
13#include <asm/ucontext.h> 1#include <asm/ucontext.h>
14#include <asm/sigframe.h>
15#include <asm/pgtable.h>
16#include <asm/fixmap.h>
17#include <asm/processor.h>
18#include <asm/thread_info.h>
19#include <asm/bootparam.h>
20#include <asm/elf.h>
21#include <asm/suspend.h>
22
23#include <xen/interface/xen.h>
24 2
25#include <linux/lguest.h> 3#include <linux/lguest.h>
26#include "../../../drivers/lguest/lg.h" 4#include "../../../drivers/lguest/lg.h"
@@ -51,21 +29,10 @@ void foo(void)
51 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id); 29 OFFSET(CPUINFO_x86_vendor_id, cpuinfo_x86, x86_vendor_id);
52 BLANK(); 30 BLANK();
53 31
54 OFFSET(TI_task, thread_info, task);
55 OFFSET(TI_exec_domain, thread_info, exec_domain);
56 OFFSET(TI_flags, thread_info, flags);
57 OFFSET(TI_status, thread_info, status);
58 OFFSET(TI_preempt_count, thread_info, preempt_count);
59 OFFSET(TI_addr_limit, thread_info, addr_limit);
60 OFFSET(TI_restart_block, thread_info, restart_block);
61 OFFSET(TI_sysenter_return, thread_info, sysenter_return); 32 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
62 OFFSET(TI_cpu, thread_info, cpu); 33 OFFSET(TI_cpu, thread_info, cpu);
63 BLANK(); 34 BLANK();
64 35
65 OFFSET(GDS_size, desc_ptr, size);
66 OFFSET(GDS_address, desc_ptr, address);
67 BLANK();
68
69 OFFSET(PT_EBX, pt_regs, bx); 36 OFFSET(PT_EBX, pt_regs, bx);
70 OFFSET(PT_ECX, pt_regs, cx); 37 OFFSET(PT_ECX, pt_regs, cx);
71 OFFSET(PT_EDX, pt_regs, dx); 38 OFFSET(PT_EDX, pt_regs, dx);
@@ -85,44 +52,13 @@ void foo(void)
85 OFFSET(PT_OLDSS, pt_regs, ss); 52 OFFSET(PT_OLDSS, pt_regs, ss);
86 BLANK(); 53 BLANK();
87 54
88 OFFSET(EXEC_DOMAIN_handler, exec_domain, handler);
89 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext); 55 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe, uc.uc_mcontext);
90 BLANK(); 56 BLANK();
91 57
92 OFFSET(pbe_address, pbe, address);
93 OFFSET(pbe_orig_address, pbe, orig_address);
94 OFFSET(pbe_next, pbe, next);
95
96 /* Offset from the sysenter stack to tss.sp0 */ 58 /* Offset from the sysenter stack to tss.sp0 */
97 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - 59 DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
98 sizeof(struct tss_struct)); 60 sizeof(struct tss_struct));
99 61
100 DEFINE(PAGE_SIZE_asm, PAGE_SIZE);
101 DEFINE(PAGE_SHIFT_asm, PAGE_SHIFT);
102 DEFINE(PTRS_PER_PTE, PTRS_PER_PTE);
103 DEFINE(PTRS_PER_PMD, PTRS_PER_PMD);
104 DEFINE(PTRS_PER_PGD, PTRS_PER_PGD);
105
106 OFFSET(crypto_tfm_ctx_offset, crypto_tfm, __crt_ctx);
107
108#ifdef CONFIG_PARAVIRT
109 BLANK();
110 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
111 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
112 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
113 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
114 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
115 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
116 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
117 OFFSET(PV_CPU_read_cr0, pv_cpu_ops, read_cr0);
118#endif
119
120#ifdef CONFIG_XEN
121 BLANK();
122 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
123 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
124#endif
125
126#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) 62#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
127 BLANK(); 63 BLANK();
128 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled); 64 OFFSET(LGUEST_DATA_irq_enabled, lguest_data, irq_enabled);
@@ -141,11 +77,4 @@ void foo(void)
141 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode); 77 OFFSET(LGUEST_PAGES_regs_errcode, lguest_pages, regs.errcode);
142 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs); 78 OFFSET(LGUEST_PAGES_regs, lguest_pages, regs);
143#endif 79#endif
144
145 BLANK();
146 OFFSET(BP_scratch, boot_params, scratch);
147 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
148 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
149 OFFSET(BP_version, boot_params, hdr.version);
150 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
151} 80}
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 4a6aeedcd965..e72a1194af22 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -1,27 +1,4 @@
1/*
2 * Generate definitions needed by assembly language modules.
3 * This code generates raw asm output which is post-processed to extract
4 * and format the required data.
5 */
6#define COMPILE_OFFSETS
7
8#include <linux/crypto.h>
9#include <linux/sched.h>
10#include <linux/stddef.h>
11#include <linux/errno.h>
12#include <linux/hardirq.h>
13#include <linux/suspend.h>
14#include <linux/kbuild.h>
15#include <asm/processor.h>
16#include <asm/segment.h>
17#include <asm/thread_info.h>
18#include <asm/ia32.h> 1#include <asm/ia32.h>
19#include <asm/bootparam.h>
20#include <asm/suspend.h>
21
22#include <xen/interface/xen.h>
23
24#include <asm/sigframe.h>
25 2
26#define __NO_STUBS 1 3#define __NO_STUBS 1
27#undef __SYSCALL 4#undef __SYSCALL
@@ -33,41 +10,19 @@ static char syscalls[] = {
33 10
34int main(void) 11int main(void)
35{ 12{
36#define ENTRY(entry) DEFINE(tsk_ ## entry, offsetof(struct task_struct, entry))
37 ENTRY(state);
38 ENTRY(flags);
39 ENTRY(pid);
40 BLANK();
41#undef ENTRY
42#define ENTRY(entry) DEFINE(TI_ ## entry, offsetof(struct thread_info, entry))
43 ENTRY(flags);
44 ENTRY(addr_limit);
45 ENTRY(preempt_count);
46 ENTRY(status);
47#ifdef CONFIG_IA32_EMULATION
48 ENTRY(sysenter_return);
49#endif
50 BLANK();
51#undef ENTRY
52#ifdef CONFIG_PARAVIRT 13#ifdef CONFIG_PARAVIRT
53 BLANK();
54 OFFSET(PARAVIRT_enabled, pv_info, paravirt_enabled);
55 OFFSET(PARAVIRT_PATCH_pv_cpu_ops, paravirt_patch_template, pv_cpu_ops);
56 OFFSET(PARAVIRT_PATCH_pv_irq_ops, paravirt_patch_template, pv_irq_ops);
57 OFFSET(PV_IRQ_irq_disable, pv_irq_ops, irq_disable);
58 OFFSET(PV_IRQ_irq_enable, pv_irq_ops, irq_enable);
59 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); 14 OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame);
60 OFFSET(PV_CPU_iret, pv_cpu_ops, iret);
61 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32); 15 OFFSET(PV_CPU_usergs_sysret32, pv_cpu_ops, usergs_sysret32);
62 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); 16 OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64);
63 OFFSET(PV_CPU_irq_enable_sysexit, pv_cpu_ops, irq_enable_sysexit);
64 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); 17 OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs);
65 OFFSET(PV_MMU_read_cr2, pv_mmu_ops, read_cr2); 18 BLANK();
66#endif 19#endif
67 20
68
69#ifdef CONFIG_IA32_EMULATION 21#ifdef CONFIG_IA32_EMULATION
70#define ENTRY(entry) DEFINE(IA32_SIGCONTEXT_ ## entry, offsetof(struct sigcontext_ia32, entry)) 22 OFFSET(TI_sysenter_return, thread_info, sysenter_return);
23 BLANK();
24
25#define ENTRY(entry) OFFSET(IA32_SIGCONTEXT_ ## entry, sigcontext_ia32, entry)
71 ENTRY(ax); 26 ENTRY(ax);
72 ENTRY(bx); 27 ENTRY(bx);
73 ENTRY(cx); 28 ENTRY(cx);
@@ -79,15 +34,12 @@ int main(void)
79 ENTRY(ip); 34 ENTRY(ip);
80 BLANK(); 35 BLANK();
81#undef ENTRY 36#undef ENTRY
82 DEFINE(IA32_RT_SIGFRAME_sigcontext, 37
83 offsetof (struct rt_sigframe_ia32, uc.uc_mcontext)); 38 OFFSET(IA32_RT_SIGFRAME_sigcontext, rt_sigframe_ia32, uc.uc_mcontext);
84 BLANK(); 39 BLANK();
85#endif 40#endif
86 DEFINE(pbe_address, offsetof(struct pbe, address)); 41
87 DEFINE(pbe_orig_address, offsetof(struct pbe, orig_address)); 42#define ENTRY(entry) OFFSET(pt_regs_ ## entry, pt_regs, entry)
88 DEFINE(pbe_next, offsetof(struct pbe, next));
89 BLANK();
90#define ENTRY(entry) DEFINE(pt_regs_ ## entry, offsetof(struct pt_regs, entry))
91 ENTRY(bx); 43 ENTRY(bx);
92 ENTRY(bx); 44 ENTRY(bx);
93 ENTRY(cx); 45 ENTRY(cx);
@@ -107,7 +59,8 @@ int main(void)
107 ENTRY(flags); 59 ENTRY(flags);
108 BLANK(); 60 BLANK();
109#undef ENTRY 61#undef ENTRY
110#define ENTRY(entry) DEFINE(saved_context_ ## entry, offsetof(struct saved_context, entry)) 62
63#define ENTRY(entry) OFFSET(saved_context_ ## entry, saved_context, entry)
111 ENTRY(cr0); 64 ENTRY(cr0);
112 ENTRY(cr2); 65 ENTRY(cr2);
113 ENTRY(cr3); 66 ENTRY(cr3);
@@ -115,26 +68,11 @@ int main(void)
115 ENTRY(cr8); 68 ENTRY(cr8);
116 BLANK(); 69 BLANK();
117#undef ENTRY 70#undef ENTRY
118 DEFINE(TSS_ist, offsetof(struct tss_struct, x86_tss.ist));
119 BLANK();
120 DEFINE(crypto_tfm_ctx_offset, offsetof(struct crypto_tfm, __crt_ctx));
121 BLANK();
122 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
123 71
72 OFFSET(TSS_ist, tss_struct, x86_tss.ist);
124 BLANK(); 73 BLANK();
125 OFFSET(BP_scratch, boot_params, scratch);
126 OFFSET(BP_loadflags, boot_params, hdr.loadflags);
127 OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch);
128 OFFSET(BP_version, boot_params, hdr.version);
129 OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment);
130 74
131 BLANK(); 75 DEFINE(__NR_syscall_max, sizeof(syscalls) - 1);
132 DEFINE(PAGE_SIZE_asm, PAGE_SIZE); 76
133#ifdef CONFIG_XEN
134 BLANK();
135 OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
136 OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
137#undef ENTRY
138#endif
139 return 0; 77 return 0;
140} 78}
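
The ENTRY()/OFFSET() definitions above are how asm-offsets exposes structure member offsets as assembler-visible constants at build time. A minimal userspace sketch of the underlying offsetof() pattern follows; the struct and symbol names are made up for illustration, and the real kernel emits these constants through a special asm stub rather than printf:

    /* Hedged sketch of the offsetof()-based constant generation used above.
     * struct pt_regs_demo and its fields are invented for illustration. */
    #include <stdio.h>
    #include <stddef.h>

    struct pt_regs_demo {
        unsigned long bx, cx, dx, si, di, ip, flags;
    };

    #define SHOW_OFFSET(sym, type, member) \
        printf("#define %-16s %zu\n", #sym, offsetof(type, member))

    int main(void)
    {
        /* Prints constants analogous to pt_regs_bx, pt_regs_cx, ... above. */
        SHOW_OFFSET(pt_regs_bx, struct pt_regs_demo, bx);
        SHOW_OFFSET(pt_regs_cx, struct pt_regs_demo, cx);
        SHOW_OFFSET(pt_regs_ip, struct pt_regs_demo, ip);
        return 0;
    }
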
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c
index fc999e6fc46a..452932d34730 100644
--- a/arch/x86/kernel/check.c
+++ b/arch/x86/kernel/check.c
@@ -2,7 +2,8 @@
2#include <linux/sched.h> 2#include <linux/sched.h>
3#include <linux/kthread.h> 3#include <linux/kthread.h>
4#include <linux/workqueue.h> 4#include <linux/workqueue.h>
5#include <asm/e820.h> 5#include <linux/memblock.h>
6
6#include <asm/proto.h> 7#include <asm/proto.h>
7 8
8/* 9/*
@@ -18,10 +19,12 @@ static int __read_mostly memory_corruption_check = -1;
18static unsigned __read_mostly corruption_check_size = 64*1024; 19static unsigned __read_mostly corruption_check_size = 64*1024;
19static unsigned __read_mostly corruption_check_period = 60; /* seconds */ 20static unsigned __read_mostly corruption_check_period = 60; /* seconds */
20 21
21static struct e820entry scan_areas[MAX_SCAN_AREAS]; 22static struct scan_area {
23 u64 addr;
24 u64 size;
25} scan_areas[MAX_SCAN_AREAS];
22static int num_scan_areas; 26static int num_scan_areas;
23 27
24
25static __init int set_corruption_check(char *arg) 28static __init int set_corruption_check(char *arg)
26{ 29{
27 char *end; 30 char *end;
@@ -81,9 +84,9 @@ void __init setup_bios_corruption_check(void)
81 84
82 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { 85 while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
83 u64 size; 86 u64 size;
84 addr = find_e820_area_size(addr, &size, PAGE_SIZE); 87 addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE);
85 88
86 if (!(addr + 1)) 89 if (addr == MEMBLOCK_ERROR)
87 break; 90 break;
88 91
89 if (addr >= corruption_check_size) 92 if (addr >= corruption_check_size)
@@ -92,7 +95,7 @@ void __init setup_bios_corruption_check(void)
92 if ((addr + size) > corruption_check_size) 95 if ((addr + size) > corruption_check_size)
93 size = corruption_check_size - addr; 96 size = corruption_check_size - addr;
94 97
95 e820_update_range(addr, size, E820_RAM, E820_RESERVED); 98 memblock_x86_reserve_range(addr, addr + size, "SCAN RAM");
96 scan_areas[num_scan_areas].addr = addr; 99 scan_areas[num_scan_areas].addr = addr;
97 scan_areas[num_scan_areas].size = size; 100 scan_areas[num_scan_areas].size = size;
98 num_scan_areas++; 101 num_scan_areas++;
@@ -103,9 +106,8 @@ void __init setup_bios_corruption_check(void)
103 addr += size; 106 addr += size;
104 } 107 }
105 108
106 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", 109 if (num_scan_areas)
107 num_scan_areas); 110 printk(KERN_INFO "Scanning %d areas for low memory corruption\n", num_scan_areas);
108 update_e820();
109} 111}
110 112
111 113
@@ -141,12 +143,12 @@ static void check_corruption(struct work_struct *dummy)
141{ 143{
142 check_for_bios_corruption(); 144 check_for_bios_corruption();
143 schedule_delayed_work(&bios_check_work, 145 schedule_delayed_work(&bios_check_work,
144 round_jiffies_relative(corruption_check_period*HZ)); 146 round_jiffies_relative(corruption_check_period*HZ));
145} 147}
146 148
147static int start_periodic_check_for_corruption(void) 149static int start_periodic_check_for_corruption(void)
148{ 150{
149 if (!memory_corruption_check || corruption_check_period == 0) 151 if (!num_scan_areas || !memory_corruption_check || corruption_check_period == 0)
150 return 0; 152 return 0;
151 153
152 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n", 154 printk(KERN_INFO "Scanning for low memory corruption every %d seconds\n",
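
The reworked setup_bios_corruption_check() above walks memblock for free ranges below corruption_check_size, reserves each one, and records up to MAX_SCAN_AREAS {addr, size} pairs; the periodic check is later skipped when num_scan_areas is zero. A hedged userspace sketch of that accumulation loop, where next_free_range() is a hypothetical stand-in for memblock_x86_find_in_range_size() and the memory layout is invented:

    /* Hedged sketch of the scan-area accumulation logic above. */
    #include <stdio.h>
    #include <stdint.h>

    #define MAX_SCAN_AREAS 8

    struct scan_area { uint64_t addr, size; };

    static struct scan_area scan_areas[MAX_SCAN_AREAS];
    static int num_scan_areas;

    /* Stand-in for memblock_x86_find_in_range_size(): pretend everything
     * from 0x1000 up to 0x10000 is one free range. */
    static uint64_t next_free_range(uint64_t start, uint64_t *size)
    {
        if (start < 0x1000)
            start = 0x1000;
        if (start >= 0x10000)
            return UINT64_MAX;          /* stand-in for MEMBLOCK_ERROR */
        *size = 0x10000 - start;
        return start;
    }

    int main(void)
    {
        uint64_t corruption_check_size = 64 * 1024;
        uint64_t addr = 0x1000;         /* PAGE_SIZE */

        while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) {
            uint64_t size;

            addr = next_free_range(addr, &size);
            if (addr == UINT64_MAX)
                break;
            if (addr >= corruption_check_size)
                break;
            if (addr + size > corruption_check_size)
                size = corruption_check_size - addr;

            scan_areas[num_scan_areas].addr = addr;
            scan_areas[num_scan_areas].size = size;
            num_scan_areas++;
            addr += size;
        }

        if (num_scan_areas)
            printf("Scanning %d areas for low memory corruption\n", num_scan_areas);
        return 0;
    }
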
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 3f0ebe429a01..6042981d0309 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -30,7 +30,6 @@ obj-$(CONFIG_PERF_EVENTS) += perf_event.o
30 30
31obj-$(CONFIG_X86_MCE) += mcheck/ 31obj-$(CONFIG_X86_MCE) += mcheck/
32obj-$(CONFIG_MTRR) += mtrr/ 32obj-$(CONFIG_MTRR) += mtrr/
33obj-$(CONFIG_CPU_FREQ) += cpufreq/
34 33
35obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o 34obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o
36 35
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index ba5f62f45f01..b13ed393dfce 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -148,7 +148,7 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c)
148{ 148{
149#ifdef CONFIG_SMP 149#ifdef CONFIG_SMP
150 /* calling is from identify_secondary_cpu() ? */ 150 /* calling is from identify_secondary_cpu() ? */
151 if (c->cpu_index == boot_cpu_id) 151 if (!c->cpu_index)
152 return; 152 return;
153 153
154 /* 154 /*
@@ -233,18 +233,22 @@ static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c)
233} 233}
234#endif 234#endif
235 235
236#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 236#ifdef CONFIG_NUMA
237/*
 238 * To work around a broken NUMA config. Read the comment in
239 * srat_detect_node().
240 */
237static int __cpuinit nearby_node(int apicid) 241static int __cpuinit nearby_node(int apicid)
238{ 242{
239 int i, node; 243 int i, node;
240 244
241 for (i = apicid - 1; i >= 0; i--) { 245 for (i = apicid - 1; i >= 0; i--) {
242 node = apicid_to_node[i]; 246 node = __apicid_to_node[i];
243 if (node != NUMA_NO_NODE && node_online(node)) 247 if (node != NUMA_NO_NODE && node_online(node))
244 return node; 248 return node;
245 } 249 }
246 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) { 250 for (i = apicid + 1; i < MAX_LOCAL_APIC; i++) {
247 node = apicid_to_node[i]; 251 node = __apicid_to_node[i];
248 if (node != NUMA_NO_NODE && node_online(node)) 252 if (node != NUMA_NO_NODE && node_online(node))
249 return node; 253 return node;
250 } 254 }
@@ -253,37 +257,55 @@ static int __cpuinit nearby_node(int apicid)
253#endif 257#endif
254 258
255/* 259/*
256 * Fixup core topology information for AMD multi-node processors. 260 * Fixup core topology information for
257 * Assumption: Number of cores in each internal node is the same. 261 * (1) AMD multi-node processors
262 * Assumption: Number of cores in each internal node is the same.
263 * (2) AMD processors supporting compute units
258 */ 264 */
259#ifdef CONFIG_X86_HT 265#ifdef CONFIG_X86_HT
260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 266static void __cpuinit amd_get_topology(struct cpuinfo_x86 *c)
261{ 267{
262 unsigned long long value; 268 u32 nodes, cores_per_cu = 1;
263 u32 nodes, cores_per_node; 269 u8 node_id;
264 int cpu = smp_processor_id(); 270 int cpu = smp_processor_id();
265 271
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR)) 272 /* get information required for multi-node processors */
267 return; 273 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
268 274 u32 eax, ebx, ecx, edx;
269 /* fixup topology information only once for a core */ 275
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 276 cpuid(0x8000001e, &eax, &ebx, &ecx, &edx);
277 nodes = ((ecx >> 8) & 7) + 1;
278 node_id = ecx & 7;
279
280 /* get compute unit information */
281 smp_num_siblings = ((ebx >> 8) & 3) + 1;
282 c->compute_unit_id = ebx & 0xff;
283 cores_per_cu += ((ebx >> 8) & 3);
284 } else if (cpu_has(c, X86_FEATURE_NODEID_MSR)) {
285 u64 value;
286
287 rdmsrl(MSR_FAM10H_NODE_ID, value);
288 nodes = ((value >> 3) & 7) + 1;
289 node_id = value & 7;
290 } else
271 return; 291 return;
272 292
273 rdmsrl(MSR_FAM10H_NODE_ID, value); 293 /* fixup multi-node processor information */
274 294 if (nodes > 1) {
275 nodes = ((value >> 3) & 7) + 1; 295 u32 cores_per_node;
276 if (nodes == 1) 296 u32 cus_per_node;
277 return;
278 297
279 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 298 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes; 299 cores_per_node = c->x86_max_cores / nodes;
300 cus_per_node = cores_per_node / cores_per_cu;
281 301
282 /* store NodeID, use llc_shared_map to store sibling info */ 302 /* store NodeID, use llc_shared_map to store sibling info */
283 per_cpu(cpu_llc_id, cpu) = value & 7; 303 per_cpu(cpu_llc_id, cpu) = node_id;
284 304
285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */ 305 /* core id has to be in the [0 .. cores_per_node - 1] range */
286 c->cpu_core_id = c->cpu_core_id % cores_per_node; 306 c->cpu_core_id %= cores_per_node;
307 c->compute_unit_id %= cus_per_node;
308 }
287} 309}
288#endif 310#endif
289 311
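
amd_get_topology() above derives the node count and node ID from CPUID leaf 0x8000001e (ECX) and the compute-unit siblings and ID from EBX, falling back to the NodeId MSR on parts without TOPOEXT. A short sketch of just that bit-field decoding, using the same shifts and masks as the hunk above; the raw register values in main() are purely illustrative:

    /* Hedged sketch of the CPUID 0x8000001e field decoding used above. */
    #include <stdio.h>
    #include <stdint.h>

    struct amd_topology {
        unsigned int nodes;            /* nodes per processor              */
        unsigned int node_id;          /* node this core sits on           */
        unsigned int siblings;         /* threads sharing one compute unit */
        unsigned int compute_unit_id;
    };

    /* Same shifts and masks as the amd_get_topology() hunk above. */
    static struct amd_topology decode_topoext(uint32_t ebx, uint32_t ecx)
    {
        struct amd_topology t;

        t.nodes           = ((ecx >> 8) & 7) + 1;
        t.node_id         = ecx & 7;
        t.siblings        = ((ebx >> 8) & 3) + 1;
        t.compute_unit_id = ebx & 0xff;
        return t;
    }

    int main(void)
    {
        /* Illustrative raw register values, not read from real hardware. */
        struct amd_topology t = decode_topoext(0x00000103, 0x00000101);

        printf("nodes=%u node_id=%u siblings=%u cu_id=%u\n",
               t.nodes, t.node_id, t.siblings, t.compute_unit_id);
        return 0;
    }
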
@@ -304,9 +326,7 @@ static void __cpuinit amd_detect_cmp(struct cpuinfo_x86 *c)
304 c->phys_proc_id = c->initial_apicid >> bits; 326 c->phys_proc_id = c->initial_apicid >> bits;
305 /* use socket ID also for last level cache */ 327 /* use socket ID also for last level cache */
306 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id; 328 per_cpu(cpu_llc_id, cpu) = c->phys_proc_id;
307 /* fixup topology information on multi-node processors */ 329 amd_get_topology(c);
308 if ((c->x86 == 0x10) && (c->x86_model == 9))
309 amd_fixup_dcm(c);
310#endif 330#endif
311} 331}
312 332
@@ -322,31 +342,40 @@ EXPORT_SYMBOL_GPL(amd_get_nb_id);
322 342
323static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 343static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
324{ 344{
325#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 345#ifdef CONFIG_NUMA
326 int cpu = smp_processor_id(); 346 int cpu = smp_processor_id();
327 int node; 347 int node;
328 unsigned apicid = c->apicid; 348 unsigned apicid = c->apicid;
329 349
330 node = per_cpu(cpu_llc_id, cpu); 350 node = numa_cpu_node(cpu);
351 if (node == NUMA_NO_NODE)
352 node = per_cpu(cpu_llc_id, cpu);
331 353
332 if (apicid_to_node[apicid] != NUMA_NO_NODE)
333 node = apicid_to_node[apicid];
334 if (!node_online(node)) { 354 if (!node_online(node)) {
335 /* Two possibilities here: 355 /*
336 - The CPU is missing memory and no node was created. 356 * Two possibilities here:
337 In that case try picking one from a nearby CPU 357 *
338 - The APIC IDs differ from the HyperTransport node IDs 358 * - The CPU is missing memory and no node was created. In
339 which the K8 northbridge parsing fills in. 359 * that case try picking one from a nearby CPU.
340 Assume they are all increased by a constant offset, 360 *
341 but in the same order as the HT nodeids. 361 * - The APIC IDs differ from the HyperTransport node IDs
342 If that doesn't result in a usable node fall back to the 362 * which the K8 northbridge parsing fills in. Assume
343 path for the previous case. */ 363 * they are all increased by a constant offset, but in
344 364 * the same order as the HT nodeids. If that doesn't
365 * result in a usable node fall back to the path for the
366 * previous case.
367 *
368 * This workaround operates directly on the mapping between
369 * APIC ID and NUMA node, assuming certain relationship
370 * between APIC ID, HT node ID and NUMA topology. As going
371 * through CPU mapping may alter the outcome, directly
372 * access __apicid_to_node[].
373 */
345 int ht_nodeid = c->initial_apicid; 374 int ht_nodeid = c->initial_apicid;
346 375
347 if (ht_nodeid >= 0 && 376 if (ht_nodeid >= 0 &&
348 apicid_to_node[ht_nodeid] != NUMA_NO_NODE) 377 __apicid_to_node[ht_nodeid] != NUMA_NO_NODE)
349 node = apicid_to_node[ht_nodeid]; 378 node = __apicid_to_node[ht_nodeid];
350 /* Pick a nearby node */ 379 /* Pick a nearby node */
351 if (!node_online(node)) 380 if (!node_online(node))
352 node = nearby_node(apicid); 381 node = nearby_node(apicid);
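
srat_detect_node() above now resolves a CPU's node in stages: numa_cpu_node() first, then the per-CPU last-level-cache ID, and only if that node is offline the __apicid_to_node[] entry for the HT node ID, finally falling back to nearby_node(). A hedged sketch of that fallback chain, where every argument and the node_online callback are stand-ins for the corresponding kernel facilities:

    /* Hedged sketch of the node-selection fallback order used above. */
    #define NUMA_NO_NODE (-1)

    int pick_node(int numa_node, int llc_node, int apicid_node,
                  int nearby, int (*node_online)(int node))
    {
        int node = numa_node;                 /* numa_cpu_node(cpu)          */

        if (node == NUMA_NO_NODE)
            node = llc_node;                  /* per_cpu(cpu_llc_id, cpu)    */

        if (!node_online(node)) {
            if (apicid_node != NUMA_NO_NODE)
                node = apicid_node;           /* __apicid_to_node[ht_nodeid] */
            if (!node_online(node))
                node = nearby;                /* nearby_node(apicid)         */
        }
        return node;
    }
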
@@ -412,6 +441,23 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c)
412 set_cpu_cap(c, X86_FEATURE_EXTD_APICID); 441 set_cpu_cap(c, X86_FEATURE_EXTD_APICID);
413 } 442 }
414#endif 443#endif
444
445 /* We need to do the following only once */
446 if (c != &boot_cpu_data)
447 return;
448
449 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) {
450
451 if (c->x86 > 0x10 ||
452 (c->x86 == 0x10 && c->x86_model >= 0x2)) {
453 u64 val;
454
455 rdmsrl(MSR_K7_HWCR, val);
456 if (!(val & BIT(24)))
457 printk(KERN_WARNING FW_BUG "TSC doesn't count "
458 "with P0 frequency!\n");
459 }
460 }
415} 461}
416 462
417static void __cpuinit init_amd(struct cpuinfo_x86 *c) 463static void __cpuinit init_amd(struct cpuinfo_x86 *c)
@@ -523,7 +569,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
523#endif 569#endif
524 570
525 if (c->extended_cpuid_level >= 0x80000006) { 571 if (c->extended_cpuid_level >= 0x80000006) {
526 if ((c->x86 >= 0x0f) && (cpuid_edx(0x80000006) & 0xf000)) 572 if (cpuid_edx(0x80000006) & 0xf000)
527 num_cache_leaves = 4; 573 num_cache_leaves = 4;
528 else 574 else
529 num_cache_leaves = 3; 575 num_cache_leaves = 3;
@@ -565,6 +611,35 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
565 } 611 }
566 } 612 }
567#endif 613#endif
614
615 /*
616 * Family 0x12 and above processors have APIC timer
617 * running in deep C states.
618 */
619 if (c->x86 > 0x11)
620 set_cpu_cap(c, X86_FEATURE_ARAT);
621
622 /*
623 * Disable GART TLB Walk Errors on Fam10h. We do this here
624 * because this is always needed when GART is enabled, even in a
625 * kernel which has no MCE support built in.
626 */
627 if (c->x86 == 0x10) {
628 /*
 629	 * The BIOS should disable GartTlbWlk Errors itself. If
 630	 * it doesn't, do it here as suggested by the BKDG.
631 *
632 * Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=33012
633 */
634 u64 mask;
635 int err;
636
637 err = rdmsrl_safe(MSR_AMD64_MCx_MASK(4), &mask);
638 if (err == 0) {
639 mask |= (1 << 10);
640 checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask);
641 }
642 }
568} 643}
569 644
570#ifdef CONFIG_X86_32 645#ifdef CONFIG_X86_32
@@ -639,7 +714,7 @@ EXPORT_SYMBOL_GPL(amd_erratum_383);
639 714
640bool cpu_has_amd_erratum(const int *erratum) 715bool cpu_has_amd_erratum(const int *erratum)
641{ 716{
642 struct cpuinfo_x86 *cpu = &current_cpu_data; 717 struct cpuinfo_x86 *cpu = __this_cpu_ptr(&cpu_info);
643 int osvw_id = *erratum++; 718 int osvw_id = *erratum++;
644 u32 range; 719 u32 range;
645 u32 ms; 720 u32 ms;
diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c
index c39576cb3018..525514cf33c3 100644
--- a/arch/x86/kernel/cpu/bugs.c
+++ b/arch/x86/kernel/cpu/bugs.c
@@ -19,6 +19,7 @@
19 19
20static int __init no_halt(char *s) 20static int __init no_halt(char *s)
21{ 21{
22 WARN_ONCE(1, "\"no-hlt\" is deprecated, please use \"idle=poll\"\n");
22 boot_cpu_data.hlt_works_ok = 0; 23 boot_cpu_data.hlt_works_ok = 0;
23 return 1; 24 return 1;
24} 25}
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f2f9ac7da25c..22a073d7fbff 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -254,6 +254,25 @@ static inline void squash_the_stupid_serial_number(struct cpuinfo_x86 *c)
254} 254}
255#endif 255#endif
256 256
257static int disable_smep __cpuinitdata;
258static __init int setup_disable_smep(char *arg)
259{
260 disable_smep = 1;
261 return 1;
262}
263__setup("nosmep", setup_disable_smep);
264
265static __cpuinit void setup_smep(struct cpuinfo_x86 *c)
266{
267 if (cpu_has(c, X86_FEATURE_SMEP)) {
268 if (unlikely(disable_smep)) {
269 setup_clear_cpu_cap(X86_FEATURE_SMEP);
270 clear_in_cr4(X86_CR4_SMEP);
271 } else
272 set_in_cr4(X86_CR4_SMEP);
273 }
274}
275
257/* 276/*
258 * Some CPU features depend on higher CPUID levels, which may not always 277 * Some CPU features depend on higher CPUID levels, which may not always
259 * be available due to CPUID level capping or broken virtualization 278 * be available due to CPUID level capping or broken virtualization
@@ -458,13 +477,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
458 if (smp_num_siblings <= 1) 477 if (smp_num_siblings <= 1)
459 goto out; 478 goto out;
460 479
461 if (smp_num_siblings > nr_cpu_ids) {
462 pr_warning("CPU: Unsupported number of siblings %d",
463 smp_num_siblings);
464 smp_num_siblings = 1;
465 return;
466 }
467
468 index_msb = get_count_order(smp_num_siblings); 480 index_msb = get_count_order(smp_num_siblings);
469 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb); 481 c->phys_proc_id = apic->phys_pkg_id(c->initial_apicid, index_msb);
470 482
@@ -565,8 +577,7 @@ void __cpuinit get_cpu_cap(struct cpuinfo_x86 *c)
565 577
566 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx); 578 cpuid_count(0x00000007, 0, &eax, &ebx, &ecx, &edx);
567 579
568 if (eax > 0) 580 c->x86_capability[9] = ebx;
569 c->x86_capability[9] = ebx;
570 } 581 }
571 582
572 /* AMD-defined flags: level 0x80000001 */ 583 /* AMD-defined flags: level 0x80000001 */
@@ -665,9 +676,11 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c)
665 this_cpu->c_early_init(c); 676 this_cpu->c_early_init(c);
666 677
667#ifdef CONFIG_SMP 678#ifdef CONFIG_SMP
668 c->cpu_index = boot_cpu_id; 679 c->cpu_index = 0;
669#endif 680#endif
670 filter_cpuid_features(c, false); 681 filter_cpuid_features(c, false);
682
683 setup_smep(c);
671} 684}
672 685
673void __init early_cpu_init(void) 686void __init early_cpu_init(void)
@@ -675,7 +688,7 @@ void __init early_cpu_init(void)
675 const struct cpu_dev *const *cdev; 688 const struct cpu_dev *const *cdev;
676 int count = 0; 689 int count = 0;
677 690
678#ifdef PROCESSOR_SELECT 691#ifdef CONFIG_PROCESSOR_SELECT
679 printk(KERN_INFO "KERNEL supported cpus:\n"); 692 printk(KERN_INFO "KERNEL supported cpus:\n");
680#endif 693#endif
681 694
@@ -687,7 +700,7 @@ void __init early_cpu_init(void)
687 cpu_devs[count] = cpudev; 700 cpu_devs[count] = cpudev;
688 count++; 701 count++;
689 702
690#ifdef PROCESSOR_SELECT 703#ifdef CONFIG_PROCESSOR_SELECT
691 { 704 {
692 unsigned int j; 705 unsigned int j;
693 706
@@ -704,16 +717,21 @@ void __init early_cpu_init(void)
704} 717}
705 718
706/* 719/*
707 * The NOPL instruction is supposed to exist on all CPUs with 720 * The NOPL instruction is supposed to exist on all CPUs of family >= 6;
708 * family >= 6; unfortunately, that's not true in practice because 721 * unfortunately, that's not true in practice because of early VIA
709 * of early VIA chips and (more importantly) broken virtualizers that 722 * chips and (more importantly) broken virtualizers that are not easy
710 * are not easy to detect. In the latter case it doesn't even *fail* 723 * to detect. In the latter case it doesn't even *fail* reliably, so
711 * reliably, so probing for it doesn't even work. Disable it completely 724 * probing for it doesn't even work. Disable it completely on 32-bit
712 * unless we can find a reliable way to detect all the broken cases. 725 * unless we can find a reliable way to detect all the broken cases.
726 * Enable it explicitly on 64-bit for non-constant inputs of cpu_has().
713 */ 727 */
714static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) 728static void __cpuinit detect_nopl(struct cpuinfo_x86 *c)
715{ 729{
730#ifdef CONFIG_X86_32
716 clear_cpu_cap(c, X86_FEATURE_NOPL); 731 clear_cpu_cap(c, X86_FEATURE_NOPL);
732#else
733 set_cpu_cap(c, X86_FEATURE_NOPL);
734#endif
717} 735}
718 736
719static void __cpuinit generic_identify(struct cpuinfo_x86 *c) 737static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
@@ -748,6 +766,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c)
748#endif 766#endif
749 } 767 }
750 768
769 setup_smep(c);
770
751 get_model_name(c); /* Default name */ 771 get_model_name(c); /* Default name */
752 772
753 detect_nopl(c); 773 detect_nopl(c);
@@ -864,7 +884,7 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
864 884
865 select_idle_routine(c); 885 select_idle_routine(c);
866 886
867#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 887#ifdef CONFIG_NUMA
868 numa_add_cpu(smp_processor_id()); 888 numa_add_cpu(smp_processor_id());
869#endif 889#endif
870} 890}
@@ -882,14 +902,13 @@ static void vgetcpu_set_mode(void)
882void __init identify_boot_cpu(void) 902void __init identify_boot_cpu(void)
883{ 903{
884 identify_cpu(&boot_cpu_data); 904 identify_cpu(&boot_cpu_data);
885 init_c1e_mask(); 905 init_amd_e400_c1e_mask();
886#ifdef CONFIG_X86_32 906#ifdef CONFIG_X86_32
887 sysenter_setup(); 907 sysenter_setup();
888 enable_sep_cpu(); 908 enable_sep_cpu();
889#else 909#else
890 vgetcpu_set_mode(); 910 vgetcpu_set_mode();
891#endif 911#endif
892 init_hw_perf_events();
893} 912}
894 913
895void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) 914void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c)
@@ -1264,13 +1283,6 @@ void __cpuinit cpu_init(void)
1264 clear_all_debug_regs(); 1283 clear_all_debug_regs();
1265 dbg_restore_debug_regs(); 1284 dbg_restore_debug_regs();
1266 1285
1267 /*
1268 * Force FPU initialization:
1269 */
1270 current_thread_info()->status = 0;
1271 clear_used_math();
1272 mxcsr_feature_mask_init();
1273
1274 fpu_init(); 1286 fpu_init();
1275 xsave_init(); 1287 xsave_init();
1276} 1288}
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index f668bb1f7d43..e765633f210e 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,7 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void get_cpu_cap(struct cpuinfo_x86 *c);
35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); 36extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36extern void get_cpu_cap(struct cpuinfo_x86 *c); 37extern void get_cpu_cap(struct cpuinfo_x86 *c);
37 38
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
deleted file mode 100644
index 870e6cc6ad28..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ /dev/null
@@ -1,266 +0,0 @@
1#
2# CPU Frequency scaling
3#
4
5menu "CPU Frequency scaling"
6
7source "drivers/cpufreq/Kconfig"
8
9if CPU_FREQ
10
11comment "CPUFreq processor drivers"
12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
27config X86_ACPI_CPUFREQ
28 tristate "ACPI Processor P-States driver"
29 select CPU_FREQ_TABLE
30 depends on ACPI_PROCESSOR
31 help
32 This driver adds a CPUFreq driver which utilizes the ACPI
33 Processor Performance States.
34 This driver also supports Intel Enhanced Speedstep.
35
36 To compile this driver as a module, choose M here: the
37 module will be called acpi-cpufreq.
38
39 For details, take a look at <file:Documentation/cpu-freq/>.
40
41 If in doubt, say N.
42
43config ELAN_CPUFREQ
44 tristate "AMD Elan SC400 and SC410"
45 select CPU_FREQ_TABLE
46 depends on X86_ELAN
47 ---help---
48 This adds the CPUFreq driver for AMD Elan SC400 and SC410
49 processors.
50
51 You need to specify the processor maximum speed as boot
52 parameter: elanfreq=maxspeed (in kHz) or as module
53 parameter "max_freq".
54
55 For details, take a look at <file:Documentation/cpu-freq/>.
56
57 If in doubt, say N.
58
59config SC520_CPUFREQ
60 tristate "AMD Elan SC520"
61 select CPU_FREQ_TABLE
62 depends on X86_ELAN
63 ---help---
64 This adds the CPUFreq driver for AMD Elan SC520 processor.
65
66 For details, take a look at <file:Documentation/cpu-freq/>.
67
68 If in doubt, say N.
69
70
71config X86_POWERNOW_K6
72 tristate "AMD Mobile K6-2/K6-3 PowerNow!"
73 select CPU_FREQ_TABLE
74 depends on X86_32
75 help
76 This adds the CPUFreq driver for mobile AMD K6-2+ and mobile
77 AMD K6-3+ processors.
78
79 For details, take a look at <file:Documentation/cpu-freq/>.
80
81 If in doubt, say N.
82
83config X86_POWERNOW_K7
84 tristate "AMD Mobile Athlon/Duron PowerNow!"
85 select CPU_FREQ_TABLE
86 depends on X86_32
87 help
88 This adds the CPUFreq driver for mobile AMD K7 mobile processors.
89
90 For details, take a look at <file:Documentation/cpu-freq/>.
91
92 If in doubt, say N.
93
94config X86_POWERNOW_K7_ACPI
95 bool
96 depends on X86_POWERNOW_K7 && ACPI_PROCESSOR
97 depends on !(X86_POWERNOW_K7 = y && ACPI_PROCESSOR = m)
98 depends on X86_32
99 default y
100
101config X86_POWERNOW_K8
102 tristate "AMD Opteron/Athlon64 PowerNow!"
103 select CPU_FREQ_TABLE
104 depends on ACPI && ACPI_PROCESSOR
105 help
106 This adds the CPUFreq driver for K8/K10 Opteron/Athlon64 processors.
107
108 To compile this driver as a module, choose M here: the
109 module will be called powernow-k8.
110
111 For details, take a look at <file:Documentation/cpu-freq/>.
112
113config X86_GX_SUSPMOD
114 tristate "Cyrix MediaGX/NatSemi Geode Suspend Modulation"
115 depends on X86_32 && PCI
116 help
117 This add the CPUFreq driver for NatSemi Geode processors which
118 support suspend modulation.
119
120 For details, take a look at <file:Documentation/cpu-freq/>.
121
122 If in doubt, say N.
123
124config X86_SPEEDSTEP_CENTRINO
125 tristate "Intel Enhanced SpeedStep (deprecated)"
126 select CPU_FREQ_TABLE
127 select X86_SPEEDSTEP_CENTRINO_TABLE if X86_32
128 depends on X86_32 || (X86_64 && ACPI_PROCESSOR)
129 help
130 This is deprecated and this functionality is now merged into
131 acpi_cpufreq (X86_ACPI_CPUFREQ). Use that driver instead of
132 speedstep_centrino.
133 This adds the CPUFreq driver for Enhanced SpeedStep enabled
134 mobile CPUs. This means Intel Pentium M (Centrino) CPUs
135 or 64bit enabled Intel Xeons.
136
137 To compile this driver as a module, choose M here: the
138 module will be called speedstep-centrino.
139
140 For details, take a look at <file:Documentation/cpu-freq/>.
141
142 If in doubt, say N.
143
144config X86_SPEEDSTEP_CENTRINO_TABLE
145 bool "Built-in tables for Banias CPUs"
146 depends on X86_32 && X86_SPEEDSTEP_CENTRINO
147 default y
148 help
149 Use built-in tables for Banias CPUs if ACPI encoding
150 is not available.
151
152 If in doubt, say N.
153
154config X86_SPEEDSTEP_ICH
155 tristate "Intel Speedstep on ICH-M chipsets (ioport interface)"
156 select CPU_FREQ_TABLE
157 depends on X86_32
158 help
159 This adds the CPUFreq driver for certain mobile Intel Pentium III
160 (Coppermine), all mobile Intel Pentium III-M (Tualatin) and all
161 mobile Intel Pentium 4 P4-M on systems which have an Intel ICH2,
162 ICH3 or ICH4 southbridge.
163
164 For details, take a look at <file:Documentation/cpu-freq/>.
165
166 If in doubt, say N.
167
168config X86_SPEEDSTEP_SMI
169 tristate "Intel SpeedStep on 440BX/ZX/MX chipsets (SMI interface)"
170 select CPU_FREQ_TABLE
171 depends on X86_32 && EXPERIMENTAL
172 help
173 This adds the CPUFreq driver for certain mobile Intel Pentium III
174 (Coppermine), all mobile Intel Pentium III-M (Tualatin)
175 on systems which have an Intel 440BX/ZX/MX southbridge.
176
177 For details, take a look at <file:Documentation/cpu-freq/>.
178
179 If in doubt, say N.
180
181config X86_P4_CLOCKMOD
182 tristate "Intel Pentium 4 clock modulation"
183 select CPU_FREQ_TABLE
184 help
185 This adds the CPUFreq driver for Intel Pentium 4 / XEON
186 processors. When enabled it will lower CPU temperature by skipping
187 clocks.
188
189 This driver should be only used in exceptional
190 circumstances when very low power is needed because it causes severe
191 slowdowns and noticeable latencies. Normally Speedstep should be used
192 instead.
193
194 To compile this driver as a module, choose M here: the
195 module will be called p4-clockmod.
196
197 For details, take a look at <file:Documentation/cpu-freq/>.
198
199 Unless you are absolutely sure say N.
200
201config X86_CPUFREQ_NFORCE2
202 tristate "nVidia nForce2 FSB changing"
203 depends on X86_32 && EXPERIMENTAL
204 help
205 This adds the CPUFreq driver for FSB changing on nVidia nForce2
206 platforms.
207
208 For details, take a look at <file:Documentation/cpu-freq/>.
209
210 If in doubt, say N.
211
212config X86_LONGRUN
213 tristate "Transmeta LongRun"
214 depends on X86_32
215 help
216 This adds the CPUFreq driver for Transmeta Crusoe and Efficeon processors
217 which support LongRun.
218
219 For details, take a look at <file:Documentation/cpu-freq/>.
220
221 If in doubt, say N.
222
223config X86_LONGHAUL
224 tristate "VIA Cyrix III Longhaul"
225 select CPU_FREQ_TABLE
226 depends on X86_32 && ACPI_PROCESSOR
227 help
228 This adds the CPUFreq driver for VIA Samuel/CyrixIII,
229 VIA Cyrix Samuel/C3, VIA Cyrix Ezra and VIA Cyrix Ezra-T
230 processors.
231
232 For details, take a look at <file:Documentation/cpu-freq/>.
233
234 If in doubt, say N.
235
236config X86_E_POWERSAVER
237 tristate "VIA C7 Enhanced PowerSaver (DANGEROUS)"
238 select CPU_FREQ_TABLE
239 depends on X86_32 && EXPERIMENTAL
240 help
241 This adds the CPUFreq driver for VIA C7 processors. However, this driver
242 does not have any safeguards to prevent operating the CPU out of spec
243 and is thus considered dangerous. Please use the regular ACPI cpufreq
244 driver, enabled by CONFIG_X86_ACPI_CPUFREQ.
245
246 If in doubt, say N.
247
248comment "shared options"
249
250config X86_SPEEDSTEP_LIB
251 tristate
252 default (X86_SPEEDSTEP_ICH || X86_SPEEDSTEP_SMI || X86_P4_CLOCKMOD)
253
254config X86_SPEEDSTEP_RELAXED_CAP_CHECK
255 bool "Relaxed speedstep capability checks"
256 depends on X86_32 && (X86_SPEEDSTEP_SMI || X86_SPEEDSTEP_ICH)
257 help
258 Don't perform all checks for a speedstep capable system which would
259 normally be done. Some ancient or strange systems, though speedstep
260 capable, don't always indicate that they are speedstep capable. This
261 option lets the probing code bypass some of those checks if the
262 parameter "relaxed_check=1" is passed to the module.
263
264endif # CPU_FREQ
265
266endmenu
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
deleted file mode 100644
index bd54bf67e6fb..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
1# Link order matters. K8 is preferred to ACPI because of firmware bugs in early
2# K8 systems. ACPI is preferred to all other hardware-specific drivers.
3# speedstep-* is preferred over p4-clockmod.
4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o mperf.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o mperf.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
11obj-$(CONFIG_X86_E_POWERSAVER) += e_powersaver.o
12obj-$(CONFIG_ELAN_CPUFREQ) += elanfreq.o
13obj-$(CONFIG_SC520_CPUFREQ) += sc520_freq.o
14obj-$(CONFIG_X86_LONGRUN) += longrun.o
15obj-$(CONFIG_X86_GX_SUSPMOD) += gx-suspmod.o
16obj-$(CONFIG_X86_SPEEDSTEP_ICH) += speedstep-ich.o
17obj-$(CONFIG_X86_SPEEDSTEP_LIB) += speedstep-lib.o
18obj-$(CONFIG_X86_SPEEDSTEP_SMI) += speedstep-smi.o
19obj-$(CONFIG_X86_SPEEDSTEP_CENTRINO) += speedstep-centrino.o
20obj-$(CONFIG_X86_P4_CLOCKMOD) += p4-clockmod.o
21obj-$(CONFIG_X86_CPUFREQ_NFORCE2) += cpufreq-nforce2.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
deleted file mode 100644
index cd8da247dda1..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ /dev/null
@@ -1,775 +0,0 @@
1/*
2 * acpi-cpufreq.c - ACPI Processor P-States Driver
3 *
4 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
5 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
6 * Copyright (C) 2002 - 2004 Dominik Brodowski <linux@brodo.de>
7 * Copyright (C) 2006 Denis Sadykov <denis.m.sadykov@intel.com>
8 *
9 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or (at
14 * your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful, but
17 * WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
19 * General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License along
22 * with this program; if not, write to the Free Software Foundation, Inc.,
23 * 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
24 *
25 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 */
27
28#include <linux/kernel.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/smp.h>
32#include <linux/sched.h>
33#include <linux/cpufreq.h>
34#include <linux/compiler.h>
35#include <linux/dmi.h>
36#include <linux/slab.h>
37
38#include <linux/acpi.h>
39#include <linux/io.h>
40#include <linux/delay.h>
41#include <linux/uaccess.h>
42
43#include <acpi/processor.h>
44
45#include <asm/msr.h>
46#include <asm/processor.h>
47#include <asm/cpufeature.h>
48#include "mperf.h"
49
50#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
51 "acpi-cpufreq", msg)
52
53MODULE_AUTHOR("Paul Diefenbaugh, Dominik Brodowski");
54MODULE_DESCRIPTION("ACPI Processor P-States Driver");
55MODULE_LICENSE("GPL");
56
57enum {
58 UNDEFINED_CAPABLE = 0,
59 SYSTEM_INTEL_MSR_CAPABLE,
60 SYSTEM_IO_CAPABLE,
61};
62
63#define INTEL_MSR_RANGE (0xffff)
64
65struct acpi_cpufreq_data {
66 struct acpi_processor_performance *acpi_data;
67 struct cpufreq_frequency_table *freq_table;
68 unsigned int resume;
69 unsigned int cpu_feature;
70};
71
72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
73
74/* acpi_perf_data is a pointer to percpu data. */
75static struct acpi_processor_performance __percpu *acpi_perf_data;
76
77static struct cpufreq_driver acpi_cpufreq_driver;
78
79static unsigned int acpi_pstate_strict;
80
81static int check_est_cpu(unsigned int cpuid)
82{
83 struct cpuinfo_x86 *cpu = &cpu_data(cpuid);
84
85 return cpu_has(cpu, X86_FEATURE_EST);
86}
87
88static unsigned extract_io(u32 value, struct acpi_cpufreq_data *data)
89{
90 struct acpi_processor_performance *perf;
91 int i;
92
93 perf = data->acpi_data;
94
95 for (i = 0; i < perf->state_count; i++) {
96 if (value == perf->states[i].status)
97 return data->freq_table[i].frequency;
98 }
99 return 0;
100}
101
102static unsigned extract_msr(u32 msr, struct acpi_cpufreq_data *data)
103{
104 int i;
105 struct acpi_processor_performance *perf;
106
107 msr &= INTEL_MSR_RANGE;
108 perf = data->acpi_data;
109
110 for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) {
111 if (msr == perf->states[data->freq_table[i].index].status)
112 return data->freq_table[i].frequency;
113 }
114 return data->freq_table[0].frequency;
115}
116
117static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data)
118{
119 switch (data->cpu_feature) {
120 case SYSTEM_INTEL_MSR_CAPABLE:
121 return extract_msr(val, data);
122 case SYSTEM_IO_CAPABLE:
123 return extract_io(val, data);
124 default:
125 return 0;
126 }
127}
128
129struct msr_addr {
130 u32 reg;
131};
132
133struct io_addr {
134 u16 port;
135 u8 bit_width;
136};
137
138struct drv_cmd {
139 unsigned int type;
140 const struct cpumask *mask;
141 union {
142 struct msr_addr msr;
143 struct io_addr io;
144 } addr;
145 u32 val;
146};
147
148/* Called via smp_call_function_single(), on the target CPU */
149static void do_drv_read(void *_cmd)
150{
151 struct drv_cmd *cmd = _cmd;
152 u32 h;
153
154 switch (cmd->type) {
155 case SYSTEM_INTEL_MSR_CAPABLE:
156 rdmsr(cmd->addr.msr.reg, cmd->val, h);
157 break;
158 case SYSTEM_IO_CAPABLE:
159 acpi_os_read_port((acpi_io_address)cmd->addr.io.port,
160 &cmd->val,
161 (u32)cmd->addr.io.bit_width);
162 break;
163 default:
164 break;
165 }
166}
167
168/* Called via smp_call_function_many(), on the target CPUs */
169static void do_drv_write(void *_cmd)
170{
171 struct drv_cmd *cmd = _cmd;
172 u32 lo, hi;
173
174 switch (cmd->type) {
175 case SYSTEM_INTEL_MSR_CAPABLE:
176 rdmsr(cmd->addr.msr.reg, lo, hi);
177 lo = (lo & ~INTEL_MSR_RANGE) | (cmd->val & INTEL_MSR_RANGE);
178 wrmsr(cmd->addr.msr.reg, lo, hi);
179 break;
180 case SYSTEM_IO_CAPABLE:
181 acpi_os_write_port((acpi_io_address)cmd->addr.io.port,
182 cmd->val,
183 (u32)cmd->addr.io.bit_width);
184 break;
185 default:
186 break;
187 }
188}
189
190static void drv_read(struct drv_cmd *cmd)
191{
192 int err;
193 cmd->val = 0;
194
195 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
196 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
197}
198
199static void drv_write(struct drv_cmd *cmd)
200{
201 int this_cpu;
202
203 this_cpu = get_cpu();
204 if (cpumask_test_cpu(this_cpu, cmd->mask))
205 do_drv_write(cmd);
206 smp_call_function_many(cmd->mask, do_drv_write, cmd, 1);
207 put_cpu();
208}
209
210static u32 get_cur_val(const struct cpumask *mask)
211{
212 struct acpi_processor_performance *perf;
213 struct drv_cmd cmd;
214
215 if (unlikely(cpumask_empty(mask)))
216 return 0;
217
218 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
219 case SYSTEM_INTEL_MSR_CAPABLE:
220 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
221 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
222 break;
223 case SYSTEM_IO_CAPABLE:
224 cmd.type = SYSTEM_IO_CAPABLE;
225 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
226 cmd.addr.io.port = perf->control_register.address;
227 cmd.addr.io.bit_width = perf->control_register.bit_width;
228 break;
229 default:
230 return 0;
231 }
232
233 cmd.mask = mask;
234 drv_read(&cmd);
235
236 dprintk("get_cur_val = %u\n", cmd.val);
237
238 return cmd.val;
239}
240
241static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
242{
243 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
244 unsigned int freq;
245 unsigned int cached_freq;
246
247 dprintk("get_cur_freq_on_cpu (%d)\n", cpu);
248
249 if (unlikely(data == NULL ||
250 data->acpi_data == NULL || data->freq_table == NULL)) {
251 return 0;
252 }
253
254 cached_freq = data->freq_table[data->acpi_data->state].frequency;
255 freq = extract_freq(get_cur_val(cpumask_of(cpu)), data);
256 if (freq != cached_freq) {
257 /*
258 * The dreaded BIOS frequency change behind our back.
259 * Force set the frequency on next target call.
260 */
261 data->resume = 1;
262 }
263
264 dprintk("cur freq = %u\n", freq);
265
266 return freq;
267}
268
269static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
270 struct acpi_cpufreq_data *data)
271{
272 unsigned int cur_freq;
273 unsigned int i;
274
275 for (i = 0; i < 100; i++) {
276 cur_freq = extract_freq(get_cur_val(mask), data);
277 if (cur_freq == freq)
278 return 1;
279 udelay(10);
280 }
281 return 0;
282}
283
284static int acpi_cpufreq_target(struct cpufreq_policy *policy,
285 unsigned int target_freq, unsigned int relation)
286{
287 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
288 struct acpi_processor_performance *perf;
289 struct cpufreq_freqs freqs;
290 struct drv_cmd cmd;
291 unsigned int next_state = 0; /* Index into freq_table */
292 unsigned int next_perf_state = 0; /* Index into perf table */
293 unsigned int i;
294 int result = 0;
295
296 dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
297
298 if (unlikely(data == NULL ||
299 data->acpi_data == NULL || data->freq_table == NULL)) {
300 return -ENODEV;
301 }
302
303 perf = data->acpi_data;
304 result = cpufreq_frequency_table_target(policy,
305 data->freq_table,
306 target_freq,
307 relation, &next_state);
308 if (unlikely(result)) {
309 result = -ENODEV;
310 goto out;
311 }
312
313 next_perf_state = data->freq_table[next_state].index;
314 if (perf->state == next_perf_state) {
315 if (unlikely(data->resume)) {
316 dprintk("Called after resume, resetting to P%d\n",
317 next_perf_state);
318 data->resume = 0;
319 } else {
320 dprintk("Already at target state (P%d)\n",
321 next_perf_state);
322 goto out;
323 }
324 }
325
326 switch (data->cpu_feature) {
327 case SYSTEM_INTEL_MSR_CAPABLE:
328 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
329 cmd.addr.msr.reg = MSR_IA32_PERF_CTL;
330 cmd.val = (u32) perf->states[next_perf_state].control;
331 break;
332 case SYSTEM_IO_CAPABLE:
333 cmd.type = SYSTEM_IO_CAPABLE;
334 cmd.addr.io.port = perf->control_register.address;
335 cmd.addr.io.bit_width = perf->control_register.bit_width;
336 cmd.val = (u32) perf->states[next_perf_state].control;
337 break;
338 default:
339 result = -ENODEV;
340 goto out;
341 }
342
343 /* cpufreq holds the hotplug lock, so we are safe from here on */
344 if (policy->shared_type != CPUFREQ_SHARED_TYPE_ANY)
345 cmd.mask = policy->cpus;
346 else
347 cmd.mask = cpumask_of(policy->cpu);
348
349 freqs.old = perf->states[perf->state].core_frequency * 1000;
350 freqs.new = data->freq_table[next_state].frequency;
351 for_each_cpu(i, policy->cpus) {
352 freqs.cpu = i;
353 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
354 }
355
356 drv_write(&cmd);
357
358 if (acpi_pstate_strict) {
359 if (!check_freqs(cmd.mask, freqs.new, data)) {
360 dprintk("acpi_cpufreq_target failed (%d)\n",
361 policy->cpu);
362 result = -EAGAIN;
363 goto out;
364 }
365 }
366
367 for_each_cpu(i, policy->cpus) {
368 freqs.cpu = i;
369 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
370 }
371 perf->state = next_perf_state;
372
373out:
374 return result;
375}
376
377static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
378{
379 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
380
381 dprintk("acpi_cpufreq_verify\n");
382
383 return cpufreq_frequency_table_verify(policy, data->freq_table);
384}
385
386static unsigned long
387acpi_cpufreq_guess_freq(struct acpi_cpufreq_data *data, unsigned int cpu)
388{
389 struct acpi_processor_performance *perf = data->acpi_data;
390
391 if (cpu_khz) {
392 /* search the closest match to cpu_khz */
393 unsigned int i;
394 unsigned long freq;
395 unsigned long freqn = perf->states[0].core_frequency * 1000;
396
397 for (i = 0; i < (perf->state_count-1); i++) {
398 freq = freqn;
399 freqn = perf->states[i+1].core_frequency * 1000;
400 if ((2 * cpu_khz) > (freqn + freq)) {
401 perf->state = i;
402 return freq;
403 }
404 }
405 perf->state = perf->state_count-1;
406 return freqn;
407 } else {
408 /* assume CPU is at P0... */
409 perf->state = 0;
410 return perf->states[0].core_frequency * 1000;
411 }
412}
413
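
acpi_cpufreq_guess_freq() above orders P-states from fastest to slowest and picks the first state whose midpoint with the next slower state lies below cpu_khz, i.e. 2 * cpu_khz > freq[i] + freq[i+1] selects the state closest to the measured frequency. A hedged sketch of that midpoint test with made-up table values:

    /* Hedged sketch of the closest-P-state selection above; the frequency
     * table values are invented. */
    #include <stdio.h>

    int main(void)
    {
        unsigned long states_khz[] = { 2600000, 2400000, 1800000, 800000 };
        unsigned long cpu_khz = 2350000;
        unsigned int n = 4, i;

        for (i = 0; i < n - 1; i++) {
            /* Pick state i once cpu_khz is above the midpoint to the next state. */
            if (2 * cpu_khz > states_khz[i] + states_khz[i + 1]) {
                printf("closest P-state: P%u (%lu kHz)\n", i, states_khz[i]);
                return 0;
            }
        }
        printf("closest P-state: P%u (%lu kHz)\n", n - 1, states_khz[n - 1]);
        return 0;
    }
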
414static void free_acpi_perf_data(void)
415{
416 unsigned int i;
417
418 /* Freeing a NULL pointer is OK, and alloc_percpu zeroes. */
419 for_each_possible_cpu(i)
420 free_cpumask_var(per_cpu_ptr(acpi_perf_data, i)
421 ->shared_cpu_map);
422 free_percpu(acpi_perf_data);
423}
424
425/*
426 * acpi_cpufreq_early_init - initialize ACPI P-States library
427 *
428 * Initialize the ACPI P-States library (drivers/acpi/processor_perflib.c)
429 * in order to determine correct frequency and voltage pairings. We can
430 * do _PDC and _PSD and find out the processor dependency for the
431 * actual init that will happen later...
432 */
433static int __init acpi_cpufreq_early_init(void)
434{
435 unsigned int i;
436 dprintk("acpi_cpufreq_early_init\n");
437
438 acpi_perf_data = alloc_percpu(struct acpi_processor_performance);
439 if (!acpi_perf_data) {
440 dprintk("Memory allocation error for acpi_perf_data.\n");
441 return -ENOMEM;
442 }
443 for_each_possible_cpu(i) {
444 if (!zalloc_cpumask_var_node(
445 &per_cpu_ptr(acpi_perf_data, i)->shared_cpu_map,
446 GFP_KERNEL, cpu_to_node(i))) {
447
448 /* Freeing a NULL pointer is OK: alloc_percpu zeroes. */
449 free_acpi_perf_data();
450 return -ENOMEM;
451 }
452 }
453
454 /* Do initialization in ACPI core */
455 acpi_processor_preregister_performance(acpi_perf_data);
456 return 0;
457}
458
459#ifdef CONFIG_SMP
460/*
461 * Some BIOSes do SW_ANY coordination internally, either set it up in hw
462 * or do it in BIOS firmware and won't inform about it to OS. If not
463 * detected, this has a side effect of making CPU run at a different speed
464 * than OS intended it to run at. Detect it and handle it cleanly.
465 */
466static int bios_with_sw_any_bug;
467
468static int sw_any_bug_found(const struct dmi_system_id *d)
469{
470 bios_with_sw_any_bug = 1;
471 return 0;
472}
473
474static const struct dmi_system_id sw_any_bug_dmi_table[] = {
475 {
476 .callback = sw_any_bug_found,
477 .ident = "Supermicro Server X6DLP",
478 .matches = {
479 DMI_MATCH(DMI_SYS_VENDOR, "Supermicro"),
480 DMI_MATCH(DMI_BIOS_VERSION, "080010"),
481 DMI_MATCH(DMI_PRODUCT_NAME, "X6DLP"),
482 },
483 },
484 { }
485};
486
487static int acpi_cpufreq_blacklist(struct cpuinfo_x86 *c)
488{
489 /* Intel Xeon Processor 7100 Series Specification Update
490 * http://www.intel.com/Assets/PDF/specupdate/314554.pdf
491 * AL30: A Machine Check Exception (MCE) Occurring during an
492 * Enhanced Intel SpeedStep Technology Ratio Change May Cause
493 * Both Processor Cores to Lock Up. */
494 if (c->x86_vendor == X86_VENDOR_INTEL) {
495 if ((c->x86 == 15) &&
496 (c->x86_model == 6) &&
497 (c->x86_mask == 8)) {
498 printk(KERN_INFO "acpi-cpufreq: Intel(R) "
499 "Xeon(R) 7100 Errata AL30, processors may "
500 "lock up on frequency changes: disabling "
501 "acpi-cpufreq.\n");
502 return -ENODEV;
503 }
504 }
505 return 0;
506}
507#endif
508
509static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
510{
511 unsigned int i;
512 unsigned int valid_states = 0;
513 unsigned int cpu = policy->cpu;
514 struct acpi_cpufreq_data *data;
515 unsigned int result = 0;
516 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
517 struct acpi_processor_performance *perf;
518#ifdef CONFIG_SMP
519 static int blacklisted;
520#endif
521
522 dprintk("acpi_cpufreq_cpu_init\n");
523
524#ifdef CONFIG_SMP
525 if (blacklisted)
526 return blacklisted;
527 blacklisted = acpi_cpufreq_blacklist(c);
528 if (blacklisted)
529 return blacklisted;
530#endif
531
532 data = kzalloc(sizeof(struct acpi_cpufreq_data), GFP_KERNEL);
533 if (!data)
534 return -ENOMEM;
535
536 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
537 per_cpu(acfreq_data, cpu) = data;
538
539 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
540 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
541
542 result = acpi_processor_register_performance(data->acpi_data, cpu);
543 if (result)
544 goto err_free;
545
546 perf = data->acpi_data;
547 policy->shared_type = perf->shared_type;
548
549 /*
550 * Will let policy->cpus know about dependency only when software
551 * coordination is required.
552 */
553 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ALL ||
554 policy->shared_type == CPUFREQ_SHARED_TYPE_ANY) {
555 cpumask_copy(policy->cpus, perf->shared_cpu_map);
556 }
557 cpumask_copy(policy->related_cpus, perf->shared_cpu_map);
558
559#ifdef CONFIG_SMP
560 dmi_check_system(sw_any_bug_dmi_table);
561 if (bios_with_sw_any_bug && cpumask_weight(policy->cpus) == 1) {
562 policy->shared_type = CPUFREQ_SHARED_TYPE_ALL;
563 cpumask_copy(policy->cpus, cpu_core_mask(cpu));
564 }
565#endif
566
567 /* capability check */
568 if (perf->state_count <= 1) {
569 dprintk("No P-States\n");
570 result = -ENODEV;
571 goto err_unreg;
572 }
573
574 if (perf->control_register.space_id != perf->status_register.space_id) {
575 result = -ENODEV;
576 goto err_unreg;
577 }
578
579 switch (perf->control_register.space_id) {
580 case ACPI_ADR_SPACE_SYSTEM_IO:
581 dprintk("SYSTEM IO addr space\n");
582 data->cpu_feature = SYSTEM_IO_CAPABLE;
583 break;
584 case ACPI_ADR_SPACE_FIXED_HARDWARE:
585 dprintk("HARDWARE addr space\n");
586 if (!check_est_cpu(cpu)) {
587 result = -ENODEV;
588 goto err_unreg;
589 }
590 data->cpu_feature = SYSTEM_INTEL_MSR_CAPABLE;
591 break;
592 default:
593 dprintk("Unknown addr space %d\n",
594 (u32) (perf->control_register.space_id));
595 result = -ENODEV;
596 goto err_unreg;
597 }
598
599 data->freq_table = kmalloc(sizeof(struct cpufreq_frequency_table) *
600 (perf->state_count+1), GFP_KERNEL);
601 if (!data->freq_table) {
602 result = -ENOMEM;
603 goto err_unreg;
604 }
605
606 /* detect transition latency */
607 policy->cpuinfo.transition_latency = 0;
608 for (i = 0; i < perf->state_count; i++) {
609 if ((perf->states[i].transition_latency * 1000) >
610 policy->cpuinfo.transition_latency)
611 policy->cpuinfo.transition_latency =
612 perf->states[i].transition_latency * 1000;
613 }
614
615 /* Check for high latency (>20uS) from buggy BIOSes, like on T42 */
616 if (perf->control_register.space_id == ACPI_ADR_SPACE_FIXED_HARDWARE &&
617 policy->cpuinfo.transition_latency > 20 * 1000) {
618 policy->cpuinfo.transition_latency = 20 * 1000;
619 printk_once(KERN_INFO
620 "P-state transition latency capped at 20 uS\n");
621 }
622
623 /* table init */
624 for (i = 0; i < perf->state_count; i++) {
625 if (i > 0 && perf->states[i].core_frequency >=
626 data->freq_table[valid_states-1].frequency / 1000)
627 continue;
628
629 data->freq_table[valid_states].index = i;
630 data->freq_table[valid_states].frequency =
631 perf->states[i].core_frequency * 1000;
632 valid_states++;
633 }
634 data->freq_table[valid_states].frequency = CPUFREQ_TABLE_END;
635 perf->state = 0;
636
637 result = cpufreq_frequency_table_cpuinfo(policy, data->freq_table);
638 if (result)
639 goto err_freqfree;
640
641 if (perf->states[0].core_frequency * 1000 != policy->cpuinfo.max_freq)
642 printk(KERN_WARNING FW_WARN "P-state 0 is not max freq\n");
643
644 switch (perf->control_register.space_id) {
645 case ACPI_ADR_SPACE_SYSTEM_IO:
646 /* Current speed is unknown and not detectable by IO port */
647 policy->cur = acpi_cpufreq_guess_freq(data, policy->cpu);
648 break;
649 case ACPI_ADR_SPACE_FIXED_HARDWARE:
650 acpi_cpufreq_driver.get = get_cur_freq_on_cpu;
651 policy->cur = get_cur_freq_on_cpu(cpu);
652 break;
653 default:
654 break;
655 }
656
657 /* notify BIOS that we exist */
658 acpi_processor_notify_smm(THIS_MODULE);
659
660 /* Check for APERF/MPERF support in hardware */
661 if (cpu_has(c, X86_FEATURE_APERFMPERF))
662 acpi_cpufreq_driver.getavg = cpufreq_get_measured_perf;
663
664 dprintk("CPU%u - ACPI performance management activated.\n", cpu);
665 for (i = 0; i < perf->state_count; i++)
666 dprintk(" %cP%d: %d MHz, %d mW, %d uS\n",
667 (i == perf->state ? '*' : ' '), i,
668 (u32) perf->states[i].core_frequency,
669 (u32) perf->states[i].power,
670 (u32) perf->states[i].transition_latency);
671
672 cpufreq_frequency_table_get_attr(data->freq_table, policy->cpu);
673
674 /*
675 * the first call to ->target() should result in us actually
676 * writing something to the appropriate registers.
677 */
678 data->resume = 1;
679
680 return result;
681
682err_freqfree:
683 kfree(data->freq_table);
684err_unreg:
685 acpi_processor_unregister_performance(perf, cpu);
686err_free:
687 kfree(data);
688 per_cpu(acfreq_data, cpu) = NULL;
689
690 return result;
691}
692
693static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
694{
695 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
696
697 dprintk("acpi_cpufreq_cpu_exit\n");
698
699 if (data) {
700 cpufreq_frequency_table_put_attr(policy->cpu);
701 per_cpu(acfreq_data, policy->cpu) = NULL;
702 acpi_processor_unregister_performance(data->acpi_data,
703 policy->cpu);
704 kfree(data);
705 }
706
707 return 0;
708}
709
710static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
711{
712 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
713
714 dprintk("acpi_cpufreq_resume\n");
715
716 data->resume = 1;
717
718 return 0;
719}
720
721static struct freq_attr *acpi_cpufreq_attr[] = {
722 &cpufreq_freq_attr_scaling_available_freqs,
723 NULL,
724};
725
726static struct cpufreq_driver acpi_cpufreq_driver = {
727 .verify = acpi_cpufreq_verify,
728 .target = acpi_cpufreq_target,
729 .bios_limit = acpi_processor_get_bios_limit,
730 .init = acpi_cpufreq_cpu_init,
731 .exit = acpi_cpufreq_cpu_exit,
732 .resume = acpi_cpufreq_resume,
733 .name = "acpi-cpufreq",
734 .owner = THIS_MODULE,
735 .attr = acpi_cpufreq_attr,
736};
737
738static int __init acpi_cpufreq_init(void)
739{
740 int ret;
741
742 if (acpi_disabled)
743 return 0;
744
745 dprintk("acpi_cpufreq_init\n");
746
747 ret = acpi_cpufreq_early_init();
748 if (ret)
749 return ret;
750
751 ret = cpufreq_register_driver(&acpi_cpufreq_driver);
752 if (ret)
753 free_acpi_perf_data();
754
755 return ret;
756}
757
758static void __exit acpi_cpufreq_exit(void)
759{
760 dprintk("acpi_cpufreq_exit\n");
761
762 cpufreq_unregister_driver(&acpi_cpufreq_driver);
763
764 free_percpu(acpi_perf_data);
765}
766
767module_param(acpi_pstate_strict, uint, 0644);
768MODULE_PARM_DESC(acpi_pstate_strict,
769 "value 0 or non-zero. non-zero -> strict ACPI checks are "
770 "performed during frequency changes.");
771
772late_initcall(acpi_cpufreq_init);
773module_exit(acpi_cpufreq_exit);
774
775MODULE_ALIAS("acpi");
diff --git a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c b/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
deleted file mode 100644
index 733093d60436..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/cpufreq-nforce2.c
+++ /dev/null
@@ -1,446 +0,0 @@
1/*
2 * (C) 2004-2006 Sebastian Witt <se.witt@gmx.net>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 * Based upon reverse engineered information
6 *
7 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
8 */
9
10#include <linux/kernel.h>
11#include <linux/module.h>
12#include <linux/moduleparam.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/pci.h>
16#include <linux/delay.h>
17
18#define NFORCE2_XTAL 25
19#define NFORCE2_BOOTFSB 0x48
20#define NFORCE2_PLLENABLE 0xa8
21#define NFORCE2_PLLREG 0xa4
22#define NFORCE2_PLLADR 0xa0
23#define NFORCE2_PLL(mul, div) (0x100000 | (mul << 8) | div)
24
25#define NFORCE2_MIN_FSB 50
26#define NFORCE2_SAFE_DISTANCE 50
27
28/* Delay in ms between FSB changes */
29/* #define NFORCE2_DELAY 10 */
30
31/*
32 * nforce2_chipset:
33 * FSB is changed using the chipset
34 */
35static struct pci_dev *nforce2_dev;
36
37/* fid:
38 * multiplier * 10
39 */
40static int fid;
41
42/* min_fsb, max_fsb:
43 * minimum and maximum FSB (= FSB at boot time)
44 */
45static int min_fsb;
46static int max_fsb;
47
48MODULE_AUTHOR("Sebastian Witt <se.witt@gmx.net>");
49MODULE_DESCRIPTION("nForce2 FSB changing cpufreq driver");
50MODULE_LICENSE("GPL");
51
52module_param(fid, int, 0444);
53module_param(min_fsb, int, 0444);
54
55MODULE_PARM_DESC(fid, "CPU multiplier to use (11.5 = 115)");
56MODULE_PARM_DESC(min_fsb,
57 "Minimum FSB to use, if not defined: current FSB - 50");
58
59#define PFX "cpufreq-nforce2: "
60#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
61 "cpufreq-nforce2", msg)
62
63/**
64 * nforce2_calc_fsb - calculate FSB
65 * @pll: PLL value
66 *
67 * Calculates FSB from PLL value
68 */
69static int nforce2_calc_fsb(int pll)
70{
71 unsigned char mul, div;
72
73 mul = (pll >> 8) & 0xff;
74 div = pll & 0xff;
75
76 if (div > 0)
77 return NFORCE2_XTAL * mul / div;
78
79 return 0;
80}
81
82/**
83 * nforce2_calc_pll - calculate PLL value
84 * @fsb: FSB
85 *
86 * Calculate PLL value for given FSB
87 */
88static int nforce2_calc_pll(unsigned int fsb)
89{
90 unsigned char xmul, xdiv;
91 unsigned char mul = 0, div = 0;
92 int tried = 0;
93
94 /* Try to calculate multiplier and divider up to 4 times */
95 while (((mul == 0) || (div == 0)) && (tried <= 3)) {
96 for (xdiv = 2; xdiv <= 0x80; xdiv++)
97 for (xmul = 1; xmul <= 0xfe; xmul++)
98 if (nforce2_calc_fsb(NFORCE2_PLL(xmul, xdiv)) ==
99 fsb + tried) {
100 mul = xmul;
101 div = xdiv;
102 }
103 tried++;
104 }
105
106 if ((mul == 0) || (div == 0))
107 return -1;
108
109 return NFORCE2_PLL(mul, div);
110}
111
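
nforce2_calc_fsb() and nforce2_calc_pll() above treat the FSB as NFORCE2_XTAL * mul / div around the 25 MHz crystal, with mul and div packed into a single PLL register word. A hedged round-trip check of that arithmetic with illustrative values:

    /* Hedged sketch of the PLL <-> FSB arithmetic above; values are illustrative. */
    #include <stdio.h>

    #define NFORCE2_XTAL 25
    #define NFORCE2_PLL(mul, div) (0x100000 | ((mul) << 8) | (div))

    static int calc_fsb(int pll)
    {
        unsigned char mul = (pll >> 8) & 0xff;
        unsigned char div = pll & 0xff;

        return div ? NFORCE2_XTAL * mul / div : 0;
    }

    int main(void)
    {
        int pll = NFORCE2_PLL(8, 2);        /* 25 * 8 / 2 = 100 MHz FSB */

        printf("pll=0x%x fsb=%d MHz\n", pll, calc_fsb(pll));
        return 0;
    }
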
112/**
113 * nforce2_write_pll - write PLL value to chipset
114 * @pll: PLL value
115 *
116 * Writes new FSB PLL value to chipset
117 */
118static void nforce2_write_pll(int pll)
119{
120 int temp;
121
122 /* Set the pll addr. to 0x00 */
123 pci_write_config_dword(nforce2_dev, NFORCE2_PLLADR, 0);
124
125 /* Now write the value in all 64 registers */
126 for (temp = 0; temp <= 0x3f; temp++)
127 pci_write_config_dword(nforce2_dev, NFORCE2_PLLREG, pll);
128
129 return;
130}
131
132/**
133 * nforce2_fsb_read - Read FSB
134 *
135 * Read FSB from chipset
136 * If bootfsb != 0, return FSB at boot-time
137 */
138static unsigned int nforce2_fsb_read(int bootfsb)
139{
140 struct pci_dev *nforce2_sub5;
141 u32 fsb, temp = 0;
142
143 /* Get chipset boot FSB from subdevice 5 (FSB at boot-time) */
144 nforce2_sub5 = pci_get_subsys(PCI_VENDOR_ID_NVIDIA, 0x01EF,
145 PCI_ANY_ID, PCI_ANY_ID, NULL);
146 if (!nforce2_sub5)
147 return 0;
148
149 pci_read_config_dword(nforce2_sub5, NFORCE2_BOOTFSB, &fsb);
150 fsb /= 1000000;
151
152 /* Check if PLL register is already set */
153 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
154
155 if (bootfsb || !temp)
156 return fsb;
157
158 /* Use PLL register FSB value */
159 pci_read_config_dword(nforce2_dev, NFORCE2_PLLREG, &temp);
160 fsb = nforce2_calc_fsb(temp);
161
162 return fsb;
163}
164
165/**
166 * nforce2_set_fsb - set new FSB
167 * @fsb: New FSB
168 *
169 * Sets new FSB
170 */
171static int nforce2_set_fsb(unsigned int fsb)
172{
173 u32 temp = 0;
174 unsigned int tfsb;
175 int diff;
176 int pll = 0;
177
178 if ((fsb > max_fsb) || (fsb < NFORCE2_MIN_FSB)) {
179 printk(KERN_ERR PFX "FSB %d is out of range!\n", fsb);
180 return -EINVAL;
181 }
182
183 tfsb = nforce2_fsb_read(0);
184 if (!tfsb) {
185 printk(KERN_ERR PFX "Error while reading the FSB\n");
186 return -EINVAL;
187 }
188
189 /* First write? Then set actual value */
190 pci_read_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8 *)&temp);
191 if (!temp) {
192 pll = nforce2_calc_pll(tfsb);
193
194 if (pll < 0)
195 return -EINVAL;
196
197 nforce2_write_pll(pll);
198 }
199
200 /* Enable write access */
201 temp = 0x01;
202 pci_write_config_byte(nforce2_dev, NFORCE2_PLLENABLE, (u8)temp);
203
204 diff = tfsb - fsb;
205
206 if (!diff)
207 return 0;
208
209 while ((tfsb != fsb) && (tfsb <= max_fsb) && (tfsb >= min_fsb)) {
210 if (diff < 0)
211 tfsb++;
212 else
213 tfsb--;
214
215 /* Calculate the PLL reg. value */
216 pll = nforce2_calc_pll(tfsb);
217 if (pll == -1)
218 return -EINVAL;
219
220 nforce2_write_pll(pll);
221#ifdef NFORCE2_DELAY
222 mdelay(NFORCE2_DELAY);
223#endif
224 }
225
226 temp = 0x40;
227 pci_write_config_byte(nforce2_dev, NFORCE2_PLLADR, (u8)temp);
228
229 return 0;
230}
231
232/**
233 * nforce2_get - get the CPU frequency
234 * @cpu: CPU number
235 *
236 * Returns the CPU frequency
237 */
238static unsigned int nforce2_get(unsigned int cpu)
239{
240 if (cpu)
241 return 0;
242 return nforce2_fsb_read(0) * fid * 100;
243}
244
245/**
246 * nforce2_target - set a new CPUFreq policy
247 * @policy: new policy
248 * @target_freq: the target frequency
249 * @relation: how that frequency relates to achieved frequency
250 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
251 *
252 * Sets a new CPUFreq policy.
253 */
254static int nforce2_target(struct cpufreq_policy *policy,
255 unsigned int target_freq, unsigned int relation)
256{
257/* unsigned long flags; */
258 struct cpufreq_freqs freqs;
259 unsigned int target_fsb;
260
261 if ((target_freq > policy->max) || (target_freq < policy->min))
262 return -EINVAL;
263
264 target_fsb = target_freq / (fid * 100);
265
266 freqs.old = nforce2_get(policy->cpu);
267 freqs.new = target_fsb * fid * 100;
268 freqs.cpu = 0; /* Only one CPU on nForce2 platforms */
269
270 if (freqs.old == freqs.new)
271 return 0;
272
273 dprintk("Old CPU frequency %d kHz, new %d kHz\n",
274 freqs.old, freqs.new);
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 /* Disable IRQs */
279 /* local_irq_save(flags); */
280
281 if (nforce2_set_fsb(target_fsb) < 0)
282 printk(KERN_ERR PFX "Changing FSB to %d failed\n",
283 target_fsb);
284 else
285 dprintk("Changed FSB successfully to %d\n",
286 target_fsb);
287
288 /* Enable IRQs */
289 /* local_irq_restore(flags); */
290
291 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
292
293 return 0;
294}
295
296/**
297 * nforce2_verify - verifies a new CPUFreq policy
298 * @policy: new policy
299 */
300static int nforce2_verify(struct cpufreq_policy *policy)
301{
302 unsigned int fsb_pol_max;
303
304 fsb_pol_max = policy->max / (fid * 100);
305
306 if (policy->min < (fsb_pol_max * fid * 100))
307 policy->max = (fsb_pol_max + 1) * fid * 100;
308
309 cpufreq_verify_within_limits(policy,
310 policy->cpuinfo.min_freq,
311 policy->cpuinfo.max_freq);
312 return 0;
313}
314
315static int nforce2_cpu_init(struct cpufreq_policy *policy)
316{
317 unsigned int fsb;
318 unsigned int rfid;
319
320 /* capability check */
321 if (policy->cpu != 0)
322 return -ENODEV;
323
324 /* Get current FSB */
325 fsb = nforce2_fsb_read(0);
326
327 if (!fsb)
328 return -EIO;
329
330 /* FIX: Get FID from CPU */
331 if (!fid) {
332 if (!cpu_khz) {
333 printk(KERN_WARNING PFX
334 "cpu_khz not set, can't calculate multiplier!\n");
335 return -ENODEV;
336 }
337
338 fid = cpu_khz / (fsb * 100);
339 rfid = fid % 5;
340
341 if (rfid) {
342 if (rfid > 2)
343 fid += 5 - rfid;
344 else
345 fid -= rfid;
346 }
347 }
348
349 printk(KERN_INFO PFX "FSB currently at %i MHz, FID %d.%d\n", fsb,
350 fid / 10, fid % 10);
351
352 /* Set maximum FSB to FSB at boot time */
353 max_fsb = nforce2_fsb_read(1);
354
355 if (!max_fsb)
356 return -EIO;
357
358 if (!min_fsb)
359 min_fsb = max_fsb - NFORCE2_SAFE_DISTANCE;
360
361 if (min_fsb < NFORCE2_MIN_FSB)
362 min_fsb = NFORCE2_MIN_FSB;
363
364 /* cpuinfo and default policy values */
365 policy->cpuinfo.min_freq = min_fsb * fid * 100;
366 policy->cpuinfo.max_freq = max_fsb * fid * 100;
367 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
368 policy->cur = nforce2_get(policy->cpu);
369 policy->min = policy->cpuinfo.min_freq;
370 policy->max = policy->cpuinfo.max_freq;
371
372 return 0;
373}
374
375static int nforce2_cpu_exit(struct cpufreq_policy *policy)
376{
377 return 0;
378}
379
380static struct cpufreq_driver nforce2_driver = {
381 .name = "nforce2",
382 .verify = nforce2_verify,
383 .target = nforce2_target,
384 .get = nforce2_get,
385 .init = nforce2_cpu_init,
386 .exit = nforce2_cpu_exit,
387 .owner = THIS_MODULE,
388};
389
390/**
391 * nforce2_detect_chipset - detect the Southbridge which contains FSB PLL logic
392 *
393 * Detects nForce2 A2 and C1 stepping
394 *
395 */
396static unsigned int nforce2_detect_chipset(void)
397{
398 nforce2_dev = pci_get_subsys(PCI_VENDOR_ID_NVIDIA,
399 PCI_DEVICE_ID_NVIDIA_NFORCE2,
400 PCI_ANY_ID, PCI_ANY_ID, NULL);
401
402 if (nforce2_dev == NULL)
403 return -ENODEV;
404
405 printk(KERN_INFO PFX "Detected nForce2 chipset revision %X\n",
406 nforce2_dev->revision);
407 printk(KERN_INFO PFX
408 "FSB changing is maybe unstable and can lead to "
409 "crashes and data loss.\n");
410
411 return 0;
412}
413
414/**
415 * nforce2_init - initializes the nForce2 CPUFreq driver
416 *
417 * Initializes the nForce2 FSB support. Returns -ENODEV on unsupported
418 * devices, -EINVAL on problems during initialization, and zero on
419 * success.
420 */
421static int __init nforce2_init(void)
422{
423 /* TODO: do we need to detect the processor? */
424
425 /* detect chipset */
426 if (nforce2_detect_chipset()) {
427 printk(KERN_INFO PFX "No nForce2 chipset.\n");
428 return -ENODEV;
429 }
430
431 return cpufreq_register_driver(&nforce2_driver);
432}
433
434/**
435 * nforce2_exit - unregisters cpufreq module
436 *
437 * Unregisters nForce2 FSB change support.
438 */
439static void __exit nforce2_exit(void)
440{
441 cpufreq_unregister_driver(&nforce2_driver);
442}
443
444module_init(nforce2_init);
445module_exit(nforce2_exit);
446
diff --git a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c b/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
deleted file mode 100644
index 35a257dd4bb7..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/e_powersaver.c
+++ /dev/null
@@ -1,367 +0,0 @@
1/*
2 * Based on documentation provided by Dave Jones. Thanks!
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/ioport.h>
14#include <linux/slab.h>
15#include <linux/timex.h>
16#include <linux/io.h>
17#include <linux/delay.h>
18
19#include <asm/msr.h>
20#include <asm/tsc.h>
21
22#define EPS_BRAND_C7M 0
23#define EPS_BRAND_C7 1
24#define EPS_BRAND_EDEN 2
25#define EPS_BRAND_C3 3
26#define EPS_BRAND_C7D 4
27
28struct eps_cpu_data {
29 u32 fsb;
30 struct cpufreq_frequency_table freq_table[];
31};
32
33static struct eps_cpu_data *eps_cpu[NR_CPUS];
34
35
36static unsigned int eps_get(unsigned int cpu)
37{
38 struct eps_cpu_data *centaur;
39 u32 lo, hi;
40
41 if (cpu)
42 return 0;
43 centaur = eps_cpu[cpu];
44 if (centaur == NULL)
45 return 0;
46
47 /* Return current frequency */
48 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
49 return centaur->fsb * ((lo >> 8) & 0xff);
50}
51
52static int eps_set_state(struct eps_cpu_data *centaur,
53 unsigned int cpu,
54 u32 dest_state)
55{
56 struct cpufreq_freqs freqs;
57 u32 lo, hi;
58 int err = 0;
59 int i;
60
61 freqs.old = eps_get(cpu);
62 freqs.new = centaur->fsb * ((dest_state >> 8) & 0xff);
63 freqs.cpu = cpu;
64 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
65
66 /* Wait while CPU is busy */
67 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
68 i = 0;
69 while (lo & ((1 << 16) | (1 << 17))) {
70 udelay(16);
71 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
72 i++;
73 if (unlikely(i > 64)) {
74 err = -ENODEV;
75 goto postchange;
76 }
77 }
78 /* Set new multiplier and voltage */
79 wrmsr(MSR_IA32_PERF_CTL, dest_state & 0xffff, 0);
80 /* Wait until transition end */
81 i = 0;
82 do {
83 udelay(16);
84 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
85 i++;
86 if (unlikely(i > 64)) {
87 err = -ENODEV;
88 goto postchange;
89 }
90 } while (lo & ((1 << 16) | (1 << 17)));
91
92 /* Return current frequency */
93postchange:
94 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
95 freqs.new = centaur->fsb * ((lo >> 8) & 0xff);
96
97#ifdef DEBUG
98 {
99 u8 current_multiplier, current_voltage;
100
101 /* Print voltage and multiplier */
102 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
103 current_voltage = lo & 0xff;
104 printk(KERN_INFO "eps: Current voltage = %dmV\n",
105 current_voltage * 16 + 700);
106 current_multiplier = (lo >> 8) & 0xff;
107 printk(KERN_INFO "eps: Current multiplier = %d\n",
108 current_multiplier);
109 }
110#endif
111 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
112 return err;
113}
114
115static int eps_target(struct cpufreq_policy *policy,
116 unsigned int target_freq,
117 unsigned int relation)
118{
119 struct eps_cpu_data *centaur;
120 unsigned int newstate = 0;
121 unsigned int cpu = policy->cpu;
122 unsigned int dest_state;
123 int ret;
124
125 if (unlikely(eps_cpu[cpu] == NULL))
126 return -ENODEV;
127 centaur = eps_cpu[cpu];
128
129 if (unlikely(cpufreq_frequency_table_target(policy,
130 &eps_cpu[cpu]->freq_table[0],
131 target_freq,
132 relation,
133 &newstate))) {
134 return -EINVAL;
135 }
136
137 /* Make frequency transition */
138 dest_state = centaur->freq_table[newstate].index & 0xffff;
139 ret = eps_set_state(centaur, cpu, dest_state);
140 if (ret)
141 printk(KERN_ERR "eps: Timeout!\n");
142 return ret;
143}
144
145static int eps_verify(struct cpufreq_policy *policy)
146{
147 return cpufreq_frequency_table_verify(policy,
148 &eps_cpu[policy->cpu]->freq_table[0]);
149}
150
151static int eps_cpu_init(struct cpufreq_policy *policy)
152{
153 unsigned int i;
154 u32 lo, hi;
155 u64 val;
156 u8 current_multiplier, current_voltage;
157 u8 max_multiplier, max_voltage;
158 u8 min_multiplier, min_voltage;
159 u8 brand = 0;
160 u32 fsb;
161 struct eps_cpu_data *centaur;
162 struct cpuinfo_x86 *c = &cpu_data(0);
163 struct cpufreq_frequency_table *f_table;
164 int k, step, voltage;
165 int ret;
166 int states;
167
168 if (policy->cpu != 0)
169 return -ENODEV;
170
171 /* Check brand */
172 printk(KERN_INFO "eps: Detected VIA ");
173
174 switch (c->x86_model) {
175 case 10:
176 rdmsr(0x1153, lo, hi);
177 brand = (((lo >> 2) ^ lo) >> 18) & 3;
178 printk(KERN_CONT "Model A ");
179 break;
180 case 13:
181 rdmsr(0x1154, lo, hi);
182 brand = (((lo >> 4) ^ (lo >> 2))) & 0x000000ff;
183 printk(KERN_CONT "Model D ");
184 break;
185 }
186
187 switch (brand) {
188 case EPS_BRAND_C7M:
189 printk(KERN_CONT "C7-M\n");
190 break;
191 case EPS_BRAND_C7:
192 printk(KERN_CONT "C7\n");
193 break;
194 case EPS_BRAND_EDEN:
195 printk(KERN_CONT "Eden\n");
196 break;
197 case EPS_BRAND_C7D:
198 printk(KERN_CONT "C7-D\n");
199 break;
200 case EPS_BRAND_C3:
201 printk(KERN_CONT "C3\n");
202 return -ENODEV;
203 break;
204 }
205 /* Enable Enhanced PowerSaver */
206 rdmsrl(MSR_IA32_MISC_ENABLE, val);
207 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
208 val |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
209 wrmsrl(MSR_IA32_MISC_ENABLE, val);
210 /* Can be locked at 0 */
211 rdmsrl(MSR_IA32_MISC_ENABLE, val);
212 if (!(val & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
213 printk(KERN_INFO "eps: Can't enable Enhanced PowerSaver\n");
214 return -ENODEV;
215 }
216 }
217
218 /* Print voltage and multiplier */
219 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
220 current_voltage = lo & 0xff;
221 printk(KERN_INFO "eps: Current voltage = %dmV\n",
222 current_voltage * 16 + 700);
223 current_multiplier = (lo >> 8) & 0xff;
224 printk(KERN_INFO "eps: Current multiplier = %d\n", current_multiplier);
225
226 /* Print limits */
227 max_voltage = hi & 0xff;
228 printk(KERN_INFO "eps: Highest voltage = %dmV\n",
229 max_voltage * 16 + 700);
230 max_multiplier = (hi >> 8) & 0xff;
231 printk(KERN_INFO "eps: Highest multiplier = %d\n", max_multiplier);
232 min_voltage = (hi >> 16) & 0xff;
233 printk(KERN_INFO "eps: Lowest voltage = %dmV\n",
234 min_voltage * 16 + 700);
235 min_multiplier = (hi >> 24) & 0xff;
236 printk(KERN_INFO "eps: Lowest multiplier = %d\n", min_multiplier);
237
238 /* Sanity checks */
239 if (current_multiplier == 0 || max_multiplier == 0
240 || min_multiplier == 0)
241 return -EINVAL;
242 if (current_multiplier > max_multiplier
243 || max_multiplier <= min_multiplier)
244 return -EINVAL;
245 if (current_voltage > 0x1f || max_voltage > 0x1f)
246 return -EINVAL;
247 if (max_voltage < min_voltage)
248 return -EINVAL;
249
250 /* Calc FSB speed */
251 fsb = cpu_khz / current_multiplier;
252 /* Calc number of p-states supported */
253 if (brand == EPS_BRAND_C7M)
254 states = max_multiplier - min_multiplier + 1;
255 else
256 states = 2;
257
258 /* Allocate private data and frequency table for current cpu */
259 centaur = kzalloc(sizeof(struct eps_cpu_data)
260 + (states + 1) * sizeof(struct cpufreq_frequency_table),
261 GFP_KERNEL);
262 if (!centaur)
263 return -ENOMEM;
264 eps_cpu[0] = centaur;
265
266 /* Copy basic values */
267 centaur->fsb = fsb;
268
269 /* Fill frequency and MSR value table */
270 f_table = &centaur->freq_table[0];
271 if (brand != EPS_BRAND_C7M) {
272 f_table[0].frequency = fsb * min_multiplier;
273 f_table[0].index = (min_multiplier << 8) | min_voltage;
274 f_table[1].frequency = fsb * max_multiplier;
275 f_table[1].index = (max_multiplier << 8) | max_voltage;
276 f_table[2].frequency = CPUFREQ_TABLE_END;
277 } else {
278 k = 0;
279 step = ((max_voltage - min_voltage) * 256)
280 / (max_multiplier - min_multiplier);
281 for (i = min_multiplier; i <= max_multiplier; i++) {
282 voltage = (k * step) / 256 + min_voltage;
283 f_table[k].frequency = fsb * i;
284 f_table[k].index = (i << 8) | voltage;
285 k++;
286 }
287 f_table[k].frequency = CPUFREQ_TABLE_END;
288 }
289
290 policy->cpuinfo.transition_latency = 140000; /* 844mV -> 700mV in ns */
291 policy->cur = fsb * current_multiplier;
292
293 ret = cpufreq_frequency_table_cpuinfo(policy, &centaur->freq_table[0]);
294 if (ret) {
295 kfree(centaur);
296 return ret;
297 }
298
299 cpufreq_frequency_table_get_attr(&centaur->freq_table[0], policy->cpu);
300 return 0;
301}
302
303static int eps_cpu_exit(struct cpufreq_policy *policy)
304{
305 unsigned int cpu = policy->cpu;
306 struct eps_cpu_data *centaur;
307 u32 lo, hi;
308
309 if (eps_cpu[cpu] == NULL)
310 return -ENODEV;
311 centaur = eps_cpu[cpu];
312
313 /* Get max frequency */
314 rdmsr(MSR_IA32_PERF_STATUS, lo, hi);
315 /* Set max frequency */
316 eps_set_state(centaur, cpu, hi & 0xffff);
317 /* Bye */
318 cpufreq_frequency_table_put_attr(policy->cpu);
319 kfree(eps_cpu[cpu]);
320 eps_cpu[cpu] = NULL;
321 return 0;
322}
323
324static struct freq_attr *eps_attr[] = {
325 &cpufreq_freq_attr_scaling_available_freqs,
326 NULL,
327};
328
329static struct cpufreq_driver eps_driver = {
330 .verify = eps_verify,
331 .target = eps_target,
332 .init = eps_cpu_init,
333 .exit = eps_cpu_exit,
334 .get = eps_get,
335 .name = "e_powersaver",
336 .owner = THIS_MODULE,
337 .attr = eps_attr,
338};
339
340static int __init eps_init(void)
341{
342 struct cpuinfo_x86 *c = &cpu_data(0);
343
344 /* This driver will work only on Centaur C7 processors with
345 * Enhanced SpeedStep/PowerSaver registers */
346 if (c->x86_vendor != X86_VENDOR_CENTAUR
347 || c->x86 != 6 || c->x86_model < 10)
348 return -ENODEV;
349 if (!cpu_has(c, X86_FEATURE_EST))
350 return -ENODEV;
351
352 if (cpufreq_register_driver(&eps_driver))
353 return -EINVAL;
354 return 0;
355}
356
357static void __exit eps_exit(void)
358{
359 cpufreq_unregister_driver(&eps_driver);
360}
361
362MODULE_AUTHOR("Rafal Bilski <rafalbilski@interia.pl>");
363MODULE_DESCRIPTION("Enhanced PowerSaver driver for VIA C7 CPU's.");
364MODULE_LICENSE("GPL");
365
366module_init(eps_init);
367module_exit(eps_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
deleted file mode 100644
index c587db472a75..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ /dev/null
@@ -1,309 +0,0 @@
1/*
2 * elanfreq: cpufreq driver for the AMD ELAN family
3 *
4 * (c) Copyright 2002 Robert Schwebel <r.schwebel@pengutronix.de>
5 *
6 * Parts of this code are (c) Sven Geggus <sven@geggus.net>
7 *
8 * All Rights Reserved.
9 *
10 * This program is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU General Public License
12 * as published by the Free Software Foundation; either version
13 * 2 of the License, or (at your option) any later version.
14 *
15 * 2002-02-13: - initial revision for 2.4.18-pre9 by Robert Schwebel
16 *
17 */
18
19#include <linux/kernel.h>
20#include <linux/module.h>
21#include <linux/init.h>
22
23#include <linux/delay.h>
24#include <linux/cpufreq.h>
25
26#include <asm/msr.h>
27#include <linux/timex.h>
28#include <linux/io.h>
29
30#define REG_CSCIR 0x22 /* Chip Setup and Control Index Register */
31#define REG_CSCDR 0x23 /* Chip Setup and Control Data Register */
32
33/* Module parameter */
34static int max_freq;
35
36struct s_elan_multiplier {
37 int clock; /* frequency in kHz */
38 int val40h; /* PMU Force Mode register */
39 int val80h; /* CPU Clock Speed Register */
40};
41
42/*
43 * It is important that the frequencies
44 * are listed in ascending order here!
45 */
46static struct s_elan_multiplier elan_multiplier[] = {
47 {1000, 0x02, 0x18},
48 {2000, 0x02, 0x10},
49 {4000, 0x02, 0x08},
50 {8000, 0x00, 0x00},
51 {16000, 0x00, 0x02},
52 {33000, 0x00, 0x04},
53 {66000, 0x01, 0x04},
54 {99000, 0x01, 0x05}
55};
56
57static struct cpufreq_frequency_table elanfreq_table[] = {
58 {0, 1000},
59 {1, 2000},
60 {2, 4000},
61 {3, 8000},
62 {4, 16000},
63 {5, 33000},
64 {6, 66000},
65 {7, 99000},
66 {0, CPUFREQ_TABLE_END},
67};
68
69
70/**
71 * elanfreq_get_cpu_frequency: determine current cpu speed
72 *
73 * Finds out at which frequency the CPU of the Elan SOC runs
74 * at the moment. Frequencies from 1 to 33 MHz are generated
75 * the normal way; 66 and 99 MHz are called "Hyperspeed Mode"
76 * and have the rest of the chip running at 33 MHz.
77 */
78
79static unsigned int elanfreq_get_cpu_frequency(unsigned int cpu)
80{
81 u8 clockspeed_reg; /* Clock Speed Register */
82
83 local_irq_disable();
84 outb_p(0x80, REG_CSCIR);
85 clockspeed_reg = inb_p(REG_CSCDR);
86 local_irq_enable();
87
88 if ((clockspeed_reg & 0xE0) == 0xE0)
89 return 0;
90
91 /* Are we in CPU clock multiplied mode (66/99 MHz)? */
92 if ((clockspeed_reg & 0xE0) == 0xC0) {
93 if ((clockspeed_reg & 0x01) == 0)
94 return 66000;
95 else
96 return 99000;
97 }
98
99 /* 33 MHz is not 32 MHz... */
100 if ((clockspeed_reg & 0xE0) == 0xA0)
101 return 33000;
102
103 return (1<<((clockspeed_reg & 0xE0) >> 5)) * 1000;
104}
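As a quick illustration of the decoding described in the comment above, a stand-alone user-space sketch (not driver code; the register values are made-up examples):

#include <stdio.h>

static unsigned int decode_khz(unsigned char reg)
{
	if ((reg & 0xE0) == 0xE0)
		return 0;				/* invalid/reserved */
	if ((reg & 0xE0) == 0xC0)			/* "Hyperspeed Mode" */
		return (reg & 0x01) ? 99000 : 66000;
	if ((reg & 0xE0) == 0xA0)
		return 33000;				/* 33 MHz special case */
	return (1 << ((reg & 0xE0) >> 5)) * 1000;	/* 1..16 MHz */
}

int main(void)
{
	/* prints: 1000 33000 99000 */
	printf("%u %u %u\n", decode_khz(0x00), decode_khz(0xA0), decode_khz(0xC1));
	return 0;
}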
105
106
107/**
108 * elanfreq_set_cpu_state: Change the CPU core frequency
109 * @state: index into elan_multiplier[] that selects the
110 * new frequency/PMU register pair
111 *
112 * This function takes a frequency value and changes the CPU frequency
113 * according to this. Note that the frequency has to be checked by
114 * elanfreq_validatespeed() for correctness!
115 *
116 * There is no return value.
117 */
118
119static void elanfreq_set_cpu_state(unsigned int state)
120{
121 struct cpufreq_freqs freqs;
122
123 freqs.old = elanfreq_get_cpu_frequency(0);
124 freqs.new = elan_multiplier[state].clock;
125 freqs.cpu = 0; /* elanfreq.c is UP only driver */
126
127 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
128
129 printk(KERN_INFO "elanfreq: attempting to set frequency to %i kHz\n",
130 elan_multiplier[state].clock);
131
132
133 /*
134 * Access to the Elan's internal registers is indexed via
135 * 0x22: Chip Setup & Control Register Index Register (CSCI)
136 * 0x23: Chip Setup & Control Register Data Register (CSCD)
137 *
138 */
139
140 /*
141 * 0x40 is the Power Management Unit's Force Mode Register.
142 * Bit 6 enables Hyperspeed Mode (66/100 MHz core frequency)
143 */
144
145 local_irq_disable();
146 outb_p(0x40, REG_CSCIR); /* Disable hyperspeed mode */
147 outb_p(0x00, REG_CSCDR);
148 local_irq_enable(); /* wait till internal pipelines and */
149 udelay(1000); /* buffers have cleaned up */
150
151 local_irq_disable();
152
153 /* now, set the CPU clock speed register (0x80) */
154 outb_p(0x80, REG_CSCIR);
155 outb_p(elan_multiplier[state].val80h, REG_CSCDR);
156
157 /* now, the hyperspeed bit in PMU Force Mode Register (0x40) */
158 outb_p(0x40, REG_CSCIR);
159 outb_p(elan_multiplier[state].val40h, REG_CSCDR);
160 udelay(10000);
161 local_irq_enable();
162
163 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
164};
165
166
167/**
168 * elanfreq_validatespeed: test if frequency range is valid
169 * @policy: the policy to validate
170 *
171 * This function checks if a given frequency range in kHz is valid
172 * for the hardware supported by the driver.
173 */
174
175static int elanfreq_verify(struct cpufreq_policy *policy)
176{
177 return cpufreq_frequency_table_verify(policy, &elanfreq_table[0]);
178}
179
180static int elanfreq_target(struct cpufreq_policy *policy,
181 unsigned int target_freq,
182 unsigned int relation)
183{
184 unsigned int newstate = 0;
185
186 if (cpufreq_frequency_table_target(policy, &elanfreq_table[0],
187 target_freq, relation, &newstate))
188 return -EINVAL;
189
190 elanfreq_set_cpu_state(newstate);
191
192 return 0;
193}
194
195
196/*
197 * Module init and exit code
198 */
199
200static int elanfreq_cpu_init(struct cpufreq_policy *policy)
201{
202 struct cpuinfo_x86 *c = &cpu_data(0);
203 unsigned int i;
204 int result;
205
206 /* capability check */
207 if ((c->x86_vendor != X86_VENDOR_AMD) ||
208 (c->x86 != 4) || (c->x86_model != 10))
209 return -ENODEV;
210
211 /* max freq */
212 if (!max_freq)
213 max_freq = elanfreq_get_cpu_frequency(0);
214
215 /* table init */
216 for (i = 0; (elanfreq_table[i].frequency != CPUFREQ_TABLE_END); i++) {
217 if (elanfreq_table[i].frequency > max_freq)
218 elanfreq_table[i].frequency = CPUFREQ_ENTRY_INVALID;
219 }
220
221 /* cpuinfo and default policy values */
222 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
223 policy->cur = elanfreq_get_cpu_frequency(0);
224
225 result = cpufreq_frequency_table_cpuinfo(policy, elanfreq_table);
226 if (result)
227 return result;
228
229 cpufreq_frequency_table_get_attr(elanfreq_table, policy->cpu);
230 return 0;
231}
232
233
234static int elanfreq_cpu_exit(struct cpufreq_policy *policy)
235{
236 cpufreq_frequency_table_put_attr(policy->cpu);
237 return 0;
238}
239
240
241#ifndef MODULE
242/**
243 * elanfreq_setup - elanfreq command line parameter parsing
244 *
245 * elanfreq command line parameter. Use:
246 * elanfreq=66000
247 * to set the maximum CPU frequency to 66 MHz. Note that in
248 * case you do not give this boot parameter, the maximum
249 * frequency will fall back to _current_ CPU frequency which
250 * might be lower. If you build this as a module, use the
251 * max_freq module parameter instead.
252 */
253static int __init elanfreq_setup(char *str)
254{
255 max_freq = simple_strtoul(str, &str, 0);
256 printk(KERN_WARNING "You're using the deprecated elanfreq command line option. Use elanfreq.max_freq instead, please!\n");
257 return 1;
258}
259__setup("elanfreq=", elanfreq_setup);
260#endif
261
262
263static struct freq_attr *elanfreq_attr[] = {
264 &cpufreq_freq_attr_scaling_available_freqs,
265 NULL,
266};
267
268
269static struct cpufreq_driver elanfreq_driver = {
270 .get = elanfreq_get_cpu_frequency,
271 .verify = elanfreq_verify,
272 .target = elanfreq_target,
273 .init = elanfreq_cpu_init,
274 .exit = elanfreq_cpu_exit,
275 .name = "elanfreq",
276 .owner = THIS_MODULE,
277 .attr = elanfreq_attr,
278};
279
280
281static int __init elanfreq_init(void)
282{
283 struct cpuinfo_x86 *c = &cpu_data(0);
284
285 /* Test if we have the right hardware */
286 if ((c->x86_vendor != X86_VENDOR_AMD) ||
287 (c->x86 != 4) || (c->x86_model != 10)) {
288 printk(KERN_INFO "elanfreq: error: no Elan processor found!\n");
289 return -ENODEV;
290 }
291 return cpufreq_register_driver(&elanfreq_driver);
292}
293
294
295static void __exit elanfreq_exit(void)
296{
297 cpufreq_unregister_driver(&elanfreq_driver);
298}
299
300
301module_param(max_freq, int, 0444);
302
303MODULE_LICENSE("GPL");
304MODULE_AUTHOR("Robert Schwebel <r.schwebel@pengutronix.de>, "
305 "Sven Geggus <sven@geggus.net>");
306MODULE_DESCRIPTION("cpufreq driver for AMD's Elan CPUs");
307
308module_init(elanfreq_init);
309module_exit(elanfreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
deleted file mode 100644
index 32974cf84232..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ /dev/null
@@ -1,517 +0,0 @@
1/*
2 * Cyrix MediaGX and NatSemi Geode Suspend Modulation
3 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
4 * (C) 2002 Hiroshi Miura <miura@da-cha.org>
5 * All Rights Reserved
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * version 2 as published by the Free Software Foundation
10 *
11 * The author(s) of this software shall not be held liable for damages
12 * of any nature resulting due to the use of this software. This
13 * software is provided AS-IS with no warranties.
14 *
15 * Theoretical note:
16 *
17 * (see Geode(tm) CS5530 manual (rev.4.1) page.56)
18 *
19 * CPU frequency control on NatSemi Geode GX1/GXLV processor and CS55x0
20 * are based on Suspend Modulation.
21 *
22 * Suspend Modulation works by asserting and de-asserting the SUSP# pin
23 * to CPU(GX1/GXLV) for configurable durations. When asserting SUSP#
24 * the CPU enters an idle state. GX1 stops its core clock when SUSP# is
25 * asserted then power consumption is reduced.
26 *
27 * Suspend Modulation's OFF/ON duration are configurable
28 * with 'Suspend Modulation OFF Count Register'
29 * and 'Suspend Modulation ON Count Register'.
30 * These registers are 8bit counters that represent the number of
31 * 32us intervals which the SUSP# pin is asserted(ON)/de-asserted(OFF)
32 * to the processor.
33 *
34 * These counters define a ratio which is the effective frequency
35 * of operation of the system.
36 *
37 * OFF Count
38 * F_eff = Fgx * ----------------------
39 * OFF Count + ON Count
40 *
41 * 0 <= On Count, Off Count <= 255
42 *
43 * From these limits, we can get register values
44 *
45 * off_duration + on_duration <= MAX_DURATION
46 * on_duration = off_duration * (stock_freq - freq) / freq
47 *
48 * off_duration = (freq * DURATION) / stock_freq
49 * on_duration = DURATION - off_duration
50 *
51 *
52 *---------------------------------------------------------------------------
53 *
54 * ChangeLog:
55 * Dec. 12, 2003 Hiroshi Miura <miura@da-cha.org>
56 * - fix on/off register mistake
57 * - fix cpu_khz calc when it stops cpu modulation.
58 *
59 * Dec. 11, 2002 Hiroshi Miura <miura@da-cha.org>
60 * - rewrite for Cyrix MediaGX Cx5510/5520 and
61 * NatSemi Geode Cs5530(A).
62 *
63 * Jul. ??, 2002 Zwane Mwaikambo <zwane@commfireservices.com>
64 * - cs5530_mod patch for 2.4.19-rc1.
65 *
66 *---------------------------------------------------------------------------
67 *
68 * Todo
69 * Test on machines with 5510, 5530, 5530A
70 */
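To make the duty-cycle formulas above concrete, a minimal stand-alone user-space sketch (not driver code); stock_freq, freq and max_duration below are example values:

#include <stdio.h>

int main(void)
{
	unsigned int stock_freq = 200000;	/* full-speed clock, kHz */
	unsigned int freq = 100000;		/* requested speed, kHz  */
	unsigned int max_duration = 255;	/* 32 us counts          */

	unsigned int off = (freq * max_duration) / stock_freq;
	unsigned int on = max_duration - off;
	unsigned int f_eff = (stock_freq * off) / (off + on);

	/* off=127, on=128, F_eff ~ 99607 kHz for these inputs */
	printf("off=%u on=%u F_eff=%u kHz\n", off, on, f_eff);
	return 0;
}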
71
72/************************************************************************
73 * Suspend Modulation - Definitions *
74 ************************************************************************/
75
76#include <linux/kernel.h>
77#include <linux/module.h>
78#include <linux/init.h>
79#include <linux/smp.h>
80#include <linux/cpufreq.h>
81#include <linux/pci.h>
82#include <linux/errno.h>
83#include <linux/slab.h>
84
85#include <asm/processor-cyrix.h>
86
87/* PCI config registers, all at F0 */
88#define PCI_PMER1 0x80 /* power management enable register 1 */
89#define PCI_PMER2 0x81 /* power management enable register 2 */
90#define PCI_PMER3 0x82 /* power management enable register 3 */
91#define PCI_IRQTC 0x8c /* irq speedup timer counter register:typical 2 to 4ms */
92#define PCI_VIDTC 0x8d /* video speedup timer counter register: typical 50 to 100ms */
93#define PCI_MODOFF 0x94 /* suspend modulation OFF counter register, 1 = 32us */
94#define PCI_MODON 0x95 /* suspend modulation ON counter register */
95#define PCI_SUSCFG 0x96 /* suspend configuration register */
96
97/* PMER1 bits */
98#define GPM (1<<0) /* global power management */
99#define GIT (1<<1) /* globally enable PM device idle timers */
100#define GTR (1<<2) /* globally enable IO traps */
101#define IRQ_SPDUP (1<<3) /* disable clock throttle during interrupt handling */
102#define VID_SPDUP (1<<4) /* disable clock throttle during vga video handling */
103
104/* SUSCFG bits */
105#define SUSMOD (1<<0) /* enable/disable suspend modulation */
106/* the below is supported only with cs5530 (after rev.1.2)/cs5530A */
107#define SMISPDUP (1<<1) /* select how SMI re-enable suspend modulation: */
108 /* IRQTC timer or read SMI speedup disable reg.(F1BAR[08-09h]) */
109#define SUSCFG (1<<2) /* enable powering down a GXLV processor. "Special 3Volt Suspend" mode */
110/* the below is supported only with cs5530A */
111#define PWRSVE_ISA (1<<3) /* stop ISA clock */
112#define PWRSVE (1<<4) /* active idle */
113
114struct gxfreq_params {
115 u8 on_duration;
116 u8 off_duration;
117 u8 pci_suscfg;
118 u8 pci_pmer1;
119 u8 pci_pmer2;
120 struct pci_dev *cs55x0;
121};
122
123static struct gxfreq_params *gx_params;
124static int stock_freq;
125
126/* PCI bus clock - defaults to 30.000 if cpu_khz is not available */
127static int pci_busclk;
128module_param(pci_busclk, int, 0444);
129
130/* maximum duration for which the cpu may be suspended
131 * (32us * MAX_DURATION). If no parameter is given, this defaults
132 * to 255.
133 * Note that this leads to a maximum of 8 ms(!) where the CPU clock
134 * is suspended -- processing power is just 0.39% of what it used to be,
135 * though. 781.25 kHz(!) for a 200 MHz processor -- wow. */
136static int max_duration = 255;
137module_param(max_duration, int, 0444);
138
139/* For the default policy, we want at least some processing power
140 * - let's say 5%. (min = maxfreq / POLICY_MIN_DIV)
141 */
142#define POLICY_MIN_DIV 20
143
144
145#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
146 "gx-suspmod", msg)
147
148/**
149 * we can detect a core multiplier from dir0_lsb
150 * from GX1 datasheet p.56,
151 * MULT[3:0]:
152 * 0000 = SYSCLK multiplied by 4 (test only)
153 * 0001 = SYSCLK multiplied by 10
154 * 0010 = SYSCLK multiplied by 4
155 * 0011 = SYSCLK multiplied by 6
156 * 0100 = SYSCLK multiplied by 9
157 * 0101 = SYSCLK multiplied by 5
158 * 0110 = SYSCLK multiplied by 7
159 * 0111 = SYSCLK multiplied by 8
160 * of 33.3MHz
161 **/
162static int gx_freq_mult[16] = {
163 4, 10, 4, 6, 9, 5, 7, 8,
164 0, 0, 0, 0, 0, 0, 0, 0
165};
166
167
168/****************************************************************
169 * Low Level chipset interface *
170 ****************************************************************/
171static struct pci_device_id gx_chipset_tbl[] __initdata = {
172 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY), },
173 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5520), },
174 { PCI_VDEVICE(CYRIX, PCI_DEVICE_ID_CYRIX_5510), },
175 { 0, },
176};
177
178static void gx_write_byte(int reg, int value)
179{
180 pci_write_config_byte(gx_params->cs55x0, reg, value);
181}
182
183/**
184 * gx_detect_chipset:
185 *
186 **/
187static __init struct pci_dev *gx_detect_chipset(void)
188{
189 struct pci_dev *gx_pci = NULL;
190
191 /* check if CPU is a MediaGX or a Geode. */
192 if ((boot_cpu_data.x86_vendor != X86_VENDOR_NSC) &&
193 (boot_cpu_data.x86_vendor != X86_VENDOR_CYRIX)) {
194 dprintk("error: no MediaGX/Geode processor found!\n");
195 return NULL;
196 }
197
198 /* detect which companion chip is used */
199 for_each_pci_dev(gx_pci) {
200 if ((pci_match_id(gx_chipset_tbl, gx_pci)) != NULL)
201 return gx_pci;
202 }
203
204 dprintk("error: no supported chipset found!\n");
205 return NULL;
206}
207
208/**
209 * gx_get_cpuspeed:
210 *
211 * Finds out at which efficient frequency the Cyrix MediaGX/NatSemi
212 * Geode CPU runs.
213 */
214static unsigned int gx_get_cpuspeed(unsigned int cpu)
215{
216 if ((gx_params->pci_suscfg & SUSMOD) == 0)
217 return stock_freq;
218
219 return (stock_freq * gx_params->off_duration)
220 / (gx_params->on_duration + gx_params->off_duration);
221}
222
223/**
224 * gx_validate_speed:
225 * round a requested speed (kHz) to an achievable one and compute the on/off durations
226 *
227 **/
228
229static unsigned int gx_validate_speed(unsigned int khz, u8 *on_duration,
230 u8 *off_duration)
231{
232 unsigned int i;
233 u8 tmp_on, tmp_off;
234 int old_tmp_freq = stock_freq;
235 int tmp_freq;
236
237 *off_duration = 1;
238 *on_duration = 0;
239
240 for (i = max_duration; i > 0; i--) {
241 tmp_off = ((khz * i) / stock_freq) & 0xff;
242 tmp_on = i - tmp_off;
243 tmp_freq = (stock_freq * tmp_off) / i;
244 /* if this relation is closer to khz, use this. If it's equal,
245 * prefer it, too - lower latency */
246 if (abs(tmp_freq - khz) <= abs(old_tmp_freq - khz)) {
247 *on_duration = tmp_on;
248 *off_duration = tmp_off;
249 old_tmp_freq = tmp_freq;
250 }
251 }
252
253 return old_tmp_freq;
254}
255
256
257/**
258 * gx_set_cpuspeed:
259 * set cpu speed in khz.
260 **/
261
262static void gx_set_cpuspeed(unsigned int khz)
263{
264 u8 suscfg, pmer1;
265 unsigned int new_khz;
266 unsigned long flags;
267 struct cpufreq_freqs freqs;
268
269 freqs.cpu = 0;
270 freqs.old = gx_get_cpuspeed(0);
271
272 new_khz = gx_validate_speed(khz, &gx_params->on_duration,
273 &gx_params->off_duration);
274
275 freqs.new = new_khz;
276
277 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
278 local_irq_save(flags);
279
280
281
282 if (new_khz != stock_freq) {
283 /* if new khz == 100% of CPU speed, it is special case */
284 switch (gx_params->cs55x0->device) {
285 case PCI_DEVICE_ID_CYRIX_5530_LEGACY:
286 pmer1 = gx_params->pci_pmer1 | IRQ_SPDUP | VID_SPDUP;
287 /* FIXME: need to test other values -- Zwane,Miura */
288 /* typical 2 to 4ms */
289 gx_write_byte(PCI_IRQTC, 4);
290 /* typical 50 to 100ms */
291 gx_write_byte(PCI_VIDTC, 100);
292 gx_write_byte(PCI_PMER1, pmer1);
293
294 if (gx_params->cs55x0->revision < 0x10) {
295 /* CS5530(rev 1.2, 1.3) */
296 suscfg = gx_params->pci_suscfg|SUSMOD;
297 } else {
298 /* CS5530A,B.. */
299 suscfg = gx_params->pci_suscfg|SUSMOD|PWRSVE;
300 }
301 break;
302 case PCI_DEVICE_ID_CYRIX_5520:
303 case PCI_DEVICE_ID_CYRIX_5510:
304 suscfg = gx_params->pci_suscfg | SUSMOD;
305 break;
306 default:
307 local_irq_restore(flags);
308 dprintk("fatal: try to set unknown chipset.\n");
309 return;
310 }
311 } else {
312 suscfg = gx_params->pci_suscfg & ~(SUSMOD);
313 gx_params->off_duration = 0;
314 gx_params->on_duration = 0;
315 dprintk("suspend modulation disabled: cpu runs 100%% speed.\n");
316 }
317
318 gx_write_byte(PCI_MODOFF, gx_params->off_duration);
319 gx_write_byte(PCI_MODON, gx_params->on_duration);
320
321 gx_write_byte(PCI_SUSCFG, suscfg);
322 pci_read_config_byte(gx_params->cs55x0, PCI_SUSCFG, &suscfg);
323
324 local_irq_restore(flags);
325
326 gx_params->pci_suscfg = suscfg;
327
328 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
329
330 dprintk("suspend modulation w/ duration of ON:%d us, OFF:%d us\n",
331 gx_params->on_duration * 32, gx_params->off_duration * 32);
332 dprintk("suspend modulation w/ clock speed: %d kHz.\n", freqs.new);
333}
334
335/****************************************************************
336 * High level functions *
337 ****************************************************************/
338
339/*
340 * cpufreq_gx_verify: test if frequency range is valid
341 *
342 * This function checks if a given frequency range in kHz is valid
343 * for the hardware supported by the driver.
344 */
345
346static int cpufreq_gx_verify(struct cpufreq_policy *policy)
347{
348 unsigned int tmp_freq = 0;
349 u8 tmp1, tmp2;
350
351 if (!stock_freq || !policy)
352 return -EINVAL;
353
354 policy->cpu = 0;
355 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
356 stock_freq);
357
358 /* it needs to be assured that at least one supported frequency is
359 * within policy->min and policy->max. If it is not, policy->max
360 * needs to be increased until one frequency is supported.
361 * policy->min may not be decreased, though. This way we guarantee a
362 * specific processing capacity.
363 */
364 tmp_freq = gx_validate_speed(policy->min, &tmp1, &tmp2);
365 if (tmp_freq < policy->min)
366 tmp_freq += stock_freq / max_duration;
367 policy->min = tmp_freq;
368 if (policy->min > policy->max)
369 policy->max = tmp_freq;
370 tmp_freq = gx_validate_speed(policy->max, &tmp1, &tmp2);
371 if (tmp_freq > policy->max)
372 tmp_freq -= stock_freq / max_duration;
373 policy->max = tmp_freq;
374 if (policy->max < policy->min)
375 policy->max = policy->min;
376 cpufreq_verify_within_limits(policy, (stock_freq / max_duration),
377 stock_freq);
378
379 return 0;
380}
381
382/*
383 * cpufreq_gx_target:
384 *
385 */
386static int cpufreq_gx_target(struct cpufreq_policy *policy,
387 unsigned int target_freq,
388 unsigned int relation)
389{
390 u8 tmp1, tmp2;
391 unsigned int tmp_freq;
392
393 if (!stock_freq || !policy)
394 return -EINVAL;
395
396 policy->cpu = 0;
397
398 tmp_freq = gx_validate_speed(target_freq, &tmp1, &tmp2);
399 while (tmp_freq < policy->min) {
400 tmp_freq += stock_freq / max_duration;
401 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
402 }
403 while (tmp_freq > policy->max) {
404 tmp_freq -= stock_freq / max_duration;
405 tmp_freq = gx_validate_speed(tmp_freq, &tmp1, &tmp2);
406 }
407
408 gx_set_cpuspeed(tmp_freq);
409
410 return 0;
411}
412
413static int cpufreq_gx_cpu_init(struct cpufreq_policy *policy)
414{
415 unsigned int maxfreq, curfreq;
416
417 if (!policy || policy->cpu != 0)
418 return -ENODEV;
419
420 /* determine maximum frequency */
421 if (pci_busclk)
422 maxfreq = pci_busclk * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
423 else if (cpu_khz)
424 maxfreq = cpu_khz;
425 else
426 maxfreq = 30000 * gx_freq_mult[getCx86(CX86_DIR1) & 0x0f];
427
428 stock_freq = maxfreq;
429 curfreq = gx_get_cpuspeed(0);
430
431 dprintk("cpu max frequency is %d.\n", maxfreq);
432 dprintk("cpu current frequency is %dkHz.\n", curfreq);
433
434 /* setup basic struct for cpufreq API */
435 policy->cpu = 0;
436
437 if (max_duration < POLICY_MIN_DIV)
438 policy->min = maxfreq / max_duration;
439 else
440 policy->min = maxfreq / POLICY_MIN_DIV;
441 policy->max = maxfreq;
442 policy->cur = curfreq;
443 policy->cpuinfo.min_freq = maxfreq / max_duration;
444 policy->cpuinfo.max_freq = maxfreq;
445 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
446
447 return 0;
448}
449
450/*
451 * cpufreq_gx_init:
452 * MediaGX/Geode GX initialize cpufreq driver
453 */
454static struct cpufreq_driver gx_suspmod_driver = {
455 .get = gx_get_cpuspeed,
456 .verify = cpufreq_gx_verify,
457 .target = cpufreq_gx_target,
458 .init = cpufreq_gx_cpu_init,
459 .name = "gx-suspmod",
460 .owner = THIS_MODULE,
461};
462
463static int __init cpufreq_gx_init(void)
464{
465 int ret;
466 struct gxfreq_params *params;
467 struct pci_dev *gx_pci;
468
469 /* Test if we have the right hardware */
470 gx_pci = gx_detect_chipset();
471 if (gx_pci == NULL)
472 return -ENODEV;
473
474 /* check whether module parameters are sane */
475 if (max_duration > 0xff)
476 max_duration = 0xff;
477
478 dprintk("geode suspend modulation available.\n");
479
480 params = kzalloc(sizeof(struct gxfreq_params), GFP_KERNEL);
481 if (params == NULL)
482 return -ENOMEM;
483
484 params->cs55x0 = gx_pci;
485 gx_params = params;
486
487 /* keep cs55x0 configurations */
488 pci_read_config_byte(params->cs55x0, PCI_SUSCFG, &(params->pci_suscfg));
489 pci_read_config_byte(params->cs55x0, PCI_PMER1, &(params->pci_pmer1));
490 pci_read_config_byte(params->cs55x0, PCI_PMER2, &(params->pci_pmer2));
491 pci_read_config_byte(params->cs55x0, PCI_MODON, &(params->on_duration));
492 pci_read_config_byte(params->cs55x0, PCI_MODOFF,
493 &(params->off_duration));
494
495 ret = cpufreq_register_driver(&gx_suspmod_driver);
496 if (ret) {
497 kfree(params);
498 return ret; /* register error! */
499 }
500
501 return 0;
502}
503
504static void __exit cpufreq_gx_exit(void)
505{
506 cpufreq_unregister_driver(&gx_suspmod_driver);
507 pci_dev_put(gx_params->cs55x0);
508 kfree(gx_params);
509}
510
511MODULE_AUTHOR("Hiroshi Miura <miura@da-cha.org>");
512MODULE_DESCRIPTION("Cpufreq driver for Cyrix MediaGX and NatSemi Geode");
513MODULE_LICENSE("GPL");
514
515module_init(cpufreq_gx_init);
516module_exit(cpufreq_gx_exit);
517
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
deleted file mode 100644
index 03162dac6271..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ /dev/null
@@ -1,1029 +0,0 @@
1/*
2 * (C) 2001-2004 Dave Jones. <davej@redhat.com>
3 * (C) 2002 Padraig Brady. <padraig@antefacto.com>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon datasheets & sample CPUs kindly provided by VIA.
7 *
8 * VIA have currently 3 different versions of Longhaul.
9 * Version 1 (Longhaul) uses the BCR2 MSR at 0x1147.
10 * It is present only in Samuel 1 (C5A), Samuel 2 (C5B) stepping 0.
11 * Version 2 of longhaul is backward compatible with v1, but adds
12 * LONGHAUL MSR for purpose of both frequency and voltage scaling.
13 * Present in Samuel 2 (steppings 1-7 only) (C5B), and Ezra (C5C).
14 * Version 3 of longhaul got renamed to Powersaver and redesigned
15 * to use only the POWERSAVER MSR at 0x110a.
16 * It is present in Ezra-T (C5M), Nehemiah (C5X) and above.
17 * It is pretty much the same, feature-wise, as longhaul v2, though
18 * there is also provision for scaling the FSB; that doesn't work
19 * well in practice, so we don't even try to use it.
20 *
21 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
22 */
23
24#include <linux/kernel.h>
25#include <linux/module.h>
26#include <linux/moduleparam.h>
27#include <linux/init.h>
28#include <linux/cpufreq.h>
29#include <linux/pci.h>
30#include <linux/slab.h>
31#include <linux/string.h>
32#include <linux/delay.h>
33#include <linux/timex.h>
34#include <linux/io.h>
35#include <linux/acpi.h>
36
37#include <asm/msr.h>
38#include <acpi/processor.h>
39
40#include "longhaul.h"
41
42#define PFX "longhaul: "
43
44#define TYPE_LONGHAUL_V1 1
45#define TYPE_LONGHAUL_V2 2
46#define TYPE_POWERSAVER 3
47
48#define CPU_SAMUEL 1
49#define CPU_SAMUEL2 2
50#define CPU_EZRA 3
51#define CPU_EZRA_T 4
52#define CPU_NEHEMIAH 5
53#define CPU_NEHEMIAH_C 6
54
55/* Flags */
56#define USE_ACPI_C3 (1 << 1)
57#define USE_NORTHBRIDGE (1 << 2)
58
59static int cpu_model;
60static unsigned int numscales = 16;
61static unsigned int fsb;
62
63static const struct mV_pos *vrm_mV_table;
64static const unsigned char *mV_vrm_table;
65
66static unsigned int highest_speed, lowest_speed; /* kHz */
67static unsigned int minmult, maxmult;
68static int can_scale_voltage;
69static struct acpi_processor *pr;
70static struct acpi_processor_cx *cx;
71static u32 acpi_regs_addr;
72static u8 longhaul_flags;
73static unsigned int longhaul_index;
74
75/* Module parameters */
76static int scale_voltage;
77static int disable_acpi_c3;
78static int revid_errata;
79
80#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
81 "longhaul", msg)
82
83
84/* Clock ratios multiplied by 10 */
85static int mults[32];
86static int eblcr[32];
87static int longhaul_version;
88static struct cpufreq_frequency_table *longhaul_table;
89
90#ifdef CONFIG_CPU_FREQ_DEBUG
91static char speedbuffer[8];
92
93static char *print_speed(int speed)
94{
95 if (speed < 1000) {
96 snprintf(speedbuffer, sizeof(speedbuffer), "%dMHz", speed);
97 return speedbuffer;
98 }
99
100 if (speed%1000 == 0)
101 snprintf(speedbuffer, sizeof(speedbuffer),
102 "%dGHz", speed/1000);
103 else
104 snprintf(speedbuffer, sizeof(speedbuffer),
105 "%d.%dGHz", speed/1000, (speed%1000)/100);
106
107 return speedbuffer;
108}
109#endif
110
111
112static unsigned int calc_speed(int mult)
113{
114 int khz;
115 khz = (mult/10)*fsb;
116 if (mult%10)
117 khz += fsb/2;
118 khz *= 1000;
119 return khz;
120}
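Since clock ratios are stored multiplied by 10, a tiny stand-alone sketch of the same arithmetic as calc_speed() (illustrative numbers only):

#include <stdio.h>

static unsigned int calc_speed_example(int mult, unsigned int fsb)
{
	unsigned int khz = (mult / 10) * fsb;

	if (mult % 10)			/* half-step ratios like 11.5x */
		khz += fsb / 2;
	return khz * 1000;
}

int main(void)
{
	/* 11.5x (stored as 115) on a 133 MHz FSB -> prints 1529000 kHz */
	printf("%u kHz\n", calc_speed_example(115, 133));
	return 0;
}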
121
122
123static int longhaul_get_cpu_mult(void)
124{
125 unsigned long invalue = 0, lo, hi;
126
127 rdmsr(MSR_IA32_EBL_CR_POWERON, lo, hi);
128 invalue = (lo & (1<<22|1<<23|1<<24|1<<25))>>22;
129 if (longhaul_version == TYPE_LONGHAUL_V2 ||
130 longhaul_version == TYPE_POWERSAVER) {
131 if (lo & (1<<27))
132 invalue += 16;
133 }
134 return eblcr[invalue];
135}
136
137/* For processor with BCR2 MSR */
138
139static void do_longhaul1(unsigned int mults_index)
140{
141 union msr_bcr2 bcr2;
142
143 rdmsrl(MSR_VIA_BCR2, bcr2.val);
144 /* Enable software clock multiplier */
145 bcr2.bits.ESOFTBF = 1;
146 bcr2.bits.CLOCKMUL = mults_index & 0xff;
147
148 /* Sync to timer tick */
149 safe_halt();
150 /* Change frequency on next halt or sleep */
151 wrmsrl(MSR_VIA_BCR2, bcr2.val);
152 /* Invoke transition */
153 ACPI_FLUSH_CPU_CACHE();
154 halt();
155
156 /* Disable software clock multiplier */
157 local_irq_disable();
158 rdmsrl(MSR_VIA_BCR2, bcr2.val);
159 bcr2.bits.ESOFTBF = 0;
160 wrmsrl(MSR_VIA_BCR2, bcr2.val);
161}
162
163/* For processor with Longhaul MSR */
164
165static void do_powersaver(int cx_address, unsigned int mults_index,
166 unsigned int dir)
167{
168 union msr_longhaul longhaul;
169 u32 t;
170
171 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
172 /* Setup new frequency */
173 if (!revid_errata)
174 longhaul.bits.RevisionKey = longhaul.bits.RevisionID;
175 else
176 longhaul.bits.RevisionKey = 0;
177 longhaul.bits.SoftBusRatio = mults_index & 0xf;
178 longhaul.bits.SoftBusRatio4 = (mults_index & 0x10) >> 4;
179 /* Setup new voltage */
180 if (can_scale_voltage)
181 longhaul.bits.SoftVID = (mults_index >> 8) & 0x1f;
182 /* Sync to timer tick */
183 safe_halt();
184 /* Raise voltage if necessary */
185 if (can_scale_voltage && dir) {
186 longhaul.bits.EnableSoftVID = 1;
187 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
188 /* Change voltage */
189 if (!cx_address) {
190 ACPI_FLUSH_CPU_CACHE();
191 halt();
192 } else {
193 ACPI_FLUSH_CPU_CACHE();
194 /* Invoke C3 */
195 inb(cx_address);
196 /* Dummy op - must do something useless after P_LVL3
197 * read */
198 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
199 }
200 longhaul.bits.EnableSoftVID = 0;
201 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
202 }
203
204 /* Change frequency on next halt or sleep */
205 longhaul.bits.EnableSoftBusRatio = 1;
206 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
207 if (!cx_address) {
208 ACPI_FLUSH_CPU_CACHE();
209 halt();
210 } else {
211 ACPI_FLUSH_CPU_CACHE();
212 /* Invoke C3 */
213 inb(cx_address);
214 /* Dummy op - must do something useless after P_LVL3 read */
215 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
216 }
217 /* Disable bus ratio bit */
218 longhaul.bits.EnableSoftBusRatio = 0;
219 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
220
221 /* Reduce voltage if necessary */
222 if (can_scale_voltage && !dir) {
223 longhaul.bits.EnableSoftVID = 1;
224 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
225 /* Change voltage */
226 if (!cx_address) {
227 ACPI_FLUSH_CPU_CACHE();
228 halt();
229 } else {
230 ACPI_FLUSH_CPU_CACHE();
231 /* Invoke C3 */
232 inb(cx_address);
233 /* Dummy op - must do something useless after P_LVL3
234 * read */
235 t = inl(acpi_gbl_FADT.xpm_timer_block.address);
236 }
237 longhaul.bits.EnableSoftVID = 0;
238 wrmsrl(MSR_VIA_LONGHAUL, longhaul.val);
239 }
240}
241
242/**
243 * longhaul_setstate()
244 * @table_index : index of the new entry in longhaul_table.
245 *
246 * Sets a new clock ratio.
247 */
248
249static void longhaul_setstate(unsigned int table_index)
250{
251 unsigned int mults_index;
252 int speed, mult;
253 struct cpufreq_freqs freqs;
254 unsigned long flags;
255 unsigned int pic1_mask, pic2_mask;
256 u16 bm_status = 0;
257 u32 bm_timeout = 1000;
258 unsigned int dir = 0;
259
260 mults_index = longhaul_table[table_index].index;
261 /* Safety precautions */
262 mult = mults[mults_index & 0x1f];
263 if (mult == -1)
264 return;
265 speed = calc_speed(mult);
266 if ((speed > highest_speed) || (speed < lowest_speed))
267 return;
268 /* Voltage transition before frequency transition? */
269 if (can_scale_voltage && longhaul_index < table_index)
270 dir = 1;
271
272 freqs.old = calc_speed(longhaul_get_cpu_mult());
273 freqs.new = speed;
274 freqs.cpu = 0; /* longhaul.c is UP only driver */
275
276 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
277
278 dprintk("Setting to FSB:%dMHz Mult:%d.%dx (%s)\n",
279 fsb, mult/10, mult%10, print_speed(speed/1000));
280retry_loop:
281 preempt_disable();
282 local_irq_save(flags);
283
284 pic2_mask = inb(0xA1);
285 pic1_mask = inb(0x21); /* works on C3. save mask. */
286 outb(0xFF, 0xA1); /* Overkill */
287 outb(0xFE, 0x21); /* TMR0 only */
288
289 /* Wait while PCI bus is busy. */
290 if (acpi_regs_addr && (longhaul_flags & USE_NORTHBRIDGE
291 || ((pr != NULL) && pr->flags.bm_control))) {
292 bm_status = inw(acpi_regs_addr);
293 bm_status &= 1 << 4;
294 while (bm_status && bm_timeout) {
295 outw(1 << 4, acpi_regs_addr);
296 bm_timeout--;
297 bm_status = inw(acpi_regs_addr);
298 bm_status &= 1 << 4;
299 }
300 }
301
302 if (longhaul_flags & USE_NORTHBRIDGE) {
303 /* Disable AGP and PCI arbiters */
304 outb(3, 0x22);
305 } else if ((pr != NULL) && pr->flags.bm_control) {
306 /* Disable bus master arbitration */
307 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 1);
308 }
309 switch (longhaul_version) {
310
311 /*
312 * Longhaul v1. (Samuel[C5A] and Samuel2 stepping 0[C5B])
313 * Software controlled multipliers only.
314 */
315 case TYPE_LONGHAUL_V1:
316 do_longhaul1(mults_index);
317 break;
318
319 /*
320 * Longhaul v2 appears in Samuel2 Steppings 1->7 [C5B] and Ezra [C5C]
321 *
322 * Longhaul v3 (aka Powersaver). (Ezra-T [C5M] & Nehemiah [C5N])
323 * Nehemiah can do FSB scaling too, but this has never been proven
324 * to work in practice.
325 */
326 case TYPE_LONGHAUL_V2:
327 case TYPE_POWERSAVER:
328 if (longhaul_flags & USE_ACPI_C3) {
329 /* Don't allow wakeup */
330 acpi_write_bit_register(ACPI_BITREG_BUS_MASTER_RLD, 0);
331 do_powersaver(cx->address, mults_index, dir);
332 } else {
333 do_powersaver(0, mults_index, dir);
334 }
335 break;
336 }
337
338 if (longhaul_flags & USE_NORTHBRIDGE) {
339 /* Enable arbiters */
340 outb(0, 0x22);
341 } else if ((pr != NULL) && pr->flags.bm_control) {
342 /* Enable bus master arbitration */
343 acpi_write_bit_register(ACPI_BITREG_ARB_DISABLE, 0);
344 }
345 outb(pic2_mask, 0xA1); /* restore mask */
346 outb(pic1_mask, 0x21);
347
348 local_irq_restore(flags);
349 preempt_enable();
350
351 freqs.new = calc_speed(longhaul_get_cpu_mult());
352 /* Check if requested frequency is set. */
353 if (unlikely(freqs.new != speed)) {
354 printk(KERN_INFO PFX "Failed to set requested frequency!\n");
355 /* Revision ID = 1 but processor is expecting revision key
356 * equal to 0. Jumpers at the bottom of processor will change
357 * multiplier and FSB, but will not change bits in Longhaul
358 * MSR nor enable voltage scaling. */
359 if (!revid_errata) {
360 printk(KERN_INFO PFX "Enabling \"Ignore Revision ID\" "
361 "option.\n");
362 revid_errata = 1;
363 msleep(200);
364 goto retry_loop;
365 }
366 /* Why ACPI C3 sometimes doesn't work is a mystery to me.
367 * But it does happen. Processor is entering ACPI C3 state,
368 * but it doesn't change frequency. I tried poking various
369 * bits in northbridge registers, but without success. */
370 if (longhaul_flags & USE_ACPI_C3) {
371 printk(KERN_INFO PFX "Disabling ACPI C3 support.\n");
372 longhaul_flags &= ~USE_ACPI_C3;
373 if (revid_errata) {
374 printk(KERN_INFO PFX "Disabling \"Ignore "
375 "Revision ID\" option.\n");
376 revid_errata = 0;
377 }
378 msleep(200);
379 goto retry_loop;
380 }
381 /* This shouldn't happen. Longhaul ver. 2 was reported not
382 * working on processors without voltage scaling, but with
383 * RevID = 1. RevID errata will make things right. Just
384 * to be 100% sure. */
385 if (longhaul_version == TYPE_LONGHAUL_V2) {
386 printk(KERN_INFO PFX "Switching to Longhaul ver. 1\n");
387 longhaul_version = TYPE_LONGHAUL_V1;
388 msleep(200);
389 goto retry_loop;
390 }
391 }
392 /* Report true CPU frequency */
393 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
394
395 if (!bm_timeout)
396 printk(KERN_INFO PFX "Warning: Timeout while waiting for "
397 "idle PCI bus.\n");
398}
399
400/*
401 * Centaur decided to make life a little more tricky.
402 * Only longhaul v1 is allowed to read EBLCR BSEL[0:1].
403 * Samuel2 and above have to try and guess what the FSB is.
404 * We do this by assuming we booted at maximum multiplier, and interpolate
405 * between that value multiplied by possible FSBs and cpu_mhz which
406 * was calculated at boot time. Really ugly, but no other way to do this.
407 */
408
409#define ROUNDING 0xf
410
411static int guess_fsb(int mult)
412{
413 int speed = cpu_khz / 1000;
414 int i;
415 int speeds[] = { 666, 1000, 1333, 2000 };
416 int f_max, f_min;
417
418 for (i = 0; i < 4; i++) {
419 f_max = ((speeds[i] * mult) + 50) / 100;
420 f_max += (ROUNDING / 2);
421 f_min = f_max - ROUNDING;
422 if ((speed <= f_max) && (speed >= f_min))
423 return speeds[i] / 10;
424 }
425 return 0;
426}
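A worked run of the heuristic above as a stand-alone user-space sketch (the 1529 MHz speed and 11.5x multiplier are illustrative assumptions): only the 133 MHz candidate falls inside the rounding window.

#include <stdio.h>

#define ROUNDING 0xf

static int guess_fsb_example(int speed_mhz, int mult)
{
	static const int speeds[] = { 666, 1000, 1333, 2000 };	/* FSB * 10 */
	int i, f_max, f_min;

	for (i = 0; i < 4; i++) {
		f_max = ((speeds[i] * mult) + 50) / 100 + ROUNDING / 2;
		f_min = f_max - ROUNDING;
		if (speed_mhz <= f_max && speed_mhz >= f_min)
			return speeds[i] / 10;
	}
	return 0;
}

int main(void)
{
	/* prints: guessed FSB: 133 MHz */
	printf("guessed FSB: %d MHz\n", guess_fsb_example(1529, 115));
	return 0;
}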
427
428
429static int __cpuinit longhaul_get_ranges(void)
430{
431 unsigned int i, j, k = 0;
432 unsigned int ratio;
433 int mult;
434
435 /* Get current frequency */
436 mult = longhaul_get_cpu_mult();
437 if (mult == -1) {
438 printk(KERN_INFO PFX "Invalid (reserved) multiplier!\n");
439 return -EINVAL;
440 }
441 fsb = guess_fsb(mult);
442 if (fsb == 0) {
443 printk(KERN_INFO PFX "Invalid (reserved) FSB!\n");
444 return -EINVAL;
445 }
446 /* Get max multiplier - as we always did.
447 * Longhaul MSR is useful only when voltage scaling is enabled.
448 * C3 is booting at max anyway. */
449 maxmult = mult;
450 /* Get min multiplier */
451 switch (cpu_model) {
452 case CPU_NEHEMIAH:
453 minmult = 50;
454 break;
455 case CPU_NEHEMIAH_C:
456 minmult = 40;
457 break;
458 default:
459 minmult = 30;
460 break;
461 }
462
463 dprintk("MinMult:%d.%dx MaxMult:%d.%dx\n",
464 minmult/10, minmult%10, maxmult/10, maxmult%10);
465
466 highest_speed = calc_speed(maxmult);
467 lowest_speed = calc_speed(minmult);
468 dprintk("FSB:%dMHz Lowest speed: %s Highest speed:%s\n", fsb,
469 print_speed(lowest_speed/1000),
470 print_speed(highest_speed/1000));
471
472 if (lowest_speed == highest_speed) {
473 printk(KERN_INFO PFX "highestspeed == lowest, aborting.\n");
474 return -EINVAL;
475 }
476 if (lowest_speed > highest_speed) {
477 printk(KERN_INFO PFX "nonsense! lowest (%d > %d) !\n",
478 lowest_speed, highest_speed);
479 return -EINVAL;
480 }
481
482 longhaul_table = kmalloc((numscales + 1) * sizeof(*longhaul_table),
483 GFP_KERNEL);
484 if (!longhaul_table)
485 return -ENOMEM;
486
487 for (j = 0; j < numscales; j++) {
488 ratio = mults[j];
489 if (ratio == -1)
490 continue;
491 if (ratio > maxmult || ratio < minmult)
492 continue;
493 longhaul_table[k].frequency = calc_speed(ratio);
494 longhaul_table[k].index = j;
495 k++;
496 }
497 if (k <= 1) {
498 kfree(longhaul_table);
499 return -ENODEV;
500 }
501 /* Sort */
502 for (j = 0; j < k - 1; j++) {
503 unsigned int min_f, min_i;
504 min_f = longhaul_table[j].frequency;
505 min_i = j;
506 for (i = j + 1; i < k; i++) {
507 if (longhaul_table[i].frequency < min_f) {
508 min_f = longhaul_table[i].frequency;
509 min_i = i;
510 }
511 }
512 if (min_i != j) {
513 swap(longhaul_table[j].frequency,
514 longhaul_table[min_i].frequency);
515 swap(longhaul_table[j].index,
516 longhaul_table[min_i].index);
517 }
518 }
519
520 longhaul_table[k].frequency = CPUFREQ_TABLE_END;
521
522 /* Find index we are running on */
523 for (j = 0; j < k; j++) {
524 if (mults[longhaul_table[j].index & 0x1f] == mult) {
525 longhaul_index = j;
526 break;
527 }
528 }
529 return 0;
530}
531
532
533static void __cpuinit longhaul_setup_voltagescaling(void)
534{
535 union msr_longhaul longhaul;
536 struct mV_pos minvid, maxvid, vid;
537 unsigned int j, speed, pos, kHz_step, numvscales;
538 int min_vid_speed;
539
540 rdmsrl(MSR_VIA_LONGHAUL, longhaul.val);
541 if (!(longhaul.bits.RevisionID & 1)) {
542 printk(KERN_INFO PFX "Voltage scaling not supported by CPU.\n");
543 return;
544 }
545
546 if (!longhaul.bits.VRMRev) {
547 printk(KERN_INFO PFX "VRM 8.5\n");
548 vrm_mV_table = &vrm85_mV[0];
549 mV_vrm_table = &mV_vrm85[0];
550 } else {
551 printk(KERN_INFO PFX "Mobile VRM\n");
552 if (cpu_model < CPU_NEHEMIAH)
553 return;
554 vrm_mV_table = &mobilevrm_mV[0];
555 mV_vrm_table = &mV_mobilevrm[0];
556 }
557
558 minvid = vrm_mV_table[longhaul.bits.MinimumVID];
559 maxvid = vrm_mV_table[longhaul.bits.MaximumVID];
560
561 if (minvid.mV == 0 || maxvid.mV == 0 || minvid.mV > maxvid.mV) {
562 printk(KERN_INFO PFX "Bogus values Min:%d.%03d Max:%d.%03d. "
563 "Voltage scaling disabled.\n",
564 minvid.mV/1000, minvid.mV%1000,
565 maxvid.mV/1000, maxvid.mV%1000);
566 return;
567 }
568
569 if (minvid.mV == maxvid.mV) {
570 printk(KERN_INFO PFX "Claims to support voltage scaling but "
571 "min & max are both %d.%03d. "
572 "Voltage scaling disabled\n",
573 maxvid.mV/1000, maxvid.mV%1000);
574 return;
575 }
576
577	/* How many voltage steps */
578 numvscales = maxvid.pos - minvid.pos + 1;
579 printk(KERN_INFO PFX
580 "Max VID=%d.%03d "
581 "Min VID=%d.%03d, "
582 "%d possible voltage scales\n",
583 maxvid.mV/1000, maxvid.mV%1000,
584 minvid.mV/1000, minvid.mV%1000,
585 numvscales);
586
587 /* Calculate max frequency at min voltage */
588 j = longhaul.bits.MinMHzBR;
589 if (longhaul.bits.MinMHzBR4)
590 j += 16;
591 min_vid_speed = eblcr[j];
592 if (min_vid_speed == -1)
593 return;
594 switch (longhaul.bits.MinMHzFSB) {
595 case 0:
596 min_vid_speed *= 13333;
597 break;
598 case 1:
599 min_vid_speed *= 10000;
600 break;
601 case 3:
602 min_vid_speed *= 6666;
603 break;
604 default:
605 return;
606 break;
607 }
608 if (min_vid_speed >= highest_speed)
609 return;
610 /* Calculate kHz for one voltage step */
611 kHz_step = (highest_speed - min_vid_speed) / numvscales;
612
613 j = 0;
614 while (longhaul_table[j].frequency != CPUFREQ_TABLE_END) {
615 speed = longhaul_table[j].frequency;
616 if (speed > min_vid_speed)
617 pos = (speed - min_vid_speed) / kHz_step + minvid.pos;
618 else
619 pos = minvid.pos;
620 longhaul_table[j].index |= mV_vrm_table[pos] << 8;
621 vid = vrm_mV_table[mV_vrm_table[pos]];
622 printk(KERN_INFO PFX "f: %d kHz, index: %d, vid: %d mV\n",
623 speed, j, vid.mV);
624 j++;
625 }
626
627 can_scale_voltage = 1;
628 printk(KERN_INFO PFX "Voltage scaling enabled.\n");
629}
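/*
 * Illustration of the voltage interpolation above, using assumed numbers:
 * with highest_speed = 1000000 kHz, min_vid_speed = 600000 kHz and
 * numvscales = 16, kHz_step is 25000 kHz. A table entry at 800000 kHz then
 * gets pos = (800000 - 600000) / 25000 + minvid.pos, i.e. eight VID steps
 * above the minimum voltage, while entries at or below min_vid_speed stay
 * at minvid.pos.
 */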
630
631
632static int longhaul_verify(struct cpufreq_policy *policy)
633{
634 return cpufreq_frequency_table_verify(policy, longhaul_table);
635}
636
637
638static int longhaul_target(struct cpufreq_policy *policy,
639 unsigned int target_freq, unsigned int relation)
640{
641 unsigned int table_index = 0;
642 unsigned int i;
643 unsigned int dir = 0;
644 u8 vid, current_vid;
645
646 if (cpufreq_frequency_table_target(policy, longhaul_table, target_freq,
647 relation, &table_index))
648 return -EINVAL;
649
650 /* Don't set same frequency again */
651 if (longhaul_index == table_index)
652 return 0;
653
654 if (!can_scale_voltage)
655 longhaul_setstate(table_index);
656 else {
657		/* On the test system, voltage transitions exceeding a single
658		 * step up or down were turning the motherboard off. Both
659		 * "ondemand" and "userspace" are unsafe. C7 does this
660		 * in hardware; C3 is old and we need to do it
661		 * in software. */
662 i = longhaul_index;
663 current_vid = (longhaul_table[longhaul_index].index >> 8);
664 current_vid &= 0x1f;
665 if (table_index > longhaul_index)
666 dir = 1;
667 while (i != table_index) {
668 vid = (longhaul_table[i].index >> 8) & 0x1f;
669 if (vid != current_vid) {
670 longhaul_setstate(i);
671 current_vid = vid;
672 msleep(200);
673 }
674 if (dir)
675 i++;
676 else
677 i--;
678 }
679 longhaul_setstate(table_index);
680 }
681 longhaul_index = table_index;
682 return 0;
683}
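/*
 * A hypothetical example of the stepping above: moving from table index 2
 * to index 6, where entries 3 and 4 share the current VID and entries 5
 * and 6 use the next VID, the loop skips 3 and 4, switches state (and
 * sleeps 200 ms) at index 5 where the VID first changes, and the final
 * longhaul_setstate() call then lands on index 6. In this example the
 * voltage therefore changes by a single VID step at a time.
 */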
684
685
686static unsigned int longhaul_get(unsigned int cpu)
687{
688 if (cpu)
689 return 0;
690 return calc_speed(longhaul_get_cpu_mult());
691}
692
693static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
694 u32 nesting_level,
695 void *context, void **return_value)
696{
697 struct acpi_device *d;
698
699 if (acpi_bus_get_device(obj_handle, &d))
700 return 0;
701
702 *return_value = acpi_driver_data(d);
703 return 1;
704}
705
706/* VIA doesn't support the PM2 register, but has something similar */
707static int enable_arbiter_disable(void)
708{
709 struct pci_dev *dev;
710 int status = 1;
711 int reg;
712 u8 pci_cmd;
713
714 /* Find PLE133 host bridge */
715 reg = 0x78;
716 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8601_0,
717 NULL);
718 /* Find PM133/VT8605 host bridge */
719 if (dev == NULL)
720 dev = pci_get_device(PCI_VENDOR_ID_VIA,
721 PCI_DEVICE_ID_VIA_8605_0, NULL);
722 /* Find CLE266 host bridge */
723 if (dev == NULL) {
724 reg = 0x76;
725 dev = pci_get_device(PCI_VENDOR_ID_VIA,
726 PCI_DEVICE_ID_VIA_862X_0, NULL);
727 /* Find CN400 V-Link host bridge */
728 if (dev == NULL)
729 dev = pci_get_device(PCI_VENDOR_ID_VIA, 0x7259, NULL);
730 }
731 if (dev != NULL) {
732 /* Enable access to port 0x22 */
733 pci_read_config_byte(dev, reg, &pci_cmd);
734 if (!(pci_cmd & 1<<7)) {
735 pci_cmd |= 1<<7;
736 pci_write_config_byte(dev, reg, pci_cmd);
737 pci_read_config_byte(dev, reg, &pci_cmd);
738 if (!(pci_cmd & 1<<7)) {
739 printk(KERN_ERR PFX
740 "Can't enable access to port 0x22.\n");
741 status = 0;
742 }
743 }
744 pci_dev_put(dev);
745 return status;
746 }
747 return 0;
748}
749
750static int longhaul_setup_southbridge(void)
751{
752 struct pci_dev *dev;
753 u8 pci_cmd;
754
755 /* Find VT8235 southbridge */
756 dev = pci_get_device(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235, NULL);
757 if (dev == NULL)
758 /* Find VT8237 southbridge */
759 dev = pci_get_device(PCI_VENDOR_ID_VIA,
760 PCI_DEVICE_ID_VIA_8237, NULL);
761 if (dev != NULL) {
762 /* Set transition time to max */
763 pci_read_config_byte(dev, 0xec, &pci_cmd);
764 pci_cmd &= ~(1 << 2);
765 pci_write_config_byte(dev, 0xec, pci_cmd);
766 pci_read_config_byte(dev, 0xe4, &pci_cmd);
767 pci_cmd &= ~(1 << 7);
768 pci_write_config_byte(dev, 0xe4, pci_cmd);
769 pci_read_config_byte(dev, 0xe5, &pci_cmd);
770 pci_cmd |= 1 << 7;
771 pci_write_config_byte(dev, 0xe5, pci_cmd);
772 /* Get address of ACPI registers block*/
773 pci_read_config_byte(dev, 0x81, &pci_cmd);
774 if (pci_cmd & 1 << 7) {
775 pci_read_config_dword(dev, 0x88, &acpi_regs_addr);
776 acpi_regs_addr &= 0xff00;
777 printk(KERN_INFO PFX "ACPI I/O at 0x%x\n",
778 acpi_regs_addr);
779 }
780
781 pci_dev_put(dev);
782 return 1;
783 }
784 return 0;
785}
786
787static int __cpuinit longhaul_cpu_init(struct cpufreq_policy *policy)
788{
789 struct cpuinfo_x86 *c = &cpu_data(0);
790 char *cpuname = NULL;
791 int ret;
792 u32 lo, hi;
793
794 /* Check what we have on this motherboard */
795 switch (c->x86_model) {
796 case 6:
797 cpu_model = CPU_SAMUEL;
798 cpuname = "C3 'Samuel' [C5A]";
799 longhaul_version = TYPE_LONGHAUL_V1;
800 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
801 memcpy(eblcr, samuel1_eblcr, sizeof(samuel1_eblcr));
802 break;
803
804 case 7:
805 switch (c->x86_mask) {
806 case 0:
807 longhaul_version = TYPE_LONGHAUL_V1;
808 cpu_model = CPU_SAMUEL2;
809 cpuname = "C3 'Samuel 2' [C5B]";
810			/* Note: this is not a typo; early Samuel2s had
811			 * Samuel1 ratios. */
812 memcpy(mults, samuel1_mults, sizeof(samuel1_mults));
813 memcpy(eblcr, samuel2_eblcr, sizeof(samuel2_eblcr));
814 break;
815 case 1 ... 15:
816 longhaul_version = TYPE_LONGHAUL_V2;
817 if (c->x86_mask < 8) {
818 cpu_model = CPU_SAMUEL2;
819 cpuname = "C3 'Samuel 2' [C5B]";
820 } else {
821 cpu_model = CPU_EZRA;
822 cpuname = "C3 'Ezra' [C5C]";
823 }
824 memcpy(mults, ezra_mults, sizeof(ezra_mults));
825 memcpy(eblcr, ezra_eblcr, sizeof(ezra_eblcr));
826 break;
827 }
828 break;
829
830 case 8:
831 cpu_model = CPU_EZRA_T;
832 cpuname = "C3 'Ezra-T' [C5M]";
833 longhaul_version = TYPE_POWERSAVER;
834 numscales = 32;
835 memcpy(mults, ezrat_mults, sizeof(ezrat_mults));
836 memcpy(eblcr, ezrat_eblcr, sizeof(ezrat_eblcr));
837 break;
838
839 case 9:
840 longhaul_version = TYPE_POWERSAVER;
841 numscales = 32;
842 memcpy(mults, nehemiah_mults, sizeof(nehemiah_mults));
843 memcpy(eblcr, nehemiah_eblcr, sizeof(nehemiah_eblcr));
844 switch (c->x86_mask) {
845 case 0 ... 1:
846 cpu_model = CPU_NEHEMIAH;
847 cpuname = "C3 'Nehemiah A' [C5XLOE]";
848 break;
849 case 2 ... 4:
850 cpu_model = CPU_NEHEMIAH;
851 cpuname = "C3 'Nehemiah B' [C5XLOH]";
852 break;
853 case 5 ... 15:
854 cpu_model = CPU_NEHEMIAH_C;
855 cpuname = "C3 'Nehemiah C' [C5P]";
856 break;
857 }
858 break;
859
860 default:
861 cpuname = "Unknown";
862 break;
863 }
864 /* Check Longhaul ver. 2 */
865 if (longhaul_version == TYPE_LONGHAUL_V2) {
866 rdmsr(MSR_VIA_LONGHAUL, lo, hi);
867 if (lo == 0 && hi == 0)
868 /* Looks like MSR isn't present */
869 longhaul_version = TYPE_LONGHAUL_V1;
870 }
871
872 printk(KERN_INFO PFX "VIA %s CPU detected. ", cpuname);
873 switch (longhaul_version) {
874 case TYPE_LONGHAUL_V1:
875 case TYPE_LONGHAUL_V2:
876 printk(KERN_CONT "Longhaul v%d supported.\n", longhaul_version);
877 break;
878 case TYPE_POWERSAVER:
879 printk(KERN_CONT "Powersaver supported.\n");
880 break;
881	}
882
883 /* Doesn't hurt */
884 longhaul_setup_southbridge();
885
886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr);
890
891 /* Check ACPI support for C3 state */
892 if (pr != NULL && longhaul_version == TYPE_POWERSAVER) {
893 cx = &pr->power.states[ACPI_STATE_C3];
894 if (cx->address > 0 && cx->latency <= 1000)
895 longhaul_flags |= USE_ACPI_C3;
896 }
897 /* Disable if it isn't working */
898 if (disable_acpi_c3)
899 longhaul_flags &= ~USE_ACPI_C3;
900 /* Check if northbridge is friendly */
901 if (enable_arbiter_disable())
902 longhaul_flags |= USE_NORTHBRIDGE;
903
904 /* Check ACPI support for bus master arbiter disable */
905 if (!(longhaul_flags & USE_ACPI_C3
906 || longhaul_flags & USE_NORTHBRIDGE)
907 && ((pr == NULL) || !(pr->flags.bm_control))) {
908 printk(KERN_ERR PFX
909 "No ACPI support. Unsupported northbridge.\n");
910 return -ENODEV;
911 }
912
913 if (longhaul_flags & USE_NORTHBRIDGE)
914 printk(KERN_INFO PFX "Using northbridge support.\n");
915 if (longhaul_flags & USE_ACPI_C3)
916 printk(KERN_INFO PFX "Using ACPI support.\n");
917
918 ret = longhaul_get_ranges();
919 if (ret != 0)
920 return ret;
921
922 if ((longhaul_version != TYPE_LONGHAUL_V1) && (scale_voltage != 0))
923 longhaul_setup_voltagescaling();
924
925 policy->cpuinfo.transition_latency = 200000; /* nsec */
926 policy->cur = calc_speed(longhaul_get_cpu_mult());
927
928 ret = cpufreq_frequency_table_cpuinfo(policy, longhaul_table);
929 if (ret)
930 return ret;
931
932 cpufreq_frequency_table_get_attr(longhaul_table, policy->cpu);
933
934 return 0;
935}
936
937static int __devexit longhaul_cpu_exit(struct cpufreq_policy *policy)
938{
939 cpufreq_frequency_table_put_attr(policy->cpu);
940 return 0;
941}
942
943static struct freq_attr *longhaul_attr[] = {
944 &cpufreq_freq_attr_scaling_available_freqs,
945 NULL,
946};
947
948static struct cpufreq_driver longhaul_driver = {
949 .verify = longhaul_verify,
950 .target = longhaul_target,
951 .get = longhaul_get,
952 .init = longhaul_cpu_init,
953 .exit = __devexit_p(longhaul_cpu_exit),
954 .name = "longhaul",
955 .owner = THIS_MODULE,
956 .attr = longhaul_attr,
957};
958
959
960static int __init longhaul_init(void)
961{
962 struct cpuinfo_x86 *c = &cpu_data(0);
963
964 if (c->x86_vendor != X86_VENDOR_CENTAUR || c->x86 != 6)
965 return -ENODEV;
966
967#ifdef CONFIG_SMP
968 if (num_online_cpus() > 1) {
969 printk(KERN_ERR PFX "More than 1 CPU detected, "
970 "longhaul disabled.\n");
971 return -ENODEV;
972 }
973#endif
974#ifdef CONFIG_X86_IO_APIC
975 if (cpu_has_apic) {
976 printk(KERN_ERR PFX "APIC detected. Longhaul is currently "
977 "broken in this configuration.\n");
978 return -ENODEV;
979 }
980#endif
981 switch (c->x86_model) {
982 case 6 ... 9:
983 return cpufreq_register_driver(&longhaul_driver);
984 case 10:
985 printk(KERN_ERR PFX "Use acpi-cpufreq driver for VIA C7\n");
986 default:
987 ;
988 }
989
990 return -ENODEV;
991}
992
993
994static void __exit longhaul_exit(void)
995{
996 int i;
997
998 for (i = 0; i < numscales; i++) {
999 if (mults[i] == maxmult) {
1000 longhaul_setstate(i);
1001 break;
1002 }
1003 }
1004
1005 cpufreq_unregister_driver(&longhaul_driver);
1006 kfree(longhaul_table);
1007}
1008
1009/* Even if the BIOS exports an ACPI C3 state, and it is used
1010 * successfully when the CPU is idle, this state doesn't
1011 * trigger frequency transitions in some cases. */
1012module_param(disable_acpi_c3, int, 0644);
1013MODULE_PARM_DESC(disable_acpi_c3, "Don't use ACPI C3 support");
1014/* Change CPU voltage with frequency. Very useful for saving
1015 * power, but most VIA C3 processors don't support it. */
1016module_param(scale_voltage, int, 0644);
1017MODULE_PARM_DESC(scale_voltage, "Scale voltage of processor");
1018/* Force the revision key to 0 for processors which don't
1019 * support voltage scaling, but present themselves as if
1020 * they did. */
1021module_param(revid_errata, int, 0644);
1022MODULE_PARM_DESC(revid_errata, "Ignore CPU Revision ID");
1023
1024MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
1025MODULE_DESCRIPTION("Longhaul driver for VIA Cyrix processors.");
1026MODULE_LICENSE("GPL");
1027
1028late_initcall(longhaul_init);
1029module_exit(longhaul_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.h b/arch/x86/kernel/cpu/cpufreq/longhaul.h
deleted file mode 100644
index cbf48fbca881..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.h
+++ /dev/null
@@ -1,353 +0,0 @@
1/*
2 * longhaul.h
3 * (C) 2003 Dave Jones.
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 *
7 * VIA-specific information
8 */
9
10union msr_bcr2 {
11 struct {
12		unsigned Reserved:19,	// 18:0
13 ESOFTBF:1, // 19
14 Reserved2:3, // 22:20
15 CLOCKMUL:4, // 26:23
16 Reserved3:5; // 31:27
17 } bits;
18 unsigned long val;
19};
20
21union msr_longhaul {
22 struct {
23 unsigned RevisionID:4, // 3:0
24 RevisionKey:4, // 7:4
25 EnableSoftBusRatio:1, // 8
26 EnableSoftVID:1, // 9
27 EnableSoftBSEL:1, // 10
28			Reserved:3,		// 13:11
29 SoftBusRatio4:1, // 14
30 VRMRev:1, // 15
31 SoftBusRatio:4, // 19:16
32 SoftVID:5, // 24:20
33 Reserved2:3, // 27:25
34 SoftBSEL:2, // 29:28
35 Reserved3:2, // 31:30
36 MaxMHzBR:4, // 35:32
37 MaximumVID:5, // 40:36
38 MaxMHzFSB:2, // 42:41
39 MaxMHzBR4:1, // 43
40 Reserved4:4, // 47:44
41 MinMHzBR:4, // 51:48
42 MinimumVID:5, // 56:52
43 MinMHzFSB:2, // 58:57
44 MinMHzBR4:1, // 59
45 Reserved5:4; // 63:60
46 } bits;
47 unsigned long long val;
48};
49
50/*
51 * Clock ratio tables. Div/Mod by 10 to get ratio.
52 * The eblcr values specify the ratio read from the CPU.
53 * The mults values specify what to write to the CPU.
54 */
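/*
 * For example, a table value of 65 encodes a 6.5x ratio (65 / 10 = 6,
 * 65 % 10 = 5), and -1 marks a reserved encoding.
 */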
55
56/*
57 * VIA C3 Samuel 1 & Samuel 2 (stepping 0)
58 */
59static const int __cpuinitdata samuel1_mults[16] = {
60 -1, /* 0000 -> RESERVED */
61 30, /* 0001 -> 3.0x */
62 40, /* 0010 -> 4.0x */
63 -1, /* 0011 -> RESERVED */
64 -1, /* 0100 -> RESERVED */
65 35, /* 0101 -> 3.5x */
66 45, /* 0110 -> 4.5x */
67 55, /* 0111 -> 5.5x */
68 60, /* 1000 -> 6.0x */
69 70, /* 1001 -> 7.0x */
70 80, /* 1010 -> 8.0x */
71 50, /* 1011 -> 5.0x */
72 65, /* 1100 -> 6.5x */
73 75, /* 1101 -> 7.5x */
74 -1, /* 1110 -> RESERVED */
75 -1, /* 1111 -> RESERVED */
76};
77
78static const int __cpuinitdata samuel1_eblcr[16] = {
79 50, /* 0000 -> RESERVED */
80 30, /* 0001 -> 3.0x */
81 40, /* 0010 -> 4.0x */
82 -1, /* 0011 -> RESERVED */
83 55, /* 0100 -> 5.5x */
84 35, /* 0101 -> 3.5x */
85 45, /* 0110 -> 4.5x */
86 -1, /* 0111 -> RESERVED */
87 -1, /* 1000 -> RESERVED */
88 70, /* 1001 -> 7.0x */
89 80, /* 1010 -> 8.0x */
90 60, /* 1011 -> 6.0x */
91 -1, /* 1100 -> RESERVED */
92 75, /* 1101 -> 7.5x */
93 -1, /* 1110 -> RESERVED */
94 65, /* 1111 -> 6.5x */
95};
96
97/*
98 * VIA C3 Samuel2 Stepping 1->15
99 */
100static const int __cpuinitdata samuel2_eblcr[16] = {
101 50, /* 0000 -> 5.0x */
102 30, /* 0001 -> 3.0x */
103 40, /* 0010 -> 4.0x */
104 100, /* 0011 -> 10.0x */
105 55, /* 0100 -> 5.5x */
106 35, /* 0101 -> 3.5x */
107 45, /* 0110 -> 4.5x */
108 110, /* 0111 -> 11.0x */
109 90, /* 1000 -> 9.0x */
110 70, /* 1001 -> 7.0x */
111 80, /* 1010 -> 8.0x */
112 60, /* 1011 -> 6.0x */
113 120, /* 1100 -> 12.0x */
114 75, /* 1101 -> 7.5x */
115 130, /* 1110 -> 13.0x */
116 65, /* 1111 -> 6.5x */
117};
118
119/*
120 * VIA C3 Ezra
121 */
122static const int __cpuinitdata ezra_mults[16] = {
123 100, /* 0000 -> 10.0x */
124 30, /* 0001 -> 3.0x */
125 40, /* 0010 -> 4.0x */
126 90, /* 0011 -> 9.0x */
127 95, /* 0100 -> 9.5x */
128 35, /* 0101 -> 3.5x */
129 45, /* 0110 -> 4.5x */
130 55, /* 0111 -> 5.5x */
131 60, /* 1000 -> 6.0x */
132 70, /* 1001 -> 7.0x */
133 80, /* 1010 -> 8.0x */
134 50, /* 1011 -> 5.0x */
135 65, /* 1100 -> 6.5x */
136 75, /* 1101 -> 7.5x */
137 85, /* 1110 -> 8.5x */
138 120, /* 1111 -> 12.0x */
139};
140
141static const int __cpuinitdata ezra_eblcr[16] = {
142 50, /* 0000 -> 5.0x */
143 30, /* 0001 -> 3.0x */
144 40, /* 0010 -> 4.0x */
145 100, /* 0011 -> 10.0x */
146 55, /* 0100 -> 5.5x */
147 35, /* 0101 -> 3.5x */
148 45, /* 0110 -> 4.5x */
149 95, /* 0111 -> 9.5x */
150 90, /* 1000 -> 9.0x */
151 70, /* 1001 -> 7.0x */
152 80, /* 1010 -> 8.0x */
153 60, /* 1011 -> 6.0x */
154 120, /* 1100 -> 12.0x */
155 75, /* 1101 -> 7.5x */
156 85, /* 1110 -> 8.5x */
157 65, /* 1111 -> 6.5x */
158};
159
160/*
161 * VIA C3 (Ezra-T) [C5M].
162 */
163static const int __cpuinitdata ezrat_mults[32] = {
164 100, /* 0000 -> 10.0x */
165 30, /* 0001 -> 3.0x */
166 40, /* 0010 -> 4.0x */
167 90, /* 0011 -> 9.0x */
168 95, /* 0100 -> 9.5x */
169 35, /* 0101 -> 3.5x */
170 45, /* 0110 -> 4.5x */
171 55, /* 0111 -> 5.5x */
172 60, /* 1000 -> 6.0x */
173 70, /* 1001 -> 7.0x */
174 80, /* 1010 -> 8.0x */
175 50, /* 1011 -> 5.0x */
176 65, /* 1100 -> 6.5x */
177 75, /* 1101 -> 7.5x */
178 85, /* 1110 -> 8.5x */
179 120, /* 1111 -> 12.0x */
180
181 -1, /* 0000 -> RESERVED (10.0x) */
182 110, /* 0001 -> 11.0x */
183 -1, /* 0010 -> 12.0x */
184 -1, /* 0011 -> RESERVED (9.0x)*/
185 105, /* 0100 -> 10.5x */
186 115, /* 0101 -> 11.5x */
187 125, /* 0110 -> 12.5x */
188 135, /* 0111 -> 13.5x */
189 140, /* 1000 -> 14.0x */
190 150, /* 1001 -> 15.0x */
191 160, /* 1010 -> 16.0x */
192 130, /* 1011 -> 13.0x */
193 145, /* 1100 -> 14.5x */
194 155, /* 1101 -> 15.5x */
195 -1, /* 1110 -> RESERVED (13.0x) */
196 -1, /* 1111 -> RESERVED (12.0x) */
197};
198
199static const int __cpuinitdata ezrat_eblcr[32] = {
200 50, /* 0000 -> 5.0x */
201 30, /* 0001 -> 3.0x */
202 40, /* 0010 -> 4.0x */
203 100, /* 0011 -> 10.0x */
204 55, /* 0100 -> 5.5x */
205 35, /* 0101 -> 3.5x */
206 45, /* 0110 -> 4.5x */
207 95, /* 0111 -> 9.5x */
208 90, /* 1000 -> 9.0x */
209 70, /* 1001 -> 7.0x */
210 80, /* 1010 -> 8.0x */
211 60, /* 1011 -> 6.0x */
212 120, /* 1100 -> 12.0x */
213 75, /* 1101 -> 7.5x */
214 85, /* 1110 -> 8.5x */
215 65, /* 1111 -> 6.5x */
216
217 -1, /* 0000 -> RESERVED (9.0x) */
218 110, /* 0001 -> 11.0x */
219 120, /* 0010 -> 12.0x */
220 -1, /* 0011 -> RESERVED (10.0x)*/
221 135, /* 0100 -> 13.5x */
222 115, /* 0101 -> 11.5x */
223 125, /* 0110 -> 12.5x */
224 105, /* 0111 -> 10.5x */
225 130, /* 1000 -> 13.0x */
226 150, /* 1001 -> 15.0x */
227 160, /* 1010 -> 16.0x */
228 140, /* 1011 -> 14.0x */
229 -1, /* 1100 -> RESERVED (12.0x) */
230 155, /* 1101 -> 15.5x */
231 -1, /* 1110 -> RESERVED (13.0x) */
232 145, /* 1111 -> 14.5x */
233};
234
235/*
236 * VIA C3 Nehemiah */
237
238static const int __cpuinitdata nehemiah_mults[32] = {
239 100, /* 0000 -> 10.0x */
240 -1, /* 0001 -> 16.0x */
241 40, /* 0010 -> 4.0x */
242 90, /* 0011 -> 9.0x */
243 95, /* 0100 -> 9.5x */
244 -1, /* 0101 -> RESERVED */
245 45, /* 0110 -> 4.5x */
246 55, /* 0111 -> 5.5x */
247 60, /* 1000 -> 6.0x */
248 70, /* 1001 -> 7.0x */
249 80, /* 1010 -> 8.0x */
250 50, /* 1011 -> 5.0x */
251 65, /* 1100 -> 6.5x */
252 75, /* 1101 -> 7.5x */
253 85, /* 1110 -> 8.5x */
254 120, /* 1111 -> 12.0x */
255 -1, /* 0000 -> 10.0x */
256 110, /* 0001 -> 11.0x */
257 -1, /* 0010 -> 12.0x */
258 -1, /* 0011 -> 9.0x */
259 105, /* 0100 -> 10.5x */
260 115, /* 0101 -> 11.5x */
261 125, /* 0110 -> 12.5x */
262 135, /* 0111 -> 13.5x */
263 140, /* 1000 -> 14.0x */
264 150, /* 1001 -> 15.0x */
265 160, /* 1010 -> 16.0x */
266 130, /* 1011 -> 13.0x */
267 145, /* 1100 -> 14.5x */
268 155, /* 1101 -> 15.5x */
269 -1, /* 1110 -> RESERVED (13.0x) */
270 -1, /* 1111 -> 12.0x */
271};
272
273static const int __cpuinitdata nehemiah_eblcr[32] = {
274 50, /* 0000 -> 5.0x */
275 160, /* 0001 -> 16.0x */
276 40, /* 0010 -> 4.0x */
277 100, /* 0011 -> 10.0x */
278 55, /* 0100 -> 5.5x */
279 -1, /* 0101 -> RESERVED */
280 45, /* 0110 -> 4.5x */
281 95, /* 0111 -> 9.5x */
282 90, /* 1000 -> 9.0x */
283 70, /* 1001 -> 7.0x */
284 80, /* 1010 -> 8.0x */
285 60, /* 1011 -> 6.0x */
286 120, /* 1100 -> 12.0x */
287 75, /* 1101 -> 7.5x */
288 85, /* 1110 -> 8.5x */
289 65, /* 1111 -> 6.5x */
290 90, /* 0000 -> 9.0x */
291 110, /* 0001 -> 11.0x */
292 120, /* 0010 -> 12.0x */
293 100, /* 0011 -> 10.0x */
294 135, /* 0100 -> 13.5x */
295 115, /* 0101 -> 11.5x */
296 125, /* 0110 -> 12.5x */
297 105, /* 0111 -> 10.5x */
298 130, /* 1000 -> 13.0x */
299 150, /* 1001 -> 15.0x */
300 160, /* 1010 -> 16.0x */
301 140, /* 1011 -> 14.0x */
302 120, /* 1100 -> 12.0x */
303 155, /* 1101 -> 15.5x */
304 -1, /* 1110 -> RESERVED (13.0x) */
305 145 /* 1111 -> 14.5x */
306};
307
308/*
309 * Voltage scales. Div/Mod by 1000 to get actual voltage.
310 * Which scale to use depends on the VRM type in use.
311 */
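/*
 * For example, the VRM 8.5 entry {1250, 8} below means 1.250 V
 * (1250 / 1000 = 1, 1250 % 1000 = 250) at VID position 8.
 */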
312
313struct mV_pos {
314 unsigned short mV;
315 unsigned short pos;
316};
317
318static const struct mV_pos __cpuinitdata vrm85_mV[32] = {
319 {1250, 8}, {1200, 6}, {1150, 4}, {1100, 2},
320 {1050, 0}, {1800, 30}, {1750, 28}, {1700, 26},
321 {1650, 24}, {1600, 22}, {1550, 20}, {1500, 18},
322 {1450, 16}, {1400, 14}, {1350, 12}, {1300, 10},
323 {1275, 9}, {1225, 7}, {1175, 5}, {1125, 3},
324 {1075, 1}, {1825, 31}, {1775, 29}, {1725, 27},
325 {1675, 25}, {1625, 23}, {1575, 21}, {1525, 19},
326 {1475, 17}, {1425, 15}, {1375, 13}, {1325, 11}
327};
328
329static const unsigned char __cpuinitdata mV_vrm85[32] = {
330 0x04, 0x14, 0x03, 0x13, 0x02, 0x12, 0x01, 0x11,
331 0x00, 0x10, 0x0f, 0x1f, 0x0e, 0x1e, 0x0d, 0x1d,
332 0x0c, 0x1c, 0x0b, 0x1b, 0x0a, 0x1a, 0x09, 0x19,
333 0x08, 0x18, 0x07, 0x17, 0x06, 0x16, 0x05, 0x15
334};
335
336static const struct mV_pos __cpuinitdata mobilevrm_mV[32] = {
337 {1750, 31}, {1700, 30}, {1650, 29}, {1600, 28},
338 {1550, 27}, {1500, 26}, {1450, 25}, {1400, 24},
339 {1350, 23}, {1300, 22}, {1250, 21}, {1200, 20},
340 {1150, 19}, {1100, 18}, {1050, 17}, {1000, 16},
341 {975, 15}, {950, 14}, {925, 13}, {900, 12},
342 {875, 11}, {850, 10}, {825, 9}, {800, 8},
343 {775, 7}, {750, 6}, {725, 5}, {700, 4},
344 {675, 3}, {650, 2}, {625, 1}, {600, 0}
345};
346
347static const unsigned char __cpuinitdata mV_mobilevrm[32] = {
348 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
349 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
350 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08,
351 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00
352};
353
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
deleted file mode 100644
index fc09f142d94d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ /dev/null
@@ -1,327 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
7 */
8
9#include <linux/kernel.h>
10#include <linux/module.h>
11#include <linux/init.h>
12#include <linux/cpufreq.h>
13#include <linux/timex.h>
14
15#include <asm/msr.h>
16#include <asm/processor.h>
17
18#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
19 "longrun", msg)
20
21static struct cpufreq_driver longrun_driver;
22
23/**
24 * longrun_{low,high}_freq is needed for the conversion of cpufreq kHz
25 * values into per cent values. In TMTA microcode, the following is valid:
26 * performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
27 */
28static unsigned int longrun_low_freq, longrun_high_freq;
29
30
31/**
32 * longrun_get_policy - get the current LongRun policy
33 * @policy: struct cpufreq_policy where current policy is written into
34 *
35 * Reads the current LongRun policy by access to MSR_TMTA_LONGRUN_FLAGS
36 * and MSR_TMTA_LONGRUN_CTRL
37 */
38static void __init longrun_get_policy(struct cpufreq_policy *policy)
39{
40 u32 msr_lo, msr_hi;
41
42 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
43 dprintk("longrun flags are %x - %x\n", msr_lo, msr_hi);
44 if (msr_lo & 0x01)
45 policy->policy = CPUFREQ_POLICY_PERFORMANCE;
46 else
47 policy->policy = CPUFREQ_POLICY_POWERSAVE;
48
49 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
50 dprintk("longrun ctrl is %x - %x\n", msr_lo, msr_hi);
51 msr_lo &= 0x0000007F;
52 msr_hi &= 0x0000007F;
53
54 if (longrun_high_freq <= longrun_low_freq) {
55 /* Assume degenerate Longrun table */
56 policy->min = policy->max = longrun_high_freq;
57 } else {
58 policy->min = longrun_low_freq + msr_lo *
59 ((longrun_high_freq - longrun_low_freq) / 100);
60 policy->max = longrun_low_freq + msr_hi *
61 ((longrun_high_freq - longrun_low_freq) / 100);
62 }
63 policy->cpu = 0;
64}
65
66
67/**
68 * longrun_set_policy - sets a new CPUFreq policy
69 * @policy: new policy
70 *
71 * Sets a new CPUFreq policy on LongRun-capable processors. This function
72 * has to be called with cpufreq_driver locked.
73 */
74static int longrun_set_policy(struct cpufreq_policy *policy)
75{
76 u32 msr_lo, msr_hi;
77 u32 pctg_lo, pctg_hi;
78
79 if (!policy)
80 return -EINVAL;
81
82 if (longrun_high_freq <= longrun_low_freq) {
83 /* Assume degenerate Longrun table */
84 pctg_lo = pctg_hi = 100;
85 } else {
86 pctg_lo = (policy->min - longrun_low_freq) /
87 ((longrun_high_freq - longrun_low_freq) / 100);
88 pctg_hi = (policy->max - longrun_low_freq) /
89 ((longrun_high_freq - longrun_low_freq) / 100);
90 }
91
92 if (pctg_hi > 100)
93 pctg_hi = 100;
94 if (pctg_lo > pctg_hi)
95 pctg_lo = pctg_hi;
96
97 /* performance or economy mode */
98 rdmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
99 msr_lo &= 0xFFFFFFFE;
100 switch (policy->policy) {
101 case CPUFREQ_POLICY_PERFORMANCE:
102 msr_lo |= 0x00000001;
103 break;
104 case CPUFREQ_POLICY_POWERSAVE:
105 break;
106 }
107 wrmsr(MSR_TMTA_LONGRUN_FLAGS, msr_lo, msr_hi);
108
109 /* lower and upper boundary */
110 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
111 msr_lo &= 0xFFFFFF80;
112 msr_hi &= 0xFFFFFF80;
113 msr_lo |= pctg_lo;
114 msr_hi |= pctg_hi;
115 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
116
117 return 0;
118}
119
120
121/**
122 * longrun_verify_policy - verifies a new CPUFreq policy
123 * @policy: the policy to verify
124 *
125 * Validates a new CPUFreq policy. This function has to be called with
126 * cpufreq_driver locked.
127 */
128static int longrun_verify_policy(struct cpufreq_policy *policy)
129{
130 if (!policy)
131 return -EINVAL;
132
133 policy->cpu = 0;
134 cpufreq_verify_within_limits(policy,
135 policy->cpuinfo.min_freq,
136 policy->cpuinfo.max_freq);
137
138 if ((policy->policy != CPUFREQ_POLICY_POWERSAVE) &&
139 (policy->policy != CPUFREQ_POLICY_PERFORMANCE))
140 return -EINVAL;
141
142 return 0;
143}
144
145static unsigned int longrun_get(unsigned int cpu)
146{
147 u32 eax, ebx, ecx, edx;
148
149 if (cpu)
150 return 0;
151
152 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
153 dprintk("cpuid eax is %u\n", eax);
154
155 return eax * 1000;
156}
157
158/**
159 * longrun_determine_freqs - determines the lowest and highest possible core frequency
160 * @low_freq: an int to put the lowest frequency into
161 * @high_freq: an int to put the highest frequency into
162 *
163 * Determines the lowest and highest possible core frequencies on this CPU.
164 * This is necessary to calculate the performance percentage according to
165 * TMTA rules:
166 * performance_pctg = (target_freq - low_freq)/(high_freq - low_freq)
167 */
168static unsigned int __cpuinit longrun_determine_freqs(unsigned int *low_freq,
169 unsigned int *high_freq)
170{
171 u32 msr_lo, msr_hi;
172 u32 save_lo, save_hi;
173 u32 eax, ebx, ecx, edx;
174 u32 try_hi;
175 struct cpuinfo_x86 *c = &cpu_data(0);
176
177 if (!low_freq || !high_freq)
178 return -EINVAL;
179
180 if (cpu_has(c, X86_FEATURE_LRTI)) {
181 /* if the LongRun Table Interface is present, the
182 * detection is a bit easier:
183 * For minimum frequency, read out the maximum
184 * level (msr_hi), write that into "currently
185 * selected level", and read out the frequency.
186 * For maximum frequency, read out level zero.
187 */
188 /* minimum */
189 rdmsr(MSR_TMTA_LRTI_READOUT, msr_lo, msr_hi);
190 wrmsr(MSR_TMTA_LRTI_READOUT, msr_hi, msr_hi);
191 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
192 *low_freq = msr_lo * 1000; /* to kHz */
193
194 /* maximum */
195 wrmsr(MSR_TMTA_LRTI_READOUT, 0, msr_hi);
196 rdmsr(MSR_TMTA_LRTI_VOLT_MHZ, msr_lo, msr_hi);
197 *high_freq = msr_lo * 1000; /* to kHz */
198
199 dprintk("longrun table interface told %u - %u kHz\n",
200 *low_freq, *high_freq);
201
202 if (*low_freq > *high_freq)
203 *low_freq = *high_freq;
204 return 0;
205 }
206
207 /* set the upper border to the value determined during TSC init */
208 *high_freq = (cpu_khz / 1000);
209 *high_freq = *high_freq * 1000;
210 dprintk("high frequency is %u kHz\n", *high_freq);
211
212 /* get current borders */
213 rdmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
214 save_lo = msr_lo & 0x0000007F;
215 save_hi = msr_hi & 0x0000007F;
216
217 /* if current perf_pctg is larger than 90%, we need to decrease the
218 * upper limit to make the calculation more accurate.
219 */
220 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
221 /* try decreasing in 10% steps, some processors react only
222 * on some barrier values */
223 for (try_hi = 80; try_hi > 0 && ecx > 90; try_hi -= 10) {
224 /* set to 0 to try_hi perf_pctg */
225 msr_lo &= 0xFFFFFF80;
226 msr_hi &= 0xFFFFFF80;
227 msr_hi |= try_hi;
228 wrmsr(MSR_TMTA_LONGRUN_CTRL, msr_lo, msr_hi);
229
230 /* read out current core MHz and current perf_pctg */
231 cpuid(0x80860007, &eax, &ebx, &ecx, &edx);
232
233 /* restore values */
234 wrmsr(MSR_TMTA_LONGRUN_CTRL, save_lo, save_hi);
235 }
236 dprintk("percentage is %u %%, freq is %u MHz\n", ecx, eax);
237
238 /* performance_pctg = (current_freq - low_freq)/(high_freq - low_freq)
239	 * equals
240	 * low_freq * (1 - perf_pctg) = (cur_freq - high_freq * perf_pctg)
241	 *
242	 * high_freq * perf_pctg is stored temporarily in "ebx".
243 */
244 ebx = (((cpu_khz / 1000) * ecx) / 100); /* to MHz */
245
246 if ((ecx > 95) || (ecx == 0) || (eax < ebx))
247 return -EIO;
248
249 edx = ((eax - ebx) * 100) / (100 - ecx);
250 *low_freq = edx * 1000; /* back to kHz */
251
252 dprintk("low frequency is %u kHz\n", *low_freq);
253
254 if (*low_freq > *high_freq)
255 *low_freq = *high_freq;
256
257 return 0;
258}
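/*
 * Worked example of the fallback calculation above, with assumed numbers:
 * if cpu_khz gives high_freq = 1000 MHz, and the CPUID readout reports
 * perf_pctg = 50 (ecx) with a current core clock of 700 MHz (eax), then
 * ebx = 1000 * 50 / 100 = 500 and edx = (700 - 500) * 100 / (100 - 50) = 400,
 * so low_freq is reported as 400000 kHz. Check: 400 + 0.5 * (1000 - 400)
 * = 700 MHz, matching the observed frequency.
 */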
259
260
261static int __cpuinit longrun_cpu_init(struct cpufreq_policy *policy)
262{
263 int result = 0;
264
265 /* capability check */
266 if (policy->cpu != 0)
267 return -ENODEV;
268
269 /* detect low and high frequency */
270 result = longrun_determine_freqs(&longrun_low_freq, &longrun_high_freq);
271 if (result)
272 return result;
273
274 /* cpuinfo and default policy values */
275 policy->cpuinfo.min_freq = longrun_low_freq;
276 policy->cpuinfo.max_freq = longrun_high_freq;
277 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
278 longrun_get_policy(policy);
279
280 return 0;
281}
282
283
284static struct cpufreq_driver longrun_driver = {
285 .flags = CPUFREQ_CONST_LOOPS,
286 .verify = longrun_verify_policy,
287 .setpolicy = longrun_set_policy,
288 .get = longrun_get,
289 .init = longrun_cpu_init,
290 .name = "longrun",
291 .owner = THIS_MODULE,
292};
293
294
295/**
296 * longrun_init - initializes the Transmeta Crusoe LongRun CPUFreq driver
297 *
298 * Initializes the LongRun support.
299 */
300static int __init longrun_init(void)
301{
302 struct cpuinfo_x86 *c = &cpu_data(0);
303
304 if (c->x86_vendor != X86_VENDOR_TRANSMETA ||
305 !cpu_has(c, X86_FEATURE_LONGRUN))
306 return -ENODEV;
307
308 return cpufreq_register_driver(&longrun_driver);
309}
310
311
312/**
313 * longrun_exit - unregisters LongRun support
314 */
315static void __exit longrun_exit(void)
316{
317 cpufreq_unregister_driver(&longrun_driver);
318}
319
320
321MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
322MODULE_DESCRIPTION("LongRun driver for Transmeta Crusoe and "
323 "Efficeon processors.");
324MODULE_LICENSE("GPL");
325
326module_init(longrun_init);
327module_exit(longrun_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.c b/arch/x86/kernel/cpu/cpufreq/mperf.c
deleted file mode 100644
index 911e193018ae..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.c
+++ /dev/null
@@ -1,51 +0,0 @@
1#include <linux/kernel.h>
2#include <linux/smp.h>
3#include <linux/module.h>
4#include <linux/init.h>
5#include <linux/cpufreq.h>
6#include <linux/slab.h>
7
8#include "mperf.h"
9
10static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
11
12/* Called via smp_call_function_single(), on the target CPU */
13static void read_measured_perf_ctrs(void *_cur)
14{
15 struct aperfmperf *am = _cur;
16
17 get_aperfmperf(am);
18}
19
20/*
21 * Return the measured active (C0) frequency on this CPU since last call
22 * to this function.
23 * Input: cpu number
24 * Return: Average CPU frequency in terms of max frequency (zero on error)
25 *
26 * We use IA32_MPERF and IA32_APERF MSRs to get the measured performance
27 * over a period of time, while CPU is in C0 state.
28 * IA32_MPERF counts at the rate of max advertised frequency
29 * IA32_APERF counts at the rate of actual CPU frequency
30 * Only IA32_APERF/IA32_MPERF ratio is architecturally defined and
31 * no meaning should be associated with absolute values of these MSRs.
32 */
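/*
 * As a rough illustration: if APERF advanced by 1,500,000 counts and MPERF
 * by 2,000,000 counts since the previous call, the ratio is 0.75 and the
 * value returned is about 0.75 * cpuinfo.max_freq (the ratio is kept in
 * fixed point and shifted by APERFMPERF_SHIFT).
 */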
33unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
34 unsigned int cpu)
35{
36 struct aperfmperf perf;
37 unsigned long ratio;
38 unsigned int retval;
39
40 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
41 return 0;
42
43 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
44 per_cpu(acfreq_old_perf, cpu) = perf;
45
46 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
47
48 return retval;
49}
50EXPORT_SYMBOL_GPL(cpufreq_get_measured_perf);
51MODULE_LICENSE("GPL");
diff --git a/arch/x86/kernel/cpu/cpufreq/mperf.h b/arch/x86/kernel/cpu/cpufreq/mperf.h
deleted file mode 100644
index 5dbf2950dc22..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/mperf.h
+++ /dev/null
@@ -1,9 +0,0 @@
1/*
2 * (c) 2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8unsigned int cpufreq_get_measured_perf(struct cpufreq_policy *policy,
9 unsigned int cpu);
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
deleted file mode 100644
index bd1cac747f67..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ /dev/null
@@ -1,331 +0,0 @@
1/*
2 * Pentium 4/Xeon CPU on demand clock modulation/speed scaling
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 * (C) 2002 Zwane Mwaikambo <zwane@commfireservices.com>
5 * (C) 2002 Arjan van de Ven <arjanv@redhat.com>
6 * (C) 2002 Tora T. Engstad
7 * All Rights Reserved
8 *
9 * This program is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU General Public License
11 * as published by the Free Software Foundation; either version
12 * 2 of the License, or (at your option) any later version.
13 *
14 * The author(s) of this software shall not be held liable for damages
15 * of any nature resulting due to the use of this software. This
16 * software is provided AS-IS with no warranties.
17 *
18 * Date Errata Description
19 * 20020525 N44, O17 12.5% or 25% DC causes lockup
20 *
21 */
22
23#include <linux/kernel.h>
24#include <linux/module.h>
25#include <linux/init.h>
26#include <linux/smp.h>
27#include <linux/cpufreq.h>
28#include <linux/cpumask.h>
29#include <linux/timex.h>
30
31#include <asm/processor.h>
32#include <asm/msr.h>
33#include <asm/timer.h>
34
35#include "speedstep-lib.h"
36
37#define PFX "p4-clockmod: "
38#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
39 "p4-clockmod", msg)
40
41/*
42 * Duty Cycle (3 bits); note that DC_DISABLE is not specified in
43 * the Intel docs, it is just used here to mean "disable"
44 */
45enum {
46 DC_RESV, DC_DFLT, DC_25PT, DC_38PT, DC_50PT,
47 DC_64PT, DC_75PT, DC_88PT, DC_DISABLE
48};
49
50#define DC_ENTRIES 8
51
52
53static int has_N44_O17_errata[NR_CPUS];
54static unsigned int stock_freq;
55static struct cpufreq_driver p4clockmod_driver;
56static unsigned int cpufreq_p4_get(unsigned int cpu);
57
58static int cpufreq_p4_setdc(unsigned int cpu, unsigned int newstate)
59{
60 u32 l, h;
61
62 if (!cpu_online(cpu) ||
63 (newstate > DC_DISABLE) || (newstate == DC_RESV))
64 return -EINVAL;
65
66 rdmsr_on_cpu(cpu, MSR_IA32_THERM_STATUS, &l, &h);
67
68 if (l & 0x01)
69 dprintk("CPU#%d currently thermal throttled\n", cpu);
70
71 if (has_N44_O17_errata[cpu] &&
72 (newstate == DC_25PT || newstate == DC_DFLT))
73 newstate = DC_38PT;
74
75 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
76 if (newstate == DC_DISABLE) {
77 dprintk("CPU#%d disabling modulation\n", cpu);
78 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l & ~(1<<4), h);
79 } else {
80 dprintk("CPU#%d setting duty cycle to %d%%\n",
81 cpu, ((125 * newstate) / 10));
82 /* bits 63 - 5 : reserved
83 * bit 4 : enable/disable
84 * bits 3-1 : duty cycle
85 * bit 0 : reserved
86 */
87 l = (l & ~14);
88 l = l | (1<<4) | ((newstate & 0x7)<<1);
89 wrmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, l, h);
90 }
91
92 return 0;
93}
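/*
 * Encoding example: for newstate == DC_50PT (4), the write above sets
 * l = (l & ~14) | (1 << 4) | (4 << 1), i.e. 0x18 in the low bits:
 * bit 4 enables modulation and the 3-bit field 3:1 holds step 4,
 * which is 4 * 12.5% = 50% duty cycle.
 */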
94
95
96static struct cpufreq_frequency_table p4clockmod_table[] = {
97 {DC_RESV, CPUFREQ_ENTRY_INVALID},
98 {DC_DFLT, 0},
99 {DC_25PT, 0},
100 {DC_38PT, 0},
101 {DC_50PT, 0},
102 {DC_64PT, 0},
103 {DC_75PT, 0},
104 {DC_88PT, 0},
105 {DC_DISABLE, 0},
106 {DC_RESV, CPUFREQ_TABLE_END},
107};
108
109
110static int cpufreq_p4_target(struct cpufreq_policy *policy,
111 unsigned int target_freq,
112 unsigned int relation)
113{
114 unsigned int newstate = DC_RESV;
115 struct cpufreq_freqs freqs;
116 int i;
117
118 if (cpufreq_frequency_table_target(policy, &p4clockmod_table[0],
119 target_freq, relation, &newstate))
120 return -EINVAL;
121
122 freqs.old = cpufreq_p4_get(policy->cpu);
123 freqs.new = stock_freq * p4clockmod_table[newstate].index / 8;
124
125 if (freqs.new == freqs.old)
126 return 0;
127
128 /* notifiers */
129 for_each_cpu(i, policy->cpus) {
130 freqs.cpu = i;
131 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
132 }
133
134 /* run on each logical CPU,
135 * see section 13.15.3 of IA32 Intel Architecture Software
136 * Developer's Manual, Volume 3
137 */
138 for_each_cpu(i, policy->cpus)
139 cpufreq_p4_setdc(i, p4clockmod_table[newstate].index);
140
141 /* notifiers */
142 for_each_cpu(i, policy->cpus) {
143 freqs.cpu = i;
144 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
145 }
146
147 return 0;
148}
149
150
151static int cpufreq_p4_verify(struct cpufreq_policy *policy)
152{
153 return cpufreq_frequency_table_verify(policy, &p4clockmod_table[0]);
154}
155
156
157static unsigned int cpufreq_p4_get_frequency(struct cpuinfo_x86 *c)
158{
159 if (c->x86 == 0x06) {
160 if (cpu_has(c, X86_FEATURE_EST))
161 printk(KERN_WARNING PFX "Warning: EST-capable CPU "
162 "detected. The acpi-cpufreq module offers "
163				"voltage scaling in addition to frequency "
164 "scaling. You should use that instead of "
165 "p4-clockmod, if possible.\n");
166 switch (c->x86_model) {
167 case 0x0E: /* Core */
168 case 0x0F: /* Core Duo */
169 case 0x16: /* Celeron Core */
170 case 0x1C: /* Atom */
171 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
172 return speedstep_get_frequency(SPEEDSTEP_CPU_PCORE);
173 case 0x0D: /* Pentium M (Dothan) */
174 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
175 /* fall through */
176 case 0x09: /* Pentium M (Banias) */
177 return speedstep_get_frequency(SPEEDSTEP_CPU_PM);
178 }
179 }
180
181 if (c->x86 != 0xF)
182 return 0;
183
184	/* on P-4s, the TSC runs at a constant frequency independent of whether
185	 * throttling is active or not. */
186 p4clockmod_driver.flags |= CPUFREQ_CONST_LOOPS;
187
188 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4M) {
189 printk(KERN_WARNING PFX "Warning: Pentium 4-M detected. "
190 "The speedstep-ich or acpi cpufreq modules offer "
191			"voltage scaling in addition to frequency scaling. "
192 "You should use either one instead of p4-clockmod, "
193 "if possible.\n");
194 return speedstep_get_frequency(SPEEDSTEP_CPU_P4M);
195 }
196
197 return speedstep_get_frequency(SPEEDSTEP_CPU_P4D);
198}
199
200
201
202static int cpufreq_p4_cpu_init(struct cpufreq_policy *policy)
203{
204 struct cpuinfo_x86 *c = &cpu_data(policy->cpu);
205 int cpuid = 0;
206 unsigned int i;
207
208#ifdef CONFIG_SMP
209 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
210#endif
211
212 /* Errata workaround */
213 cpuid = (c->x86 << 8) | (c->x86_model << 4) | c->x86_mask;
214 switch (cpuid) {
215 case 0x0f07:
216 case 0x0f0a:
217 case 0x0f11:
218 case 0x0f12:
219 has_N44_O17_errata[policy->cpu] = 1;
220 dprintk("has errata -- disabling low frequencies\n");
221 }
222
223 if (speedstep_detect_processor() == SPEEDSTEP_CPU_P4D &&
224 c->x86_model < 2) {
225 /* switch to maximum frequency and measure result */
226 cpufreq_p4_setdc(policy->cpu, DC_DISABLE);
227 recalibrate_cpu_khz();
228 }
229 /* get max frequency */
230 stock_freq = cpufreq_p4_get_frequency(c);
231 if (!stock_freq)
232 return -EINVAL;
233
234 /* table init */
235 for (i = 1; (p4clockmod_table[i].frequency != CPUFREQ_TABLE_END); i++) {
236 if ((i < 2) && (has_N44_O17_errata[policy->cpu]))
237 p4clockmod_table[i].frequency = CPUFREQ_ENTRY_INVALID;
238 else
239 p4clockmod_table[i].frequency = (stock_freq * i)/8;
240 }
241 cpufreq_frequency_table_get_attr(p4clockmod_table, policy->cpu);
242
243 /* cpuinfo and default policy values */
244
245 /* the transition latency is set to be 1 higher than the maximum
246 * transition latency of the ondemand governor */
247 policy->cpuinfo.transition_latency = 10000001;
248 policy->cur = stock_freq;
249
250 return cpufreq_frequency_table_cpuinfo(policy, &p4clockmod_table[0]);
251}
252
253
254static int cpufreq_p4_cpu_exit(struct cpufreq_policy *policy)
255{
256 cpufreq_frequency_table_put_attr(policy->cpu);
257 return 0;
258}
259
260static unsigned int cpufreq_p4_get(unsigned int cpu)
261{
262 u32 l, h;
263
264 rdmsr_on_cpu(cpu, MSR_IA32_THERM_CONTROL, &l, &h);
265
266 if (l & 0x10) {
267 l = l >> 1;
268 l &= 0x7;
269 } else
270 l = DC_DISABLE;
271
272 if (l != DC_DISABLE)
273 return stock_freq * l / 8;
274
275 return stock_freq;
276}
277
278static struct freq_attr *p4clockmod_attr[] = {
279 &cpufreq_freq_attr_scaling_available_freqs,
280 NULL,
281};
282
283static struct cpufreq_driver p4clockmod_driver = {
284 .verify = cpufreq_p4_verify,
285 .target = cpufreq_p4_target,
286 .init = cpufreq_p4_cpu_init,
287 .exit = cpufreq_p4_cpu_exit,
288 .get = cpufreq_p4_get,
289 .name = "p4-clockmod",
290 .owner = THIS_MODULE,
291 .attr = p4clockmod_attr,
292};
293
294
295static int __init cpufreq_p4_init(void)
296{
297 struct cpuinfo_x86 *c = &cpu_data(0);
298 int ret;
299
300 /*
301 * THERM_CONTROL is architectural for IA32 now, so
302 * we can rely on the capability checks
303 */
304 if (c->x86_vendor != X86_VENDOR_INTEL)
305 return -ENODEV;
306
307 if (!test_cpu_cap(c, X86_FEATURE_ACPI) ||
308 !test_cpu_cap(c, X86_FEATURE_ACC))
309 return -ENODEV;
310
311 ret = cpufreq_register_driver(&p4clockmod_driver);
312 if (!ret)
313 printk(KERN_INFO PFX "P4/Xeon(TM) CPU On-Demand Clock "
314 "Modulation available\n");
315
316 return ret;
317}
318
319
320static void __exit cpufreq_p4_exit(void)
321{
322 cpufreq_unregister_driver(&p4clockmod_driver);
323}
324
325
326MODULE_AUTHOR("Zwane Mwaikambo <zwane@commfireservices.com>");
327MODULE_DESCRIPTION("cpufreq driver for Pentium(TM) 4/Xeon(TM)");
328MODULE_LICENSE("GPL");
329
330late_initcall(cpufreq_p4_init);
331module_exit(cpufreq_p4_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
deleted file mode 100644
index 4f6f679f2799..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
+++ /dev/null
@@ -1,626 +0,0 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu __percpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
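/*
 * pcc_cmd() below rings the PCC doorbell: it reads the current doorbell
 * value, writes it back with the preserve mask applied and the write bits
 * set, and then polls the PCCH status word up to POLL_LOOPS times for
 * CMD_COMPLETE. Callers re-check the status afterwards, since the poll
 * can time out without the command having completed.
 */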
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
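/*
 * Request encoding example, using assumed numbers: with a nominal
 * frequency of 2000 MHz and target_freq = 1500000 kHz, the expression
 * (target_freq * 100) / (nominal * 1000) evaluates to 75, so the input
 * buffer carries 75 (percent of nominal) in bits 15:8 with the low bit
 * set, as in the GET path above.
 */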
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275	}
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle, pcch_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "PCCH", &pcch_handle);
408 if (ACPI_FAILURE(status))
409 return -ENODEV;
410
411 status = acpi_get_handle(handle, "_OSC", &osc_handle);
412 if (ACPI_SUCCESS(status)) {
413 ret = pcc_cpufreq_do_osc(&osc_handle);
414 if (ret)
415 dprintk("probe: _OSC evaluation did not succeed\n");
416 /* Firmware's use of _OSC is optional */
417 ret = 0;
418 }
419
420 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
421 if (ACPI_FAILURE(status))
422 return -ENODEV;
423
424 out_obj = output.pointer;
425 if (out_obj->type != ACPI_TYPE_PACKAGE) {
426 ret = -ENODEV;
427 goto out_free;
428 }
429
430 member = &out_obj->package.elements[0];
431 if (member->type != ACPI_TYPE_BUFFER) {
432 ret = -ENODEV;
433 goto out_free;
434 }
435
436 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
437
438 dprintk("probe: mem_resource descriptor: 0x%x,"
439 " length: %d, space_id: %d, resource_usage: %d,"
440 " type_specific: %d, granularity: 0x%llx,"
441 " minimum: 0x%llx, maximum: 0x%llx,"
442 " translation_offset: 0x%llx, address_length: 0x%llx\n",
443 mem_resource->descriptor, mem_resource->length,
444 mem_resource->space_id, mem_resource->resource_usage,
445 mem_resource->type_specific, mem_resource->granularity,
446 mem_resource->minimum, mem_resource->maximum,
447 mem_resource->translation_offset,
448 mem_resource->address_length);
449
450 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
451 ret = -ENODEV;
452 goto out_free;
453 }
454
455 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
456 mem_resource->address_length);
457 if (pcch_virt_addr == NULL) {
458 dprintk("probe: could not map shared mem region\n");
		ret = -ENOMEM;
459		goto out_free;
460 }
461 pcch_hdr = pcch_virt_addr;
462
463 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
464 dprintk("probe: PCCH header is at physical address: 0x%llx,"
465 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
466 " supported features: 0x%x, command field: 0x%x,"
467 " status field: 0x%x, nominal latency: %d us\n",
468 mem_resource->minimum, ioread32(&pcch_hdr->signature),
469 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
470 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
471 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
472 ioread32(&pcch_hdr->latency));
473
474 dprintk("probe: min time between commands: %d us,"
475 " max time between commands: %d us,"
476 " nominal CPU frequency: %d MHz,"
477 " minimum CPU frequency: %d MHz,"
478 " minimum CPU frequency without throttling: %d MHz\n",
479 ioread32(&pcch_hdr->minimum_time),
480 ioread32(&pcch_hdr->maximum_time),
481 ioread32(&pcch_hdr->nominal),
482 ioread32(&pcch_hdr->throttled_frequency),
483 ioread32(&pcch_hdr->minimum_frequency));
484
485 member = &out_obj->package.elements[1];
486 if (member->type != ACPI_TYPE_BUFFER) {
487 ret = -ENODEV;
488 goto pcch_free;
489 }
490
491 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
492
493 doorbell.space_id = reg_resource->space_id;
494 doorbell.bit_width = reg_resource->bit_width;
495 doorbell.bit_offset = reg_resource->bit_offset;
496 doorbell.access_width = 64;
497 doorbell.address = reg_resource->address;
498
499 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
500 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
501 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
502 doorbell.access_width, reg_resource->address);
503
504 member = &out_obj->package.elements[2];
505 if (member->type != ACPI_TYPE_INTEGER) {
506 ret = -ENODEV;
507 goto pcch_free;
508 }
509
510 doorbell_preserve = member->integer.value;
511
512 member = &out_obj->package.elements[3];
513 if (member->type != ACPI_TYPE_INTEGER) {
514 ret = -ENODEV;
515 goto pcch_free;
516 }
517
518 doorbell_write = member->integer.value;
519
520 dprintk("probe: doorbell_preserve: 0x%llx,"
521 " doorbell_write: 0x%llx\n",
522 doorbell_preserve, doorbell_write);
523
524 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
525 if (!pcc_cpu_info) {
526 ret = -ENOMEM;
527 goto pcch_free;
528 }
529
530 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
531 " limits: %d MHz, %d MHz\n", PCC_VERSION,
532 ioread32(&pcch_hdr->minimum_frequency),
533 ioread32(&pcch_hdr->nominal));
534 kfree(output.pointer);
535 return ret;
536pcch_free:
537 pcc_clear_mapping();
538out_free:
539 kfree(output.pointer);
540 return ret;
541}
542
543static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
544{
545 unsigned int cpu = policy->cpu;
546 unsigned int result = 0;
547
548 if (!pcch_virt_addr) {
549		result = -ENODEV;
550 goto out;
551 }
552
553 result = pcc_get_offset(cpu);
554 if (result) {
555 dprintk("init: PCCP evaluation failed\n");
556 goto out;
557 }
558
559 policy->max = policy->cpuinfo.max_freq =
560 ioread32(&pcch_hdr->nominal) * 1000;
561 policy->min = policy->cpuinfo.min_freq =
562 ioread32(&pcch_hdr->minimum_frequency) * 1000;
563 policy->cur = pcc_get_freq(cpu);
564
565 if (!policy->cur) {
566 dprintk("init: Unable to get current CPU frequency\n");
567 result = -EINVAL;
568 goto out;
569 }
570
571 dprintk("init: policy->max is %d, policy->min is %d\n",
572 policy->max, policy->min);
573out:
574 return result;
575}
576
577static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
578{
579 return 0;
580}
581
582static struct cpufreq_driver pcc_cpufreq_driver = {
583 .flags = CPUFREQ_CONST_LOOPS,
584 .get = pcc_get_freq,
585 .verify = pcc_cpufreq_verify,
586 .target = pcc_cpufreq_target,
587 .init = pcc_cpufreq_cpu_init,
588 .exit = pcc_cpufreq_cpu_exit,
589 .name = "pcc-cpufreq",
590 .owner = THIS_MODULE,
591};
592
593static int __init pcc_cpufreq_init(void)
594{
595 int ret;
596
597 if (acpi_disabled)
598 return 0;
599
600 ret = pcc_cpufreq_probe();
601 if (ret) {
602 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
603 return ret;
604 }
605
606 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
607
608 return ret;
609}
610
611static void __exit pcc_cpufreq_exit(void)
612{
613 cpufreq_unregister_driver(&pcc_cpufreq_driver);
614
615 pcc_clear_mapping();
616
617 free_percpu(pcc_cpu_info);
618}
619
620MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
621MODULE_VERSION(PCC_VERSION);
622MODULE_DESCRIPTION("Processor Clocking Control interface driver");
623MODULE_LICENSE("GPL");
624
625late_initcall(pcc_cpufreq_init);
626module_exit(pcc_cpufreq_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
deleted file mode 100644
index b3379d6a5c57..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ /dev/null
@@ -1,261 +0,0 @@
1/*
2 * This file was based upon code in Powertweak Linux (http://powertweak.sf.net)
3 * (C) 2000-2003 Dave Jones, Arjan van de Ven, Janne Pänkälä,
4 * Dominik Brodowski.
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/init.h>
14#include <linux/cpufreq.h>
15#include <linux/ioport.h>
16#include <linux/timex.h>
17#include <linux/io.h>
18
19#include <asm/msr.h>
20
21#define POWERNOW_IOPORT 0xfff0 /* it doesn't matter where, as long
22 as it is unused */
23
24#define PFX "powernow-k6: "
25static unsigned int busfreq; /* FSB, in 10 kHz */
26static unsigned int max_multiplier;
27
28
29/* Clock ratio multiplied by 10 - see table 27 in AMD#23446 */
30static struct cpufreq_frequency_table clock_ratio[] = {
31 {45, /* 000 -> 4.5x */ 0},
32 {50, /* 001 -> 5.0x */ 0},
33 {40, /* 010 -> 4.0x */ 0},
34 {55, /* 011 -> 5.5x */ 0},
35 {20, /* 100 -> 2.0x */ 0},
36 {30, /* 101 -> 3.0x */ 0},
37 {60, /* 110 -> 6.0x */ 0},
38 {35, /* 111 -> 3.5x */ 0},
39 {0, CPUFREQ_TABLE_END}
40};
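/*
 * Illustrative example (values assumed, not from the datasheet): with
 * cpu_khz = 450000 and a current maximum multiplier of 45 (4.5x),
 * powernow_k6_cpu_init() computes busfreq = 450000 / 45 = 10000, i.e. a
 * 100 MHz FSB expressed in 10 kHz units, and the 4.0x entry above is then
 * filled in as busfreq * 40 = 400000 kHz (400 MHz).
 */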
41
42
43/**
44 * powernow_k6_get_cpu_multiplier - returns the current FSB multiplier
45 *
46 * Returns the current setting of the frequency multiplier. Core clock
47 * speed is the frequency of the Front-Side Bus multiplied by this value.
48 */
49static int powernow_k6_get_cpu_multiplier(void)
50{
51 u64 invalue = 0;
52 u32 msrval;
53
54 msrval = POWERNOW_IOPORT + 0x1;
55 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
56 invalue = inl(POWERNOW_IOPORT + 0x8);
57 msrval = POWERNOW_IOPORT + 0x0;
58 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
59
60 return clock_ratio[(invalue >> 5)&7].index;
61}
62
63
64/**
65 * powernow_k6_set_state - set the PowerNow! multiplier
66 * @best_i: clock_ratio[best_i] is the target multiplier
67 *
68 * Tries to change the PowerNow! multiplier
69 */
70static void powernow_k6_set_state(unsigned int best_i)
71{
72 unsigned long outvalue = 0, invalue = 0;
73 unsigned long msrval;
74 struct cpufreq_freqs freqs;
75
76 if (clock_ratio[best_i].index > max_multiplier) {
77 printk(KERN_ERR PFX "invalid target frequency\n");
78 return;
79 }
80
81 freqs.old = busfreq * powernow_k6_get_cpu_multiplier();
82 freqs.new = busfreq * clock_ratio[best_i].index;
83 freqs.cpu = 0; /* powernow-k6.c is UP only driver */
84
85 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
86
87 /* we now need to transform best_i to the BVC format, see AMD#23446 */
88
89 outvalue = (1<<12) | (1<<10) | (1<<9) | (best_i<<5);
90
91 msrval = POWERNOW_IOPORT + 0x1;
92 wrmsr(MSR_K6_EPMR, msrval, 0); /* enable the PowerNow port */
93 invalue = inl(POWERNOW_IOPORT + 0x8);
94 invalue = invalue & 0xf;
95 outvalue = outvalue | invalue;
96 outl(outvalue , (POWERNOW_IOPORT + 0x8));
97 msrval = POWERNOW_IOPORT + 0x0;
98 wrmsr(MSR_K6_EPMR, msrval, 0); /* disable it again */
99
100 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
101
102 return;
103}
104
105
106/**
107 * powernow_k6_verify - verifies a new CPUfreq policy
108 * @policy: new policy
109 *
110 * Policy must be within lowest and highest possible CPU Frequency,
111 * and at least one possible state must be within min and max.
112 */
113static int powernow_k6_verify(struct cpufreq_policy *policy)
114{
115 return cpufreq_frequency_table_verify(policy, &clock_ratio[0]);
116}
117
118
119/**
120 * powernow_k6_target - set a new CPUFreq policy
121 * @policy: new policy
122 * @target_freq: the target frequency
123 * @relation: how that frequency relates to achieved frequency
124 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
125 *
126 * sets a new CPUFreq policy
127 */
128static int powernow_k6_target(struct cpufreq_policy *policy,
129 unsigned int target_freq,
130 unsigned int relation)
131{
132 unsigned int newstate = 0;
133
134 if (cpufreq_frequency_table_target(policy, &clock_ratio[0],
135 target_freq, relation, &newstate))
136 return -EINVAL;
137
138 powernow_k6_set_state(newstate);
139
140 return 0;
141}
142
143
144static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
145{
146 unsigned int i, f;
147 int result;
148
149 if (policy->cpu != 0)
150 return -ENODEV;
151
152 /* get frequencies */
153 max_multiplier = powernow_k6_get_cpu_multiplier();
154 busfreq = cpu_khz / max_multiplier;
155
156 /* table init */
157 for (i = 0; (clock_ratio[i].frequency != CPUFREQ_TABLE_END); i++) {
158 f = clock_ratio[i].index;
159 if (f > max_multiplier)
160 clock_ratio[i].frequency = CPUFREQ_ENTRY_INVALID;
161 else
162 clock_ratio[i].frequency = busfreq * f;
163 }
164
165 /* cpuinfo and default policy values */
166 policy->cpuinfo.transition_latency = 200000;
167 policy->cur = busfreq * max_multiplier;
168
169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
170 if (result)
171 return result;
172
173 cpufreq_frequency_table_get_attr(clock_ratio, policy->cpu);
174
175 return 0;
176}
177
178
179static int powernow_k6_cpu_exit(struct cpufreq_policy *policy)
180{
181 unsigned int i;
182 for (i = 0; i < 8; i++) {
183		if (clock_ratio[i].index == max_multiplier)
184 powernow_k6_set_state(i);
185 }
186 cpufreq_frequency_table_put_attr(policy->cpu);
187 return 0;
188}
189
190static unsigned int powernow_k6_get(unsigned int cpu)
191{
192 unsigned int ret;
193 ret = (busfreq * powernow_k6_get_cpu_multiplier());
194 return ret;
195}
196
197static struct freq_attr *powernow_k6_attr[] = {
198 &cpufreq_freq_attr_scaling_available_freqs,
199 NULL,
200};
201
202static struct cpufreq_driver powernow_k6_driver = {
203 .verify = powernow_k6_verify,
204 .target = powernow_k6_target,
205 .init = powernow_k6_cpu_init,
206 .exit = powernow_k6_cpu_exit,
207 .get = powernow_k6_get,
208 .name = "powernow-k6",
209 .owner = THIS_MODULE,
210 .attr = powernow_k6_attr,
211};
212
213
214/**
215 * powernow_k6_init - initializes the k6 PowerNow! CPUFreq driver
216 *
217 * Initializes the K6 PowerNow! support. Returns -ENODEV on unsupported
218 * devices, -EINVAL or -ENOMEM on problems during initialization, and zero
219 * on success.
220 */
221static int __init powernow_k6_init(void)
222{
223 struct cpuinfo_x86 *c = &cpu_data(0);
224
225 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 5) ||
226 ((c->x86_model != 12) && (c->x86_model != 13)))
227 return -ENODEV;
228
229 if (!request_region(POWERNOW_IOPORT, 16, "PowerNow!")) {
230 printk(KERN_INFO PFX "PowerNow IOPORT region already used.\n");
231 return -EIO;
232 }
233
234 if (cpufreq_register_driver(&powernow_k6_driver)) {
235 release_region(POWERNOW_IOPORT, 16);
236 return -EINVAL;
237 }
238
239 return 0;
240}
241
242
243/**
244 * powernow_k6_exit - unregisters AMD K6-2+/3+ PowerNow! support
245 *
246 * Unregisters AMD K6-2+ / K6-3+ PowerNow! support.
247 */
248static void __exit powernow_k6_exit(void)
249{
250 cpufreq_unregister_driver(&powernow_k6_driver);
251 release_region(POWERNOW_IOPORT, 16);
252}
253
254
255MODULE_AUTHOR("Arjan van de Ven, Dave Jones <davej@redhat.com>, "
256 "Dominik Brodowski <linux@brodo.de>");
257MODULE_DESCRIPTION("PowerNow! driver for AMD K6-2+ / K6-3+ processors.");
258MODULE_LICENSE("GPL");
259
260module_init(powernow_k6_init);
261module_exit(powernow_k6_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
deleted file mode 100644
index 4a45fd6e41ba..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ /dev/null
@@ -1,752 +0,0 @@
1/*
2 * AMD K7 Powernow driver.
3 * (C) 2003 Dave Jones on behalf of SuSE Labs.
4 * (C) 2003-2004 Dave Jones <davej@redhat.com>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 * Based upon datasheets & sample CPUs kindly provided by AMD.
8 *
9 * Errata 5:
10 * CPU may fail to execute a FID/VID change in presence of interrupt.
11 * - We cli/sti on stepping A0 CPUs around the FID/VID transition.
12 * Errata 15:
13 * CPU with half frequency multipliers may hang upon wakeup from disconnect.
14 * - We disable half multipliers if ACPI is used on A0 stepping CPUs.
15 */
16
17#include <linux/kernel.h>
18#include <linux/module.h>
19#include <linux/moduleparam.h>
20#include <linux/init.h>
21#include <linux/cpufreq.h>
22#include <linux/slab.h>
23#include <linux/string.h>
24#include <linux/dmi.h>
25#include <linux/timex.h>
26#include <linux/io.h>
27
28#include <asm/timer.h> /* Needed for recalibrate_cpu_khz() */
29#include <asm/msr.h>
30#include <asm/system.h>
31
32#ifdef CONFIG_X86_POWERNOW_K7_ACPI
33#include <linux/acpi.h>
34#include <acpi/processor.h>
35#endif
36
37#include "powernow-k7.h"
38
39#define PFX "powernow: "
40
41
42struct psb_s {
43 u8 signature[10];
44 u8 tableversion;
45 u8 flags;
46 u16 settlingtime;
47 u8 reserved1;
48 u8 numpst;
49};
50
51struct pst_s {
52 u32 cpuid;
53 u8 fsbspeed;
54 u8 maxfid;
55 u8 startvid;
56 u8 numpstates;
57};
58
59#ifdef CONFIG_X86_POWERNOW_K7_ACPI
60union powernow_acpi_control_t {
61 struct {
62 unsigned long fid:5,
63 vid:5,
64 sgtc:20,
65 res1:2;
66 } bits;
67 unsigned long val;
68};
69#endif
70
71#ifdef CONFIG_CPU_FREQ_DEBUG
72/* divide by 1000 to get VCore voltage in V. */
73static const int mobile_vid_table[32] = {
74 2000, 1950, 1900, 1850, 1800, 1750, 1700, 1650,
75 1600, 1550, 1500, 1450, 1400, 1350, 1300, 0,
76 1275, 1250, 1225, 1200, 1175, 1150, 1125, 1100,
77 1075, 1050, 1025, 1000, 975, 950, 925, 0,
78};
79#endif
80
81/* divide by 10 to get FID. */
82static const int fid_codes[32] = {
83 110, 115, 120, 125, 50, 55, 60, 65,
84 70, 75, 80, 85, 90, 95, 100, 105,
85 30, 190, 40, 200, 130, 135, 140, 210,
86 150, 225, 160, 165, 170, 180, -1, -1,
87};
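/*
 * Illustrative example (FSB value assumed): with a 100 MHz front-side bus
 * (fsb = 100000 kHz) and fid 4, fid_codes[4] = 50 encodes a 5.0x multiplier,
 * so get_ranges() below computes 100000 * 50 / 10 = 500000 kHz (500 MHz)
 * for that table entry.
 */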
88
89/* This parameter is used in order to force ACPI instead of legacy method for
90 * configuration purpose.
91 */
92
93static int acpi_force;
94
95static struct cpufreq_frequency_table *powernow_table;
96
97static unsigned int can_scale_bus;
98static unsigned int can_scale_vid;
99static unsigned int minimum_speed = -1;
100static unsigned int maximum_speed;
101static unsigned int number_scales;
102static unsigned int fsb;
103static unsigned int latency;
104static char have_a0;
105
106#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
107 "powernow-k7", msg)
108
109static int check_fsb(unsigned int fsbspeed)
110{
111 int delta;
112 unsigned int f = fsb / 1000;
113
114 delta = (fsbspeed > f) ? fsbspeed - f : f - fsbspeed;
115 return delta < 5;
116}
117
118static int check_powernow(void)
119{
120 struct cpuinfo_x86 *c = &cpu_data(0);
121 unsigned int maxei, eax, ebx, ecx, edx;
122
123 if ((c->x86_vendor != X86_VENDOR_AMD) || (c->x86 != 6)) {
124#ifdef MODULE
125 printk(KERN_INFO PFX "This module only works with "
126 "AMD K7 CPUs\n");
127#endif
128 return 0;
129 }
130
131 /* Get maximum capabilities */
132 maxei = cpuid_eax(0x80000000);
133 if (maxei < 0x80000007) { /* Any powernow info ? */
134#ifdef MODULE
135 printk(KERN_INFO PFX "No powernow capabilities detected\n");
136#endif
137 return 0;
138 }
139
140 if ((c->x86_model == 6) && (c->x86_mask == 0)) {
141 printk(KERN_INFO PFX "K7 660[A0] core detected, "
142 "enabling errata workarounds\n");
143 have_a0 = 1;
144 }
145
146 cpuid(0x80000007, &eax, &ebx, &ecx, &edx);
147
148 /* Check we can actually do something before we say anything.*/
149 if (!(edx & (1 << 1 | 1 << 2)))
150 return 0;
151
152 printk(KERN_INFO PFX "PowerNOW! Technology present. Can scale: ");
153
154 if (edx & 1 << 1) {
155 printk("frequency");
156 can_scale_bus = 1;
157 }
158
159 if ((edx & (1 << 1 | 1 << 2)) == 0x6)
160 printk(" and ");
161
162 if (edx & 1 << 2) {
163 printk("voltage");
164 can_scale_vid = 1;
165 }
166
167 printk(".\n");
168 return 1;
169}
170
171#ifdef CONFIG_X86_POWERNOW_K7_ACPI
172static void invalidate_entry(unsigned int entry)
173{
174 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
175}
176#endif
177
178static int get_ranges(unsigned char *pst)
179{
180 unsigned int j;
181 unsigned int speed;
182 u8 fid, vid;
183
184 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
185 (number_scales + 1)), GFP_KERNEL);
186 if (!powernow_table)
187 return -ENOMEM;
188
189 for (j = 0 ; j < number_scales; j++) {
190 fid = *pst++;
191
192 powernow_table[j].frequency = (fsb * fid_codes[fid]) / 10;
193 powernow_table[j].index = fid; /* lower 8 bits */
194
195 speed = powernow_table[j].frequency;
196
197 if ((fid_codes[fid] % 10) == 5) {
198#ifdef CONFIG_X86_POWERNOW_K7_ACPI
199 if (have_a0 == 1)
200 invalidate_entry(j);
201#endif
202 }
203
204 if (speed < minimum_speed)
205 minimum_speed = speed;
206 if (speed > maximum_speed)
207 maximum_speed = speed;
208
209 vid = *pst++;
210 powernow_table[j].index |= (vid << 8); /* upper 8 bits */
211
212 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
213 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
214 fid_codes[fid] % 10, speed/1000, vid,
215 mobile_vid_table[vid]/1000,
216 mobile_vid_table[vid]%1000);
217 }
218 powernow_table[number_scales].frequency = CPUFREQ_TABLE_END;
219 powernow_table[number_scales].index = 0;
220
221 return 0;
222}
223
224
225static void change_FID(int fid)
226{
227 union msr_fidvidctl fidvidctl;
228
229 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
230 if (fidvidctl.bits.FID != fid) {
231 fidvidctl.bits.SGTC = latency;
232 fidvidctl.bits.FID = fid;
233 fidvidctl.bits.VIDC = 0;
234 fidvidctl.bits.FIDC = 1;
235 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
236 }
237}
238
239
240static void change_VID(int vid)
241{
242 union msr_fidvidctl fidvidctl;
243
244 rdmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
245 if (fidvidctl.bits.VID != vid) {
246 fidvidctl.bits.SGTC = latency;
247 fidvidctl.bits.VID = vid;
248 fidvidctl.bits.FIDC = 0;
249 fidvidctl.bits.VIDC = 1;
250 wrmsrl(MSR_K7_FID_VID_CTL, fidvidctl.val);
251 }
252}
253
254
255static void change_speed(unsigned int index)
256{
257 u8 fid, vid;
258 struct cpufreq_freqs freqs;
259 union msr_fidvidstatus fidvidstatus;
260 int cfid;
261
262 /* fid are the lower 8 bits of the index we stored into
263 * the cpufreq frequency table in powernow_decode_bios,
264 * vid are the upper 8 bits.
265 */
266
267 fid = powernow_table[index].index & 0xFF;
268 vid = (powernow_table[index].index & 0xFF00) >> 8;
269
270 freqs.cpu = 0;
271
272 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
273 cfid = fidvidstatus.bits.CFID;
274 freqs.old = fsb * fid_codes[cfid] / 10;
275
276 freqs.new = powernow_table[index].frequency;
277
278 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
279
280 /* Now do the magic poking into the MSRs. */
281
282 if (have_a0 == 1) /* A0 errata 5 */
283 local_irq_disable();
284
285 if (freqs.old > freqs.new) {
286 /* Going down, so change FID first */
287 change_FID(fid);
288 change_VID(vid);
289 } else {
290 /* Going up, so change VID first */
291 change_VID(vid);
292 change_FID(fid);
293 }
294
295
296 if (have_a0 == 1)
297 local_irq_enable();
298
299 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
300}
301
302
303#ifdef CONFIG_X86_POWERNOW_K7_ACPI
304
305static struct acpi_processor_performance *acpi_processor_perf;
306
307static int powernow_acpi_init(void)
308{
309 int i;
310 int retval = 0;
311 union powernow_acpi_control_t pc;
312
313 if (acpi_processor_perf != NULL && powernow_table != NULL) {
314 retval = -EINVAL;
315 goto err0;
316 }
317
318 acpi_processor_perf = kzalloc(sizeof(struct acpi_processor_performance),
319 GFP_KERNEL);
320 if (!acpi_processor_perf) {
321 retval = -ENOMEM;
322 goto err0;
323 }
324
325 if (!zalloc_cpumask_var(&acpi_processor_perf->shared_cpu_map,
326 GFP_KERNEL)) {
327 retval = -ENOMEM;
328 goto err05;
329 }
330
331 if (acpi_processor_register_performance(acpi_processor_perf, 0)) {
332 retval = -EIO;
333 goto err1;
334 }
335
336 if (acpi_processor_perf->control_register.space_id !=
337 ACPI_ADR_SPACE_FIXED_HARDWARE) {
338 retval = -ENODEV;
339 goto err2;
340 }
341
342 if (acpi_processor_perf->status_register.space_id !=
343 ACPI_ADR_SPACE_FIXED_HARDWARE) {
344 retval = -ENODEV;
345 goto err2;
346 }
347
348 number_scales = acpi_processor_perf->state_count;
349
350 if (number_scales < 2) {
351 retval = -ENODEV;
352 goto err2;
353 }
354
355 powernow_table = kzalloc((sizeof(struct cpufreq_frequency_table) *
356 (number_scales + 1)), GFP_KERNEL);
357 if (!powernow_table) {
358 retval = -ENOMEM;
359 goto err2;
360 }
361
362 pc.val = (unsigned long) acpi_processor_perf->states[0].control;
363 for (i = 0; i < number_scales; i++) {
364 u8 fid, vid;
365 struct acpi_processor_px *state =
366 &acpi_processor_perf->states[i];
367 unsigned int speed, speed_mhz;
368
369 pc.val = (unsigned long) state->control;
370 dprintk("acpi: P%d: %d MHz %d mW %d uS control %08x SGTC %d\n",
371 i,
372 (u32) state->core_frequency,
373 (u32) state->power,
374 (u32) state->transition_latency,
375 (u32) state->control,
376 pc.bits.sgtc);
377
378 vid = pc.bits.vid;
379 fid = pc.bits.fid;
380
381 powernow_table[i].frequency = fsb * fid_codes[fid] / 10;
382 powernow_table[i].index = fid; /* lower 8 bits */
383 powernow_table[i].index |= (vid << 8); /* upper 8 bits */
384
385 speed = powernow_table[i].frequency;
386 speed_mhz = speed / 1000;
387
388 /* processor_perflib will multiply the MHz value by 1000 to
389 * get a KHz value (e.g. 1266000). However, powernow-k7 works
390 * with true KHz values (e.g. 1266768). To ensure that all
391 * powernow frequencies are available, we must ensure that
392 * ACPI doesn't restrict them, so we round up the MHz value
393 * to ensure that perflib's computed KHz value is greater than
394 * or equal to powernow's KHz value.
395 */
396 if (speed % 1000 > 0)
397 speed_mhz++;
398
399 if ((fid_codes[fid] % 10) == 5) {
400 if (have_a0 == 1)
401 invalidate_entry(i);
402 }
403
404 dprintk(" FID: 0x%x (%d.%dx [%dMHz]) "
405 "VID: 0x%x (%d.%03dV)\n", fid, fid_codes[fid] / 10,
406 fid_codes[fid] % 10, speed_mhz, vid,
407 mobile_vid_table[vid]/1000,
408 mobile_vid_table[vid]%1000);
409
410 if (state->core_frequency != speed_mhz) {
411 state->core_frequency = speed_mhz;
412 dprintk(" Corrected ACPI frequency to %d\n",
413 speed_mhz);
414 }
415
416 if (latency < pc.bits.sgtc)
417 latency = pc.bits.sgtc;
418
419 if (speed < minimum_speed)
420 minimum_speed = speed;
421 if (speed > maximum_speed)
422 maximum_speed = speed;
423 }
424
425 powernow_table[i].frequency = CPUFREQ_TABLE_END;
426 powernow_table[i].index = 0;
427
428 /* notify BIOS that we exist */
429 acpi_processor_notify_smm(THIS_MODULE);
430
431 return 0;
432
433err2:
434 acpi_processor_unregister_performance(acpi_processor_perf, 0);
435err1:
436 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
437err05:
438 kfree(acpi_processor_perf);
439err0:
440 printk(KERN_WARNING PFX "ACPI perflib can not be used on "
441 "this platform\n");
442 acpi_processor_perf = NULL;
443 return retval;
444}
445#else
446static int powernow_acpi_init(void)
447{
448	printk(KERN_INFO PFX "no support for ACPI processor found."
449		" Please recompile your kernel with ACPI processor support\n");
450 return -EINVAL;
451}
452#endif
453
454static void print_pst_entry(struct pst_s *pst, unsigned int j)
455{
456 dprintk("PST:%d (@%p)\n", j, pst);
457 dprintk(" cpuid: 0x%x fsb: %d maxFID: 0x%x startvid: 0x%x\n",
458 pst->cpuid, pst->fsbspeed, pst->maxfid, pst->startvid);
459}
460
461static int powernow_decode_bios(int maxfid, int startvid)
462{
463 struct psb_s *psb;
464 struct pst_s *pst;
465 unsigned int i, j;
466 unsigned char *p;
467 unsigned int etuple;
468 unsigned int ret;
469
470 etuple = cpuid_eax(0x80000001);
471
472 for (i = 0xC0000; i < 0xffff0 ; i += 16) {
473
474 p = phys_to_virt(i);
475
476 if (memcmp(p, "AMDK7PNOW!", 10) == 0) {
477 dprintk("Found PSB header at %p\n", p);
478 psb = (struct psb_s *) p;
479 dprintk("Table version: 0x%x\n", psb->tableversion);
480 if (psb->tableversion != 0x12) {
481 printk(KERN_INFO PFX "Sorry, only v1.2 tables"
482 " supported right now\n");
483 return -ENODEV;
484 }
485
486 dprintk("Flags: 0x%x\n", psb->flags);
487 if ((psb->flags & 1) == 0)
488 dprintk("Mobile voltage regulator\n");
489 else
490 dprintk("Desktop voltage regulator\n");
491
492 latency = psb->settlingtime;
493 if (latency < 100) {
494 printk(KERN_INFO PFX "BIOS set settling time "
495 "to %d microseconds. "
496 "Should be at least 100. "
497 "Correcting.\n", latency);
498 latency = 100;
499 }
500 dprintk("Settling Time: %d microseconds.\n",
501 psb->settlingtime);
502 dprintk("Has %d PST tables. (Only dumping ones "
503 "relevant to this CPU).\n",
504 psb->numpst);
505
506 p += sizeof(struct psb_s);
507
508 pst = (struct pst_s *) p;
509
510 for (j = 0; j < psb->numpst; j++) {
511 pst = (struct pst_s *) p;
512 number_scales = pst->numpstates;
513
514 if ((etuple == pst->cpuid) &&
515 check_fsb(pst->fsbspeed) &&
516 (maxfid == pst->maxfid) &&
517 (startvid == pst->startvid)) {
518 print_pst_entry(pst, j);
519 p = (char *)pst + sizeof(struct pst_s);
520 ret = get_ranges(p);
521 return ret;
522 } else {
523 unsigned int k;
524 p = (char *)pst + sizeof(struct pst_s);
525 for (k = 0; k < number_scales; k++)
526 p += 2;
527 }
528 }
529 printk(KERN_INFO PFX "No PST tables match this cpuid "
530 "(0x%x)\n", etuple);
531 printk(KERN_INFO PFX "This is indicative of a broken "
532 "BIOS.\n");
533
534 return -EINVAL;
535 }
536 p++;
537 }
538
539 return -ENODEV;
540}
541
542
543static int powernow_target(struct cpufreq_policy *policy,
544 unsigned int target_freq,
545 unsigned int relation)
546{
547 unsigned int newstate;
548
549 if (cpufreq_frequency_table_target(policy, powernow_table, target_freq,
550 relation, &newstate))
551 return -EINVAL;
552
553 change_speed(newstate);
554
555 return 0;
556}
557
558
559static int powernow_verify(struct cpufreq_policy *policy)
560{
561 return cpufreq_frequency_table_verify(policy, powernow_table);
562}
563
564/*
565 * We use the fact that the bus frequency is somehow
566 * a multiple of 100000/3 kHz, then we compute sgtc according
567 * to this multiple.
568 * That way, we match more closely how AMD intends this to work, and
569 * we get the same kind of behaviour already tested under
570 * the "well-known" other OS.
571 */
572static int __cpuinit fixup_sgtc(void)
573{
574 unsigned int sgtc;
575 unsigned int m;
576
577 m = fsb / 3333;
578 if ((m % 10) >= 5)
579 m += 5;
580
581 m /= 10;
582
583 sgtc = 100 * m * latency;
584 sgtc = sgtc / 3;
585 if (sgtc > 0xfffff) {
586 printk(KERN_WARNING PFX "SGTC too large %d\n", sgtc);
587 sgtc = 0xfffff;
588 }
589 return sgtc;
590}
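/*
 * Worked example (values assumed): for a 100 MHz FSB (fsb = 100000) and a
 * BIOS settling time of 100 us, m = 100000 / 3333 = 30, which becomes 3
 * after the round-and-divide step, so sgtc = 100 * 3 * 100 / 3 = 10000.
 */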
591
592static unsigned int powernow_get(unsigned int cpu)
593{
594 union msr_fidvidstatus fidvidstatus;
595 unsigned int cfid;
596
597 if (cpu)
598 return 0;
599 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
600 cfid = fidvidstatus.bits.CFID;
601
602 return fsb * fid_codes[cfid] / 10;
603}
604
605
606static int __cpuinit acer_cpufreq_pst(const struct dmi_system_id *d)
607{
608 printk(KERN_WARNING PFX
609 "%s laptop with broken PST tables in BIOS detected.\n",
610 d->ident);
611 printk(KERN_WARNING PFX
612 "You need to downgrade to 3A21 (09/09/2002), or try a newer "
613 "BIOS than 3A71 (01/20/2003)\n");
614 printk(KERN_WARNING PFX
615 "cpufreq scaling has been disabled as a result of this.\n");
616 return 0;
617}
618
619/*
620 * Some Athlon laptops have really broken PST tables.
621 * A BIOS update is all that can save them.
622 * Mention this, and disable cpufreq.
623 */
624static struct dmi_system_id __cpuinitdata powernow_dmi_table[] = {
625 {
626 .callback = acer_cpufreq_pst,
627 .ident = "Acer Aspire",
628 .matches = {
629 DMI_MATCH(DMI_SYS_VENDOR, "Insyde Software"),
630 DMI_MATCH(DMI_BIOS_VERSION, "3A71"),
631 },
632 },
633 { }
634};
635
636static int __cpuinit powernow_cpu_init(struct cpufreq_policy *policy)
637{
638 union msr_fidvidstatus fidvidstatus;
639 int result;
640
641 if (policy->cpu != 0)
642 return -ENODEV;
643
644 rdmsrl(MSR_K7_FID_VID_STATUS, fidvidstatus.val);
645
646 recalibrate_cpu_khz();
647
648 fsb = (10 * cpu_khz) / fid_codes[fidvidstatus.bits.CFID];
649 if (!fsb) {
650 printk(KERN_WARNING PFX "can not determine bus frequency\n");
651 return -EINVAL;
652 }
653 dprintk("FSB: %3dMHz\n", fsb/1000);
654
655 if (dmi_check_system(powernow_dmi_table) || acpi_force) {
656 printk(KERN_INFO PFX "PSB/PST known to be broken. "
657 "Trying ACPI instead\n");
658 result = powernow_acpi_init();
659 } else {
660 result = powernow_decode_bios(fidvidstatus.bits.MFID,
661 fidvidstatus.bits.SVID);
662 if (result) {
663 printk(KERN_INFO PFX "Trying ACPI perflib\n");
664 maximum_speed = 0;
665 minimum_speed = -1;
666 latency = 0;
667 result = powernow_acpi_init();
668 if (result) {
669 printk(KERN_INFO PFX
670 "ACPI and legacy methods failed\n");
671 }
672 } else {
673 /* SGTC use the bus clock as timer */
674 latency = fixup_sgtc();
675 printk(KERN_INFO PFX "SGTC: %d\n", latency);
676 }
677 }
678
679 if (result)
680 return result;
681
682 printk(KERN_INFO PFX "Minimum speed %d MHz. Maximum speed %d MHz.\n",
683 minimum_speed/1000, maximum_speed/1000);
684
685 policy->cpuinfo.transition_latency =
686 cpufreq_scale(2000000UL, fsb, latency);
687
688 policy->cur = powernow_get(0);
689
690 cpufreq_frequency_table_get_attr(powernow_table, policy->cpu);
691
692 return cpufreq_frequency_table_cpuinfo(policy, powernow_table);
693}
694
695static int powernow_cpu_exit(struct cpufreq_policy *policy)
696{
697 cpufreq_frequency_table_put_attr(policy->cpu);
698
699#ifdef CONFIG_X86_POWERNOW_K7_ACPI
700 if (acpi_processor_perf) {
701 acpi_processor_unregister_performance(acpi_processor_perf, 0);
702 free_cpumask_var(acpi_processor_perf->shared_cpu_map);
703 kfree(acpi_processor_perf);
704 }
705#endif
706
707 kfree(powernow_table);
708 return 0;
709}
710
711static struct freq_attr *powernow_table_attr[] = {
712 &cpufreq_freq_attr_scaling_available_freqs,
713 NULL,
714};
715
716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify,
718 .target = powernow_target,
719 .get = powernow_get,
720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .bios_limit = acpi_processor_get_bios_limit,
722#endif
723 .init = powernow_cpu_init,
724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
728};
729
730static int __init powernow_init(void)
731{
732 if (check_powernow() == 0)
733 return -ENODEV;
734 return cpufreq_register_driver(&powernow_driver);
735}
736
737
738static void __exit powernow_exit(void)
739{
740 cpufreq_unregister_driver(&powernow_driver);
741}
742
743module_param(acpi_force, int, 0444);
744MODULE_PARM_DESC(acpi_force, "Force ACPI to be used.");
745
746MODULE_AUTHOR("Dave Jones <davej@redhat.com>");
747MODULE_DESCRIPTION("Powernow driver for AMD K7 processors.");
748MODULE_LICENSE("GPL");
749
750late_initcall(powernow_init);
751module_exit(powernow_exit);
752
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h b/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
deleted file mode 100644
index 35fb4eaf6e1c..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.h
+++ /dev/null
@@ -1,43 +0,0 @@
1/*
2 * (C) 2003 Dave Jones.
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * AMD-specific information
7 *
8 */
9
10union msr_fidvidctl {
11 struct {
12 unsigned FID:5, // 4:0
13 reserved1:3, // 7:5
14 VID:5, // 12:8
15 reserved2:3, // 15:13
16 FIDC:1, // 16
17 VIDC:1, // 17
18 reserved3:2, // 19:18
19 FIDCHGRATIO:1, // 20
20			reserved4:11,	// 31:21
21			SGTC:20,	// 51:32
22 reserved5:12; // 63:52
23 } bits;
24 unsigned long long val;
25};
26
27union msr_fidvidstatus {
28 struct {
29 unsigned CFID:5, // 4:0
30 reserved1:3, // 7:5
31 SFID:5, // 12:8
32 reserved2:3, // 15:13
33 MFID:5, // 20:16
34 reserved3:11, // 31:21
35 CVID:5, // 36:32
36 reserved4:3, // 39:37
37 SVID:5, // 44:40
38 reserved5:3, // 47:45
39 MVID:5, // 52:48
40 reserved6:11; // 63:53
41 } bits;
42 unsigned long long val;
43};
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
deleted file mode 100644
index 491977baf6c0..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ /dev/null
@@ -1,1601 +0,0 @@
1/*
2 * (c) 2003-2010 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 *
7 * Support : mark.langsdorf@amd.com
8 *
9 * Based on the powernow-k7.c module written by Dave Jones.
10 * (C) 2003 Dave Jones on behalf of SuSE Labs
11 * (C) 2004 Dominik Brodowski <linux@brodo.de>
12 * (C) 2004 Pavel Machek <pavel@ucw.cz>
13 * Licensed under the terms of the GNU GPL License version 2.
14 * Based upon datasheets & sample CPUs kindly provided by AMD.
15 *
16 * Valuable input gratefully received from Dave Jones, Pavel Machek,
17 * Dominik Brodowski, Jacob Shin, and others.
18 * Originally developed by Paul Devriendt.
19 * Processor information obtained from Chapter 9 (Power and Thermal Management)
20 * of the "BIOS and Kernel Developer's Guide for the AMD Athlon 64 and AMD
21 * Opteron Processors" available for download from www.amd.com
22 *
23 * Tables for specific CPUs can be inferred from
24 * http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/30430.pdf
25 */
26
27#include <linux/kernel.h>
28#include <linux/smp.h>
29#include <linux/module.h>
30#include <linux/init.h>
31#include <linux/cpufreq.h>
32#include <linux/slab.h>
33#include <linux/string.h>
34#include <linux/cpumask.h>
35#include <linux/sched.h> /* for current / set_cpus_allowed() */
36#include <linux/io.h>
37#include <linux/delay.h>
38
39#include <asm/msr.h>
40
41#include <linux/acpi.h>
42#include <linux/mutex.h>
43#include <acpi/processor.h>
44
45#define PFX "powernow-k8: "
46#define VERSION "version 2.20.00"
47#include "powernow-k8.h"
48#include "mperf.h"
49
50/* serialize freq changes */
51static DEFINE_MUTEX(fidvid_mutex);
52
53static DEFINE_PER_CPU(struct powernow_k8_data *, powernow_data);
54
55static int cpu_family = CPU_OPTERON;
56
57/* core performance boost */
58static bool cpb_capable, cpb_enabled;
59static struct msr __percpu *msrs;
60
61static struct cpufreq_driver cpufreq_amd64_driver;
62
63#ifndef CONFIG_SMP
64static inline const struct cpumask *cpu_core_mask(int cpu)
65{
66 return cpumask_of(0);
67}
68#endif
69
70/* Return a frequency in MHz, given an input fid */
71static u32 find_freq_from_fid(u32 fid)
72{
73 return 800 + (fid * 100);
74}
75
76/* Return a frequency in KHz, given an input fid */
77static u32 find_khz_freq_from_fid(u32 fid)
78{
79 return 1000 * find_freq_from_fid(fid);
80}
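/*
 * Example: fid 2 corresponds to 800 + 2 * 100 = 1000 MHz, so
 * find_khz_freq_from_fid(2) returns 1000000.
 */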
81
82static u32 find_khz_freq_from_pstate(struct cpufreq_frequency_table *data,
83 u32 pstate)
84{
85 return data[pstate].frequency;
86}
87
88/* Return the vco fid for an input fid
89 *
90 * Each "low" fid has a corresponding "high" fid, and you can get to "low"
91 * fids only from the corresponding high fids. This returns the "high" fid
92 * corresponding to a "low" one.
93 */
94static u32 convert_fid_to_vco_fid(u32 fid)
95{
96 if (fid < HI_FID_TABLE_BOTTOM)
97 return 8 + (2 * fid);
98 else
99 return fid;
100}
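/*
 * Example (assuming 2 is below HI_FID_TABLE_BOTTOM, which is defined in
 * powernow-k8.h): a "low" fid of 2 maps to VCO fid 8 + 2 * 2 = 12, while
 * fids at or above the threshold are returned unchanged.
 */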
101
102/*
103 * Return 1 if the pending bit is set. Unless we just instructed the processor
104 * to transition to a new state, seeing this bit set is really bad news.
105 */
106static int pending_bit_stuck(void)
107{
108 u32 lo, hi;
109
110 if (cpu_family == CPU_HW_PSTATE)
111 return 0;
112
113 rdmsr(MSR_FIDVID_STATUS, lo, hi);
114 return lo & MSR_S_LO_CHANGE_PENDING ? 1 : 0;
115}
116
117/*
118 * Update the global current fid / vid values from the status msr.
119 * Returns 1 on error.
120 */
121static int query_current_values_with_pending_wait(struct powernow_k8_data *data)
122{
123 u32 lo, hi;
124 u32 i = 0;
125
126 if (cpu_family == CPU_HW_PSTATE) {
127 rdmsr(MSR_PSTATE_STATUS, lo, hi);
128 i = lo & HW_PSTATE_MASK;
129 data->currpstate = i;
130
131 /*
132 * a workaround for family 11h erratum 311 might cause
133		 * an "out-of-range" Pstate if the core is in Pstate-0
134 */
135 if ((boot_cpu_data.x86 == 0x11) && (i >= data->numps))
136 data->currpstate = HW_PSTATE_0;
137
138 return 0;
139 }
140 do {
141 if (i++ > 10000) {
142 dprintk("detected change pending stuck\n");
143 return 1;
144 }
145 rdmsr(MSR_FIDVID_STATUS, lo, hi);
146 } while (lo & MSR_S_LO_CHANGE_PENDING);
147
148 data->currvid = hi & MSR_S_HI_CURRENT_VID;
149 data->currfid = lo & MSR_S_LO_CURRENT_FID;
150
151 return 0;
152}
153
154/* the isochronous relief time */
155static void count_off_irt(struct powernow_k8_data *data)
156{
157 udelay((1 << data->irt) * 10);
158 return;
159}
160
161/* the voltage stabilization time */
162static void count_off_vst(struct powernow_k8_data *data)
163{
164 udelay(data->vstable * VST_UNITS_20US);
165 return;
166}
167
168/* need to init the control msr to a safe value (for each cpu) */
169static void fidvid_msr_init(void)
170{
171 u32 lo, hi;
172 u8 fid, vid;
173
174 rdmsr(MSR_FIDVID_STATUS, lo, hi);
175 vid = hi & MSR_S_HI_CURRENT_VID;
176 fid = lo & MSR_S_LO_CURRENT_FID;
177 lo = fid | (vid << MSR_C_LO_VID_SHIFT);
178 hi = MSR_C_HI_STP_GNT_BENIGN;
179 dprintk("cpu%d, init lo 0x%x, hi 0x%x\n", smp_processor_id(), lo, hi);
180 wrmsr(MSR_FIDVID_CTL, lo, hi);
181}
182
183/* write the new fid value along with the other control fields to the msr */
184static int write_new_fid(struct powernow_k8_data *data, u32 fid)
185{
186 u32 lo;
187 u32 savevid = data->currvid;
188 u32 i = 0;
189
190 if ((fid & INVALID_FID_MASK) || (data->currvid & INVALID_VID_MASK)) {
191 printk(KERN_ERR PFX "internal error - overflow on fid write\n");
192 return 1;
193 }
194
195 lo = fid;
196 lo |= (data->currvid << MSR_C_LO_VID_SHIFT);
197 lo |= MSR_C_LO_INIT_FID_VID;
198
199 dprintk("writing fid 0x%x, lo 0x%x, hi 0x%x\n",
200 fid, lo, data->plllock * PLL_LOCK_CONVERSION);
201
202 do {
203 wrmsr(MSR_FIDVID_CTL, lo, data->plllock * PLL_LOCK_CONVERSION);
204 if (i++ > 100) {
205 printk(KERN_ERR PFX
206 "Hardware error - pending bit very stuck - "
207 "no further pstate changes possible\n");
208 return 1;
209 }
210 } while (query_current_values_with_pending_wait(data));
211
212 count_off_irt(data);
213
214 if (savevid != data->currvid) {
215 printk(KERN_ERR PFX
216 "vid change on fid trans, old 0x%x, new 0x%x\n",
217 savevid, data->currvid);
218 return 1;
219 }
220
221 if (fid != data->currfid) {
222 printk(KERN_ERR PFX
223 "fid trans failed, fid 0x%x, curr 0x%x\n", fid,
224 data->currfid);
225 return 1;
226 }
227
228 return 0;
229}
230
231/* Write a new vid to the hardware */
232static int write_new_vid(struct powernow_k8_data *data, u32 vid)
233{
234 u32 lo;
235 u32 savefid = data->currfid;
236 int i = 0;
237
238 if ((data->currfid & INVALID_FID_MASK) || (vid & INVALID_VID_MASK)) {
239 printk(KERN_ERR PFX "internal error - overflow on vid write\n");
240 return 1;
241 }
242
243 lo = data->currfid;
244 lo |= (vid << MSR_C_LO_VID_SHIFT);
245 lo |= MSR_C_LO_INIT_FID_VID;
246
247 dprintk("writing vid 0x%x, lo 0x%x, hi 0x%x\n",
248 vid, lo, STOP_GRANT_5NS);
249
250 do {
251 wrmsr(MSR_FIDVID_CTL, lo, STOP_GRANT_5NS);
252 if (i++ > 100) {
253 printk(KERN_ERR PFX "internal error - pending bit "
254 "very stuck - no further pstate "
255 "changes possible\n");
256 return 1;
257 }
258 } while (query_current_values_with_pending_wait(data));
259
260 if (savefid != data->currfid) {
261 printk(KERN_ERR PFX "fid changed on vid trans, old "
262 "0x%x new 0x%x\n",
263 savefid, data->currfid);
264 return 1;
265 }
266
267 if (vid != data->currvid) {
268 printk(KERN_ERR PFX "vid trans failed, vid 0x%x, "
269 "curr 0x%x\n",
270 vid, data->currvid);
271 return 1;
272 }
273
274 return 0;
275}
276
277/*
278 * Reduce the vid towards reqvid by at most "step" vid codes per call.
279 * Decreasing vid codes represent increasing voltages:
280 * vid of 0 is 1.550V, vid of 0x1e is 0.800V, vid of VID_OFF is off.
281 */
282static int decrease_vid_code_by_step(struct powernow_k8_data *data,
283 u32 reqvid, u32 step)
284{
285 if ((data->currvid - reqvid) > step)
286 reqvid = data->currvid - step;
287
288 if (write_new_vid(data, reqvid))
289 return 1;
290
291 count_off_vst(data);
292
293 return 0;
294}
295
296/* Change hardware pstate by single MSR write */
297static int transition_pstate(struct powernow_k8_data *data, u32 pstate)
298{
299 wrmsr(MSR_PSTATE_CTRL, pstate, 0);
300 data->currpstate = pstate;
301 return 0;
302}
303
304/* Change Opteron/Athlon64 fid and vid, by the 3 phases. */
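/*
 * Phase 1 (core_voltage_pre_transition) steps the voltage towards the
 * target plus the ramp voltage offset, phase 2 (core_frequency_transition)
 * walks the fid in VCO-sized steps, and phase 3
 * (core_voltage_post_transition) settles at the final vid.
 */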
305static int transition_fid_vid(struct powernow_k8_data *data,
306 u32 reqfid, u32 reqvid)
307{
308 if (core_voltage_pre_transition(data, reqvid, reqfid))
309 return 1;
310
311 if (core_frequency_transition(data, reqfid))
312 return 1;
313
314 if (core_voltage_post_transition(data, reqvid))
315 return 1;
316
317 if (query_current_values_with_pending_wait(data))
318 return 1;
319
320 if ((reqfid != data->currfid) || (reqvid != data->currvid)) {
321 printk(KERN_ERR PFX "failed (cpu%d): req 0x%x 0x%x, "
322 "curr 0x%x 0x%x\n",
323 smp_processor_id(),
324 reqfid, reqvid, data->currfid, data->currvid);
325 return 1;
326 }
327
328 dprintk("transitioned (cpu%d): new fid 0x%x, vid 0x%x\n",
329 smp_processor_id(), data->currfid, data->currvid);
330
331 return 0;
332}
333
334/* Phase 1 - core voltage transition ... setup voltage */
335static int core_voltage_pre_transition(struct powernow_k8_data *data,
336 u32 reqvid, u32 reqfid)
337{
338 u32 rvosteps = data->rvo;
339 u32 savefid = data->currfid;
340 u32 maxvid, lo, rvomult = 1;
341
342 dprintk("ph1 (cpu%d): start, currfid 0x%x, currvid 0x%x, "
343 "reqvid 0x%x, rvo 0x%x\n",
344 smp_processor_id(),
345 data->currfid, data->currvid, reqvid, data->rvo);
346
347 if ((savefid < LO_FID_TABLE_TOP) && (reqfid < LO_FID_TABLE_TOP))
348 rvomult = 2;
349 rvosteps *= rvomult;
350 rdmsr(MSR_FIDVID_STATUS, lo, maxvid);
351 maxvid = 0x1f & (maxvid >> 16);
352 dprintk("ph1 maxvid=0x%x\n", maxvid);
353 if (reqvid < maxvid) /* lower numbers are higher voltages */
354 reqvid = maxvid;
355
356 while (data->currvid > reqvid) {
357 dprintk("ph1: curr 0x%x, req vid 0x%x\n",
358 data->currvid, reqvid);
359 if (decrease_vid_code_by_step(data, reqvid, data->vidmvs))
360 return 1;
361 }
362
363 while ((rvosteps > 0) &&
364 ((rvomult * data->rvo + data->currvid) > reqvid)) {
365 if (data->currvid == maxvid) {
366 rvosteps = 0;
367 } else {
368 dprintk("ph1: changing vid for rvo, req 0x%x\n",
369 data->currvid - 1);
370 if (decrease_vid_code_by_step(data, data->currvid-1, 1))
371 return 1;
372 rvosteps--;
373 }
374 }
375
376 if (query_current_values_with_pending_wait(data))
377 return 1;
378
379 if (savefid != data->currfid) {
380 printk(KERN_ERR PFX "ph1 err, currfid changed 0x%x\n",
381 data->currfid);
382 return 1;
383 }
384
385 dprintk("ph1 complete, currfid 0x%x, currvid 0x%x\n",
386 data->currfid, data->currvid);
387
388 return 0;
389}
390
391/* Phase 2 - core frequency transition */
392static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid)
393{
394 u32 vcoreqfid, vcocurrfid, vcofiddiff;
395 u32 fid_interval, savevid = data->currvid;
396
397 if (data->currfid == reqfid) {
398 printk(KERN_ERR PFX "ph2 null fid transition 0x%x\n",
399 data->currfid);
400 return 0;
401 }
402
403 dprintk("ph2 (cpu%d): starting, currfid 0x%x, currvid 0x%x, "
404 "reqfid 0x%x\n",
405 smp_processor_id(),
406 data->currfid, data->currvid, reqfid);
407
408 vcoreqfid = convert_fid_to_vco_fid(reqfid);
409 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
410 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
411 : vcoreqfid - vcocurrfid;
412
413 if ((reqfid <= LO_FID_TABLE_TOP) && (data->currfid <= LO_FID_TABLE_TOP))
414 vcofiddiff = 0;
415
416 while (vcofiddiff > 2) {
417		fid_interval = (data->currfid & 1) ? 1 : 2;
418
419 if (reqfid > data->currfid) {
420 if (data->currfid > LO_FID_TABLE_TOP) {
421 if (write_new_fid(data,
422 data->currfid + fid_interval))
423 return 1;
424 } else {
425 if (write_new_fid
426 (data,
427 2 + convert_fid_to_vco_fid(data->currfid)))
428 return 1;
429 }
430 } else {
431 if (write_new_fid(data, data->currfid - fid_interval))
432 return 1;
433 }
434
435 vcocurrfid = convert_fid_to_vco_fid(data->currfid);
436 vcofiddiff = vcocurrfid > vcoreqfid ? vcocurrfid - vcoreqfid
437 : vcoreqfid - vcocurrfid;
438 }
439
440 if (write_new_fid(data, reqfid))
441 return 1;
442
443 if (query_current_values_with_pending_wait(data))
444 return 1;
445
446 if (data->currfid != reqfid) {
447 printk(KERN_ERR PFX
448 "ph2: mismatch, failed fid transition, "
449 "curr 0x%x, req 0x%x\n",
450 data->currfid, reqfid);
451 return 1;
452 }
453
454 if (savevid != data->currvid) {
455 printk(KERN_ERR PFX "ph2: vid changed, save 0x%x, curr 0x%x\n",
456 savevid, data->currvid);
457 return 1;
458 }
459
460 dprintk("ph2 complete, currfid 0x%x, currvid 0x%x\n",
461 data->currfid, data->currvid);
462
463 return 0;
464}
465
466/* Phase 3 - core voltage transition flow ... jump to the final vid. */
467static int core_voltage_post_transition(struct powernow_k8_data *data,
468 u32 reqvid)
469{
470 u32 savefid = data->currfid;
471 u32 savereqvid = reqvid;
472
473 dprintk("ph3 (cpu%d): starting, currfid 0x%x, currvid 0x%x\n",
474 smp_processor_id(),
475 data->currfid, data->currvid);
476
477 if (reqvid != data->currvid) {
478 if (write_new_vid(data, reqvid))
479 return 1;
480
481 if (savefid != data->currfid) {
482 printk(KERN_ERR PFX
483 "ph3: bad fid change, save 0x%x, curr 0x%x\n",
484 savefid, data->currfid);
485 return 1;
486 }
487
488 if (data->currvid != reqvid) {
489 printk(KERN_ERR PFX
490			       "ph3: failed vid transition, "
491			       "req 0x%x, curr 0x%x\n",
492 reqvid, data->currvid);
493 return 1;
494 }
495 }
496
497 if (query_current_values_with_pending_wait(data))
498 return 1;
499
500 if (savereqvid != data->currvid) {
501 dprintk("ph3 failed, currvid 0x%x\n", data->currvid);
502 return 1;
503 }
504
505 if (savefid != data->currfid) {
506 dprintk("ph3 failed, currfid changed 0x%x\n",
507 data->currfid);
508 return 1;
509 }
510
511 dprintk("ph3 complete, currfid 0x%x, currvid 0x%x\n",
512 data->currfid, data->currvid);
513
514 return 0;
515}
516
517static void check_supported_cpu(void *_rc)
518{
519 u32 eax, ebx, ecx, edx;
520 int *rc = _rc;
521
522 *rc = -ENODEV;
523
524 if (current_cpu_data.x86_vendor != X86_VENDOR_AMD)
525 return;
526
527 eax = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
528 if (((eax & CPUID_XFAM) != CPUID_XFAM_K8) &&
529 ((eax & CPUID_XFAM) < CPUID_XFAM_10H))
530 return;
531
532 if ((eax & CPUID_XFAM) == CPUID_XFAM_K8) {
533 if (((eax & CPUID_USE_XFAM_XMOD) != CPUID_USE_XFAM_XMOD) ||
534 ((eax & CPUID_XMOD) > CPUID_XMOD_REV_MASK)) {
535 printk(KERN_INFO PFX
536 "Processor cpuid %x not supported\n", eax);
537 return;
538 }
539
540 eax = cpuid_eax(CPUID_GET_MAX_CAPABILITIES);
541 if (eax < CPUID_FREQ_VOLT_CAPABILITIES) {
542 printk(KERN_INFO PFX
543 "No frequency change capabilities detected\n");
544 return;
545 }
546
547 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
548 if ((edx & P_STATE_TRANSITION_CAPABLE)
549 != P_STATE_TRANSITION_CAPABLE) {
550 printk(KERN_INFO PFX
551 "Power state transitions not supported\n");
552 return;
553 }
554 } else { /* must be a HW Pstate capable processor */
555 cpuid(CPUID_FREQ_VOLT_CAPABILITIES, &eax, &ebx, &ecx, &edx);
556 if ((edx & USE_HW_PSTATE) == USE_HW_PSTATE)
557 cpu_family = CPU_HW_PSTATE;
558 else
559 return;
560 }
561
562 *rc = 0;
563}
564
565static int check_pst_table(struct powernow_k8_data *data, struct pst_s *pst,
566 u8 maxvid)
567{
568 unsigned int j;
569 u8 lastfid = 0xff;
570
571 for (j = 0; j < data->numps; j++) {
572 if (pst[j].vid > LEAST_VID) {
573 printk(KERN_ERR FW_BUG PFX "vid %d invalid : 0x%x\n",
574 j, pst[j].vid);
575 return -EINVAL;
576 }
577 if (pst[j].vid < data->rvo) {
578 /* vid + rvo >= 0 */
579 printk(KERN_ERR FW_BUG PFX "0 vid exceeded with pstate"
580 " %d\n", j);
581 return -ENODEV;
582 }
583 if (pst[j].vid < maxvid + data->rvo) {
584 /* vid + rvo >= maxvid */
585 printk(KERN_ERR FW_BUG PFX "maxvid exceeded with pstate"
586 " %d\n", j);
587 return -ENODEV;
588 }
589 if (pst[j].fid > MAX_FID) {
590 printk(KERN_ERR FW_BUG PFX "maxfid exceeded with pstate"
591 " %d\n", j);
592 return -ENODEV;
593 }
594 if (j && (pst[j].fid < HI_FID_TABLE_BOTTOM)) {
595 /* Only first fid is allowed to be in "low" range */
596 printk(KERN_ERR FW_BUG PFX "two low fids - %d : "
597 "0x%x\n", j, pst[j].fid);
598 return -EINVAL;
599 }
600 if (pst[j].fid < lastfid)
601 lastfid = pst[j].fid;
602 }
603 if (lastfid & 1) {
604 printk(KERN_ERR FW_BUG PFX "lastfid invalid\n");
605 return -EINVAL;
606 }
607 if (lastfid > LO_FID_TABLE_TOP)
608 printk(KERN_INFO FW_BUG PFX
609 "first fid not from lo freq table\n");
610
611 return 0;
612}
613
614static void invalidate_entry(struct cpufreq_frequency_table *powernow_table,
615 unsigned int entry)
616{
617 powernow_table[entry].frequency = CPUFREQ_ENTRY_INVALID;
618}
619
620static void print_basics(struct powernow_k8_data *data)
621{
622 int j;
623 for (j = 0; j < data->numps; j++) {
624 if (data->powernow_table[j].frequency !=
625 CPUFREQ_ENTRY_INVALID) {
626 if (cpu_family == CPU_HW_PSTATE) {
627 printk(KERN_INFO PFX
628 " %d : pstate %d (%d MHz)\n", j,
629 data->powernow_table[j].index,
630 data->powernow_table[j].frequency/1000);
631 } else {
632 printk(KERN_INFO PFX
633 " %d : fid 0x%x (%d MHz), vid 0x%x\n",
634 j,
635 data->powernow_table[j].index & 0xff,
636 data->powernow_table[j].frequency/1000,
637 data->powernow_table[j].index >> 8);
638 }
639 }
640 }
641 if (data->batps)
642 printk(KERN_INFO PFX "Only %d pstates on battery\n",
643 data->batps);
644}
645
646static u32 freq_from_fid_did(u32 fid, u32 did)
647{
648 u32 mhz = 0;
649
650 if (boot_cpu_data.x86 == 0x10)
651 mhz = (100 * (fid + 0x10)) >> did;
652 else if (boot_cpu_data.x86 == 0x11)
653 mhz = (100 * (fid + 8)) >> did;
654 else
655 BUG();
656
657 return mhz * 1000;
658}
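/*
 * Example: on family 0x10, fid 0 with did 0 gives 100 * (0 + 0x10) >> 0 =
 * 1600 MHz, returned as 1600000 kHz.
 */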
659
660static int fill_powernow_table(struct powernow_k8_data *data,
661 struct pst_s *pst, u8 maxvid)
662{
663 struct cpufreq_frequency_table *powernow_table;
664 unsigned int j;
665
666 if (data->batps) {
667 /* use ACPI support to get full speed on mains power */
668 printk(KERN_WARNING PFX
669 "Only %d pstates usable (use ACPI driver for full "
670			"range)\n", data->batps);
671 data->numps = data->batps;
672 }
673
674 for (j = 1; j < data->numps; j++) {
675 if (pst[j-1].fid >= pst[j].fid) {
676 printk(KERN_ERR PFX "PST out of sequence\n");
677 return -EINVAL;
678 }
679 }
680
681 if (data->numps < 2) {
682 printk(KERN_ERR PFX "no p states to transition\n");
683 return -ENODEV;
684 }
685
686 if (check_pst_table(data, pst, maxvid))
687 return -EINVAL;
688
689 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
690 * (data->numps + 1)), GFP_KERNEL);
691 if (!powernow_table) {
692 printk(KERN_ERR PFX "powernow_table memory alloc failure\n");
693 return -ENOMEM;
694 }
695
696 for (j = 0; j < data->numps; j++) {
697 int freq;
698 powernow_table[j].index = pst[j].fid; /* lower 8 bits */
699 powernow_table[j].index |= (pst[j].vid << 8); /* upper 8 bits */
700 freq = find_khz_freq_from_fid(pst[j].fid);
701 powernow_table[j].frequency = freq;
702 }
703 powernow_table[data->numps].frequency = CPUFREQ_TABLE_END;
704 powernow_table[data->numps].index = 0;
705
706 if (query_current_values_with_pending_wait(data)) {
707 kfree(powernow_table);
708 return -EIO;
709 }
710
711 dprintk("cfid 0x%x, cvid 0x%x\n", data->currfid, data->currvid);
712 data->powernow_table = powernow_table;
713 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
714 print_basics(data);
715
716 for (j = 0; j < data->numps; j++)
717 if ((pst[j].fid == data->currfid) &&
718 (pst[j].vid == data->currvid))
719 return 0;
720
721 dprintk("currfid/vid do not match PST, ignoring\n");
722 return 0;
723}
724
725/* Find and validate the PSB/PST table in BIOS. */
726static int find_psb_table(struct powernow_k8_data *data)
727{
728 struct psb_s *psb;
729 unsigned int i;
730 u32 mvs;
731 u8 maxvid;
732 u32 cpst = 0;
733 u32 thiscpuid;
734
735 for (i = 0xc0000; i < 0xffff0; i += 0x10) {
736 /* Scan BIOS looking for the signature. */
737 /* It can not be at ffff0 - it is too big. */
738
739 psb = phys_to_virt(i);
740 if (memcmp(psb, PSB_ID_STRING, PSB_ID_STRING_LEN) != 0)
741 continue;
742
743 dprintk("found PSB header at 0x%p\n", psb);
744
745 dprintk("table vers: 0x%x\n", psb->tableversion);
746 if (psb->tableversion != PSB_VERSION_1_4) {
747 printk(KERN_ERR FW_BUG PFX "PSB table is not v1.4\n");
748 return -ENODEV;
749 }
750
751 dprintk("flags: 0x%x\n", psb->flags1);
752 if (psb->flags1) {
753 printk(KERN_ERR FW_BUG PFX "unknown flags\n");
754 return -ENODEV;
755 }
756
757 data->vstable = psb->vstable;
758 dprintk("voltage stabilization time: %d(*20us)\n",
759 data->vstable);
760
761 dprintk("flags2: 0x%x\n", psb->flags2);
762 data->rvo = psb->flags2 & 3;
763 data->irt = ((psb->flags2) >> 2) & 3;
764 mvs = ((psb->flags2) >> 4) & 3;
765 data->vidmvs = 1 << mvs;
766 data->batps = ((psb->flags2) >> 6) & 3;
767
768 dprintk("ramp voltage offset: %d\n", data->rvo);
769 dprintk("isochronous relief time: %d\n", data->irt);
770 dprintk("maximum voltage step: %d - 0x%x\n", mvs, data->vidmvs);
771
772 dprintk("numpst: 0x%x\n", psb->num_tables);
773 cpst = psb->num_tables;
774 if ((psb->cpuid == 0x00000fc0) ||
775 (psb->cpuid == 0x00000fe0)) {
776 thiscpuid = cpuid_eax(CPUID_PROCESSOR_SIGNATURE);
777 if ((thiscpuid == 0x00000fc0) ||
778 (thiscpuid == 0x00000fe0))
779 cpst = 1;
780 }
781 if (cpst != 1) {
782 printk(KERN_ERR FW_BUG PFX "numpst must be 1\n");
783 return -ENODEV;
784 }
785
786 data->plllock = psb->plllocktime;
787 dprintk("plllocktime: 0x%x (units 1us)\n", psb->plllocktime);
788 dprintk("maxfid: 0x%x\n", psb->maxfid);
789 dprintk("maxvid: 0x%x\n", psb->maxvid);
790 maxvid = psb->maxvid;
791
792 data->numps = psb->numps;
793 dprintk("numpstates: 0x%x\n", data->numps);
794 return fill_powernow_table(data,
795 (struct pst_s *)(psb+1), maxvid);
796 }
797 /*
798 * If you see this message, complain to BIOS manufacturer. If
799 * he tells you "we do not support Linux" or some similar
800 * nonsense, remember that Windows 2000 uses the same legacy
801 * mechanism that the old Linux PSB driver uses. Tell them it
802 * is broken with Windows 2000.
803 *
804 * The reference to the AMD documentation is chapter 9 in the
805 * BIOS and Kernel Developer's Guide, which is available on
806 * www.amd.com
807 */
808 printk(KERN_ERR FW_BUG PFX "No PSB or ACPI _PSS objects\n");
809 printk(KERN_ERR PFX "Make sure that your BIOS is up to date"
810 " and Cool'N'Quiet support is enabled in BIOS setup\n");
811 return -ENODEV;
812}
813
814static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
815 unsigned int index)
816{
817 u64 control;
818
819 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
820 return;
821
822 control = data->acpi_data.states[index].control;
823 data->irt = (control >> IRT_SHIFT) & IRT_MASK;
824 data->rvo = (control >> RVO_SHIFT) & RVO_MASK;
825 data->exttype = (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK;
826 data->plllock = (control >> PLL_L_SHIFT) & PLL_L_MASK;
827 data->vidmvs = 1 << ((control >> MVS_SHIFT) & MVS_MASK);
828 data->vstable = (control >> VST_SHIFT) & VST_MASK;
829}
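/*
 * Minimal user-space sketch of the _PSS "control" decode performed above,
 * using the shift/mask constants from powernow-k8.h; the sample control
 * value is hypothetical and chosen only for illustration.
 */
#include <stdio.h>

#define IRT_SHIFT	30
#define RVO_SHIFT	28
#define EXT_TYPE_SHIFT	27
#define PLL_L_SHIFT	20
#define MVS_SHIFT	18
#define VST_SHIFT	11
#define VID_SHIFT	6
#define IRT_MASK	3
#define RVO_MASK	3
#define EXT_TYPE_MASK	1
#define PLL_L_MASK	0x7f
#define MVS_MASK	3
#define VST_MASK	0x7f
#define VID_MASK	0x1f
#define FID_MASK	0x1f

int main(void)
{
	unsigned long long control = 0x62842c8eULL;	/* hypothetical _PSS control */

	/* prints: irt 1 rvo 2 exttype 0 plllock 40 vidmvs 2 vstable 5 vid 0x12 fid 0xe */
	printf("irt %llu rvo %llu exttype %llu plllock %llu vidmvs %llu "
	       "vstable %llu vid 0x%llx fid 0x%llx\n",
	       (control >> IRT_SHIFT) & IRT_MASK,
	       (control >> RVO_SHIFT) & RVO_MASK,
	       (control >> EXT_TYPE_SHIFT) & EXT_TYPE_MASK,
	       (control >> PLL_L_SHIFT) & PLL_L_MASK,
	       1ULL << ((control >> MVS_SHIFT) & MVS_MASK),
	       (control >> VST_SHIFT) & VST_MASK,
	       (control >> VID_SHIFT) & VID_MASK,
	       control & FID_MASK);
	return 0;
}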
830
831static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
832{
833 struct cpufreq_frequency_table *powernow_table;
834 int ret_val = -ENODEV;
835 u64 control, status;
836
837 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
838 dprintk("register performance failed: bad ACPI data\n");
839 return -EIO;
840 }
841
842 /* verify the data contained in the ACPI structures */
843 if (data->acpi_data.state_count <= 1) {
844 dprintk("No ACPI P-States\n");
845 goto err_out;
846 }
847
848 control = data->acpi_data.control_register.space_id;
849 status = data->acpi_data.status_register.space_id;
850
851 if ((control != ACPI_ADR_SPACE_FIXED_HARDWARE) ||
852 (status != ACPI_ADR_SPACE_FIXED_HARDWARE)) {
853 dprintk("Invalid control/status registers (%x - %x)\n",
854 control, status);
855 goto err_out;
856 }
857
858 /* fill in data->powernow_table */
859 powernow_table = kmalloc((sizeof(struct cpufreq_frequency_table)
860 * (data->acpi_data.state_count + 1)), GFP_KERNEL);
861 if (!powernow_table) {
862 dprintk("powernow_table memory alloc failure\n");
863 goto err_out;
864 }
865
866 /* fill in data */
867 data->numps = data->acpi_data.state_count;
868 powernow_k8_acpi_pst_values(data, 0);
869
870 if (cpu_family == CPU_HW_PSTATE)
871 ret_val = fill_powernow_table_pstate(data, powernow_table);
872 else
873 ret_val = fill_powernow_table_fidvid(data, powernow_table);
874 if (ret_val)
875 goto err_out_mem;
876
877 powernow_table[data->acpi_data.state_count].frequency =
878 CPUFREQ_TABLE_END;
879 powernow_table[data->acpi_data.state_count].index = 0;
880 data->powernow_table = powernow_table;
881
882 if (cpumask_first(cpu_core_mask(data->cpu)) == data->cpu)
883 print_basics(data);
884
885 /* notify BIOS that we exist */
886 acpi_processor_notify_smm(THIS_MODULE);
887
888 if (!zalloc_cpumask_var(&data->acpi_data.shared_cpu_map, GFP_KERNEL)) {
889 printk(KERN_ERR PFX
890 "unable to alloc powernow_k8_data cpumask\n");
891 ret_val = -ENOMEM;
892 goto err_out_mem;
893 }
894
895 return 0;
896
897err_out_mem:
898 kfree(powernow_table);
899
900err_out:
901 acpi_processor_unregister_performance(&data->acpi_data, data->cpu);
902
903 /* data->acpi_data.state_count informs us at ->exit()
904 * whether ACPI was used */
905 data->acpi_data.state_count = 0;
906
907 return ret_val;
908}
909
910static int fill_powernow_table_pstate(struct powernow_k8_data *data,
911 struct cpufreq_frequency_table *powernow_table)
912{
913 int i;
914 u32 hi = 0, lo = 0;
915 rdmsr(MSR_PSTATE_CUR_LIMIT, lo, hi);
916 data->max_hw_pstate = (lo & HW_PSTATE_MAX_MASK) >> HW_PSTATE_MAX_SHIFT;
917
918 for (i = 0; i < data->acpi_data.state_count; i++) {
919 u32 index;
920
921 index = data->acpi_data.states[i].control & HW_PSTATE_MASK;
922 if (index > data->max_hw_pstate) {
923 printk(KERN_ERR PFX "invalid pstate %d - "
924 "bad value %d.\n", i, index);
925 printk(KERN_ERR PFX "Please report to BIOS "
926 "manufacturer\n");
927 invalidate_entry(powernow_table, i);
928 continue;
929 }
930 rdmsr(MSR_PSTATE_DEF_BASE + index, lo, hi);
931 if (!(hi & HW_PSTATE_VALID_MASK)) {
932 dprintk("invalid pstate %d, ignoring\n", index);
933 invalidate_entry(powernow_table, i);
934 continue;
935 }
936
937 powernow_table[i].index = index;
938
939 /* Frequency may be rounded for these */
940 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
941 || boot_cpu_data.x86 == 0x11) {
942 powernow_table[i].frequency =
943 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
944 } else
945 powernow_table[i].frequency =
946 data->acpi_data.states[i].core_frequency * 1000;
947 }
948 return 0;
949}
950
951static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
952 struct cpufreq_frequency_table *powernow_table)
953{
954 int i;
955
956 for (i = 0; i < data->acpi_data.state_count; i++) {
957 u32 fid;
958 u32 vid;
959 u32 freq, index;
960 u64 status, control;
961
962 if (data->exttype) {
963 status = data->acpi_data.states[i].status;
964 fid = status & EXT_FID_MASK;
965 vid = (status >> VID_SHIFT) & EXT_VID_MASK;
966 } else {
967 control = data->acpi_data.states[i].control;
968 fid = control & FID_MASK;
969 vid = (control >> VID_SHIFT) & VID_MASK;
970 }
971
972 dprintk(" %d : fid 0x%x, vid 0x%x\n", i, fid, vid);
973
974 index = fid | (vid<<8);
975 powernow_table[i].index = index;
976
977 freq = find_khz_freq_from_fid(fid);
978 powernow_table[i].frequency = freq;
979
980 /* verify frequency is OK */
981 if ((freq > (MAX_FREQ * 1000)) || (freq < (MIN_FREQ * 1000))) {
982 dprintk("invalid freq %u kHz, ignoring\n", freq);
983 invalidate_entry(powernow_table, i);
984 continue;
985 }
986
987 /* verify voltage is OK -
988 * BIOSs are using "off" to indicate invalid */
989 if (vid == VID_OFF) {
990 dprintk("invalid vid %u, ignoring\n", vid);
991 invalidate_entry(powernow_table, i);
992 continue;
993 }
994
995 if (freq != (data->acpi_data.states[i].core_frequency * 1000)) {
996 printk(KERN_INFO PFX "invalid freq entries "
997 "%u kHz vs. %u kHz\n", freq,
998 (unsigned int)
999 (data->acpi_data.states[i].core_frequency
1000 * 1000));
1001 invalidate_entry(powernow_table, i);
1002 continue;
1003 }
1004 }
1005 return 0;
1006}
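/*
 * For example, fid 0x0e and vid 0x12 (values chosen only for illustration)
 * are stored above as index 0x0e | (0x12 << 8) = 0x120e;
 * transition_frequency_fidvid() below recovers them again with
 * (index & 0xff) and (index & 0xff00) >> 8.
 */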
1007
1008static void powernow_k8_cpu_exit_acpi(struct powernow_k8_data *data)
1009{
1010 if (data->acpi_data.state_count)
1011 acpi_processor_unregister_performance(&data->acpi_data,
1012 data->cpu);
1013 free_cpumask_var(data->acpi_data.shared_cpu_map);
1014}
1015
1016static int get_transition_latency(struct powernow_k8_data *data)
1017{
1018 int max_latency = 0;
1019 int i;
1020 for (i = 0; i < data->acpi_data.state_count; i++) {
1021 int cur_latency = data->acpi_data.states[i].transition_latency
1022 + data->acpi_data.states[i].bus_master_latency;
1023 if (cur_latency > max_latency)
1024 max_latency = cur_latency;
1025 }
1026 if (max_latency == 0) {
1027 /*
1028 * Fam 11h and later may return 0 as transition latency. This
1029 * is intended and means "very fast". While cpufreq core and
1030 * governors currently can handle that gracefully, better set it
1031 * to 1 to avoid problems in the future.
1032 */
1033 if (boot_cpu_data.x86 < 0x11)
1034 printk(KERN_ERR FW_WARN PFX "Invalid zero transition "
1035 "latency\n");
1036 max_latency = 1;
1037 }
1038 /* value in usecs, needs to be in nanoseconds */
1039 return 1000 * max_latency;
1040}
1041
1042/* Take a frequency, and issue the fid/vid transition command */
1043static int transition_frequency_fidvid(struct powernow_k8_data *data,
1044 unsigned int index)
1045{
1046 u32 fid = 0;
1047 u32 vid = 0;
1048 int res, i;
1049 struct cpufreq_freqs freqs;
1050
1051 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1052
1053 /* fid/vid correctness check for k8 */
1054 /* fid are the lower 8 bits of the index we stored into
1055 * the cpufreq frequency table in find_psb_table, vid
1056 * are the upper 8 bits.
1057 */
1058 fid = data->powernow_table[index].index & 0xFF;
1059 vid = (data->powernow_table[index].index & 0xFF00) >> 8;
1060
1061 dprintk("table matched fid 0x%x, giving vid 0x%x\n", fid, vid);
1062
1063 if (query_current_values_with_pending_wait(data))
1064 return 1;
1065
1066 if ((data->currvid == vid) && (data->currfid == fid)) {
1067 dprintk("target matches current values (fid 0x%x, vid 0x%x)\n",
1068 fid, vid);
1069 return 0;
1070 }
1071
1072 dprintk("cpu %d, changing to fid 0x%x, vid 0x%x\n",
1073 smp_processor_id(), fid, vid);
1074 freqs.old = find_khz_freq_from_fid(data->currfid);
1075 freqs.new = find_khz_freq_from_fid(fid);
1076
1077 for_each_cpu(i, data->available_cores) {
1078 freqs.cpu = i;
1079 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1080 }
1081
1082 res = transition_fid_vid(data, fid, vid);
1083 freqs.new = find_khz_freq_from_fid(data->currfid);
1084
1085 for_each_cpu(i, data->available_cores) {
1086 freqs.cpu = i;
1087 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1088 }
1089 return res;
1090}
1091
1092/* Take a frequency, and issue the hardware pstate transition command */
1093static int transition_frequency_pstate(struct powernow_k8_data *data,
1094 unsigned int index)
1095{
1096 u32 pstate = 0;
1097 int res, i;
1098 struct cpufreq_freqs freqs;
1099
1100 dprintk("cpu %d transition to index %u\n", smp_processor_id(), index);
1101
1102 /* get MSR index for hardware pstate transition */
1103 pstate = index & HW_PSTATE_MASK;
1104 if (pstate > data->max_hw_pstate)
1105 return 0;
1106 freqs.old = find_khz_freq_from_pstate(data->powernow_table,
1107 data->currpstate);
1108 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1109
1110 for_each_cpu(i, data->available_cores) {
1111 freqs.cpu = i;
1112 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
1113 }
1114
1115 res = transition_pstate(data, pstate);
1116 freqs.new = find_khz_freq_from_pstate(data->powernow_table, pstate);
1117
1118 for_each_cpu(i, data->available_cores) {
1119 freqs.cpu = i;
1120 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
1121 }
1122 return res;
1123}
1124
1125/* Driver entry point to switch to the target frequency */
1126static int powernowk8_target(struct cpufreq_policy *pol,
1127 unsigned targfreq, unsigned relation)
1128{
1129 cpumask_var_t oldmask;
1130 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1131 u32 checkfid;
1132 u32 checkvid;
1133 unsigned int newstate;
1134 int ret = -EIO;
1135
1136 if (!data)
1137 return -EINVAL;
1138
1139 checkfid = data->currfid;
1140 checkvid = data->currvid;
1141
1142 /* only run on specific CPU from here on. */
1143 /* This is poor form: use a workqueue or smp_call_function_single */
1144 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1145 return -ENOMEM;
1146
1147 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1148 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1149
1150 if (smp_processor_id() != pol->cpu) {
1151 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
1152 goto err_out;
1153 }
1154
1155 if (pending_bit_stuck()) {
1156 printk(KERN_ERR PFX "failing targ, change pending bit set\n");
1157 goto err_out;
1158 }
1159
1160 dprintk("targ: cpu %d, %d kHz, min %d, max %d, relation %d\n",
1161 pol->cpu, targfreq, pol->min, pol->max, relation);
1162
1163 if (query_current_values_with_pending_wait(data))
1164 goto err_out;
1165
1166 if (cpu_family != CPU_HW_PSTATE) {
1167 dprintk("targ: curr fid 0x%x, vid 0x%x\n",
1168 data->currfid, data->currvid);
1169
1170 if ((checkvid != data->currvid) ||
1171 (checkfid != data->currfid)) {
1172 printk(KERN_INFO PFX
1173				"error - out of sync, fid 0x%x 0x%x, "
1174 "vid 0x%x 0x%x\n",
1175 checkfid, data->currfid,
1176 checkvid, data->currvid);
1177 }
1178 }
1179
1180 if (cpufreq_frequency_table_target(pol, data->powernow_table,
1181 targfreq, relation, &newstate))
1182 goto err_out;
1183
1184 mutex_lock(&fidvid_mutex);
1185
1186 powernow_k8_acpi_pst_values(data, newstate);
1187
1188 if (cpu_family == CPU_HW_PSTATE)
1189 ret = transition_frequency_pstate(data, newstate);
1190 else
1191 ret = transition_frequency_fidvid(data, newstate);
1192 if (ret) {
1193 printk(KERN_ERR PFX "transition frequency failed\n");
1194 ret = 1;
1195 mutex_unlock(&fidvid_mutex);
1196 goto err_out;
1197 }
1198 mutex_unlock(&fidvid_mutex);
1199
1200 if (cpu_family == CPU_HW_PSTATE)
1201 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1202 newstate);
1203 else
1204 pol->cur = find_khz_freq_from_fid(data->currfid);
1205 ret = 0;
1206
1207err_out:
1208 set_cpus_allowed_ptr(current, oldmask);
1209 free_cpumask_var(oldmask);
1210 return ret;
1211}
1212
1213/* Driver entry point to verify the policy and range of frequencies */
1214static int powernowk8_verify(struct cpufreq_policy *pol)
1215{
1216 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1217
1218 if (!data)
1219 return -EINVAL;
1220
1221 return cpufreq_frequency_table_verify(pol, data->powernow_table);
1222}
1223
1224struct init_on_cpu {
1225 struct powernow_k8_data *data;
1226 int rc;
1227};
1228
1229static void __cpuinit powernowk8_cpu_init_on_cpu(void *_init_on_cpu)
1230{
1231 struct init_on_cpu *init_on_cpu = _init_on_cpu;
1232
1233 if (pending_bit_stuck()) {
1234 printk(KERN_ERR PFX "failing init, change pending bit set\n");
1235 init_on_cpu->rc = -ENODEV;
1236 return;
1237 }
1238
1239 if (query_current_values_with_pending_wait(init_on_cpu->data)) {
1240 init_on_cpu->rc = -ENODEV;
1241 return;
1242 }
1243
1244 if (cpu_family == CPU_OPTERON)
1245 fidvid_msr_init();
1246
1247 init_on_cpu->rc = 0;
1248}
1249
1250/* per CPU init entry point to the driver */
1251static int __cpuinit powernowk8_cpu_init(struct cpufreq_policy *pol)
1252{
1253 static const char ACPI_PSS_BIOS_BUG_MSG[] =
1254 KERN_ERR FW_BUG PFX "No compatible ACPI _PSS objects found.\n"
1255 FW_BUG PFX "Try again with latest BIOS.\n";
1256 struct powernow_k8_data *data;
1257 struct init_on_cpu init_on_cpu;
1258 int rc;
1259 struct cpuinfo_x86 *c = &cpu_data(pol->cpu);
1260
1261 if (!cpu_online(pol->cpu))
1262 return -ENODEV;
1263
1264 smp_call_function_single(pol->cpu, check_supported_cpu, &rc, 1);
1265 if (rc)
1266 return -ENODEV;
1267
1268 data = kzalloc(sizeof(struct powernow_k8_data), GFP_KERNEL);
1269 if (!data) {
1270 printk(KERN_ERR PFX "unable to alloc powernow_k8_data");
1271 return -ENOMEM;
1272 }
1273
1274 data->cpu = pol->cpu;
1275 data->currpstate = HW_PSTATE_INVALID;
1276
1277 if (powernow_k8_cpu_init_acpi(data)) {
1278 /*
1279		 * Use the PSB BIOS structure. This is only available on
1280		 * a UP (uniprocessor) system, and is deprecated by AMD.
1281 */
1282 if (num_online_cpus() != 1) {
1283 printk_once(ACPI_PSS_BIOS_BUG_MSG);
1284 goto err_out;
1285 }
1286 if (pol->cpu != 0) {
1287 printk(KERN_ERR FW_BUG PFX "No ACPI _PSS objects for "
1288 "CPU other than CPU0. Complain to your BIOS "
1289 "vendor.\n");
1290 goto err_out;
1291 }
1292 rc = find_psb_table(data);
1293 if (rc)
1294 goto err_out;
1295
1296 /* Take a crude guess here.
1297		 * The guess is in microseconds, so multiply by 1000 for nanoseconds. */
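		/*
		 * For example (values chosen only for illustration): with
		 * rvo = 2, vstable = 5 and irt = 1 this works out to
		 * ((2 + 8) * 5 * 20 + (1 << 1) * 30) * 1000 = 1060 * 1000
		 * = 1,060,000 ns, i.e. roughly 1 ms.
		 */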
1298 pol->cpuinfo.transition_latency = (
1299 ((data->rvo + 8) * data->vstable * VST_UNITS_20US) +
1300 ((1 << data->irt) * 30)) * 1000;
1301 } else /* ACPI _PSS objects available */
1302 pol->cpuinfo.transition_latency = get_transition_latency(data);
1303
1304 /* only run on specific CPU from here on */
1305 init_on_cpu.data = data;
1306 smp_call_function_single(data->cpu, powernowk8_cpu_init_on_cpu,
1307 &init_on_cpu, 1);
1308 rc = init_on_cpu.rc;
1309 if (rc != 0)
1310 goto err_out_exit_acpi;
1311
1312 if (cpu_family == CPU_HW_PSTATE)
1313 cpumask_copy(pol->cpus, cpumask_of(pol->cpu));
1314 else
1315 cpumask_copy(pol->cpus, cpu_core_mask(pol->cpu));
1316 data->available_cores = pol->cpus;
1317
1318 if (cpu_family == CPU_HW_PSTATE)
1319 pol->cur = find_khz_freq_from_pstate(data->powernow_table,
1320 data->currpstate);
1321 else
1322 pol->cur = find_khz_freq_from_fid(data->currfid);
1323 dprintk("policy current frequency %d kHz\n", pol->cur);
1324
1325 /* min/max the cpu is capable of */
1326 if (cpufreq_frequency_table_cpuinfo(pol, data->powernow_table)) {
1327 printk(KERN_ERR FW_BUG PFX "invalid powernow_table\n");
1328 powernow_k8_cpu_exit_acpi(data);
1329 kfree(data->powernow_table);
1330 kfree(data);
1331 return -EINVAL;
1332 }
1333
1334 /* Check for APERF/MPERF support in hardware */
1335 if (cpu_has(c, X86_FEATURE_APERFMPERF))
1336 cpufreq_amd64_driver.getavg = cpufreq_get_measured_perf;
1337
1338 cpufreq_frequency_table_get_attr(data->powernow_table, pol->cpu);
1339
1340 if (cpu_family == CPU_HW_PSTATE)
1341 dprintk("cpu_init done, current pstate 0x%x\n",
1342 data->currpstate);
1343 else
1344 dprintk("cpu_init done, current fid 0x%x, vid 0x%x\n",
1345 data->currfid, data->currvid);
1346
1347 per_cpu(powernow_data, pol->cpu) = data;
1348
1349 return 0;
1350
1351err_out_exit_acpi:
1352 powernow_k8_cpu_exit_acpi(data);
1353
1354err_out:
1355 kfree(data);
1356 return -ENODEV;
1357}
1358
1359static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1360{
1361 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1362
1363 if (!data)
1364 return -EINVAL;
1365
1366 powernow_k8_cpu_exit_acpi(data);
1367
1368 cpufreq_frequency_table_put_attr(pol->cpu);
1369
1370 kfree(data->powernow_table);
1371 kfree(data);
1372 per_cpu(powernow_data, pol->cpu) = NULL;
1373
1374 return 0;
1375}
1376
1377static void query_values_on_cpu(void *_err)
1378{
1379 int *err = _err;
1380 struct powernow_k8_data *data = __get_cpu_var(powernow_data);
1381
1382 *err = query_current_values_with_pending_wait(data);
1383}
1384
1385static unsigned int powernowk8_get(unsigned int cpu)
1386{
1387 struct powernow_k8_data *data = per_cpu(powernow_data, cpu);
1388 unsigned int khz = 0;
1389 int err;
1390
1391 if (!data)
1392 return 0;
1393
1394 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1395 if (err)
1396 goto out;
1397
1398 if (cpu_family == CPU_HW_PSTATE)
1399 khz = find_khz_freq_from_pstate(data->powernow_table,
1400 data->currpstate);
1401 else
1402 khz = find_khz_freq_from_fid(data->currfid);
1403
1404
1405out:
1406 return khz;
1407}
1408
1409static void _cpb_toggle_msrs(bool t)
1410{
1411 int cpu;
1412
1413 get_online_cpus();
1414
1415 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1416
1417 for_each_cpu(cpu, cpu_online_mask) {
1418 struct msr *reg = per_cpu_ptr(msrs, cpu);
1419 if (t)
1420 reg->l &= ~BIT(25);
1421 else
1422 reg->l |= BIT(25);
1423 }
1424 wrmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1425
1426 put_online_cpus();
1427}
1428
1429/*
1430 * Switch on/off core performance boosting.
1431 *
1432 * 0=disable
1433 * 1=enable.
1434 */
1435static void cpb_toggle(bool t)
1436{
1437 if (!cpb_capable)
1438 return;
1439
1440 if (t && !cpb_enabled) {
1441 cpb_enabled = true;
1442 _cpb_toggle_msrs(t);
1443 printk(KERN_INFO PFX "Core Boosting enabled.\n");
1444 } else if (!t && cpb_enabled) {
1445 cpb_enabled = false;
1446 _cpb_toggle_msrs(t);
1447 printk(KERN_INFO PFX "Core Boosting disabled.\n");
1448 }
1449}
1450
1451static ssize_t store_cpb(struct cpufreq_policy *policy, const char *buf,
1452 size_t count)
1453{
1454 int ret = -EINVAL;
1455 unsigned long val = 0;
1456
1457 ret = strict_strtoul(buf, 10, &val);
1458 if (!ret && (val == 0 || val == 1) && cpb_capable)
1459 cpb_toggle(val);
1460 else
1461 return -EINVAL;
1462
1463 return count;
1464}
1465
1466static ssize_t show_cpb(struct cpufreq_policy *policy, char *buf)
1467{
1468 return sprintf(buf, "%u\n", cpb_enabled);
1469}
1470
1471#define define_one_rw(_name) \
1472static struct freq_attr _name = \
1473__ATTR(_name, 0644, show_##_name, store_##_name)
1474
1475define_one_rw(cpb);
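/*
 * The "cpb" attribute created here (mode 0644) is typically exposed by the
 * cpufreq core as /sys/devices/system/cpu/cpuN/cpufreq/cpb (path given for
 * illustration only); writes of 0 or 1 end up in store_cpb() above, reads
 * in show_cpb().
 */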
1476
1477static struct freq_attr *powernow_k8_attr[] = {
1478 &cpufreq_freq_attr_scaling_available_freqs,
1479 &cpb,
1480 NULL,
1481};
1482
1483static struct cpufreq_driver cpufreq_amd64_driver = {
1484 .verify = powernowk8_verify,
1485 .target = powernowk8_target,
1486 .bios_limit = acpi_processor_get_bios_limit,
1487 .init = powernowk8_cpu_init,
1488 .exit = __devexit_p(powernowk8_cpu_exit),
1489 .get = powernowk8_get,
1490 .name = "powernow-k8",
1491 .owner = THIS_MODULE,
1492 .attr = powernow_k8_attr,
1493};
1494
1495/*
1496 * Clear the boost-disable flag on the CPU_DOWN path so that this cpu
1497 * cannot block the remaining ones from boosting. On the CPU_UP path we
1498 * simply keep the boost-disable flag in sync with the current global
1499 * state.
1500 */
1501static int cpb_notify(struct notifier_block *nb, unsigned long action,
1502 void *hcpu)
1503{
1504 unsigned cpu = (long)hcpu;
1505 u32 lo, hi;
1506
1507 switch (action) {
1508 case CPU_UP_PREPARE:
1509 case CPU_UP_PREPARE_FROZEN:
1510
1511 if (!cpb_enabled) {
1512 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1513 lo |= BIT(25);
1514 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1515 }
1516 break;
1517
1518 case CPU_DOWN_PREPARE:
1519 case CPU_DOWN_PREPARE_FROZEN:
1520 rdmsr_on_cpu(cpu, MSR_K7_HWCR, &lo, &hi);
1521 lo &= ~BIT(25);
1522 wrmsr_on_cpu(cpu, MSR_K7_HWCR, lo, hi);
1523 break;
1524
1525 default:
1526 break;
1527 }
1528
1529 return NOTIFY_OK;
1530}
1531
1532static struct notifier_block cpb_nb = {
1533 .notifier_call = cpb_notify,
1534};
1535
1536/* driver entry point for init */
1537static int __cpuinit powernowk8_init(void)
1538{
1539 unsigned int i, supported_cpus = 0, cpu;
1540
1541 for_each_online_cpu(i) {
1542 int rc;
1543 smp_call_function_single(i, check_supported_cpu, &rc, 1);
1544 if (rc == 0)
1545 supported_cpus++;
1546 }
1547
1548 if (supported_cpus != num_online_cpus())
1549 return -ENODEV;
1550
1551 printk(KERN_INFO PFX "Found %d %s (%d cpu cores) (" VERSION ")\n",
1552 num_online_nodes(), boot_cpu_data.x86_model_id, supported_cpus);
1553
1554 if (boot_cpu_has(X86_FEATURE_CPB)) {
1555
1556 cpb_capable = true;
1557
1558 register_cpu_notifier(&cpb_nb);
1559
1560 msrs = msrs_alloc();
1561 if (!msrs) {
1562 printk(KERN_ERR "%s: Error allocating msrs!\n", __func__);
1563 return -ENOMEM;
1564 }
1565
1566 rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
1567
1568 for_each_cpu(cpu, cpu_online_mask) {
1569 struct msr *reg = per_cpu_ptr(msrs, cpu);
1570 cpb_enabled |= !(!!(reg->l & BIT(25)));
1571 }
1572
1573 printk(KERN_INFO PFX "Core Performance Boosting: %s.\n",
1574 (cpb_enabled ? "on" : "off"));
1575 }
1576
1577 return cpufreq_register_driver(&cpufreq_amd64_driver);
1578}
1579
1580/* driver entry point for term */
1581static void __exit powernowk8_exit(void)
1582{
1583 dprintk("exit\n");
1584
1585 if (boot_cpu_has(X86_FEATURE_CPB)) {
1586 msrs_free(msrs);
1587 msrs = NULL;
1588
1589 unregister_cpu_notifier(&cpb_nb);
1590 }
1591
1592 cpufreq_unregister_driver(&cpufreq_amd64_driver);
1593}
1594
1595MODULE_AUTHOR("Paul Devriendt <paul.devriendt@amd.com> and "
1596 "Mark Langsdorf <mark.langsdorf@amd.com>");
1597MODULE_DESCRIPTION("AMD Athlon 64 and Opteron processor frequency driver.");
1598MODULE_LICENSE("GPL");
1599
1600late_initcall(powernowk8_init);
1601module_exit(powernowk8_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h b/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
deleted file mode 100644
index df3529b1c02d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.h
+++ /dev/null
@@ -1,224 +0,0 @@
1/*
2 * (c) 2003-2006 Advanced Micro Devices, Inc.
3 * Your use of this code is subject to the terms and conditions of the
4 * GNU general public license version 2. See "COPYING" or
5 * http://www.gnu.org/licenses/gpl.html
6 */
7
8enum pstate {
9 HW_PSTATE_INVALID = 0xff,
10 HW_PSTATE_0 = 0,
11 HW_PSTATE_1 = 1,
12 HW_PSTATE_2 = 2,
13 HW_PSTATE_3 = 3,
14 HW_PSTATE_4 = 4,
15 HW_PSTATE_5 = 5,
16 HW_PSTATE_6 = 6,
17 HW_PSTATE_7 = 7,
18};
19
20struct powernow_k8_data {
21 unsigned int cpu;
22
23 u32 numps; /* number of p-states */
24 u32 batps; /* number of p-states supported on battery */
25 u32 max_hw_pstate; /* maximum legal hardware pstate */
26
27 /* these values are constant when the PSB is used to determine
28 * vid/fid pairings, but are modified during the ->target() call
29 * when ACPI is used */
30 u32 rvo; /* ramp voltage offset */
31 u32 irt; /* isochronous relief time */
32 u32 vidmvs; /* usable value calculated from mvs */
33 u32 vstable; /* voltage stabilization time, units 20 us */
34 u32 plllock; /* pll lock time, units 1 us */
35 u32 exttype; /* extended interface = 1 */
36
37 /* keep track of the current fid / vid or pstate */
38 u32 currvid;
39 u32 currfid;
40 enum pstate currpstate;
41
42 /* the powernow_table includes all frequency and vid/fid pairings:
43 * fid are the lower 8 bits of the index, vid are the upper 8 bits.
44 * frequency is in kHz */
45 struct cpufreq_frequency_table *powernow_table;
46
47 /* the acpi table needs to be kept. it's only available if ACPI was
48 * used to determine valid frequency/vid/fid states */
49 struct acpi_processor_performance acpi_data;
50
51 /* we need to keep track of associated cores, but let cpufreq
52 * handle hotplug events - so just point at cpufreq pol->cpus
53 * structure */
54 struct cpumask *available_cores;
55};
56
57/* processor's cpuid instruction support */
58#define CPUID_PROCESSOR_SIGNATURE 1 /* function 1 */
59#define CPUID_XFAM 0x0ff00000 /* extended family */
60#define CPUID_XFAM_K8 0
61#define CPUID_XMOD 0x000f0000 /* extended model */
62#define CPUID_XMOD_REV_MASK 0x000c0000
63#define CPUID_XFAM_10H 0x00100000 /* family 0x10 */
64#define CPUID_USE_XFAM_XMOD 0x00000f00
65#define CPUID_GET_MAX_CAPABILITIES 0x80000000
66#define CPUID_FREQ_VOLT_CAPABILITIES 0x80000007
67#define P_STATE_TRANSITION_CAPABLE 6
68
69/* Model Specific Registers for p-state transitions. MSRs are 64-bit. For */
70/* writes (wrmsr - opcode 0f 30), the register number is placed in ecx, and */
71/* the value to write is placed in edx:eax. For reads (rdmsr - opcode 0f 32), */
72/* the register number is placed in ecx, and the data is returned in edx:eax. */
73
74#define MSR_FIDVID_CTL 0xc0010041
75#define MSR_FIDVID_STATUS 0xc0010042
76
77/* Field definitions within the FID VID Low Control MSR : */
78#define MSR_C_LO_INIT_FID_VID 0x00010000
79#define MSR_C_LO_NEW_VID 0x00003f00
80#define MSR_C_LO_NEW_FID 0x0000003f
81#define MSR_C_LO_VID_SHIFT 8
82
83/* Field definitions within the FID VID High Control MSR : */
84#define MSR_C_HI_STP_GNT_TO 0x000fffff
85
86/* Field definitions within the FID VID Low Status MSR : */
87#define MSR_S_LO_CHANGE_PENDING 0x80000000 /* cleared when completed */
88#define MSR_S_LO_MAX_RAMP_VID 0x3f000000
89#define MSR_S_LO_MAX_FID 0x003f0000
90#define MSR_S_LO_START_FID 0x00003f00
91#define MSR_S_LO_CURRENT_FID 0x0000003f
92
93/* Field definitions within the FID VID High Status MSR : */
94#define MSR_S_HI_MIN_WORKING_VID 0x3f000000
95#define MSR_S_HI_MAX_WORKING_VID 0x003f0000
96#define MSR_S_HI_START_VID 0x00003f00
97#define MSR_S_HI_CURRENT_VID 0x0000003f
98#define MSR_C_HI_STP_GNT_BENIGN 0x00000001
99
100
101/* Hardware Pstate _PSS and MSR definitions */
102#define USE_HW_PSTATE 0x00000080
103#define HW_PSTATE_MASK 0x00000007
104#define HW_PSTATE_VALID_MASK 0x80000000
105#define HW_PSTATE_MAX_MASK 0x000000f0
106#define HW_PSTATE_MAX_SHIFT 4
107#define MSR_PSTATE_DEF_BASE 0xc0010064 /* base of Pstate MSRs */
108#define MSR_PSTATE_STATUS 0xc0010063 /* Pstate Status MSR */
109#define MSR_PSTATE_CTRL 0xc0010062 /* Pstate control MSR */
110#define MSR_PSTATE_CUR_LIMIT 0xc0010061 /* pstate current limit MSR */
111
112/* define the two driver architectures */
113#define CPU_OPTERON 0
114#define CPU_HW_PSTATE 1
115
116
117/*
118 * There are restrictions frequencies have to follow:
119 * - only 1 entry in the low fid table ( <=1.4GHz )
120 * - lowest entry in the high fid table must be >= 2 * the entry in the
121 * low fid table
122 * - lowest entry in the high fid table must be <= 200MHz + 2 * the entry
123 * in the low fid table
124 * - the parts can only step at <= 200 MHz intervals, odd fid values are
125 * supported in revision G and later revisions.
126 * - lowest frequency must be >= interprocessor hypertransport link speed
127 * (only applies to MP systems obviously)
128 */
129
130/* fids (frequency identifiers) are arranged in 2 tables - lo and hi */
131#define LO_FID_TABLE_TOP 7 /* fid values marking the boundary */
132#define HI_FID_TABLE_BOTTOM 8 /* between the low and high tables */
133
134#define LO_VCOFREQ_TABLE_TOP 1400 /* corresponding vco frequency values */
135#define HI_VCOFREQ_TABLE_BOTTOM 1600
136
137#define MIN_FREQ_RESOLUTION 200 /* fids jump by 2 matching freq jumps by 200 */
138
139#define MAX_FID 0x2a /* Spec only gives FID values as far as 5 GHz */
140#define LEAST_VID 0x3e /* Lowest (numerically highest) useful vid value */
141
142#define MIN_FREQ 800 /* Min and max freqs, per spec */
143#define MAX_FREQ 5000
144
145#define INVALID_FID_MASK 0xffffffc0 /* not a valid fid if these bits are set */
146#define INVALID_VID_MASK 0xffffffc0 /* not a valid vid if these bits are set */
147
148#define VID_OFF 0x3f
149
150#define STOP_GRANT_5NS 1 /* min poss memory access latency for voltage change */
151
152#define PLL_LOCK_CONVERSION (1000/5) /* ms to ns, then divide by clock period */
153
154#define MAXIMUM_VID_STEPS 1 /* Current cpus only allow a single step of 25mV */
155#define VST_UNITS_20US 20 /* Voltage Stabilization Time is in units of 20us */
156
157/*
158 * Most values of interest are encoded in a single field of the _PSS
159 * entries: the "control" value.
160 */
161
162#define IRT_SHIFT 30
163#define RVO_SHIFT 28
164#define EXT_TYPE_SHIFT 27
165#define PLL_L_SHIFT 20
166#define MVS_SHIFT 18
167#define VST_SHIFT 11
168#define VID_SHIFT 6
169#define IRT_MASK 3
170#define RVO_MASK 3
171#define EXT_TYPE_MASK 1
172#define PLL_L_MASK 0x7f
173#define MVS_MASK 3
174#define VST_MASK 0x7f
175#define VID_MASK 0x1f
176#define FID_MASK 0x1f
177#define EXT_VID_MASK 0x3f
178#define EXT_FID_MASK 0x3f
179
180
181/*
182 * Version 1.4 of the PSB table. This table is constructed by BIOS and is
183 * to tell the OS's power management driver which VIDs and FIDs are
184 * supported by this particular processor.
185 * If the data in the PSB / PST is wrong, then this driver will program the
186 * wrong values into hardware, which is very likely to lead to a crash.
187 */
188
189#define PSB_ID_STRING "AMDK7PNOW!"
190#define PSB_ID_STRING_LEN 10
191
192#define PSB_VERSION_1_4 0x14
193
194struct psb_s {
195 u8 signature[10];
196 u8 tableversion;
197 u8 flags1;
198 u16 vstable;
199 u8 flags2;
200 u8 num_tables;
201 u32 cpuid;
202 u8 plllocktime;
203 u8 maxfid;
204 u8 maxvid;
205 u8 numps;
206};
207
208/* Pairs of fid/vid values are appended to the version 1.4 PSB table. */
209struct pst_s {
210 u8 fid;
211 u8 vid;
212};
213
214#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "powernow-k8", msg)
215
216static int core_voltage_pre_transition(struct powernow_k8_data *data,
217		u32 reqvid, u32 reqfid);
218static int core_voltage_post_transition(struct powernow_k8_data *data, u32 reqvid);
219static int core_frequency_transition(struct powernow_k8_data *data, u32 reqfid);
220
221static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, unsigned int index);
222
223static int fill_powernow_table_pstate(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
224static int fill_powernow_table_fidvid(struct powernow_k8_data *data, struct cpufreq_frequency_table *powernow_table);
diff --git a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c b/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
deleted file mode 100644
index 435a996a613a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/sc520_freq.c
+++ /dev/null
@@ -1,194 +0,0 @@
1/*
2 * sc520_freq.c: cpufreq driver for the AMD Elan sc520
3 *
4 * Copyright (C) 2005 Sean Young <sean@mess.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 *
11 * Based on elanfreq.c
12 *
13 * 2005-03-30: - initial revision
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19
20#include <linux/delay.h>
21#include <linux/cpufreq.h>
22#include <linux/timex.h>
23#include <linux/io.h>
24
25#include <asm/msr.h>
26
27#define MMCR_BASE 0xfffef000 /* The default base address */
28#define OFFS_CPUCTL 0x2 /* CPU Control Register */
29
30static __u8 __iomem *cpuctl;
31
32#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
33 "sc520_freq", msg)
34#define PFX "sc520_freq: "
35
36static struct cpufreq_frequency_table sc520_freq_table[] = {
37 {0x01, 100000},
38 {0x02, 133000},
39 {0, CPUFREQ_TABLE_END},
40};
41
42static unsigned int sc520_freq_get_cpu_frequency(unsigned int cpu)
43{
44 u8 clockspeed_reg = *cpuctl;
45
46 switch (clockspeed_reg & 0x03) {
47 default:
48 printk(KERN_ERR PFX "error: cpuctl register has unexpected "
49 "value %02x\n", clockspeed_reg);
50 case 0x01:
51 return 100000;
52 case 0x02:
53 return 133000;
54 }
55}
56
57static void sc520_freq_set_cpu_state(unsigned int state)
58{
59
60 struct cpufreq_freqs freqs;
61 u8 clockspeed_reg;
62
63 freqs.old = sc520_freq_get_cpu_frequency(0);
64 freqs.new = sc520_freq_table[state].frequency;
65 freqs.cpu = 0; /* AMD Elan is UP */
66
67 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
68
69 dprintk("attempting to set frequency to %i kHz\n",
70 sc520_freq_table[state].frequency);
71
72 local_irq_disable();
73
74 clockspeed_reg = *cpuctl & ~0x03;
75 *cpuctl = clockspeed_reg | sc520_freq_table[state].index;
76
77 local_irq_enable();
78
79 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
80}
81
82static int sc520_freq_verify(struct cpufreq_policy *policy)
83{
84 return cpufreq_frequency_table_verify(policy, &sc520_freq_table[0]);
85}
86
87static int sc520_freq_target(struct cpufreq_policy *policy,
88 unsigned int target_freq,
89 unsigned int relation)
90{
91 unsigned int newstate = 0;
92
93 if (cpufreq_frequency_table_target(policy, sc520_freq_table,
94 target_freq, relation, &newstate))
95 return -EINVAL;
96
97 sc520_freq_set_cpu_state(newstate);
98
99 return 0;
100}
101
102
103/*
104 * Module init and exit code
105 */
106
107static int sc520_freq_cpu_init(struct cpufreq_policy *policy)
108{
109 struct cpuinfo_x86 *c = &cpu_data(0);
110 int result;
111
112 /* capability check */
113 if (c->x86_vendor != X86_VENDOR_AMD ||
114 c->x86 != 4 || c->x86_model != 9)
115 return -ENODEV;
116
117 /* cpuinfo and default policy values */
118 policy->cpuinfo.transition_latency = 1000000; /* 1ms */
119 policy->cur = sc520_freq_get_cpu_frequency(0);
120
121 result = cpufreq_frequency_table_cpuinfo(policy, sc520_freq_table);
122 if (result)
123 return result;
124
125 cpufreq_frequency_table_get_attr(sc520_freq_table, policy->cpu);
126
127 return 0;
128}
129
130
131static int sc520_freq_cpu_exit(struct cpufreq_policy *policy)
132{
133 cpufreq_frequency_table_put_attr(policy->cpu);
134 return 0;
135}
136
137
138static struct freq_attr *sc520_freq_attr[] = {
139 &cpufreq_freq_attr_scaling_available_freqs,
140 NULL,
141};
142
143
144static struct cpufreq_driver sc520_freq_driver = {
145 .get = sc520_freq_get_cpu_frequency,
146 .verify = sc520_freq_verify,
147 .target = sc520_freq_target,
148 .init = sc520_freq_cpu_init,
149 .exit = sc520_freq_cpu_exit,
150 .name = "sc520_freq",
151 .owner = THIS_MODULE,
152 .attr = sc520_freq_attr,
153};
154
155
156static int __init sc520_freq_init(void)
157{
158 struct cpuinfo_x86 *c = &cpu_data(0);
159 int err;
160
161 /* Test if we have the right hardware */
162 if (c->x86_vendor != X86_VENDOR_AMD ||
163 c->x86 != 4 || c->x86_model != 9) {
164 dprintk("no Elan SC520 processor found!\n");
165 return -ENODEV;
166 }
167 cpuctl = ioremap((unsigned long)(MMCR_BASE + OFFS_CPUCTL), 1);
168 if (!cpuctl) {
169 printk(KERN_ERR "sc520_freq: error: failed to remap memory\n");
170 return -ENOMEM;
171 }
172
173 err = cpufreq_register_driver(&sc520_freq_driver);
174 if (err)
175 iounmap(cpuctl);
176
177 return err;
178}
179
180
181static void __exit sc520_freq_exit(void)
182{
183 cpufreq_unregister_driver(&sc520_freq_driver);
184 iounmap(cpuctl);
185}
186
187
188MODULE_LICENSE("GPL");
189MODULE_AUTHOR("Sean Young <sean@mess.org>");
190MODULE_DESCRIPTION("cpufreq driver for AMD's Elan sc520 CPU");
191
192module_init(sc520_freq_init);
193module_exit(sc520_freq_exit);
194
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
deleted file mode 100644
index 9b1ff37de46a..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ /dev/null
@@ -1,636 +0,0 @@
1/*
2 * cpufreq driver for Enhanced SpeedStep, as found in Intel's Pentium
3 * M (part of the Centrino chipset).
4 *
5 * Since the original Pentium M, most new Intel CPUs support Enhanced
6 * SpeedStep.
7 *
8 * Despite the "SpeedStep" in the name, this is almost entirely unlike
9 * traditional SpeedStep.
10 *
11 * Modelled on speedstep.c
12 *
13 * Copyright (C) 2003 Jeremy Fitzhardinge <jeremy@goop.org>
14 */
15
16#include <linux/kernel.h>
17#include <linux/module.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/sched.h> /* current */
21#include <linux/delay.h>
22#include <linux/compiler.h>
23#include <linux/gfp.h>
24
25#include <asm/msr.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
28
29#define PFX "speedstep-centrino: "
30#define MAINTAINER "cpufreq@vger.kernel.org"
31
32#define dprintk(msg...) \
33 cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, "speedstep-centrino", msg)
34
35#define INTEL_MSR_RANGE (0xffff)
36
37struct cpu_id
38{
39 __u8 x86; /* CPU family */
40 __u8 x86_model; /* model */
41 __u8 x86_mask; /* stepping */
42};
43
44enum {
45 CPU_BANIAS,
46 CPU_DOTHAN_A1,
47 CPU_DOTHAN_A2,
48 CPU_DOTHAN_B0,
49 CPU_MP4HT_D0,
50 CPU_MP4HT_E0,
51};
52
53static const struct cpu_id cpu_ids[] = {
54 [CPU_BANIAS] = { 6, 9, 5 },
55 [CPU_DOTHAN_A1] = { 6, 13, 1 },
56 [CPU_DOTHAN_A2] = { 6, 13, 2 },
57 [CPU_DOTHAN_B0] = { 6, 13, 6 },
58 [CPU_MP4HT_D0] = {15, 3, 4 },
59 [CPU_MP4HT_E0] = {15, 4, 1 },
60};
61#define N_IDS ARRAY_SIZE(cpu_ids)
62
63struct cpu_model
64{
65 const struct cpu_id *cpu_id;
66 const char *model_name;
67 unsigned max_freq; /* max clock in kHz */
68
69 struct cpufreq_frequency_table *op_points; /* clock/voltage pairs */
70};
71static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
72 const struct cpu_id *x);
73
74/* Operating points for current CPU */
75static DEFINE_PER_CPU(struct cpu_model *, centrino_model);
76static DEFINE_PER_CPU(const struct cpu_id *, centrino_cpu);
77
78static struct cpufreq_driver centrino_driver;
79
80#ifdef CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE
81
82/* Computes the correct form for IA32_PERF_CTL MSR for a particular
83 frequency/voltage operating point; frequency in MHz, volts in mV.
84 This is stored as "index" in the structure. */
85#define OP(mhz, mv) \
86 { \
87 .frequency = (mhz) * 1000, \
88 .index = (((mhz)/100) << 8) | ((mv - 700) / 16) \
89 }
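/*
 * For example, OP(1000, 1004) (used in the banias_1000 table below) yields
 * .frequency = 1000000 kHz and
 * .index = ((1000/100) << 8) | ((1004 - 700) / 16) = 0xa00 | 0x13 = 0xa13,
 * i.e. the encoded IA32_PERF_CTL value for the 1.0 GHz / 1.004 V point.
 */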
90
91/*
92 * These voltage tables were derived from the Intel Pentium M
93 * datasheet, document 25261202.pdf, Table 5. I have verified they
94 * are consistent with my IBM ThinkPad X31, which has a 1.3GHz Pentium
95 * M.
96 */
97
98/* Ultra Low Voltage Intel Pentium M processor 900MHz (Banias) */
99static struct cpufreq_frequency_table banias_900[] =
100{
101 OP(600, 844),
102 OP(800, 988),
103 OP(900, 1004),
104 { .frequency = CPUFREQ_TABLE_END }
105};
106
107/* Ultra Low Voltage Intel Pentium M processor 1000MHz (Banias) */
108static struct cpufreq_frequency_table banias_1000[] =
109{
110 OP(600, 844),
111 OP(800, 972),
112 OP(900, 988),
113 OP(1000, 1004),
114 { .frequency = CPUFREQ_TABLE_END }
115};
116
117/* Low Voltage Intel Pentium M processor 1.10GHz (Banias) */
118static struct cpufreq_frequency_table banias_1100[] =
119{
120 OP( 600, 956),
121 OP( 800, 1020),
122 OP( 900, 1100),
123 OP(1000, 1164),
124 OP(1100, 1180),
125 { .frequency = CPUFREQ_TABLE_END }
126};
127
128
129/* Low Voltage Intel Pentium M processor 1.20GHz (Banias) */
130static struct cpufreq_frequency_table banias_1200[] =
131{
132 OP( 600, 956),
133 OP( 800, 1004),
134 OP( 900, 1020),
135 OP(1000, 1100),
136 OP(1100, 1164),
137 OP(1200, 1180),
138 { .frequency = CPUFREQ_TABLE_END }
139};
140
141/* Intel Pentium M processor 1.30GHz (Banias) */
142static struct cpufreq_frequency_table banias_1300[] =
143{
144 OP( 600, 956),
145 OP( 800, 1260),
146 OP(1000, 1292),
147 OP(1200, 1356),
148 OP(1300, 1388),
149 { .frequency = CPUFREQ_TABLE_END }
150};
151
152/* Intel Pentium M processor 1.40GHz (Banias) */
153static struct cpufreq_frequency_table banias_1400[] =
154{
155 OP( 600, 956),
156 OP( 800, 1180),
157 OP(1000, 1308),
158 OP(1200, 1436),
159 OP(1400, 1484),
160 { .frequency = CPUFREQ_TABLE_END }
161};
162
163/* Intel Pentium M processor 1.50GHz (Banias) */
164static struct cpufreq_frequency_table banias_1500[] =
165{
166 OP( 600, 956),
167 OP( 800, 1116),
168 OP(1000, 1228),
169 OP(1200, 1356),
170 OP(1400, 1452),
171 OP(1500, 1484),
172 { .frequency = CPUFREQ_TABLE_END }
173};
174
175/* Intel Pentium M processor 1.60GHz (Banias) */
176static struct cpufreq_frequency_table banias_1600[] =
177{
178 OP( 600, 956),
179 OP( 800, 1036),
180 OP(1000, 1164),
181 OP(1200, 1276),
182 OP(1400, 1420),
183 OP(1600, 1484),
184 { .frequency = CPUFREQ_TABLE_END }
185};
186
187/* Intel Pentium M processor 1.70GHz (Banias) */
188static struct cpufreq_frequency_table banias_1700[] =
189{
190 OP( 600, 956),
191 OP( 800, 1004),
192 OP(1000, 1116),
193 OP(1200, 1228),
194 OP(1400, 1308),
195 OP(1700, 1484),
196 { .frequency = CPUFREQ_TABLE_END }
197};
198#undef OP
199
200#define _BANIAS(cpuid, max, name) \
201{ .cpu_id = cpuid, \
202 .model_name = "Intel(R) Pentium(R) M processor " name "MHz", \
203 .max_freq = (max)*1000, \
204 .op_points = banias_##max, \
205}
206#define BANIAS(max) _BANIAS(&cpu_ids[CPU_BANIAS], max, #max)
207
208/* CPU models, their operating frequency range, and freq/voltage
209 operating points */
210static struct cpu_model models[] =
211{
212 _BANIAS(&cpu_ids[CPU_BANIAS], 900, " 900"),
213 BANIAS(1000),
214 BANIAS(1100),
215 BANIAS(1200),
216 BANIAS(1300),
217 BANIAS(1400),
218 BANIAS(1500),
219 BANIAS(1600),
220 BANIAS(1700),
221
222 /* NULL model_name is a wildcard */
223 { &cpu_ids[CPU_DOTHAN_A1], NULL, 0, NULL },
224 { &cpu_ids[CPU_DOTHAN_A2], NULL, 0, NULL },
225 { &cpu_ids[CPU_DOTHAN_B0], NULL, 0, NULL },
226 { &cpu_ids[CPU_MP4HT_D0], NULL, 0, NULL },
227 { &cpu_ids[CPU_MP4HT_E0], NULL, 0, NULL },
228
229 { NULL, }
230};
231#undef _BANIAS
232#undef BANIAS
233
234static int centrino_cpu_init_table(struct cpufreq_policy *policy)
235{
236 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
237 struct cpu_model *model;
238
239 for(model = models; model->cpu_id != NULL; model++)
240 if (centrino_verify_cpu_id(cpu, model->cpu_id) &&
241 (model->model_name == NULL ||
242 strcmp(cpu->x86_model_id, model->model_name) == 0))
243 break;
244
245 if (model->cpu_id == NULL) {
246 /* No match at all */
247 dprintk("no support for CPU model \"%s\": "
248 "send /proc/cpuinfo to " MAINTAINER "\n",
249 cpu->x86_model_id);
250 return -ENOENT;
251 }
252
253 if (model->op_points == NULL) {
254 /* Matched a non-match */
255 dprintk("no table support for CPU model \"%s\"\n",
256 cpu->x86_model_id);
257 dprintk("try using the acpi-cpufreq driver\n");
258 return -ENOENT;
259 }
260
261 per_cpu(centrino_model, policy->cpu) = model;
262
263 dprintk("found \"%s\": max frequency: %dkHz\n",
264 model->model_name, model->max_freq);
265
266 return 0;
267}
268
269#else
270static inline int centrino_cpu_init_table(struct cpufreq_policy *policy)
271{
272 return -ENODEV;
273}
274#endif /* CONFIG_X86_SPEEDSTEP_CENTRINO_TABLE */
275
276static int centrino_verify_cpu_id(const struct cpuinfo_x86 *c,
277 const struct cpu_id *x)
278{
279 if ((c->x86 == x->x86) &&
280 (c->x86_model == x->x86_model) &&
281 (c->x86_mask == x->x86_mask))
282 return 1;
283 return 0;
284}
285
286/* To be called only after centrino_model is initialized */
287static unsigned extract_clock(unsigned msr, unsigned int cpu, int failsafe)
288{
289 int i;
290
291 /*
292 * Extract clock in kHz from PERF_CTL value
293 * for centrino, as some DSDTs are buggy.
294 * Ideally, this can be done using the acpi_data structure.
295 */
296 if ((per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_BANIAS]) ||
297 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_A1]) ||
298 (per_cpu(centrino_cpu, cpu) == &cpu_ids[CPU_DOTHAN_B0])) {
299 msr = (msr >> 8) & 0xff;
300 return msr * 100000;
301 }
302
303 if ((!per_cpu(centrino_model, cpu)) ||
304 (!per_cpu(centrino_model, cpu)->op_points))
305 return 0;
306
307 msr &= 0xffff;
308 for (i = 0;
309 per_cpu(centrino_model, cpu)->op_points[i].frequency
310 != CPUFREQ_TABLE_END;
311 i++) {
312 if (msr == per_cpu(centrino_model, cpu)->op_points[i].index)
313 return per_cpu(centrino_model, cpu)->
314 op_points[i].frequency;
315 }
316 if (failsafe)
317 return per_cpu(centrino_model, cpu)->op_points[i-1].frequency;
318 else
319 return 0;
320}
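/*
 * For example, on a Banias core a PERF_STATUS low word of 0xa13 (the
 * encoding produced by OP(1000, 1004) in the table above) decodes as
 * ((0xa13 >> 8) & 0xff) * 100000 = 10 * 100000 = 1000000 kHz, i.e. 1.0 GHz.
 */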
321
322/* Return the current CPU frequency in kHz */
323static unsigned int get_cur_freq(unsigned int cpu)
324{
325 unsigned l, h;
326 unsigned clock_freq;
327
328 rdmsr_on_cpu(cpu, MSR_IA32_PERF_STATUS, &l, &h);
329 clock_freq = extract_clock(l, cpu, 0);
330
331 if (unlikely(clock_freq == 0)) {
332 /*
333 * On some CPUs, we can see transient MSR values (which are
334 * not present in _PSS), while CPU is doing some automatic
335 * P-state transition (like TM2). Get the last freq set
336 * in PERF_CTL.
337 */
338 rdmsr_on_cpu(cpu, MSR_IA32_PERF_CTL, &l, &h);
339 clock_freq = extract_clock(l, cpu, 1);
340 }
341 return clock_freq;
342}
343
344
345static int centrino_cpu_init(struct cpufreq_policy *policy)
346{
347 struct cpuinfo_x86 *cpu = &cpu_data(policy->cpu);
348 unsigned freq;
349 unsigned l, h;
350 int ret;
351 int i;
352
353 /* Only Intel makes Enhanced Speedstep-capable CPUs */
354 if (cpu->x86_vendor != X86_VENDOR_INTEL ||
355 !cpu_has(cpu, X86_FEATURE_EST))
356 return -ENODEV;
357
358 if (cpu_has(cpu, X86_FEATURE_CONSTANT_TSC))
359 centrino_driver.flags |= CPUFREQ_CONST_LOOPS;
360
361 if (policy->cpu != 0)
362 return -ENODEV;
363
364 for (i = 0; i < N_IDS; i++)
365 if (centrino_verify_cpu_id(cpu, &cpu_ids[i]))
366 break;
367
368 if (i != N_IDS)
369 per_cpu(centrino_cpu, policy->cpu) = &cpu_ids[i];
370
371 if (!per_cpu(centrino_cpu, policy->cpu)) {
372 dprintk("found unsupported CPU with "
373 "Enhanced SpeedStep: send /proc/cpuinfo to "
374 MAINTAINER "\n");
375 return -ENODEV;
376 }
377
378 if (centrino_cpu_init_table(policy)) {
379 return -ENODEV;
380 }
381
382 /* Check to see if Enhanced SpeedStep is enabled, and try to
383 enable it if not. */
384 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
385
386 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
387 l |= MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP;
388 dprintk("trying to enable Enhanced SpeedStep (%x)\n", l);
389 wrmsr(MSR_IA32_MISC_ENABLE, l, h);
390
391 /* check to see if it stuck */
392 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
393 if (!(l & MSR_IA32_MISC_ENABLE_ENHANCED_SPEEDSTEP)) {
394 printk(KERN_INFO PFX
395 "couldn't enable Enhanced SpeedStep\n");
396 return -ENODEV;
397 }
398 }
399
400 freq = get_cur_freq(policy->cpu);
401 policy->cpuinfo.transition_latency = 10000;
402 /* 10uS transition latency */
403 policy->cur = freq;
404
405 dprintk("centrino_cpu_init: cur=%dkHz\n", policy->cur);
406
407 ret = cpufreq_frequency_table_cpuinfo(policy,
408 per_cpu(centrino_model, policy->cpu)->op_points);
409 if (ret)
410 return (ret);
411
412 cpufreq_frequency_table_get_attr(
413 per_cpu(centrino_model, policy->cpu)->op_points, policy->cpu);
414
415 return 0;
416}
417
418static int centrino_cpu_exit(struct cpufreq_policy *policy)
419{
420 unsigned int cpu = policy->cpu;
421
422 if (!per_cpu(centrino_model, cpu))
423 return -ENODEV;
424
425 cpufreq_frequency_table_put_attr(cpu);
426
427 per_cpu(centrino_model, cpu) = NULL;
428
429 return 0;
430}
431
432/**
433 * centrino_verify - verifies a new CPUFreq policy
434 * @policy: new policy
435 *
436 * Limit must be within this model's frequency range; at least one
437 * border must be included.
438 */
439static int centrino_verify (struct cpufreq_policy *policy)
440{
441 return cpufreq_frequency_table_verify(policy,
442 per_cpu(centrino_model, policy->cpu)->op_points);
443}
444
445/**
446 * centrino_setpolicy - set a new CPUFreq policy
447 * @policy: new policy
448 * @target_freq: the target frequency
449 * @relation: how that frequency relates to achieved frequency
450 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
451 *
452 * Sets a new CPUFreq policy.
453 */
454static int centrino_target (struct cpufreq_policy *policy,
455 unsigned int target_freq,
456 unsigned int relation)
457{
458 unsigned int newstate = 0;
459 unsigned int msr, oldmsr = 0, h = 0, cpu = policy->cpu;
460 struct cpufreq_freqs freqs;
461 int retval = 0;
462 unsigned int j, k, first_cpu, tmp;
463 cpumask_var_t covered_cpus;
464
465 if (unlikely(!zalloc_cpumask_var(&covered_cpus, GFP_KERNEL)))
466 return -ENOMEM;
467
468 if (unlikely(per_cpu(centrino_model, cpu) == NULL)) {
469 retval = -ENODEV;
470 goto out;
471 }
472
473 if (unlikely(cpufreq_frequency_table_target(policy,
474 per_cpu(centrino_model, cpu)->op_points,
475 target_freq,
476 relation,
477 &newstate))) {
478 retval = -EINVAL;
479 goto out;
480 }
481
482 first_cpu = 1;
483 for_each_cpu(j, policy->cpus) {
484 int good_cpu;
485
486 /* cpufreq holds the hotplug lock, so we are safe here */
487 if (!cpu_online(j))
488 continue;
489
490 /*
491 * Support for SMP systems.
492 * Make sure we are running on CPU that wants to change freq
493 */
494 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
495 good_cpu = cpumask_any_and(policy->cpus,
496 cpu_online_mask);
497 else
498 good_cpu = j;
499
500 if (good_cpu >= nr_cpu_ids) {
501 dprintk("couldn't limit to CPUs in this domain\n");
502 retval = -EAGAIN;
503 if (first_cpu) {
504 /* We haven't started the transition yet. */
505 goto out;
506 }
507 break;
508 }
509
510 msr = per_cpu(centrino_model, cpu)->op_points[newstate].index;
511
512 if (first_cpu) {
513 rdmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, &oldmsr, &h);
514 if (msr == (oldmsr & 0xffff)) {
515 dprintk("no change needed - msr was and needs "
516 "to be %x\n", oldmsr);
517 retval = 0;
518 goto out;
519 }
520
521 freqs.old = extract_clock(oldmsr, cpu, 0);
522 freqs.new = extract_clock(msr, cpu, 0);
523
524 dprintk("target=%dkHz old=%d new=%d msr=%04x\n",
525 target_freq, freqs.old, freqs.new, msr);
526
527 for_each_cpu(k, policy->cpus) {
528 if (!cpu_online(k))
529 continue;
530 freqs.cpu = k;
531 cpufreq_notify_transition(&freqs,
532 CPUFREQ_PRECHANGE);
533 }
534
535 first_cpu = 0;
536 /* all but 16 LSB are reserved, treat them with care */
537 oldmsr &= ~0xffff;
538 msr &= 0xffff;
539 oldmsr |= msr;
540 }
541
542 wrmsr_on_cpu(good_cpu, MSR_IA32_PERF_CTL, oldmsr, h);
543 if (policy->shared_type == CPUFREQ_SHARED_TYPE_ANY)
544 break;
545
546 cpumask_set_cpu(j, covered_cpus);
547 }
548
549 for_each_cpu(k, policy->cpus) {
550 if (!cpu_online(k))
551 continue;
552 freqs.cpu = k;
553 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
554 }
555
556 if (unlikely(retval)) {
557 /*
558 * We have failed halfway through the frequency change.
559 * We have sent callbacks to policy->cpus and
560			 * MSRs have already been written on covered_cpus.
561 * Best effort undo..
562 */
563
564 for_each_cpu(j, covered_cpus)
565 wrmsr_on_cpu(j, MSR_IA32_PERF_CTL, oldmsr, h);
566
567 tmp = freqs.new;
568 freqs.new = freqs.old;
569 freqs.old = tmp;
570 for_each_cpu(j, policy->cpus) {
571 if (!cpu_online(j))
572 continue;
573 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
574 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
575 }
576 }
577 retval = 0;
578
579out:
580 free_cpumask_var(covered_cpus);
581 return retval;
582}
583
584static struct freq_attr* centrino_attr[] = {
585 &cpufreq_freq_attr_scaling_available_freqs,
586 NULL,
587};
588
589static struct cpufreq_driver centrino_driver = {
590 .name = "centrino", /* should be speedstep-centrino,
591 but there's a 16 char limit */
592 .init = centrino_cpu_init,
593 .exit = centrino_cpu_exit,
594 .verify = centrino_verify,
595 .target = centrino_target,
596 .get = get_cur_freq,
597 .attr = centrino_attr,
598 .owner = THIS_MODULE,
599};
600
601
602/**
603 * centrino_init - initializes the Enhanced SpeedStep CPUFreq driver
604 *
605 * Initializes the Enhanced SpeedStep support. Returns -ENODEV on
606 * unsupported devices, -ENOENT if there's no voltage table for this
607 * particular CPU model, -EINVAL on problems during initialization,
608 * and zero on success.
609 *
610 * This is quite picky. Not only does the CPU have to advertise the
611 * "est" flag in the cpuid capability flags, we look for a specific
612 * CPU model and stepping, and we need to have the exact model name in
613 * our voltage tables. That is, be paranoid about not releasing
614 * someone's valuable magic smoke.
615 */
616static int __init centrino_init(void)
617{
618 struct cpuinfo_x86 *cpu = &cpu_data(0);
619
620 if (!cpu_has(cpu, X86_FEATURE_EST))
621 return -ENODEV;
622
623 return cpufreq_register_driver(&centrino_driver);
624}
625
626static void __exit centrino_exit(void)
627{
628 cpufreq_unregister_driver(&centrino_driver);
629}
630
631MODULE_AUTHOR ("Jeremy Fitzhardinge <jeremy@goop.org>");
632MODULE_DESCRIPTION ("Enhanced SpeedStep driver for Intel Pentium M processors.");
633MODULE_LICENSE ("GPL");
634
635late_initcall(centrino_init);
636module_exit(centrino_exit);
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
deleted file mode 100644
index 561758e95180..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ /dev/null
@@ -1,452 +0,0 @@
1/*
2 * (C) 2001 Dave Jones, Arjan van de ven.
3 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
4 *
5 * Licensed under the terms of the GNU GPL License version 2.
6 * Based upon reverse engineered information, and on Intel documentation
7 * for chipsets ICH2-M and ICH3-M.
8 *
9 * Many thanks to Ducrot Bruno for finding and fixing the last
10 * "missing link" for ICH2-M/ICH3-M support, and to Thomas Winkler
11 * for extensive testing.
12 *
13 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
14 */
15
16
17/*********************************************************************
18 * SPEEDSTEP - DEFINITIONS *
19 *********************************************************************/
20
21#include <linux/kernel.h>
22#include <linux/module.h>
23#include <linux/init.h>
24#include <linux/cpufreq.h>
25#include <linux/pci.h>
26#include <linux/sched.h>
27
28#include "speedstep-lib.h"
29
30
31/* speedstep_chipset:
32 * It is necessary to know which chipset is used. As accesses to
33 * this device occur at various places in this module, we need a
34 * static struct pci_dev * pointing to that device.
35 */
36static struct pci_dev *speedstep_chipset_dev;
37
38
39/* speedstep_processor
40 */
41static enum speedstep_processor speedstep_processor;
42
43static u32 pmbase;
44
45/*
46 * There are only two frequency states for each processor. Values
47 * are in kHz for the time being.
48 */
49static struct cpufreq_frequency_table speedstep_freqs[] = {
50 {SPEEDSTEP_HIGH, 0},
51 {SPEEDSTEP_LOW, 0},
52 {0, CPUFREQ_TABLE_END},
53};
54
55
56#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
57 "speedstep-ich", msg)
58
59
60/**
61 * speedstep_find_register - read the PMBASE address
62 *
63 * Returns: -ENODEV if no register could be found
64 */
65static int speedstep_find_register(void)
66{
67 if (!speedstep_chipset_dev)
68 return -ENODEV;
69
70 /* get PMBASE */
71 pci_read_config_dword(speedstep_chipset_dev, 0x40, &pmbase);
72 if (!(pmbase & 0x01)) {
73 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
74 return -ENODEV;
75 }
76
77 pmbase &= 0xFFFFFFFE;
78 if (!pmbase) {
79 printk(KERN_ERR "speedstep-ich: could not find speedstep register\n");
80 return -ENODEV;
81 }
82
83 dprintk("pmbase is 0x%x\n", pmbase);
84 return 0;
85}
86
87/**
88 * speedstep_set_state - set the SpeedStep state
89 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
90 *
91 * Tries to change the SpeedStep state. Can be called from
92 * smp_call_function_single.
93 */
94static void speedstep_set_state(unsigned int state)
95{
96 u8 pm2_blk;
97 u8 value;
98 unsigned long flags;
99
100 if (state > 0x1)
101 return;
102
103 /* Disable IRQs */
104 local_irq_save(flags);
105
106 /* read state */
107 value = inb(pmbase + 0x50);
108
109 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
110
111 /* write new state */
112 value &= 0xFE;
113 value |= state;
114
115 dprintk("writing 0x%x to pmbase 0x%x + 0x50\n", value, pmbase);
116
117 /* Disable bus master arbitration */
118 pm2_blk = inb(pmbase + 0x20);
119 pm2_blk |= 0x01;
120 outb(pm2_blk, (pmbase + 0x20));
121
122 /* Actual transition */
123 outb(value, (pmbase + 0x50));
124
125 /* Restore bus master arbitration */
126 pm2_blk &= 0xfe;
127 outb(pm2_blk, (pmbase + 0x20));
128
129 /* check if transition was successful */
130 value = inb(pmbase + 0x50);
131
132 /* Enable IRQs */
133 local_irq_restore(flags);
134
135 dprintk("read at pmbase 0x%x + 0x50 returned 0x%x\n", pmbase, value);
136
137 if (state == (value & 0x1))
138 dprintk("change to %u MHz succeeded\n",
139 speedstep_get_frequency(speedstep_processor) / 1000);
140 else
141 printk(KERN_ERR "cpufreq: change failed - I/O error\n");
142
143 return;
144}
145
146/* Wrapper for smp_call_function_single. */
147static void _speedstep_set_state(void *_state)
148{
149 speedstep_set_state(*(unsigned int *)_state);
150}
151
152/**
153 * speedstep_activate - activate SpeedStep control in the chipset
154 *
155 * Tries to activate the SpeedStep status and control registers.
156 * Returns -EINVAL on an unsupported chipset, and zero on success.
157 */
158static int speedstep_activate(void)
159{
160 u16 value = 0;
161
162 if (!speedstep_chipset_dev)
163 return -EINVAL;
164
165 pci_read_config_word(speedstep_chipset_dev, 0x00A0, &value);
166 if (!(value & 0x08)) {
167 value |= 0x08;
168 dprintk("activating SpeedStep (TM) registers\n");
169 pci_write_config_word(speedstep_chipset_dev, 0x00A0, value);
170 }
171
172 return 0;
173}
174
175
176/**
177 * speedstep_detect_chipset - detect the Southbridge which contains SpeedStep logic
178 *
179 * Detects ICH2-M, ICH3-M and ICH4-M so far. The pci_dev points to
180 * the LPC bridge / PM module which contains all power-management
181 * functions. Returns the SPEEDSTEP_CHIPSET_-number for the detected
182 * chipset, or zero on failure.
183 */
184static unsigned int speedstep_detect_chipset(void)
185{
186 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
187 PCI_DEVICE_ID_INTEL_82801DB_12,
188 PCI_ANY_ID, PCI_ANY_ID,
189 NULL);
190 if (speedstep_chipset_dev)
191 return 4; /* 4-M */
192
193 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
194 PCI_DEVICE_ID_INTEL_82801CA_12,
195 PCI_ANY_ID, PCI_ANY_ID,
196 NULL);
197 if (speedstep_chipset_dev)
198 return 3; /* 3-M */
199
200
201 speedstep_chipset_dev = pci_get_subsys(PCI_VENDOR_ID_INTEL,
202 PCI_DEVICE_ID_INTEL_82801BA_10,
203 PCI_ANY_ID, PCI_ANY_ID,
204 NULL);
205 if (speedstep_chipset_dev) {
206 /* speedstep.c causes lockups on the Dell Inspiron 8000 and
207 * 8100, which use a pretty old revision of the 82815
208 * host bridge. Abort on these systems.
209 */
210 static struct pci_dev *hostbridge;
211
212 hostbridge = pci_get_subsys(PCI_VENDOR_ID_INTEL,
213 PCI_DEVICE_ID_INTEL_82815_MC,
214 PCI_ANY_ID, PCI_ANY_ID,
215 NULL);
216
217 if (!hostbridge)
218 return 2; /* 2-M */
219
220 if (hostbridge->revision < 5) {
221 dprintk("hostbridge does not support speedstep\n");
222 speedstep_chipset_dev = NULL;
223 pci_dev_put(hostbridge);
224 return 0;
225 }
226
227 pci_dev_put(hostbridge);
228 return 2; /* 2-M */
229 }
230
231 return 0;
232}
233
234static void get_freq_data(void *_speed)
235{
236 unsigned int *speed = _speed;
237
238 *speed = speedstep_get_frequency(speedstep_processor);
239}
240
241static unsigned int speedstep_get(unsigned int cpu)
242{
243 unsigned int speed;
244
245 /* The caller is expected to ensure the CPU is online. */
246 if (smp_call_function_single(cpu, get_freq_data, &speed, 1) != 0)
247 BUG();
248
249 dprintk("detected %u kHz as current frequency\n", speed);
250 return speed;
251}
252
253/**
254 * speedstep_target - set a new CPUFreq policy
255 * @policy: new policy
256 * @target_freq: the target frequency
257 * @relation: how that frequency relates to achieved frequency
258 * (CPUFREQ_RELATION_L or CPUFREQ_RELATION_H)
259 *
260 * Sets a new CPUFreq policy.
261 */
262static int speedstep_target(struct cpufreq_policy *policy,
263 unsigned int target_freq,
264 unsigned int relation)
265{
266 unsigned int newstate = 0, policy_cpu;
267 struct cpufreq_freqs freqs;
268 int i;
269
270 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
271 target_freq, relation, &newstate))
272 return -EINVAL;
273
274 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
275 freqs.old = speedstep_get(policy_cpu);
276 freqs.new = speedstep_freqs[newstate].frequency;
277 freqs.cpu = policy->cpu;
278
279 dprintk("transiting from %u to %u kHz\n", freqs.old, freqs.new);
280
281 /* no transition necessary */
282 if (freqs.old == freqs.new)
283 return 0;
284
285 for_each_cpu(i, policy->cpus) {
286 freqs.cpu = i;
287 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
288 }
289
290 smp_call_function_single(policy_cpu, _speedstep_set_state, &newstate,
291 true);
292
293 for_each_cpu(i, policy->cpus) {
294 freqs.cpu = i;
295 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
296 }
297
298 return 0;
299}
300
301
302/**
303 * speedstep_verify - verifies a new CPUFreq policy
304 * @policy: new policy
305 *
306 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
307 * at least one border included.
308 */
309static int speedstep_verify(struct cpufreq_policy *policy)
310{
311 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
312}
313
314struct get_freqs {
315 struct cpufreq_policy *policy;
316 int ret;
317};
318
319static void get_freqs_on_cpu(void *_get_freqs)
320{
321 struct get_freqs *get_freqs = _get_freqs;
322
323 get_freqs->ret =
324 speedstep_get_freqs(speedstep_processor,
325 &speedstep_freqs[SPEEDSTEP_LOW].frequency,
326 &speedstep_freqs[SPEEDSTEP_HIGH].frequency,
327 &get_freqs->policy->cpuinfo.transition_latency,
328 &speedstep_set_state);
329}
330
331static int speedstep_cpu_init(struct cpufreq_policy *policy)
332{
333 int result;
334 unsigned int policy_cpu, speed;
335 struct get_freqs gf;
336
337 /* only run on CPU to be set, or on its sibling */
338#ifdef CONFIG_SMP
339 cpumask_copy(policy->cpus, cpu_sibling_mask(policy->cpu));
340#endif
341 policy_cpu = cpumask_any_and(policy->cpus, cpu_online_mask);
342
343 /* detect low and high frequency and transition latency */
344 gf.policy = policy;
345 smp_call_function_single(policy_cpu, get_freqs_on_cpu, &gf, 1);
346 if (gf.ret)
347 return gf.ret;
348
349 /* get current speed setting */
350 speed = speedstep_get(policy_cpu);
351 if (!speed)
352 return -EIO;
353
354 dprintk("currently at %s speed setting - %i MHz\n",
355 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
356 ? "low" : "high",
357 (speed / 1000));
358
359 /* cpuinfo and default policy values */
360 policy->cur = speed;
361
362 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
363 if (result)
364 return result;
365
366 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
367
368 return 0;
369}
370
371
372static int speedstep_cpu_exit(struct cpufreq_policy *policy)
373{
374 cpufreq_frequency_table_put_attr(policy->cpu);
375 return 0;
376}
377
378static struct freq_attr *speedstep_attr[] = {
379 &cpufreq_freq_attr_scaling_available_freqs,
380 NULL,
381};
382
383
384static struct cpufreq_driver speedstep_driver = {
385 .name = "speedstep-ich",
386 .verify = speedstep_verify,
387 .target = speedstep_target,
388 .init = speedstep_cpu_init,
389 .exit = speedstep_cpu_exit,
390 .get = speedstep_get,
391 .owner = THIS_MODULE,
392 .attr = speedstep_attr,
393};
394
395
396/**
397 * speedstep_init - initializes the SpeedStep CPUFreq driver
398 *
399 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
400 * devices, -EINVAL on problems during initialization, and zero on
401 * success.
402 */
403static int __init speedstep_init(void)
404{
405 /* detect processor */
406 speedstep_processor = speedstep_detect_processor();
407 if (!speedstep_processor) {
408 dprintk("Intel(R) SpeedStep(TM) capable processor "
409 "not found\n");
410 return -ENODEV;
411 }
412
413 /* detect chipset */
414 if (!speedstep_detect_chipset()) {
415 dprintk("Intel(R) SpeedStep(TM) for this chipset not "
416 "(yet) available.\n");
417 return -ENODEV;
418 }
419
420 /* activate speedstep support */
421 if (speedstep_activate()) {
422 pci_dev_put(speedstep_chipset_dev);
423 return -EINVAL;
424 }
425
426 if (speedstep_find_register())
427 return -ENODEV;
428
429 return cpufreq_register_driver(&speedstep_driver);
430}
431
432
433/**
434 * speedstep_exit - unregisters SpeedStep support
435 *
436 * Unregisters SpeedStep support.
437 */
438static void __exit speedstep_exit(void)
439{
440 pci_dev_put(speedstep_chipset_dev);
441 cpufreq_unregister_driver(&speedstep_driver);
442}
443
444
445MODULE_AUTHOR("Dave Jones <davej@redhat.com>, "
446 "Dominik Brodowski <linux@brodo.de>");
447MODULE_DESCRIPTION("Speedstep driver for Intel mobile processors on chipsets "
448 "with ICH-M southbridges.");
449MODULE_LICENSE("GPL");
450
451module_init(speedstep_init);
452module_exit(speedstep_exit);
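
The driver above comes down to a little bit arithmetic: PMBASE is read from PCI config offset 0x40 with bit 0 acting as an I/O-space indicator, and the SpeedStep state is bit 0 of the byte at PMBASE + 0x50. Below is a minimal userspace sketch of just that arithmetic; the register values are hypothetical stand-ins for what pci_read_config_dword() and inb() would return, and no PCI or port I/O is performed.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint32_t pmbase_raw = 0x00001001;	/* hypothetical dword at config offset 0x40 */
	uint8_t old_state   = 0x04;		/* hypothetical byte read from PMBASE + 0x50 */
	unsigned int state  = 1;		/* SPEEDSTEP_LOW */

	/* bit 0 must be set for the register to decode to I/O space */
	if (!(pmbase_raw & 0x01)) {
		fprintf(stderr, "PMBASE does not decode to I/O space\n");
		return 1;
	}

	/* strip the I/O-space indicator bit to get the port base */
	uint32_t pmbase = pmbase_raw & 0xFFFFFFFE;

	/* merge the requested state into bit 0 of the state byte */
	uint8_t new_state = (old_state & 0xFE) | (state & 0x01);

	printf("pmbase = 0x%x, would write 0x%02x to 0x%x\n",
	       (unsigned int)pmbase, new_state, (unsigned int)(pmbase + 0x50));
	return 0;
}
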
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
deleted file mode 100644
index a94ec6be69fa..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ /dev/null
@@ -1,481 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/moduleparam.h>
14#include <linux/init.h>
15#include <linux/cpufreq.h>
16
17#include <asm/msr.h>
18#include <asm/tsc.h>
19#include "speedstep-lib.h"
20
21#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
22 "speedstep-lib", msg)
23
24#define PFX "speedstep-lib: "
25
26#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
27static int relaxed_check;
28#else
29#define relaxed_check 0
30#endif
31
32/*********************************************************************
33 * GET PROCESSOR CORE SPEED IN KHZ *
34 *********************************************************************/
35
36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
37{
38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
39 struct {
40 unsigned int ratio; /* Frequency Multiplier (x10) */
41 u8 bitmap; /* power on configuration bits
42 [27, 25:22] (in MSR 0x2a) */
43 } msr_decode_mult[] = {
44 { 30, 0x01 },
45 { 35, 0x05 },
46 { 40, 0x02 },
47 { 45, 0x06 },
48 { 50, 0x00 },
49 { 55, 0x04 },
50 { 60, 0x0b },
51 { 65, 0x0f },
52 { 70, 0x09 },
53 { 75, 0x0d },
54 { 80, 0x0a },
55 { 85, 0x26 },
56 { 90, 0x20 },
57 { 100, 0x2b },
58 { 0, 0xff } /* error or unknown value */
59 };
60
61 /* PIII(-M) FSB settings: see table b1-b of 24547206.pdf */
62 struct {
63 unsigned int value; /* Front Side Bus speed in MHz */
64 u8 bitmap; /* power on configuration bits [18: 19]
65 (in MSR 0x2a) */
66 } msr_decode_fsb[] = {
67 { 66, 0x0 },
68 { 100, 0x2 },
69 { 133, 0x1 },
70 { 0, 0xff}
71 };
72
73 u32 msr_lo, msr_tmp;
74 int i = 0, j = 0;
75
76 /* read MSR 0x2a - we only need the low 32 bits */
77 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
78 dprintk("P3 - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
79 msr_tmp = msr_lo;
80
81 /* decode the FSB */
82 msr_tmp &= 0x00c0000;
83 msr_tmp >>= 18;
84 while (msr_tmp != msr_decode_fsb[i].bitmap) {
85 if (msr_decode_fsb[i].bitmap == 0xff)
86 return 0;
87 i++;
88 }
89
90 /* decode the multiplier */
91 if (processor == SPEEDSTEP_CPU_PIII_C_EARLY) {
92 dprintk("workaround for early PIIIs\n");
93 msr_lo &= 0x03c00000;
94 } else
95 msr_lo &= 0x0bc00000;
96 msr_lo >>= 22;
97 while (msr_lo != msr_decode_mult[j].bitmap) {
98 if (msr_decode_mult[j].bitmap == 0xff)
99 return 0;
100 j++;
101 }
102
103 dprintk("speed is %u\n",
104 (msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100));
105
106 return msr_decode_mult[j].ratio * msr_decode_fsb[i].value * 100;
107}
108
109
110static unsigned int pentiumM_get_frequency(void)
111{
112 u32 msr_lo, msr_tmp;
113
114 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
115 dprintk("PM - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n", msr_lo, msr_tmp);
116
117 /* see table B-2 of 24547212.pdf */
118 if (msr_lo & 0x00040000) {
119 printk(KERN_DEBUG PFX "PM - invalid FSB: 0x%x 0x%x\n",
120 msr_lo, msr_tmp);
121 return 0;
122 }
123
124 msr_tmp = (msr_lo >> 22) & 0x1f;
125 dprintk("bits 22-26 are 0x%x, speed is %u\n",
126 msr_tmp, (msr_tmp * 100 * 1000));
127
128 return msr_tmp * 100 * 1000;
129}
130
131static unsigned int pentium_core_get_frequency(void)
132{
133 u32 fsb = 0;
134 u32 msr_lo, msr_tmp;
135 int ret;
136
137 rdmsr(MSR_FSB_FREQ, msr_lo, msr_tmp);
138 /* see table B-2 of 25366920.pdf */
139 switch (msr_lo & 0x07) {
140 case 5:
141 fsb = 100000;
142 break;
143 case 1:
144 fsb = 133333;
145 break;
146 case 3:
147 fsb = 166667;
148 break;
149 case 2:
150 fsb = 200000;
151 break;
152 case 0:
153 fsb = 266667;
154 break;
155 case 4:
156 fsb = 333333;
157 break;
158 default:
159 printk(KERN_ERR "PCORE - MSR_FSB_FREQ undefined value");
160 }
161
162 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_tmp);
163 dprintk("PCORE - MSR_IA32_EBL_CR_POWERON: 0x%x 0x%x\n",
164 msr_lo, msr_tmp);
165
166 msr_tmp = (msr_lo >> 22) & 0x1f;
167 dprintk("bits 22-26 are 0x%x, speed is %u\n",
168 msr_tmp, (msr_tmp * fsb));
169
170 ret = (msr_tmp * fsb);
171 return ret;
172}
173
174
175static unsigned int pentium4_get_frequency(void)
176{
177 struct cpuinfo_x86 *c = &boot_cpu_data;
178 u32 msr_lo, msr_hi, mult;
179 unsigned int fsb = 0;
180 unsigned int ret;
181 u8 fsb_code;
182
183 /* Pentium 4 Model 0 and 1 do not have the Core Clock Frequency
184 * to System Bus Frequency Ratio Field in the Processor Frequency
185 * Configuration Register of the MSR. Therefore the current
186 * frequency cannot be calculated and has to be measured.
187 */
188 if (c->x86_model < 2)
189 return cpu_khz;
190
191 rdmsr(0x2c, msr_lo, msr_hi);
192
193 dprintk("P4 - MSR_EBC_FREQUENCY_ID: 0x%x 0x%x\n", msr_lo, msr_hi);
194
195 /* decode the FSB: see IA-32 Intel (C) Architecture Software
196 * Developer's Manual, Volume 3: System Programming Guide,
197 * revision #12 in Table B-1: MSRs in the Pentium 4 and
198 * Intel Xeon Processors, on page B-4 and B-5.
199 */
200 fsb_code = (msr_lo >> 16) & 0x7;
201 switch (fsb_code) {
202 case 0:
203 fsb = 100 * 1000;
204 break;
205 case 1:
206 fsb = 13333 * 10;
207 break;
208 case 2:
209 fsb = 200 * 1000;
210 break;
211 }
212
213 if (!fsb)
214 printk(KERN_DEBUG PFX "couldn't detect FSB speed. "
215 "Please send an e-mail to <linux@brodo.de>\n");
216
217 /* Multiplier. */
218 mult = msr_lo >> 24;
219
220 dprintk("P4 - FSB %u kHz; Multiplier %u; Speed %u kHz\n",
221 fsb, mult, (fsb * mult));
222
223 ret = (fsb * mult);
224 return ret;
225}
226
227
228/* Warning: may get called from smp_call_function_single. */
229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
230{
231 switch (processor) {
232 case SPEEDSTEP_CPU_PCORE:
233 return pentium_core_get_frequency();
234 case SPEEDSTEP_CPU_PM:
235 return pentiumM_get_frequency();
236 case SPEEDSTEP_CPU_P4D:
237 case SPEEDSTEP_CPU_P4M:
238 return pentium4_get_frequency();
239 case SPEEDSTEP_CPU_PIII_T:
240 case SPEEDSTEP_CPU_PIII_C:
241 case SPEEDSTEP_CPU_PIII_C_EARLY:
242 return pentium3_get_frequency(processor);
243 default:
244 return 0;
245 };
246 return 0;
247}
248EXPORT_SYMBOL_GPL(speedstep_get_frequency);
249
250
251/*********************************************************************
252 * DETECT SPEEDSTEP-CAPABLE PROCESSOR *
253 *********************************************************************/
254
255unsigned int speedstep_detect_processor(void)
256{
257 struct cpuinfo_x86 *c = &cpu_data(0);
258 u32 ebx, msr_lo, msr_hi;
259
260 dprintk("x86: %x, model: %x\n", c->x86, c->x86_model);
261
262 if ((c->x86_vendor != X86_VENDOR_INTEL) ||
263 ((c->x86 != 6) && (c->x86 != 0xF)))
264 return 0;
265
266 if (c->x86 == 0xF) {
267 /* Intel Mobile Pentium 4-M
268 * or Intel Mobile Pentium 4 with 533 MHz FSB */
269 if (c->x86_model != 2)
270 return 0;
271
272 ebx = cpuid_ebx(0x00000001);
273 ebx &= 0x000000FF;
274
275 dprintk("ebx value is %x, x86_mask is %x\n", ebx, c->x86_mask);
276
277 switch (c->x86_mask) {
278 case 4:
279 /*
280 * B-stepping [M-P4-M]
281 * sample has ebx = 0x0f, production has 0x0e.
282 */
283 if ((ebx == 0x0e) || (ebx == 0x0f))
284 return SPEEDSTEP_CPU_P4M;
285 break;
286 case 7:
287 /*
288 * C-stepping [M-P4-M]
289 * needs to have ebx=0x0e, else it's a celeron:
290 * cf. 25130917.pdf / page 7, footnote 5 even
291 * though 25072120.pdf / page 7 doesn't say
292 * samples are only of B-stepping...
293 */
294 if (ebx == 0x0e)
295 return SPEEDSTEP_CPU_P4M;
296 break;
297 case 9:
298 /*
299 * D-stepping [M-P4-M or M-P4/533]
300 *
301 * this is totally strange: CPUID 0x0F29 is
302 * used by M-P4-M, M-P4/533 and(!) Celeron CPUs.
303 * The latter need to be sorted out as they don't
304 * support speedstep.
305 * Celerons with CPUID 0x0F29 may have either
306 * ebx=0x8 or 0xf -- 25130917.pdf doesn't say anything
307 * specific.
308 * M-P4-Ms may have either ebx=0xe or 0xf [see above]
309 * M-P4/533 have either ebx=0xe or 0xf. [25317607.pdf]
310 * also, M-P4M HTs have ebx=0x8, too
311 * For now, they are distinguished by the model_id
312 * string
313 */
314 if ((ebx == 0x0e) ||
315 (strstr(c->x86_model_id,
316 "Mobile Intel(R) Pentium(R) 4") != NULL))
317 return SPEEDSTEP_CPU_P4M;
318 break;
319 default:
320 break;
321 }
322 return 0;
323 }
324
325 switch (c->x86_model) {
326 case 0x0B: /* Intel PIII [Tualatin] */
327 /* cpuid_ebx(1) is 0x04 for desktop PIII,
328 * 0x06 for mobile PIII-M */
329 ebx = cpuid_ebx(0x00000001);
330 dprintk("ebx is %x\n", ebx);
331
332 ebx &= 0x000000FF;
333
334 if (ebx != 0x06)
335 return 0;
336
337 /* So far all PIII-M processors support SpeedStep. See
338 * Intel's 24540640.pdf of June 2003
339 */
340 return SPEEDSTEP_CPU_PIII_T;
341
342 case 0x08: /* Intel PIII [Coppermine] */
343
344 /* all mobile PIII Coppermines have FSB 100 MHz
345 * ==> sort out a few desktop PIIIs. */
346 rdmsr(MSR_IA32_EBL_CR_POWERON, msr_lo, msr_hi);
347 dprintk("Coppermine: MSR_IA32_EBL_CR_POWERON is 0x%x, 0x%x\n",
348 msr_lo, msr_hi);
349 msr_lo &= 0x00c0000;
350 if (msr_lo != 0x0080000)
351 return 0;
352
353 /*
354 * If the processor is a mobile version,
355 * the platform ID has bit 50 set;
356 * it has SpeedStep technology if either
357 * bit 56 or 57 is set.
358 */
359 rdmsr(MSR_IA32_PLATFORM_ID, msr_lo, msr_hi);
360 dprintk("Coppermine: MSR_IA32_PLATFORM ID is 0x%x, 0x%x\n",
361 msr_lo, msr_hi);
362 if ((msr_hi & (1<<18)) &&
363 (relaxed_check ? 1 : (msr_hi & (3<<24)))) {
364 if (c->x86_mask == 0x01) {
365 dprintk("early PIII version\n");
366 return SPEEDSTEP_CPU_PIII_C_EARLY;
367 } else
368 return SPEEDSTEP_CPU_PIII_C;
369 }
370
371 default:
372 return 0;
373 }
374}
375EXPORT_SYMBOL_GPL(speedstep_detect_processor);
376
377
378/*********************************************************************
379 * DETECT SPEEDSTEP SPEEDS *
380 *********************************************************************/
381
382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
383 unsigned int *low_speed,
384 unsigned int *high_speed,
385 unsigned int *transition_latency,
386 void (*set_state) (unsigned int state))
387{
388 unsigned int prev_speed;
389 unsigned int ret = 0;
390 unsigned long flags;
391 struct timeval tv1, tv2;
392
393 if ((!processor) || (!low_speed) || (!high_speed) || (!set_state))
394 return -EINVAL;
395
396 dprintk("trying to determine both speeds\n");
397
398 /* get current speed */
399 prev_speed = speedstep_get_frequency(processor);
400 if (!prev_speed)
401 return -EIO;
402
403 dprintk("previous speed is %u\n", prev_speed);
404
405 local_irq_save(flags);
406
407 /* switch to low state */
408 set_state(SPEEDSTEP_LOW);
409 *low_speed = speedstep_get_frequency(processor);
410 if (!*low_speed) {
411 ret = -EIO;
412 goto out;
413 }
414
415 dprintk("low speed is %u\n", *low_speed);
416
417 /* start latency measurement */
418 if (transition_latency)
419 do_gettimeofday(&tv1);
420
421 /* switch to high state */
422 set_state(SPEEDSTEP_HIGH);
423
424 /* end latency measurement */
425 if (transition_latency)
426 do_gettimeofday(&tv2);
427
428 *high_speed = speedstep_get_frequency(processor);
429 if (!*high_speed) {
430 ret = -EIO;
431 goto out;
432 }
433
434 dprintk("high speed is %u\n", *high_speed);
435
436 if (*low_speed == *high_speed) {
437 ret = -ENODEV;
438 goto out;
439 }
440
441 /* switch to previous state, if necessary */
442 if (*high_speed != prev_speed)
443 set_state(SPEEDSTEP_LOW);
444
445 if (transition_latency) {
446 *transition_latency = (tv2.tv_sec - tv1.tv_sec) * USEC_PER_SEC +
447 tv2.tv_usec - tv1.tv_usec;
448 dprintk("transition latency is %u uSec\n", *transition_latency);
449
450 /* convert uSec to nSec and add 20% for safety reasons */
451 *transition_latency *= 1200;
452
453 /* check if the latency measurement is too high or too low
454 * and set it to a safe value (500uSec) in that case
455 */
456 if (*transition_latency > 10000000 ||
457 *transition_latency < 50000) {
458 printk(KERN_WARNING PFX "frequency transition "
459 "measured seems out of range (%u "
460 "nSec), falling back to a safe one of"
461 "%u nSec.\n",
462 *transition_latency, 500000);
463 *transition_latency = 500000;
464 }
465 }
466
467out:
468 local_irq_restore(flags);
469 return ret;
470}
471EXPORT_SYMBOL_GPL(speedstep_get_freqs);
472
473#ifdef CONFIG_X86_SPEEDSTEP_RELAXED_CAP_CHECK
474module_param(relaxed_check, int, 0444);
475MODULE_PARM_DESC(relaxed_check,
476 "Don't do all checks for speedstep capability.");
477#endif
478
479MODULE_AUTHOR("Dominik Brodowski <linux@brodo.de>");
480MODULE_DESCRIPTION("Library for Intel SpeedStep 1 or 2 cpufreq drivers.");
481MODULE_LICENSE("GPL");
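
The transition-latency handling in speedstep_get_freqs() above is plain arithmetic on a measured microsecond delta: scale to nanoseconds with a 20% safety margin, then clamp implausible results to 500 usec. A standalone sketch of that calculation, using a hypothetical measured value in place of the do_gettimeofday() pair:

#include <stdio.h>

int main(void)
{
	unsigned int measured_usec = 180;	/* hypothetical measured switch time */
	unsigned int latency;

	/* convert usec to nsec and add 20% for safety reasons */
	latency = measured_usec * 1200;

	/* reject out-of-range measurements and fall back to 500 usec */
	if (latency > 10000000 || latency < 50000)
		latency = 500000;

	printf("transition latency: %u nsec\n", latency);
	return 0;
}
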
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
deleted file mode 100644
index 70d9cea1219d..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ /dev/null
@@ -1,49 +0,0 @@
1/*
2 * (C) 2002 - 2003 Dominik Brodowski <linux@brodo.de>
3 *
4 * Licensed under the terms of the GNU GPL License version 2.
5 *
6 * Library for common functions for Intel SpeedStep v.1 and v.2 support
7 *
8 * BIG FAT DISCLAIMER: Work in progress code. Possibly *dangerous*
9 */
10
11
12
13/* processors */
14enum speedstep_processor {
15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19/* the following processors are not speedstep-capable and are not auto-detected
20 * in speedstep_detect_processor(). However, their speed can be detected using
21 * the speedstep_get_frequency() call. */
22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26
27/* speedstep states -- only two of them */
28
29#define SPEEDSTEP_HIGH 0x00000000
30#define SPEEDSTEP_LOW 0x00000001
31
32
33/* detect a speedstep-capable processor */
34extern enum speedstep_processor speedstep_detect_processor(void);
35
36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38
39
40/* detect the low and high speeds of the processor. The callback
41 * set_state's first argument is either SPEEDSTEP_HIGH or
42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated.
44 */
45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed,
47 unsigned int *high_speed,
48 unsigned int *transition_latency,
49 void (*set_state) (unsigned int state));
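
Both drivers that use this header fill the same two-entry cpufreq table, indexed by SPEEDSTEP_HIGH (0) and SPEEDSTEP_LOW (1) and terminated by an end marker. A standalone sketch of that layout, with a local stand-in for struct cpufreq_frequency_table and hypothetical detected speeds (the real table is filled by speedstep_get_freqs() or the SMI call):

#include <stdio.h>

#define SPEEDSTEP_HIGH	0
#define SPEEDSTEP_LOW	1
#define TABLE_END	(~0u)	/* stand-in for CPUFREQ_TABLE_END */

/* local stand-in for struct cpufreq_frequency_table */
struct freq_entry {
	unsigned int index;
	unsigned int frequency;	/* kHz; 0 until detected */
};

static struct freq_entry speedstep_freqs[] = {
	{ SPEEDSTEP_HIGH, 0 },
	{ SPEEDSTEP_LOW,  0 },
	{ 0, TABLE_END },
};

int main(void)
{
	/* hypothetical detected speeds */
	speedstep_freqs[SPEEDSTEP_HIGH].frequency = 1133000;
	speedstep_freqs[SPEEDSTEP_LOW].frequency  =  733000;

	for (int i = 0; speedstep_freqs[i].frequency != TABLE_END; i++)
		printf("state %u: %u kHz\n",
		       speedstep_freqs[i].index, speedstep_freqs[i].frequency);
	return 0;
}
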
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
deleted file mode 100644
index 8abd869baabf..000000000000
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ /dev/null
@@ -1,467 +0,0 @@
1/*
2 * Intel SpeedStep SMI driver.
3 *
4 * (C) 2003 Hiroshi Miura <miura@da-cha.org>
5 *
6 * Licensed under the terms of the GNU GPL License version 2.
7 *
8 */
9
10
11/*********************************************************************
12 * SPEEDSTEP - DEFINITIONS *
13 *********************************************************************/
14
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/moduleparam.h>
18#include <linux/init.h>
19#include <linux/cpufreq.h>
20#include <linux/delay.h>
21#include <linux/io.h>
22#include <asm/ist.h>
23
24#include "speedstep-lib.h"
25
26/* speedstep system management interface port/command.
27 *
28 * These parameters are obtained from the IST-SMI BIOS call.
29 * If the user supplies them, those values are used instead.
30 *
31 */
32static int smi_port;
33static int smi_cmd;
34static unsigned int smi_sig;
35
36/* info about the processor */
37static enum speedstep_processor speedstep_processor;
38
39/*
40 * There are only two frequency states for each processor. Values
41 * are in kHz for the time being.
42 */
43static struct cpufreq_frequency_table speedstep_freqs[] = {
44 {SPEEDSTEP_HIGH, 0},
45 {SPEEDSTEP_LOW, 0},
46 {0, CPUFREQ_TABLE_END},
47};
48
49#define GET_SPEEDSTEP_OWNER 0
50#define GET_SPEEDSTEP_STATE 1
51#define SET_SPEEDSTEP_STATE 2
52#define GET_SPEEDSTEP_FREQS 4
53
54/* how often the SMI call should be retried if it fails, e.g. because
55 * of ongoing DMA activity */
56#define SMI_TRIES 5
57
58#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
59 "speedstep-smi", msg)
60
61/**
62 * speedstep_smi_ownership
63 */
64static int speedstep_smi_ownership(void)
65{
66 u32 command, result, magic, dummy;
67 u32 function = GET_SPEEDSTEP_OWNER;
68 unsigned char magic_data[] = "Copyright (c) 1999 Intel Corporation";
69
70 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
71 magic = virt_to_phys(magic_data);
72
73 dprintk("trying to obtain ownership with command %x at port %x\n",
74 command, smi_port);
75
76 __asm__ __volatile__(
77 "push %%ebp\n"
78 "out %%al, (%%dx)\n"
79 "pop %%ebp\n"
80 : "=D" (result),
81 "=a" (dummy), "=b" (dummy), "=c" (dummy), "=d" (dummy),
82 "=S" (dummy)
83 : "a" (command), "b" (function), "c" (0), "d" (smi_port),
84 "D" (0), "S" (magic)
85 : "memory"
86 );
87
88 dprintk("result is %x\n", result);
89
90 return result;
91}
92
93/**
94 * speedstep_smi_get_freqs - get SpeedStep preferred & current freq.
95 * @low: the low frequency value is placed here
96 * @high: the high frequency value is placed here
97 *
98 * Only available on later SpeedStep-enabled systems, returns false results or
99 * even hangs [cf. bugme.osdl.org # 1422] on earlier systems. Empirical testing
100 * shows that the latter occurs if !(ist_info.event & 0xFFFF).
101 */
102static int speedstep_smi_get_freqs(unsigned int *low, unsigned int *high)
103{
104 u32 command, result = 0, edi, high_mhz, low_mhz, dummy;
105 u32 state = 0;
106 u32 function = GET_SPEEDSTEP_FREQS;
107
108 if (!(ist_info.event & 0xFFFF)) {
109 dprintk("bug #1422 -- can't read freqs from BIOS\n");
110 return -ENODEV;
111 }
112
113 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
114
115 dprintk("trying to determine frequencies with command %x at port %x\n",
116 command, smi_port);
117
118 __asm__ __volatile__(
119 "push %%ebp\n"
120 "out %%al, (%%dx)\n"
121 "pop %%ebp"
122 : "=a" (result),
123 "=b" (high_mhz),
124 "=c" (low_mhz),
125 "=d" (state), "=D" (edi), "=S" (dummy)
126 : "a" (command),
127 "b" (function),
128 "c" (state),
129 "d" (smi_port), "S" (0), "D" (0)
130 );
131
132 dprintk("result %x, low_freq %u, high_freq %u\n",
133 result, low_mhz, high_mhz);
134
135 /* abort if results are obviously incorrect... */
136 if ((high_mhz + low_mhz) < 600)
137 return -EINVAL;
138
139 *high = high_mhz * 1000;
140 *low = low_mhz * 1000;
141
142 return result;
143}
144
145/**
146 * speedstep_get_state - read the current SpeedStep state
147 * Returns SPEEDSTEP_LOW or SPEEDSTEP_HIGH.
148 *
149 */
150static int speedstep_get_state(void)
151{
152 u32 function = GET_SPEEDSTEP_STATE;
153 u32 result, state, edi, command, dummy;
154
155 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
156
157 dprintk("trying to determine current setting with command %x "
158 "at port %x\n", command, smi_port);
159
160 __asm__ __volatile__(
161 "push %%ebp\n"
162 "out %%al, (%%dx)\n"
163 "pop %%ebp\n"
164 : "=a" (result),
165 "=b" (state), "=D" (edi),
166 "=c" (dummy), "=d" (dummy), "=S" (dummy)
167 : "a" (command), "b" (function), "c" (0),
168 "d" (smi_port), "S" (0), "D" (0)
169 );
170
171 dprintk("state is %x, result is %x\n", state, result);
172
173 return state & 1;
174}
175
176
177/**
178 * speedstep_set_state - set the SpeedStep state
179 * @state: new processor frequency state (SPEEDSTEP_LOW or SPEEDSTEP_HIGH)
180 *
181 */
182static void speedstep_set_state(unsigned int state)
183{
184 unsigned int result = 0, command, new_state, dummy;
185 unsigned long flags;
186 unsigned int function = SET_SPEEDSTEP_STATE;
187 unsigned int retry = 0;
188
189 if (state > 0x1)
190 return;
191
192 /* Disable IRQs */
193 local_irq_save(flags);
194
195 command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);
196
197 dprintk("trying to set frequency to state %u "
198 "with command %x at port %x\n",
199 state, command, smi_port);
200
201 do {
202 if (retry) {
203 dprintk("retry %u, previous result %u, waiting...\n",
204 retry, result);
205 mdelay(retry * 50);
206 }
207 retry++;
208 __asm__ __volatile__(
209 "push %%ebp\n"
210 "out %%al, (%%dx)\n"
211 "pop %%ebp"
212 : "=b" (new_state), "=D" (result),
213 "=c" (dummy), "=a" (dummy),
214 "=d" (dummy), "=S" (dummy)
215 : "a" (command), "b" (function), "c" (state),
216 "d" (smi_port), "S" (0), "D" (0)
217 );
218 } while ((new_state != state) && (retry <= SMI_TRIES));
219
220 /* enable IRQs */
221 local_irq_restore(flags);
222
223 if (new_state == state)
224 dprintk("change to %u MHz succeeded after %u tries "
225 "with result %u\n",
226 (speedstep_freqs[new_state].frequency / 1000),
227 retry, result);
228 else
229 printk(KERN_ERR "cpufreq: change to state %u "
230 "failed with new_state %u and result %u\n",
231 state, new_state, result);
232
233 return;
234}
235
236
237/**
238 * speedstep_target - set a new CPUFreq policy
239 * @policy: new policy
240 * @target_freq: the target frequency
241 * @relation: how that frequency relates to the achieved frequency
242 *
243 * Sets a new CPUFreq policy/freq.
244 */
245static int speedstep_target(struct cpufreq_policy *policy,
246 unsigned int target_freq, unsigned int relation)
247{
248 unsigned int newstate = 0;
249 struct cpufreq_freqs freqs;
250
251 if (cpufreq_frequency_table_target(policy, &speedstep_freqs[0],
252 target_freq, relation, &newstate))
253 return -EINVAL;
254
255 freqs.old = speedstep_freqs[speedstep_get_state()].frequency;
256 freqs.new = speedstep_freqs[newstate].frequency;
257 freqs.cpu = 0; /* speedstep.c is a UP-only driver */
258
259 if (freqs.old == freqs.new)
260 return 0;
261
262 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
263 speedstep_set_state(newstate);
264 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
265
266 return 0;
267}
268
269
270/**
271 * speedstep_verify - verifies a new CPUFreq policy
272 * @policy: new policy
273 *
274 * Limit must be within speedstep_low_freq and speedstep_high_freq, with
275 * at least one border included.
276 */
277static int speedstep_verify(struct cpufreq_policy *policy)
278{
279 return cpufreq_frequency_table_verify(policy, &speedstep_freqs[0]);
280}
281
282
283static int speedstep_cpu_init(struct cpufreq_policy *policy)
284{
285 int result;
286 unsigned int speed, state;
287 unsigned int *low, *high;
288
289 /* capability check */
290 if (policy->cpu != 0)
291 return -ENODEV;
292
293 result = speedstep_smi_ownership();
294 if (result) {
295 dprintk("fails in aquiring ownership of a SMI interface.\n");
296 return -EINVAL;
297 }
298
299 /* detect low and high frequency */
300 low = &speedstep_freqs[SPEEDSTEP_LOW].frequency;
301 high = &speedstep_freqs[SPEEDSTEP_HIGH].frequency;
302
303 result = speedstep_smi_get_freqs(low, high);
304 if (result) {
305 /* fall back to the speedstep-lib.c detection mechanism:
306 * try both states out */
307 dprintk("could not detect low and high frequencies "
308 "by SMI call.\n");
309 result = speedstep_get_freqs(speedstep_processor,
310 low, high,
311 NULL,
312 &speedstep_set_state);
313
314 if (result) {
315 dprintk("could not detect two different speeds"
316 " -- aborting.\n");
317 return result;
318 } else
319 dprintk("workaround worked.\n");
320 }
321
322 /* get current speed setting */
323 state = speedstep_get_state();
324 speed = speedstep_freqs[state].frequency;
325
326 dprintk("currently at %s speed setting - %i MHz\n",
327 (speed == speedstep_freqs[SPEEDSTEP_LOW].frequency)
328 ? "low" : "high",
329 (speed / 1000));
330
331 /* cpuinfo and default policy values */
332 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL;
333 policy->cur = speed;
334
335 result = cpufreq_frequency_table_cpuinfo(policy, speedstep_freqs);
336 if (result)
337 return result;
338
339 cpufreq_frequency_table_get_attr(speedstep_freqs, policy->cpu);
340
341 return 0;
342}
343
344static int speedstep_cpu_exit(struct cpufreq_policy *policy)
345{
346 cpufreq_frequency_table_put_attr(policy->cpu);
347 return 0;
348}
349
350static unsigned int speedstep_get(unsigned int cpu)
351{
352 if (cpu)
353 return -ENODEV;
354 return speedstep_get_frequency(speedstep_processor);
355}
356
357
358static int speedstep_resume(struct cpufreq_policy *policy)
359{
360 int result = speedstep_smi_ownership();
361
362 if (result)
363 dprintk("fails in re-aquiring ownership of a SMI interface.\n");
364
365 return result;
366}
367
368static struct freq_attr *speedstep_attr[] = {
369 &cpufreq_freq_attr_scaling_available_freqs,
370 NULL,
371};
372
373static struct cpufreq_driver speedstep_driver = {
374 .name = "speedstep-smi",
375 .verify = speedstep_verify,
376 .target = speedstep_target,
377 .init = speedstep_cpu_init,
378 .exit = speedstep_cpu_exit,
379 .get = speedstep_get,
380 .resume = speedstep_resume,
381 .owner = THIS_MODULE,
382 .attr = speedstep_attr,
383};
384
385/**
386 * speedstep_init - initializes the SpeedStep CPUFreq driver
387 *
388 * Initializes the SpeedStep support. Returns -ENODEV on unsupported
389 * BIOS, -EINVAL on problems during initialization, and zero on
390 * success.
391 */
392static int __init speedstep_init(void)
393{
394 speedstep_processor = speedstep_detect_processor();
395
396 switch (speedstep_processor) {
397 case SPEEDSTEP_CPU_PIII_T:
398 case SPEEDSTEP_CPU_PIII_C:
399 case SPEEDSTEP_CPU_PIII_C_EARLY:
400 break;
401 default:
402 speedstep_processor = 0;
403 }
404
405 if (!speedstep_processor) {
406 dprintk("No supported Intel CPU detected.\n");
407 return -ENODEV;
408 }
409
410 dprintk("signature:0x%.8lx, command:0x%.8lx, "
411 "event:0x%.8lx, perf_level:0x%.8lx.\n",
412 ist_info.signature, ist_info.command,
413 ist_info.event, ist_info.perf_level);
414
415 /* Error out if there is no IST-SMI BIOS and no port/command parameters;
416 sig = 'ISGE' aka 'Intel Speedstep Gate E' */
417 if ((ist_info.signature != 0x47534943) && (
418 (smi_port == 0) || (smi_cmd == 0)))
419 return -ENODEV;
420
421 if (smi_sig == 1)
422 smi_sig = 0x47534943;
423 else
424 smi_sig = ist_info.signature;
425
426 /* set up smi_port from the module parameter or the BIOS */
427 if ((smi_port > 0xff) || (smi_port < 0))
428 return -EINVAL;
429 else if (smi_port == 0)
430 smi_port = ist_info.command & 0xff;
431
432 if ((smi_cmd > 0xff) || (smi_cmd < 0))
433 return -EINVAL;
434 else if (smi_cmd == 0)
435 smi_cmd = (ist_info.command >> 16) & 0xff;
436
437 return cpufreq_register_driver(&speedstep_driver);
438}
439
440
441/**
442 * speedstep_exit - unregisters SpeedStep support
443 *
444 * Unregisters SpeedStep support.
445 */
446static void __exit speedstep_exit(void)
447{
448 cpufreq_unregister_driver(&speedstep_driver);
449}
450
451module_param(smi_port, int, 0444);
452module_param(smi_cmd, int, 0444);
453module_param(smi_sig, uint, 0444);
454
455MODULE_PARM_DESC(smi_port, "Override the BIOS-given IST port with this value "
456 "-- Intel's default setting is 0xb2");
457MODULE_PARM_DESC(smi_cmd, "Override the BIOS-given IST command with this value "
458 "-- Intel's default setting is 0x82");
459MODULE_PARM_DESC(smi_sig, "Set to 1 to fake the IST signature when using the "
460 "SMI interface.");
461
462MODULE_AUTHOR("Hiroshi Miura");
463MODULE_DESCRIPTION("Speedstep driver for IST applet SMI interface.");
464MODULE_LICENSE("GPL");
465
466module_init(speedstep_init);
467module_exit(speedstep_exit);
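
The SMI driver above builds its command word from the IST signature and the BIOS command byte, and retries the call with a growing delay when it does not take effect. A minimal sketch of that retry pattern: the hypothetical try_smi() stub stands in for the real "out %al, (%dx)" sequence and simply succeeds after a couple of attempts, and the would-be mdelay() is only printed rather than performed.

#include <stdio.h>
#include <stdint.h>

#define SMI_TRIES 5

/* hypothetical stand-in for the SMI call: pretend the BIOS is busy twice */
static int try_smi(uint32_t command, unsigned int state, unsigned int *new_state)
{
	static int busy = 2;

	(void)command;
	if (busy-- > 0)
		return -1;
	*new_state = state;
	return 0;
}

int main(void)
{
	uint32_t smi_sig = 0x47534943;	/* hypothetical IST signature */
	uint32_t smi_cmd = 0x82;	/* hypothetical command byte */
	unsigned int state = 1, new_state = ~0u, retry = 0;

	/* upper 24 bits from the signature, low byte from the command */
	uint32_t command = (smi_sig & 0xffffff00) | (smi_cmd & 0xff);

	do {
		if (retry)
			printf("retry %u, would wait %u ms...\n", retry, retry * 50);
		retry++;
		try_smi(command, state, &new_state);
	} while (new_state != state && retry <= SMI_TRIES);

	printf("command 0x%x, final state %u after %u tries\n",
	       (unsigned int)command, new_state, retry);
	return 0;
}
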
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index b4389441efbb..1edf5ba4fb2b 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -29,10 +29,10 @@
29 29
30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c) 30static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
31{ 31{
32 u64 misc_enable;
33
32 /* Unmask CPUID levels if masked: */ 34 /* Unmask CPUID levels if masked: */
33 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) { 35 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
34 u64 misc_enable;
35
36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 36 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
37 37
38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) { 38 if (misc_enable & MSR_IA32_MISC_ENABLE_LIMIT_CPUID) {
@@ -118,8 +118,6 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
118 * (model 2) with the same problem. 118 * (model 2) with the same problem.
119 */ 119 */
120 if (c->x86 == 15) { 120 if (c->x86 == 15) {
121 u64 misc_enable;
122
123 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable); 121 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
124 122
125 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) { 123 if (misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING) {
@@ -130,6 +128,19 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
130 } 128 }
131 } 129 }
132#endif 130#endif
131
132 /*
133 * If fast string is not enabled in IA32_MISC_ENABLE for any reason,
134 * clear the fast string and enhanced fast string CPU capabilities.
135 */
136 if (c->x86 > 6 || (c->x86 == 6 && c->x86_model >= 0xd)) {
137 rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
138 if (!(misc_enable & MSR_IA32_MISC_ENABLE_FAST_STRING)) {
139 printk(KERN_INFO "Disabled fast string operations\n");
140 setup_clear_cpu_cap(X86_FEATURE_REP_GOOD);
141 setup_clear_cpu_cap(X86_FEATURE_ERMS);
142 }
143 }
133} 144}
134 145
135#ifdef CONFIG_X86_32 146#ifdef CONFIG_X86_32
@@ -170,7 +181,7 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c)
170{ 181{
171#ifdef CONFIG_SMP 182#ifdef CONFIG_SMP
172 /* calling is from identify_secondary_cpu() ? */ 183 /* calling is from identify_secondary_cpu() ? */
173 if (c->cpu_index == boot_cpu_id) 184 if (!c->cpu_index)
174 return; 185 return;
175 186
176 /* 187 /*
@@ -276,17 +287,14 @@ static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c)
276 287
277static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) 288static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
278{ 289{
279#if defined(CONFIG_NUMA) && defined(CONFIG_X86_64) 290#ifdef CONFIG_NUMA
280 unsigned node; 291 unsigned node;
281 int cpu = smp_processor_id(); 292 int cpu = smp_processor_id();
282 int apicid = cpu_has_apic ? hard_smp_processor_id() : c->apicid;
283 293
284 /* Don't do the funky fallback heuristics the AMD version employs 294 /* Don't do the funky fallback heuristics the AMD version employs
285 for now. */ 295 for now. */
286 node = apicid_to_node[apicid]; 296 node = numa_cpu_node(cpu);
287 if (node == NUMA_NO_NODE) 297 if (node == NUMA_NO_NODE || !node_online(node)) {
288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */ 298 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu); 299 node = cpu_to_node(cpu);
292 } 300 }
@@ -403,12 +411,10 @@ static void __cpuinit init_intel(struct cpuinfo_x86 *c)
403 411
404 switch (c->x86_model) { 412 switch (c->x86_model) {
405 case 5: 413 case 5:
406 if (c->x86_mask == 0) { 414 if (l2 == 0)
407 if (l2 == 0) 415 p = "Celeron (Covington)";
408 p = "Celeron (Covington)"; 416 else if (l2 == 256)
409 else if (l2 == 256) 417 p = "Mobile Pentium II (Dixon)";
410 p = "Mobile Pentium II (Dixon)";
411 }
412 break; 418 break;
413 419
414 case 6: 420 case 6:
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3fec7d9bfd62..0bf12644aa73 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -17,7 +17,7 @@
17 17
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/amd_nb.h>
21#include <asm/smp.h> 21#include <asm/smp.h>
22 22
23#define LVL_1_INST 1 23#define LVL_1_INST 1
@@ -45,6 +45,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */ 45 { 0x0a, LVL_1_DATA, 8 }, /* 2 way set assoc, 32 byte line size */
46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */ 46 { 0x0c, LVL_1_DATA, 16 }, /* 4-way set assoc, 32 byte line size */
47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
48 { 0x0e, LVL_1_DATA, 24 }, /* 6-way set assoc, 64 byte line size */
48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 49 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 50 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
@@ -66,6 +67,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */ 67 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */ 68 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */ 69 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
70 { 0x48, LVL_2, MB(3) }, /* 12-way set assoc, 64 byte line size */
69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */ 71 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
@@ -87,6 +89,7 @@ static const struct _cache_table __cpuinitconst cache_table[] =
87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */ 89 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */ 90 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 91 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
92 { 0x80, LVL_2, 512 }, /* 8-way set assoc, 64 byte line size */
90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 94 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */ 95 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
@@ -149,8 +152,7 @@ union _cpuid4_leaf_ecx {
149}; 152};
150 153
151struct amd_l3_cache { 154struct amd_l3_cache {
152 struct pci_dev *dev; 155 struct amd_northbridge *nb;
153 bool can_disable;
154 unsigned indices; 156 unsigned indices;
155 u8 subcaches[4]; 157 u8 subcaches[4];
156}; 158};
@@ -266,7 +268,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
266 line_size = l2.line_size; 268 line_size = l2.line_size;
267 lines_per_tag = l2.lines_per_tag; 269 lines_per_tag = l2.lines_per_tag;
268 /* cpu_data has errata corrections for K7 applied */ 270 /* cpu_data has errata corrections for K7 applied */
269 size_in_kb = current_cpu_data.x86_cache_size; 271 size_in_kb = __this_cpu_read(cpu_info.x86_cache_size);
270 break; 272 break;
271 case 3: 273 case 3:
272 if (!l3.val) 274 if (!l3.val)
@@ -288,7 +290,7 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
288 eax->split.type = types[leaf]; 290 eax->split.type = types[leaf];
289 eax->split.level = levels[leaf]; 291 eax->split.level = levels[leaf];
290 eax->split.num_threads_sharing = 0; 292 eax->split.num_threads_sharing = 0;
291 eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; 293 eax->split.num_cores_on_die = __this_cpu_read(cpu_info.x86_max_cores) - 1;
292 294
293 295
294 if (assoc == 0xffff) 296 if (assoc == 0xffff)
@@ -302,23 +304,22 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
302 304
303struct _cache_attr { 305struct _cache_attr {
304 struct attribute attr; 306 struct attribute attr;
305 ssize_t (*show)(struct _cpuid4_info *, char *); 307 ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
306 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count); 308 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
309 unsigned int);
307}; 310};
308 311
309#ifdef CONFIG_CPU_SUP_AMD 312#ifdef CONFIG_AMD_NB
310 313
311/* 314/*
312 * L3 cache descriptors 315 * L3 cache descriptors
313 */ 316 */
314static struct amd_l3_cache **__cpuinitdata l3_caches;
315
316static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3) 317static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
317{ 318{
318 unsigned int sc0, sc1, sc2, sc3; 319 unsigned int sc0, sc1, sc2, sc3;
319 u32 val = 0; 320 u32 val = 0;
320 321
321 pci_read_config_dword(l3->dev, 0x1C4, &val); 322 pci_read_config_dword(l3->nb->misc, 0x1C4, &val);
322 323
323 /* calculate subcache sizes */ 324 /* calculate subcache sizes */
324 l3->subcaches[0] = sc0 = !(val & BIT(0)); 325 l3->subcaches[0] = sc0 = !(val & BIT(0));
@@ -326,50 +327,17 @@ static void __cpuinit amd_calc_l3_indices(struct amd_l3_cache *l3)
326 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9)); 327 l3->subcaches[2] = sc2 = !(val & BIT(8)) + !(val & BIT(9));
327 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13)); 328 l3->subcaches[3] = sc3 = !(val & BIT(12)) + !(val & BIT(13));
328 329
329 l3->indices = (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1; 330 l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
330}
331
332static struct amd_l3_cache * __cpuinit amd_init_l3_cache(int node)
333{
334 struct amd_l3_cache *l3;
335 struct pci_dev *dev = node_to_k8_nb_misc(node);
336
337 l3 = kzalloc(sizeof(struct amd_l3_cache), GFP_ATOMIC);
338 if (!l3) {
339 printk(KERN_WARNING "Error allocating L3 struct\n");
340 return NULL;
341 }
342
343 l3->dev = dev;
344
345 amd_calc_l3_indices(l3);
346
347 return l3;
348} 331}
349 332
350static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, 333static void __cpuinit amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf,
351 int index) 334 int index)
352{ 335{
336 static struct amd_l3_cache *__cpuinitdata l3_caches;
353 int node; 337 int node;
354 338
355 if (boot_cpu_data.x86 != 0x10) 339 /* only for L3, and not in virtualized environments */
356 return; 340 if (index < 3 || amd_nb_num() == 0)
357
358 if (index < 3)
359 return;
360
361 /* see errata #382 and #388 */
362 if (boot_cpu_data.x86_model < 0x8)
363 return;
364
365 if ((boot_cpu_data.x86_model == 0x8 ||
366 boot_cpu_data.x86_model == 0x9)
367 &&
368 boot_cpu_data.x86_mask < 0x1)
369 return;
370
371 /* not in virtualized environments */
372 if (num_k8_northbridges == 0)
373 return; 341 return;
374 342
375 /* 343 /*
@@ -377,7 +345,7 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
377 * never freed but this is done only on shutdown so it doesn't matter. 345 * never freed but this is done only on shutdown so it doesn't matter.
378 */ 346 */
379 if (!l3_caches) { 347 if (!l3_caches) {
380 int size = num_k8_northbridges * sizeof(struct amd_l3_cache *); 348 int size = amd_nb_num() * sizeof(struct amd_l3_cache);
381 349
382 l3_caches = kzalloc(size, GFP_ATOMIC); 350 l3_caches = kzalloc(size, GFP_ATOMIC);
383 if (!l3_caches) 351 if (!l3_caches)
@@ -386,14 +354,12 @@ static void __cpuinit amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf,
386 354
387 node = amd_get_nb_id(smp_processor_id()); 355 node = amd_get_nb_id(smp_processor_id());
388 356
389 if (!l3_caches[node]) { 357 if (!l3_caches[node].nb) {
390 l3_caches[node] = amd_init_l3_cache(node); 358 l3_caches[node].nb = node_to_amd_nb(node);
391 l3_caches[node]->can_disable = true; 359 amd_calc_l3_indices(&l3_caches[node]);
392 } 360 }
393 361
394 WARN_ON(!l3_caches[node]); 362 this_leaf->l3 = &l3_caches[node];
395
396 this_leaf->l3 = l3_caches[node];
397} 363}
398 364
399/* 365/*
@@ -407,7 +373,7 @@ int amd_get_l3_disable_slot(struct amd_l3_cache *l3, unsigned slot)
407{ 373{
408 unsigned int reg = 0; 374 unsigned int reg = 0;
409 375
410 pci_read_config_dword(l3->dev, 0x1BC + slot * 4, &reg); 376 pci_read_config_dword(l3->nb->misc, 0x1BC + slot * 4, &reg);
411 377
412 /* check whether this slot is activated already */ 378 /* check whether this slot is activated already */
413 if (reg & (3UL << 30)) 379 if (reg & (3UL << 30))
@@ -421,7 +387,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
421{ 387{
422 int index; 388 int index;
423 389
424 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 390 if (!this_leaf->l3 ||
391 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
425 return -EINVAL; 392 return -EINVAL;
426 393
427 index = amd_get_l3_disable_slot(this_leaf->l3, slot); 394 index = amd_get_l3_disable_slot(this_leaf->l3, slot);
@@ -433,7 +400,8 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
433 400
434#define SHOW_CACHE_DISABLE(slot) \ 401#define SHOW_CACHE_DISABLE(slot) \
435static ssize_t \ 402static ssize_t \
436show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf) \ 403show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
404 unsigned int cpu) \
437{ \ 405{ \
438 return show_cache_disable(this_leaf, buf, slot); \ 406 return show_cache_disable(this_leaf, buf, slot); \
439} 407}
@@ -456,7 +424,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
456 if (!l3->subcaches[i]) 424 if (!l3->subcaches[i])
457 continue; 425 continue;
458 426
459 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 427 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
460 428
461 /* 429 /*
462 * We need to WBINVD on a core on the node containing the L3 430 * We need to WBINVD on a core on the node containing the L3
@@ -466,7 +434,7 @@ static void amd_l3_disable_index(struct amd_l3_cache *l3, int cpu,
466 wbinvd_on_cpu(cpu); 434 wbinvd_on_cpu(cpu);
467 435
468 reg |= BIT(31); 436 reg |= BIT(31);
469 pci_write_config_dword(l3->dev, 0x1BC + slot * 4, reg); 437 pci_write_config_dword(l3->nb->misc, 0x1BC + slot * 4, reg);
470 } 438 }
471} 439}
472 440
@@ -485,27 +453,16 @@ int amd_set_l3_disable_slot(struct amd_l3_cache *l3, int cpu, unsigned slot,
485{ 453{
486 int ret = 0; 454 int ret = 0;
487 455
488#define SUBCACHE_MASK (3UL << 20) 456 /* check if @slot is already used or the index is already disabled */
489#define SUBCACHE_INDEX 0xfff
490
491 /*
492 * check whether this slot is already used or
493 * the index is already disabled
494 */
495 ret = amd_get_l3_disable_slot(l3, slot); 457 ret = amd_get_l3_disable_slot(l3, slot);
496 if (ret >= 0) 458 if (ret >= 0)
497 return -EINVAL; 459 return -EINVAL;
498 460
499 /* 461 if (index > l3->indices)
500 * check whether the other slot has disabled the
501 * same index already
502 */
503 if (index == amd_get_l3_disable_slot(l3, !slot))
504 return -EINVAL; 462 return -EINVAL;
505 463
506 /* do not allow writes outside of allowed bits */ 464 /* check whether the other slot has disabled the same index already */
507 if ((index & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) || 465 if (index == amd_get_l3_disable_slot(l3, !slot))
508 ((index & SUBCACHE_INDEX) > l3->indices))
509 return -EINVAL; 466 return -EINVAL;
510 467
511 amd_l3_disable_index(l3, cpu, slot, index); 468 amd_l3_disable_index(l3, cpu, slot, index);
@@ -523,7 +480,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
523 if (!capable(CAP_SYS_ADMIN)) 480 if (!capable(CAP_SYS_ADMIN))
524 return -EPERM; 481 return -EPERM;
525 482
526 if (!this_leaf->l3 || !this_leaf->l3->can_disable) 483 if (!this_leaf->l3 ||
484 !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
527 return -EINVAL; 485 return -EINVAL;
528 486
529 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); 487 cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
@@ -544,7 +502,8 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
544#define STORE_CACHE_DISABLE(slot) \ 502#define STORE_CACHE_DISABLE(slot) \
545static ssize_t \ 503static ssize_t \
546store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ 504store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
547 const char *buf, size_t count) \ 505 const char *buf, size_t count, \
506 unsigned int cpu) \
548{ \ 507{ \
549 return store_cache_disable(this_leaf, buf, count, slot); \ 508 return store_cache_disable(this_leaf, buf, count, slot); \
550} 509}
@@ -556,25 +515,55 @@ static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
556static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 515static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
557 show_cache_disable_1, store_cache_disable_1); 516 show_cache_disable_1, store_cache_disable_1);
558 517
559#else /* CONFIG_CPU_SUP_AMD */ 518static ssize_t
560static void __cpuinit 519show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
561amd_check_l3_disable(struct _cpuid4_info_regs *this_leaf, int index)
562{ 520{
563}; 521 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
564#endif /* CONFIG_CPU_SUP_AMD */ 522 return -EINVAL;
523
524 return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
525}
526
527static ssize_t
528store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
529 unsigned int cpu)
530{
531 unsigned long val;
532
533 if (!capable(CAP_SYS_ADMIN))
534 return -EPERM;
535
536 if (!this_leaf->l3 || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
537 return -EINVAL;
538
539 if (strict_strtoul(buf, 16, &val) < 0)
540 return -EINVAL;
541
542 if (amd_set_subcaches(cpu, val))
543 return -EINVAL;
544
545 return count;
546}
547
548static struct _cache_attr subcaches =
549 __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
550
551#else /* CONFIG_AMD_NB */
552#define amd_init_l3_cache(x, y)
553#endif /* CONFIG_AMD_NB */
565 554
566static int 555static int
567__cpuinit cpuid4_cache_lookup_regs(int index, 556__cpuinit cpuid4_cache_lookup_regs(int index,
568 struct _cpuid4_info_regs *this_leaf) 557 struct _cpuid4_info_regs *this_leaf)
569{ 558{
570 union _cpuid4_leaf_eax eax; 559 union _cpuid4_leaf_eax eax;
571 union _cpuid4_leaf_ebx ebx; 560 union _cpuid4_leaf_ebx ebx;
572 union _cpuid4_leaf_ecx ecx; 561 union _cpuid4_leaf_ecx ecx;
573 unsigned edx; 562 unsigned edx;
574 563
575 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { 564 if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) {
576 amd_cpuid4(index, &eax, &ebx, &ecx); 565 amd_cpuid4(index, &eax, &ebx, &ecx);
577 amd_check_l3_disable(this_leaf, index); 566 amd_init_l3_cache(this_leaf, index);
578 } else { 567 } else {
579 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); 568 cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
580 } 569 }
@@ -784,11 +773,11 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
784 struct cpuinfo_x86 *c = &cpu_data(cpu); 773 struct cpuinfo_x86 *c = &cpu_data(cpu);
785 774
786 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 775 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
787 for_each_cpu(i, c->llc_shared_map) { 776 for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
788 if (!per_cpu(ici_cpuid4_info, i)) 777 if (!per_cpu(ici_cpuid4_info, i))
789 continue; 778 continue;
790 this_leaf = CPUID4_INFO_IDX(i, index); 779 this_leaf = CPUID4_INFO_IDX(i, index);
791 for_each_cpu(sibling, c->llc_shared_map) { 780 for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
792 if (!cpu_online(sibling)) 781 if (!cpu_online(sibling))
793 continue; 782 continue;
794 set_bit(sibling, this_leaf->shared_cpu_map); 783 set_bit(sibling, this_leaf->shared_cpu_map);
@@ -922,8 +911,8 @@ static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
922#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) 911#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
923 912
924#define show_one_plus(file_name, object, val) \ 913#define show_one_plus(file_name, object, val) \
925static ssize_t show_##file_name \ 914static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
926 (struct _cpuid4_info *this_leaf, char *buf) \ 915 unsigned int cpu) \
927{ \ 916{ \
928 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ 917 return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
929} 918}
@@ -934,7 +923,8 @@ show_one_plus(physical_line_partition, ebx.split.physical_line_partition, 1);
934show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1); 923show_one_plus(ways_of_associativity, ebx.split.ways_of_associativity, 1);
935show_one_plus(number_of_sets, ecx.split.number_of_sets, 1); 924show_one_plus(number_of_sets, ecx.split.number_of_sets, 1);
936 925
937static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf) 926static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
927 unsigned int cpu)
938{ 928{
939 return sprintf(buf, "%luK\n", this_leaf->size / 1024); 929 return sprintf(buf, "%luK\n", this_leaf->size / 1024);
940} 930}
@@ -958,17 +948,20 @@ static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
958 return n; 948 return n;
959} 949}
960 950
961static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf) 951static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
952 unsigned int cpu)
962{ 953{
963 return show_shared_cpu_map_func(leaf, 0, buf); 954 return show_shared_cpu_map_func(leaf, 0, buf);
964} 955}
965 956
966static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf) 957static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
958 unsigned int cpu)
967{ 959{
968 return show_shared_cpu_map_func(leaf, 1, buf); 960 return show_shared_cpu_map_func(leaf, 1, buf);
969} 961}
970 962
971static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) 963static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
964 unsigned int cpu)
972{ 965{
973 switch (this_leaf->eax.split.type) { 966 switch (this_leaf->eax.split.type) {
974 case CACHE_TYPE_DATA: 967 case CACHE_TYPE_DATA:
@@ -999,30 +992,54 @@ define_one_ro(size);
999define_one_ro(shared_cpu_map); 992define_one_ro(shared_cpu_map);
1000define_one_ro(shared_cpu_list); 993define_one_ro(shared_cpu_list);
1001 994
1002#define DEFAULT_SYSFS_CACHE_ATTRS \
1003 &type.attr, \
1004 &level.attr, \
1005 &coherency_line_size.attr, \
1006 &physical_line_partition.attr, \
1007 &ways_of_associativity.attr, \
1008 &number_of_sets.attr, \
1009 &size.attr, \
1010 &shared_cpu_map.attr, \
1011 &shared_cpu_list.attr
1012
1013static struct attribute *default_attrs[] = { 995static struct attribute *default_attrs[] = {
1014 DEFAULT_SYSFS_CACHE_ATTRS, 996 &type.attr,
997 &level.attr,
998 &coherency_line_size.attr,
999 &physical_line_partition.attr,
1000 &ways_of_associativity.attr,
1001 &number_of_sets.attr,
1002 &size.attr,
1003 &shared_cpu_map.attr,
1004 &shared_cpu_list.attr,
1015 NULL 1005 NULL
1016}; 1006};
1017 1007
1018static struct attribute *default_l3_attrs[] = { 1008#ifdef CONFIG_AMD_NB
1019 DEFAULT_SYSFS_CACHE_ATTRS, 1009static struct attribute ** __cpuinit amd_l3_attrs(void)
1020#ifdef CONFIG_CPU_SUP_AMD 1010{
1021 &cache_disable_0.attr, 1011 static struct attribute **attrs;
1022 &cache_disable_1.attr, 1012 int n;
1013
1014 if (attrs)
1015 return attrs;
1016
1017 n = sizeof (default_attrs) / sizeof (struct attribute *);
1018
1019 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
1020 n += 2;
1021
1022 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1023 n += 1;
1024
1025 attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
1026 if (attrs == NULL)
1027 return attrs = default_attrs;
1028
1029 for (n = 0; default_attrs[n]; n++)
1030 attrs[n] = default_attrs[n];
1031
1032 if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
1033 attrs[n++] = &cache_disable_0.attr;
1034 attrs[n++] = &cache_disable_1.attr;
1035 }
1036
1037 if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
1038 attrs[n++] = &subcaches.attr;
1039
1040 return attrs;
1041}
1023#endif 1042#endif
1024 NULL
1025};
1026 1043
1027static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) 1044static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1028{ 1045{
@@ -1032,7 +1049,7 @@ static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
1032 1049
1033 ret = fattr->show ? 1050 ret = fattr->show ?
1034 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1051 fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1035 buf) : 1052 buf, this_leaf->cpu) :
1036 0; 1053 0;
1037 return ret; 1054 return ret;
1038} 1055}
@@ -1046,7 +1063,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
1046 1063
1047 ret = fattr->store ? 1064 ret = fattr->store ?
1048 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), 1065 fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
1049 buf, count) : 1066 buf, count, this_leaf->cpu) :
1050 0; 1067 0;
1051 return ret; 1068 return ret;
1052} 1069}
@@ -1133,11 +1150,11 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
1133 1150
1134 this_leaf = CPUID4_INFO_IDX(cpu, i); 1151 this_leaf = CPUID4_INFO_IDX(cpu, i);
1135 1152
1136 if (this_leaf->l3 && this_leaf->l3->can_disable) 1153 ktype_cache.default_attrs = default_attrs;
1137 ktype_cache.default_attrs = default_l3_attrs; 1154#ifdef CONFIG_AMD_NB
1138 else 1155 if (this_leaf->l3)
1139 ktype_cache.default_attrs = default_attrs; 1156 ktype_cache.default_attrs = amd_l3_attrs();
1140 1157#endif
1141 retval = kobject_init_and_add(&(this_object->kobj), 1158 retval = kobject_init_and_add(&(this_object->kobj),
1142 &ktype_cache, 1159 &ktype_cache,
1143 per_cpu(ici_cache_kobject, cpu), 1160 per_cpu(ici_cache_kobject, cpu),
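
The new subcaches attribute exposes the L3 partitioning mask as a hex value alongside the other per-index cache attributes. A minimal userspace sketch of reading and rewriting it follows; the sysfs path is assumed from the usual cpuN/cache/indexM layout this code populates, the 0x3 mask is purely illustrative, and writes need CAP_SYS_ADMIN as enforced by store_subcaches().

/* Userspace sketch (not from the patch): read and update the "subcaches"
 * attribute added above.  Path assumed from the cpuN/cache/indexM layout;
 * the written mask 0x3 is illustrative only. */
#include <stdio.h>

int main(void)
{
	const char *path = "/sys/devices/system/cpu/cpu0/cache/index3/subcaches";
	unsigned int mask;
	FILE *f = fopen(path, "r+");

	if (!f)
		return 1;
	if (fscanf(f, "%x", &mask) == 1)
		printf("current subcache mask: %#x\n", mask);
	rewind(f);
	fprintf(f, "%x\n", 0x3);	/* enable subcaches 0 and 1 (illustrative) */
	fclose(f);
	return 0;
}
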
diff --git a/arch/x86/kernel/cpu/mcheck/mce-apei.c b/arch/x86/kernel/cpu/mcheck/mce-apei.c
index 8209472b27a5..83930deec3c6 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-apei.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-apei.c
@@ -106,24 +106,34 @@ int apei_write_mce(struct mce *m)
106ssize_t apei_read_mce(struct mce *m, u64 *record_id) 106ssize_t apei_read_mce(struct mce *m, u64 *record_id)
107{ 107{
108 struct cper_mce_record rcd; 108 struct cper_mce_record rcd;
109 ssize_t len; 109 int rc, pos;
110 110
111 len = erst_read_next(&rcd.hdr, sizeof(rcd)); 111 rc = erst_get_record_id_begin(&pos);
112 if (len <= 0) 112 if (rc)
113 return len; 113 return rc;
114 /* Can not skip other records in storage via ERST unless clear them */ 114retry:
115 else if (len != sizeof(rcd) || 115 rc = erst_get_record_id_next(&pos, record_id);
116 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE)) { 116 if (rc)
117 if (printk_ratelimit()) 117 goto out;
118 pr_warning( 118 /* no more record */
119 "MCE-APEI: Can not skip the unknown record in ERST"); 119 if (*record_id == APEI_ERST_INVALID_RECORD_ID)
120 return -EIO; 120 goto out;
121 } 121 rc = erst_read(*record_id, &rcd.hdr, sizeof(rcd));
122 122 /* someone else has cleared the record, try next one */
123 if (rc == -ENOENT)
124 goto retry;
125 else if (rc < 0)
126 goto out;
127 /* try to skip other type records in storage */
128 else if (rc != sizeof(rcd) ||
129 uuid_le_cmp(rcd.hdr.creator_id, CPER_CREATOR_MCE))
130 goto retry;
123 memcpy(m, &rcd.mce, sizeof(*m)); 131 memcpy(m, &rcd.mce, sizeof(*m));
124 *record_id = rcd.hdr.record_id; 132 rc = sizeof(*m);
133out:
134 erst_get_record_id_end();
125 135
126 return sizeof(*m); 136 return rc;
127} 137}
128 138
129/* Check whether there is record in ERST */ 139/* Check whether there is record in ERST */
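
With the move from erst_read_next() to the record-id iteration API, apei_read_mce() now returns sizeof(struct mce) per record, 0 once *record_id comes back as APEI_ERST_INVALID_RECORD_ID, and a negative errno on failure, silently retrying over cleared or foreign records. A hypothetical in-kernel caller draining the backlog would look roughly like this sketch (process_mce() is a placeholder, not part of the patch):

/* Hypothetical caller sketch; assumes the mce-internal declarations are
 * visible and process_mce() is a placeholder helper. */
static void drain_apei_mce(void)
{
	struct mce m;
	u64 record_id;
	ssize_t len;

	while ((len = apei_read_mce(&m, &record_id)) > 0)
		process_mce(&m);		/* len == sizeof(m) here */

	/* len == 0: no more MCE records; len < 0: ERST access failed */
}
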
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index e7dbde7bfedb..0ed633c5048b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -25,13 +25,14 @@
25#include <linux/gfp.h> 25#include <linux/gfp.h>
26#include <asm/mce.h> 26#include <asm/mce.h>
27#include <asm/apic.h> 27#include <asm/apic.h>
28#include <asm/nmi.h>
28 29
29/* Update fake mce registers on current CPU. */ 30/* Update fake mce registers on current CPU. */
30static void inject_mce(struct mce *m) 31static void inject_mce(struct mce *m)
31{ 32{
32 struct mce *i = &per_cpu(injectm, m->extcpu); 33 struct mce *i = &per_cpu(injectm, m->extcpu);
33 34
34 /* Make sure noone reads partially written injectm */ 35 /* Make sure no one reads partially written injectm */
35 i->finished = 0; 36 i->finished = 0;
36 mb(); 37 mb();
37 m->finished = 0; 38 m->finished = 0;
@@ -83,7 +84,7 @@ static int mce_raise_notify(struct notifier_block *self,
83 struct die_args *args = (struct die_args *)data; 84 struct die_args *args = (struct die_args *)data;
84 int cpu = smp_processor_id(); 85 int cpu = smp_processor_id();
85 struct mce *m = &__get_cpu_var(injectm); 86 struct mce *m = &__get_cpu_var(injectm);
86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask)) 87 if (val != DIE_NMI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
87 return NOTIFY_DONE; 88 return NOTIFY_DONE;
88 cpumask_clear_cpu(cpu, mce_inject_cpumask); 89 cpumask_clear_cpu(cpu, mce_inject_cpumask);
89 if (m->inject_flags & MCJ_EXCEPTION) 90 if (m->inject_flags & MCJ_EXCEPTION)
@@ -95,7 +96,7 @@ static int mce_raise_notify(struct notifier_block *self,
95 96
96static struct notifier_block mce_raise_nb = { 97static struct notifier_block mce_raise_nb = {
97 .notifier_call = mce_raise_notify, 98 .notifier_call = mce_raise_notify,
98 .priority = 1000, 99 .priority = NMI_LOCAL_NORMAL_PRIOR,
99}; 100};
100 101
101/* Inject mce on current CPU */ 102/* Inject mce on current CPU */
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8a85dd1b1aa1..1e8d66c1336a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -192,6 +192,7 @@ static const struct file_operations severities_coverage_fops = {
192 .release = seq_release, 192 .release = seq_release,
193 .read = seq_read, 193 .read = seq_read,
194 .write = severities_coverage_write, 194 .write = severities_coverage_write,
195 .llseek = seq_lseek,
195}; 196};
196 197
197static int __init severities_debugfs_init(void) 198static int __init severities_debugfs_init(void)
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index ed41562909fe..ff1ae9b6464d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -21,6 +21,7 @@
21#include <linux/percpu.h> 21#include <linux/percpu.h>
22#include <linux/string.h> 22#include <linux/string.h>
23#include <linux/sysdev.h> 23#include <linux/sysdev.h>
24#include <linux/syscore_ops.h>
24#include <linux/delay.h> 25#include <linux/delay.h>
25#include <linux/ctype.h> 26#include <linux/ctype.h>
26#include <linux/sched.h> 27#include <linux/sched.h>
@@ -104,20 +105,6 @@ static int cpu_missing;
104ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); 105ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
105EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); 106EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
106 107
107static int default_decode_mce(struct notifier_block *nb, unsigned long val,
108 void *data)
109{
110 pr_emerg(HW_ERR "No human readable MCE decoding support on this CPU type.\n");
111 pr_emerg(HW_ERR "Run the message through 'mcelog --ascii' to decode.\n");
112
113 return NOTIFY_STOP;
114}
115
116static struct notifier_block mce_dec_nb = {
117 .notifier_call = default_decode_mce,
118 .priority = -1,
119};
120
121/* MCA banks polled by the period polling timer for corrected events */ 108/* MCA banks polled by the period polling timer for corrected events */
122DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 109DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
123 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL 110 [0 ... BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL
@@ -211,6 +198,8 @@ void mce_log(struct mce *mce)
211 198
212static void print_mce(struct mce *m) 199static void print_mce(struct mce *m)
213{ 200{
201 int ret = 0;
202
214 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n", 203 pr_emerg(HW_ERR "CPU %d: Machine Check Exception: %Lx Bank %d: %016Lx\n",
215 m->extcpu, m->mcgstatus, m->bank, m->status); 204 m->extcpu, m->mcgstatus, m->bank, m->status);
216 205
@@ -238,7 +227,11 @@ static void print_mce(struct mce *m)
238 * Print out human-readable details about the MCE error, 227 * Print out human-readable details about the MCE error,
239 * (if the CPU has an implementation for that) 228 * (if the CPU has an implementation for that)
240 */ 229 */
241 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); 230 ret = atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
231 if (ret == NOTIFY_STOP)
232 return;
233
234 pr_emerg_ratelimited(HW_ERR "Run the above through 'mcelog --ascii'\n");
242} 235}
243 236
244#define PANIC_TIMEOUT 5 /* 5 seconds */ 237#define PANIC_TIMEOUT 5 /* 5 seconds */
@@ -326,7 +319,7 @@ static void mce_panic(char *msg, struct mce *final, char *exp)
326 319
327static int msr_to_offset(u32 msr) 320static int msr_to_offset(u32 msr)
328{ 321{
329 unsigned bank = __get_cpu_var(injectm.bank); 322 unsigned bank = __this_cpu_read(injectm.bank);
330 323
331 if (msr == rip_msr) 324 if (msr == rip_msr)
332 return offsetof(struct mce, ip); 325 return offsetof(struct mce, ip);
@@ -346,7 +339,7 @@ static u64 mce_rdmsrl(u32 msr)
346{ 339{
347 u64 v; 340 u64 v;
348 341
349 if (__get_cpu_var(injectm).finished) { 342 if (__this_cpu_read(injectm.finished)) {
350 int offset = msr_to_offset(msr); 343 int offset = msr_to_offset(msr);
351 344
352 if (offset < 0) 345 if (offset < 0)
@@ -369,7 +362,7 @@ static u64 mce_rdmsrl(u32 msr)
369 362
370static void mce_wrmsrl(u32 msr, u64 v) 363static void mce_wrmsrl(u32 msr, u64 v)
371{ 364{
372 if (__get_cpu_var(injectm).finished) { 365 if (__this_cpu_read(injectm.finished)) {
373 int offset = msr_to_offset(msr); 366 int offset = msr_to_offset(msr);
374 367
375 if (offset >= 0) 368 if (offset >= 0)
@@ -589,7 +582,6 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
589 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) { 582 if (!(flags & MCP_DONTLOG) && !mce_dont_log_ce) {
590 mce_log(&m); 583 mce_log(&m);
591 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m); 584 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, &m);
592 add_taint(TAINT_MACHINE_CHECK);
593 } 585 }
594 586
595 /* 587 /*
@@ -881,7 +873,7 @@ reset:
881 * Check if the address reported by the CPU is in a format we can parse. 873 * Check if the address reported by the CPU is in a format we can parse.
882 * It would be possible to add code for most other cases, but all would 874 * It would be possible to add code for most other cases, but all would
883 * be somewhat complicated (e.g. segment offset would require an instruction 875 * be somewhat complicated (e.g. segment offset would require an instruction
884 * parser). So only support physical addresses upto page granularity for now. 876 * parser). So only support physical addresses up to page granularity for now.
885 */ 877 */
886static int mce_usable_address(struct mce *m) 878static int mce_usable_address(struct mce *m)
887{ 879{
@@ -1159,7 +1151,7 @@ static void mce_start_timer(unsigned long data)
1159 1151
1160 WARN_ON(smp_processor_id() != data); 1152 WARN_ON(smp_processor_id() != data);
1161 1153
1162 if (mce_available(&current_cpu_data)) { 1154 if (mce_available(__this_cpu_ptr(&cpu_info))) {
1163 machine_check_poll(MCP_TIMESTAMP, 1155 machine_check_poll(MCP_TIMESTAMP,
1164 &__get_cpu_var(mce_poll_banks)); 1156 &__get_cpu_var(mce_poll_banks));
1165 } 1157 }
@@ -1625,7 +1617,7 @@ out:
1625static unsigned int mce_poll(struct file *file, poll_table *wait) 1617static unsigned int mce_poll(struct file *file, poll_table *wait)
1626{ 1618{
1627 poll_wait(file, &mce_wait, wait); 1619 poll_wait(file, &mce_wait, wait);
1628 if (rcu_dereference_check_mce(mcelog.next)) 1620 if (rcu_access_index(mcelog.next))
1629 return POLLIN | POLLRDNORM; 1621 return POLLIN | POLLRDNORM;
1630 if (!mce_apei_read_done && apei_check_mce()) 1622 if (!mce_apei_read_done && apei_check_mce())
1631 return POLLIN | POLLRDNORM; 1623 return POLLIN | POLLRDNORM;
@@ -1665,6 +1657,7 @@ struct file_operations mce_chrdev_ops = {
1665 .read = mce_read, 1657 .read = mce_read,
1666 .poll = mce_poll, 1658 .poll = mce_poll,
1667 .unlocked_ioctl = mce_ioctl, 1659 .unlocked_ioctl = mce_ioctl,
1660 .llseek = no_llseek,
1668}; 1661};
1669EXPORT_SYMBOL_GPL(mce_chrdev_ops); 1662EXPORT_SYMBOL_GPL(mce_chrdev_ops);
1670 1663
@@ -1720,8 +1713,6 @@ __setup("mce", mcheck_enable);
1720 1713
1721int __init mcheck_init(void) 1714int __init mcheck_init(void)
1722{ 1715{
1723 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1724
1725 mcheck_intel_therm_init(); 1716 mcheck_intel_therm_init();
1726 1717
1727 return 0; 1718 return 0;
@@ -1748,14 +1739,14 @@ static int mce_disable_error_reporting(void)
1748 return 0; 1739 return 0;
1749} 1740}
1750 1741
1751static int mce_suspend(struct sys_device *dev, pm_message_t state) 1742static int mce_suspend(void)
1752{ 1743{
1753 return mce_disable_error_reporting(); 1744 return mce_disable_error_reporting();
1754} 1745}
1755 1746
1756static int mce_shutdown(struct sys_device *dev) 1747static void mce_shutdown(void)
1757{ 1748{
1758 return mce_disable_error_reporting(); 1749 mce_disable_error_reporting();
1759} 1750}
1760 1751
1761/* 1752/*
@@ -1763,18 +1754,22 @@ static int mce_shutdown(struct sys_device *dev)
1763 * Only one CPU is active at this time, the others get re-added later using 1754 * Only one CPU is active at this time, the others get re-added later using
1764 * CPU hotplug: 1755 * CPU hotplug:
1765 */ 1756 */
1766static int mce_resume(struct sys_device *dev) 1757static void mce_resume(void)
1767{ 1758{
1768 __mcheck_cpu_init_generic(); 1759 __mcheck_cpu_init_generic();
1769 __mcheck_cpu_init_vendor(&current_cpu_data); 1760 __mcheck_cpu_init_vendor(__this_cpu_ptr(&cpu_info));
1770
1771 return 0;
1772} 1761}
1773 1762
1763static struct syscore_ops mce_syscore_ops = {
1764 .suspend = mce_suspend,
1765 .shutdown = mce_shutdown,
1766 .resume = mce_resume,
1767};
1768
1774static void mce_cpu_restart(void *data) 1769static void mce_cpu_restart(void *data)
1775{ 1770{
1776 del_timer_sync(&__get_cpu_var(mce_timer)); 1771 del_timer_sync(&__get_cpu_var(mce_timer));
1777 if (!mce_available(&current_cpu_data)) 1772 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1778 return; 1773 return;
1779 __mcheck_cpu_init_generic(); 1774 __mcheck_cpu_init_generic();
1780 __mcheck_cpu_init_timer(); 1775 __mcheck_cpu_init_timer();
@@ -1789,7 +1784,7 @@ static void mce_restart(void)
1789/* Toggle features for corrected errors */ 1784/* Toggle features for corrected errors */
1790static void mce_disable_ce(void *all) 1785static void mce_disable_ce(void *all)
1791{ 1786{
1792 if (!mce_available(&current_cpu_data)) 1787 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1793 return; 1788 return;
1794 if (all) 1789 if (all)
1795 del_timer_sync(&__get_cpu_var(mce_timer)); 1790 del_timer_sync(&__get_cpu_var(mce_timer));
@@ -1798,7 +1793,7 @@ static void mce_disable_ce(void *all)
1798 1793
1799static void mce_enable_ce(void *all) 1794static void mce_enable_ce(void *all)
1800{ 1795{
1801 if (!mce_available(&current_cpu_data)) 1796 if (!mce_available(__this_cpu_ptr(&cpu_info)))
1802 return; 1797 return;
1803 cmci_reenable(); 1798 cmci_reenable();
1804 cmci_recheck(); 1799 cmci_recheck();
@@ -1807,9 +1802,6 @@ static void mce_enable_ce(void *all)
1807} 1802}
1808 1803
1809static struct sysdev_class mce_sysclass = { 1804static struct sysdev_class mce_sysclass = {
1810 .suspend = mce_suspend,
1811 .shutdown = mce_shutdown,
1812 .resume = mce_resume,
1813 .name = "machinecheck", 1805 .name = "machinecheck",
1814}; 1806};
1815 1807
@@ -2021,7 +2013,7 @@ static void __cpuinit mce_disable_cpu(void *h)
2021 unsigned long action = *(unsigned long *)h; 2013 unsigned long action = *(unsigned long *)h;
2022 int i; 2014 int i;
2023 2015
2024 if (!mce_available(&current_cpu_data)) 2016 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2025 return; 2017 return;
2026 2018
2027 if (!(action & CPU_TASKS_FROZEN)) 2019 if (!(action & CPU_TASKS_FROZEN))
@@ -2039,7 +2031,7 @@ static void __cpuinit mce_reenable_cpu(void *h)
2039 unsigned long action = *(unsigned long *)h; 2031 unsigned long action = *(unsigned long *)h;
2040 int i; 2032 int i;
2041 2033
2042 if (!mce_available(&current_cpu_data)) 2034 if (!mce_available(__this_cpu_ptr(&cpu_info)))
2043 return; 2035 return;
2044 2036
2045 if (!(action & CPU_TASKS_FROZEN)) 2037 if (!(action & CPU_TASKS_FROZEN))
@@ -2138,6 +2130,7 @@ static __init int mcheck_init_device(void)
2138 return err; 2130 return err;
2139 } 2131 }
2140 2132
2133 register_syscore_ops(&mce_syscore_ops);
2141 register_hotcpu_notifier(&mce_cpu_notifier); 2134 register_hotcpu_notifier(&mce_cpu_notifier);
2142 misc_register(&mce_log_device); 2135 misc_register(&mce_log_device);
2143 2136
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 39aaee5c1ab2..bb0adad35143 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -31,8 +31,6 @@
31#include <asm/mce.h> 31#include <asm/mce.h>
32#include <asm/msr.h> 32#include <asm/msr.h>
33 33
34#define PFX "mce_threshold: "
35#define VERSION "version 1.1.1"
36#define NR_BANKS 6 34#define NR_BANKS 6
37#define NR_BLOCKS 9 35#define NR_BLOCKS 9
38#define THRESHOLD_MAX 0xFFF 36#define THRESHOLD_MAX 0xFFF
@@ -59,12 +57,6 @@ struct threshold_block {
59 struct list_head miscj; 57 struct list_head miscj;
60}; 58};
61 59
62/* defaults used early on boot */
63static struct threshold_block threshold_defaults = {
64 .interrupt_enable = 0,
65 .threshold_limit = THRESHOLD_MAX,
66};
67
68struct threshold_bank { 60struct threshold_bank {
69 struct kobject *kobj; 61 struct kobject *kobj;
70 struct threshold_block *blocks; 62 struct threshold_block *blocks;
@@ -89,49 +81,101 @@ static void amd_threshold_interrupt(void);
89struct thresh_restart { 81struct thresh_restart {
90 struct threshold_block *b; 82 struct threshold_block *b;
91 int reset; 83 int reset;
84 int set_lvt_off;
85 int lvt_off;
92 u16 old_limit; 86 u16 old_limit;
93}; 87};
94 88
89static int lvt_off_valid(struct threshold_block *b, int apic, u32 lo, u32 hi)
90{
91 int msr = (hi & MASK_LVTOFF_HI) >> 20;
92
93 if (apic < 0) {
94 pr_err(FW_BUG "cpu %d, failed to setup threshold interrupt "
95 "for bank %d, block %d (MSR%08X=0x%x%08x)\n", b->cpu,
96 b->bank, b->block, b->address, hi, lo);
97 return 0;
98 }
99
100 if (apic != msr) {
101 pr_err(FW_BUG "cpu %d, invalid threshold interrupt offset %d "
102 "for bank %d, block %d (MSR%08X=0x%x%08x)\n",
103 b->cpu, apic, b->bank, b->block, b->address, hi, lo);
104 return 0;
105 }
106
107 return 1;
108};
109
95/* must be called with correct cpu affinity */ 110/* must be called with correct cpu affinity */
96/* Called via smp_call_function_single() */ 111/* Called via smp_call_function_single() */
97static void threshold_restart_bank(void *_tr) 112static void threshold_restart_bank(void *_tr)
98{ 113{
99 struct thresh_restart *tr = _tr; 114 struct thresh_restart *tr = _tr;
100 u32 mci_misc_hi, mci_misc_lo; 115 u32 hi, lo;
101 116
102 rdmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 117 rdmsr(tr->b->address, lo, hi);
103 118
104 if (tr->b->threshold_limit < (mci_misc_hi & THRESHOLD_MAX)) 119 if (tr->b->threshold_limit < (hi & THRESHOLD_MAX))
105 tr->reset = 1; /* limit cannot be lower than err count */ 120 tr->reset = 1; /* limit cannot be lower than err count */
106 121
107 if (tr->reset) { /* reset err count and overflow bit */ 122 if (tr->reset) { /* reset err count and overflow bit */
108 mci_misc_hi = 123 hi =
109 (mci_misc_hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) | 124 (hi & ~(MASK_ERR_COUNT_HI | MASK_OVERFLOW_HI)) |
110 (THRESHOLD_MAX - tr->b->threshold_limit); 125 (THRESHOLD_MAX - tr->b->threshold_limit);
111 } else if (tr->old_limit) { /* change limit w/o reset */ 126 } else if (tr->old_limit) { /* change limit w/o reset */
112 int new_count = (mci_misc_hi & THRESHOLD_MAX) + 127 int new_count = (hi & THRESHOLD_MAX) +
113 (tr->old_limit - tr->b->threshold_limit); 128 (tr->old_limit - tr->b->threshold_limit);
114 129
115 mci_misc_hi = (mci_misc_hi & ~MASK_ERR_COUNT_HI) | 130 hi = (hi & ~MASK_ERR_COUNT_HI) |
116 (new_count & THRESHOLD_MAX); 131 (new_count & THRESHOLD_MAX);
117 } 132 }
118 133
134 if (tr->set_lvt_off) {
135 if (lvt_off_valid(tr->b, tr->lvt_off, lo, hi)) {
136 /* set new lvt offset */
137 hi &= ~MASK_LVTOFF_HI;
138 hi |= tr->lvt_off << 20;
139 }
140 }
141
119 tr->b->interrupt_enable ? 142 tr->b->interrupt_enable ?
120 (mci_misc_hi = (mci_misc_hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) : 143 (hi = (hi & ~MASK_INT_TYPE_HI) | INT_TYPE_APIC) :
121 (mci_misc_hi &= ~MASK_INT_TYPE_HI); 144 (hi &= ~MASK_INT_TYPE_HI);
122 145
123 mci_misc_hi |= MASK_COUNT_EN_HI; 146 hi |= MASK_COUNT_EN_HI;
124 wrmsr(tr->b->address, mci_misc_lo, mci_misc_hi); 147 wrmsr(tr->b->address, lo, hi);
148}
149
150static void mce_threshold_block_init(struct threshold_block *b, int offset)
151{
152 struct thresh_restart tr = {
153 .b = b,
154 .set_lvt_off = 1,
155 .lvt_off = offset,
156 };
157
158 b->threshold_limit = THRESHOLD_MAX;
159 threshold_restart_bank(&tr);
160};
161
162static int setup_APIC_mce(int reserved, int new)
163{
164 if (reserved < 0 && !setup_APIC_eilvt(new, THRESHOLD_APIC_VECTOR,
165 APIC_EILVT_MSG_FIX, 0))
166 return new;
167
168 return reserved;
125} 169}
126 170
127/* cpu init entry point, called from mce.c with preempt off */ 171/* cpu init entry point, called from mce.c with preempt off */
128void mce_amd_feature_init(struct cpuinfo_x86 *c) 172void mce_amd_feature_init(struct cpuinfo_x86 *c)
129{ 173{
174 struct threshold_block b;
130 unsigned int cpu = smp_processor_id(); 175 unsigned int cpu = smp_processor_id();
131 u32 low = 0, high = 0, address = 0; 176 u32 low = 0, high = 0, address = 0;
132 unsigned int bank, block; 177 unsigned int bank, block;
133 struct thresh_restart tr; 178 int offset = -1;
134 u8 lvt_off;
135 179
136 for (bank = 0; bank < NR_BANKS; ++bank) { 180 for (bank = 0; bank < NR_BANKS; ++bank) {
137 for (block = 0; block < NR_BLOCKS; ++block) { 181 for (block = 0; block < NR_BLOCKS; ++block) {
@@ -162,19 +206,16 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
162 if (shared_bank[bank] && c->cpu_core_id) 206 if (shared_bank[bank] && c->cpu_core_id)
163 break; 207 break;
164#endif 208#endif
165 lvt_off = setup_APIC_eilvt_mce(THRESHOLD_APIC_VECTOR, 209 offset = setup_APIC_mce(offset,
166 APIC_EILVT_MSG_FIX, 0); 210 (high & MASK_LVTOFF_HI) >> 20);
167 211
168 high &= ~MASK_LVTOFF_HI; 212 memset(&b, 0, sizeof(b));
169 high |= lvt_off << 20; 213 b.cpu = cpu;
170 wrmsr(address, low, high); 214 b.bank = bank;
171 215 b.block = block;
172 threshold_defaults.address = address; 216 b.address = address;
173 tr.b = &threshold_defaults;
174 tr.reset = 0;
175 tr.old_limit = 0;
176 threshold_restart_bank(&tr);
177 217
218 mce_threshold_block_init(&b, offset);
178 mce_threshold_vector = amd_threshold_interrupt; 219 mce_threshold_vector = amd_threshold_interrupt;
179 } 220 }
180 } 221 }
@@ -277,9 +318,8 @@ store_interrupt_enable(struct threshold_block *b, const char *buf, size_t size)
277 318
278 b->interrupt_enable = !!new; 319 b->interrupt_enable = !!new;
279 320
321 memset(&tr, 0, sizeof(tr));
280 tr.b = b; 322 tr.b = b;
281 tr.reset = 0;
282 tr.old_limit = 0;
283 323
284 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 324 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
285 325
@@ -300,10 +340,10 @@ store_threshold_limit(struct threshold_block *b, const char *buf, size_t size)
300 if (new < 1) 340 if (new < 1)
301 new = 1; 341 new = 1;
302 342
343 memset(&tr, 0, sizeof(tr));
303 tr.old_limit = b->threshold_limit; 344 tr.old_limit = b->threshold_limit;
304 b->threshold_limit = new; 345 b->threshold_limit = new;
305 tr.b = b; 346 tr.b = b;
306 tr.reset = 0;
307 347
308 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1); 348 smp_call_function_single(b->cpu, threshold_restart_bank, &tr, 1);
309 349
@@ -469,6 +509,7 @@ recurse:
469out_free: 509out_free:
470 if (b) { 510 if (b) {
471 kobject_put(&b->kobj); 511 kobject_put(&b->kobj);
512 list_del(&b->miscj);
472 kfree(b); 513 kfree(b);
473 } 514 }
474 return err; 515 return err;
@@ -487,15 +528,12 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
487 int i, err = 0; 528 int i, err = 0;
488 struct threshold_bank *b = NULL; 529 struct threshold_bank *b = NULL;
489 char name[32]; 530 char name[32];
490#ifdef CONFIG_SMP
491 struct cpuinfo_x86 *c = &cpu_data(cpu);
492#endif
493 531
494 sprintf(name, "threshold_bank%i", bank); 532 sprintf(name, "threshold_bank%i", bank);
495 533
496#ifdef CONFIG_SMP 534#ifdef CONFIG_SMP
497 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ 535 if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */
498 i = cpumask_first(c->llc_shared_map); 536 i = cpumask_first(cpu_llc_shared_mask(cpu));
499 537
500 /* first core not up yet */ 538 /* first core not up yet */
501 if (cpu_data(i).cpu_core_id) 539 if (cpu_data(i).cpu_core_id)
@@ -515,7 +553,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank)
515 if (err) 553 if (err)
516 goto out; 554 goto out;
517 555
518 cpumask_copy(b->cpus, c->llc_shared_map); 556 cpumask_copy(b->cpus, cpu_llc_shared_mask(cpu));
519 per_cpu(threshold_banks, cpu)[bank] = b; 557 per_cpu(threshold_banks, cpu)[bank] = b;
520 558
521 goto out; 559 goto out;
@@ -582,9 +620,9 @@ static __cpuinit int threshold_create_device(unsigned int cpu)
582 continue; 620 continue;
583 err = threshold_create_bank(cpu, bank); 621 err = threshold_create_bank(cpu, bank);
584 if (err) 622 if (err)
585 goto out; 623 return err;
586 } 624 }
587out: 625
588 return err; 626 return err;
589} 627}
590 628
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 6fcd0936194f..8694ef56459d 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -130,7 +130,7 @@ void cmci_recheck(void)
130 unsigned long flags; 130 unsigned long flags;
131 int banks; 131 int banks;
132 132
133 if (!mce_available(&current_cpu_data) || !cmci_supported(&banks)) 133 if (!mce_available(__this_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
134 return; 134 return;
135 local_irq_save(flags); 135 local_irq_save(flags);
136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned)); 136 machine_check_poll(MCP_TIMESTAMP, &__get_cpu_var(mce_banks_owned));
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index 169d8804a9f8..27c625178bf1 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -53,8 +53,14 @@ struct thermal_state {
53 struct _thermal_state core_power_limit; 53 struct _thermal_state core_power_limit;
54 struct _thermal_state package_throttle; 54 struct _thermal_state package_throttle;
55 struct _thermal_state package_power_limit; 55 struct _thermal_state package_power_limit;
56 struct _thermal_state core_thresh0;
57 struct _thermal_state core_thresh1;
56}; 58};
57 59
60/* Callback to handle core threshold interrupts */
61int (*platform_thermal_notify)(__u64 msr_val);
62EXPORT_SYMBOL(platform_thermal_notify);
63
58static DEFINE_PER_CPU(struct thermal_state, thermal_state); 64static DEFINE_PER_CPU(struct thermal_state, thermal_state);
59 65
60static atomic_t therm_throt_en = ATOMIC_INIT(0); 66static atomic_t therm_throt_en = ATOMIC_INIT(0);
@@ -181,8 +187,6 @@ static int therm_throt_process(bool new_event, int event, int level)
181 this_cpu, 187 this_cpu,
182 level == CORE_LEVEL ? "Core" : "Package", 188 level == CORE_LEVEL ? "Core" : "Package",
183 state->count); 189 state->count);
184
185 add_taint(TAINT_MACHINE_CHECK);
186 return 1; 190 return 1;
187 } 191 }
188 if (old_event) { 192 if (old_event) {
@@ -200,6 +204,22 @@ static int therm_throt_process(bool new_event, int event, int level)
200 return 0; 204 return 0;
201} 205}
202 206
207static int thresh_event_valid(int event)
208{
209 struct _thermal_state *state;
210 unsigned int this_cpu = smp_processor_id();
211 struct thermal_state *pstate = &per_cpu(thermal_state, this_cpu);
212 u64 now = get_jiffies_64();
213
214 state = (event == 0) ? &pstate->core_thresh0 : &pstate->core_thresh1;
215
216 if (time_before64(now, state->next_check))
217 return 0;
218
219 state->next_check = now + CHECK_INTERVAL;
220 return 1;
221}
222
203#ifdef CONFIG_SYSFS 223#ifdef CONFIG_SYSFS
204/* Add/Remove thermal_throttle interface for CPU device: */ 224/* Add/Remove thermal_throttle interface for CPU device: */
205static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, 225static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev,
@@ -313,32 +333,50 @@ device_initcall(thermal_throttle_init_device);
313#define PACKAGE_THROTTLED ((__u64)2 << 62) 333#define PACKAGE_THROTTLED ((__u64)2 << 62)
314#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) 334#define PACKAGE_POWER_LIMIT ((__u64)3 << 62)
315 335
336static void notify_thresholds(__u64 msr_val)
337{
338 /* check whether the interrupt handler is defined;
339 * otherwise simply return
340 */
341 if (!platform_thermal_notify)
342 return;
343
344 /* lower threshold reached */
345 if ((msr_val & THERM_LOG_THRESHOLD0) && thresh_event_valid(0))
346 platform_thermal_notify(msr_val);
347 /* higher threshold reached */
348 if ((msr_val & THERM_LOG_THRESHOLD1) && thresh_event_valid(1))
349 platform_thermal_notify(msr_val);
350}
351
316/* Thermal transition interrupt handler */ 352/* Thermal transition interrupt handler */
317static void intel_thermal_interrupt(void) 353static void intel_thermal_interrupt(void)
318{ 354{
319 __u64 msr_val; 355 __u64 msr_val;
320 struct cpuinfo_x86 *c = &cpu_data(smp_processor_id());
321 356
322 rdmsrl(MSR_IA32_THERM_STATUS, msr_val); 357 rdmsrl(MSR_IA32_THERM_STATUS, msr_val);
323 358
359 /* Check for violation of core thermal thresholds*/
360 notify_thresholds(msr_val);
361
324 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, 362 if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT,
325 THERMAL_THROTTLING_EVENT, 363 THERMAL_THROTTLING_EVENT,
326 CORE_LEVEL) != 0) 364 CORE_LEVEL) != 0)
327 mce_log_therm_throt_event(CORE_THROTTLED | msr_val); 365 mce_log_therm_throt_event(CORE_THROTTLED | msr_val);
328 366
329 if (cpu_has(c, X86_FEATURE_PLN)) 367 if (this_cpu_has(X86_FEATURE_PLN))
330 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, 368 if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT,
331 POWER_LIMIT_EVENT, 369 POWER_LIMIT_EVENT,
332 CORE_LEVEL) != 0) 370 CORE_LEVEL) != 0)
333 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); 371 mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val);
334 372
335 if (cpu_has(c, X86_FEATURE_PTS)) { 373 if (this_cpu_has(X86_FEATURE_PTS)) {
336 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); 374 rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val);
337 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, 375 if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT,
338 THERMAL_THROTTLING_EVENT, 376 THERMAL_THROTTLING_EVENT,
339 PACKAGE_LEVEL) != 0) 377 PACKAGE_LEVEL) != 0)
340 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); 378 mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val);
341 if (cpu_has(c, X86_FEATURE_PLN)) 379 if (this_cpu_has(X86_FEATURE_PLN))
342 if (therm_throt_process(msr_val & 380 if (therm_throt_process(msr_val &
343 PACKAGE_THERM_STATUS_POWER_LIMIT, 381 PACKAGE_THERM_STATUS_POWER_LIMIT,
344 POWER_LIMIT_EVENT, 382 POWER_LIMIT_EVENT,
@@ -350,9 +388,8 @@ static void intel_thermal_interrupt(void)
350 388
351static void unexpected_thermal_interrupt(void) 389static void unexpected_thermal_interrupt(void)
352{ 390{
353 printk(KERN_ERR "CPU%d: Unexpected LVT TMR interrupt!\n", 391 printk(KERN_ERR "CPU%d: Unexpected LVT thermal interrupt!\n",
354 smp_processor_id()); 392 smp_processor_id());
355 add_taint(TAINT_MACHINE_CHECK);
356} 393}
357 394
358static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; 395static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt;
@@ -405,18 +442,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
405 */ 442 */
406 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 443 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
407 444
445 h = lvtthmr_init;
408 /* 446 /*
409 * The initial value of thermal LVT entries on all APs always reads 447 * The initial value of thermal LVT entries on all APs always reads
410 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI 448 * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
411 * sequence to them and LVT registers are reset to 0s except for 449 * sequence to them and LVT registers are reset to 0s except for
412 * the mask bits which are set to 1s when APs receive INIT IPI. 450 * the mask bits which are set to 1s when APs receive INIT IPI.
413 * Always restore the value that BIOS has programmed on AP based on 451 * If BIOS takes over the thermal interrupt and sets its interrupt
414 * BSP's info we saved since BIOS is always setting the same value 452 * delivery mode to SMI (not fixed), it restores the value that the
415 * for all threads/cores 453 * BIOS has programmed on AP based on BSP's info we saved since BIOS
454 * is always setting the same value for all threads/cores.
416 */ 455 */
417 apic_write(APIC_LVTTHMR, lvtthmr_init); 456 if ((h & APIC_DM_FIXED_MASK) != APIC_DM_FIXED)
457 apic_write(APIC_LVTTHMR, lvtthmr_init);
418 458
419 h = lvtthmr_init;
420 459
421 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 460 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
422 printk(KERN_DEBUG 461 printk(KERN_DEBUG
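
platform_thermal_notify is a single global hook: a thermal driver that wants the new core-threshold events assigns it and receives the raw IA32_THERM_STATUS value, already rate-limited per threshold by thresh_event_valid(). A hypothetical consumer is sketched below; the function names are made up, and only the hook itself and the THERM_LOG_THRESHOLD log bits come from this patch.

/* Hypothetical consumer of the platform_thermal_notify hook. */
static int my_core_thresh_event(__u64 msr_val)
{
	pr_info("core thermal threshold crossed, THERM_STATUS=0x%llx\n",
		(unsigned long long)msr_val);
	return 0;
}

static int __init my_thermal_init(void)
{
	platform_thermal_notify = my_core_thresh_event;
	return 0;
}

static void __exit my_thermal_exit(void)
{
	platform_thermal_notify = NULL;
}
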
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index c5f59d071425..ac140c7be396 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -827,7 +827,7 @@ int __init amd_special_default_mtrr(void)
827 827
828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) 828 if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
829 return 0; 829 return 0;
830 if (boot_cpu_data.x86 < 0xf || boot_cpu_data.x86 > 0x11) 830 if (boot_cpu_data.x86 < 0xf)
831 return 0; 831 return 0;
832 /* In case some hypervisor doesn't pass SYSCFG through: */ 832 /* In case some hypervisor doesn't pass SYSCFG through: */
833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0) 833 if (rdmsr_safe(MSR_K8_SYSCFG, &l, &h) < 0)
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 7d28d7d03885..a71efcdbb092 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -1,6 +1,6 @@
1/* 1/*
2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong 2 * This only handles 32bit MTRR on 32bit hosts. This is strictly wrong
3 * because MTRRs can span upto 40 bits (36bits on most modern x86) 3 * because MTRRs can span up to 40 bits (36bits on most modern x86)
4 */ 4 */
5#define DEBUG 5#define DEBUG
6 6
@@ -64,18 +64,59 @@ static inline void k8_check_syscfg_dram_mod_en(void)
64 } 64 }
65} 65}
66 66
67/* Get the size of contiguous MTRR range */
68static u64 get_mtrr_size(u64 mask)
69{
70 u64 size;
71
72 mask >>= PAGE_SHIFT;
73 mask |= size_or_mask;
74 size = -mask;
75 size <<= PAGE_SHIFT;
76 return size;
77}
78
67/* 79/*
68 * Returns the effective MTRR type for the region 80 * Check and return the effective type for MTRR-MTRR type overlap.
69 * Error returns: 81 * Returns 1 if the effective type is UNCACHEABLE, else returns 0
70 * - 0xFE - when the range is "not entirely covered" by _any_ var range MTRR
71 * - 0xFF - when MTRR is not enabled
72 */ 82 */
73u8 mtrr_type_lookup(u64 start, u64 end) 83static int check_type_overlap(u8 *prev, u8 *curr)
84{
85 if (*prev == MTRR_TYPE_UNCACHABLE || *curr == MTRR_TYPE_UNCACHABLE) {
86 *prev = MTRR_TYPE_UNCACHABLE;
87 *curr = MTRR_TYPE_UNCACHABLE;
88 return 1;
89 }
90
91 if ((*prev == MTRR_TYPE_WRBACK && *curr == MTRR_TYPE_WRTHROUGH) ||
92 (*prev == MTRR_TYPE_WRTHROUGH && *curr == MTRR_TYPE_WRBACK)) {
93 *prev = MTRR_TYPE_WRTHROUGH;
94 *curr = MTRR_TYPE_WRTHROUGH;
95 }
96
97 if (*prev != *curr) {
98 *prev = MTRR_TYPE_UNCACHABLE;
99 *curr = MTRR_TYPE_UNCACHABLE;
100 return 1;
101 }
102
103 return 0;
104}
105
106/*
107 * Error/Semi-error returns:
108 * 0xFF - when MTRR is not enabled
109 * *repeat == 1 implies [start:end] spanned across MTRR range and type returned
110 * corresponds only to [start:*partial_end].
111 * Caller has to lookup again for [*partial_end:end].
112 */
113static u8 __mtrr_type_lookup(u64 start, u64 end, u64 *partial_end, int *repeat)
74{ 114{
75 int i; 115 int i;
76 u64 base, mask; 116 u64 base, mask;
77 u8 prev_match, curr_match; 117 u8 prev_match, curr_match;
78 118
119 *repeat = 0;
79 if (!mtrr_state_set) 120 if (!mtrr_state_set)
80 return 0xFF; 121 return 0xFF;
81 122
@@ -126,8 +167,34 @@ u8 mtrr_type_lookup(u64 start, u64 end)
126 167
127 start_state = ((start & mask) == (base & mask)); 168 start_state = ((start & mask) == (base & mask));
128 end_state = ((end & mask) == (base & mask)); 169 end_state = ((end & mask) == (base & mask));
129 if (start_state != end_state) 170
130 return 0xFE; 171 if (start_state != end_state) {
172 /*
173 * We have start:end spanning across an MTRR.
174 * We split the region into
175 * either
176 * (start:mtrr_end) (mtrr_end:end)
177 * or
178 * (start:mtrr_start) (mtrr_start:end)
179 * depending on kind of overlap.
180 * Return the type for first region and a pointer to
181 * the start of second region so that caller will
182 * lookup again on the second region.
183 * Note: This way we handle multiple overlaps as well.
184 */
185 if (start_state)
186 *partial_end = base + get_mtrr_size(mask);
187 else
188 *partial_end = base;
189
190 if (unlikely(*partial_end <= start)) {
191 WARN_ON(1);
192 *partial_end = start + PAGE_SIZE;
193 }
194
195 end = *partial_end - 1; /* end is inclusive */
196 *repeat = 1;
197 }
131 198
132 if ((start & mask) != (base & mask)) 199 if ((start & mask) != (base & mask))
133 continue; 200 continue;
@@ -138,21 +205,8 @@ u8 mtrr_type_lookup(u64 start, u64 end)
138 continue; 205 continue;
139 } 206 }
140 207
141 if (prev_match == MTRR_TYPE_UNCACHABLE || 208 if (check_type_overlap(&prev_match, &curr_match))
142 curr_match == MTRR_TYPE_UNCACHABLE) { 209 return curr_match;
143 return MTRR_TYPE_UNCACHABLE;
144 }
145
146 if ((prev_match == MTRR_TYPE_WRBACK &&
147 curr_match == MTRR_TYPE_WRTHROUGH) ||
148 (prev_match == MTRR_TYPE_WRTHROUGH &&
149 curr_match == MTRR_TYPE_WRBACK)) {
150 prev_match = MTRR_TYPE_WRTHROUGH;
151 curr_match = MTRR_TYPE_WRTHROUGH;
152 }
153
154 if (prev_match != curr_match)
155 return MTRR_TYPE_UNCACHABLE;
156 } 210 }
157 211
158 if (mtrr_tom2) { 212 if (mtrr_tom2) {
@@ -166,6 +220,36 @@ u8 mtrr_type_lookup(u64 start, u64 end)
166 return mtrr_state.def_type; 220 return mtrr_state.def_type;
167} 221}
168 222
223/*
224 * Returns the effective MTRR type for the region
225 * Error return:
226 * 0xFF - when MTRR is not enabled
227 */
228u8 mtrr_type_lookup(u64 start, u64 end)
229{
230 u8 type, prev_type;
231 int repeat;
232 u64 partial_end;
233
234 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
235
236 /*
237 * Common path is with repeat = 0.
238 * However, we can have cases where [start:end] spans across some
239 * MTRR range. Do repeated lookups for that case here.
240 */
241 while (repeat) {
242 prev_type = type;
243 start = partial_end;
244 type = __mtrr_type_lookup(start, end, &partial_end, &repeat);
245
246 if (check_type_overlap(&prev_type, &type))
247 return type;
248 }
249
250 return type;
251}
252
169/* Get the MSR pair relating to a var range */ 253/* Get the MSR pair relating to a var range */
170static void 254static void
171get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr) 255get_mtrr_var_range(unsigned int index, struct mtrr_var_range *vr)
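
To make the new split-and-repeat lookup concrete: with an uncacheable default type and a single write-back variable MTRR whose PHYSMASK describes 128 MB, a [start:end] range that begins inside the MTRR and ends past it is resolved in two passes, WRBACK for the covered prefix (with *repeat set and *partial_end placed at the MTRR's end via get_mtrr_size()), then the default type for the remainder, after which check_type_overlap() folds the pair down to UNCACHABLE. The size arithmetic in get_mtrr_size() can be checked with the standalone simulation below, which assumes a 36-bit physical address space and is not kernel code.

/* Standalone simulation of get_mtrr_size() from the hunk above, assuming a
 * 36-bit physical address space (size_or_mask kept in page units, as in the
 * kernel).  Compile with any C compiler; prints "128 MB". */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
static const uint64_t size_or_mask = ~((1ULL << (36 - PAGE_SHIFT)) - 1);

static uint64_t get_mtrr_size(uint64_t mask)
{
	uint64_t size;

	mask >>= PAGE_SHIFT;
	mask |= size_or_mask;
	size = -mask;			/* isolates the low zero bits of the mask */
	size <<= PAGE_SHIFT;
	return size;
}

int main(void)
{
	/* PHYSMASK 0xFF8000000 covers a 128 MB region: */
	printf("%llu MB\n",
	       (unsigned long long)(get_mtrr_size(0xFF8000000ULL) >> 20));
	return 0;
}
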
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 01c0f3ee6cc3..929739a653d1 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -45,6 +45,7 @@
45#include <linux/cpu.h> 45#include <linux/cpu.h>
46#include <linux/pci.h> 46#include <linux/pci.h>
47#include <linux/smp.h> 47#include <linux/smp.h>
48#include <linux/syscore_ops.h>
48 49
49#include <asm/processor.h> 50#include <asm/processor.h>
50#include <asm/e820.h> 51#include <asm/e820.h>
@@ -292,14 +293,24 @@ set_mtrr(unsigned int reg, unsigned long base, unsigned long size, mtrr_type typ
292 293
293 /* 294 /*
294 * HACK! 295 * HACK!
295 * We use this same function to initialize the mtrrs on boot. 296 *
296 * The state of the boot cpu's mtrrs has been saved, and we want 297 * We use this same function to initialize the mtrrs during boot,
297 * to replicate across all the APs. 298 * resume, runtime cpu online and on an explicit request to set a
298 * If we're doing that @reg is set to something special... 299 * specific MTRR.
300 *
301 * During boot or suspend, the state of the boot cpu's mtrrs has been
302 * saved, and we want to replicate that across all the cpus that come
303 * online (either at the end of boot or resume or during a runtime cpu
304 * online). If we're doing that, @reg is set to something special and on
305 * this cpu we still do mtrr_if->set_all(). During boot/resume, this
306 * is unnecessary if at this point we are still on the cpu that started
307 * the boot/resume sequence. But there is no guarantee that we are still
308 * on the same cpu. So we do mtrr_if->set_all() on this cpu as well to be
309 * sure that we are in sync with everyone else.
299 */ 310 */
300 if (reg != ~0U) 311 if (reg != ~0U)
301 mtrr_if->set(reg, base, size, type); 312 mtrr_if->set(reg, base, size, type);
302 else if (!mtrr_aps_delayed_init) 313 else
303 mtrr_if->set_all(); 314 mtrr_if->set_all();
304 315
305 /* Wait for the others */ 316 /* Wait for the others */
@@ -630,7 +641,7 @@ struct mtrr_value {
630 641
631static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES]; 642static struct mtrr_value mtrr_value[MTRR_MAX_VAR_RANGES];
632 643
633static int mtrr_save(struct sys_device *sysdev, pm_message_t state) 644static int mtrr_save(void)
634{ 645{
635 int i; 646 int i;
636 647
@@ -642,7 +653,7 @@ static int mtrr_save(struct sys_device *sysdev, pm_message_t state)
642 return 0; 653 return 0;
643} 654}
644 655
645static int mtrr_restore(struct sys_device *sysdev) 656static void mtrr_restore(void)
646{ 657{
647 int i; 658 int i;
648 659
@@ -653,12 +664,11 @@ static int mtrr_restore(struct sys_device *sysdev)
653 mtrr_value[i].ltype); 664 mtrr_value[i].ltype);
654 } 665 }
655 } 666 }
656 return 0;
657} 667}
658 668
659 669
660 670
661static struct sysdev_driver mtrr_sysdev_driver = { 671static struct syscore_ops mtrr_syscore_ops = {
662 .suspend = mtrr_save, 672 .suspend = mtrr_save,
663 .resume = mtrr_restore, 673 .resume = mtrr_restore,
664}; 674};
@@ -793,13 +803,21 @@ void set_mtrr_aps_delayed_init(void)
793} 803}
794 804
795/* 805/*
796 * MTRR initialization for all AP's 806 * Delayed MTRR initialization for all AP's
797 */ 807 */
798void mtrr_aps_init(void) 808void mtrr_aps_init(void)
799{ 809{
800 if (!use_intel()) 810 if (!use_intel())
801 return; 811 return;
802 812
813 /*
814 * Check if someone has requested the delay of AP MTRR initialization,
815 * by doing set_mtrr_aps_delayed_init(), prior to this point. If not,
816 * then we are done.
817 */
818 if (!mtrr_aps_delayed_init)
819 return;
820
803 set_mtrr(~0U, 0, 0, 0); 821 set_mtrr(~0U, 0, 0, 0);
804 mtrr_aps_delayed_init = false; 822 mtrr_aps_delayed_init = false;
805} 823}
@@ -831,7 +849,7 @@ static int __init mtrr_init_finialize(void)
831 * TBD: is there any system with such CPU which supports 849 * TBD: is there any system with such CPU which supports
832 * suspend/resume? If no, we should remove the code. 850 * suspend/resume? If no, we should remove the code.
833 */ 851 */
834 sysdev_driver_register(&cpu_sysdev_class, &mtrr_sysdev_driver); 852 register_syscore_ops(&mtrr_syscore_ops);
835 853
836 return 0; 854 return 0;
837} 855}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 03a5b0385ad6..3a0338b4b179 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -30,6 +30,8 @@
30#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
31#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h> 32#include <asm/compat.h>
33#include <asm/smp.h>
34#include <asm/alternative.h>
33 35
34#if 0 36#if 0
35#undef wrmsrl 37#undef wrmsrl
@@ -49,7 +51,6 @@ static unsigned long
49copy_from_user_nmi(void *to, const void __user *from, unsigned long n) 51copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
50{ 52{
51 unsigned long offset, addr = (unsigned long)from; 53 unsigned long offset, addr = (unsigned long)from;
52 int type = in_nmi() ? KM_NMI : KM_IRQ0;
53 unsigned long size, len = 0; 54 unsigned long size, len = 0;
54 struct page *page; 55 struct page *page;
55 void *map; 56 void *map;
@@ -63,9 +64,9 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
63 offset = addr & (PAGE_SIZE - 1); 64 offset = addr & (PAGE_SIZE - 1);
64 size = min(PAGE_SIZE - offset, n - len); 65 size = min(PAGE_SIZE - offset, n - len);
65 66
66 map = kmap_atomic(page, type); 67 map = kmap_atomic(page);
67 memcpy(to, map+offset, size); 68 memcpy(to, map+offset, size);
68 kunmap_atomic(map, type); 69 kunmap_atomic(map);
69 put_page(page); 70 put_page(page);
70 71
71 len += size; 72 len += size;
@@ -94,6 +95,8 @@ struct amd_nb {
94 struct event_constraint event_constraints[X86_PMC_IDX_MAX]; 95 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
95}; 96};
96 97
98struct intel_percore;
99
97#define MAX_LBR_ENTRIES 16 100#define MAX_LBR_ENTRIES 16
98 101
99struct cpu_hw_events { 102struct cpu_hw_events {
@@ -129,6 +132,13 @@ struct cpu_hw_events {
129 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; 132 struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES];
130 133
131 /* 134 /*
135 * Intel percore register state.
136 * Coordinate shared resources between HT threads.
137 */
138 int percore_used; /* Used by this CPU? */
139 struct intel_percore *per_core;
140
141 /*
132 * AMD specific bits 142 * AMD specific bits
133 */ 143 */
134 struct amd_nb *amd_nb; 144 struct amd_nb *amd_nb;
@@ -167,7 +177,7 @@ struct cpu_hw_events {
167/* 177/*
168 * Constraint on the Event code + UMask 178 * Constraint on the Event code + UMask
169 */ 179 */
170#define PEBS_EVENT_CONSTRAINT(c, n) \ 180#define INTEL_UEVENT_CONSTRAINT(c, n) \
171 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK) 181 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVENT_MASK)
172 182
173#define EVENT_CONSTRAINT_END \ 183#define EVENT_CONSTRAINT_END \
@@ -176,6 +186,28 @@ struct cpu_hw_events {
176#define for_each_event_constraint(e, c) \ 186#define for_each_event_constraint(e, c) \
177 for ((e) = (c); (e)->weight; (e)++) 187 for ((e) = (c); (e)->weight; (e)++)
178 188
189/*
190 * Extra registers for specific events.
191 * Some events need large masks and require external MSRs.
192 * Define a mapping to these extra registers.
193 */
194struct extra_reg {
195 unsigned int event;
196 unsigned int msr;
197 u64 config_mask;
198 u64 valid_mask;
199};
200
201#define EVENT_EXTRA_REG(e, ms, m, vm) { \
202 .event = (e), \
203 .msr = (ms), \
204 .config_mask = (m), \
205 .valid_mask = (vm), \
206 }
207#define INTEL_EVENT_EXTRA_REG(event, msr, vm) \
208 EVENT_EXTRA_REG(event, msr, ARCH_PERFMON_EVENTSEL_EVENT, vm)
209#define EVENT_EXTRA_END EVENT_EXTRA_REG(0, 0, 0, 0)
210
179union perf_capabilities { 211union perf_capabilities {
180 struct { 212 struct {
181 u64 lbr_format : 6; 213 u64 lbr_format : 6;
@@ -220,6 +252,7 @@ struct x86_pmu {
220 void (*put_event_constraints)(struct cpu_hw_events *cpuc, 252 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
221 struct perf_event *event); 253 struct perf_event *event);
222 struct event_constraint *event_constraints; 254 struct event_constraint *event_constraints;
255 struct event_constraint *percore_constraints;
223 void (*quirks)(void); 256 void (*quirks)(void);
224 int perfctr_second_write; 257 int perfctr_second_write;
225 258
@@ -238,6 +271,7 @@ struct x86_pmu {
238 * Intel DebugStore bits 271 * Intel DebugStore bits
239 */ 272 */
240 int bts, pebs; 273 int bts, pebs;
274 int bts_active, pebs_active;
241 int pebs_record_size; 275 int pebs_record_size;
242 void (*drain_pebs)(struct pt_regs *regs); 276 void (*drain_pebs)(struct pt_regs *regs);
243 struct event_constraint *pebs_constraints; 277 struct event_constraint *pebs_constraints;
@@ -247,6 +281,11 @@ struct x86_pmu {
247 */ 281 */
248 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */ 282 unsigned long lbr_tos, lbr_from, lbr_to; /* MSR base regs */
249 int lbr_nr; /* hardware stack size */ 283 int lbr_nr; /* hardware stack size */
284
285 /*
286 * Extra registers for events
287 */
288 struct extra_reg *extra_regs;
250}; 289};
251 290
252static struct x86_pmu x86_pmu __read_mostly; 291static struct x86_pmu x86_pmu __read_mostly;
@@ -271,6 +310,10 @@ static u64 __read_mostly hw_cache_event_ids
271 [PERF_COUNT_HW_CACHE_MAX] 310 [PERF_COUNT_HW_CACHE_MAX]
272 [PERF_COUNT_HW_CACHE_OP_MAX] 311 [PERF_COUNT_HW_CACHE_OP_MAX]
273 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 312 [PERF_COUNT_HW_CACHE_RESULT_MAX];
313static u64 __read_mostly hw_cache_extra_regs
314 [PERF_COUNT_HW_CACHE_MAX]
315 [PERF_COUNT_HW_CACHE_OP_MAX]
316 [PERF_COUNT_HW_CACHE_RESULT_MAX];
274 317
275/* 318/*
276 * Propagate event elapsed time into the generic event. 319 * Propagate event elapsed time into the generic event.
@@ -298,7 +341,7 @@ x86_perf_event_update(struct perf_event *event)
298 */ 341 */
299again: 342again:
300 prev_raw_count = local64_read(&hwc->prev_count); 343 prev_raw_count = local64_read(&hwc->prev_count);
301 rdmsrl(hwc->event_base + idx, new_raw_count); 344 rdmsrl(hwc->event_base, new_raw_count);
302 345
303 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count, 346 if (local64_cmpxchg(&hwc->prev_count, prev_raw_count,
304 new_raw_count) != prev_raw_count) 347 new_raw_count) != prev_raw_count)
@@ -321,6 +364,55 @@ again:
321 return new_raw_count; 364 return new_raw_count;
322} 365}
323 366
367static inline int x86_pmu_addr_offset(int index)
368{
369 int offset;
370
371 /* offset = X86_FEATURE_PERFCTR_CORE ? index << 1 : index */
372 alternative_io(ASM_NOP2,
373 "shll $1, %%eax",
374 X86_FEATURE_PERFCTR_CORE,
375 "=a" (offset),
376 "a" (index));
377
378 return offset;
379}
380
381static inline unsigned int x86_pmu_config_addr(int index)
382{
383 return x86_pmu.eventsel + x86_pmu_addr_offset(index);
384}
385
386static inline unsigned int x86_pmu_event_addr(int index)
387{
388 return x86_pmu.perfctr + x86_pmu_addr_offset(index);
389}
390
391/*
392 * Find and validate any extra registers to set up.
393 */
394static int x86_pmu_extra_regs(u64 config, struct perf_event *event)
395{
396 struct extra_reg *er;
397
398 event->hw.extra_reg = 0;
399 event->hw.extra_config = 0;
400
401 if (!x86_pmu.extra_regs)
402 return 0;
403
404 for (er = x86_pmu.extra_regs; er->msr; er++) {
405 if (er->event != (config & er->config_mask))
406 continue;
407 if (event->attr.config1 & ~er->valid_mask)
408 return -EINVAL;
409 event->hw.extra_reg = er->msr;
410 event->hw.extra_config = event->attr.config1;
411 break;
412 }
413 return 0;
414}
415
324static atomic_t active_events; 416static atomic_t active_events;
325static DEFINE_MUTEX(pmc_reserve_mutex); 417static DEFINE_MUTEX(pmc_reserve_mutex);
326 418
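
The new extra_reg mapping means userland supplies the extra MSR contents through perf_event_attr.config1; x86_pmu_extra_regs() validates that value against valid_mask and stashes it in hw.extra_reg/extra_config. A hedged userspace sketch using perf_event_open() follows; the raw config/config1 values are placeholders, not taken from this patch, and must match whatever offcore-style event the CPU actually documents.

/* Userspace sketch: programming an event that needs an extra register via
 * attr.config1.  RAW_EVENT and RAW_EXTRA are placeholders; consult the
 * CPU documentation for real offcore-response encodings. */
#include <linux/perf_event.h>
#include <sys/syscall.h>
#include <string.h>
#include <unistd.h>
#include <stdio.h>

#define RAW_EVENT 0x01b7	/* placeholder event/umask */
#define RAW_EXTRA 0x0001	/* placeholder extra-MSR value */

int main(void)
{
	struct perf_event_attr attr;
	long long count;
	int fd;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = PERF_TYPE_RAW;
	attr.config = RAW_EVENT;
	attr.config1 = RAW_EXTRA;	/* routed to the extra MSR by the kernel */

	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	read(fd, &count, sizeof(count));
	printf("count: %lld\n", count);
	close(fd);
	return 0;
}
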
@@ -330,16 +422,13 @@ static bool reserve_pmc_hardware(void)
330{ 422{
331 int i; 423 int i;
332 424
333 if (nmi_watchdog == NMI_LOCAL_APIC)
334 disable_lapic_nmi_watchdog();
335
336 for (i = 0; i < x86_pmu.num_counters; i++) { 425 for (i = 0; i < x86_pmu.num_counters; i++) {
337 if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) 426 if (!reserve_perfctr_nmi(x86_pmu_event_addr(i)))
338 goto perfctr_fail; 427 goto perfctr_fail;
339 } 428 }
340 429
341 for (i = 0; i < x86_pmu.num_counters; i++) { 430 for (i = 0; i < x86_pmu.num_counters; i++) {
342 if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) 431 if (!reserve_evntsel_nmi(x86_pmu_config_addr(i)))
343 goto eventsel_fail; 432 goto eventsel_fail;
344 } 433 }
345 434
@@ -347,16 +436,13 @@ static bool reserve_pmc_hardware(void)
347 436
348eventsel_fail: 437eventsel_fail:
349 for (i--; i >= 0; i--) 438 for (i--; i >= 0; i--)
350 release_evntsel_nmi(x86_pmu.eventsel + i); 439 release_evntsel_nmi(x86_pmu_config_addr(i));
351 440
352 i = x86_pmu.num_counters; 441 i = x86_pmu.num_counters;
353 442
354perfctr_fail: 443perfctr_fail:
355 for (i--; i >= 0; i--) 444 for (i--; i >= 0; i--)
356 release_perfctr_nmi(x86_pmu.perfctr + i); 445 release_perfctr_nmi(x86_pmu_event_addr(i));
357
358 if (nmi_watchdog == NMI_LOCAL_APIC)
359 enable_lapic_nmi_watchdog();
360 446
361 return false; 447 return false;
362} 448}
@@ -366,12 +452,9 @@ static void release_pmc_hardware(void)
366 int i; 452 int i;
367 453
368 for (i = 0; i < x86_pmu.num_counters; i++) { 454 for (i = 0; i < x86_pmu.num_counters; i++) {
369 release_perfctr_nmi(x86_pmu.perfctr + i); 455 release_perfctr_nmi(x86_pmu_event_addr(i));
370 release_evntsel_nmi(x86_pmu.eventsel + i); 456 release_evntsel_nmi(x86_pmu_config_addr(i));
371 } 457 }
372
373 if (nmi_watchdog == NMI_LOCAL_APIC)
374 enable_lapic_nmi_watchdog();
375} 458}
376 459
377#else 460#else
@@ -381,7 +464,64 @@ static void release_pmc_hardware(void) {}
381 464
382#endif 465#endif
383 466
384static int reserve_ds_buffers(void); 467static bool check_hw_exists(void)
468{
469 u64 val, val_new = 0;
470 int i, reg, ret = 0;
471
472 /*
473 * Check to see if the BIOS enabled any of the counters, if so
474 * complain and bail.
475 */
476 for (i = 0; i < x86_pmu.num_counters; i++) {
477 reg = x86_pmu_config_addr(i);
478 ret = rdmsrl_safe(reg, &val);
479 if (ret)
480 goto msr_fail;
481 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
482 goto bios_fail;
483 }
484
485 if (x86_pmu.num_counters_fixed) {
486 reg = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
487 ret = rdmsrl_safe(reg, &val);
488 if (ret)
489 goto msr_fail;
490 for (i = 0; i < x86_pmu.num_counters_fixed; i++) {
491 if (val & (0x03 << i*4))
492 goto bios_fail;
493 }
494 }
495
496 /*
497 * Now write a value and read it back to see if it matches,
498 * this is needed to detect certain hardware emulators (qemu/kvm)
499 * that don't trap on the MSR access and always return 0s.
500 */
501 val = 0xabcdUL;
502 ret = checking_wrmsrl(x86_pmu_event_addr(0), val);
503 ret |= rdmsrl_safe(x86_pmu_event_addr(0), &val_new);
504 if (ret || val != val_new)
505 goto msr_fail;
506
507 return true;
508
509bios_fail:
510 /*
511 * We still allow the PMU driver to operate:
512 */
513 printk(KERN_CONT "Broken BIOS detected, complain to your hardware vendor.\n");
514 printk(KERN_ERR FW_BUG "the BIOS has corrupted hw-PMU resources (MSR %x is %Lx)\n", reg, val);
515
516 return true;
517
518msr_fail:
519 printk(KERN_CONT "Broken PMU hardware detected, using software events only.\n");
520
521 return false;
522}
523
524static void reserve_ds_buffers(void);
385static void release_ds_buffers(void); 525static void release_ds_buffers(void);
386 526
387static void hw_perf_event_destroy(struct perf_event *event) 527static void hw_perf_event_destroy(struct perf_event *event)
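
check_hw_exists() above has two parts: refuse counters the BIOS left enabled, then write a magic value into counter 0 and read it back so emulators that silently ignore PMU MSRs are caught. A hedged sketch of the probe shape, with the MSR accessors stubbed since real rdmsr/wrmsr needs ring 0 (the 0xC1 counter MSR number is an assumption for illustration):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-ins for rdmsrl_safe()/checking_wrmsrl(); the real helpers trap the
 * #GP fault a missing MSR would raise instead of crashing. */
static uint64_t fake_msr;
static int wrmsr_safe_stub(uint32_t msr, uint64_t val)  { (void)msr; fake_msr = val; return 0; }
static int rdmsr_safe_stub(uint32_t msr, uint64_t *val) { (void)msr; *val = fake_msr; return 0; }

static bool pmu_seems_real(uint32_t counter0_msr)
{
	uint64_t val = 0xabcdULL, val_new = 0;
	int ret;

	ret  = wrmsr_safe_stub(counter0_msr, val);
	ret |= rdmsr_safe_stub(counter0_msr, &val_new);

	/* an emulator that swallows the write (or faults) fails here */
	return !ret && val == val_new;
}

int main(void)
{
	printf("PMU %s\n", pmu_seems_real(0xC1) ?
	       "looks real" : "broken, software events only");
	return 0;
}
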
@@ -399,8 +539,9 @@ static inline int x86_pmu_initialized(void)
399} 539}
400 540
401static inline int 541static inline int
402set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr) 542set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event *event)
403{ 543{
544 struct perf_event_attr *attr = &event->attr;
404 unsigned int cache_type, cache_op, cache_result; 545 unsigned int cache_type, cache_op, cache_result;
405 u64 config, val; 546 u64 config, val;
406 547
@@ -427,8 +568,8 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
427 return -EINVAL; 568 return -EINVAL;
428 569
429 hwc->config |= val; 570 hwc->config |= val;
430 571 attr->config1 = hw_cache_extra_regs[cache_type][cache_op][cache_result];
431 return 0; 572 return x86_pmu_extra_regs(val, event);
432} 573}
433 574
434static int x86_setup_perfctr(struct perf_event *event) 575static int x86_setup_perfctr(struct perf_event *event)
@@ -437,7 +578,7 @@ static int x86_setup_perfctr(struct perf_event *event)
437 struct hw_perf_event *hwc = &event->hw; 578 struct hw_perf_event *hwc = &event->hw;
438 u64 config; 579 u64 config;
439 580
440 if (!hwc->sample_period) { 581 if (!is_sampling_event(event)) {
441 hwc->sample_period = x86_pmu.max_period; 582 hwc->sample_period = x86_pmu.max_period;
442 hwc->last_period = hwc->sample_period; 583 hwc->last_period = hwc->sample_period;
443 local64_set(&hwc->period_left, hwc->sample_period); 584 local64_set(&hwc->period_left, hwc->sample_period);
@@ -452,11 +593,15 @@ static int x86_setup_perfctr(struct perf_event *event)
452 return -EOPNOTSUPP; 593 return -EOPNOTSUPP;
453 } 594 }
454 595
596 /*
597 * Do not allow config1 (extended registers) to propagate,
598 * there's no sane user-space generalization yet:
599 */
455 if (attr->type == PERF_TYPE_RAW) 600 if (attr->type == PERF_TYPE_RAW)
456 return 0; 601 return 0;
457 602
458 if (attr->type == PERF_TYPE_HW_CACHE) 603 if (attr->type == PERF_TYPE_HW_CACHE)
459 return set_ext_hw_attr(hwc, attr); 604 return set_ext_hw_attr(hwc, event);
460 605
461 if (attr->config >= x86_pmu.max_events) 606 if (attr->config >= x86_pmu.max_events)
462 return -EINVAL; 607 return -EINVAL;
@@ -475,10 +620,10 @@ static int x86_setup_perfctr(struct perf_event *event)
475 /* 620 /*
476 * Branch tracing: 621 * Branch tracing:
477 */ 622 */
478 if ((attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS) && 623 if (attr->config == PERF_COUNT_HW_BRANCH_INSTRUCTIONS &&
479 (hwc->sample_period == 1)) { 624 !attr->freq && hwc->sample_period == 1) {
480 /* BTS is not supported by this architecture. */ 625 /* BTS is not supported by this architecture. */
481 if (!x86_pmu.bts) 626 if (!x86_pmu.bts_active)
482 return -EOPNOTSUPP; 627 return -EOPNOTSUPP;
483 628
484 /* BTS is currently only allowed for user-mode. */ 629 /* BTS is currently only allowed for user-mode. */
@@ -497,12 +642,13 @@ static int x86_pmu_hw_config(struct perf_event *event)
497 int precise = 0; 642 int precise = 0;
498 643
499 /* Support for constant skid */ 644 /* Support for constant skid */
500 if (x86_pmu.pebs) 645 if (x86_pmu.pebs_active) {
501 precise++; 646 precise++;
502 647
503 /* Support for IP fixup */ 648 /* Support for IP fixup */
504 if (x86_pmu.lbr_nr) 649 if (x86_pmu.lbr_nr)
505 precise++; 650 precise++;
651 }
506 652
507 if (event->attr.precise_ip > precise) 653 if (event->attr.precise_ip > precise)
508 return -EOPNOTSUPP; 654 return -EOPNOTSUPP;
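
The precise_ip cap now depends on what is actually active rather than on static capability flags: one level for PEBS (constant skid), one more when LBR entries are available for the IP fixup. A tiny illustrative sketch of that rule:

#include <stdio.h>

/* 0 = no precise sampling, 1 = constant skid (PEBS),
 * 2 = skid fixed up via LBR on top of PEBS */
static int max_precise_level(int pebs_active, int lbr_nr)
{
	int precise = 0;

	if (pebs_active) {
		precise++;
		if (lbr_nr)
			precise++;
	}
	return precise;
}

int main(void)
{
	int requested = 2;
	int allowed = max_precise_level(1, 16);	/* assume PEBS plus 16 LBR entries */

	printf("precise_ip=%d -> %s\n", requested,
	       requested > allowed ? "-EOPNOTSUPP" : "accepted");
	return 0;
}
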
@@ -531,7 +677,7 @@ static int x86_pmu_hw_config(struct perf_event *event)
531/* 677/*
532 * Setup the hardware configuration for a given attr_type 678 * Setup the hardware configuration for a given attr_type
533 */ 679 */
534static int __hw_perf_event_init(struct perf_event *event) 680static int __x86_pmu_event_init(struct perf_event *event)
535{ 681{
536 int err; 682 int err;
537 683
@@ -544,11 +690,8 @@ static int __hw_perf_event_init(struct perf_event *event)
544 if (atomic_read(&active_events) == 0) { 690 if (atomic_read(&active_events) == 0) {
545 if (!reserve_pmc_hardware()) 691 if (!reserve_pmc_hardware())
546 err = -EBUSY; 692 err = -EBUSY;
547 else { 693 else
548 err = reserve_ds_buffers(); 694 reserve_ds_buffers();
549 if (err)
550 release_pmc_hardware();
551 }
552 } 695 }
553 if (!err) 696 if (!err)
554 atomic_inc(&active_events); 697 atomic_inc(&active_events);
@@ -576,15 +719,15 @@ static void x86_pmu_disable_all(void)
576 719
577 if (!test_bit(idx, cpuc->active_mask)) 720 if (!test_bit(idx, cpuc->active_mask))
578 continue; 721 continue;
579 rdmsrl(x86_pmu.eventsel + idx, val); 722 rdmsrl(x86_pmu_config_addr(idx), val);
580 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE)) 723 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
581 continue; 724 continue;
582 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE; 725 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
583 wrmsrl(x86_pmu.eventsel + idx, val); 726 wrmsrl(x86_pmu_config_addr(idx), val);
584 } 727 }
585} 728}
586 729
587void hw_perf_disable(void) 730static void x86_pmu_disable(struct pmu *pmu)
588{ 731{
589 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 732 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
590 733
@@ -601,25 +744,30 @@ void hw_perf_disable(void)
601 x86_pmu.disable_all(); 744 x86_pmu.disable_all();
602} 745}
603 746
747static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
748 u64 enable_mask)
749{
750 if (hwc->extra_reg)
751 wrmsrl(hwc->extra_reg, hwc->extra_config);
752 wrmsrl(hwc->config_base, hwc->config | enable_mask);
753}
754
604static void x86_pmu_enable_all(int added) 755static void x86_pmu_enable_all(int added)
605{ 756{
606 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 757 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
607 int idx; 758 int idx;
608 759
609 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 760 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
610 struct perf_event *event = cpuc->events[idx]; 761 struct hw_perf_event *hwc = &cpuc->events[idx]->hw;
611 u64 val;
612 762
613 if (!test_bit(idx, cpuc->active_mask)) 763 if (!test_bit(idx, cpuc->active_mask))
614 continue; 764 continue;
615 765
616 val = event->hw.config; 766 __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE);
617 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
618 wrmsrl(x86_pmu.eventsel + idx, val);
619 } 767 }
620} 768}
621 769
622static const struct pmu pmu; 770static struct pmu pmu;
623 771
624static inline int is_x86_event(struct perf_event *event) 772static inline int is_x86_event(struct perf_event *event)
625{ 773{
@@ -780,15 +928,10 @@ static inline void x86_assign_hw_event(struct perf_event *event,
780 hwc->event_base = 0; 928 hwc->event_base = 0;
781 } else if (hwc->idx >= X86_PMC_IDX_FIXED) { 929 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
782 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; 930 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
783 /* 931 hwc->event_base = MSR_ARCH_PERFMON_FIXED_CTR0 + (hwc->idx - X86_PMC_IDX_FIXED);
784 * We set it so that event_base + idx in wrmsr/rdmsr maps to
785 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
786 */
787 hwc->event_base =
788 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
789 } else { 932 } else {
790 hwc->config_base = x86_pmu.eventsel; 933 hwc->config_base = x86_pmu_config_addr(hwc->idx);
791 hwc->event_base = x86_pmu.perfctr; 934 hwc->event_base = x86_pmu_event_addr(hwc->idx);
792 } 935 }
793} 936}
794 937
@@ -801,10 +944,10 @@ static inline int match_prev_assignment(struct hw_perf_event *hwc,
801 hwc->last_tag == cpuc->tags[i]; 944 hwc->last_tag == cpuc->tags[i];
802} 945}
803 946
804static int x86_pmu_start(struct perf_event *event); 947static void x86_pmu_start(struct perf_event *event, int flags);
805static void x86_pmu_stop(struct perf_event *event); 948static void x86_pmu_stop(struct perf_event *event, int flags);
806 949
807void hw_perf_enable(void) 950static void x86_pmu_enable(struct pmu *pmu)
808{ 951{
809 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 952 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
810 struct perf_event *event; 953 struct perf_event *event;
@@ -840,7 +983,14 @@ void hw_perf_enable(void)
840 match_prev_assignment(hwc, cpuc, i)) 983 match_prev_assignment(hwc, cpuc, i))
841 continue; 984 continue;
842 985
843 x86_pmu_stop(event); 986 /*
987 * Ensure we don't accidentally enable a stopped
988 * counter simply because we rescheduled.
989 */
990 if (hwc->state & PERF_HES_STOPPED)
991 hwc->state |= PERF_HES_ARCH;
992
993 x86_pmu_stop(event, PERF_EF_UPDATE);
844 } 994 }
845 995
846 for (i = 0; i < cpuc->n_events; i++) { 996 for (i = 0; i < cpuc->n_events; i++) {
@@ -852,7 +1002,10 @@ void hw_perf_enable(void)
852 else if (i < n_running) 1002 else if (i < n_running)
853 continue; 1003 continue;
854 1004
855 x86_pmu_start(event); 1005 if (hwc->state & PERF_HES_ARCH)
1006 continue;
1007
1008 x86_pmu_start(event, PERF_EF_RELOAD);
856 } 1009 }
857 cpuc->n_added = 0; 1010 cpuc->n_added = 0;
858 perf_events_lapic_init(); 1011 perf_events_lapic_init();
@@ -864,17 +1017,11 @@ void hw_perf_enable(void)
864 x86_pmu.enable_all(added); 1017 x86_pmu.enable_all(added);
865} 1018}
866 1019
867static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc,
868 u64 enable_mask)
869{
870 wrmsrl(hwc->config_base + hwc->idx, hwc->config | enable_mask);
871}
872
873static inline void x86_pmu_disable_event(struct perf_event *event) 1020static inline void x86_pmu_disable_event(struct perf_event *event)
874{ 1021{
875 struct hw_perf_event *hwc = &event->hw; 1022 struct hw_perf_event *hwc = &event->hw;
876 1023
877 wrmsrl(hwc->config_base + hwc->idx, hwc->config); 1024 wrmsrl(hwc->config_base, hwc->config);
878} 1025}
879 1026
880static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 1027static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -927,7 +1074,7 @@ x86_perf_event_set_period(struct perf_event *event)
927 */ 1074 */
928 local64_set(&hwc->prev_count, (u64)-left); 1075 local64_set(&hwc->prev_count, (u64)-left);
929 1076
930 wrmsrl(hwc->event_base + idx, (u64)(-left) & x86_pmu.cntval_mask); 1077 wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask);
931 1078
932 /* 1079 /*
 933	 * Due to an erratum on certain CPUs we need		1080	 * Due to an erratum on certain CPUs we need
@@ -935,7 +1082,7 @@ x86_perf_event_set_period(struct perf_event *event)
935 * is updated properly 1082 * is updated properly
936 */ 1083 */
937 if (x86_pmu.perfctr_second_write) { 1084 if (x86_pmu.perfctr_second_write) {
938 wrmsrl(hwc->event_base + idx, 1085 wrmsrl(hwc->event_base,
939 (u64)(-left) & x86_pmu.cntval_mask); 1086 (u64)(-left) & x86_pmu.cntval_mask);
940 } 1087 }
941 1088
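
For reference, the value written above is the two's complement of the remaining period masked to the counter width, so the counter counts up and overflows after exactly 'left' events. A short sketch of that arithmetic under an assumed 48-bit counter:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t cntval_mask = (1ULL << 48) - 1;	/* assumed 48-bit counter */
	int64_t left = 100000;				/* events until the next sample */

	/* value programmed into the counter MSR */
	uint64_t programmed = (uint64_t)(-left) & cntval_mask;

	printf("programmed         = %#llx\n", (unsigned long long)programmed);
	printf("events to overflow = %llu\n",
	       (unsigned long long)(cntval_mask + 1 - programmed));	/* 100000 */
	return 0;
}
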
@@ -946,22 +1093,18 @@ x86_perf_event_set_period(struct perf_event *event)
946 1093
947static void x86_pmu_enable_event(struct perf_event *event) 1094static void x86_pmu_enable_event(struct perf_event *event)
948{ 1095{
949 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1096 if (__this_cpu_read(cpu_hw_events.enabled))
950 if (cpuc->enabled)
951 __x86_pmu_enable_event(&event->hw, 1097 __x86_pmu_enable_event(&event->hw,
952 ARCH_PERFMON_EVENTSEL_ENABLE); 1098 ARCH_PERFMON_EVENTSEL_ENABLE);
953} 1099}
954 1100
955/* 1101/*
956 * activate a single event 1102 * Add a single event to the PMU.
957 * 1103 *
958 * The event is added to the group of enabled events 1104 * The event is added to the group of enabled events
 959	 * but only if it can be scheduled with existing events.		1105	 * but only if it can be scheduled with existing events.
960 *
961 * Called with PMU disabled. If successful and return value 1,
962 * then guaranteed to call perf_enable() and hw_perf_enable()
963 */ 1106 */
964static int x86_pmu_enable(struct perf_event *event) 1107static int x86_pmu_add(struct perf_event *event, int flags)
965{ 1108{
966 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1109 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
967 struct hw_perf_event *hwc; 1110 struct hw_perf_event *hwc;
@@ -970,58 +1113,67 @@ static int x86_pmu_enable(struct perf_event *event)
970 1113
971 hwc = &event->hw; 1114 hwc = &event->hw;
972 1115
1116 perf_pmu_disable(event->pmu);
973 n0 = cpuc->n_events; 1117 n0 = cpuc->n_events;
974 n = collect_events(cpuc, event, false); 1118 ret = n = collect_events(cpuc, event, false);
975 if (n < 0) 1119 if (ret < 0)
976 return n; 1120 goto out;
1121
1122 hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;
1123 if (!(flags & PERF_EF_START))
1124 hwc->state |= PERF_HES_ARCH;
977 1125
978 /* 1126 /*
979 * If group events scheduling transaction was started, 1127 * If group events scheduling transaction was started,
980 * skip the schedulability test here, it will be peformed 1128 * skip the schedulability test here, it will be performed
981 * at commit time(->commit_txn) as a whole 1129 * at commit time (->commit_txn) as a whole
982 */ 1130 */
983 if (cpuc->group_flag & PERF_EVENT_TXN) 1131 if (cpuc->group_flag & PERF_EVENT_TXN)
984 goto out; 1132 goto done_collect;
985 1133
986 ret = x86_pmu.schedule_events(cpuc, n, assign); 1134 ret = x86_pmu.schedule_events(cpuc, n, assign);
987 if (ret) 1135 if (ret)
988 return ret; 1136 goto out;
989 /* 1137 /*
990 * copy new assignment, now we know it is possible 1138 * copy new assignment, now we know it is possible
991 * will be used by hw_perf_enable() 1139 * will be used by hw_perf_enable()
992 */ 1140 */
993 memcpy(cpuc->assign, assign, n*sizeof(int)); 1141 memcpy(cpuc->assign, assign, n*sizeof(int));
994 1142
995out: 1143done_collect:
996 cpuc->n_events = n; 1144 cpuc->n_events = n;
997 cpuc->n_added += n - n0; 1145 cpuc->n_added += n - n0;
998 cpuc->n_txn += n - n0; 1146 cpuc->n_txn += n - n0;
999 1147
1000 return 0; 1148 ret = 0;
1149out:
1150 perf_pmu_enable(event->pmu);
1151 return ret;
1001} 1152}
1002 1153
1003static int x86_pmu_start(struct perf_event *event) 1154static void x86_pmu_start(struct perf_event *event, int flags)
1004{ 1155{
1005 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1156 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1006 int idx = event->hw.idx; 1157 int idx = event->hw.idx;
1007 1158
1008 if (idx == -1) 1159 if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
1009 return -EAGAIN; 1160 return;
1161
1162 if (WARN_ON_ONCE(idx == -1))
1163 return;
1164
1165 if (flags & PERF_EF_RELOAD) {
1166 WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE));
1167 x86_perf_event_set_period(event);
1168 }
1169
1170 event->hw.state = 0;
1010 1171
1011 x86_perf_event_set_period(event);
1012 cpuc->events[idx] = event; 1172 cpuc->events[idx] = event;
1013 __set_bit(idx, cpuc->active_mask); 1173 __set_bit(idx, cpuc->active_mask);
1014 __set_bit(idx, cpuc->running); 1174 __set_bit(idx, cpuc->running);
1015 x86_pmu.enable(event); 1175 x86_pmu.enable(event);
1016 perf_event_update_userpage(event); 1176 perf_event_update_userpage(event);
1017
1018 return 0;
1019}
1020
1021static void x86_pmu_unthrottle(struct perf_event *event)
1022{
1023 int ret = x86_pmu_start(event);
1024 WARN_ON_ONCE(ret);
1025} 1177}
1026 1178
1027void perf_event_print_debug(void) 1179void perf_event_print_debug(void)
@@ -1057,8 +1209,8 @@ void perf_event_print_debug(void)
1057 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask); 1209 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1058 1210
1059 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1211 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1060 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1212 rdmsrl(x86_pmu_config_addr(idx), pmc_ctrl);
1061 rdmsrl(x86_pmu.perfctr + idx, pmc_count); 1213 rdmsrl(x86_pmu_event_addr(idx), pmc_count);
1062 1214
1063 prev_left = per_cpu(pmc_prev_left[idx], cpu); 1215 prev_left = per_cpu(pmc_prev_left[idx], cpu);
1064 1216
@@ -1078,27 +1230,29 @@ void perf_event_print_debug(void)
1078 local_irq_restore(flags); 1230 local_irq_restore(flags);
1079} 1231}
1080 1232
1081static void x86_pmu_stop(struct perf_event *event) 1233static void x86_pmu_stop(struct perf_event *event, int flags)
1082{ 1234{
1083 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1235 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1084 struct hw_perf_event *hwc = &event->hw; 1236 struct hw_perf_event *hwc = &event->hw;
1085 int idx = hwc->idx;
1086
1087 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1088 return;
1089 1237
1090 x86_pmu.disable(event); 1238 if (__test_and_clear_bit(hwc->idx, cpuc->active_mask)) {
1091 1239 x86_pmu.disable(event);
1092 /* 1240 cpuc->events[hwc->idx] = NULL;
1093 * Drain the remaining delta count out of a event 1241 WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
1094 * that we are disabling: 1242 hwc->state |= PERF_HES_STOPPED;
1095 */ 1243 }
1096 x86_perf_event_update(event);
1097 1244
1098 cpuc->events[idx] = NULL; 1245 if ((flags & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
1246 /*
1247 * Drain the remaining delta count out of a event
1248 * that we are disabling:
1249 */
1250 x86_perf_event_update(event);
1251 hwc->state |= PERF_HES_UPTODATE;
1252 }
1099} 1253}
1100 1254
1101static void x86_pmu_disable(struct perf_event *event) 1255static void x86_pmu_del(struct perf_event *event, int flags)
1102{ 1256{
1103 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1257 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1104 int i; 1258 int i;
@@ -1111,7 +1265,7 @@ static void x86_pmu_disable(struct perf_event *event)
1111 if (cpuc->group_flag & PERF_EVENT_TXN) 1265 if (cpuc->group_flag & PERF_EVENT_TXN)
1112 return; 1266 return;
1113 1267
1114 x86_pmu_stop(event); 1268 x86_pmu_stop(event, PERF_EF_UPDATE);
1115 1269
1116 for (i = 0; i < cpuc->n_events; i++) { 1270 for (i = 0; i < cpuc->n_events; i++) {
1117 if (event == cpuc->event_list[i]) { 1271 if (event == cpuc->event_list[i]) {
@@ -1134,7 +1288,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1134 struct perf_sample_data data; 1288 struct perf_sample_data data;
1135 struct cpu_hw_events *cpuc; 1289 struct cpu_hw_events *cpuc;
1136 struct perf_event *event; 1290 struct perf_event *event;
1137 struct hw_perf_event *hwc;
1138 int idx, handled = 0; 1291 int idx, handled = 0;
1139 u64 val; 1292 u64 val;
1140 1293
@@ -1142,6 +1295,16 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1142 1295
1143 cpuc = &__get_cpu_var(cpu_hw_events); 1296 cpuc = &__get_cpu_var(cpu_hw_events);
1144 1297
1298 /*
1299 * Some chipsets need to unmask the LVTPC in a particular spot
1300 * inside the nmi handler. As a result, the unmasking was pushed
1301 * into all the nmi handlers.
1302 *
1303 * This generic handler doesn't seem to have any issues where the
1304 * unmasking occurs so it was left at the top.
1305 */
1306 apic_write(APIC_LVTPC, APIC_DM_NMI);
1307
1145 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 1308 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
1146 if (!test_bit(idx, cpuc->active_mask)) { 1309 if (!test_bit(idx, cpuc->active_mask)) {
1147 /* 1310 /*
@@ -1155,7 +1318,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1155 } 1318 }
1156 1319
1157 event = cpuc->events[idx]; 1320 event = cpuc->events[idx];
1158 hwc = &event->hw;
1159 1321
1160 val = x86_perf_event_update(event); 1322 val = x86_perf_event_update(event);
1161 if (val & (1ULL << (x86_pmu.cntval_bits - 1))) 1323 if (val & (1ULL << (x86_pmu.cntval_bits - 1)))
@@ -1171,7 +1333,7 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1171 continue; 1333 continue;
1172 1334
1173 if (perf_event_overflow(event, 1, &data, regs)) 1335 if (perf_event_overflow(event, 1, &data, regs))
1174 x86_pmu_stop(event); 1336 x86_pmu_stop(event, 0);
1175 } 1337 }
1176 1338
1177 if (handled) 1339 if (handled)
@@ -1180,25 +1342,6 @@ static int x86_pmu_handle_irq(struct pt_regs *regs)
1180 return handled; 1342 return handled;
1181} 1343}
1182 1344
1183void smp_perf_pending_interrupt(struct pt_regs *regs)
1184{
1185 irq_enter();
1186 ack_APIC_irq();
1187 inc_irq_stat(apic_pending_irqs);
1188 perf_event_do_pending();
1189 irq_exit();
1190}
1191
1192void set_perf_event_pending(void)
1193{
1194#ifdef CONFIG_X86_LOCAL_APIC
1195 if (!x86_pmu.apic || !x86_pmu_initialized())
1196 return;
1197
1198 apic->send_IPI_self(LOCAL_PENDING_VECTOR);
1199#endif
1200}
1201
1202void perf_events_lapic_init(void) 1345void perf_events_lapic_init(void)
1203{ 1346{
1204 if (!x86_pmu.apic || !x86_pmu_initialized()) 1347 if (!x86_pmu.apic || !x86_pmu_initialized())
@@ -1230,11 +1373,10 @@ perf_event_nmi_handler(struct notifier_block *self,
1230 1373
1231 switch (cmd) { 1374 switch (cmd) {
1232 case DIE_NMI: 1375 case DIE_NMI:
1233 case DIE_NMI_IPI:
1234 break; 1376 break;
1235 case DIE_NMIUNKNOWN: 1377 case DIE_NMIUNKNOWN:
1236 this_nmi = percpu_read(irq_stat.__nmi_count); 1378 this_nmi = percpu_read(irq_stat.__nmi_count);
1237 if (this_nmi != __get_cpu_var(pmu_nmi).marked) 1379 if (this_nmi != __this_cpu_read(pmu_nmi.marked))
1238 /* let the kernel handle the unknown nmi */ 1380 /* let the kernel handle the unknown nmi */
1239 return NOTIFY_DONE; 1381 return NOTIFY_DONE;
1240 /* 1382 /*
@@ -1249,8 +1391,6 @@ perf_event_nmi_handler(struct notifier_block *self,
1249 return NOTIFY_DONE; 1391 return NOTIFY_DONE;
1250 } 1392 }
1251 1393
1252 apic_write(APIC_LVTPC, APIC_DM_NMI);
1253
1254 handled = x86_pmu.handle_irq(args->regs); 1394 handled = x86_pmu.handle_irq(args->regs);
1255 if (!handled) 1395 if (!handled)
1256 return NOTIFY_DONE; 1396 return NOTIFY_DONE;
@@ -1258,8 +1398,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1258 this_nmi = percpu_read(irq_stat.__nmi_count); 1398 this_nmi = percpu_read(irq_stat.__nmi_count);
1259 if ((handled > 1) || 1399 if ((handled > 1) ||
1260 /* the next nmi could be a back-to-back nmi */ 1400 /* the next nmi could be a back-to-back nmi */
1261 ((__get_cpu_var(pmu_nmi).marked == this_nmi) && 1401 ((__this_cpu_read(pmu_nmi.marked) == this_nmi) &&
1262 (__get_cpu_var(pmu_nmi).handled > 1))) { 1402 (__this_cpu_read(pmu_nmi.handled) > 1))) {
1263 /* 1403 /*
1264 * We could have two subsequent back-to-back nmis: The 1404 * We could have two subsequent back-to-back nmis: The
1265 * first handles more than one counter, the 2nd 1405 * first handles more than one counter, the 2nd
@@ -1270,8 +1410,8 @@ perf_event_nmi_handler(struct notifier_block *self,
1270 * handling more than one counter. We will mark the 1410 * handling more than one counter. We will mark the
1271 * next (3rd) and then drop it if unhandled. 1411 * next (3rd) and then drop it if unhandled.
1272 */ 1412 */
1273 __get_cpu_var(pmu_nmi).marked = this_nmi + 1; 1413 __this_cpu_write(pmu_nmi.marked, this_nmi + 1);
1274 __get_cpu_var(pmu_nmi).handled = handled; 1414 __this_cpu_write(pmu_nmi.handled, handled);
1275 } 1415 }
1276 1416
1277 return NOTIFY_STOP; 1417 return NOTIFY_STOP;
@@ -1280,7 +1420,7 @@ perf_event_nmi_handler(struct notifier_block *self,
1280static __read_mostly struct notifier_block perf_event_nmi_notifier = { 1420static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1281 .notifier_call = perf_event_nmi_handler, 1421 .notifier_call = perf_event_nmi_handler,
1282 .next = NULL, 1422 .next = NULL,
1283 .priority = 1 1423 .priority = NMI_LOCAL_LOW_PRIOR,
1284}; 1424};
1285 1425
1286static struct event_constraint unconstrained; 1426static struct event_constraint unconstrained;
@@ -1353,7 +1493,7 @@ static void __init pmu_check_apic(void)
1353 pr_info("no hardware sampling interrupt available.\n"); 1493 pr_info("no hardware sampling interrupt available.\n");
1354} 1494}
1355 1495
1356void __init init_hw_perf_events(void) 1496static int __init init_hw_perf_events(void)
1357{ 1497{
1358 struct event_constraint *c; 1498 struct event_constraint *c;
1359 int err; 1499 int err;
@@ -1368,15 +1508,19 @@ void __init init_hw_perf_events(void)
1368 err = amd_pmu_init(); 1508 err = amd_pmu_init();
1369 break; 1509 break;
1370 default: 1510 default:
1371 return; 1511 return 0;
1372 } 1512 }
1373 if (err != 0) { 1513 if (err != 0) {
1374 pr_cont("no PMU driver, software events only.\n"); 1514 pr_cont("no PMU driver, software events only.\n");
1375 return; 1515 return 0;
1376 } 1516 }
1377 1517
1378 pmu_check_apic(); 1518 pmu_check_apic();
1379 1519
1520 /* sanity check that the hardware exists or is emulated */
1521 if (!check_hw_exists())
1522 return 0;
1523
1380 pr_cont("%s PMU driver.\n", x86_pmu.name); 1524 pr_cont("%s PMU driver.\n", x86_pmu.name);
1381 1525
1382 if (x86_pmu.quirks) 1526 if (x86_pmu.quirks)
@@ -1388,7 +1532,6 @@ void __init init_hw_perf_events(void)
1388 x86_pmu.num_counters = X86_PMC_MAX_GENERIC; 1532 x86_pmu.num_counters = X86_PMC_MAX_GENERIC;
1389 } 1533 }
1390 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1; 1534 x86_pmu.intel_ctrl = (1 << x86_pmu.num_counters) - 1;
1391 perf_max_events = x86_pmu.num_counters;
1392 1535
1393 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { 1536 if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) {
1394 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!", 1537 WARN(1, KERN_ERR "hw perf events fixed %d > max(%d), clipping!",
@@ -1424,8 +1567,12 @@ void __init init_hw_perf_events(void)
1424 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed); 1567 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_counters_fixed);
1425 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl); 1568 pr_info("... event mask: %016Lx\n", x86_pmu.intel_ctrl);
1426 1569
1570 perf_pmu_register(&pmu, "cpu", PERF_TYPE_RAW);
1427 perf_cpu_notifier(x86_pmu_notifier); 1571 perf_cpu_notifier(x86_pmu_notifier);
1572
1573 return 0;
1428} 1574}
1575early_initcall(init_hw_perf_events);
1429 1576
1430static inline void x86_pmu_read(struct perf_event *event) 1577static inline void x86_pmu_read(struct perf_event *event)
1431{ 1578{
@@ -1437,12 +1584,11 @@ static inline void x86_pmu_read(struct perf_event *event)
1437 * Set the flag to make pmu::enable() not perform the 1584 * Set the flag to make pmu::enable() not perform the
1438 * schedulability test, it will be performed at commit time 1585 * schedulability test, it will be performed at commit time
1439 */ 1586 */
1440static void x86_pmu_start_txn(const struct pmu *pmu) 1587static void x86_pmu_start_txn(struct pmu *pmu)
1441{ 1588{
1442 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1589 perf_pmu_disable(pmu);
1443 1590 __this_cpu_or(cpu_hw_events.group_flag, PERF_EVENT_TXN);
1444 cpuc->group_flag |= PERF_EVENT_TXN; 1591 __this_cpu_write(cpu_hw_events.n_txn, 0);
1445 cpuc->n_txn = 0;
1446} 1592}
1447 1593
1448/* 1594/*
@@ -1450,16 +1596,15 @@ static void x86_pmu_start_txn(const struct pmu *pmu)
1450 * Clear the flag and pmu::enable() will perform the 1596 * Clear the flag and pmu::enable() will perform the
1451 * schedulability test. 1597 * schedulability test.
1452 */ 1598 */
1453static void x86_pmu_cancel_txn(const struct pmu *pmu) 1599static void x86_pmu_cancel_txn(struct pmu *pmu)
1454{ 1600{
1455 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1601 __this_cpu_and(cpu_hw_events.group_flag, ~PERF_EVENT_TXN);
1456
1457 cpuc->group_flag &= ~PERF_EVENT_TXN;
1458 /* 1602 /*
1459 * Truncate the collected events. 1603 * Truncate the collected events.
1460 */ 1604 */
1461 cpuc->n_added -= cpuc->n_txn; 1605 __this_cpu_sub(cpu_hw_events.n_added, __this_cpu_read(cpu_hw_events.n_txn));
1462 cpuc->n_events -= cpuc->n_txn; 1606 __this_cpu_sub(cpu_hw_events.n_events, __this_cpu_read(cpu_hw_events.n_txn));
1607 perf_pmu_enable(pmu);
1463} 1608}
1464 1609
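
The transaction callbacks exist so a whole event group can be collected first and schedulability-tested once at commit time; with the state being strictly CPU-local, the accessors switch to __this_cpu_*(). A compact sketch of the start/cancel/commit pattern itself (the cpu_hw_events layout here is a simplified assumption):

#include <stdio.h>
#include <string.h>

#define MAX_EVENTS 8

struct cpu_events {
	int n_events, n_added, n_txn;
	int in_txn;
	int list[MAX_EVENTS];
};

static void start_txn(struct cpu_events *c)
{
	c->in_txn = 1;
	c->n_txn = 0;
}

static int add_event(struct cpu_events *c, int ev)
{
	if (c->n_events >= MAX_EVENTS)
		return -1;
	c->list[c->n_events++] = ev;
	c->n_added++;
	c->n_txn++;
	/* outside a transaction the schedulability test would run here;
	 * inside one it is deferred to commit_txn() */
	return 0;
}

static void cancel_txn(struct cpu_events *c)
{
	c->n_added  -= c->n_txn;	/* truncate what the txn collected */
	c->n_events -= c->n_txn;
	c->in_txn = 0;
}

static int commit_txn(struct cpu_events *c)
{
	/* one schedulability test over all c->n_events would go here */
	c->in_txn = 0;
	return 0;
}

int main(void)
{
	struct cpu_events c;

	memset(&c, 0, sizeof(c));

	start_txn(&c);
	add_event(&c, 1);
	add_event(&c, 2);
	commit_txn(&c);
	printf("after commit: n_events=%d\n", c.n_events);	/* 2 */

	start_txn(&c);
	add_event(&c, 3);
	cancel_txn(&c);
	printf("after cancel: n_events=%d\n", c.n_events);	/* 2 */
	return 0;
}
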
1465/* 1610/*
@@ -1467,7 +1612,7 @@ static void x86_pmu_cancel_txn(const struct pmu *pmu)
1467 * Perform the group schedulability test as a whole 1612 * Perform the group schedulability test as a whole
1468 * Return 0 if success 1613 * Return 0 if success
1469 */ 1614 */
1470static int x86_pmu_commit_txn(const struct pmu *pmu) 1615static int x86_pmu_commit_txn(struct pmu *pmu)
1471{ 1616{
1472 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1617 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1473 int assign[X86_PMC_IDX_MAX]; 1618 int assign[X86_PMC_IDX_MAX];
@@ -1489,22 +1634,10 @@ static int x86_pmu_commit_txn(const struct pmu *pmu)
1489 memcpy(cpuc->assign, assign, n*sizeof(int)); 1634 memcpy(cpuc->assign, assign, n*sizeof(int));
1490 1635
1491 cpuc->group_flag &= ~PERF_EVENT_TXN; 1636 cpuc->group_flag &= ~PERF_EVENT_TXN;
1492 1637 perf_pmu_enable(pmu);
1493 return 0; 1638 return 0;
1494} 1639}
1495 1640
1496static const struct pmu pmu = {
1497 .enable = x86_pmu_enable,
1498 .disable = x86_pmu_disable,
1499 .start = x86_pmu_start,
1500 .stop = x86_pmu_stop,
1501 .read = x86_pmu_read,
1502 .unthrottle = x86_pmu_unthrottle,
1503 .start_txn = x86_pmu_start_txn,
1504 .cancel_txn = x86_pmu_cancel_txn,
1505 .commit_txn = x86_pmu_commit_txn,
1506};
1507
1508/* 1641/*
1509 * validate that we can schedule this event 1642 * validate that we can schedule this event
1510 */ 1643 */
@@ -1579,12 +1712,22 @@ out:
1579 return ret; 1712 return ret;
1580} 1713}
1581 1714
1582const struct pmu *hw_perf_event_init(struct perf_event *event) 1715static int x86_pmu_event_init(struct perf_event *event)
1583{ 1716{
1584 const struct pmu *tmp; 1717 struct pmu *tmp;
1585 int err; 1718 int err;
1586 1719
1587 err = __hw_perf_event_init(event); 1720 switch (event->attr.type) {
1721 case PERF_TYPE_RAW:
1722 case PERF_TYPE_HARDWARE:
1723 case PERF_TYPE_HW_CACHE:
1724 break;
1725
1726 default:
1727 return -ENOENT;
1728 }
1729
1730 err = __x86_pmu_event_init(event);
1588 if (!err) { 1731 if (!err) {
1589 /* 1732 /*
1590 * we temporarily connect event to its pmu 1733 * we temporarily connect event to its pmu
@@ -1604,37 +1747,31 @@ const struct pmu *hw_perf_event_init(struct perf_event *event)
1604 if (err) { 1747 if (err) {
1605 if (event->destroy) 1748 if (event->destroy)
1606 event->destroy(event); 1749 event->destroy(event);
1607 return ERR_PTR(err);
1608 } 1750 }
1609 1751
1610 return &pmu; 1752 return err;
1611} 1753}
1612 1754
1613/* 1755static struct pmu pmu = {
1614 * callchain support 1756 .pmu_enable = x86_pmu_enable,
1615 */ 1757 .pmu_disable = x86_pmu_disable,
1616
1617static inline
1618void callchain_store(struct perf_callchain_entry *entry, u64 ip)
1619{
1620 if (entry->nr < PERF_MAX_STACK_DEPTH)
1621 entry->ip[entry->nr++] = ip;
1622}
1623 1758
1624static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1759 .event_init = x86_pmu_event_init,
1625static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
1626 1760
1761 .add = x86_pmu_add,
1762 .del = x86_pmu_del,
1763 .start = x86_pmu_start,
1764 .stop = x86_pmu_stop,
1765 .read = x86_pmu_read,
1627 1766
1628static void 1767 .start_txn = x86_pmu_start_txn,
1629backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) 1768 .cancel_txn = x86_pmu_cancel_txn,
1630{ 1769 .commit_txn = x86_pmu_commit_txn,
1631 /* Ignore warnings */ 1770};
1632}
1633 1771
1634static void backtrace_warning(void *data, char *msg) 1772/*
1635{ 1773 * callchain support
1636 /* Ignore warnings */ 1774 */
1637}
1638 1775
1639static int backtrace_stack(void *data, char *name) 1776static int backtrace_stack(void *data, char *name)
1640{ 1777{
@@ -1645,24 +1782,26 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
1645{ 1782{
1646 struct perf_callchain_entry *entry = data; 1783 struct perf_callchain_entry *entry = data;
1647 1784
1648 callchain_store(entry, addr); 1785 perf_callchain_store(entry, addr);
1649} 1786}
1650 1787
1651static const struct stacktrace_ops backtrace_ops = { 1788static const struct stacktrace_ops backtrace_ops = {
1652 .warning = backtrace_warning,
1653 .warning_symbol = backtrace_warning_symbol,
1654 .stack = backtrace_stack, 1789 .stack = backtrace_stack,
1655 .address = backtrace_address, 1790 .address = backtrace_address,
1656 .walk_stack = print_context_stack_bp, 1791 .walk_stack = print_context_stack_bp,
1657}; 1792};
1658 1793
1659static void 1794void
1660perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) 1795perf_callchain_kernel(struct perf_callchain_entry *entry, struct pt_regs *regs)
1661{ 1796{
1662 callchain_store(entry, PERF_CONTEXT_KERNEL); 1797 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1663 callchain_store(entry, regs->ip); 1798 /* TODO: We don't support guest os callchain now */
1799 return;
1800 }
1801
1802 perf_callchain_store(entry, regs->ip);
1664 1803
1665 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry); 1804 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry);
1666} 1805}
1667 1806
1668#ifdef CONFIG_COMPAT 1807#ifdef CONFIG_COMPAT
@@ -1689,7 +1828,7 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1689 if (fp < compat_ptr(regs->sp)) 1828 if (fp < compat_ptr(regs->sp))
1690 break; 1829 break;
1691 1830
1692 callchain_store(entry, frame.return_address); 1831 perf_callchain_store(entry, frame.return_address);
1693 fp = compat_ptr(frame.next_frame); 1832 fp = compat_ptr(frame.next_frame);
1694 } 1833 }
1695 return 1; 1834 return 1;
@@ -1702,19 +1841,20 @@ perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1702} 1841}
1703#endif 1842#endif
1704 1843
1705static void 1844void
1706perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1845perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
1707{ 1846{
1708 struct stack_frame frame; 1847 struct stack_frame frame;
1709 const void __user *fp; 1848 const void __user *fp;
1710 1849
1711 if (!user_mode(regs)) 1850 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1712 regs = task_pt_regs(current); 1851 /* TODO: We don't support guest os callchain now */
1852 return;
1853 }
1713 1854
1714 fp = (void __user *)regs->bp; 1855 fp = (void __user *)regs->bp;
1715 1856
1716 callchain_store(entry, PERF_CONTEXT_USER); 1857 perf_callchain_store(entry, regs->ip);
1717 callchain_store(entry, regs->ip);
1718 1858
1719 if (perf_callchain_user32(regs, entry)) 1859 if (perf_callchain_user32(regs, entry))
1720 return; 1860 return;
@@ -1731,52 +1871,11 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
1731 if ((unsigned long)fp < regs->sp) 1871 if ((unsigned long)fp < regs->sp)
1732 break; 1872 break;
1733 1873
1734 callchain_store(entry, frame.return_address); 1874 perf_callchain_store(entry, frame.return_address);
1735 fp = frame.next_frame; 1875 fp = frame.next_frame;
1736 } 1876 }
1737} 1877}
1738 1878
1739static void
1740perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
1741{
1742 int is_user;
1743
1744 if (!regs)
1745 return;
1746
1747 is_user = user_mode(regs);
1748
1749 if (is_user && current->state != TASK_RUNNING)
1750 return;
1751
1752 if (!is_user)
1753 perf_callchain_kernel(regs, entry);
1754
1755 if (current->mm)
1756 perf_callchain_user(regs, entry);
1757}
1758
1759struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
1760{
1761 struct perf_callchain_entry *entry;
1762
1763 if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
1764 /* TODO: We don't support guest os callchain now */
1765 return NULL;
1766 }
1767
1768 if (in_nmi())
1769 entry = &__get_cpu_var(pmc_nmi_entry);
1770 else
1771 entry = &__get_cpu_var(pmc_irq_entry);
1772
1773 entry->nr = 0;
1774
1775 perf_do_callchain(regs, entry);
1776
1777 return entry;
1778}
1779
1780unsigned long perf_instruction_pointer(struct pt_regs *regs) 1879unsigned long perf_instruction_pointer(struct pt_regs *regs)
1781{ 1880{
1782 unsigned long ip; 1881 unsigned long ip;
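
The user-side callchain above now stores entries through perf_callchain_store() and walks saved frame pointers as struct stack_frame records. A self-contained sketch of that frame-pointer walk over an in-memory fake stack (the real code copies each frame from user memory and also checks that the frame lies above regs->sp):

#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

#define MAX_DEPTH 16

static int walk_callchain(const struct stack_frame *fp,
			  unsigned long *chain, int max)
{
	int nr = 0;

	while (fp && nr < max) {
		chain[nr++] = fp->return_address;
		fp = fp->next_frame;
	}
	return nr;
}

int main(void)
{
	/* three fake frames, innermost first */
	struct stack_frame f3 = { NULL, 0x400300 };
	struct stack_frame f2 = { &f3,  0x400200 };
	struct stack_frame f1 = { &f2,  0x400100 };
	unsigned long chain[MAX_DEPTH];
	int n = walk_callchain(&f1, chain, MAX_DEPTH);

	for (int i = 0; i < n; i++)
		printf("  %#lx\n", chain[i]);
	return 0;
}
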
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
index c2897b7b4a3b..fe29c1d2219e 100644
--- a/arch/x86/kernel/cpu/perf_event_amd.c
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -1,7 +1,5 @@
1#ifdef CONFIG_CPU_SUP_AMD 1#ifdef CONFIG_CPU_SUP_AMD
2 2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst const u64 amd_hw_cache_event_ids 3static __initconst const u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX] 4 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX] 5 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -10,7 +8,7 @@ static __initconst const u64 amd_hw_cache_event_ids
10 [ C(L1D) ] = { 8 [ C(L1D) ] = {
11 [ C(OP_READ) ] = { 9 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 10 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */ 11 [ C(RESULT_MISS) ] = 0x0141, /* Data Cache Misses */
14 }, 12 },
15 [ C(OP_WRITE) ] = { 13 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */ 14 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
@@ -52,7 +50,7 @@ static __initconst const u64 amd_hw_cache_event_ids
52 [ C(DTLB) ] = { 50 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = { 51 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */ 52 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */ 53 [ C(RESULT_MISS) ] = 0x0746, /* L1_DTLB_AND_L2_DLTB_MISS.ALL */
56 }, 54 },
57 [ C(OP_WRITE) ] = { 55 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0, 56 [ C(RESULT_ACCESS) ] = 0,
@@ -66,7 +64,7 @@ static __initconst const u64 amd_hw_cache_event_ids
66 [ C(ITLB) ] = { 64 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = { 65 [ C(OP_READ) ] = {
 68		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */		66		[ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */ 67 [ C(RESULT_MISS) ] = 0x0385, /* L1_ITLB_AND_L2_ITLB_MISS.ALL */
70 }, 68 },
71 [ C(OP_WRITE) ] = { 69 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1, 70 [ C(RESULT_ACCESS) ] = -1,
@@ -98,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids
98 */ 96 */
99static const u64 amd_perfmon_event_map[] = 97static const u64 amd_perfmon_event_map[] =
100{ 98{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, 99 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 100 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, 101 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, 102 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, 103 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, 104 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3,
105 [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */
106 [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */
107}; 107};
108 108
109static u64 amd_pmu_event_map(int hw_event) 109static u64 amd_pmu_event_map(int hw_event)
@@ -129,6 +129,11 @@ static int amd_pmu_hw_config(struct perf_event *event)
129/* 129/*
130 * AMD64 events are detected based on their event codes. 130 * AMD64 events are detected based on their event codes.
131 */ 131 */
132static inline unsigned int amd_get_event_code(struct hw_perf_event *hwc)
133{
134 return ((hwc->config >> 24) & 0x0f00) | (hwc->config & 0x00ff);
135}
136
132static inline int amd_is_nb_event(struct hw_perf_event *hwc) 137static inline int amd_is_nb_event(struct hw_perf_event *hwc)
133{ 138{
134 return (hwc->config & 0xe0) == 0xe0; 139 return (hwc->config & 0xe0) == 0xe0;
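
amd_get_event_code(), added just above, reassembles the 12-bit event select that the config word keeps in two places: bits [7:0] plus the extended select in bits [35:32] (hence the shift by 24 to land them above bit 8). A small standalone check of that bit surgery:

#include <stdint.h>
#include <stdio.h>

/* mirrors the expression in amd_get_event_code() */
static unsigned int amd_event_code(uint64_t config)
{
	return (unsigned int)(((config >> 24) & 0x0f00) | (config & 0x00ff));
}

int main(void)
{
	/* hypothetical config: extended select 0x1, low byte 0xd6,
	 * i.e. event code 0x1D6 from the family 15h table below */
	uint64_t config = (0x1ULL << 32) | 0xd6;

	printf("event code = %#x\n", amd_event_code(config));	/* 0x1d6 */
	return 0;
}
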
@@ -275,17 +280,17 @@ done:
275 return &emptyconstraint; 280 return &emptyconstraint;
276} 281}
277 282
278static struct amd_nb *amd_alloc_nb(int cpu, int nb_id) 283static struct amd_nb *amd_alloc_nb(int cpu)
279{ 284{
280 struct amd_nb *nb; 285 struct amd_nb *nb;
281 int i; 286 int i;
282 287
283 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL); 288 nb = kmalloc_node(sizeof(struct amd_nb), GFP_KERNEL | __GFP_ZERO,
289 cpu_to_node(cpu));
284 if (!nb) 290 if (!nb)
285 return NULL; 291 return NULL;
286 292
287 memset(nb, 0, sizeof(*nb)); 293 nb->nb_id = -1;
288 nb->nb_id = nb_id;
289 294
290 /* 295 /*
291 * initialize all possible NB constraints 296 * initialize all possible NB constraints
@@ -306,7 +311,7 @@ static int amd_pmu_cpu_prepare(int cpu)
306 if (boot_cpu_data.x86_max_cores < 2) 311 if (boot_cpu_data.x86_max_cores < 2)
307 return NOTIFY_OK; 312 return NOTIFY_OK;
308 313
309 cpuc->amd_nb = amd_alloc_nb(cpu, -1); 314 cpuc->amd_nb = amd_alloc_nb(cpu);
310 if (!cpuc->amd_nb) 315 if (!cpuc->amd_nb)
311 return NOTIFY_BAD; 316 return NOTIFY_BAD;
312 317
@@ -325,8 +330,6 @@ static void amd_pmu_cpu_starting(int cpu)
325 nb_id = amd_get_nb_id(cpu); 330 nb_id = amd_get_nb_id(cpu);
326 WARN_ON_ONCE(nb_id == BAD_APICID); 331 WARN_ON_ONCE(nb_id == BAD_APICID);
327 332
328 raw_spin_lock(&amd_nb_lock);
329
330 for_each_online_cpu(i) { 333 for_each_online_cpu(i) {
331 nb = per_cpu(cpu_hw_events, i).amd_nb; 334 nb = per_cpu(cpu_hw_events, i).amd_nb;
332 if (WARN_ON_ONCE(!nb)) 335 if (WARN_ON_ONCE(!nb))
@@ -341,8 +344,6 @@ static void amd_pmu_cpu_starting(int cpu)
341 344
342 cpuc->amd_nb->nb_id = nb_id; 345 cpuc->amd_nb->nb_id = nb_id;
343 cpuc->amd_nb->refcnt++; 346 cpuc->amd_nb->refcnt++;
344
345 raw_spin_unlock(&amd_nb_lock);
346} 347}
347 348
348static void amd_pmu_cpu_dead(int cpu) 349static void amd_pmu_cpu_dead(int cpu)
@@ -354,8 +355,6 @@ static void amd_pmu_cpu_dead(int cpu)
354 355
355 cpuhw = &per_cpu(cpu_hw_events, cpu); 356 cpuhw = &per_cpu(cpu_hw_events, cpu);
356 357
357 raw_spin_lock(&amd_nb_lock);
358
359 if (cpuhw->amd_nb) { 358 if (cpuhw->amd_nb) {
360 struct amd_nb *nb = cpuhw->amd_nb; 359 struct amd_nb *nb = cpuhw->amd_nb;
361 360
@@ -364,8 +363,6 @@ static void amd_pmu_cpu_dead(int cpu)
364 363
365 cpuhw->amd_nb = NULL; 364 cpuhw->amd_nb = NULL;
366 } 365 }
367
368 raw_spin_unlock(&amd_nb_lock);
369} 366}
370 367
371static __initconst const struct x86_pmu amd_pmu = { 368static __initconst const struct x86_pmu amd_pmu = {
@@ -395,13 +392,195 @@ static __initconst const struct x86_pmu amd_pmu = {
395 .cpu_dead = amd_pmu_cpu_dead, 392 .cpu_dead = amd_pmu_cpu_dead,
396}; 393};
397 394
395/* AMD Family 15h */
396
397#define AMD_EVENT_TYPE_MASK 0x000000F0ULL
398
399#define AMD_EVENT_FP 0x00000000ULL ... 0x00000010ULL
400#define AMD_EVENT_LS 0x00000020ULL ... 0x00000030ULL
401#define AMD_EVENT_DC 0x00000040ULL ... 0x00000050ULL
402#define AMD_EVENT_CU 0x00000060ULL ... 0x00000070ULL
403#define AMD_EVENT_IC_DE 0x00000080ULL ... 0x00000090ULL
404#define AMD_EVENT_EX_LS 0x000000C0ULL
405#define AMD_EVENT_DE 0x000000D0ULL
406#define AMD_EVENT_NB 0x000000E0ULL ... 0x000000F0ULL
407
408/*
409 * AMD family 15h event code/PMC mappings:
410 *
411 * type = event_code & 0x0F0:
412 *
413 * 0x000 FP PERF_CTL[5:3]
414 * 0x010 FP PERF_CTL[5:3]
415 * 0x020 LS PERF_CTL[5:0]
416 * 0x030 LS PERF_CTL[5:0]
417 * 0x040 DC PERF_CTL[5:0]
418 * 0x050 DC PERF_CTL[5:0]
419 * 0x060 CU PERF_CTL[2:0]
420 * 0x070 CU PERF_CTL[2:0]
421 * 0x080 IC/DE PERF_CTL[2:0]
422 * 0x090 IC/DE PERF_CTL[2:0]
423 * 0x0A0 ---
424 * 0x0B0 ---
425 * 0x0C0 EX/LS PERF_CTL[5:0]
426 * 0x0D0 DE PERF_CTL[2:0]
427 * 0x0E0 NB NB_PERF_CTL[3:0]
428 * 0x0F0 NB NB_PERF_CTL[3:0]
429 *
430 * Exceptions:
431 *
432 * 0x000 FP PERF_CTL[3], PERF_CTL[5:3] (*)
433 * 0x003 FP PERF_CTL[3]
434 * 0x004 FP PERF_CTL[3], PERF_CTL[5:3] (*)
435 * 0x00B FP PERF_CTL[3]
436 * 0x00D FP PERF_CTL[3]
437 * 0x023 DE PERF_CTL[2:0]
438 * 0x02D LS PERF_CTL[3]
439 * 0x02E LS PERF_CTL[3,0]
440 * 0x043 CU PERF_CTL[2:0]
441 * 0x045 CU PERF_CTL[2:0]
442 * 0x046 CU PERF_CTL[2:0]
443 * 0x054 CU PERF_CTL[2:0]
444 * 0x055 CU PERF_CTL[2:0]
445 * 0x08F IC PERF_CTL[0]
446 * 0x187 DE PERF_CTL[0]
447 * 0x188 DE PERF_CTL[0]
448 * 0x0DB EX PERF_CTL[5:0]
449 * 0x0DC LS PERF_CTL[5:0]
450 * 0x0DD LS PERF_CTL[5:0]
451 * 0x0DE LS PERF_CTL[5:0]
452 * 0x0DF LS PERF_CTL[5:0]
453 * 0x1D6 EX PERF_CTL[5:0]
454 * 0x1D8 EX PERF_CTL[5:0]
455 *
456 * (*) depending on the umask all FPU counters may be used
457 */
458
459static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0);
460static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0);
461static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0);
462static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0);
463static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0);
464static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0);
465
466static struct event_constraint *
467amd_get_event_constraints_f15h(struct cpu_hw_events *cpuc, struct perf_event *event)
468{
469 struct hw_perf_event *hwc = &event->hw;
470 unsigned int event_code = amd_get_event_code(hwc);
471
472 switch (event_code & AMD_EVENT_TYPE_MASK) {
473 case AMD_EVENT_FP:
474 switch (event_code) {
475 case 0x000:
476 if (!(hwc->config & 0x0000F000ULL))
477 break;
478 if (!(hwc->config & 0x00000F00ULL))
479 break;
480 return &amd_f15_PMC3;
481 case 0x004:
482 if (hweight_long(hwc->config & ARCH_PERFMON_EVENTSEL_UMASK) <= 1)
483 break;
484 return &amd_f15_PMC3;
485 case 0x003:
486 case 0x00B:
487 case 0x00D:
488 return &amd_f15_PMC3;
489 }
490 return &amd_f15_PMC53;
491 case AMD_EVENT_LS:
492 case AMD_EVENT_DC:
493 case AMD_EVENT_EX_LS:
494 switch (event_code) {
495 case 0x023:
496 case 0x043:
497 case 0x045:
498 case 0x046:
499 case 0x054:
500 case 0x055:
501 return &amd_f15_PMC20;
502 case 0x02D:
503 return &amd_f15_PMC3;
504 case 0x02E:
505 return &amd_f15_PMC30;
506 default:
507 return &amd_f15_PMC50;
508 }
509 case AMD_EVENT_CU:
510 case AMD_EVENT_IC_DE:
511 case AMD_EVENT_DE:
512 switch (event_code) {
513 case 0x08F:
514 case 0x187:
515 case 0x188:
516 return &amd_f15_PMC0;
517 case 0x0DB ... 0x0DF:
518 case 0x1D6:
519 case 0x1D8:
520 return &amd_f15_PMC50;
521 default:
522 return &amd_f15_PMC20;
523 }
524 case AMD_EVENT_NB:
525 /* not yet implemented */
526 return &emptyconstraint;
527 default:
528 return &emptyconstraint;
529 }
530}
531
532static __initconst const struct x86_pmu amd_pmu_f15h = {
533 .name = "AMD Family 15h",
534 .handle_irq = x86_pmu_handle_irq,
535 .disable_all = x86_pmu_disable_all,
536 .enable_all = x86_pmu_enable_all,
537 .enable = x86_pmu_enable_event,
538 .disable = x86_pmu_disable_event,
539 .hw_config = amd_pmu_hw_config,
540 .schedule_events = x86_schedule_events,
541 .eventsel = MSR_F15H_PERF_CTL,
542 .perfctr = MSR_F15H_PERF_CTR,
543 .event_map = amd_pmu_event_map,
544 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
545 .num_counters = 6,
546 .cntval_bits = 48,
547 .cntval_mask = (1ULL << 48) - 1,
548 .apic = 1,
549 /* use highest bit to detect overflow */
550 .max_period = (1ULL << 47) - 1,
551 .get_event_constraints = amd_get_event_constraints_f15h,
	552	/* northbridge counters not yet implemented: */
553#if 0
554 .put_event_constraints = amd_put_event_constraints,
555
556 .cpu_prepare = amd_pmu_cpu_prepare,
557 .cpu_starting = amd_pmu_cpu_starting,
558 .cpu_dead = amd_pmu_cpu_dead,
559#endif
560};
561
398static __init int amd_pmu_init(void) 562static __init int amd_pmu_init(void)
399{ 563{
400 /* Performance-monitoring supported from K7 and later: */ 564 /* Performance-monitoring supported from K7 and later: */
401 if (boot_cpu_data.x86 < 6) 565 if (boot_cpu_data.x86 < 6)
402 return -ENODEV; 566 return -ENODEV;
403 567
404 x86_pmu = amd_pmu; 568 /*
	569	 * If core performance counter extensions exist, it must be
570 * family 15h, otherwise fail. See x86_pmu_addr_offset().
571 */
572 switch (boot_cpu_data.x86) {
573 case 0x15:
574 if (!cpu_has_perfctr_core)
575 return -ENODEV;
576 x86_pmu = amd_pmu_f15h;
577 break;
578 default:
579 if (cpu_has_perfctr_core)
580 return -ENODEV;
581 x86_pmu = amd_pmu;
582 break;
583 }
405 584
406 /* Events are common for all AMDs */ 585 /* Events are common for all AMDs */
407 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids, 586 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
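
Each amd_f15_PMC* constraint above is just a bitmask of which of the six PERF_CTL counters an event may use (0x3F = any, 0x09 = counters 0 and 3, and so on). A sketch of how a scheduler might pick a free counter from such a mask (the kernel's real scheduler in x86_schedule_events() is more involved):

#include <stdio.h>

/* return the lowest allowed counter not already in use, or -1 */
static int pick_counter(unsigned int constraint_mask, unsigned int used_mask)
{
	unsigned int free = constraint_mask & ~used_mask;

	for (int i = 0; i < 6; i++)
		if (free & (1u << i))
			return i;
	return -1;
}

int main(void)
{
	unsigned int used = 0;
	unsigned int pmc30 = 0x09;	/* amd_f15_PMC30: counters 0 and 3 */
	int first, second;

	first = pick_counter(pmc30, used);
	if (first >= 0)
		used |= 1u << first;
	second = pick_counter(pmc30, used);

	printf("first=%d second=%d\n", first, second);	/* 0 and 3 */
	return 0;
}
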
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
index ee05c90012d2..41178c826c48 100644
--- a/arch/x86/kernel/cpu/perf_event_intel.c
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -1,9 +1,31 @@
1#ifdef CONFIG_CPU_SUP_INTEL 1#ifdef CONFIG_CPU_SUP_INTEL
2 2
3#define MAX_EXTRA_REGS 2
4
5/*
6 * Per register state.
7 */
8struct er_account {
9 int ref; /* reference count */
10 unsigned int extra_reg; /* extra MSR number */
11 u64 extra_config; /* extra MSR config */
12};
13
14/*
15 * Per core state
16 * This used to coordinate shared registers for HT threads.
17 */
18struct intel_percore {
19 raw_spinlock_t lock; /* protect structure */
20 struct er_account regs[MAX_EXTRA_REGS];
21 int refcnt; /* number of threads */
22 unsigned core_id;
23};
24
3/* 25/*
4 * Intel PerfMon, used on Core and later. 26 * Intel PerfMon, used on Core and later.
5 */ 27 */
6static const u64 intel_perfmon_event_map[] = 28static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly =
7{ 29{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c, 30 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, 31 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
@@ -14,7 +36,7 @@ static const u64 intel_perfmon_event_map[] =
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, 36 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15}; 37};
16 38
17static struct event_constraint intel_core_event_constraints[] = 39static struct event_constraint intel_core_event_constraints[] __read_mostly =
18{ 40{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ 41 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ 42 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
@@ -25,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] =
25 EVENT_CONSTRAINT_END 47 EVENT_CONSTRAINT_END
26}; 48};
27 49
28static struct event_constraint intel_core2_event_constraints[] = 50static struct event_constraint intel_core2_event_constraints[] __read_mostly =
29{ 51{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 52 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 53 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -48,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] =
48 EVENT_CONSTRAINT_END 70 EVENT_CONSTRAINT_END
49}; 71};
50 72
51static struct event_constraint intel_nehalem_event_constraints[] = 73static struct event_constraint intel_nehalem_event_constraints[] __read_mostly =
52{ 74{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 75 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 76 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -64,7 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] =
64 EVENT_CONSTRAINT_END 86 EVENT_CONSTRAINT_END
65}; 87};
66 88
67static struct event_constraint intel_westmere_event_constraints[] = 89static struct extra_reg intel_nehalem_extra_regs[] __read_mostly =
90{
91 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
92 EVENT_EXTRA_END
93};
94
95static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly =
96{
97 INTEL_EVENT_CONSTRAINT(0xb7, 0),
98 EVENT_CONSTRAINT_END
99};
100
101static struct event_constraint intel_westmere_event_constraints[] __read_mostly =
68{ 102{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 103 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 104 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -76,7 +110,34 @@ static struct event_constraint intel_westmere_event_constraints[] =
76 EVENT_CONSTRAINT_END 110 EVENT_CONSTRAINT_END
77}; 111};
78 112
79static struct event_constraint intel_gen_event_constraints[] = 113static struct event_constraint intel_snb_event_constraints[] __read_mostly =
114{
115 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
116 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
117 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
118 INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */
119 INTEL_EVENT_CONSTRAINT(0xb7, 0x1), /* OFF_CORE_RESPONSE_0 */
120 INTEL_EVENT_CONSTRAINT(0xbb, 0x8), /* OFF_CORE_RESPONSE_1 */
121 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */
122 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.LOAD_LATENCY */
123 EVENT_CONSTRAINT_END
124};
125
126static struct extra_reg intel_westmere_extra_regs[] __read_mostly =
127{
128 INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff),
129 INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff),
130 EVENT_EXTRA_END
131};
132
133static struct event_constraint intel_westmere_percore_constraints[] __read_mostly =
134{
135 INTEL_EVENT_CONSTRAINT(0xb7, 0),
136 INTEL_EVENT_CONSTRAINT(0xbb, 0),
137 EVENT_CONSTRAINT_END
138};
139
140static struct event_constraint intel_gen_event_constraints[] __read_mostly =
80{ 141{
81 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ 142 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
82 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ 143 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
@@ -89,6 +150,103 @@ static u64 intel_pmu_event_map(int hw_event)
89 return intel_perfmon_event_map[hw_event]; 150 return intel_perfmon_event_map[hw_event];
90} 151}
91 152
153static __initconst const u64 snb_hw_cache_event_ids
154 [PERF_COUNT_HW_CACHE_MAX]
155 [PERF_COUNT_HW_CACHE_OP_MAX]
156 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
157{
158 [ C(L1D) ] = {
159 [ C(OP_READ) ] = {
160 [ C(RESULT_ACCESS) ] = 0xf1d0, /* MEM_UOP_RETIRED.LOADS */
161 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPLACEMENT */
162 },
163 [ C(OP_WRITE) ] = {
164 [ C(RESULT_ACCESS) ] = 0xf2d0, /* MEM_UOP_RETIRED.STORES */
165 [ C(RESULT_MISS) ] = 0x0851, /* L1D.ALL_M_REPLACEMENT */
166 },
167 [ C(OP_PREFETCH) ] = {
168 [ C(RESULT_ACCESS) ] = 0x0,
169 [ C(RESULT_MISS) ] = 0x024e, /* HW_PRE_REQ.DL1_MISS */
170 },
171 },
172 [ C(L1I ) ] = {
173 [ C(OP_READ) ] = {
174 [ C(RESULT_ACCESS) ] = 0x0,
175 [ C(RESULT_MISS) ] = 0x0280, /* ICACHE.MISSES */
176 },
177 [ C(OP_WRITE) ] = {
178 [ C(RESULT_ACCESS) ] = -1,
179 [ C(RESULT_MISS) ] = -1,
180 },
181 [ C(OP_PREFETCH) ] = {
182 [ C(RESULT_ACCESS) ] = 0x0,
183 [ C(RESULT_MISS) ] = 0x0,
184 },
185 },
186 [ C(LL ) ] = {
187 [ C(OP_READ) ] = {
188 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
189 [ C(RESULT_ACCESS) ] = 0x01b7,
190 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
191 [ C(RESULT_MISS) ] = 0x01b7,
192 },
193 [ C(OP_WRITE) ] = {
194 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
195 [ C(RESULT_ACCESS) ] = 0x01b7,
196 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
197 [ C(RESULT_MISS) ] = 0x01b7,
198 },
199 [ C(OP_PREFETCH) ] = {
200 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
201 [ C(RESULT_ACCESS) ] = 0x01b7,
202 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
203 [ C(RESULT_MISS) ] = 0x01b7,
204 },
205 },
206 [ C(DTLB) ] = {
207 [ C(OP_READ) ] = {
208 [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_UOP_RETIRED.ALL_LOADS */
209 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.CAUSES_A_WALK */
210 },
211 [ C(OP_WRITE) ] = {
212 [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_UOP_RETIRED.ALL_STORES */
213 [ C(RESULT_MISS) ] = 0x0149, /* DTLB_STORE_MISSES.MISS_CAUSES_A_WALK */
214 },
215 [ C(OP_PREFETCH) ] = {
216 [ C(RESULT_ACCESS) ] = 0x0,
217 [ C(RESULT_MISS) ] = 0x0,
218 },
219 },
220 [ C(ITLB) ] = {
221 [ C(OP_READ) ] = {
222 [ C(RESULT_ACCESS) ] = 0x1085, /* ITLB_MISSES.STLB_HIT */
223 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.CAUSES_A_WALK */
224 },
225 [ C(OP_WRITE) ] = {
226 [ C(RESULT_ACCESS) ] = -1,
227 [ C(RESULT_MISS) ] = -1,
228 },
229 [ C(OP_PREFETCH) ] = {
230 [ C(RESULT_ACCESS) ] = -1,
231 [ C(RESULT_MISS) ] = -1,
232 },
233 },
234 [ C(BPU ) ] = {
235 [ C(OP_READ) ] = {
236 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
237 [ C(RESULT_MISS) ] = 0x00c5, /* BR_MISP_RETIRED.ALL_BRANCHES */
238 },
239 [ C(OP_WRITE) ] = {
240 [ C(RESULT_ACCESS) ] = -1,
241 [ C(RESULT_MISS) ] = -1,
242 },
243 [ C(OP_PREFETCH) ] = {
244 [ C(RESULT_ACCESS) ] = -1,
245 [ C(RESULT_MISS) ] = -1,
246 },
247 },
248};
249
92static __initconst const u64 westmere_hw_cache_event_ids 250static __initconst const u64 westmere_hw_cache_event_ids
93 [PERF_COUNT_HW_CACHE_MAX] 251 [PERF_COUNT_HW_CACHE_MAX]
94 [PERF_COUNT_HW_CACHE_OP_MAX] 252 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -124,16 +282,26 @@ static __initconst const u64 westmere_hw_cache_event_ids
124 }, 282 },
125 [ C(LL ) ] = { 283 [ C(LL ) ] = {
126 [ C(OP_READ) ] = { 284 [ C(OP_READ) ] = {
127 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 285 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
128 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 286 [ C(RESULT_ACCESS) ] = 0x01b7,
287 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
288 [ C(RESULT_MISS) ] = 0x01b7,
129 }, 289 },
290 /*
291 * Use RFO, not WRITEBACK, because a write miss would typically occur
292 * on RFO.
293 */
130 [ C(OP_WRITE) ] = { 294 [ C(OP_WRITE) ] = {
131 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 295 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
132 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 296 [ C(RESULT_ACCESS) ] = 0x01b7,
297 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
298 [ C(RESULT_MISS) ] = 0x01b7,
133 }, 299 },
134 [ C(OP_PREFETCH) ] = { 300 [ C(OP_PREFETCH) ] = {
135 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 301 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
136 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 302 [ C(RESULT_ACCESS) ] = 0x01b7,
303 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
304 [ C(RESULT_MISS) ] = 0x01b7,
137 }, 305 },
138 }, 306 },
139 [ C(DTLB) ] = { 307 [ C(DTLB) ] = {
@@ -180,6 +348,59 @@ static __initconst const u64 westmere_hw_cache_event_ids
180 }, 348 },
181}; 349};
182 350
351/*
352 * Nehalem/Westmere MSR_OFFCORE_RESPONSE bits;
353 * See IA32 SDM Vol 3B 30.6.1.3
354 */
355
356#define NHM_DMND_DATA_RD (1 << 0)
357#define NHM_DMND_RFO (1 << 1)
358#define NHM_DMND_IFETCH (1 << 2)
359#define NHM_DMND_WB (1 << 3)
360#define NHM_PF_DATA_RD (1 << 4)
361#define NHM_PF_DATA_RFO (1 << 5)
362#define NHM_PF_IFETCH (1 << 6)
363#define NHM_OFFCORE_OTHER (1 << 7)
364#define NHM_UNCORE_HIT (1 << 8)
365#define NHM_OTHER_CORE_HIT_SNP (1 << 9)
366#define NHM_OTHER_CORE_HITM (1 << 10)
367 /* reserved */
368#define NHM_REMOTE_CACHE_FWD (1 << 12)
369#define NHM_REMOTE_DRAM (1 << 13)
370#define NHM_LOCAL_DRAM (1 << 14)
371#define NHM_NON_DRAM (1 << 15)
372
373#define NHM_ALL_DRAM (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
374
375#define NHM_DMND_READ (NHM_DMND_DATA_RD)
376#define NHM_DMND_WRITE (NHM_DMND_RFO|NHM_DMND_WB)
377#define NHM_DMND_PREFETCH (NHM_PF_DATA_RD|NHM_PF_DATA_RFO)
378
379#define NHM_L3_HIT (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
380#define NHM_L3_MISS (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
381#define NHM_L3_ACCESS (NHM_L3_HIT|NHM_L3_MISS)
382
383static __initconst const u64 nehalem_hw_cache_extra_regs
384 [PERF_COUNT_HW_CACHE_MAX]
385 [PERF_COUNT_HW_CACHE_OP_MAX]
386 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
387{
388 [ C(LL ) ] = {
389 [ C(OP_READ) ] = {
390 [ C(RESULT_ACCESS) ] = NHM_DMND_READ|NHM_L3_ACCESS,
391 [ C(RESULT_MISS) ] = NHM_DMND_READ|NHM_L3_MISS,
392 },
393 [ C(OP_WRITE) ] = {
394 [ C(RESULT_ACCESS) ] = NHM_DMND_WRITE|NHM_L3_ACCESS,
395 [ C(RESULT_MISS) ] = NHM_DMND_WRITE|NHM_L3_MISS,
396 },
397 [ C(OP_PREFETCH) ] = {
398 [ C(RESULT_ACCESS) ] = NHM_DMND_PREFETCH|NHM_L3_ACCESS,
399 [ C(RESULT_MISS) ] = NHM_DMND_PREFETCH|NHM_L3_MISS,
400 },
401 }
402};
403
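The composite masks above pin down fixed values for the LL rows of nehalem_hw_cache_extra_regs: NHM_L3_HIT is 0x0700, NHM_L3_MISS is 0xf000, so a demand read that may hit or miss L3 programs MSR_OFFCORE_RSP_0 with 0xf701 and the miss-only variant with 0xf001. A small userspace sketch (constants re-declared locally from the definitions above; not kernel code) makes the arithmetic checkable by hand:

/* Userspace sketch: recompute the LL read rows of
 * nehalem_hw_cache_extra_regs from the NHM_* bit definitions above.
 * Values are derived here for checking only; this is not kernel code. */
#include <assert.h>
#include <stdio.h>

#define NHM_DMND_DATA_RD        (1 << 0)
#define NHM_UNCORE_HIT          (1 << 8)
#define NHM_OTHER_CORE_HIT_SNP  (1 << 9)
#define NHM_OTHER_CORE_HITM     (1 << 10)
#define NHM_REMOTE_CACHE_FWD    (1 << 12)
#define NHM_REMOTE_DRAM         (1 << 13)
#define NHM_LOCAL_DRAM          (1 << 14)
#define NHM_NON_DRAM            (1 << 15)

#define NHM_ALL_DRAM    (NHM_REMOTE_DRAM|NHM_LOCAL_DRAM)
#define NHM_DMND_READ   (NHM_DMND_DATA_RD)
#define NHM_L3_HIT      (NHM_UNCORE_HIT|NHM_OTHER_CORE_HIT_SNP|NHM_OTHER_CORE_HITM)
#define NHM_L3_MISS     (NHM_NON_DRAM|NHM_ALL_DRAM|NHM_REMOTE_CACHE_FWD)
#define NHM_L3_ACCESS   (NHM_L3_HIT|NHM_L3_MISS)

int main(void)
{
        /* C(LL)/C(OP_READ) access and miss entries */
        assert((NHM_DMND_READ|NHM_L3_ACCESS) == 0xf701);
        assert((NHM_DMND_READ|NHM_L3_MISS)   == 0xf001);
        printf("LL read access=0x%04x miss=0x%04x\n",
               NHM_DMND_READ|NHM_L3_ACCESS, NHM_DMND_READ|NHM_L3_MISS);
        return 0;
}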
183static __initconst const u64 nehalem_hw_cache_event_ids 404static __initconst const u64 nehalem_hw_cache_event_ids
184 [PERF_COUNT_HW_CACHE_MAX] 405 [PERF_COUNT_HW_CACHE_MAX]
185 [PERF_COUNT_HW_CACHE_OP_MAX] 406 [PERF_COUNT_HW_CACHE_OP_MAX]
@@ -187,12 +408,12 @@ static __initconst const u64 nehalem_hw_cache_event_ids
187{ 408{
188 [ C(L1D) ] = { 409 [ C(L1D) ] = {
189 [ C(OP_READ) ] = { 410 [ C(OP_READ) ] = {
190 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */ 411 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
191 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */ 412 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
192 }, 413 },
193 [ C(OP_WRITE) ] = { 414 [ C(OP_WRITE) ] = {
194 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */ 415 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
195 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */ 416 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
196 }, 417 },
197 [ C(OP_PREFETCH) ] = { 418 [ C(OP_PREFETCH) ] = {
198 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */ 419 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
@@ -215,16 +436,26 @@ static __initconst const u64 nehalem_hw_cache_event_ids
215 }, 436 },
216 [ C(LL ) ] = { 437 [ C(LL ) ] = {
217 [ C(OP_READ) ] = { 438 [ C(OP_READ) ] = {
218 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */ 439 /* OFFCORE_RESPONSE.ANY_DATA.LOCAL_CACHE */
219 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */ 440 [ C(RESULT_ACCESS) ] = 0x01b7,
441 /* OFFCORE_RESPONSE.ANY_DATA.ANY_LLC_MISS */
442 [ C(RESULT_MISS) ] = 0x01b7,
220 }, 443 },
444 /*
445 * Use RFO, not WRITEBACK, because a write miss would typically occur
446 * on RFO.
447 */
221 [ C(OP_WRITE) ] = { 448 [ C(OP_WRITE) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */ 449 /* OFFCORE_RESPONSE.ANY_RFO.LOCAL_CACHE */
223 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */ 450 [ C(RESULT_ACCESS) ] = 0x01b7,
451 /* OFFCORE_RESPONSE.ANY_RFO.ANY_LLC_MISS */
452 [ C(RESULT_MISS) ] = 0x01b7,
224 }, 453 },
225 [ C(OP_PREFETCH) ] = { 454 [ C(OP_PREFETCH) ] = {
226 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */ 455 /* OFFCORE_RESPONSE.PREFETCH.LOCAL_CACHE */
227 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */ 456 [ C(RESULT_ACCESS) ] = 0x01b7,
457 /* OFFCORE_RESPONSE.PREFETCH.ANY_LLC_MISS */
458 [ C(RESULT_MISS) ] = 0x01b7,
228 }, 459 },
229 }, 460 },
230 [ C(DTLB) ] = { 461 [ C(DTLB) ] = {
@@ -649,7 +880,7 @@ static void intel_pmu_enable_event(struct perf_event *event)
649 struct hw_perf_event *hwc = &event->hw; 880 struct hw_perf_event *hwc = &event->hw;
650 881
651 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) { 882 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
652 if (!__get_cpu_var(cpu_hw_events).enabled) 883 if (!__this_cpu_read(cpu_hw_events.enabled))
653 return; 884 return;
654 885
655 intel_pmu_enable_bts(hwc->config); 886 intel_pmu_enable_bts(hwc->config);
@@ -679,7 +910,7 @@ static int intel_pmu_save_and_restart(struct perf_event *event)
679 910
680static void intel_pmu_reset(void) 911static void intel_pmu_reset(void)
681{ 912{
682 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds; 913 struct debug_store *ds = __this_cpu_read(cpu_hw_events.ds);
683 unsigned long flags; 914 unsigned long flags;
684 int idx; 915 int idx;
685 916
@@ -691,8 +922,8 @@ static void intel_pmu_reset(void)
691 printk("clearing PMU state on CPU#%d\n", smp_processor_id()); 922 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
692 923
693 for (idx = 0; idx < x86_pmu.num_counters; idx++) { 924 for (idx = 0; idx < x86_pmu.num_counters; idx++) {
694 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); 925 checking_wrmsrl(x86_pmu_config_addr(idx), 0ull);
695 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); 926 checking_wrmsrl(x86_pmu_event_addr(idx), 0ull);
696 } 927 }
697 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) 928 for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++)
698 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); 929 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
@@ -713,18 +944,28 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
713 struct cpu_hw_events *cpuc; 944 struct cpu_hw_events *cpuc;
714 int bit, loops; 945 int bit, loops;
715 u64 status; 946 u64 status;
716 int handled = 0; 947 int handled;
717 948
718 perf_sample_data_init(&data, 0); 949 perf_sample_data_init(&data, 0);
719 950
720 cpuc = &__get_cpu_var(cpu_hw_events); 951 cpuc = &__get_cpu_var(cpu_hw_events);
721 952
953 /*
954 * Some chipsets need to unmask the LVTPC in a particular spot
955 * inside the nmi handler. As a result, the unmasking was pushed
956 * into all the nmi handlers.
957 *
958 * This handler doesn't seem to have any issues with the unmasking
959 * so it was left at the top.
960 */
961 apic_write(APIC_LVTPC, APIC_DM_NMI);
962
722 intel_pmu_disable_all(); 963 intel_pmu_disable_all();
723 intel_pmu_drain_bts_buffer(); 964 handled = intel_pmu_drain_bts_buffer();
724 status = intel_pmu_get_status(); 965 status = intel_pmu_get_status();
725 if (!status) { 966 if (!status) {
726 intel_pmu_enable_all(0); 967 intel_pmu_enable_all(0);
727 return 0; 968 return handled;
728 } 969 }
729 970
730 loops = 0; 971 loops = 0;
@@ -763,7 +1004,7 @@ again:
763 data.period = event->hw.last_period; 1004 data.period = event->hw.last_period;
764 1005
765 if (perf_event_overflow(event, 1, &data, regs)) 1006 if (perf_event_overflow(event, 1, &data, regs))
766 x86_pmu_stop(event); 1007 x86_pmu_stop(event, 0);
767 } 1008 }
768 1009
769 /* 1010 /*
@@ -784,6 +1025,9 @@ intel_bts_constraints(struct perf_event *event)
784 struct hw_perf_event *hwc = &event->hw; 1025 struct hw_perf_event *hwc = &event->hw;
785 unsigned int hw_event, bts_event; 1026 unsigned int hw_event, bts_event;
786 1027
1028 if (event->attr.freq)
1029 return NULL;
1030
787 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK; 1031 hw_event = hwc->config & INTEL_ARCH_EVENT_MASK;
788 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS); 1032 bts_event = x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS);
789 1033
@@ -794,6 +1038,67 @@ intel_bts_constraints(struct perf_event *event)
794} 1038}
795 1039
796static struct event_constraint * 1040static struct event_constraint *
1041intel_percore_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1042{
1043 struct hw_perf_event *hwc = &event->hw;
1044 unsigned int e = hwc->config & ARCH_PERFMON_EVENTSEL_EVENT;
1045 struct event_constraint *c;
1046 struct intel_percore *pc;
1047 struct er_account *era;
1048 int i;
1049 int free_slot;
1050 int found;
1051
1052 if (!x86_pmu.percore_constraints || hwc->extra_alloc)
1053 return NULL;
1054
1055 for (c = x86_pmu.percore_constraints; c->cmask; c++) {
1056 if (e != c->code)
1057 continue;
1058
1059 /*
1060 * Allocate resource per core.
1061 */
1062 pc = cpuc->per_core;
1063 if (!pc)
1064 break;
1065 c = &emptyconstraint;
1066 raw_spin_lock(&pc->lock);
1067 free_slot = -1;
1068 found = 0;
1069 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1070 era = &pc->regs[i];
1071 if (era->ref > 0 && hwc->extra_reg == era->extra_reg) {
1072 /* Allow sharing same config */
1073 if (hwc->extra_config == era->extra_config) {
1074 era->ref++;
1075 cpuc->percore_used = 1;
1076 hwc->extra_alloc = 1;
1077 c = NULL;
1078 }
1079 /* else conflict */
1080 found = 1;
1081 break;
1082 } else if (era->ref == 0 && free_slot == -1)
1083 free_slot = i;
1084 }
1085 if (!found && free_slot != -1) {
1086 era = &pc->regs[free_slot];
1087 era->ref = 1;
1088 era->extra_reg = hwc->extra_reg;
1089 era->extra_config = hwc->extra_config;
1090 cpuc->percore_used = 1;
1091 hwc->extra_alloc = 1;
1092 c = NULL;
1093 }
1094 raw_spin_unlock(&pc->lock);
1095 return c;
1096 }
1097
1098 return NULL;
1099}
1100
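In effect, intel_percore_constraints() treats the offcore-response MSR as a core-wide resource: sibling threads may both schedule event 0xb7 (or 0xbb) only when they program an identical extra_config; a second event with a different config receives emptyconstraint and is kept off the PMU. A minimal userspace sketch of that sharing rule (no locking, simplified slot array, illustrative names):

/* Sketch of the per-core extra-register sharing rule; single-threaded,
 * names and the two-slot array are illustrative, not the kernel's. */
#include <stdio.h>

struct era { unsigned int msr; unsigned long long config; int ref; };
static struct era slots[2];     /* stands in for pc->regs[MAX_EXTRA_REGS] */

/* 0 = event may be scheduled, -1 = conflicting config, reject */
static int try_share(unsigned int msr, unsigned long long config)
{
        int i, free_slot = -1;

        for (i = 0; i < 2; i++) {
                if (slots[i].ref && slots[i].msr == msr) {
                        if (slots[i].config != config)
                                return -1;      /* conflicting programming */
                        slots[i].ref++;         /* identical config: share it */
                        return 0;
                }
                if (!slots[i].ref && free_slot < 0)
                        free_slot = i;
        }
        if (free_slot < 0)
                return -1;
        slots[free_slot] = (struct era){ msr, config, 1 };
        return 0;
}

int main(void)
{
        /* msr id 0 stands in for MSR_OFFCORE_RSP_0 */
        printf("%d\n", try_share(0, 0x4711));   /* first user: 0 (ok)          */
        printf("%d\n", try_share(0, 0x4711));   /* same config: 0 (shared)     */
        printf("%d\n", try_share(0, 0x0042));   /* other config: -1 (rejected) */
        return 0;
}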
1101static struct event_constraint *
797intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event) 1102intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
798{ 1103{
799 struct event_constraint *c; 1104 struct event_constraint *c;
@@ -806,9 +1111,51 @@ intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event
806 if (c) 1111 if (c)
807 return c; 1112 return c;
808 1113
1114 c = intel_percore_constraints(cpuc, event);
1115 if (c)
1116 return c;
1117
809 return x86_get_event_constraints(cpuc, event); 1118 return x86_get_event_constraints(cpuc, event);
810} 1119}
811 1120
1121static void intel_put_event_constraints(struct cpu_hw_events *cpuc,
1122 struct perf_event *event)
1123{
1124 struct extra_reg *er;
1125 struct intel_percore *pc;
1126 struct er_account *era;
1127 struct hw_perf_event *hwc = &event->hw;
1128 int i, allref;
1129
1130 if (!cpuc->percore_used)
1131 return;
1132
1133 for (er = x86_pmu.extra_regs; er->msr; er++) {
1134 if (er->event != (hwc->config & er->config_mask))
1135 continue;
1136
1137 pc = cpuc->per_core;
1138 raw_spin_lock(&pc->lock);
1139 for (i = 0; i < MAX_EXTRA_REGS; i++) {
1140 era = &pc->regs[i];
1141 if (era->ref > 0 &&
1142 era->extra_config == hwc->extra_config &&
1143 era->extra_reg == er->msr) {
1144 era->ref--;
1145 hwc->extra_alloc = 0;
1146 break;
1147 }
1148 }
1149 allref = 0;
1150 for (i = 0; i < MAX_EXTRA_REGS; i++)
1151 allref += pc->regs[i].ref;
1152 if (allref == 0)
1153 cpuc->percore_used = 0;
1154 raw_spin_unlock(&pc->lock);
1155 break;
1156 }
1157}
1158
812static int intel_pmu_hw_config(struct perf_event *event) 1159static int intel_pmu_hw_config(struct perf_event *event)
813{ 1160{
814 int ret = x86_pmu_hw_config(event); 1161 int ret = x86_pmu_hw_config(event);
@@ -816,6 +1163,32 @@ static int intel_pmu_hw_config(struct perf_event *event)
816 if (ret) 1163 if (ret)
817 return ret; 1164 return ret;
818 1165
1166 if (event->attr.precise_ip &&
1167 (event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) {
1168 /*
1169 * Use an alternative encoding for CPU_CLK_UNHALTED.THREAD_P
1170 * (0x003c) so that we can use it with PEBS.
1171 *
1172 * The regular CPU_CLK_UNHALTED.THREAD_P event (0x003c) isn't
1173 * PEBS capable. However we can use INST_RETIRED.ANY_P
1174 * (0x00c0), which is a PEBS capable event, to get the same
1175 * count.
1176 *
1177 * INST_RETIRED.ANY_P counts the number of cycles that retire
1178 * CNTMASK instructions. By setting CNTMASK to a value (16)
1179 * larger than the maximum number of instructions that can be
1180 * retired per cycle (4) and then inverting the condition, we
1181 * count all cycles that retire 16 or less instructions, which
1182 * is every cycle.
1183 *
1184 * Thereby we gain a PEBS capable cycle counter.
1185 */
1186 u64 alt_config = 0x108000c0; /* INST_RETIRED.TOTAL_CYCLES */
1187
1188 alt_config |= (event->hw.config & ~X86_RAW_EVENT_MASK);
1189 event->hw.config = alt_config;
1190 }
1191
819 if (event->attr.type != PERF_TYPE_RAW) 1192 if (event->attr.type != PERF_TYPE_RAW)
820 return 0; 1193 return 0;
821 1194
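The 0x108000c0 constant is just the architectural IA32_PERFEVTSEL packing of event 0xc0 with INV set and CMASK = 16, as the comment above describes. A quick userspace check of that packing (standard field offsets assumed; sketch only, not kernel code):

/* Verify the INST_RETIRED.TOTAL_CYCLES alternative encoding.
 * Field positions follow the architectural IA32_PERFEVTSELx layout:
 * event select [7:0], unit mask [15:8], INV bit 23, CMASK [31:24]. */
#include <assert.h>
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        uint64_t event = 0xc0;                  /* INST_RETIRED.ANY_P */
        uint64_t inv   = 1ULL << 23;            /* invert counter-mask condition */
        uint64_t cmask = 16ULL << 24;           /* > max 4 insns retired/cycle */
        uint64_t alt   = event | inv | cmask;

        assert(alt == 0x108000c0);
        printf("alt_config = 0x%llx\n", (unsigned long long)alt);
        return 0;
}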
@@ -854,20 +1227,67 @@ static __initconst const struct x86_pmu core_pmu = {
854 */ 1227 */
855 .max_period = (1ULL << 31) - 1, 1228 .max_period = (1ULL << 31) - 1,
856 .get_event_constraints = intel_get_event_constraints, 1229 .get_event_constraints = intel_get_event_constraints,
1230 .put_event_constraints = intel_put_event_constraints,
857 .event_constraints = intel_core_event_constraints, 1231 .event_constraints = intel_core_event_constraints,
858}; 1232};
859 1233
1234static int intel_pmu_cpu_prepare(int cpu)
1235{
1236 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1237
1238 if (!cpu_has_ht_siblings())
1239 return NOTIFY_OK;
1240
1241 cpuc->per_core = kzalloc_node(sizeof(struct intel_percore),
1242 GFP_KERNEL, cpu_to_node(cpu));
1243 if (!cpuc->per_core)
1244 return NOTIFY_BAD;
1245
1246 raw_spin_lock_init(&cpuc->per_core->lock);
1247 cpuc->per_core->core_id = -1;
1248 return NOTIFY_OK;
1249}
1250
860static void intel_pmu_cpu_starting(int cpu) 1251static void intel_pmu_cpu_starting(int cpu)
861{ 1252{
1253 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1254 int core_id = topology_core_id(cpu);
1255 int i;
1256
862 init_debug_store_on_cpu(cpu); 1257 init_debug_store_on_cpu(cpu);
863 /* 1258 /*
864 * Deal with CPUs that don't clear their LBRs on power-up. 1259 * Deal with CPUs that don't clear their LBRs on power-up.
865 */ 1260 */
866 intel_pmu_lbr_reset(); 1261 intel_pmu_lbr_reset();
1262
1263 if (!cpu_has_ht_siblings())
1264 return;
1265
1266 for_each_cpu(i, topology_thread_cpumask(cpu)) {
1267 struct intel_percore *pc = per_cpu(cpu_hw_events, i).per_core;
1268
1269 if (pc && pc->core_id == core_id) {
1270 kfree(cpuc->per_core);
1271 cpuc->per_core = pc;
1272 break;
1273 }
1274 }
1275
1276 cpuc->per_core->core_id = core_id;
1277 cpuc->per_core->refcnt++;
867} 1278}
868 1279
869static void intel_pmu_cpu_dying(int cpu) 1280static void intel_pmu_cpu_dying(int cpu)
870{ 1281{
1282 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
1283 struct intel_percore *pc = cpuc->per_core;
1284
1285 if (pc) {
1286 if (pc->core_id == -1 || --pc->refcnt == 0)
1287 kfree(pc);
1288 cpuc->per_core = NULL;
1289 }
1290
871 fini_debug_store_on_cpu(cpu); 1291 fini_debug_store_on_cpu(cpu);
872} 1292}
873 1293
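Taken together, the prepare/starting/dying callbacks leave exactly one struct intel_percore per physical core: every logical CPU allocates a candidate in cpu_prepare, the later HT sibling to start frees its own copy and takes a reference on the one already tagged with its core_id, and cpu_dying drops the reference, freeing the structure when the last sibling goes away. A compressed userspace model of that lifecycle, assuming two sibling CPUs on one core (illustrative only, no hotplug locking):

/* Userspace model of the per-core structure sharing done by the
 * intel_pmu_cpu_{prepare,starting,dying} callbacks. CPUs 0 and 1 are
 * assumed to be HT siblings on core 7. */
#include <assert.h>
#include <stdlib.h>

struct percore { int core_id; int refcnt; };
static struct percore *per_core[2];

static void prepare(int cpu)
{
        per_core[cpu] = calloc(1, sizeof(*per_core[cpu]));
        per_core[cpu]->core_id = -1;
}

static void starting(int cpu, int core_id)
{
        int i;

        for (i = 0; i < 2; i++) {               /* siblings of @cpu */
                if (i != cpu && per_core[i] && per_core[i]->core_id == core_id) {
                        free(per_core[cpu]);            /* drop our copy ...      */
                        per_core[cpu] = per_core[i];    /* ... share the sibling's */
                        break;
                }
        }
        per_core[cpu]->core_id = core_id;
        per_core[cpu]->refcnt++;
}

static void dying(int cpu)
{
        struct percore *pc = per_core[cpu];

        if (pc && (pc->core_id == -1 || --pc->refcnt == 0))
                free(pc);
        per_core[cpu] = NULL;
}

int main(void)
{
        prepare(0); prepare(1);
        starting(0, 7); starting(1, 7);
        assert(per_core[0] == per_core[1] && per_core[0]->refcnt == 2);
        dying(1);                               /* structure survives        */
        dying(0);                               /* last sibling frees it     */
        return 0;
}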
@@ -892,7 +1312,9 @@ static __initconst const struct x86_pmu intel_pmu = {
892 */ 1312 */
893 .max_period = (1ULL << 31) - 1, 1313 .max_period = (1ULL << 31) - 1,
894 .get_event_constraints = intel_get_event_constraints, 1314 .get_event_constraints = intel_get_event_constraints,
1315 .put_event_constraints = intel_put_event_constraints,
895 1316
1317 .cpu_prepare = intel_pmu_cpu_prepare,
896 .cpu_starting = intel_pmu_cpu_starting, 1318 .cpu_starting = intel_pmu_cpu_starting,
897 .cpu_dying = intel_pmu_cpu_dying, 1319 .cpu_dying = intel_pmu_cpu_dying,
898}; 1320};
@@ -913,7 +1335,7 @@ static void intel_clovertown_quirks(void)
913 * AJ106 could possibly be worked around by not allowing LBR 1335 * AJ106 could possibly be worked around by not allowing LBR
914 * usage from PEBS, including the fixup. 1336 * usage from PEBS, including the fixup.
915 * AJ68 could possibly be worked around by always programming 1337 * AJ68 could possibly be worked around by always programming
916 * a pebs_event_reset[0] value and coping with the lost events. 1338 * a pebs_event_reset[0] value and coping with the lost events.
917 * 1339 *
918 * But taken together it might just make sense to not enable PEBS on 1340 * But taken together it might just make sense to not enable PEBS on
919 * these chips. 1341 * these chips.
@@ -998,6 +1420,7 @@ static __init int intel_pmu_init(void)
998 intel_pmu_lbr_init_core(); 1420 intel_pmu_lbr_init_core();
999 1421
1000 x86_pmu.event_constraints = intel_core2_event_constraints; 1422 x86_pmu.event_constraints = intel_core2_event_constraints;
1423 x86_pmu.pebs_constraints = intel_core2_pebs_event_constraints;
1001 pr_cont("Core2 events, "); 1424 pr_cont("Core2 events, ");
1002 break; 1425 break;
1003 1426
@@ -1006,11 +1429,33 @@ static __init int intel_pmu_init(void)
1006 case 46: /* 45 nm nehalem-ex, "Beckton" */ 1429 case 46: /* 45 nm nehalem-ex, "Beckton" */
1007 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids, 1430 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
1008 sizeof(hw_cache_event_ids)); 1431 sizeof(hw_cache_event_ids));
1432 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1433 sizeof(hw_cache_extra_regs));
1009 1434
1010 intel_pmu_lbr_init_nhm(); 1435 intel_pmu_lbr_init_nhm();
1011 1436
1012 x86_pmu.event_constraints = intel_nehalem_event_constraints; 1437 x86_pmu.event_constraints = intel_nehalem_event_constraints;
1438 x86_pmu.pebs_constraints = intel_nehalem_pebs_event_constraints;
1439 x86_pmu.percore_constraints = intel_nehalem_percore_constraints;
1013 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1440 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1441 x86_pmu.extra_regs = intel_nehalem_extra_regs;
1442
1443 /* UOPS_ISSUED.STALLED_CYCLES */
1444 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1445 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1446 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1447
1448 if (ebx & 0x40) {
1449 /*
1450 * Erratum AAJ80 detected, we work it around by using
1451 * the BR_MISP_EXEC.ANY event. This will over-count
1452 * branch-misses, but it's still much better than the
1453 * architectural event which is often completely bogus:
1454 */
1455 intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89;
1456
1457 pr_cont("erratum AAJ80 worked around, ");
1458 }
1014 pr_cont("Nehalem events, "); 1459 pr_cont("Nehalem events, ");
1015 break; 1460 break;
1016 1461
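The stall-cycle map entries are raw encodings in the same IA32_PERFEVTSEL layout: 0x180010e is event 0x0e, umask 0x01 with CMASK = 1 and INV set, and 0x1803fb1 is event 0xb1, umask 0x3f with the same CMASK/INV pair. A throwaway decoder (userspace sketch, standard field offsets assumed) makes the field split explicit:

/* Decode the raw PERFEVTSEL encodings used for the stall-cycle events. */
#include <stdio.h>
#include <stdint.h>

static void decode(uint32_t cfg)
{
        printf("0x%08x: event=0x%02x umask=0x%02x cmask=%u inv=%u\n",
               cfg,
               cfg & 0xff,                      /* event select [7:0]   */
               (cfg >> 8) & 0xff,               /* unit mask   [15:8]   */
               (cfg >> 24) & 0xff,              /* counter mask [31:24] */
               (cfg >> 23) & 1);                /* invert, bit 23       */
}

int main(void)
{
        decode(0x180010e);      /* UOPS_ISSUED.STALLED_CYCLES               */
        decode(0x1803fb1);      /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
        return 0;
}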
@@ -1021,21 +1466,51 @@ static __init int intel_pmu_init(void)
1021 intel_pmu_lbr_init_atom(); 1466 intel_pmu_lbr_init_atom();
1022 1467
1023 x86_pmu.event_constraints = intel_gen_event_constraints; 1468 x86_pmu.event_constraints = intel_gen_event_constraints;
1469 x86_pmu.pebs_constraints = intel_atom_pebs_event_constraints;
1024 pr_cont("Atom events, "); 1470 pr_cont("Atom events, ");
1025 break; 1471 break;
1026 1472
1027 case 37: /* 32 nm nehalem, "Clarkdale" */ 1473 case 37: /* 32 nm nehalem, "Clarkdale" */
1028 case 44: /* 32 nm nehalem, "Gulftown" */ 1474 case 44: /* 32 nm nehalem, "Gulftown" */
1475 case 47: /* 32 nm Xeon E7 */
1029 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids, 1476 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
1030 sizeof(hw_cache_event_ids)); 1477 sizeof(hw_cache_event_ids));
1478 memcpy(hw_cache_extra_regs, nehalem_hw_cache_extra_regs,
1479 sizeof(hw_cache_extra_regs));
1031 1480
1032 intel_pmu_lbr_init_nhm(); 1481 intel_pmu_lbr_init_nhm();
1033 1482
1034 x86_pmu.event_constraints = intel_westmere_event_constraints; 1483 x86_pmu.event_constraints = intel_westmere_event_constraints;
1484 x86_pmu.percore_constraints = intel_westmere_percore_constraints;
1035 x86_pmu.enable_all = intel_pmu_nhm_enable_all; 1485 x86_pmu.enable_all = intel_pmu_nhm_enable_all;
1486 x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints;
1487 x86_pmu.extra_regs = intel_westmere_extra_regs;
1488
1489 /* UOPS_ISSUED.STALLED_CYCLES */
1490 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1491 /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */
1492 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1;
1493
1036 pr_cont("Westmere events, "); 1494 pr_cont("Westmere events, ");
1037 break; 1495 break;
1038 1496
1497 case 42: /* SandyBridge */
1498 memcpy(hw_cache_event_ids, snb_hw_cache_event_ids,
1499 sizeof(hw_cache_event_ids));
1500
1501 intel_pmu_lbr_init_nhm();
1502
1503 x86_pmu.event_constraints = intel_snb_event_constraints;
1504 x86_pmu.pebs_constraints = intel_snb_pebs_events;
1505
1506 /* UOPS_ISSUED.ANY,c=1,i=1 to count stall cycles */
1507 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e;
1508 /* UOPS_DISPATCHED.THREAD,c=1,i=1 to count stall cycles*/
1509 intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x18001b1;
1510
1511 pr_cont("SandyBridge events, ");
1512 break;
1513
1039 default: 1514 default:
1040 /* 1515 /*
1041 * default constraints for v2 and up 1516 * default constraints for v2 and up
diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c
index 18018d1311cd..bab491b8ee25 100644
--- a/arch/x86/kernel/cpu/perf_event_intel_ds.c
+++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c
@@ -74,6 +74,107 @@ static void fini_debug_store_on_cpu(int cpu)
74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0); 74 wrmsr_on_cpu(cpu, MSR_IA32_DS_AREA, 0, 0);
75} 75}
76 76
77static int alloc_pebs_buffer(int cpu)
78{
79 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
80 int node = cpu_to_node(cpu);
81 int max, thresh = 1; /* always use a single PEBS record */
82 void *buffer;
83
84 if (!x86_pmu.pebs)
85 return 0;
86
87 buffer = kmalloc_node(PEBS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
88 if (unlikely(!buffer))
89 return -ENOMEM;
90
91 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
92
93 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
94 ds->pebs_index = ds->pebs_buffer_base;
95 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
96 max * x86_pmu.pebs_record_size;
97
98 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
99 thresh * x86_pmu.pebs_record_size;
100
101 return 0;
102}
103
104static void release_pebs_buffer(int cpu)
105{
106 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
107
108 if (!ds || !x86_pmu.pebs)
109 return;
110
111 kfree((void *)(unsigned long)ds->pebs_buffer_base);
112 ds->pebs_buffer_base = 0;
113}
114
115static int alloc_bts_buffer(int cpu)
116{
117 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
118 int node = cpu_to_node(cpu);
119 int max, thresh;
120 void *buffer;
121
122 if (!x86_pmu.bts)
123 return 0;
124
125 buffer = kmalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_ZERO, node);
126 if (unlikely(!buffer))
127 return -ENOMEM;
128
129 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
130 thresh = max / 16;
131
132 ds->bts_buffer_base = (u64)(unsigned long)buffer;
133 ds->bts_index = ds->bts_buffer_base;
134 ds->bts_absolute_maximum = ds->bts_buffer_base +
135 max * BTS_RECORD_SIZE;
136 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
137 thresh * BTS_RECORD_SIZE;
138
139 return 0;
140}
141
142static void release_bts_buffer(int cpu)
143{
144 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
145
146 if (!ds || !x86_pmu.bts)
147 return;
148
149 kfree((void *)(unsigned long)ds->bts_buffer_base);
150 ds->bts_buffer_base = 0;
151}
152
153static int alloc_ds_buffer(int cpu)
154{
155 int node = cpu_to_node(cpu);
156 struct debug_store *ds;
157
158 ds = kmalloc_node(sizeof(*ds), GFP_KERNEL | __GFP_ZERO, node);
159 if (unlikely(!ds))
160 return -ENOMEM;
161
162 per_cpu(cpu_hw_events, cpu).ds = ds;
163
164 return 0;
165}
166
167static void release_ds_buffer(int cpu)
168{
169 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
170
171 if (!ds)
172 return;
173
174 per_cpu(cpu_hw_events, cpu).ds = NULL;
175 kfree(ds);
176}
177
77static void release_ds_buffers(void) 178static void release_ds_buffers(void)
78{ 179{
79 int cpu; 180 int cpu;
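The thresholds fall straight out of the buffer geometry: PEBS arms its interrupt one record past the base, so every record raises a PMI, while BTS lets the buffer fill to within max/16 records of the end before signalling. The sketch below redoes that arithmetic in userspace with placeholder sizes; the real PEBS_BUFFER_SIZE, BTS_BUFFER_SIZE and record sizes are defined elsewhere in this file and are not reproduced here.

/* Sketch of the DS threshold arithmetic; the sizes are placeholders,
 * not the kernel's actual buffer or record sizes. */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
        const uint64_t base = 0x1000;           /* pretend buffer address */

        /* PEBS: threshold one record past the base -> interrupt per record */
        uint64_t pebs_buf_size = 64 * 1024;     /* placeholder */
        uint64_t pebs_rec_size = 144;           /* placeholder */
        uint64_t pebs_max      = pebs_buf_size / pebs_rec_size;
        uint64_t pebs_thresh   = base + 1 * pebs_rec_size;

        /* BTS: threshold max/16 records short of the absolute maximum */
        uint64_t bts_buf_size = 64 * 1024;      /* placeholder */
        uint64_t bts_rec_size = 24;             /* placeholder */
        uint64_t bts_max      = bts_buf_size / bts_rec_size;
        uint64_t bts_abs_max  = base + bts_max * bts_rec_size;
        uint64_t bts_thresh   = bts_abs_max - (bts_max / 16) * bts_rec_size;

        printf("pebs: %llu records, irq threshold at +%llu bytes\n",
               (unsigned long long)pebs_max, (unsigned long long)(pebs_thresh - base));
        printf("bts:  %llu records, irq threshold at +%llu bytes\n",
               (unsigned long long)bts_max, (unsigned long long)(bts_thresh - base));
        return 0;
}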
@@ -82,93 +183,77 @@ static void release_ds_buffers(void)
82 return; 183 return;
83 184
84 get_online_cpus(); 185 get_online_cpus();
85
86 for_each_online_cpu(cpu) 186 for_each_online_cpu(cpu)
87 fini_debug_store_on_cpu(cpu); 187 fini_debug_store_on_cpu(cpu);
88 188
89 for_each_possible_cpu(cpu) { 189 for_each_possible_cpu(cpu) {
90 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 190 release_pebs_buffer(cpu);
91 191 release_bts_buffer(cpu);
92 if (!ds) 192 release_ds_buffer(cpu);
93 continue;
94
95 per_cpu(cpu_hw_events, cpu).ds = NULL;
96
97 kfree((void *)(unsigned long)ds->pebs_buffer_base);
98 kfree((void *)(unsigned long)ds->bts_buffer_base);
99 kfree(ds);
100 } 193 }
101
102 put_online_cpus(); 194 put_online_cpus();
103} 195}
104 196
105static int reserve_ds_buffers(void) 197static void reserve_ds_buffers(void)
106{ 198{
107 int cpu, err = 0; 199 int bts_err = 0, pebs_err = 0;
200 int cpu;
201
202 x86_pmu.bts_active = 0;
203 x86_pmu.pebs_active = 0;
108 204
109 if (!x86_pmu.bts && !x86_pmu.pebs) 205 if (!x86_pmu.bts && !x86_pmu.pebs)
110 return 0; 206 return;
207
208 if (!x86_pmu.bts)
209 bts_err = 1;
210
211 if (!x86_pmu.pebs)
212 pebs_err = 1;
111 213
112 get_online_cpus(); 214 get_online_cpus();
113 215
114 for_each_possible_cpu(cpu) { 216 for_each_possible_cpu(cpu) {
115 struct debug_store *ds; 217 if (alloc_ds_buffer(cpu)) {
116 void *buffer; 218 bts_err = 1;
117 int max, thresh; 219 pebs_err = 1;
220 }
118 221
119 err = -ENOMEM; 222 if (!bts_err && alloc_bts_buffer(cpu))
120 ds = kzalloc(sizeof(*ds), GFP_KERNEL); 223 bts_err = 1;
121 if (unlikely(!ds)) 224
225 if (!pebs_err && alloc_pebs_buffer(cpu))
226 pebs_err = 1;
227
228 if (bts_err && pebs_err)
122 break; 229 break;
123 per_cpu(cpu_hw_events, cpu).ds = ds; 230 }
124
125 if (x86_pmu.bts) {
126 buffer = kzalloc(BTS_BUFFER_SIZE, GFP_KERNEL);
127 if (unlikely(!buffer))
128 break;
129
130 max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE;
131 thresh = max / 16;
132
133 ds->bts_buffer_base = (u64)(unsigned long)buffer;
134 ds->bts_index = ds->bts_buffer_base;
135 ds->bts_absolute_maximum = ds->bts_buffer_base +
136 max * BTS_RECORD_SIZE;
137 ds->bts_interrupt_threshold = ds->bts_absolute_maximum -
138 thresh * BTS_RECORD_SIZE;
139 }
140 231
141 if (x86_pmu.pebs) { 232 if (bts_err) {
142 buffer = kzalloc(PEBS_BUFFER_SIZE, GFP_KERNEL); 233 for_each_possible_cpu(cpu)
143 if (unlikely(!buffer)) 234 release_bts_buffer(cpu);
144 break; 235 }
145
146 max = PEBS_BUFFER_SIZE / x86_pmu.pebs_record_size;
147
148 ds->pebs_buffer_base = (u64)(unsigned long)buffer;
149 ds->pebs_index = ds->pebs_buffer_base;
150 ds->pebs_absolute_maximum = ds->pebs_buffer_base +
151 max * x86_pmu.pebs_record_size;
152 /*
153 * Always use single record PEBS
154 */
155 ds->pebs_interrupt_threshold = ds->pebs_buffer_base +
156 x86_pmu.pebs_record_size;
157 }
158 236
159 err = 0; 237 if (pebs_err) {
238 for_each_possible_cpu(cpu)
239 release_pebs_buffer(cpu);
160 } 240 }
161 241
162 if (err) 242 if (bts_err && pebs_err) {
163 release_ds_buffers(); 243 for_each_possible_cpu(cpu)
164 else { 244 release_ds_buffer(cpu);
245 } else {
246 if (x86_pmu.bts && !bts_err)
247 x86_pmu.bts_active = 1;
248
249 if (x86_pmu.pebs && !pebs_err)
250 x86_pmu.pebs_active = 1;
251
165 for_each_online_cpu(cpu) 252 for_each_online_cpu(cpu)
166 init_debug_store_on_cpu(cpu); 253 init_debug_store_on_cpu(cpu);
167 } 254 }
168 255
169 put_online_cpus(); 256 put_online_cpus();
170
171 return err;
172} 257}
173 258
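Note the changed contract: reserve_ds_buffers() no longer returns an error. BTS and PEBS now succeed or fail independently, and the bts_active/pebs_active flags become the only thing the fast paths test later (see the !x86_pmu.pebs_active checks further down). A condensed userspace model of that policy (illustrative names, per-facility rollback omitted):

/* Condensed model of the new reserve_ds_buffers() policy: each facility
 * succeeds or fails on its own, and only fully-allocated facilities are
 * marked active. Not kernel code. */
#include <stdio.h>

static int alloc_one(int cpu, int fail_on_cpu)
{
        return cpu == fail_on_cpu ? -1 : 0;     /* simulate one allocation failure */
}

static void reserve(int ncpus, int *bts_active, int *pebs_active)
{
        int bts_err = 0, pebs_err = 0, cpu;

        for (cpu = 0; cpu < ncpus; cpu++) {
                if (!bts_err && alloc_one(cpu, -1))     /* BTS never fails here      */
                        bts_err = 1;
                if (!pebs_err && alloc_one(cpu, 2))     /* pretend PEBS fails on cpu 2 */
                        pebs_err = 1;
                if (bts_err && pebs_err)
                        break;
        }
        /* a failed facility would release its buffers here; the other stays usable */
        *bts_active  = !bts_err;
        *pebs_active = !pebs_err;
}

int main(void)
{
        int bts, pebs;

        reserve(4, &bts, &pebs);
        printf("bts_active=%d pebs_active=%d\n", bts, pebs);    /* prints 1 0 */
        return 0;
}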
174/* 259/*
@@ -214,7 +299,7 @@ static void intel_pmu_disable_bts(void)
214 update_debugctlmsr(debugctlmsr); 299 update_debugctlmsr(debugctlmsr);
215} 300}
216 301
217static void intel_pmu_drain_bts_buffer(void) 302static int intel_pmu_drain_bts_buffer(void)
218{ 303{
219 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 304 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
220 struct debug_store *ds = cpuc->ds; 305 struct debug_store *ds = cpuc->ds;
@@ -231,16 +316,16 @@ static void intel_pmu_drain_bts_buffer(void)
231 struct pt_regs regs; 316 struct pt_regs regs;
232 317
233 if (!event) 318 if (!event)
234 return; 319 return 0;
235 320
236 if (!ds) 321 if (!x86_pmu.bts_active)
237 return; 322 return 0;
238 323
239 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base; 324 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
240 top = (struct bts_record *)(unsigned long)ds->bts_index; 325 top = (struct bts_record *)(unsigned long)ds->bts_index;
241 326
242 if (top <= at) 327 if (top <= at)
243 return; 328 return 0;
244 329
245 ds->bts_index = ds->bts_buffer_base; 330 ds->bts_index = ds->bts_buffer_base;
246 331
@@ -256,7 +341,7 @@ static void intel_pmu_drain_bts_buffer(void)
256 perf_prepare_sample(&header, &data, event, &regs); 341 perf_prepare_sample(&header, &data, event, &regs);
257 342
258 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1)) 343 if (perf_output_begin(&handle, event, header.size * (top - at), 1, 1))
259 return; 344 return 1;
260 345
261 for (; at < top; at++) { 346 for (; at < top; at++) {
262 data.ip = at->from; 347 data.ip = at->from;
@@ -270,35 +355,76 @@ static void intel_pmu_drain_bts_buffer(void)
270 /* There's new data available. */ 355 /* There's new data available. */
271 event->hw.interrupts++; 356 event->hw.interrupts++;
272 event->pending_kill = POLL_IN; 357 event->pending_kill = POLL_IN;
358 return 1;
273} 359}
274 360
275/* 361/*
276 * PEBS 362 * PEBS
277 */ 363 */
364static struct event_constraint intel_core2_pebs_event_constraints[] = {
365 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
366 INTEL_UEVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */
367 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */
368 INTEL_UEVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETIRED.ANY */
369 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
370 EVENT_CONSTRAINT_END
371};
372
373static struct event_constraint intel_atom_pebs_event_constraints[] = {
374 INTEL_UEVENT_CONSTRAINT(0x00c0, 0x1), /* INST_RETIRED.ANY */
375 INTEL_UEVENT_CONSTRAINT(0x00c5, 0x1), /* MISPREDICTED_BRANCH_RETIRED */
376 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED.* */
377 EVENT_CONSTRAINT_END
378};
379
380static struct event_constraint intel_nehalem_pebs_event_constraints[] = {
381 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
382 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
383 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
384 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INST_RETIRED.ANY */
385 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
386 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
387 INTEL_UEVENT_CONSTRAINT(0x02c5, 0xf), /* BR_MISP_RETIRED.NEAR_CALL */
388 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
389 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
390 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
391 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
392 EVENT_CONSTRAINT_END
393};
278 394
279static struct event_constraint intel_core_pebs_events[] = { 395static struct event_constraint intel_westmere_pebs_event_constraints[] = {
280 PEBS_EVENT_CONSTRAINT(0x00c0, 0x1), /* INSTR_RETIRED.ANY */ 396 INTEL_EVENT_CONSTRAINT(0x0b, 0xf), /* MEM_INST_RETIRED.* */
281 PEBS_EVENT_CONSTRAINT(0xfec1, 0x1), /* X87_OPS_RETIRED.ANY */ 397 INTEL_EVENT_CONSTRAINT(0x0f, 0xf), /* MEM_UNCORE_RETIRED.* */
282 PEBS_EVENT_CONSTRAINT(0x00c5, 0x1), /* BR_INST_RETIRED.MISPRED */ 398 INTEL_UEVENT_CONSTRAINT(0x010c, 0xf), /* MEM_STORE_RETIRED.DTLB_MISS */
283 PEBS_EVENT_CONSTRAINT(0x1fc7, 0x1), /* SIMD_INST_RETURED.ANY */ 399 INTEL_EVENT_CONSTRAINT(0xc0, 0xf), /* INSTR_RETIRED.* */
284 PEBS_EVENT_CONSTRAINT(0x01cb, 0x1), /* MEM_LOAD_RETIRED.L1D_MISS */ 400 INTEL_EVENT_CONSTRAINT(0xc2, 0xf), /* UOPS_RETIRED.* */
285 PEBS_EVENT_CONSTRAINT(0x02cb, 0x1), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 401 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
286 PEBS_EVENT_CONSTRAINT(0x04cb, 0x1), /* MEM_LOAD_RETIRED.L2_MISS */ 402 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
287 PEBS_EVENT_CONSTRAINT(0x08cb, 0x1), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 403 INTEL_EVENT_CONSTRAINT(0xc7, 0xf), /* SSEX_UOPS_RETIRED.* */
288 PEBS_EVENT_CONSTRAINT(0x10cb, 0x1), /* MEM_LOAD_RETIRED.DTLB_MISS */ 404 INTEL_UEVENT_CONSTRAINT(0x20c8, 0xf), /* ITLB_MISS_RETIRED */
405 INTEL_EVENT_CONSTRAINT(0xcb, 0xf), /* MEM_LOAD_RETIRED.* */
406 INTEL_EVENT_CONSTRAINT(0xf7, 0xf), /* FP_ASSIST.* */
289 EVENT_CONSTRAINT_END 407 EVENT_CONSTRAINT_END
290}; 408};
291 409
292static struct event_constraint intel_nehalem_pebs_events[] = { 410static struct event_constraint intel_snb_pebs_events[] = {
293 PEBS_EVENT_CONSTRAINT(0x00c0, 0xf), /* INSTR_RETIRED.ANY */ 411 INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PRECDIST */
294 PEBS_EVENT_CONSTRAINT(0xfec1, 0xf), /* X87_OPS_RETIRED.ANY */ 412 INTEL_UEVENT_CONSTRAINT(0x01c2, 0xf), /* UOPS_RETIRED.ALL */
295 PEBS_EVENT_CONSTRAINT(0x00c5, 0xf), /* BR_INST_RETIRED.MISPRED */ 413 INTEL_UEVENT_CONSTRAINT(0x02c2, 0xf), /* UOPS_RETIRED.RETIRE_SLOTS */
296 PEBS_EVENT_CONSTRAINT(0x1fc7, 0xf), /* SIMD_INST_RETURED.ANY */ 414 INTEL_EVENT_CONSTRAINT(0xc4, 0xf), /* BR_INST_RETIRED.* */
297 PEBS_EVENT_CONSTRAINT(0x01cb, 0xf), /* MEM_LOAD_RETIRED.L1D_MISS */ 415 INTEL_EVENT_CONSTRAINT(0xc5, 0xf), /* BR_MISP_RETIRED.* */
298 PEBS_EVENT_CONSTRAINT(0x02cb, 0xf), /* MEM_LOAD_RETIRED.L1D_LINE_MISS */ 416 INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* MEM_TRANS_RETIRED.* */
299 PEBS_EVENT_CONSTRAINT(0x04cb, 0xf), /* MEM_LOAD_RETIRED.L2_MISS */ 417 INTEL_UEVENT_CONSTRAINT(0x11d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_LOADS */
300 PEBS_EVENT_CONSTRAINT(0x08cb, 0xf), /* MEM_LOAD_RETIRED.L2_LINE_MISS */ 418 INTEL_UEVENT_CONSTRAINT(0x12d0, 0xf), /* MEM_UOP_RETIRED.STLB_MISS_STORES */
301 PEBS_EVENT_CONSTRAINT(0x10cb, 0xf), /* MEM_LOAD_RETIRED.DTLB_MISS */ 419 INTEL_UEVENT_CONSTRAINT(0x21d0, 0xf), /* MEM_UOP_RETIRED.LOCK_LOADS */
420 INTEL_UEVENT_CONSTRAINT(0x22d0, 0xf), /* MEM_UOP_RETIRED.LOCK_STORES */
421 INTEL_UEVENT_CONSTRAINT(0x41d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_LOADS */
422 INTEL_UEVENT_CONSTRAINT(0x42d0, 0xf), /* MEM_UOP_RETIRED.SPLIT_STORES */
423 INTEL_UEVENT_CONSTRAINT(0x81d0, 0xf), /* MEM_UOP_RETIRED.ANY_LOADS */
424 INTEL_UEVENT_CONSTRAINT(0x82d0, 0xf), /* MEM_UOP_RETIRED.ANY_STORES */
425 INTEL_EVENT_CONSTRAINT(0xd1, 0xf), /* MEM_LOAD_UOPS_RETIRED.* */
426 INTEL_EVENT_CONSTRAINT(0xd2, 0xf), /* MEM_LOAD_UOPS_LLC_HIT_RETIRED.* */
427 INTEL_UEVENT_CONSTRAINT(0x02d4, 0xf), /* MEM_LOAD_UOPS_MISC_RETIRED.LLC_MISS */
302 EVENT_CONSTRAINT_END 428 EVENT_CONSTRAINT_END
303}; 429};
304 430
@@ -491,7 +617,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event,
491 regs.flags &= ~PERF_EFLAGS_EXACT; 617 regs.flags &= ~PERF_EFLAGS_EXACT;
492 618
493 if (perf_event_overflow(event, 1, &data, &regs)) 619 if (perf_event_overflow(event, 1, &data, &regs))
494 x86_pmu_stop(event); 620 x86_pmu_stop(event, 0);
495} 621}
496 622
497static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) 623static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
@@ -502,7 +628,7 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs)
502 struct pebs_record_core *at, *top; 628 struct pebs_record_core *at, *top;
503 int n; 629 int n;
504 630
505 if (!ds || !x86_pmu.pebs) 631 if (!x86_pmu.pebs_active)
506 return; 632 return;
507 633
508 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base; 634 at = (struct pebs_record_core *)(unsigned long)ds->pebs_buffer_base;
@@ -544,7 +670,7 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs)
544 u64 status = 0; 670 u64 status = 0;
545 int bit, n; 671 int bit, n;
546 672
547 if (!ds || !x86_pmu.pebs) 673 if (!x86_pmu.pebs_active)
548 return; 674 return;
549 675
550 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; 676 at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base;
@@ -609,29 +735,25 @@ static void intel_ds_init(void)
609 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type); 735 printk(KERN_CONT "PEBS fmt0%c, ", pebs_type);
610 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core); 736 x86_pmu.pebs_record_size = sizeof(struct pebs_record_core);
611 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core; 737 x86_pmu.drain_pebs = intel_pmu_drain_pebs_core;
612 x86_pmu.pebs_constraints = intel_core_pebs_events;
613 break; 738 break;
614 739
615 case 1: 740 case 1:
616 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type); 741 printk(KERN_CONT "PEBS fmt1%c, ", pebs_type);
617 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm); 742 x86_pmu.pebs_record_size = sizeof(struct pebs_record_nhm);
618 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm; 743 x86_pmu.drain_pebs = intel_pmu_drain_pebs_nhm;
619 x86_pmu.pebs_constraints = intel_nehalem_pebs_events;
620 break; 744 break;
621 745
622 default: 746 default:
623 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type); 747 printk(KERN_CONT "no PEBS fmt%d%c, ", format, pebs_type);
624 x86_pmu.pebs = 0; 748 x86_pmu.pebs = 0;
625 break;
626 } 749 }
627 } 750 }
628} 751}
629 752
630#else /* CONFIG_CPU_SUP_INTEL */ 753#else /* CONFIG_CPU_SUP_INTEL */
631 754
632static int reserve_ds_buffers(void) 755static void reserve_ds_buffers(void)
633{ 756{
634 return 0;
635} 757}
636 758
637static void release_ds_buffers(void) 759static void release_ds_buffers(void)
diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c
index 249015173992..ead584fb6a7d 100644
--- a/arch/x86/kernel/cpu/perf_event_p4.c
+++ b/arch/x86/kernel/cpu/perf_event_p4.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Netburst Perfomance Events (P4, old Xeon) 2 * Netburst Performance Events (P4, old Xeon)
3 * 3 *
4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org> 4 * Copyright (C) 2010 Parallels, Inc., Cyrill Gorcunov <gorcunov@openvz.org>
5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com> 5 * Copyright (C) 2010 Intel Corporation, Lin Ming <ming.m.lin@intel.com>
@@ -18,6 +18,8 @@
18struct p4_event_bind { 18struct p4_event_bind {
19 unsigned int opcode; /* Event code and ESCR selector */ 19 unsigned int opcode; /* Event code and ESCR selector */
20 unsigned int escr_msr[2]; /* ESCR MSR for this event */ 20 unsigned int escr_msr[2]; /* ESCR MSR for this event */
21 unsigned int escr_emask; /* valid ESCR EventMask bits */
22 unsigned int shared; /* event is shared across threads */
21 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */ 23 char cntr[2][P4_CNTR_LIMIT]; /* counter index (offset), -1 on absence */
22}; 24};
23 25
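The new escr_emask field records which ESCR event-mask bits are architecturally valid for each event, and 'shared' marks events whose ESCR counts both hardware threads at once. The validation code that consumes escr_emask is not part of this hunk, so the sketch below is only a guess at its shape: a reserved-bit check built on a stand-in macro rather than the kernel's P4_ESCR_EMASK_BIT().

/* Illustrative only: reject a requested event mask that sets bits
 * outside the per-event valid set. The macro and struct are stand-ins,
 * not the kernel's definitions. */
#include <stdio.h>

#define EMASK_BIT(n)    (1u << (n))

struct bind { unsigned int escr_emask; };

static int emask_valid(const struct bind *b, unsigned int requested_emask)
{
        /* any bit outside the per-event valid set is rejected */
        return (requested_emask & ~b->escr_emask) == 0;
}

int main(void)
{
        struct bind tc_deliver = {
                .escr_emask = EMASK_BIT(0) | EMASK_BIT(1) | EMASK_BIT(2),
        };

        printf("%d\n", emask_valid(&tc_deliver, EMASK_BIT(1)));  /* 1: ok           */
        printf("%d\n", emask_valid(&tc_deliver, EMASK_BIT(5)));  /* 0: reserved bit */
        return 0;
}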
@@ -66,231 +68,435 @@ static struct p4_event_bind p4_event_bind_map[] = {
66 [P4_EVENT_TC_DELIVER_MODE] = { 68 [P4_EVENT_TC_DELIVER_MODE] = {
67 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE), 69 .opcode = P4_OPCODE(P4_EVENT_TC_DELIVER_MODE),
68 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 70 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
71 .escr_emask =
72 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DD) |
73 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DB) |
74 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, DI) |
75 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BD) |
76 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BB) |
77 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, BI) |
78 P4_ESCR_EMASK_BIT(P4_EVENT_TC_DELIVER_MODE, ID),
79 .shared = 1,
69 .cntr = { {4, 5, -1}, {6, 7, -1} }, 80 .cntr = { {4, 5, -1}, {6, 7, -1} },
70 }, 81 },
71 [P4_EVENT_BPU_FETCH_REQUEST] = { 82 [P4_EVENT_BPU_FETCH_REQUEST] = {
72 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST), 83 .opcode = P4_OPCODE(P4_EVENT_BPU_FETCH_REQUEST),
73 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 }, 84 .escr_msr = { MSR_P4_BPU_ESCR0, MSR_P4_BPU_ESCR1 },
85 .escr_emask =
86 P4_ESCR_EMASK_BIT(P4_EVENT_BPU_FETCH_REQUEST, TCMISS),
74 .cntr = { {0, -1, -1}, {2, -1, -1} }, 87 .cntr = { {0, -1, -1}, {2, -1, -1} },
75 }, 88 },
76 [P4_EVENT_ITLB_REFERENCE] = { 89 [P4_EVENT_ITLB_REFERENCE] = {
77 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE), 90 .opcode = P4_OPCODE(P4_EVENT_ITLB_REFERENCE),
78 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 }, 91 .escr_msr = { MSR_P4_ITLB_ESCR0, MSR_P4_ITLB_ESCR1 },
92 .escr_emask =
93 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT) |
94 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, MISS) |
95 P4_ESCR_EMASK_BIT(P4_EVENT_ITLB_REFERENCE, HIT_UK),
79 .cntr = { {0, -1, -1}, {2, -1, -1} }, 96 .cntr = { {0, -1, -1}, {2, -1, -1} },
80 }, 97 },
81 [P4_EVENT_MEMORY_CANCEL] = { 98 [P4_EVENT_MEMORY_CANCEL] = {
82 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL), 99 .opcode = P4_OPCODE(P4_EVENT_MEMORY_CANCEL),
83 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 100 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
101 .escr_emask =
102 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, ST_RB_FULL) |
103 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_CANCEL, 64K_CONF),
84 .cntr = { {8, 9, -1}, {10, 11, -1} }, 104 .cntr = { {8, 9, -1}, {10, 11, -1} },
85 }, 105 },
86 [P4_EVENT_MEMORY_COMPLETE] = { 106 [P4_EVENT_MEMORY_COMPLETE] = {
87 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE), 107 .opcode = P4_OPCODE(P4_EVENT_MEMORY_COMPLETE),
88 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 108 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
109 .escr_emask =
110 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, LSC) |
111 P4_ESCR_EMASK_BIT(P4_EVENT_MEMORY_COMPLETE, SSC),
89 .cntr = { {8, 9, -1}, {10, 11, -1} }, 112 .cntr = { {8, 9, -1}, {10, 11, -1} },
90 }, 113 },
91 [P4_EVENT_LOAD_PORT_REPLAY] = { 114 [P4_EVENT_LOAD_PORT_REPLAY] = {
92 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY), 115 .opcode = P4_OPCODE(P4_EVENT_LOAD_PORT_REPLAY),
93 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 }, 116 .escr_msr = { MSR_P4_SAAT_ESCR0, MSR_P4_SAAT_ESCR1 },
117 .escr_emask =
118 P4_ESCR_EMASK_BIT(P4_EVENT_LOAD_PORT_REPLAY, SPLIT_LD),
94 .cntr = { {8, 9, -1}, {10, 11, -1} }, 119 .cntr = { {8, 9, -1}, {10, 11, -1} },
95 }, 120 },
96 [P4_EVENT_STORE_PORT_REPLAY] = { 121 [P4_EVENT_STORE_PORT_REPLAY] = {
97 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY), 122 .opcode = P4_OPCODE(P4_EVENT_STORE_PORT_REPLAY),
98 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 }, 123 .escr_msr = { MSR_P4_SAAT_ESCR0 , MSR_P4_SAAT_ESCR1 },
124 .escr_emask =
125 P4_ESCR_EMASK_BIT(P4_EVENT_STORE_PORT_REPLAY, SPLIT_ST),
99 .cntr = { {8, 9, -1}, {10, 11, -1} }, 126 .cntr = { {8, 9, -1}, {10, 11, -1} },
100 }, 127 },
101 [P4_EVENT_MOB_LOAD_REPLAY] = { 128 [P4_EVENT_MOB_LOAD_REPLAY] = {
102 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY), 129 .opcode = P4_OPCODE(P4_EVENT_MOB_LOAD_REPLAY),
103 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 }, 130 .escr_msr = { MSR_P4_MOB_ESCR0, MSR_P4_MOB_ESCR1 },
131 .escr_emask =
132 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STA) |
133 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, NO_STD) |
134 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, PARTIAL_DATA) |
135 P4_ESCR_EMASK_BIT(P4_EVENT_MOB_LOAD_REPLAY, UNALGN_ADDR),
104 .cntr = { {0, -1, -1}, {2, -1, -1} }, 136 .cntr = { {0, -1, -1}, {2, -1, -1} },
105 }, 137 },
106 [P4_EVENT_PAGE_WALK_TYPE] = { 138 [P4_EVENT_PAGE_WALK_TYPE] = {
107 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE), 139 .opcode = P4_OPCODE(P4_EVENT_PAGE_WALK_TYPE),
108 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 }, 140 .escr_msr = { MSR_P4_PMH_ESCR0, MSR_P4_PMH_ESCR1 },
141 .escr_emask =
142 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, DTMISS) |
143 P4_ESCR_EMASK_BIT(P4_EVENT_PAGE_WALK_TYPE, ITMISS),
144 .shared = 1,
109 .cntr = { {0, -1, -1}, {2, -1, -1} }, 145 .cntr = { {0, -1, -1}, {2, -1, -1} },
110 }, 146 },
111 [P4_EVENT_BSQ_CACHE_REFERENCE] = { 147 [P4_EVENT_BSQ_CACHE_REFERENCE] = {
112 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE), 148 .opcode = P4_OPCODE(P4_EVENT_BSQ_CACHE_REFERENCE),
113 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 }, 149 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR1 },
150 .escr_emask =
151 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITS) |
152 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITE) |
153 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_HITM) |
154 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITS) |
155 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITE) |
156 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_HITM) |
157 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_2ndL_MISS) |
158 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, RD_3rdL_MISS) |
159 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_CACHE_REFERENCE, WR_2ndL_MISS),
114 .cntr = { {0, -1, -1}, {2, -1, -1} }, 160 .cntr = { {0, -1, -1}, {2, -1, -1} },
115 }, 161 },
116 [P4_EVENT_IOQ_ALLOCATION] = { 162 [P4_EVENT_IOQ_ALLOCATION] = {
117 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION), 163 .opcode = P4_OPCODE(P4_EVENT_IOQ_ALLOCATION),
118 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 164 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
165 .escr_emask =
166 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, DEFAULT) |
167 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_READ) |
168 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, ALL_WRITE) |
169 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_UC) |
170 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WC) |
171 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WT) |
172 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WP) |
173 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, MEM_WB) |
174 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OWN) |
175 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, OTHER) |
176 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ALLOCATION, PREFETCH),
119 .cntr = { {0, -1, -1}, {2, -1, -1} }, 177 .cntr = { {0, -1, -1}, {2, -1, -1} },
120 }, 178 },
121 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 179 [P4_EVENT_IOQ_ACTIVE_ENTRIES] = { /* shared ESCR */
122 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES), 180 .opcode = P4_OPCODE(P4_EVENT_IOQ_ACTIVE_ENTRIES),
123 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 }, 181 .escr_msr = { MSR_P4_FSB_ESCR1, MSR_P4_FSB_ESCR1 },
182 .escr_emask =
183 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, DEFAULT) |
184 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_READ) |
185 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, ALL_WRITE) |
186 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_UC) |
187 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WC) |
188 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WT) |
189 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WP) |
190 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, MEM_WB) |
191 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OWN) |
192 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, OTHER) |
193 P4_ESCR_EMASK_BIT(P4_EVENT_IOQ_ACTIVE_ENTRIES, PREFETCH),
124 .cntr = { {2, -1, -1}, {3, -1, -1} }, 194 .cntr = { {2, -1, -1}, {3, -1, -1} },
125 }, 195 },
126 [P4_EVENT_FSB_DATA_ACTIVITY] = { 196 [P4_EVENT_FSB_DATA_ACTIVITY] = {
127 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY), 197 .opcode = P4_OPCODE(P4_EVENT_FSB_DATA_ACTIVITY),
128 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 198 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
199 .escr_emask =
200 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_DRV) |
201 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OWN) |
202 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DRDY_OTHER) |
203 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_DRV) |
204 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OWN) |
205 P4_ESCR_EMASK_BIT(P4_EVENT_FSB_DATA_ACTIVITY, DBSY_OTHER),
206 .shared = 1,
129 .cntr = { {0, -1, -1}, {2, -1, -1} }, 207 .cntr = { {0, -1, -1}, {2, -1, -1} },
130 }, 208 },
131 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */ 209 [P4_EVENT_BSQ_ALLOCATION] = { /* shared ESCR, broken CCCR1 */
132 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION), 210 .opcode = P4_OPCODE(P4_EVENT_BSQ_ALLOCATION),
133 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 }, 211 .escr_msr = { MSR_P4_BSU_ESCR0, MSR_P4_BSU_ESCR0 },
212 .escr_emask =
213 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE0) |
214 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_TYPE1) |
215 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN0) |
216 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LEN1) |
217 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_IO_TYPE) |
218 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_LOCK_TYPE) |
219 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_CACHE_TYPE) |
220 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_SPLIT_TYPE) |
221 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_DEM_TYPE) |
222 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, REQ_ORD_TYPE) |
223 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE0) |
224 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE1) |
225 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ALLOCATION, MEM_TYPE2),
134 .cntr = { {0, -1, -1}, {1, -1, -1} }, 226 .cntr = { {0, -1, -1}, {1, -1, -1} },
135 }, 227 },
136 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */ 228 [P4_EVENT_BSQ_ACTIVE_ENTRIES] = { /* shared ESCR */
137 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES), 229 .opcode = P4_OPCODE(P4_EVENT_BSQ_ACTIVE_ENTRIES),
138 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 }, 230 .escr_msr = { MSR_P4_BSU_ESCR1 , MSR_P4_BSU_ESCR1 },
231 .escr_emask =
232 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE0) |
233 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_TYPE1) |
234 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN0) |
235 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LEN1) |
236 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_IO_TYPE) |
237 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_LOCK_TYPE) |
238 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_CACHE_TYPE) |
239 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_SPLIT_TYPE) |
240 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_DEM_TYPE) |
241 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, REQ_ORD_TYPE) |
242 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE0) |
243 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE1) |
244 P4_ESCR_EMASK_BIT(P4_EVENT_BSQ_ACTIVE_ENTRIES, MEM_TYPE2),
139 .cntr = { {2, -1, -1}, {3, -1, -1} }, 245 .cntr = { {2, -1, -1}, {3, -1, -1} },
140 }, 246 },
141 [P4_EVENT_SSE_INPUT_ASSIST] = { 247 [P4_EVENT_SSE_INPUT_ASSIST] = {
142 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST), 248 .opcode = P4_OPCODE(P4_EVENT_SSE_INPUT_ASSIST),
143 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 249 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
250 .escr_emask =
251 P4_ESCR_EMASK_BIT(P4_EVENT_SSE_INPUT_ASSIST, ALL),
252 .shared = 1,
144 .cntr = { {8, 9, -1}, {10, 11, -1} }, 253 .cntr = { {8, 9, -1}, {10, 11, -1} },
145 }, 254 },
146 [P4_EVENT_PACKED_SP_UOP] = { 255 [P4_EVENT_PACKED_SP_UOP] = {
147 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP), 256 .opcode = P4_OPCODE(P4_EVENT_PACKED_SP_UOP),
148 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 257 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
258 .escr_emask =
259 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_SP_UOP, ALL),
260 .shared = 1,
149 .cntr = { {8, 9, -1}, {10, 11, -1} }, 261 .cntr = { {8, 9, -1}, {10, 11, -1} },
150 }, 262 },
151 [P4_EVENT_PACKED_DP_UOP] = { 263 [P4_EVENT_PACKED_DP_UOP] = {
152 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP), 264 .opcode = P4_OPCODE(P4_EVENT_PACKED_DP_UOP),
153 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 265 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
266 .escr_emask =
267 P4_ESCR_EMASK_BIT(P4_EVENT_PACKED_DP_UOP, ALL),
268 .shared = 1,
154 .cntr = { {8, 9, -1}, {10, 11, -1} }, 269 .cntr = { {8, 9, -1}, {10, 11, -1} },
155 }, 270 },
156 [P4_EVENT_SCALAR_SP_UOP] = { 271 [P4_EVENT_SCALAR_SP_UOP] = {
157 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP), 272 .opcode = P4_OPCODE(P4_EVENT_SCALAR_SP_UOP),
158 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 273 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
274 .escr_emask =
275 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_SP_UOP, ALL),
276 .shared = 1,
159 .cntr = { {8, 9, -1}, {10, 11, -1} }, 277 .cntr = { {8, 9, -1}, {10, 11, -1} },
160 }, 278 },
161 [P4_EVENT_SCALAR_DP_UOP] = { 279 [P4_EVENT_SCALAR_DP_UOP] = {
162 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP), 280 .opcode = P4_OPCODE(P4_EVENT_SCALAR_DP_UOP),
163 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 281 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
282 .escr_emask =
283 P4_ESCR_EMASK_BIT(P4_EVENT_SCALAR_DP_UOP, ALL),
284 .shared = 1,
164 .cntr = { {8, 9, -1}, {10, 11, -1} }, 285 .cntr = { {8, 9, -1}, {10, 11, -1} },
165 }, 286 },
166 [P4_EVENT_64BIT_MMX_UOP] = { 287 [P4_EVENT_64BIT_MMX_UOP] = {
167 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP), 288 .opcode = P4_OPCODE(P4_EVENT_64BIT_MMX_UOP),
168 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 289 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
290 .escr_emask =
291 P4_ESCR_EMASK_BIT(P4_EVENT_64BIT_MMX_UOP, ALL),
292 .shared = 1,
169 .cntr = { {8, 9, -1}, {10, 11, -1} }, 293 .cntr = { {8, 9, -1}, {10, 11, -1} },
170 }, 294 },
171 [P4_EVENT_128BIT_MMX_UOP] = { 295 [P4_EVENT_128BIT_MMX_UOP] = {
172 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP), 296 .opcode = P4_OPCODE(P4_EVENT_128BIT_MMX_UOP),
173 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 297 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
298 .escr_emask =
299 P4_ESCR_EMASK_BIT(P4_EVENT_128BIT_MMX_UOP, ALL),
300 .shared = 1,
174 .cntr = { {8, 9, -1}, {10, 11, -1} }, 301 .cntr = { {8, 9, -1}, {10, 11, -1} },
175 }, 302 },
176 [P4_EVENT_X87_FP_UOP] = { 303 [P4_EVENT_X87_FP_UOP] = {
177 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP), 304 .opcode = P4_OPCODE(P4_EVENT_X87_FP_UOP),
178 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 }, 305 .escr_msr = { MSR_P4_FIRM_ESCR0, MSR_P4_FIRM_ESCR1 },
306 .escr_emask =
307 P4_ESCR_EMASK_BIT(P4_EVENT_X87_FP_UOP, ALL),
308 .shared = 1,
179 .cntr = { {8, 9, -1}, {10, 11, -1} }, 309 .cntr = { {8, 9, -1}, {10, 11, -1} },
180 }, 310 },
181 [P4_EVENT_TC_MISC] = { 311 [P4_EVENT_TC_MISC] = {
182 .opcode = P4_OPCODE(P4_EVENT_TC_MISC), 312 .opcode = P4_OPCODE(P4_EVENT_TC_MISC),
183 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 }, 313 .escr_msr = { MSR_P4_TC_ESCR0, MSR_P4_TC_ESCR1 },
314 .escr_emask =
315 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MISC, FLUSH),
184 .cntr = { {4, 5, -1}, {6, 7, -1} }, 316 .cntr = { {4, 5, -1}, {6, 7, -1} },
185 }, 317 },
186 [P4_EVENT_GLOBAL_POWER_EVENTS] = { 318 [P4_EVENT_GLOBAL_POWER_EVENTS] = {
187 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS), 319 .opcode = P4_OPCODE(P4_EVENT_GLOBAL_POWER_EVENTS),
188 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 320 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
321 .escr_emask =
322 P4_ESCR_EMASK_BIT(P4_EVENT_GLOBAL_POWER_EVENTS, RUNNING),
189 .cntr = { {0, -1, -1}, {2, -1, -1} }, 323 .cntr = { {0, -1, -1}, {2, -1, -1} },
190 }, 324 },
191 [P4_EVENT_TC_MS_XFER] = { 325 [P4_EVENT_TC_MS_XFER] = {
192 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER), 326 .opcode = P4_OPCODE(P4_EVENT_TC_MS_XFER),
193 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 327 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
328 .escr_emask =
329 P4_ESCR_EMASK_BIT(P4_EVENT_TC_MS_XFER, CISC),
194 .cntr = { {4, 5, -1}, {6, 7, -1} }, 330 .cntr = { {4, 5, -1}, {6, 7, -1} },
195 }, 331 },
196 [P4_EVENT_UOP_QUEUE_WRITES] = { 332 [P4_EVENT_UOP_QUEUE_WRITES] = {
197 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES), 333 .opcode = P4_OPCODE(P4_EVENT_UOP_QUEUE_WRITES),
198 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 }, 334 .escr_msr = { MSR_P4_MS_ESCR0, MSR_P4_MS_ESCR1 },
335 .escr_emask =
336 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_BUILD) |
337 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_TC_DELIVER) |
338 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_QUEUE_WRITES, FROM_ROM),
199 .cntr = { {4, 5, -1}, {6, 7, -1} }, 339 .cntr = { {4, 5, -1}, {6, 7, -1} },
200 }, 340 },
201 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = { 341 [P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE] = {
202 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE), 342 .opcode = P4_OPCODE(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE),
203 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 }, 343 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR0 },
344 .escr_emask =
345 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CONDITIONAL) |
346 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, CALL) |
347 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, RETURN) |
348 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_MISPRED_BRANCH_TYPE, INDIRECT),
204 .cntr = { {4, 5, -1}, {6, 7, -1} }, 349 .cntr = { {4, 5, -1}, {6, 7, -1} },
205 }, 350 },
206 [P4_EVENT_RETIRED_BRANCH_TYPE] = { 351 [P4_EVENT_RETIRED_BRANCH_TYPE] = {
207 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE), 352 .opcode = P4_OPCODE(P4_EVENT_RETIRED_BRANCH_TYPE),
208 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 }, 353 .escr_msr = { MSR_P4_TBPU_ESCR0 , MSR_P4_TBPU_ESCR1 },
354 .escr_emask =
355 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CONDITIONAL) |
356 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, CALL) |
357 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, RETURN) |
358 P4_ESCR_EMASK_BIT(P4_EVENT_RETIRED_BRANCH_TYPE, INDIRECT),
209 .cntr = { {4, 5, -1}, {6, 7, -1} }, 359 .cntr = { {4, 5, -1}, {6, 7, -1} },
210 }, 360 },
211 [P4_EVENT_RESOURCE_STALL] = { 361 [P4_EVENT_RESOURCE_STALL] = {
212 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL), 362 .opcode = P4_OPCODE(P4_EVENT_RESOURCE_STALL),
213 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 }, 363 .escr_msr = { MSR_P4_ALF_ESCR0, MSR_P4_ALF_ESCR1 },
364 .escr_emask =
365 P4_ESCR_EMASK_BIT(P4_EVENT_RESOURCE_STALL, SBFULL),
214 .cntr = { {12, 13, 16}, {14, 15, 17} }, 366 .cntr = { {12, 13, 16}, {14, 15, 17} },
215 }, 367 },
216 [P4_EVENT_WC_BUFFER] = { 368 [P4_EVENT_WC_BUFFER] = {
217 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER), 369 .opcode = P4_OPCODE(P4_EVENT_WC_BUFFER),
218 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 }, 370 .escr_msr = { MSR_P4_DAC_ESCR0, MSR_P4_DAC_ESCR1 },
371 .escr_emask =
372 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_EVICTS) |
373 P4_ESCR_EMASK_BIT(P4_EVENT_WC_BUFFER, WCB_FULL_EVICTS),
374 .shared = 1,
219 .cntr = { {8, 9, -1}, {10, 11, -1} }, 375 .cntr = { {8, 9, -1}, {10, 11, -1} },
220 }, 376 },
221 [P4_EVENT_B2B_CYCLES] = { 377 [P4_EVENT_B2B_CYCLES] = {
222 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES), 378 .opcode = P4_OPCODE(P4_EVENT_B2B_CYCLES),
223 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 379 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
380 .escr_emask = 0,
224 .cntr = { {0, -1, -1}, {2, -1, -1} }, 381 .cntr = { {0, -1, -1}, {2, -1, -1} },
225 }, 382 },
226 [P4_EVENT_BNR] = { 383 [P4_EVENT_BNR] = {
227 .opcode = P4_OPCODE(P4_EVENT_BNR), 384 .opcode = P4_OPCODE(P4_EVENT_BNR),
228 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 385 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
386 .escr_emask = 0,
229 .cntr = { {0, -1, -1}, {2, -1, -1} }, 387 .cntr = { {0, -1, -1}, {2, -1, -1} },
230 }, 388 },
231 [P4_EVENT_SNOOP] = { 389 [P4_EVENT_SNOOP] = {
232 .opcode = P4_OPCODE(P4_EVENT_SNOOP), 390 .opcode = P4_OPCODE(P4_EVENT_SNOOP),
233 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 391 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
392 .escr_emask = 0,
234 .cntr = { {0, -1, -1}, {2, -1, -1} }, 393 .cntr = { {0, -1, -1}, {2, -1, -1} },
235 }, 394 },
236 [P4_EVENT_RESPONSE] = { 395 [P4_EVENT_RESPONSE] = {
237 .opcode = P4_OPCODE(P4_EVENT_RESPONSE), 396 .opcode = P4_OPCODE(P4_EVENT_RESPONSE),
238 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 }, 397 .escr_msr = { MSR_P4_FSB_ESCR0, MSR_P4_FSB_ESCR1 },
398 .escr_emask = 0,
239 .cntr = { {0, -1, -1}, {2, -1, -1} }, 399 .cntr = { {0, -1, -1}, {2, -1, -1} },
240 }, 400 },
241 [P4_EVENT_FRONT_END_EVENT] = { 401 [P4_EVENT_FRONT_END_EVENT] = {
242 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT), 402 .opcode = P4_OPCODE(P4_EVENT_FRONT_END_EVENT),
243 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 403 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
404 .escr_emask =
405 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, NBOGUS) |
406 P4_ESCR_EMASK_BIT(P4_EVENT_FRONT_END_EVENT, BOGUS),
244 .cntr = { {12, 13, 16}, {14, 15, 17} }, 407 .cntr = { {12, 13, 16}, {14, 15, 17} },
245 }, 408 },
246 [P4_EVENT_EXECUTION_EVENT] = { 409 [P4_EVENT_EXECUTION_EVENT] = {
247 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT), 410 .opcode = P4_OPCODE(P4_EVENT_EXECUTION_EVENT),
248 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 411 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
412 .escr_emask =
413 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS0) |
414 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS1) |
415 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS2) |
416 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, NBOGUS3) |
417 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS0) |
418 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS1) |
419 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS2) |
420 P4_ESCR_EMASK_BIT(P4_EVENT_EXECUTION_EVENT, BOGUS3),
249 .cntr = { {12, 13, 16}, {14, 15, 17} }, 421 .cntr = { {12, 13, 16}, {14, 15, 17} },
250 }, 422 },
251 [P4_EVENT_REPLAY_EVENT] = { 423 [P4_EVENT_REPLAY_EVENT] = {
252 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT), 424 .opcode = P4_OPCODE(P4_EVENT_REPLAY_EVENT),
253 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 425 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
426 .escr_emask =
427 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, NBOGUS) |
428 P4_ESCR_EMASK_BIT(P4_EVENT_REPLAY_EVENT, BOGUS),
254 .cntr = { {12, 13, 16}, {14, 15, 17} }, 429 .cntr = { {12, 13, 16}, {14, 15, 17} },
255 }, 430 },
256 [P4_EVENT_INSTR_RETIRED] = { 431 [P4_EVENT_INSTR_RETIRED] = {
257 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED), 432 .opcode = P4_OPCODE(P4_EVENT_INSTR_RETIRED),
258 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 433 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
434 .escr_emask =
435 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSNTAG) |
436 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, NBOGUSTAG) |
437 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSNTAG) |
438 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_RETIRED, BOGUSTAG),
259 .cntr = { {12, 13, 16}, {14, 15, 17} }, 439 .cntr = { {12, 13, 16}, {14, 15, 17} },
260 }, 440 },
261 [P4_EVENT_UOPS_RETIRED] = { 441 [P4_EVENT_UOPS_RETIRED] = {
262 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED), 442 .opcode = P4_OPCODE(P4_EVENT_UOPS_RETIRED),
263 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 443 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
444 .escr_emask =
445 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, NBOGUS) |
446 P4_ESCR_EMASK_BIT(P4_EVENT_UOPS_RETIRED, BOGUS),
264 .cntr = { {12, 13, 16}, {14, 15, 17} }, 447 .cntr = { {12, 13, 16}, {14, 15, 17} },
265 }, 448 },
266 [P4_EVENT_UOP_TYPE] = { 449 [P4_EVENT_UOP_TYPE] = {
267 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE), 450 .opcode = P4_OPCODE(P4_EVENT_UOP_TYPE),
268 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 }, 451 .escr_msr = { MSR_P4_RAT_ESCR0, MSR_P4_RAT_ESCR1 },
452 .escr_emask =
453 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGLOADS) |
454 P4_ESCR_EMASK_BIT(P4_EVENT_UOP_TYPE, TAGSTORES),
269 .cntr = { {12, 13, 16}, {14, 15, 17} }, 455 .cntr = { {12, 13, 16}, {14, 15, 17} },
270 }, 456 },
271 [P4_EVENT_BRANCH_RETIRED] = { 457 [P4_EVENT_BRANCH_RETIRED] = {
272 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED), 458 .opcode = P4_OPCODE(P4_EVENT_BRANCH_RETIRED),
273 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 459 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
460 .escr_emask =
461 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNP) |
462 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMNM) |
463 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTP) |
464 P4_ESCR_EMASK_BIT(P4_EVENT_BRANCH_RETIRED, MMTM),
274 .cntr = { {12, 13, 16}, {14, 15, 17} }, 465 .cntr = { {12, 13, 16}, {14, 15, 17} },
275 }, 466 },
276 [P4_EVENT_MISPRED_BRANCH_RETIRED] = { 467 [P4_EVENT_MISPRED_BRANCH_RETIRED] = {
277 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED), 468 .opcode = P4_OPCODE(P4_EVENT_MISPRED_BRANCH_RETIRED),
278 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 469 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
470 .escr_emask =
471 P4_ESCR_EMASK_BIT(P4_EVENT_MISPRED_BRANCH_RETIRED, NBOGUS),
279 .cntr = { {12, 13, 16}, {14, 15, 17} }, 472 .cntr = { {12, 13, 16}, {14, 15, 17} },
280 }, 473 },
281 [P4_EVENT_X87_ASSIST] = { 474 [P4_EVENT_X87_ASSIST] = {
282 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST), 475 .opcode = P4_OPCODE(P4_EVENT_X87_ASSIST),
283 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 476 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
477 .escr_emask =
478 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSU) |
479 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, FPSO) |
480 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAO) |
481 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, POAU) |
482 P4_ESCR_EMASK_BIT(P4_EVENT_X87_ASSIST, PREA),
284 .cntr = { {12, 13, 16}, {14, 15, 17} }, 483 .cntr = { {12, 13, 16}, {14, 15, 17} },
285 }, 484 },
286 [P4_EVENT_MACHINE_CLEAR] = { 485 [P4_EVENT_MACHINE_CLEAR] = {
287 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR), 486 .opcode = P4_OPCODE(P4_EVENT_MACHINE_CLEAR),
288 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 }, 487 .escr_msr = { MSR_P4_CRU_ESCR2, MSR_P4_CRU_ESCR3 },
488 .escr_emask =
489 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, CLEAR) |
490 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, MOCLEAR) |
491 P4_ESCR_EMASK_BIT(P4_EVENT_MACHINE_CLEAR, SMCLEAR),
289 .cntr = { {12, 13, 16}, {14, 15, 17} }, 492 .cntr = { {12, 13, 16}, {14, 15, 17} },
290 }, 493 },
291 [P4_EVENT_INSTR_COMPLETED] = { 494 [P4_EVENT_INSTR_COMPLETED] = {
292 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED), 495 .opcode = P4_OPCODE(P4_EVENT_INSTR_COMPLETED),
293 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 }, 496 .escr_msr = { MSR_P4_CRU_ESCR0, MSR_P4_CRU_ESCR1 },
497 .escr_emask =
498 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, NBOGUS) |
499 P4_ESCR_EMASK_BIT(P4_EVENT_INSTR_COMPLETED, BOGUS),
294 .cntr = { {12, 13, 16}, {14, 15, 17} }, 500 .cntr = { {12, 13, 16}, {14, 15, 17} },
295 }, 501 },
296}; 502};
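Two fields are new in every entry above: escr_emask, which records the full set of ESCR event-mask bits that are architecturally defined for that event (built by OR-ing one single-bit macro per mask name), and shared, which flags events counted by a resource common to both hyperthreads. A rough stand-alone sketch of that layout, with made-up names and values rather than the kernel's:

#include <stdint.h>

#define EMASK_BIT(n)	(1u << (n))	/* one bit per named event mask */

struct event_bind {
	unsigned int opcode;		/* packed event/ESCR select         */
	uint32_t     escr_msr[2];	/* one ESCR MSR per hyperthread     */
	uint32_t     escr_emask;	/* OR of all valid event-mask bits  */
	unsigned int shared;		/* counting resource shared by HT?  */
	int          cntr[2][3];	/* usable counters per thread       */
};

static const struct event_bind demo_bind = {
	.opcode     = 0x0705,				/* made-up value */
	.escr_msr   = { 0x100, 0x101 },			/* made-up MSRs  */
	.escr_emask = EMASK_BIT(0) | EMASK_BIT(1) | EMASK_BIT(2),
	.shared     = 1,
	.cntr       = { {8, 9, -1}, {10, 11, -1} },
};

int main(void)
{
	return (int)demo_bind.escr_emask;	/* 7: three mask bits allowed */
}
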
@@ -428,29 +634,73 @@ static u64 p4_pmu_event_map(int hw_event)
428 return config; 634 return config;
429} 635}
430 636
637/* check cpu model specifics */
638static bool p4_event_match_cpu_model(unsigned int event_idx)
639{
 640	/* INSTR_COMPLETED event only exists for models 3, 4, 6 (Prescott) */
641 if (event_idx == P4_EVENT_INSTR_COMPLETED) {
642 if (boot_cpu_data.x86_model != 3 &&
643 boot_cpu_data.x86_model != 4 &&
644 boot_cpu_data.x86_model != 6)
645 return false;
646 }
647
648 /*
 649	 * For information:
 650	 * - IQ_ESCR0, IQ_ESCR1 are only available on models 1 and 2
651 */
652
653 return true;
654}
655
431static int p4_validate_raw_event(struct perf_event *event) 656static int p4_validate_raw_event(struct perf_event *event)
432{ 657{
433 unsigned int v; 658 unsigned int v, emask;
434 659
 435	/* user data may have out-of-bound event index */			 660	/* User data may have an out-of-bounds event index */
436 v = p4_config_unpack_event(event->attr.config); 661 v = p4_config_unpack_event(event->attr.config);
437 if (v >= ARRAY_SIZE(p4_event_bind_map)) { 662 if (v >= ARRAY_SIZE(p4_event_bind_map))
438 pr_warning("P4 PMU: Unknown event code: %d\n", v); 663 return -EINVAL;
664
665 /* It may be unsupported: */
666 if (!p4_event_match_cpu_model(v))
439 return -EINVAL; 667 return -EINVAL;
668
669 /*
 670	 * NOTE: P4_CCCR_THREAD_ANY does not have the same meaning as
 671	 * in Architectural Performance Monitoring: it selects not
 672	 * _which_ logical cpu to count on but rather _when_, i.e. it
 673	 * depends on the logical cpu state -- count the event if one cpu
 674	 * is active, none, both or any, so we just let the user pass any
 675	 * desired value.
 676	 *
 677	 * In turn we always set the Tx_OS/Tx_USR bits bound to a logical
 678	 * cpu without propagating them to the other cpu.
679 */
680
681 /*
682 * if an event is shared across the logical threads
683 * the user needs special permissions to be able to use it
684 */
685 if (p4_ht_active() && p4_event_bind_map[v].shared) {
686 if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
687 return -EACCES;
440 } 688 }
441 689
690 /* ESCR EventMask bits may be invalid */
691 emask = p4_config_unpack_escr(event->attr.config) & P4_ESCR_EVENTMASK_MASK;
692 if (emask & ~p4_event_bind_map[v].escr_emask)
693 return -EINVAL;
694
442 /* 695 /*
443 * it may have some screwed PEBS bits 696 * it may have some invalid PEBS bits
444 */ 697 */
445 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE)) { 698 if (p4_config_pebs_has(event->attr.config, P4_PEBS_CONFIG_ENABLE))
446 pr_warning("P4 PMU: PEBS are not supported yet\n");
447 return -EINVAL; 699 return -EINVAL;
448 } 700
449 v = p4_config_unpack_metric(event->attr.config); 701 v = p4_config_unpack_metric(event->attr.config);
450 if (v >= ARRAY_SIZE(p4_pebs_bind_map)) { 702 if (v >= ARRAY_SIZE(p4_pebs_bind_map))
451 pr_warning("P4 PMU: Unknown metric code: %d\n", v);
452 return -EINVAL; 703 return -EINVAL;
453 }
454 704
455 return 0; 705 return 0;
456} 706}
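The most important new check above is the event-mask whitelist: any ESCR event-mask bit present in the user's raw config but absent from the event's escr_emask makes the whole request invalid. Stripped of the kernel plumbing, the test is a single mask-and-complement expression; a minimal sketch with illustrative values:

#include <stdbool.h>
#include <stdio.h>

/* Reject any event-mask bit the event does not advertise as valid. */
static bool emask_is_valid(unsigned int user_emask, unsigned int allowed_emask)
{
	return (user_emask & ~allowed_emask) == 0;
}

int main(void)
{
	unsigned int allowed = 0x7;			/* say bits 0-2 are defined */

	printf("%d\n", emask_is_valid(0x5, allowed));	/* 1: subset, accepted      */
	printf("%d\n", emask_is_valid(0x9, allowed));	/* 0: bit 3 is unknown      */
	return 0;
}

Note also that the pr_warning() calls are gone, so out-of-range event codes, unsupported cpu models and bad metric indices now simply fail with -EINVAL, while HT-shared events return -EACCES for unprivileged users when perf_paranoid_cpu() is in effect.
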
@@ -477,28 +727,30 @@ static int p4_hw_config(struct perf_event *event)
477 event->hw.config = p4_set_ht_bit(event->hw.config); 727 event->hw.config = p4_set_ht_bit(event->hw.config);
478 728
479 if (event->attr.type == PERF_TYPE_RAW) { 729 if (event->attr.type == PERF_TYPE_RAW) {
730 struct p4_event_bind *bind;
731 unsigned int esel;
732 /*
 733	 * Clear the bits we reserve to be managed by the kernel itself
 734	 * and never allow them to be set from user space
735 */
736 event->attr.config &= P4_CONFIG_MASK;
480 737
481 rc = p4_validate_raw_event(event); 738 rc = p4_validate_raw_event(event);
482 if (rc) 739 if (rc)
483 goto out; 740 goto out;
484 741
485 /* 742 /*
486 * We don't control raw events so it's up to the caller
487 * to pass sane values (and we don't count the thread number
488 * on HT machine but allow HT-compatible specifics to be
489 * passed on)
490 *
491 * Note that for RAW events we allow user to use P4_CCCR_RESERVED 743 * Note that for RAW events we allow user to use P4_CCCR_RESERVED
492 * bits since we keep additional info here (for cache events and etc) 744 * bits since we keep additional info here (for cache events and etc)
493 *
494 * XXX: HT wide things should check perf_paranoid_cpu() &&
495 * CAP_SYS_ADMIN
496 */ 745 */
497 event->hw.config |= event->attr.config & 746 event->hw.config |= event->attr.config;
498 (p4_config_pack_escr(P4_ESCR_MASK_HT) | 747 bind = p4_config_get_bind(event->attr.config);
499 p4_config_pack_cccr(P4_CCCR_MASK_HT | P4_CCCR_RESERVED)); 748 if (!bind) {
500 749 rc = -EINVAL;
501 event->hw.config &= ~P4_CCCR_FORCE_OVF; 750 goto out;
751 }
752 esel = P4_OPCODE_ESEL(bind->opcode);
753 event->hw.config |= p4_config_pack_cccr(P4_CCCR_ESEL(esel));
502 } 754 }
503 755
504 rc = x86_setup_perfctr(event); 756 rc = x86_setup_perfctr(event);
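After validation, the RAW path derives the ESCR select (esel) from the event's own opcode and packs it into the CCCR rather than trusting whatever user space supplied. Assuming the usual P4 packing where an opcode keeps the ESCR select in its low byte, and using the CCCR escr-select field at bits 13-15 (the same shift the soon-to-be-removed watchdog code below encodes as P4_CCCR_ESCR_SELECT()), the operation boils down to:

#include <stdio.h>

/* Assumed layout: opcode = (event select << 8) | ESCR select. */
#define OPCODE_ESEL(op)		((op) & 0xffu)
/* CCCR bits 13-15 pick which ESCR the counter listens to. */
#define CCCR_ESEL(e)		((e) << 13)

int main(void)
{
	unsigned int opcode = (0x02u << 8) | 0x05u;	/* illustrative values */
	unsigned int cccr   = CCCR_ESEL(OPCODE_ESEL(opcode));

	printf("esel=%u cccr=%#x\n", OPCODE_ESEL(opcode), cccr);
	return 0;
}
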
@@ -509,19 +761,27 @@ out:
509 761
510static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc) 762static inline int p4_pmu_clear_cccr_ovf(struct hw_perf_event *hwc)
511{ 763{
512 int overflow = 0; 764 u64 v;
513 u32 low, high;
514
515 rdmsr(hwc->config_base + hwc->idx, low, high);
516 765
517 /* we need to check high bit for unflagged overflows */ 766 /* an official way for overflow indication */
518 if ((low & P4_CCCR_OVF) || !(high & (1 << 31))) { 767 rdmsrl(hwc->config_base, v);
519 overflow = 1; 768 if (v & P4_CCCR_OVF) {
520 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 769 wrmsrl(hwc->config_base, v & ~P4_CCCR_OVF);
521 ((u64)low) & ~P4_CCCR_OVF); 770 return 1;
522 } 771 }
523 772
524 return overflow; 773 /*
 774	 * In some circumstances the overflow might issue an NMI without
 775	 * setting the P4_CCCR_OVF bit. Because a counter holds a negative
 776	 * value we simply check whether the high bit is set; if it is
 777	 * cleared, the counter has reached zero and continued counting
 778	 * before the real NMI signal was received:
779 */
780 rdmsrl(hwc->event_base, v);
781 if (!(v & ARCH_P4_UNFLAGGED_BIT))
782 return 1;
783
784 return 0;
525} 785}
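The rewritten helper above detects an overflow in two steps: first the official OVF flag in the CCCR, then, if that is clear, the counter value itself, since a counter started at a negative value keeps its top bit set until it wraps. With the MSR reads factored out, the decision is just the following (bit positions per P4_CCCR_OVF and the 40-bit counter width, the latter assumed):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CCCR_OVF_BIT	(1ull << 31)	/* P4_CCCR_OVF                         */
#define CNTR_SIGN_BIT	(1ull << 39)	/* top bit of the 40-bit counter (assumed) */

/* Given raw CCCR and counter values, did this counter overflow? */
static bool counter_overflowed(uint64_t cccr, uint64_t counter)
{
	if (cccr & CCCR_OVF_BIT)
		return true;		/* hardware flagged the overflow */

	/*
	 * No flag, but the counter went non-negative: it wrapped and kept
	 * counting before the flag/NMI made it through.
	 */
	return !(counter & CNTR_SIGN_BIT);
}

int main(void)
{
	printf("%d %d %d\n",
	       counter_overflowed(CCCR_OVF_BIT, CNTR_SIGN_BIT),	/* 1 */
	       counter_overflowed(0, 0),				/* 1 */
	       counter_overflowed(0, CNTR_SIGN_BIT));			/* 0 */
	return 0;
}
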
526 786
527static void p4_pmu_disable_pebs(void) 787static void p4_pmu_disable_pebs(void)
@@ -531,13 +791,13 @@ static void p4_pmu_disable_pebs(void)
531 * 791 *
532 * It's still allowed that two threads setup same cache 792 * It's still allowed that two threads setup same cache
533 * events so we can't simply clear metrics until we knew 793 * events so we can't simply clear metrics until we knew
534 * noone is depending on us, so we need kind of counter 794 * no one is depending on us, so we need kind of counter
535 * for "ReplayEvent" users. 795 * for "ReplayEvent" users.
536 * 796 *
537 * What is more complex -- RAW events, if user (for some 797 * What is more complex -- RAW events, if user (for some
538 * reason) will pass some cache event metric with improper 798 * reason) will pass some cache event metric with improper
539 * event opcode -- it's fine from hardware point of view 799 * event opcode -- it's fine from hardware point of view
540 * but completely nonsence from "meaning" of such action. 800 * but completely nonsense from "meaning" of such action.
541 * 801 *
542 * So at moment let leave metrics turned on forever -- it's 802 * So at moment let leave metrics turned on forever -- it's
543 * ok for now but need to be revisited! 803 * ok for now but need to be revisited!
@@ -556,7 +816,7 @@ static inline void p4_pmu_disable_event(struct perf_event *event)
556 * state we need to clear P4_CCCR_OVF, otherwise interrupt get 816 * state we need to clear P4_CCCR_OVF, otherwise interrupt get
557 * asserted again and again 817 * asserted again and again
558 */ 818 */
559 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 819 (void)checking_wrmsrl(hwc->config_base,
560 (u64)(p4_config_unpack_cccr(hwc->config)) & 820 (u64)(p4_config_unpack_cccr(hwc->config)) &
561 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED); 821 ~P4_CCCR_ENABLE & ~P4_CCCR_OVF & ~P4_CCCR_RESERVED);
562} 822}
@@ -626,7 +886,7 @@ static void p4_pmu_enable_event(struct perf_event *event)
626 p4_pmu_enable_pebs(hwc->config); 886 p4_pmu_enable_pebs(hwc->config);
627 887
628 (void)checking_wrmsrl(escr_addr, escr_conf); 888 (void)checking_wrmsrl(escr_addr, escr_conf);
629 (void)checking_wrmsrl(hwc->config_base + hwc->idx, 889 (void)checking_wrmsrl(hwc->config_base,
630 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE); 890 (cccr & ~P4_CCCR_RESERVED) | P4_CCCR_ENABLE);
631} 891}
632 892
@@ -652,8 +912,7 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
652 int idx, handled = 0; 912 int idx, handled = 0;
653 u64 val; 913 u64 val;
654 914
655 data.addr = 0; 915 perf_sample_data_init(&data, 0);
656 data.raw = NULL;
657 916
658 cpuc = &__get_cpu_var(cpu_hw_events); 917 cpuc = &__get_cpu_var(cpu_hw_events);
659 918
@@ -687,14 +946,23 @@ static int p4_pmu_handle_irq(struct pt_regs *regs)
687 if (!x86_perf_event_set_period(event)) 946 if (!x86_perf_event_set_period(event))
688 continue; 947 continue;
689 if (perf_event_overflow(event, 1, &data, regs)) 948 if (perf_event_overflow(event, 1, &data, regs))
690 p4_pmu_disable_event(event); 949 x86_pmu_stop(event, 0);
691 } 950 }
692 951
693 if (handled) { 952 if (handled)
694 /* p4 quirk: unmask it again */
695 apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED);
696 inc_irq_stat(apic_perf_irqs); 953 inc_irq_stat(apic_perf_irqs);
697 } 954
955 /*
 956	 * When dealing with the unmasking of the LVTPC on P4 perf hw, it has
 957	 * been observed that the OVF bit has to be cleared first, _before_
 958	 * the LVTPC can be unmasked.
 959	 *
 960	 * The reason is that the NMI line will continue to be asserted while
 961	 * the OVF bit is set. This causes a second NMI to be generated if the
 962	 * LVTPC is unmasked before the OVF bit is cleared, leading to unknown
 963	 * NMI messages.
964 */
965 apic_write(APIC_LVTPC, APIC_DM_NMI);
698 966
699 return handled; 967 return handled;
700} 968}
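The handler change is mostly about ordering: every overflowed counter is acknowledged (its OVF bit cleared through the helper above) before the single APIC_DM_NMI write at the end unmasks the LVTPC again, since unmasking while any OVF bit is still set would immediately re-raise the NMI. The shape of that loop, with the hardware accesses replaced by hypothetical stubs:

#include <stdio.h>

/* Hypothetical stand-ins for the MSR and APIC writes. */
static int  ack_counter(int idx)  { printf("clear OVF on counter %d\n", idx); return 1; }
static void unmask_lvtpc(void)    { printf("write APIC_DM_NMI to the LVTPC\n"); }

int main(void)
{
	int handled = 0;

	for (int idx = 0; idx < 3; idx++)	/* pretend three counters fired */
		handled += ack_counter(idx);

	unmask_lvtpc();				/* only after all OVF bits are clear */
	return handled ? 0 : 1;
}
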
@@ -908,9 +1176,9 @@ static __initconst const struct x86_pmu p4_pmu = {
908 */ 1176 */
909 .num_counters = ARCH_P4_MAX_CCCR, 1177 .num_counters = ARCH_P4_MAX_CCCR,
910 .apic = 1, 1178 .apic = 1,
911 .cntval_bits = 40, 1179 .cntval_bits = ARCH_P4_CNTRVAL_BITS,
912 .cntval_mask = (1ULL << 40) - 1, 1180 .cntval_mask = ARCH_P4_CNTRVAL_MASK,
913 .max_period = (1ULL << 39) - 1, 1181 .max_period = (1ULL << (ARCH_P4_CNTRVAL_BITS - 1)) - 1,
914 .hw_config = p4_hw_config, 1182 .hw_config = p4_hw_config,
915 .schedule_events = p4_pmu_schedule_events, 1183 .schedule_events = p4_pmu_schedule_events,
916 /* 1184 /*
@@ -928,7 +1196,7 @@ static __init int p4_pmu_init(void)
928{ 1196{
929 unsigned int low, high; 1197 unsigned int low, high;
930 1198
931 /* If we get stripped -- indexig fails */ 1199 /* If we get stripped -- indexing fails */
932 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC); 1200 BUILD_BUG_ON(ARCH_P4_MAX_CCCR > X86_PMC_MAX_GENERIC);
933 1201
934 rdmsr(MSR_IA32_MISC_ENABLE, low, high); 1202 rdmsr(MSR_IA32_MISC_ENABLE, low, high);
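The literal 40/39 counter constants in the PMU descriptor become named constants derived from the counter width; the relationship is simply that the mask covers every counter bit while the maximum period leaves the top bit free so a programmed period always starts out negative. For a 40-bit counter:

#include <stdio.h>

int main(void)
{
	const unsigned bits = 40;				/* counter width */
	unsigned long long mask       = (1ull << bits) - 1;
	unsigned long long max_period = (1ull << (bits - 1)) - 1;

	printf("cntval_mask=%#llx max_period=%#llx\n", mask, max_period);
	return 0;
}
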
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
index 34ba07be2cda..20c097e33860 100644
--- a/arch/x86/kernel/cpu/perf_event_p6.c
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -68,7 +68,7 @@ p6_pmu_disable_event(struct perf_event *event)
68 if (cpuc->enabled) 68 if (cpuc->enabled)
69 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 69 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
70 70
71 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 71 (void)checking_wrmsrl(hwc->config_base, val);
72} 72}
73 73
74static void p6_pmu_enable_event(struct perf_event *event) 74static void p6_pmu_enable_event(struct perf_event *event)
@@ -81,7 +81,7 @@ static void p6_pmu_enable_event(struct perf_event *event)
81 if (cpuc->enabled) 81 if (cpuc->enabled)
82 val |= ARCH_PERFMON_EVENTSEL_ENABLE; 82 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
83 83
84 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val); 84 (void)checking_wrmsrl(hwc->config_base, val);
85} 85}
86 86
87static __initconst const struct x86_pmu p6_pmu = { 87static __initconst const struct x86_pmu p6_pmu = {
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fb329e9f8494..966512b2cacf 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -16,32 +16,12 @@
16#include <linux/kernel.h> 16#include <linux/kernel.h>
17#include <linux/bitops.h> 17#include <linux/bitops.h>
18#include <linux/smp.h> 18#include <linux/smp.h>
19#include <linux/nmi.h> 19#include <asm/nmi.h>
20#include <linux/kprobes.h> 20#include <linux/kprobes.h>
21 21
22#include <asm/apic.h> 22#include <asm/apic.h>
23#include <asm/perf_event.h> 23#include <asm/perf_event.h>
24 24
25struct nmi_watchdog_ctlblk {
26 unsigned int cccr_msr;
27 unsigned int perfctr_msr; /* the MSR to reset in NMI handler */
28 unsigned int evntsel_msr; /* the MSR to select the events to handle */
29};
30
31/* Interface defining a CPU specific perfctr watchdog */
32struct wd_ops {
33 int (*reserve)(void);
34 void (*unreserve)(void);
35 int (*setup)(unsigned nmi_hz);
36 void (*rearm)(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz);
37 void (*stop)(void);
38 unsigned perfctr;
39 unsigned evntsel;
40 u64 checkbit;
41};
42
43static const struct wd_ops *wd_ops;
44
45/* 25/*
46 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's 26 * this number is calculated from Intel's MSR_P4_CRU_ESCR5 register and it's
47 * offset from MSR_P4_BSU_ESCR0. 27 * offset from MSR_P4_BSU_ESCR0.
@@ -60,14 +40,14 @@ static const struct wd_ops *wd_ops;
60static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS); 40static DECLARE_BITMAP(perfctr_nmi_owner, NMI_MAX_COUNTER_BITS);
61static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS); 41static DECLARE_BITMAP(evntsel_nmi_owner, NMI_MAX_COUNTER_BITS);
62 42
63static DEFINE_PER_CPU(struct nmi_watchdog_ctlblk, nmi_watchdog_ctlblk);
64
65/* converts an msr to an appropriate reservation bit */ 43/* converts an msr to an appropriate reservation bit */
66static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr) 44static inline unsigned int nmi_perfctr_msr_to_bit(unsigned int msr)
67{ 45{
68 /* returns the bit offset of the performance counter register */ 46 /* returns the bit offset of the performance counter register */
69 switch (boot_cpu_data.x86_vendor) { 47 switch (boot_cpu_data.x86_vendor) {
70 case X86_VENDOR_AMD: 48 case X86_VENDOR_AMD:
49 if (msr >= MSR_F15H_PERF_CTR)
50 return (msr - MSR_F15H_PERF_CTR) >> 1;
71 return msr - MSR_K7_PERFCTR0; 51 return msr - MSR_K7_PERFCTR0;
72 case X86_VENDOR_INTEL: 52 case X86_VENDOR_INTEL:
73 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 53 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
@@ -92,6 +72,8 @@ static inline unsigned int nmi_evntsel_msr_to_bit(unsigned int msr)
92 /* returns the bit offset of the event selection register */ 72 /* returns the bit offset of the event selection register */
93 switch (boot_cpu_data.x86_vendor) { 73 switch (boot_cpu_data.x86_vendor) {
94 case X86_VENDOR_AMD: 74 case X86_VENDOR_AMD:
75 if (msr >= MSR_F15H_PERF_CTL)
76 return (msr - MSR_F15H_PERF_CTL) >> 1;
95 return msr - MSR_K7_EVNTSEL0; 77 return msr - MSR_K7_EVNTSEL0;
96 case X86_VENDOR_INTEL: 78 case X86_VENDOR_INTEL:
97 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) 79 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
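Both reservation helpers gain a Family 15h branch. Judging by the >> 1, those MSRs interleave an event-select and a counter register per slot, so the counter index is the offset from the counter base divided by two, whereas the K7 bank stays a simple contiguous range. The arithmetic, using an opaque made-up base address:

#include <stdio.h>

/*
 * Illustrative only: with interleaved CTL0,CTR0,CTL1,CTR1,... MSRs the
 * counter index is (msr - base) / 2; a contiguous bank uses (msr - base).
 */
static unsigned int idx_interleaved(unsigned int msr, unsigned int ctr_base)
{
	return (msr - ctr_base) >> 1;
}

int main(void)
{
	unsigned int base = 0x1001;			/* made-up CTR0 address */

	printf("%u %u %u\n",
	       idx_interleaved(base, base),		/* CTR0 -> 0 */
	       idx_interleaved(base + 2, base),		/* CTR1 -> 1 */
	       idx_interleaved(base + 4, base));	/* CTR2 -> 2 */
	return 0;
}
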
@@ -172,624 +154,3 @@ void release_evntsel_nmi(unsigned int msr)
172 clear_bit(counter, evntsel_nmi_owner); 154 clear_bit(counter, evntsel_nmi_owner);
173} 155}
174EXPORT_SYMBOL(release_evntsel_nmi); 156EXPORT_SYMBOL(release_evntsel_nmi);
175
176void disable_lapic_nmi_watchdog(void)
177{
178 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
179
180 if (atomic_read(&nmi_active) <= 0)
181 return;
182
183 on_each_cpu(stop_apic_nmi_watchdog, NULL, 1);
184
185 if (wd_ops)
186 wd_ops->unreserve();
187
188 BUG_ON(atomic_read(&nmi_active) != 0);
189}
190
191void enable_lapic_nmi_watchdog(void)
192{
193 BUG_ON(nmi_watchdog != NMI_LOCAL_APIC);
194
195 /* are we already enabled */
196 if (atomic_read(&nmi_active) != 0)
197 return;
198
199 /* are we lapic aware */
200 if (!wd_ops)
201 return;
202 if (!wd_ops->reserve()) {
203 printk(KERN_ERR "NMI watchdog: cannot reserve perfctrs\n");
204 return;
205 }
206
207 on_each_cpu(setup_apic_nmi_watchdog, NULL, 1);
208 touch_nmi_watchdog();
209}
210
211/*
212 * Activate the NMI watchdog via the local APIC.
213 */
214
215static unsigned int adjust_for_32bit_ctr(unsigned int hz)
216{
217 u64 counter_val;
218 unsigned int retval = hz;
219
220 /*
221 * On Intel CPUs with P6/ARCH_PERFMON only 32 bits in the counter
222 * are writable, with higher bits sign extending from bit 31.
223 * So, we can only program the counter with 31 bit values and
224 * 32nd bit should be 1, for 33.. to be 1.
225 * Find the appropriate nmi_hz
226 */
227 counter_val = (u64)cpu_khz * 1000;
228 do_div(counter_val, retval);
229 if (counter_val > 0x7fffffffULL) {
230 u64 count = (u64)cpu_khz * 1000;
231 do_div(count, 0x7fffffffUL);
232 retval = count + 1;
233 }
234 return retval;
235}
236
237static void write_watchdog_counter(unsigned int perfctr_msr,
238 const char *descr, unsigned nmi_hz)
239{
240 u64 count = (u64)cpu_khz * 1000;
241
242 do_div(count, nmi_hz);
243 if (descr)
244 pr_debug("setting %s to -0x%08Lx\n", descr, count);
245 wrmsrl(perfctr_msr, 0 - count);
246}
247
248static void write_watchdog_counter32(unsigned int perfctr_msr,
249 const char *descr, unsigned nmi_hz)
250{
251 u64 count = (u64)cpu_khz * 1000;
252
253 do_div(count, nmi_hz);
254 if (descr)
255 pr_debug("setting %s to -0x%08Lx\n", descr, count);
256 wrmsr(perfctr_msr, (u32)(-count), 0);
257}
258
259/*
260 * AMD K7/K8/Family10h/Family11h support.
261 * AMD keeps this interface nicely stable so there is not much variety
262 */
263#define K7_EVNTSEL_ENABLE (1 << 22)
264#define K7_EVNTSEL_INT (1 << 20)
265#define K7_EVNTSEL_OS (1 << 17)
266#define K7_EVNTSEL_USR (1 << 16)
267#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
268#define K7_NMI_EVENT K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
269
270static int setup_k7_watchdog(unsigned nmi_hz)
271{
272 unsigned int perfctr_msr, evntsel_msr;
273 unsigned int evntsel;
274 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
275
276 perfctr_msr = wd_ops->perfctr;
277 evntsel_msr = wd_ops->evntsel;
278
279 wrmsrl(perfctr_msr, 0UL);
280
281 evntsel = K7_EVNTSEL_INT
282 | K7_EVNTSEL_OS
283 | K7_EVNTSEL_USR
284 | K7_NMI_EVENT;
285
286 /* setup the timer */
287 wrmsr(evntsel_msr, evntsel, 0);
288 write_watchdog_counter(perfctr_msr, "K7_PERFCTR0", nmi_hz);
289
290 /* initialize the wd struct before enabling */
291 wd->perfctr_msr = perfctr_msr;
292 wd->evntsel_msr = evntsel_msr;
293 wd->cccr_msr = 0; /* unused */
294
295 /* ok, everything is initialized, announce that we're set */
296 cpu_nmi_set_wd_enabled();
297
298 apic_write(APIC_LVTPC, APIC_DM_NMI);
299 evntsel |= K7_EVNTSEL_ENABLE;
300 wrmsr(evntsel_msr, evntsel, 0);
301
302 return 1;
303}
304
305static void single_msr_stop_watchdog(void)
306{
307 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
308
309 wrmsr(wd->evntsel_msr, 0, 0);
310}
311
312static int single_msr_reserve(void)
313{
314 if (!reserve_perfctr_nmi(wd_ops->perfctr))
315 return 0;
316
317 if (!reserve_evntsel_nmi(wd_ops->evntsel)) {
318 release_perfctr_nmi(wd_ops->perfctr);
319 return 0;
320 }
321 return 1;
322}
323
324static void single_msr_unreserve(void)
325{
326 release_evntsel_nmi(wd_ops->evntsel);
327 release_perfctr_nmi(wd_ops->perfctr);
328}
329
330static void __kprobes
331single_msr_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
332{
333 /* start the cycle over again */
334 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
335}
336
337static const struct wd_ops k7_wd_ops = {
338 .reserve = single_msr_reserve,
339 .unreserve = single_msr_unreserve,
340 .setup = setup_k7_watchdog,
341 .rearm = single_msr_rearm,
342 .stop = single_msr_stop_watchdog,
343 .perfctr = MSR_K7_PERFCTR0,
344 .evntsel = MSR_K7_EVNTSEL0,
345 .checkbit = 1ULL << 47,
346};
347
348/*
349 * Intel Model 6 (PPro+,P2,P3,P-M,Core1)
350 */
351#define P6_EVNTSEL0_ENABLE (1 << 22)
352#define P6_EVNTSEL_INT (1 << 20)
353#define P6_EVNTSEL_OS (1 << 17)
354#define P6_EVNTSEL_USR (1 << 16)
355#define P6_EVENT_CPU_CLOCKS_NOT_HALTED 0x79
356#define P6_NMI_EVENT P6_EVENT_CPU_CLOCKS_NOT_HALTED
357
358static int setup_p6_watchdog(unsigned nmi_hz)
359{
360 unsigned int perfctr_msr, evntsel_msr;
361 unsigned int evntsel;
362 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
363
364 perfctr_msr = wd_ops->perfctr;
365 evntsel_msr = wd_ops->evntsel;
366
367 /* KVM doesn't implement this MSR */
368 if (wrmsr_safe(perfctr_msr, 0, 0) < 0)
369 return 0;
370
371 evntsel = P6_EVNTSEL_INT
372 | P6_EVNTSEL_OS
373 | P6_EVNTSEL_USR
374 | P6_NMI_EVENT;
375
376 /* setup the timer */
377 wrmsr(evntsel_msr, evntsel, 0);
378 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
379 write_watchdog_counter32(perfctr_msr, "P6_PERFCTR0", nmi_hz);
380
381 /* initialize the wd struct before enabling */
382 wd->perfctr_msr = perfctr_msr;
383 wd->evntsel_msr = evntsel_msr;
384 wd->cccr_msr = 0; /* unused */
385
386 /* ok, everything is initialized, announce that we're set */
387 cpu_nmi_set_wd_enabled();
388
389 apic_write(APIC_LVTPC, APIC_DM_NMI);
390 evntsel |= P6_EVNTSEL0_ENABLE;
391 wrmsr(evntsel_msr, evntsel, 0);
392
393 return 1;
394}
395
396static void __kprobes p6_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
397{
398 /*
399 * P6 based Pentium M need to re-unmask
400 * the apic vector but it doesn't hurt
401 * other P6 variant.
402 * ArchPerfom/Core Duo also needs this
403 */
404 apic_write(APIC_LVTPC, APIC_DM_NMI);
405
406 /* P6/ARCH_PERFMON has 32 bit counter write */
407 write_watchdog_counter32(wd->perfctr_msr, NULL, nmi_hz);
408}
409
410static const struct wd_ops p6_wd_ops = {
411 .reserve = single_msr_reserve,
412 .unreserve = single_msr_unreserve,
413 .setup = setup_p6_watchdog,
414 .rearm = p6_rearm,
415 .stop = single_msr_stop_watchdog,
416 .perfctr = MSR_P6_PERFCTR0,
417 .evntsel = MSR_P6_EVNTSEL0,
418 .checkbit = 1ULL << 39,
419};
420
421/*
422 * Intel P4 performance counters.
423 * By far the most complicated of all.
424 */
425#define MSR_P4_MISC_ENABLE_PERF_AVAIL (1 << 7)
426#define P4_ESCR_EVENT_SELECT(N) ((N) << 25)
427#define P4_ESCR_OS (1 << 3)
428#define P4_ESCR_USR (1 << 2)
429#define P4_CCCR_OVF_PMI0 (1 << 26)
430#define P4_CCCR_OVF_PMI1 (1 << 27)
431#define P4_CCCR_THRESHOLD(N) ((N) << 20)
432#define P4_CCCR_COMPLEMENT (1 << 19)
433#define P4_CCCR_COMPARE (1 << 18)
434#define P4_CCCR_REQUIRED (3 << 16)
435#define P4_CCCR_ESCR_SELECT(N) ((N) << 13)
436#define P4_CCCR_ENABLE (1 << 12)
437#define P4_CCCR_OVF (1 << 31)
438
439#define P4_CONTROLS 18
440static unsigned int p4_controls[18] = {
441 MSR_P4_BPU_CCCR0,
442 MSR_P4_BPU_CCCR1,
443 MSR_P4_BPU_CCCR2,
444 MSR_P4_BPU_CCCR3,
445 MSR_P4_MS_CCCR0,
446 MSR_P4_MS_CCCR1,
447 MSR_P4_MS_CCCR2,
448 MSR_P4_MS_CCCR3,
449 MSR_P4_FLAME_CCCR0,
450 MSR_P4_FLAME_CCCR1,
451 MSR_P4_FLAME_CCCR2,
452 MSR_P4_FLAME_CCCR3,
453 MSR_P4_IQ_CCCR0,
454 MSR_P4_IQ_CCCR1,
455 MSR_P4_IQ_CCCR2,
456 MSR_P4_IQ_CCCR3,
457 MSR_P4_IQ_CCCR4,
458 MSR_P4_IQ_CCCR5,
459};
460/*
461 * Set up IQ_COUNTER0 to behave like a clock, by having IQ_CCCR0 filter
462 * CRU_ESCR0 (with any non-null event selector) through a complemented
463 * max threshold. [IA32-Vol3, Section 14.9.9]
464 */
465static int setup_p4_watchdog(unsigned nmi_hz)
466{
467 unsigned int perfctr_msr, evntsel_msr, cccr_msr;
468 unsigned int evntsel, cccr_val;
469 unsigned int misc_enable, dummy;
470 unsigned int ht_num;
471 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
472
473 rdmsr(MSR_IA32_MISC_ENABLE, misc_enable, dummy);
474 if (!(misc_enable & MSR_P4_MISC_ENABLE_PERF_AVAIL))
475 return 0;
476
477#ifdef CONFIG_SMP
478 /* detect which hyperthread we are on */
479 if (smp_num_siblings == 2) {
480 unsigned int ebx, apicid;
481
482 ebx = cpuid_ebx(1);
483 apicid = (ebx >> 24) & 0xff;
484 ht_num = apicid & 1;
485 } else
486#endif
487 ht_num = 0;
488
489 /*
490 * performance counters are shared resources
491 * assign each hyperthread its own set
492 * (re-use the ESCR0 register, seems safe
493 * and keeps the cccr_val the same)
494 */
495 if (!ht_num) {
496 /* logical cpu 0 */
497 perfctr_msr = MSR_P4_IQ_PERFCTR0;
498 evntsel_msr = MSR_P4_CRU_ESCR0;
499 cccr_msr = MSR_P4_IQ_CCCR0;
500 cccr_val = P4_CCCR_OVF_PMI0 | P4_CCCR_ESCR_SELECT(4);
501
502 /*
503 * If we're on the kdump kernel or other situation, we may
504 * still have other performance counter registers set to
505 * interrupt and they'll keep interrupting forever because
506 * of the P4_CCCR_OVF quirk. So we need to ACK all the
507 * pending interrupts and disable all the registers here,
508 * before reenabling the NMI delivery. Refer to p4_rearm()
509 * about the P4_CCCR_OVF quirk.
510 */
511 if (reset_devices) {
512 unsigned int low, high;
513 int i;
514
515 for (i = 0; i < P4_CONTROLS; i++) {
516 rdmsr(p4_controls[i], low, high);
517 low &= ~(P4_CCCR_ENABLE | P4_CCCR_OVF);
518 wrmsr(p4_controls[i], low, high);
519 }
520 }
521 } else {
522 /* logical cpu 1 */
523 perfctr_msr = MSR_P4_IQ_PERFCTR1;
524 evntsel_msr = MSR_P4_CRU_ESCR0;
525 cccr_msr = MSR_P4_IQ_CCCR1;
526
527 /* Pentium 4 D processors don't support P4_CCCR_OVF_PMI1 */
528 if (boot_cpu_data.x86_model == 4 && boot_cpu_data.x86_mask == 4)
529 cccr_val = P4_CCCR_OVF_PMI0;
530 else
531 cccr_val = P4_CCCR_OVF_PMI1;
532 cccr_val |= P4_CCCR_ESCR_SELECT(4);
533 }
534
535 evntsel = P4_ESCR_EVENT_SELECT(0x3F)
536 | P4_ESCR_OS
537 | P4_ESCR_USR;
538
539 cccr_val |= P4_CCCR_THRESHOLD(15)
540 | P4_CCCR_COMPLEMENT
541 | P4_CCCR_COMPARE
542 | P4_CCCR_REQUIRED;
543
544 wrmsr(evntsel_msr, evntsel, 0);
545 wrmsr(cccr_msr, cccr_val, 0);
546 write_watchdog_counter(perfctr_msr, "P4_IQ_COUNTER0", nmi_hz);
547
548 wd->perfctr_msr = perfctr_msr;
549 wd->evntsel_msr = evntsel_msr;
550 wd->cccr_msr = cccr_msr;
551
552 /* ok, everything is initialized, announce that we're set */
553 cpu_nmi_set_wd_enabled();
554
555 apic_write(APIC_LVTPC, APIC_DM_NMI);
556 cccr_val |= P4_CCCR_ENABLE;
557 wrmsr(cccr_msr, cccr_val, 0);
558 return 1;
559}
560
561static void stop_p4_watchdog(void)
562{
563 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
564 wrmsr(wd->cccr_msr, 0, 0);
565 wrmsr(wd->evntsel_msr, 0, 0);
566}
567
568static int p4_reserve(void)
569{
570 if (!reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR0))
571 return 0;
572#ifdef CONFIG_SMP
573 if (smp_num_siblings > 1 && !reserve_perfctr_nmi(MSR_P4_IQ_PERFCTR1))
574 goto fail1;
575#endif
576 if (!reserve_evntsel_nmi(MSR_P4_CRU_ESCR0))
577 goto fail2;
578 /* RED-PEN why is ESCR1 not reserved here? */
579 return 1;
580 fail2:
581#ifdef CONFIG_SMP
582 if (smp_num_siblings > 1)
583 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
584 fail1:
585#endif
586 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
587 return 0;
588}
589
590static void p4_unreserve(void)
591{
592#ifdef CONFIG_SMP
593 if (smp_num_siblings > 1)
594 release_perfctr_nmi(MSR_P4_IQ_PERFCTR1);
595#endif
596 release_evntsel_nmi(MSR_P4_CRU_ESCR0);
597 release_perfctr_nmi(MSR_P4_IQ_PERFCTR0);
598}
599
600static void __kprobes p4_rearm(struct nmi_watchdog_ctlblk *wd, unsigned nmi_hz)
601{
602 unsigned dummy;
603 /*
604 * P4 quirks:
605 * - An overflown perfctr will assert its interrupt
606 * until the OVF flag in its CCCR is cleared.
607 * - LVTPC is masked on interrupt and must be
608 * unmasked by the LVTPC handler.
609 */
610 rdmsrl(wd->cccr_msr, dummy);
611 dummy &= ~P4_CCCR_OVF;
612 wrmsrl(wd->cccr_msr, dummy);
613 apic_write(APIC_LVTPC, APIC_DM_NMI);
614 /* start the cycle over again */
615 write_watchdog_counter(wd->perfctr_msr, NULL, nmi_hz);
616}
617
618static const struct wd_ops p4_wd_ops = {
619 .reserve = p4_reserve,
620 .unreserve = p4_unreserve,
621 .setup = setup_p4_watchdog,
622 .rearm = p4_rearm,
623 .stop = stop_p4_watchdog,
624 /* RED-PEN this is wrong for the other sibling */
625 .perfctr = MSR_P4_BPU_PERFCTR0,
626 .evntsel = MSR_P4_BSU_ESCR0,
627 .checkbit = 1ULL << 39,
628};
629
630/*
631 * Watchdog using the Intel architected PerfMon.
632 * Used for Core2 and hopefully all future Intel CPUs.
633 */
634#define ARCH_PERFMON_NMI_EVENT_SEL ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL
635#define ARCH_PERFMON_NMI_EVENT_UMASK ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK
636
637static struct wd_ops intel_arch_wd_ops;
638
639static int setup_intel_arch_watchdog(unsigned nmi_hz)
640{
641 unsigned int ebx;
642 union cpuid10_eax eax;
643 unsigned int unused;
644 unsigned int perfctr_msr, evntsel_msr;
645 unsigned int evntsel;
646 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
647
648 /*
649 * Check whether the Architectural PerfMon supports
650 * Unhalted Core Cycles Event or not.
651 * NOTE: Corresponding bit = 0 in ebx indicates event present.
652 */
653 cpuid(10, &(eax.full), &ebx, &unused, &unused);
654 if ((eax.split.mask_length <
655 (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX+1)) ||
656 (ebx & ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT))
657 return 0;
658
659 perfctr_msr = wd_ops->perfctr;
660 evntsel_msr = wd_ops->evntsel;
661
662 wrmsrl(perfctr_msr, 0UL);
663
664 evntsel = ARCH_PERFMON_EVENTSEL_INT
665 | ARCH_PERFMON_EVENTSEL_OS
666 | ARCH_PERFMON_EVENTSEL_USR
667 | ARCH_PERFMON_NMI_EVENT_SEL
668 | ARCH_PERFMON_NMI_EVENT_UMASK;
669
670 /* setup the timer */
671 wrmsr(evntsel_msr, evntsel, 0);
672 nmi_hz = adjust_for_32bit_ctr(nmi_hz);
673 write_watchdog_counter32(perfctr_msr, "INTEL_ARCH_PERFCTR0", nmi_hz);
674
675 wd->perfctr_msr = perfctr_msr;
676 wd->evntsel_msr = evntsel_msr;
677 wd->cccr_msr = 0; /* unused */
678
679 /* ok, everything is initialized, announce that we're set */
680 cpu_nmi_set_wd_enabled();
681
682 apic_write(APIC_LVTPC, APIC_DM_NMI);
683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
684 wrmsr(evntsel_msr, evntsel, 0);
685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
686 return 1;
687}
688
689static struct wd_ops intel_arch_wd_ops __read_mostly = {
690 .reserve = single_msr_reserve,
691 .unreserve = single_msr_unreserve,
692 .setup = setup_intel_arch_watchdog,
693 .rearm = p6_rearm,
694 .stop = single_msr_stop_watchdog,
695 .perfctr = MSR_ARCH_PERFMON_PERFCTR1,
696 .evntsel = MSR_ARCH_PERFMON_EVENTSEL1,
697};
698
699static void probe_nmi_watchdog(void)
700{
701 switch (boot_cpu_data.x86_vendor) {
702 case X86_VENDOR_AMD:
703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
705 return;
706 wd_ops = &k7_wd_ops;
707 break;
708 case X86_VENDOR_INTEL:
709 /* Work around where perfctr1 doesn't have a working enable
710 * bit as described in the following errata:
711 * AE49 Core Duo and Intel Core Solo 65 nm
712 * AN49 Intel Pentium Dual-Core
713 * AF49 Dual-Core Intel Xeon Processor LV
714 */
715 if ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 14) ||
716 ((boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 15 &&
717 boot_cpu_data.x86_mask == 4))) {
718 intel_arch_wd_ops.perfctr = MSR_ARCH_PERFMON_PERFCTR0;
719 intel_arch_wd_ops.evntsel = MSR_ARCH_PERFMON_EVENTSEL0;
720 }
721 if (cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
722 wd_ops = &intel_arch_wd_ops;
723 break;
724 }
725 switch (boot_cpu_data.x86) {
726 case 6:
727 if (boot_cpu_data.x86_model > 13)
728 return;
729
730 wd_ops = &p6_wd_ops;
731 break;
732 case 15:
733 wd_ops = &p4_wd_ops;
734 break;
735 default:
736 return;
737 }
738 break;
739 }
740}
741
742/* Interface to nmi.c */
743
744int lapic_watchdog_init(unsigned nmi_hz)
745{
746 if (!wd_ops) {
747 probe_nmi_watchdog();
748 if (!wd_ops) {
749 printk(KERN_INFO "NMI watchdog: CPU not supported\n");
750 return -1;
751 }
752
753 if (!wd_ops->reserve()) {
754 printk(KERN_ERR
755 "NMI watchdog: cannot reserve perfctrs\n");
756 return -1;
757 }
758 }
759
760 if (!(wd_ops->setup(nmi_hz))) {
761 printk(KERN_ERR "Cannot setup NMI watchdog on CPU %d\n",
762 raw_smp_processor_id());
763 return -1;
764 }
765
766 return 0;
767}
768
769void lapic_watchdog_stop(void)
770{
771 if (wd_ops)
772 wd_ops->stop();
773}
774
775unsigned lapic_adjust_nmi_hz(unsigned hz)
776{
777 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
778 if (wd->perfctr_msr == MSR_P6_PERFCTR0 ||
779 wd->perfctr_msr == MSR_ARCH_PERFMON_PERFCTR1)
780 hz = adjust_for_32bit_ctr(hz);
781 return hz;
782}
783
784int __kprobes lapic_wd_event(unsigned nmi_hz)
785{
786 struct nmi_watchdog_ctlblk *wd = &__get_cpu_var(nmi_watchdog_ctlblk);
787 u64 ctr;
788
789 rdmsrl(wd->perfctr_msr, ctr);
790 if (ctr & wd_ops->checkbit) /* perfctr still running? */
791 return 0;
792
793 wd_ops->rearm(wd, nmi_hz);
794 return 1;
795}
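Everything removed above is the old per-vendor LAPIC NMI watchdog machinery (the wd_ops implementations for K7, P6, P4 and the architectural PerfMon); only the perfctr/evntsel reservation helpers remain in this file. The one piece of arithmetic worth keeping in mind from it is how the fire rate was programmed: the counter is loaded with the negative of cpu_khz * 1000 / nmi_hz, so it overflows, and raises an NMI, nmi_hz times per second. A worked example with hypothetical numbers:

#include <stdio.h>

int main(void)
{
	unsigned long long cpu_khz = 2000000;		/* hypothetical 2 GHz CPU */
	unsigned int nmi_hz = 60;
	unsigned long long count = cpu_khz * 1000 / nmi_hz;

	/* the watchdog wrote -count so the counter overflows nmi_hz times/sec */
	printf("count=%llu start=%#llx\n", count, -count);
	return 0;
}
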
diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c
index d49079515122..c7f64e6f537a 100644
--- a/arch/x86/kernel/cpu/scattered.c
+++ b/arch/x86/kernel/cpu/scattered.c
@@ -44,6 +44,12 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 }, 44 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a, 0 },
45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 }, 45 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a, 0 },
46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 }, 46 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a, 0 },
47 { X86_FEATURE_TSCRATEMSR, CR_EDX, 4, 0x8000000a, 0 },
48 { X86_FEATURE_VMCBCLEAN, CR_EDX, 5, 0x8000000a, 0 },
49 { X86_FEATURE_FLUSHBYASID, CR_EDX, 6, 0x8000000a, 0 },
50 { X86_FEATURE_DECODEASSISTS, CR_EDX, 7, 0x8000000a, 0 },
51 { X86_FEATURE_PAUSEFILTER, CR_EDX,10, 0x8000000a, 0 },
52 { X86_FEATURE_PFTHRESHOLD, CR_EDX,12, 0x8000000a, 0 },
47 { 0, 0, 0, 0, 0 } 53 { 0, 0, 0, 0, 0 }
48 }; 54 };
49 55
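The new scattered-feature rows all follow the existing five-field format: synthetic feature flag, CPUID output register, bit number, CPUID leaf, sub-leaf. Testing one such entry is just a matter of running the leaf and masking a bit; a schematic stand-alone version (no real CPUID is issued here, the register values are faked):

#include <stdbool.h>
#include <stdint.h>

/* Stand-in for the table entry format used above. */
struct cpuid_bit {
	unsigned feature;	/* synthetic feature number           */
	unsigned reg;		/* 0 = eax, 1 = ebx, 2 = ecx, 3 = edx */
	unsigned bit;		/* bit position inside that register  */
	uint32_t level;		/* CPUID leaf to query                */
	uint32_t sub_leaf;	/* ecx input for the query            */
};

/* regs[] is pretended to hold eax..edx returned for (level, sub_leaf). */
static bool has_feature(const struct cpuid_bit *cb, const uint32_t regs[4])
{
	return (regs[cb->reg] >> cb->bit) & 1u;
}

int main(void)
{
	struct cpuid_bit pausefilter = { 0, 3, 10, 0x8000000a, 0 };
	uint32_t regs[4] = { 0, 0, 0, 1u << 10 };	/* faked CPUID output */

	return has_feature(&pausefilter, regs) ? 0 : 1;
}
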
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 227b0448960d..d22d0c4edcfd 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -86,7 +86,7 @@ static void __init vmware_platform_setup(void)
86} 86}
87 87
88/* 88/*
89 * While checking the dmi string infomation, just checking the product 89 * While checking the dmi string information, just checking the product
90 * serial key should be enough, as this will always have a VMware 90 * serial key should be enough, as this will always have a VMware
91 * specific string when running under VMware hypervisor. 91 * specific string when running under VMware hypervisor.
92 */ 92 */
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 1b7b31ab7d86..212a6a42527c 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -33,7 +33,6 @@
33#include <linux/init.h> 33#include <linux/init.h>
34#include <linux/poll.h> 34#include <linux/poll.h>
35#include <linux/smp.h> 35#include <linux/smp.h>
36#include <linux/smp_lock.h>
37#include <linux/major.h> 36#include <linux/major.h>
38#include <linux/fs.h> 37#include <linux/fs.h>
39#include <linux/device.h> 38#include <linux/device.h>
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index 67414550c3cc..642f75a68cd5 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -14,9 +14,6 @@
14 14
15static void *kdump_buf_page; 15static void *kdump_buf_page;
16 16
17/* Stores the physical address of elf header of crash image. */
18unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
19
20static inline bool is_crashed_pfn_valid(unsigned long pfn) 17static inline bool is_crashed_pfn_valid(unsigned long pfn)
21{ 18{
22#ifndef CONFIG_X86_PAE 19#ifndef CONFIG_X86_PAE
@@ -61,7 +58,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
61 if (!is_crashed_pfn_valid(pfn)) 58 if (!is_crashed_pfn_valid(pfn))
62 return -EFAULT; 59 return -EFAULT;
63 60
64 vaddr = kmap_atomic_pfn(pfn, KM_PTE0); 61 vaddr = kmap_atomic_pfn(pfn);
65 62
66 if (!userbuf) { 63 if (!userbuf) {
67 memcpy(buf, (vaddr + offset), csize); 64 memcpy(buf, (vaddr + offset), csize);
diff --git a/arch/x86/kernel/crash_dump_64.c b/arch/x86/kernel/crash_dump_64.c
index 045b36cada65..afa64adb75ee 100644
--- a/arch/x86/kernel/crash_dump_64.c
+++ b/arch/x86/kernel/crash_dump_64.c
@@ -10,9 +10,6 @@
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/io.h> 11#include <linux/io.h>
12 12
13/* Stores the physical address of elf header of crash image. */
14unsigned long long elfcorehdr_addr = ELFCORE_ADDR_MAX;
15
16/** 13/**
17 * copy_oldmem_page - copy one page from "oldmem" 14 * copy_oldmem_page - copy one page from "oldmem"
18 * @pfn: page frame number to be copied 15 * @pfn: page frame number to be copied
@@ -34,7 +31,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
34 if (!csize) 31 if (!csize)
35 return 0; 32 return 0;
36 33
37 vaddr = ioremap(pfn << PAGE_SHIFT, PAGE_SIZE); 34 vaddr = ioremap_cache(pfn << PAGE_SHIFT, PAGE_SIZE);
38 if (!vaddr) 35 if (!vaddr)
39 return -ENOMEM; 36 return -ENOMEM;
40 37
@@ -46,6 +43,7 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf,
46 } else 43 } else
47 memcpy(buf, vaddr + offset, csize); 44 memcpy(buf, vaddr + offset, csize);
48 45
46 set_iounmap_nonlazy();
49 iounmap(vaddr); 47 iounmap(vaddr);
50 return csize; 48 return csize;
51} 49}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
new file mode 100644
index 000000000000..9aeb78a23de4
--- /dev/null
+++ b/arch/x86/kernel/devicetree.c
@@ -0,0 +1,452 @@
1/*
2 * Architecture specific OF callbacks.
3 */
4#include <linux/bootmem.h>
5#include <linux/io.h>
6#include <linux/interrupt.h>
7#include <linux/list.h>
8#include <linux/of.h>
9#include <linux/of_fdt.h>
10#include <linux/of_address.h>
11#include <linux/of_platform.h>
12#include <linux/of_irq.h>
13#include <linux/slab.h>
14#include <linux/pci.h>
15#include <linux/of_pci.h>
16#include <linux/initrd.h>
17
18#include <asm/hpet.h>
19#include <asm/irq_controller.h>
20#include <asm/apic.h>
21#include <asm/pci_x86.h>
22
23__initdata u64 initial_dtb;
24char __initdata cmd_line[COMMAND_LINE_SIZE];
25static LIST_HEAD(irq_domains);
26static DEFINE_RAW_SPINLOCK(big_irq_lock);
27
28int __initdata of_ioapic;
29
30#ifdef CONFIG_X86_IO_APIC
31static void add_interrupt_host(struct irq_domain *ih)
32{
33 unsigned long flags;
34
35 raw_spin_lock_irqsave(&big_irq_lock, flags);
36 list_add(&ih->l, &irq_domains);
37 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
38}
39#endif
40
41static struct irq_domain *get_ih_from_node(struct device_node *controller)
42{
43 struct irq_domain *ih, *found = NULL;
44 unsigned long flags;
45
46 raw_spin_lock_irqsave(&big_irq_lock, flags);
47 list_for_each_entry(ih, &irq_domains, l) {
48 if (ih->controller == controller) {
49 found = ih;
50 break;
51 }
52 }
53 raw_spin_unlock_irqrestore(&big_irq_lock, flags);
54 return found;
55}
56
57unsigned int irq_create_of_mapping(struct device_node *controller,
58 const u32 *intspec, unsigned int intsize)
59{
60 struct irq_domain *ih;
61 u32 virq, type;
62 int ret;
63
64 ih = get_ih_from_node(controller);
65 if (!ih)
66 return 0;
67 ret = ih->xlate(ih, intspec, intsize, &virq, &type);
68 if (ret)
69 return 0;
70 if (type == IRQ_TYPE_NONE)
71 return virq;
72 irq_set_irq_type(virq, type);
73 return virq;
74}
75EXPORT_SYMBOL_GPL(irq_create_of_mapping);
76
77unsigned long pci_address_to_pio(phys_addr_t address)
78{
79 /*
80 * The ioport address can be directly used by inX / outX
81 */
82 BUG_ON(address >= (1 << 16));
83 return (unsigned long)address;
84}
85EXPORT_SYMBOL_GPL(pci_address_to_pio);
86
87void __init early_init_dt_scan_chosen_arch(unsigned long node)
88{
89 BUG();
90}
91
92void __init early_init_dt_add_memory_arch(u64 base, u64 size)
93{
94 BUG();
95}
96
97void * __init early_init_dt_alloc_memory_arch(u64 size, u64 align)
98{
99 return __alloc_bootmem(size, align, __pa(MAX_DMA_ADDRESS));
100}
101
102#ifdef CONFIG_BLK_DEV_INITRD
103void __init early_init_dt_setup_initrd_arch(unsigned long start,
104 unsigned long end)
105{
106 initrd_start = (unsigned long)__va(start);
107 initrd_end = (unsigned long)__va(end);
108 initrd_below_start_ok = 1;
109}
110#endif
111
112void __init add_dtb(u64 data)
113{
114 initial_dtb = data + offsetof(struct setup_data, data);
115}
116
117/*
118 * CE4100 ids. Will be moved to machine_device_initcall() once we have it.
119 */
120static struct of_device_id __initdata ce4100_ids[] = {
121 { .compatible = "intel,ce4100-cp", },
122 { .compatible = "isa", },
123 { .compatible = "pci", },
124 {},
125};
126
127static int __init add_bus_probe(void)
128{
129 if (!of_have_populated_dt())
130 return 0;
131
132 return of_platform_bus_probe(NULL, ce4100_ids, NULL);
133}
134module_init(add_bus_probe);
135
136#ifdef CONFIG_PCI
137static int x86_of_pci_irq_enable(struct pci_dev *dev)
138{
139 struct of_irq oirq;
140 u32 virq;
141 int ret;
142 u8 pin;
143
144 ret = pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
145 if (ret)
146 return ret;
147 if (!pin)
148 return 0;
149
150 ret = of_irq_map_pci(dev, &oirq);
151 if (ret)
152 return ret;
153
154 virq = irq_create_of_mapping(oirq.controller, oirq.specifier,
155 oirq.size);
156 if (virq == 0)
157 return -EINVAL;
158 dev->irq = virq;
159 return 0;
160}
161
162static void x86_of_pci_irq_disable(struct pci_dev *dev)
163{
164}
165
166void __cpuinit x86_of_pci_init(void)
167{
168 struct device_node *np;
169
170 pcibios_enable_irq = x86_of_pci_irq_enable;
171 pcibios_disable_irq = x86_of_pci_irq_disable;
172
173 for_each_node_by_type(np, "pci") {
174 const void *prop;
175 struct pci_bus *bus;
176 unsigned int bus_min;
177 struct device_node *child;
178
179 prop = of_get_property(np, "bus-range", NULL);
180 if (!prop)
181 continue;
182 bus_min = be32_to_cpup(prop);
183
184 bus = pci_find_bus(0, bus_min);
185 if (!bus) {
186 printk(KERN_ERR "Can't find a node for bus %s.\n",
187 np->full_name);
188 continue;
189 }
190
191 if (bus->self)
192 bus->self->dev.of_node = np;
193 else
194 bus->dev.of_node = np;
195
196 for_each_child_of_node(np, child) {
197 struct pci_dev *dev;
198 u32 devfn;
199
200 prop = of_get_property(child, "reg", NULL);
201 if (!prop)
202 continue;
203
204 devfn = (be32_to_cpup(prop) >> 8) & 0xff;
205 dev = pci_get_slot(bus, devfn);
206 if (!dev)
207 continue;
208 dev->dev.of_node = child;
209 pci_dev_put(dev);
210 }
211 }
212}
213#endif
214
215static void __init dtb_setup_hpet(void)
216{
217#ifdef CONFIG_HPET_TIMER
218 struct device_node *dn;
219 struct resource r;
220 int ret;
221
222 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-hpet");
223 if (!dn)
224 return;
225 ret = of_address_to_resource(dn, 0, &r);
226 if (ret) {
227 WARN_ON(1);
228 return;
229 }
230 hpet_address = r.start;
231#endif
232}
233
234static void __init dtb_lapic_setup(void)
235{
236#ifdef CONFIG_X86_LOCAL_APIC
237 struct device_node *dn;
238 struct resource r;
239 int ret;
240
241 dn = of_find_compatible_node(NULL, NULL, "intel,ce4100-lapic");
242 if (!dn)
243 return;
244
245 ret = of_address_to_resource(dn, 0, &r);
246 if (WARN_ON(ret))
247 return;
248
249 /* Did the boot loader setup the local APIC ? */
250 if (!cpu_has_apic) {
251 if (apic_force_enable(r.start))
252 return;
253 }
254 smp_found_config = 1;
255 pic_mode = 1;
256 register_lapic_address(r.start);
257 generic_processor_info(boot_cpu_physical_apicid,
258 GET_APIC_VERSION(apic_read(APIC_LVR)));
259#endif
260}
261
262#ifdef CONFIG_X86_IO_APIC
263static unsigned int ioapic_id;
264
265static void __init dtb_add_ioapic(struct device_node *dn)
266{
267 struct resource r;
268 int ret;
269
270 ret = of_address_to_resource(dn, 0, &r);
271 if (ret) {
272 printk(KERN_ERR "Can't obtain address from node %s.\n",
273 dn->full_name);
274 return;
275 }
276 mp_register_ioapic(++ioapic_id, r.start, gsi_top);
277}
278
279static void __init dtb_ioapic_setup(void)
280{
281 struct device_node *dn;
282
283 for_each_compatible_node(dn, NULL, "intel,ce4100-ioapic")
284 dtb_add_ioapic(dn);
285
286 if (nr_ioapics) {
287 of_ioapic = 1;
288 return;
289 }
290 printk(KERN_ERR "Error: No information about IO-APIC in OF.\n");
291}
292#else
293static void __init dtb_ioapic_setup(void) {}
294#endif
295
296static void __init dtb_apic_setup(void)
297{
298 dtb_lapic_setup();
299 dtb_ioapic_setup();
300}
301
302#ifdef CONFIG_OF_FLATTREE
303static void __init x86_flattree_get_config(void)
304{
305 u32 size, map_len;
306 void *new_dtb;
307
308 if (!initial_dtb)
309 return;
310
311 map_len = max(PAGE_SIZE - (initial_dtb & ~PAGE_MASK),
312 (u64)sizeof(struct boot_param_header));
313
314 initial_boot_params = early_memremap(initial_dtb, map_len);
315 size = be32_to_cpu(initial_boot_params->totalsize);
316 if (map_len < size) {
317 early_iounmap(initial_boot_params, map_len);
318 initial_boot_params = early_memremap(initial_dtb, size);
319 map_len = size;
320 }
321
322 new_dtb = alloc_bootmem(size);
323 memcpy(new_dtb, initial_boot_params, size);
324 early_iounmap(initial_boot_params, map_len);
325
326 initial_boot_params = new_dtb;
327
328 /* root level address cells */
329 of_scan_flat_dt(early_init_dt_scan_root, NULL);
330
331 unflatten_device_tree();
332}
333#else
334static inline void x86_flattree_get_config(void) { }
335#endif
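
x86_flattree_get_config() above maps the blob twice because only the header's big-endian totalsize field says how large the tree really is: it maps at least a boot_param_header, reads the size, remaps the full blob if needed, and finally copies it into bootmem. A userspace-flavoured sketch of the same pattern, with the early mapping reduced to a plain pointer (the struct and function names here are illustrative):

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <arpa/inet.h>          /* ntohl(): FDT header fields are big-endian */

struct fdt_header_min {
	uint32_t magic;         /* 0xd00dfeed */
	uint32_t totalsize;     /* size of the whole blob in bytes */
};

/* Copy a flattened device tree out of a window large enough for the whole
 * blob; returns the blob size, or 0 if it is not a valid FDT or does not fit. */
static size_t copy_fdt(const void *window, void *dst, size_t dst_len)
{
	const struct fdt_header_min *hdr = window;
	size_t size;

	if (ntohl(hdr->magic) != 0xd00dfeed)
		return 0;
	size = ntohl(hdr->totalsize);
	if (size > dst_len)
		return 0;
	memcpy(dst, window, size);
	return size;
}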
336
337void __init x86_dtb_init(void)
338{
339 x86_flattree_get_config();
340
341 if (!of_have_populated_dt())
342 return;
343
344 dtb_setup_hpet();
345 dtb_apic_setup();
346}
347
348#ifdef CONFIG_X86_IO_APIC
349
350struct of_ioapic_type {
351 u32 out_type;
352 u32 trigger;
353 u32 polarity;
354};
355
356static struct of_ioapic_type of_ioapic_type[] =
357{
358 {
359 .out_type = IRQ_TYPE_EDGE_RISING,
360 .trigger = IOAPIC_EDGE,
361 .polarity = 1,
362 },
363 {
364 .out_type = IRQ_TYPE_LEVEL_LOW,
365 .trigger = IOAPIC_LEVEL,
366 .polarity = 0,
367 },
368 {
369 .out_type = IRQ_TYPE_LEVEL_HIGH,
370 .trigger = IOAPIC_LEVEL,
371 .polarity = 1,
372 },
373 {
374 .out_type = IRQ_TYPE_EDGE_FALLING,
375 .trigger = IOAPIC_EDGE,
376 .polarity = 0,
377 },
378};
379
380static int ioapic_xlate(struct irq_domain *id, const u32 *intspec, u32 intsize,
381 u32 *out_hwirq, u32 *out_type)
382{
383 struct mp_ioapic_gsi *gsi_cfg;
384 struct io_apic_irq_attr attr;
385 struct of_ioapic_type *it;
386 u32 line, idx, type;
387
388 if (intsize < 2)
389 return -EINVAL;
390
391 line = *intspec;
392 idx = (u32) id->priv;
393 gsi_cfg = mp_ioapic_gsi_routing(idx);
394 *out_hwirq = line + gsi_cfg->gsi_base;
395
396 intspec++;
397 type = *intspec;
398
399 if (type >= ARRAY_SIZE(of_ioapic_type))
400 return -EINVAL;
401
402 it = of_ioapic_type + type;
403 *out_type = it->out_type;
404
405 set_io_apic_irq_attr(&attr, idx, line, it->trigger, it->polarity);
406
407 return io_apic_setup_irq_pin_once(*out_hwirq, cpu_to_node(0), &attr);
408}
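
ioapic_xlate() consumes a two-cell specifier: the first cell is the pin on this IO-APIC and the second indexes the of_ioapic_type table above, so (with the first IO-APIC at gsi_base 0) a specifier of <5 2> means pin 5, IRQ_TYPE_LEVEL_HIGH, level-triggered, active-high, hwirq 5. A hedged restatement of just that arithmetic:

/* Illustration only: the translation ioapic_xlate() performs for <pin type>,
 * reusing the of_ioapic_type table defined above. */
static int xlate_cells(u32 pin, u32 type, u32 gsi_base,
		       u32 *out_hwirq, u32 *out_type)
{
	if (type >= ARRAY_SIZE(of_ioapic_type))
		return -EINVAL;

	*out_hwirq = pin + gsi_base;                 /* global system interrupt */
	*out_type  = of_ioapic_type[type].out_type;  /* e.g. IRQ_TYPE_LEVEL_HIGH */
	return 0;
}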
409
410static void __init ioapic_add_ofnode(struct device_node *np)
411{
412 struct resource r;
413 int i, ret;
414
415 ret = of_address_to_resource(np, 0, &r);
416 if (ret) {
417 printk(KERN_ERR "Failed to obtain address for %s\n",
418 np->full_name);
419 return;
420 }
421
422 for (i = 0; i < nr_ioapics; i++) {
423 if (r.start == mpc_ioapic_addr(i)) {
424 struct irq_domain *id;
425
426 id = kzalloc(sizeof(*id), GFP_KERNEL);
427 BUG_ON(!id);
428 id->controller = np;
429 id->xlate = ioapic_xlate;
430 id->priv = (void *)i;
431 add_interrupt_host(id);
432 return;
433 }
434 }
435 printk(KERN_ERR "IOxAPIC at %s is not registered.\n", np->full_name);
436}
437
438void __init x86_add_irq_domains(void)
439{
440 struct device_node *dp;
441
442 if (!of_have_populated_dt())
443 return;
444
445 for_each_node_with_property(dp, "interrupt-controller") {
446 if (of_device_is_compatible(dp, "intel,ce4100-ioapic"))
447 ioapic_add_ofnode(dp);
448 }
449}
450#else
451void __init x86_add_irq_domains(void) { }
452#endif
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6e8752c1bd52..1aae78f775fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -27,7 +27,7 @@ static int die_counter;
27 27
28void printk_address(unsigned long address, int reliable) 28void printk_address(unsigned long address, int reliable)
29{ 29{
30 printk(" [<%p>] %s%pS\n", (void *) address, 30 printk(" [<%p>] %s%pB\n", (void *) address,
31 reliable ? "" : "? ", (void *) address); 31 reliable ? "" : "? ", (void *) address);
32} 32}
33 33
@@ -135,20 +135,6 @@ print_context_stack_bp(struct thread_info *tinfo,
135} 135}
136EXPORT_SYMBOL_GPL(print_context_stack_bp); 136EXPORT_SYMBOL_GPL(print_context_stack_bp);
137 137
138
139static void
140print_trace_warning_symbol(void *data, char *msg, unsigned long symbol)
141{
142 printk(data);
143 print_symbol(msg, symbol);
144 printk("\n");
145}
146
147static void print_trace_warning(void *data, char *msg)
148{
149 printk("%s%s\n", (char *)data, msg);
150}
151
152static int print_trace_stack(void *data, char *name) 138static int print_trace_stack(void *data, char *name)
153{ 139{
154 printk("%s <%s> ", (char *)data, name); 140 printk("%s <%s> ", (char *)data, name);
@@ -166,8 +152,6 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
166} 152}
167 153
168static const struct stacktrace_ops print_trace_ops = { 154static const struct stacktrace_ops print_trace_ops = {
169 .warning = print_trace_warning,
170 .warning_symbol = print_trace_warning_symbol,
171 .stack = print_trace_stack, 155 .stack = print_trace_stack,
172 .address = print_trace_address, 156 .address = print_trace_address,
173 .walk_stack = print_context_stack, 157 .walk_stack = print_context_stack,
@@ -197,14 +181,10 @@ void show_stack(struct task_struct *task, unsigned long *sp)
197 */ 181 */
198void dump_stack(void) 182void dump_stack(void)
199{ 183{
200 unsigned long bp = 0; 184 unsigned long bp;
201 unsigned long stack; 185 unsigned long stack;
202 186
203#ifdef CONFIG_FRAME_POINTER 187 bp = stack_frame(current, NULL);
204 if (!bp)
205 get_bp(bp);
206#endif
207
208 printk("Pid: %d, comm: %.20s %s %s %.*s\n", 188 printk("Pid: %d, comm: %.20s %s %s %.*s\n",
209 current->pid, current->comm, print_tainted(), 189 current->pid, current->comm, print_tainted(),
210 init_utsname()->release, 190 init_utsname()->release,
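
All three dump_stack()/dump_trace() hunks replace the open-coded CONFIG_FRAME_POINTER block with a single stack_frame(task, regs) call. The helper itself is introduced elsewhere in this series; the sketch below shows the behaviour these call sites rely on and is an assumption, not a quote of that header:

/* Assumed behaviour of stack_frame(): return the frame pointer from saved
 * pt_regs if available, from the running CPU for the current task, or from
 * the word switch_to() left at the top of a sleeping task's stack; return 0
 * when frame pointers are not configured. */
#ifdef CONFIG_FRAME_POINTER
static inline unsigned long stack_frame(struct task_struct *task,
					struct pt_regs *regs)
{
	unsigned long bp;

	if (regs)
		return regs->bp;
	if (task == current) {
		get_bp(bp);			/* read %ebp / %rbp */
		return bp;
	}
	/* bp is the last register pushed by switch_to() */
	return *(unsigned long *)task->thread.sp;
}
#else
static inline unsigned long stack_frame(struct task_struct *task,
					struct pt_regs *regs)
{
	return 0;
}
#endif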
@@ -240,6 +220,7 @@ unsigned __kprobes long oops_begin(void)
240 bust_spinlocks(1); 220 bust_spinlocks(1);
241 return flags; 221 return flags;
242} 222}
223EXPORT_SYMBOL_GPL(oops_begin);
243 224
244void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr) 225void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
245{ 226{
@@ -282,7 +263,6 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
282 printk("DEBUG_PAGEALLOC"); 263 printk("DEBUG_PAGEALLOC");
283#endif 264#endif
284 printk("\n"); 265 printk("\n");
285 sysfs_printk_last_file();
286 if (notify_die(DIE_OOPS, str, regs, err, 266 if (notify_die(DIE_OOPS, str, regs, err,
287 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP) 267 current->thread.trap_no, SIGSEGV) == NOTIFY_STOP)
288 return 1; 268 return 1;
@@ -325,41 +305,6 @@ void die(const char *str, struct pt_regs *regs, long err)
325 oops_end(flags, regs, sig); 305 oops_end(flags, regs, sig);
326} 306}
327 307
328void notrace __kprobes
329die_nmi(char *str, struct pt_regs *regs, int do_panic)
330{
331 unsigned long flags;
332
333 if (notify_die(DIE_NMIWATCHDOG, str, regs, 0, 2, SIGINT) == NOTIFY_STOP)
334 return;
335
336 /*
337 * We are in trouble anyway, lets at least try
338 * to get a message out.
339 */
340 flags = oops_begin();
341 printk(KERN_EMERG "%s", str);
342 printk(" on CPU%d, ip %08lx, registers:\n",
343 smp_processor_id(), regs->ip);
344 show_registers(regs);
345 oops_end(flags, regs, 0);
346 if (do_panic || panic_on_oops)
347 panic("Non maskable interrupt");
348 nmi_exit();
349 local_irq_enable();
350 do_exit(SIGBUS);
351}
352
353static int __init oops_setup(char *s)
354{
355 if (!s)
356 return -EINVAL;
357 if (!strcmp(s, "panic"))
358 panic_on_oops = 1;
359 return 0;
360}
361early_param("oops", oops_setup);
362
363static int __init kstack_setup(char *s) 308static int __init kstack_setup(char *s)
364{ 309{
365 if (!s) 310 if (!s)
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 0f6376ffa2d9..3b97a80ce329 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -34,17 +34,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
34 stack = (unsigned long *)task->thread.sp; 34 stack = (unsigned long *)task->thread.sp;
35 } 35 }
36 36
37#ifdef CONFIG_FRAME_POINTER 37 if (!bp)
38 if (!bp) { 38 bp = stack_frame(task, regs);
39 if (task == current) {
40 /* Grab bp right from our regs */
41 get_bp(bp);
42 } else {
43 /* bp is the last reg pushed by switch_to */
44 bp = *(unsigned long *) task->thread.sp;
45 }
46 }
47#endif
48 39
49 for (;;) { 40 for (;;) {
50 struct thread_info *context; 41 struct thread_info *context;
@@ -82,11 +73,11 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
82 if (kstack_end(stack)) 73 if (kstack_end(stack))
83 break; 74 break;
84 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 75 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
85 printk("\n%s", log_lvl); 76 printk(KERN_CONT "\n");
86 printk(" %08lx", *stack++); 77 printk(KERN_CONT " %08lx", *stack++);
87 touch_nmi_watchdog(); 78 touch_nmi_watchdog();
88 } 79 }
89 printk("\n"); 80 printk(KERN_CONT "\n");
90 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 81 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
91} 82}
92 83
@@ -112,8 +103,7 @@ void show_registers(struct pt_regs *regs)
112 u8 *ip; 103 u8 *ip;
113 104
114 printk(KERN_EMERG "Stack:\n"); 105 printk(KERN_EMERG "Stack:\n");
115 show_stack_log_lvl(NULL, regs, &regs->sp, 106 show_stack_log_lvl(NULL, regs, &regs->sp, 0, KERN_EMERG);
116 0, KERN_EMERG);
117 107
118 printk(KERN_EMERG "Code: "); 108 printk(KERN_EMERG "Code: ");
119 109
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index 57a21f11c791..e71c98d3c0d2 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -149,29 +149,19 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
149 unsigned used = 0; 149 unsigned used = 0;
150 struct thread_info *tinfo; 150 struct thread_info *tinfo;
151 int graph = 0; 151 int graph = 0;
152 unsigned long dummy;
152 153
153 if (!task) 154 if (!task)
154 task = current; 155 task = current;
155 156
156 if (!stack) { 157 if (!stack) {
157 unsigned long dummy;
158 stack = &dummy; 158 stack = &dummy;
159 if (task && task != current) 159 if (task && task != current)
160 stack = (unsigned long *)task->thread.sp; 160 stack = (unsigned long *)task->thread.sp;
161 } 161 }
162 162
163#ifdef CONFIG_FRAME_POINTER 163 if (!bp)
164 if (!bp) { 164 bp = stack_frame(task, regs);
165 if (task == current) {
166 /* Grab bp right from our regs */
167 get_bp(bp);
168 } else {
169 /* bp is the last reg pushed by switch_to */
170 bp = *(unsigned long *) task->thread.sp;
171 }
172 }
173#endif
174
175 /* 165 /*
176 * Print function call entries in all stacks, starting at the 166 * Print function call entries in all stacks, starting at the
177 * current stack address. If the stacks consist of nested 167 * current stack address. If the stacks consist of nested
@@ -265,20 +255,20 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
265 if (stack >= irq_stack && stack <= irq_stack_end) { 255 if (stack >= irq_stack && stack <= irq_stack_end) {
266 if (stack == irq_stack_end) { 256 if (stack == irq_stack_end) {
267 stack = (unsigned long *) (irq_stack_end[-1]); 257 stack = (unsigned long *) (irq_stack_end[-1]);
268 printk(" <EOI> "); 258 printk(KERN_CONT " <EOI> ");
269 } 259 }
270 } else { 260 } else {
271 if (((long) stack & (THREAD_SIZE-1)) == 0) 261 if (((long) stack & (THREAD_SIZE-1)) == 0)
272 break; 262 break;
273 } 263 }
274 if (i && ((i % STACKSLOTS_PER_LINE) == 0)) 264 if (i && ((i % STACKSLOTS_PER_LINE) == 0))
275 printk("\n%s", log_lvl); 265 printk(KERN_CONT "\n");
276 printk(" %016lx", *stack++); 266 printk(KERN_CONT " %016lx", *stack++);
277 touch_nmi_watchdog(); 267 touch_nmi_watchdog();
278 } 268 }
279 preempt_enable(); 269 preempt_enable();
280 270
281 printk("\n"); 271 printk(KERN_CONT "\n");
282 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 272 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
283} 273}
284 274
@@ -308,7 +298,7 @@ void show_registers(struct pt_regs *regs)
308 298
309 printk(KERN_EMERG "Stack:\n"); 299 printk(KERN_EMERG "Stack:\n");
310 show_stack_log_lvl(NULL, regs, (unsigned long *)sp, 300 show_stack_log_lvl(NULL, regs, (unsigned long *)sp,
311 regs->bp, KERN_EMERG); 301 0, KERN_EMERG);
312 302
313 printk(KERN_EMERG "Code: "); 303 printk(KERN_EMERG "Code: ");
314 304
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 0d6fc71bedb1..3e2ef8425316 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -11,10 +11,13 @@
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/crash_dump.h>
14#include <linux/bootmem.h> 15#include <linux/bootmem.h>
15#include <linux/pfn.h> 16#include <linux/pfn.h>
16#include <linux/suspend.h> 17#include <linux/suspend.h>
18#include <linux/acpi.h>
17#include <linux/firmware-map.h> 19#include <linux/firmware-map.h>
20#include <linux/memblock.h>
18 21
19#include <asm/e820.h> 22#include <asm/e820.h>
20#include <asm/proto.h> 23#include <asm/proto.h>
@@ -665,21 +668,15 @@ __init void e820_setup_gap(void)
665 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of 668 * boot_params.e820_map, others are passed via SETUP_E820_EXT node of
666 * linked list of struct setup_data, which is parsed here. 669 * linked list of struct setup_data, which is parsed here.
667 */ 670 */
668void __init parse_e820_ext(struct setup_data *sdata, unsigned long pa_data) 671void __init parse_e820_ext(struct setup_data *sdata)
669{ 672{
670 u32 map_len;
671 int entries; 673 int entries;
672 struct e820entry *extmap; 674 struct e820entry *extmap;
673 675
674 entries = sdata->len / sizeof(struct e820entry); 676 entries = sdata->len / sizeof(struct e820entry);
675 map_len = sdata->len + sizeof(struct setup_data);
676 if (map_len > PAGE_SIZE)
677 sdata = early_ioremap(pa_data, map_len);
678 extmap = (struct e820entry *)(sdata->data); 677 extmap = (struct e820entry *)(sdata->data);
679 __append_e820_map(extmap, entries); 678 __append_e820_map(extmap, entries);
680 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 679 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
681 if (map_len > PAGE_SIZE)
682 early_iounmap(sdata, map_len);
683 printk(KERN_INFO "extended physical RAM map:\n"); 680 printk(KERN_INFO "extended physical RAM map:\n");
684 e820_print_map("extended"); 681 e820_print_map("extended");
685} 682}
@@ -738,73 +735,7 @@ core_initcall(e820_mark_nvs_memory);
738#endif 735#endif
739 736
740/* 737/*
741 * Find a free area with specified alignment in a specific range. 738 * Pre-allocate 4K and reserve it in memblock and e820_saved
742 */
743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
744{
745 int i;
746
747 for (i = 0; i < e820.nr_map; i++) {
748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
751
752 if (ei->type != E820_RAM)
753 continue;
754
755 ei_last = ei->addr + ei->size;
756 ei_start = ei->addr;
757 addr = find_early_area(ei_start, ei_last, start, end,
758 size, align);
759
760 if (addr != -1ULL)
761 return addr;
762 }
763 return -1ULL;
764}
765
766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
767{
768 return find_e820_area(start, end, size, align);
769}
770
771u64 __init get_max_mapped(void)
772{
773 u64 end = max_pfn_mapped;
774
775 end <<= PAGE_SHIFT;
776
777 return end;
778}
779/*
780 * Find next free range after *start
781 */
782u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
783{
784 int i;
785
786 for (i = 0; i < e820.nr_map; i++) {
787 struct e820entry *ei = &e820.map[i];
788 u64 addr;
789 u64 ei_start, ei_last;
790
791 if (ei->type != E820_RAM)
792 continue;
793
794 ei_last = ei->addr + ei->size;
795 ei_start = ei->addr;
796 addr = find_early_area_size(ei_start, ei_last, start,
797 sizep, align);
798
799 if (addr != -1ULL)
800 return addr;
801 }
802
803 return -1ULL;
804}
805
806/*
807 * pre allocated 4k and reserved it in e820
808 */ 739 */
809u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) 740u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
810{ 741{
@@ -813,8 +744,8 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
813 u64 start; 744 u64 start;
814 745
815 for (start = startt; ; start += size) { 746 for (start = startt; ; start += size) {
816 start = find_e820_area_size(start, &size, align); 747 start = memblock_x86_find_in_range_size(start, &size, align);
817 if (!(start + 1)) 748 if (start == MEMBLOCK_ERROR)
818 return 0; 749 return 0;
819 if (size >= sizet) 750 if (size >= sizet)
820 break; 751 break;
@@ -830,10 +761,9 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align)
830 addr = round_down(start + size - sizet, align); 761 addr = round_down(start + size - sizet, align);
831 if (addr < start) 762 if (addr < start)
832 return 0; 763 return 0;
833 e820_update_range(addr, sizet, E820_RAM, E820_RESERVED); 764 memblock_x86_reserve_range(addr, addr + sizet, "new next");
834 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); 765 e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED);
835 printk(KERN_INFO "update e820 for early_reserve_e820\n"); 766 printk(KERN_INFO "update e820_saved for early_reserve_e820\n");
836 update_e820();
837 update_e820_saved(); 767 update_e820_saved();
838 768
839 return addr; 769 return addr;
@@ -895,74 +825,6 @@ unsigned long __init e820_end_of_low_ram_pfn(void)
895{ 825{
896 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM); 826 return e820_end_pfn(1UL<<(32 - PAGE_SHIFT), E820_RAM);
897} 827}
898/*
899 * Finds an active region in the address range from start_pfn to last_pfn and
900 * returns its range in ei_startpfn and ei_endpfn for the e820 entry.
901 */
902int __init e820_find_active_region(const struct e820entry *ei,
903 unsigned long start_pfn,
904 unsigned long last_pfn,
905 unsigned long *ei_startpfn,
906 unsigned long *ei_endpfn)
907{
908 u64 align = PAGE_SIZE;
909
910 *ei_startpfn = round_up(ei->addr, align) >> PAGE_SHIFT;
911 *ei_endpfn = round_down(ei->addr + ei->size, align) >> PAGE_SHIFT;
912
913 /* Skip map entries smaller than a page */
914 if (*ei_startpfn >= *ei_endpfn)
915 return 0;
916
917 /* Skip if map is outside the node */
918 if (ei->type != E820_RAM || *ei_endpfn <= start_pfn ||
919 *ei_startpfn >= last_pfn)
920 return 0;
921
922 /* Check for overlaps */
923 if (*ei_startpfn < start_pfn)
924 *ei_startpfn = start_pfn;
925 if (*ei_endpfn > last_pfn)
926 *ei_endpfn = last_pfn;
927
928 return 1;
929}
930
931/* Walk the e820 map and register active regions within a node */
932void __init e820_register_active_regions(int nid, unsigned long start_pfn,
933 unsigned long last_pfn)
934{
935 unsigned long ei_startpfn;
936 unsigned long ei_endpfn;
937 int i;
938
939 for (i = 0; i < e820.nr_map; i++)
940 if (e820_find_active_region(&e820.map[i],
941 start_pfn, last_pfn,
942 &ei_startpfn, &ei_endpfn))
943 add_active_range(nid, ei_startpfn, ei_endpfn);
944}
945
946/*
947 * Find the hole size (in bytes) in the memory range.
948 * @start: starting address of the memory range to scan
949 * @end: ending address of the memory range to scan
950 */
951u64 __init e820_hole_size(u64 start, u64 end)
952{
953 unsigned long start_pfn = start >> PAGE_SHIFT;
954 unsigned long last_pfn = end >> PAGE_SHIFT;
955 unsigned long ei_startpfn, ei_endpfn, ram = 0;
956 int i;
957
958 for (i = 0; i < e820.nr_map; i++) {
959 if (e820_find_active_region(&e820.map[i],
960 start_pfn, last_pfn,
961 &ei_startpfn, &ei_endpfn))
962 ram += ei_endpfn - ei_startpfn;
963 }
964 return end - start - ((u64)ram << PAGE_SHIFT);
965}
966 828
967static void early_panic(char *msg) 829static void early_panic(char *msg)
968{ 830{
@@ -980,15 +842,21 @@ static int __init parse_memopt(char *p)
980 if (!p) 842 if (!p)
981 return -EINVAL; 843 return -EINVAL;
982 844
983#ifdef CONFIG_X86_32
984 if (!strcmp(p, "nopentium")) { 845 if (!strcmp(p, "nopentium")) {
846#ifdef CONFIG_X86_32
985 setup_clear_cpu_cap(X86_FEATURE_PSE); 847 setup_clear_cpu_cap(X86_FEATURE_PSE);
986 return 0; 848 return 0;
987 } 849#else
850 printk(KERN_WARNING "mem=nopentium ignored! (only supported on x86_32)\n");
851 return -EINVAL;
988#endif 852#endif
853 }
989 854
990 userdef = 1; 855 userdef = 1;
991 mem_size = memparse(p, &p); 856 mem_size = memparse(p, &p);
857 /* don't remove all of memory when handling "mem={invalid}" param */
858 if (mem_size == 0)
859 return -EINVAL;
992 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); 860 e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1);
993 861
994 return 0; 862 return 0;
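
The new mem_size == 0 check matters because memparse() returns 0 for an unparseable string, and e820_remove_range(0, ULLONG_MAX, E820_RAM, 1) would then drop every byte of RAM. A minimal sketch of the parsing step, mirroring the hunk above (illustrative only):

#include <linux/kernel.h>
#include <linux/string.h>

/* Illustration only: "mem=512M" -> trim RAM above 512 MiB,
 * "mem=banana" -> parses to 0 and must be rejected. */
static int example_parse_mem(char *p)
{
	unsigned long long mem_size = memparse(p, &p);	/* accepts K/M/G suffixes */

	if (mem_size == 0)
		return -EINVAL;
	/* e820_remove_range(mem_size, ULLONG_MAX - mem_size, E820_RAM, 1); */
	return 0;
}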
@@ -1210,3 +1078,48 @@ void __init setup_memory_map(void)
1210 printk(KERN_INFO "BIOS-provided physical RAM map:\n"); 1078 printk(KERN_INFO "BIOS-provided physical RAM map:\n");
1211 e820_print_map(who); 1079 e820_print_map(who);
1212} 1080}
1081
1082void __init memblock_x86_fill(void)
1083{
1084 int i;
1085 u64 end;
1086
1087 /*
1088 * EFI may have more than 128 entries
1089	 * We are safe to enable resizing, because memblock_x86_fill()
1090	 * is called rather late for x86
1091 */
1092 memblock_can_resize = 1;
1093
1094 for (i = 0; i < e820.nr_map; i++) {
1095 struct e820entry *ei = &e820.map[i];
1096
1097 end = ei->addr + ei->size;
1098 if (end != (resource_size_t)end)
1099 continue;
1100
1101 if (ei->type != E820_RAM && ei->type != E820_RESERVED_KERN)
1102 continue;
1103
1104 memblock_add(ei->addr, ei->size);
1105 }
1106
1107 memblock_analyze();
1108 memblock_dump_all();
1109}
1110
1111void __init memblock_find_dma_reserve(void)
1112{
1113#ifdef CONFIG_X86_64
1114 u64 free_size_pfn;
1115 u64 mem_size_pfn;
1116 /*
1117	 * We need to find out the used area below MAX_DMA_PFN: get the
1118	 * free size in [0, MAX_DMA_PFN] from memblock first, and assume
1119	 * boot memory will not take memory below MAX_DMA_PFN.
1120 */
1121 mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1122 free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT;
1123 set_dma_reserve(mem_size_pfn - free_size_pfn);
1124#endif
1125}
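
With memblock_x86_fill() populating memblock straight from the e820 RAM entries, early reservations follow the find-then-reserve pattern that early_reserve_e820() now uses. A condensed sketch of that pattern, using only the calls visible in this diff (illustrative, not a new API):

/* Illustration only: find a block of at least 'want' bytes at or above
 * 'start' and reserve it, the way early_reserve_e820() does above. */
static u64 example_early_reserve(u64 start, u64 want, u64 align)
{
	u64 size = 0;
	u64 base = memblock_x86_find_in_range_size(start, &size, align);

	if (base == MEMBLOCK_ERROR || size < want)
		return 0;

	memblock_x86_reserve_range(base, base + want, "example");
	return base;
}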
diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c
index ebdb85cf2686..3755ef494390 100644
--- a/arch/x86/kernel/early-quirks.c
+++ b/arch/x86/kernel/early-quirks.c
@@ -97,7 +97,6 @@ static void __init nvidia_bugs(int num, int slot, int func)
97} 97}
98 98
99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) 99#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
100#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC)
101static u32 __init ati_ixp4x0_rev(int num, int slot, int func) 100static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
102{ 101{
103 u32 d; 102 u32 d;
@@ -115,7 +114,6 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func)
115 d &= 0xff; 114 d &= 0xff;
116 return d; 115 return d;
117} 116}
118#endif
119 117
120static void __init ati_bugs(int num, int slot, int func) 118static void __init ati_bugs(int num, int slot, int func)
121{ 119{
@@ -145,15 +143,10 @@ static void __init ati_bugs(int num, int slot, int func)
145 143
146static u32 __init ati_sbx00_rev(int num, int slot, int func) 144static u32 __init ati_sbx00_rev(int num, int slot, int func)
147{ 145{
148 u32 old, d; 146 u32 d;
149 147
150 d = read_pci_config(num, slot, func, 0x70);
151 old = d;
152 d &= ~(1<<8);
153 write_pci_config(num, slot, func, 0x70, d);
154 d = read_pci_config(num, slot, func, 0x8); 148 d = read_pci_config(num, slot, func, 0x8);
155 d &= 0xff; 149 d &= 0xff;
156 write_pci_config(num, slot, func, 0x70, old);
157 150
158 return d; 151 return d;
159} 152}
@@ -162,11 +155,19 @@ static void __init ati_bugs_contd(int num, int slot, int func)
162{ 155{
163 u32 d, rev; 156 u32 d, rev;
164 157
165 if (acpi_use_timer_override) 158 rev = ati_sbx00_rev(num, slot, func);
159 if (rev >= 0x40)
160 acpi_fix_pin2_polarity = 1;
161
162 /*
163 * SB600: revisions 0x11, 0x12, 0x13, 0x14, ...
164 * SB700: revisions 0x39, 0x3a, ...
165 * SB800: revisions 0x40, 0x41, ...
166 */
167 if (rev >= 0x39)
166 return; 168 return;
167 169
168 rev = ati_sbx00_rev(num, slot, func); 170 if (acpi_use_timer_override)
169 if (rev > 0x13)
170 return; 171 return;
171 172
172 /* check for IRQ0 interrupt swap */ 173 /* check for IRQ0 interrupt swap */
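
Reordered this way, the quirk reads the SMBus revision once and branches on it: SB800 parts (rev 0x40 and up) get the pin-2 polarity fix, anything from SB700 on (rev 0x39 and up) skips the legacy IRQ0 override check, and only older SB600 revisions fall through to it, and then only when no timer override is in use. A compact restatement for illustration:

#include <stdbool.h>
#include <stdint.h>

/* Revision ranges from the comment above: SB600 0x11..0x14, SB700 0x39..,
 * SB800 0x40.. (illustration of the gates, not kernel code). */
static bool needs_pin2_polarity_fix(uint32_t rev)
{
	return rev >= 0x40;				/* SB800 and later */
}

static bool check_irq0_override_swap(uint32_t rev, bool timer_override_in_use)
{
	return rev < 0x39 && !timer_override_in_use;	/* SB600 only */
}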
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index fa99bae75ace..cd28a350f7f9 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -14,6 +14,7 @@
14#include <xen/hvc-console.h> 14#include <xen/hvc-console.h>
15#include <asm/pci-direct.h> 15#include <asm/pci-direct.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/mrst.h>
17#include <asm/pgtable.h> 18#include <asm/pgtable.h>
18#include <linux/usb/ehci_def.h> 19#include <linux/usb/ehci_def.h>
19 20
@@ -239,6 +240,17 @@ static int __init setup_early_printk(char *buf)
239 if (!strncmp(buf, "xen", 3)) 240 if (!strncmp(buf, "xen", 3))
240 early_console_register(&xenboot_console, keep); 241 early_console_register(&xenboot_console, keep);
241#endif 242#endif
243#ifdef CONFIG_EARLY_PRINTK_MRST
244 if (!strncmp(buf, "mrst", 4)) {
245 mrst_early_console_init();
246 early_console_register(&early_mrst_console, keep);
247 }
248
249 if (!strncmp(buf, "hsu", 3)) {
250 hsu_early_console_init();
251 early_console_register(&early_hsu_console, keep);
252 }
253#endif
242 buf++; 254 buf++;
243 } 255 }
244 return 0; 256 return 0;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 227d00920d2f..5c1a91974918 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -65,6 +65,8 @@
65#define sysexit_audit syscall_exit_work 65#define sysexit_audit syscall_exit_work
66#endif 66#endif
67 67
68 .section .entry.text, "ax"
69
68/* 70/*
69 * We use macros for low-level operations which need to be overridden 71 * We use macros for low-level operations which need to be overridden
70 * for paravirtualization. The following will never clobber any registers: 72 * for paravirtualization. The following will never clobber any registers:
@@ -115,8 +117,7 @@
115 117
116 /* unfortunately push/pop can't be no-op */ 118 /* unfortunately push/pop can't be no-op */
117.macro PUSH_GS 119.macro PUSH_GS
118 pushl $0 120 pushl_cfi $0
119 CFI_ADJUST_CFA_OFFSET 4
120.endm 121.endm
121.macro POP_GS pop=0 122.macro POP_GS pop=0
122 addl $(4 + \pop), %esp 123 addl $(4 + \pop), %esp
@@ -140,14 +141,12 @@
140#else /* CONFIG_X86_32_LAZY_GS */ 141#else /* CONFIG_X86_32_LAZY_GS */
141 142
142.macro PUSH_GS 143.macro PUSH_GS
143 pushl %gs 144 pushl_cfi %gs
144 CFI_ADJUST_CFA_OFFSET 4
145 /*CFI_REL_OFFSET gs, 0*/ 145 /*CFI_REL_OFFSET gs, 0*/
146.endm 146.endm
147 147
148.macro POP_GS pop=0 148.macro POP_GS pop=0
14998: popl %gs 14998: popl_cfi %gs
150 CFI_ADJUST_CFA_OFFSET -4
151 /*CFI_RESTORE gs*/ 150 /*CFI_RESTORE gs*/
152 .if \pop <> 0 151 .if \pop <> 0
153 add $\pop, %esp 152 add $\pop, %esp
@@ -195,35 +194,25 @@
195.macro SAVE_ALL 194.macro SAVE_ALL
196 cld 195 cld
197 PUSH_GS 196 PUSH_GS
198 pushl %fs 197 pushl_cfi %fs
199 CFI_ADJUST_CFA_OFFSET 4
200 /*CFI_REL_OFFSET fs, 0;*/ 198 /*CFI_REL_OFFSET fs, 0;*/
201 pushl %es 199 pushl_cfi %es
202 CFI_ADJUST_CFA_OFFSET 4
203 /*CFI_REL_OFFSET es, 0;*/ 200 /*CFI_REL_OFFSET es, 0;*/
204 pushl %ds 201 pushl_cfi %ds
205 CFI_ADJUST_CFA_OFFSET 4
206 /*CFI_REL_OFFSET ds, 0;*/ 202 /*CFI_REL_OFFSET ds, 0;*/
207 pushl %eax 203 pushl_cfi %eax
208 CFI_ADJUST_CFA_OFFSET 4
209 CFI_REL_OFFSET eax, 0 204 CFI_REL_OFFSET eax, 0
210 pushl %ebp 205 pushl_cfi %ebp
211 CFI_ADJUST_CFA_OFFSET 4
212 CFI_REL_OFFSET ebp, 0 206 CFI_REL_OFFSET ebp, 0
213 pushl %edi 207 pushl_cfi %edi
214 CFI_ADJUST_CFA_OFFSET 4
215 CFI_REL_OFFSET edi, 0 208 CFI_REL_OFFSET edi, 0
216 pushl %esi 209 pushl_cfi %esi
217 CFI_ADJUST_CFA_OFFSET 4
218 CFI_REL_OFFSET esi, 0 210 CFI_REL_OFFSET esi, 0
219 pushl %edx 211 pushl_cfi %edx
220 CFI_ADJUST_CFA_OFFSET 4
221 CFI_REL_OFFSET edx, 0 212 CFI_REL_OFFSET edx, 0
222 pushl %ecx 213 pushl_cfi %ecx
223 CFI_ADJUST_CFA_OFFSET 4
224 CFI_REL_OFFSET ecx, 0 214 CFI_REL_OFFSET ecx, 0
225 pushl %ebx 215 pushl_cfi %ebx
226 CFI_ADJUST_CFA_OFFSET 4
227 CFI_REL_OFFSET ebx, 0 216 CFI_REL_OFFSET ebx, 0
228 movl $(__USER_DS), %edx 217 movl $(__USER_DS), %edx
229 movl %edx, %ds 218 movl %edx, %ds
@@ -234,39 +223,29 @@
234.endm 223.endm
235 224
236.macro RESTORE_INT_REGS 225.macro RESTORE_INT_REGS
237 popl %ebx 226 popl_cfi %ebx
238 CFI_ADJUST_CFA_OFFSET -4
239 CFI_RESTORE ebx 227 CFI_RESTORE ebx
240 popl %ecx 228 popl_cfi %ecx
241 CFI_ADJUST_CFA_OFFSET -4
242 CFI_RESTORE ecx 229 CFI_RESTORE ecx
243 popl %edx 230 popl_cfi %edx
244 CFI_ADJUST_CFA_OFFSET -4
245 CFI_RESTORE edx 231 CFI_RESTORE edx
246 popl %esi 232 popl_cfi %esi
247 CFI_ADJUST_CFA_OFFSET -4
248 CFI_RESTORE esi 233 CFI_RESTORE esi
249 popl %edi 234 popl_cfi %edi
250 CFI_ADJUST_CFA_OFFSET -4
251 CFI_RESTORE edi 235 CFI_RESTORE edi
252 popl %ebp 236 popl_cfi %ebp
253 CFI_ADJUST_CFA_OFFSET -4
254 CFI_RESTORE ebp 237 CFI_RESTORE ebp
255 popl %eax 238 popl_cfi %eax
256 CFI_ADJUST_CFA_OFFSET -4
257 CFI_RESTORE eax 239 CFI_RESTORE eax
258.endm 240.endm
259 241
260.macro RESTORE_REGS pop=0 242.macro RESTORE_REGS pop=0
261 RESTORE_INT_REGS 243 RESTORE_INT_REGS
2621: popl %ds 2441: popl_cfi %ds
263 CFI_ADJUST_CFA_OFFSET -4
264 /*CFI_RESTORE ds;*/ 245 /*CFI_RESTORE ds;*/
2652: popl %es 2462: popl_cfi %es
266 CFI_ADJUST_CFA_OFFSET -4
267 /*CFI_RESTORE es;*/ 247 /*CFI_RESTORE es;*/
2683: popl %fs 2483: popl_cfi %fs
269 CFI_ADJUST_CFA_OFFSET -4
270 /*CFI_RESTORE fs;*/ 249 /*CFI_RESTORE fs;*/
271 POP_GS \pop 250 POP_GS \pop
272.pushsection .fixup, "ax" 251.pushsection .fixup, "ax"
@@ -320,16 +299,12 @@
320 299
321ENTRY(ret_from_fork) 300ENTRY(ret_from_fork)
322 CFI_STARTPROC 301 CFI_STARTPROC
323 pushl %eax 302 pushl_cfi %eax
324 CFI_ADJUST_CFA_OFFSET 4
325 call schedule_tail 303 call schedule_tail
326 GET_THREAD_INFO(%ebp) 304 GET_THREAD_INFO(%ebp)
327 popl %eax 305 popl_cfi %eax
328 CFI_ADJUST_CFA_OFFSET -4 306 pushl_cfi $0x0202 # Reset kernel eflags
329 pushl $0x0202 # Reset kernel eflags 307 popfl_cfi
330 CFI_ADJUST_CFA_OFFSET 4
331 popfl
332 CFI_ADJUST_CFA_OFFSET -4
333 jmp syscall_exit 308 jmp syscall_exit
334 CFI_ENDPROC 309 CFI_ENDPROC
335END(ret_from_fork) 310END(ret_from_fork)
@@ -409,29 +384,23 @@ sysenter_past_esp:
409 * enough kernel state to call TRACE_IRQS_OFF can be called - but 384 * enough kernel state to call TRACE_IRQS_OFF can be called - but
410 * we immediately enable interrupts at that point anyway. 385 * we immediately enable interrupts at that point anyway.
411 */ 386 */
412 pushl $(__USER_DS) 387 pushl_cfi $__USER_DS
413 CFI_ADJUST_CFA_OFFSET 4
414 /*CFI_REL_OFFSET ss, 0*/ 388 /*CFI_REL_OFFSET ss, 0*/
415 pushl %ebp 389 pushl_cfi %ebp
416 CFI_ADJUST_CFA_OFFSET 4
417 CFI_REL_OFFSET esp, 0 390 CFI_REL_OFFSET esp, 0
418 pushfl 391 pushfl_cfi
419 orl $X86_EFLAGS_IF, (%esp) 392 orl $X86_EFLAGS_IF, (%esp)
420 CFI_ADJUST_CFA_OFFSET 4 393 pushl_cfi $__USER_CS
421 pushl $(__USER_CS)
422 CFI_ADJUST_CFA_OFFSET 4
423 /*CFI_REL_OFFSET cs, 0*/ 394 /*CFI_REL_OFFSET cs, 0*/
424 /* 395 /*
425 * Push current_thread_info()->sysenter_return to the stack. 396 * Push current_thread_info()->sysenter_return to the stack.
426 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words 397 * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
427 * pushed above; +8 corresponds to copy_thread's esp0 setting. 398 * pushed above; +8 corresponds to copy_thread's esp0 setting.
428 */ 399 */
429 pushl (TI_sysenter_return-THREAD_SIZE+8+4*4)(%esp) 400 pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
430 CFI_ADJUST_CFA_OFFSET 4
431 CFI_REL_OFFSET eip, 0 401 CFI_REL_OFFSET eip, 0
432 402
433 pushl %eax 403 pushl_cfi %eax
434 CFI_ADJUST_CFA_OFFSET 4
435 SAVE_ALL 404 SAVE_ALL
436 ENABLE_INTERRUPTS(CLBR_NONE) 405 ENABLE_INTERRUPTS(CLBR_NONE)
437 406
@@ -486,8 +455,7 @@ sysenter_audit:
486 movl %eax,%edx /* 2nd arg: syscall number */ 455 movl %eax,%edx /* 2nd arg: syscall number */
487 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */ 456 movl $AUDIT_ARCH_I386,%eax /* 1st arg: audit arch */
488 call audit_syscall_entry 457 call audit_syscall_entry
489 pushl %ebx 458 pushl_cfi %ebx
490 CFI_ADJUST_CFA_OFFSET 4
491 movl PT_EAX(%esp),%eax /* reload syscall number */ 459 movl PT_EAX(%esp),%eax /* reload syscall number */
492 jmp sysenter_do_call 460 jmp sysenter_do_call
493 461
@@ -529,8 +497,7 @@ ENDPROC(ia32_sysenter_target)
529 # system call handler stub 497 # system call handler stub
530ENTRY(system_call) 498ENTRY(system_call)
531 RING0_INT_FRAME # can't unwind into user space anyway 499 RING0_INT_FRAME # can't unwind into user space anyway
532 pushl %eax # save orig_eax 500 pushl_cfi %eax # save orig_eax
533 CFI_ADJUST_CFA_OFFSET 4
534 SAVE_ALL 501 SAVE_ALL
535 GET_THREAD_INFO(%ebp) 502 GET_THREAD_INFO(%ebp)
536 # system call tracing in operation / emulation 503 # system call tracing in operation / emulation
@@ -566,7 +533,6 @@ restore_all_notrace:
566 je ldt_ss # returning to user-space with LDT SS 533 je ldt_ss # returning to user-space with LDT SS
567restore_nocheck: 534restore_nocheck:
568 RESTORE_REGS 4 # skip orig_eax/error_code 535 RESTORE_REGS 4 # skip orig_eax/error_code
569 CFI_ADJUST_CFA_OFFSET -4
570irq_return: 536irq_return:
571 INTERRUPT_RETURN 537 INTERRUPT_RETURN
572.section .fixup,"ax" 538.section .fixup,"ax"
@@ -619,10 +585,8 @@ ldt_ss:
619 shr $16, %edx 585 shr $16, %edx
620 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */ 586 mov %dl, GDT_ESPFIX_SS + 4 /* bits 16..23 */
621 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */ 587 mov %dh, GDT_ESPFIX_SS + 7 /* bits 24..31 */
622 pushl $__ESPFIX_SS 588 pushl_cfi $__ESPFIX_SS
623 CFI_ADJUST_CFA_OFFSET 4 589 pushl_cfi %eax /* new kernel esp */
624 push %eax /* new kernel esp */
625 CFI_ADJUST_CFA_OFFSET 4
626 /* Disable interrupts, but do not irqtrace this section: we 590 /* Disable interrupts, but do not irqtrace this section: we
627 * will soon execute iret and the tracer was already set to 591 * will soon execute iret and the tracer was already set to
628 * the irqstate after the iret */ 592 * the irqstate after the iret */
@@ -666,11 +630,9 @@ work_notifysig: # deal with pending signals and
666 630
667 ALIGN 631 ALIGN
668work_notifysig_v86: 632work_notifysig_v86:
669 pushl %ecx # save ti_flags for do_notify_resume 633 pushl_cfi %ecx # save ti_flags for do_notify_resume
670 CFI_ADJUST_CFA_OFFSET 4
671 call save_v86_state # %eax contains pt_regs pointer 634 call save_v86_state # %eax contains pt_regs pointer
672 popl %ecx 635 popl_cfi %ecx
673 CFI_ADJUST_CFA_OFFSET -4
674 movl %eax, %esp 636 movl %eax, %esp
675#else 637#else
676 movl %esp, %eax 638 movl %esp, %eax
@@ -750,14 +712,18 @@ ptregs_##name: \
750#define PTREGSCALL3(name) \ 712#define PTREGSCALL3(name) \
751 ALIGN; \ 713 ALIGN; \
752ptregs_##name: \ 714ptregs_##name: \
715 CFI_STARTPROC; \
753 leal 4(%esp),%eax; \ 716 leal 4(%esp),%eax; \
754 pushl %eax; \ 717 pushl_cfi %eax; \
755 movl PT_EDX(%eax),%ecx; \ 718 movl PT_EDX(%eax),%ecx; \
756 movl PT_ECX(%eax),%edx; \ 719 movl PT_ECX(%eax),%edx; \
757 movl PT_EBX(%eax),%eax; \ 720 movl PT_EBX(%eax),%eax; \
758 call sys_##name; \ 721 call sys_##name; \
759 addl $4,%esp; \ 722 addl $4,%esp; \
760 ret 723 CFI_ADJUST_CFA_OFFSET -4; \
724 ret; \
725 CFI_ENDPROC; \
726ENDPROC(ptregs_##name)
761 727
762PTREGSCALL1(iopl) 728PTREGSCALL1(iopl)
763PTREGSCALL0(fork) 729PTREGSCALL0(fork)
@@ -772,15 +738,19 @@ PTREGSCALL1(vm86old)
772/* Clone is an oddball. The 4th arg is in %edi */ 738/* Clone is an oddball. The 4th arg is in %edi */
773 ALIGN; 739 ALIGN;
774ptregs_clone: 740ptregs_clone:
741 CFI_STARTPROC
775 leal 4(%esp),%eax 742 leal 4(%esp),%eax
776 pushl %eax 743 pushl_cfi %eax
777 pushl PT_EDI(%eax) 744 pushl_cfi PT_EDI(%eax)
778 movl PT_EDX(%eax),%ecx 745 movl PT_EDX(%eax),%ecx
779 movl PT_ECX(%eax),%edx 746 movl PT_ECX(%eax),%edx
780 movl PT_EBX(%eax),%eax 747 movl PT_EBX(%eax),%eax
781 call sys_clone 748 call sys_clone
782 addl $8,%esp 749 addl $8,%esp
750 CFI_ADJUST_CFA_OFFSET -8
783 ret 751 ret
752 CFI_ENDPROC
753ENDPROC(ptregs_clone)
784 754
785.macro FIXUP_ESPFIX_STACK 755.macro FIXUP_ESPFIX_STACK
786/* 756/*
@@ -795,10 +765,8 @@ ptregs_clone:
795 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */ 765 mov GDT_ESPFIX_SS + 7, %ah /* bits 24..31 */
796 shl $16, %eax 766 shl $16, %eax
797 addl %esp, %eax /* the adjusted stack pointer */ 767 addl %esp, %eax /* the adjusted stack pointer */
798 pushl $__KERNEL_DS 768 pushl_cfi $__KERNEL_DS
799 CFI_ADJUST_CFA_OFFSET 4 769 pushl_cfi %eax
800 pushl %eax
801 CFI_ADJUST_CFA_OFFSET 4
802 lss (%esp), %esp /* switch to the normal stack segment */ 770 lss (%esp), %esp /* switch to the normal stack segment */
803 CFI_ADJUST_CFA_OFFSET -8 771 CFI_ADJUST_CFA_OFFSET -8
804.endm 772.endm
@@ -822,7 +790,7 @@ ptregs_clone:
822 */ 790 */
823.section .init.rodata,"a" 791.section .init.rodata,"a"
824ENTRY(interrupt) 792ENTRY(interrupt)
825.text 793.section .entry.text, "ax"
826 .p2align 5 794 .p2align 5
827 .p2align CONFIG_X86_L1_CACHE_SHIFT 795 .p2align CONFIG_X86_L1_CACHE_SHIFT
828ENTRY(irq_entries_start) 796ENTRY(irq_entries_start)
@@ -835,14 +803,13 @@ vector=FIRST_EXTERNAL_VECTOR
835 .if vector <> FIRST_EXTERNAL_VECTOR 803 .if vector <> FIRST_EXTERNAL_VECTOR
836 CFI_ADJUST_CFA_OFFSET -4 804 CFI_ADJUST_CFA_OFFSET -4
837 .endif 805 .endif
8381: pushl $(~vector+0x80) /* Note: always in signed byte range */ 8061: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
839 CFI_ADJUST_CFA_OFFSET 4
840 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 807 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
841 jmp 2f 808 jmp 2f
842 .endif 809 .endif
843 .previous 810 .previous
844 .long 1b 811 .long 1b
845 .text 812 .section .entry.text, "ax"
846vector=vector+1 813vector=vector+1
847 .endif 814 .endif
848 .endr 815 .endr
@@ -876,8 +843,7 @@ ENDPROC(common_interrupt)
876#define BUILD_INTERRUPT3(name, nr, fn) \ 843#define BUILD_INTERRUPT3(name, nr, fn) \
877ENTRY(name) \ 844ENTRY(name) \
878 RING0_INT_FRAME; \ 845 RING0_INT_FRAME; \
879 pushl $~(nr); \ 846 pushl_cfi $~(nr); \
880 CFI_ADJUST_CFA_OFFSET 4; \
881 SAVE_ALL; \ 847 SAVE_ALL; \
882 TRACE_IRQS_OFF \ 848 TRACE_IRQS_OFF \
883 movl %esp,%eax; \ 849 movl %esp,%eax; \
@@ -893,21 +859,18 @@ ENDPROC(name)
893 859
894ENTRY(coprocessor_error) 860ENTRY(coprocessor_error)
895 RING0_INT_FRAME 861 RING0_INT_FRAME
896 pushl $0 862 pushl_cfi $0
897 CFI_ADJUST_CFA_OFFSET 4 863 pushl_cfi $do_coprocessor_error
898 pushl $do_coprocessor_error
899 CFI_ADJUST_CFA_OFFSET 4
900 jmp error_code 864 jmp error_code
901 CFI_ENDPROC 865 CFI_ENDPROC
902END(coprocessor_error) 866END(coprocessor_error)
903 867
904ENTRY(simd_coprocessor_error) 868ENTRY(simd_coprocessor_error)
905 RING0_INT_FRAME 869 RING0_INT_FRAME
906 pushl $0 870 pushl_cfi $0
907 CFI_ADJUST_CFA_OFFSET 4
908#ifdef CONFIG_X86_INVD_BUG 871#ifdef CONFIG_X86_INVD_BUG
909 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ 872 /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
910661: pushl $do_general_protection 873661: pushl_cfi $do_general_protection
911662: 874662:
912.section .altinstructions,"a" 875.section .altinstructions,"a"
913 .balign 4 876 .balign 4
@@ -922,19 +885,16 @@ ENTRY(simd_coprocessor_error)
922664: 885664:
923.previous 886.previous
924#else 887#else
925 pushl $do_simd_coprocessor_error 888 pushl_cfi $do_simd_coprocessor_error
926#endif 889#endif
927 CFI_ADJUST_CFA_OFFSET 4
928 jmp error_code 890 jmp error_code
929 CFI_ENDPROC 891 CFI_ENDPROC
930END(simd_coprocessor_error) 892END(simd_coprocessor_error)
931 893
932ENTRY(device_not_available) 894ENTRY(device_not_available)
933 RING0_INT_FRAME 895 RING0_INT_FRAME
934 pushl $-1 # mark this as an int 896 pushl_cfi $-1 # mark this as an int
935 CFI_ADJUST_CFA_OFFSET 4 897 pushl_cfi $do_device_not_available
936 pushl $do_device_not_available
937 CFI_ADJUST_CFA_OFFSET 4
938 jmp error_code 898 jmp error_code
939 CFI_ENDPROC 899 CFI_ENDPROC
940END(device_not_available) 900END(device_not_available)
@@ -956,82 +916,68 @@ END(native_irq_enable_sysexit)
956 916
957ENTRY(overflow) 917ENTRY(overflow)
958 RING0_INT_FRAME 918 RING0_INT_FRAME
959 pushl $0 919 pushl_cfi $0
960 CFI_ADJUST_CFA_OFFSET 4 920 pushl_cfi $do_overflow
961 pushl $do_overflow
962 CFI_ADJUST_CFA_OFFSET 4
963 jmp error_code 921 jmp error_code
964 CFI_ENDPROC 922 CFI_ENDPROC
965END(overflow) 923END(overflow)
966 924
967ENTRY(bounds) 925ENTRY(bounds)
968 RING0_INT_FRAME 926 RING0_INT_FRAME
969 pushl $0 927 pushl_cfi $0
970 CFI_ADJUST_CFA_OFFSET 4 928 pushl_cfi $do_bounds
971 pushl $do_bounds
972 CFI_ADJUST_CFA_OFFSET 4
973 jmp error_code 929 jmp error_code
974 CFI_ENDPROC 930 CFI_ENDPROC
975END(bounds) 931END(bounds)
976 932
977ENTRY(invalid_op) 933ENTRY(invalid_op)
978 RING0_INT_FRAME 934 RING0_INT_FRAME
979 pushl $0 935 pushl_cfi $0
980 CFI_ADJUST_CFA_OFFSET 4 936 pushl_cfi $do_invalid_op
981 pushl $do_invalid_op
982 CFI_ADJUST_CFA_OFFSET 4
983 jmp error_code 937 jmp error_code
984 CFI_ENDPROC 938 CFI_ENDPROC
985END(invalid_op) 939END(invalid_op)
986 940
987ENTRY(coprocessor_segment_overrun) 941ENTRY(coprocessor_segment_overrun)
988 RING0_INT_FRAME 942 RING0_INT_FRAME
989 pushl $0 943 pushl_cfi $0
990 CFI_ADJUST_CFA_OFFSET 4 944 pushl_cfi $do_coprocessor_segment_overrun
991 pushl $do_coprocessor_segment_overrun
992 CFI_ADJUST_CFA_OFFSET 4
993 jmp error_code 945 jmp error_code
994 CFI_ENDPROC 946 CFI_ENDPROC
995END(coprocessor_segment_overrun) 947END(coprocessor_segment_overrun)
996 948
997ENTRY(invalid_TSS) 949ENTRY(invalid_TSS)
998 RING0_EC_FRAME 950 RING0_EC_FRAME
999 pushl $do_invalid_TSS 951 pushl_cfi $do_invalid_TSS
1000 CFI_ADJUST_CFA_OFFSET 4
1001 jmp error_code 952 jmp error_code
1002 CFI_ENDPROC 953 CFI_ENDPROC
1003END(invalid_TSS) 954END(invalid_TSS)
1004 955
1005ENTRY(segment_not_present) 956ENTRY(segment_not_present)
1006 RING0_EC_FRAME 957 RING0_EC_FRAME
1007 pushl $do_segment_not_present 958 pushl_cfi $do_segment_not_present
1008 CFI_ADJUST_CFA_OFFSET 4
1009 jmp error_code 959 jmp error_code
1010 CFI_ENDPROC 960 CFI_ENDPROC
1011END(segment_not_present) 961END(segment_not_present)
1012 962
1013ENTRY(stack_segment) 963ENTRY(stack_segment)
1014 RING0_EC_FRAME 964 RING0_EC_FRAME
1015 pushl $do_stack_segment 965 pushl_cfi $do_stack_segment
1016 CFI_ADJUST_CFA_OFFSET 4
1017 jmp error_code 966 jmp error_code
1018 CFI_ENDPROC 967 CFI_ENDPROC
1019END(stack_segment) 968END(stack_segment)
1020 969
1021ENTRY(alignment_check) 970ENTRY(alignment_check)
1022 RING0_EC_FRAME 971 RING0_EC_FRAME
1023 pushl $do_alignment_check 972 pushl_cfi $do_alignment_check
1024 CFI_ADJUST_CFA_OFFSET 4
1025 jmp error_code 973 jmp error_code
1026 CFI_ENDPROC 974 CFI_ENDPROC
1027END(alignment_check) 975END(alignment_check)
1028 976
1029ENTRY(divide_error) 977ENTRY(divide_error)
1030 RING0_INT_FRAME 978 RING0_INT_FRAME
1031 pushl $0 # no error code 979 pushl_cfi $0 # no error code
1032 CFI_ADJUST_CFA_OFFSET 4 980 pushl_cfi $do_divide_error
1033 pushl $do_divide_error
1034 CFI_ADJUST_CFA_OFFSET 4
1035 jmp error_code 981 jmp error_code
1036 CFI_ENDPROC 982 CFI_ENDPROC
1037END(divide_error) 983END(divide_error)
@@ -1039,10 +985,8 @@ END(divide_error)
1039#ifdef CONFIG_X86_MCE 985#ifdef CONFIG_X86_MCE
1040ENTRY(machine_check) 986ENTRY(machine_check)
1041 RING0_INT_FRAME 987 RING0_INT_FRAME
1042 pushl $0 988 pushl_cfi $0
1043 CFI_ADJUST_CFA_OFFSET 4 989 pushl_cfi machine_check_vector
1044 pushl machine_check_vector
1045 CFI_ADJUST_CFA_OFFSET 4
1046 jmp error_code 990 jmp error_code
1047 CFI_ENDPROC 991 CFI_ENDPROC
1048END(machine_check) 992END(machine_check)
@@ -1050,10 +994,8 @@ END(machine_check)
1050 994
1051ENTRY(spurious_interrupt_bug) 995ENTRY(spurious_interrupt_bug)
1052 RING0_INT_FRAME 996 RING0_INT_FRAME
1053 pushl $0 997 pushl_cfi $0
1054 CFI_ADJUST_CFA_OFFSET 4 998 pushl_cfi $do_spurious_interrupt_bug
1055 pushl $do_spurious_interrupt_bug
1056 CFI_ADJUST_CFA_OFFSET 4
1057 jmp error_code 999 jmp error_code
1058 CFI_ENDPROC 1000 CFI_ENDPROC
1059END(spurious_interrupt_bug) 1001END(spurious_interrupt_bug)
@@ -1084,8 +1026,7 @@ ENTRY(xen_sysenter_target)
1084 1026
1085ENTRY(xen_hypervisor_callback) 1027ENTRY(xen_hypervisor_callback)
1086 CFI_STARTPROC 1028 CFI_STARTPROC
1087 pushl $0 1029 pushl_cfi $0
1088 CFI_ADJUST_CFA_OFFSET 4
1089 SAVE_ALL 1030 SAVE_ALL
1090 TRACE_IRQS_OFF 1031 TRACE_IRQS_OFF
1091 1032
@@ -1121,23 +1062,20 @@ ENDPROC(xen_hypervisor_callback)
1121# We distinguish between categories by maintaining a status value in EAX. 1062# We distinguish between categories by maintaining a status value in EAX.
1122ENTRY(xen_failsafe_callback) 1063ENTRY(xen_failsafe_callback)
1123 CFI_STARTPROC 1064 CFI_STARTPROC
1124 pushl %eax 1065 pushl_cfi %eax
1125 CFI_ADJUST_CFA_OFFSET 4
1126 movl $1,%eax 1066 movl $1,%eax
11271: mov 4(%esp),%ds 10671: mov 4(%esp),%ds
11282: mov 8(%esp),%es 10682: mov 8(%esp),%es
11293: mov 12(%esp),%fs 10693: mov 12(%esp),%fs
11304: mov 16(%esp),%gs 10704: mov 16(%esp),%gs
1131 testl %eax,%eax 1071 testl %eax,%eax
1132 popl %eax 1072 popl_cfi %eax
1133 CFI_ADJUST_CFA_OFFSET -4
1134 lea 16(%esp),%esp 1073 lea 16(%esp),%esp
1135 CFI_ADJUST_CFA_OFFSET -16 1074 CFI_ADJUST_CFA_OFFSET -16
1136 jz 5f 1075 jz 5f
1137 addl $16,%esp 1076 addl $16,%esp
1138 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET) 1077 jmp iret_exc # EAX != 0 => Category 2 (Bad IRET)
11395: pushl $0 # EAX == 0 => Category 1 (Bad segment) 10785: pushl_cfi $0 # EAX == 0 => Category 1 (Bad segment)
1140 CFI_ADJUST_CFA_OFFSET 4
1141 SAVE_ALL 1079 SAVE_ALL
1142 jmp ret_from_exception 1080 jmp ret_from_exception
1143 CFI_ENDPROC 1081 CFI_ENDPROC
@@ -1287,40 +1225,29 @@ syscall_table_size=(.-sys_call_table)
1287 1225
1288ENTRY(page_fault) 1226ENTRY(page_fault)
1289 RING0_EC_FRAME 1227 RING0_EC_FRAME
1290 pushl $do_page_fault 1228 pushl_cfi $do_page_fault
1291 CFI_ADJUST_CFA_OFFSET 4
1292 ALIGN 1229 ALIGN
1293error_code: 1230error_code:
1294 /* the function address is in %gs's slot on the stack */ 1231 /* the function address is in %gs's slot on the stack */
1295 pushl %fs 1232 pushl_cfi %fs
1296 CFI_ADJUST_CFA_OFFSET 4
1297 /*CFI_REL_OFFSET fs, 0*/ 1233 /*CFI_REL_OFFSET fs, 0*/
1298 pushl %es 1234 pushl_cfi %es
1299 CFI_ADJUST_CFA_OFFSET 4
1300 /*CFI_REL_OFFSET es, 0*/ 1235 /*CFI_REL_OFFSET es, 0*/
1301 pushl %ds 1236 pushl_cfi %ds
1302 CFI_ADJUST_CFA_OFFSET 4
1303 /*CFI_REL_OFFSET ds, 0*/ 1237 /*CFI_REL_OFFSET ds, 0*/
1304 pushl %eax 1238 pushl_cfi %eax
1305 CFI_ADJUST_CFA_OFFSET 4
1306 CFI_REL_OFFSET eax, 0 1239 CFI_REL_OFFSET eax, 0
1307 pushl %ebp 1240 pushl_cfi %ebp
1308 CFI_ADJUST_CFA_OFFSET 4
1309 CFI_REL_OFFSET ebp, 0 1241 CFI_REL_OFFSET ebp, 0
1310 pushl %edi 1242 pushl_cfi %edi
1311 CFI_ADJUST_CFA_OFFSET 4
1312 CFI_REL_OFFSET edi, 0 1243 CFI_REL_OFFSET edi, 0
1313 pushl %esi 1244 pushl_cfi %esi
1314 CFI_ADJUST_CFA_OFFSET 4
1315 CFI_REL_OFFSET esi, 0 1245 CFI_REL_OFFSET esi, 0
1316 pushl %edx 1246 pushl_cfi %edx
1317 CFI_ADJUST_CFA_OFFSET 4
1318 CFI_REL_OFFSET edx, 0 1247 CFI_REL_OFFSET edx, 0
1319 pushl %ecx 1248 pushl_cfi %ecx
1320 CFI_ADJUST_CFA_OFFSET 4
1321 CFI_REL_OFFSET ecx, 0 1249 CFI_REL_OFFSET ecx, 0
1322 pushl %ebx 1250 pushl_cfi %ebx
1323 CFI_ADJUST_CFA_OFFSET 4
1324 CFI_REL_OFFSET ebx, 0 1251 CFI_REL_OFFSET ebx, 0
1325 cld 1252 cld
1326 movl $(__KERNEL_PERCPU), %ecx 1253 movl $(__KERNEL_PERCPU), %ecx
@@ -1362,12 +1289,9 @@ END(page_fault)
1362 movl TSS_sysenter_sp0 + \offset(%esp), %esp 1289 movl TSS_sysenter_sp0 + \offset(%esp), %esp
1363 CFI_DEF_CFA esp, 0 1290 CFI_DEF_CFA esp, 0
1364 CFI_UNDEFINED eip 1291 CFI_UNDEFINED eip
1365 pushfl 1292 pushfl_cfi
1366 CFI_ADJUST_CFA_OFFSET 4 1293 pushl_cfi $__KERNEL_CS
1367 pushl $__KERNEL_CS 1294 pushl_cfi $sysenter_past_esp
1368 CFI_ADJUST_CFA_OFFSET 4
1369 pushl $sysenter_past_esp
1370 CFI_ADJUST_CFA_OFFSET 4
1371 CFI_REL_OFFSET eip, 0 1295 CFI_REL_OFFSET eip, 0
1372.endm 1296.endm
1373 1297
@@ -1377,8 +1301,7 @@ ENTRY(debug)
1377 jne debug_stack_correct 1301 jne debug_stack_correct
1378 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn 1302 FIX_STACK 12, debug_stack_correct, debug_esp_fix_insn
1379debug_stack_correct: 1303debug_stack_correct:
1380 pushl $-1 # mark this as an int 1304 pushl_cfi $-1 # mark this as an int
1381 CFI_ADJUST_CFA_OFFSET 4
1382 SAVE_ALL 1305 SAVE_ALL
1383 TRACE_IRQS_OFF 1306 TRACE_IRQS_OFF
1384 xorl %edx,%edx # error code 0 1307 xorl %edx,%edx # error code 0
@@ -1398,32 +1321,27 @@ END(debug)
1398 */ 1321 */
1399ENTRY(nmi) 1322ENTRY(nmi)
1400 RING0_INT_FRAME 1323 RING0_INT_FRAME
1401 pushl %eax 1324 pushl_cfi %eax
1402 CFI_ADJUST_CFA_OFFSET 4
1403 movl %ss, %eax 1325 movl %ss, %eax
1404 cmpw $__ESPFIX_SS, %ax 1326 cmpw $__ESPFIX_SS, %ax
1405 popl %eax 1327 popl_cfi %eax
1406 CFI_ADJUST_CFA_OFFSET -4
1407 je nmi_espfix_stack 1328 je nmi_espfix_stack
1408 cmpl $ia32_sysenter_target,(%esp) 1329 cmpl $ia32_sysenter_target,(%esp)
1409 je nmi_stack_fixup 1330 je nmi_stack_fixup
1410 pushl %eax 1331 pushl_cfi %eax
1411 CFI_ADJUST_CFA_OFFSET 4
1412 movl %esp,%eax 1332 movl %esp,%eax
1413 /* Do not access memory above the end of our stack page, 1333 /* Do not access memory above the end of our stack page,
1414 * it might not exist. 1334 * it might not exist.
1415 */ 1335 */
1416 andl $(THREAD_SIZE-1),%eax 1336 andl $(THREAD_SIZE-1),%eax
1417 cmpl $(THREAD_SIZE-20),%eax 1337 cmpl $(THREAD_SIZE-20),%eax
1418 popl %eax 1338 popl_cfi %eax
1419 CFI_ADJUST_CFA_OFFSET -4
1420 jae nmi_stack_correct 1339 jae nmi_stack_correct
1421 cmpl $ia32_sysenter_target,12(%esp) 1340 cmpl $ia32_sysenter_target,12(%esp)
1422 je nmi_debug_stack_check 1341 je nmi_debug_stack_check
1423nmi_stack_correct: 1342nmi_stack_correct:
1424 /* We have a RING0_INT_FRAME here */ 1343 /* We have a RING0_INT_FRAME here */
1425 pushl %eax 1344 pushl_cfi %eax
1426 CFI_ADJUST_CFA_OFFSET 4
1427 SAVE_ALL 1345 SAVE_ALL
1428 xorl %edx,%edx # zero error code 1346 xorl %edx,%edx # zero error code
1429 movl %esp,%eax # pt_regs pointer 1347 movl %esp,%eax # pt_regs pointer
@@ -1452,18 +1370,14 @@ nmi_espfix_stack:
1452 * 1370 *
1453 * create the pointer to lss back 1371 * create the pointer to lss back
1454 */ 1372 */
1455 pushl %ss 1373 pushl_cfi %ss
1456 CFI_ADJUST_CFA_OFFSET 4 1374 pushl_cfi %esp
1457 pushl %esp
1458 CFI_ADJUST_CFA_OFFSET 4
1459 addl $4, (%esp) 1375 addl $4, (%esp)
1460 /* copy the iret frame of 12 bytes */ 1376 /* copy the iret frame of 12 bytes */
1461 .rept 3 1377 .rept 3
1462 pushl 16(%esp) 1378 pushl_cfi 16(%esp)
1463 CFI_ADJUST_CFA_OFFSET 4
1464 .endr 1379 .endr
1465 pushl %eax 1380 pushl_cfi %eax
1466 CFI_ADJUST_CFA_OFFSET 4
1467 SAVE_ALL 1381 SAVE_ALL
1468 FIXUP_ESPFIX_STACK # %eax == %esp 1382 FIXUP_ESPFIX_STACK # %eax == %esp
1469 xorl %edx,%edx # zero error code 1383 xorl %edx,%edx # zero error code
@@ -1477,8 +1391,7 @@ END(nmi)
1477 1391
1478ENTRY(int3) 1392ENTRY(int3)
1479 RING0_INT_FRAME 1393 RING0_INT_FRAME
1480 pushl $-1 # mark this as an int 1394 pushl_cfi $-1 # mark this as an int
1481 CFI_ADJUST_CFA_OFFSET 4
1482 SAVE_ALL 1395 SAVE_ALL
1483 TRACE_IRQS_OFF 1396 TRACE_IRQS_OFF
1484 xorl %edx,%edx # zero error code 1397 xorl %edx,%edx # zero error code
@@ -1490,12 +1403,20 @@ END(int3)
1490 1403
1491ENTRY(general_protection) 1404ENTRY(general_protection)
1492 RING0_EC_FRAME 1405 RING0_EC_FRAME
1493 pushl $do_general_protection 1406 pushl_cfi $do_general_protection
1494 CFI_ADJUST_CFA_OFFSET 4
1495 jmp error_code 1407 jmp error_code
1496 CFI_ENDPROC 1408 CFI_ENDPROC
1497END(general_protection) 1409END(general_protection)
1498 1410
1411#ifdef CONFIG_KVM_GUEST
1412ENTRY(async_page_fault)
1413 RING0_EC_FRAME
1414 pushl_cfi $do_async_page_fault
1415 jmp error_code
1416 CFI_ENDPROC
1417END(async_page_fault)
1418#endif
1419
1499/* 1420/*
1500 * End of kprobes section 1421 * End of kprobes section
1501 */ 1422 */
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 115e8951e8c8..47a4bcd2e503 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -18,7 +18,7 @@
18 * A note on terminology: 18 * A note on terminology:
19 * - top of stack: Architecture defined interrupt frame from SS to RIP 19 * - top of stack: Architecture defined interrupt frame from SS to RIP
20 * at the top of the kernel process stack. 20 * at the top of the kernel process stack.
21 * - partial stack frame: partially saved registers upto R11. 21 * - partial stack frame: partially saved registers up to R11.
22 * - full stack frame: Like partial stack frame, but all register saved. 22 * - full stack frame: Like partial stack frame, but all register saved.
23 * 23 *
24 * Some macro usage: 24 * Some macro usage:
@@ -61,6 +61,8 @@
61#define __AUDIT_ARCH_LE 0x40000000 61#define __AUDIT_ARCH_LE 0x40000000
62 62
63 .code64 63 .code64
64 .section .entry.text, "ax"
65
64#ifdef CONFIG_FUNCTION_TRACER 66#ifdef CONFIG_FUNCTION_TRACER
65#ifdef CONFIG_DYNAMIC_FTRACE 67#ifdef CONFIG_DYNAMIC_FTRACE
66ENTRY(mcount) 68ENTRY(mcount)
@@ -213,23 +215,17 @@ ENDPROC(native_usergs_sysret64)
213 .macro FAKE_STACK_FRAME child_rip 215 .macro FAKE_STACK_FRAME child_rip
214 /* push in order ss, rsp, eflags, cs, rip */ 216 /* push in order ss, rsp, eflags, cs, rip */
215 xorl %eax, %eax 217 xorl %eax, %eax
216 pushq $__KERNEL_DS /* ss */ 218 pushq_cfi $__KERNEL_DS /* ss */
217 CFI_ADJUST_CFA_OFFSET 8
218 /*CFI_REL_OFFSET ss,0*/ 219 /*CFI_REL_OFFSET ss,0*/
219 pushq %rax /* rsp */ 220 pushq_cfi %rax /* rsp */
220 CFI_ADJUST_CFA_OFFSET 8
221 CFI_REL_OFFSET rsp,0 221 CFI_REL_OFFSET rsp,0
222 pushq $X86_EFLAGS_IF /* eflags - interrupts on */ 222 pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */
223 CFI_ADJUST_CFA_OFFSET 8
224 /*CFI_REL_OFFSET rflags,0*/ 223 /*CFI_REL_OFFSET rflags,0*/
225 pushq $__KERNEL_CS /* cs */ 224 pushq_cfi $__KERNEL_CS /* cs */
226 CFI_ADJUST_CFA_OFFSET 8
227 /*CFI_REL_OFFSET cs,0*/ 225 /*CFI_REL_OFFSET cs,0*/
228 pushq \child_rip /* rip */ 226 pushq_cfi \child_rip /* rip */
229 CFI_ADJUST_CFA_OFFSET 8
230 CFI_REL_OFFSET rip,0 227 CFI_REL_OFFSET rip,0
231 pushq %rax /* orig rax */ 228 pushq_cfi %rax /* orig rax */
232 CFI_ADJUST_CFA_OFFSET 8
233 .endm 229 .endm
234 230
235 .macro UNFAKE_STACK_FRAME 231 .macro UNFAKE_STACK_FRAME
@@ -301,20 +297,25 @@ ENDPROC(native_usergs_sysret64)
301 .endm 297 .endm
302 298
303/* save partial stack frame */ 299/* save partial stack frame */
300 .pushsection .kprobes.text, "ax"
304ENTRY(save_args) 301ENTRY(save_args)
305 XCPT_FRAME 302 XCPT_FRAME
306 cld 303 cld
307 movq_cfi rdi, RDI+16-ARGOFFSET 304 /*
308 movq_cfi rsi, RSI+16-ARGOFFSET 305 * start from rbp in pt_regs and jump over
309 movq_cfi rdx, RDX+16-ARGOFFSET 306 * return address.
310 movq_cfi rcx, RCX+16-ARGOFFSET 307 */
311 movq_cfi rax, RAX+16-ARGOFFSET 308 movq_cfi rdi, RDI+8-RBP
312 movq_cfi r8, R8+16-ARGOFFSET 309 movq_cfi rsi, RSI+8-RBP
313 movq_cfi r9, R9+16-ARGOFFSET 310 movq_cfi rdx, RDX+8-RBP
314 movq_cfi r10, R10+16-ARGOFFSET 311 movq_cfi rcx, RCX+8-RBP
315 movq_cfi r11, R11+16-ARGOFFSET 312 movq_cfi rax, RAX+8-RBP
316 313 movq_cfi r8, R8+8-RBP
317 leaq -ARGOFFSET+16(%rsp),%rdi /* arg1 for handler */ 314 movq_cfi r9, R9+8-RBP
315 movq_cfi r10, R10+8-RBP
316 movq_cfi r11, R11+8-RBP
317
318 leaq -RBP+8(%rsp),%rdi /* arg1 for handler */
318 movq_cfi rbp, 8 /* push %rbp */ 319 movq_cfi rbp, 8 /* push %rbp */
319 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */ 320 leaq 8(%rsp), %rbp /* mov %rsp, %ebp */
320 testl $3, CS(%rdi) 321 testl $3, CS(%rdi)
@@ -340,6 +341,7 @@ ENTRY(save_args)
340 ret 341 ret
341 CFI_ENDPROC 342 CFI_ENDPROC
342END(save_args) 343END(save_args)
344 .popsection
343 345
344ENTRY(save_rest) 346ENTRY(save_rest)
345 PARTIAL_FRAME 1 REST_SKIP+8 347 PARTIAL_FRAME 1 REST_SKIP+8
@@ -398,10 +400,8 @@ ENTRY(ret_from_fork)
398 400
399 LOCK ; btr $TIF_FORK,TI_flags(%r8) 401 LOCK ; btr $TIF_FORK,TI_flags(%r8)
400 402
401 push kernel_eflags(%rip) 403 pushq_cfi kernel_eflags(%rip)
402 CFI_ADJUST_CFA_OFFSET 8 404 popfq_cfi # reset kernel eflags
403 popf # reset kernel eflags
404 CFI_ADJUST_CFA_OFFSET -8
405 405
406 call schedule_tail # rdi: 'prev' task parameter 406 call schedule_tail # rdi: 'prev' task parameter
407 407
@@ -422,7 +422,7 @@ ENTRY(ret_from_fork)
422END(ret_from_fork) 422END(ret_from_fork)
423 423
424/* 424/*
425 * System call entry. Upto 6 arguments in registers are supported. 425 * System call entry. Up to 6 arguments in registers are supported.
426 * 426 *
427 * SYSCALL does not save anything on the stack and does not change the 427 * SYSCALL does not save anything on the stack and does not change the
428 * stack pointer. 428 * stack pointer.
@@ -521,11 +521,9 @@ sysret_careful:
521 jnc sysret_signal 521 jnc sysret_signal
522 TRACE_IRQS_ON 522 TRACE_IRQS_ON
523 ENABLE_INTERRUPTS(CLBR_NONE) 523 ENABLE_INTERRUPTS(CLBR_NONE)
524 pushq %rdi 524 pushq_cfi %rdi
525 CFI_ADJUST_CFA_OFFSET 8
526 call schedule 525 call schedule
527 popq %rdi 526 popq_cfi %rdi
528 CFI_ADJUST_CFA_OFFSET -8
529 jmp sysret_check 527 jmp sysret_check
530 528
531 /* Handle a signal */ 529 /* Handle a signal */
@@ -634,11 +632,9 @@ int_careful:
634 jnc int_very_careful 632 jnc int_very_careful
635 TRACE_IRQS_ON 633 TRACE_IRQS_ON
636 ENABLE_INTERRUPTS(CLBR_NONE) 634 ENABLE_INTERRUPTS(CLBR_NONE)
637 pushq %rdi 635 pushq_cfi %rdi
638 CFI_ADJUST_CFA_OFFSET 8
639 call schedule 636 call schedule
640 popq %rdi 637 popq_cfi %rdi
641 CFI_ADJUST_CFA_OFFSET -8
642 DISABLE_INTERRUPTS(CLBR_NONE) 638 DISABLE_INTERRUPTS(CLBR_NONE)
643 TRACE_IRQS_OFF 639 TRACE_IRQS_OFF
644 jmp int_with_check 640 jmp int_with_check
@@ -652,12 +648,10 @@ int_check_syscall_exit_work:
652 /* Check for syscall exit trace */ 648 /* Check for syscall exit trace */
653 testl $_TIF_WORK_SYSCALL_EXIT,%edx 649 testl $_TIF_WORK_SYSCALL_EXIT,%edx
654 jz int_signal 650 jz int_signal
655 pushq %rdi 651 pushq_cfi %rdi
656 CFI_ADJUST_CFA_OFFSET 8
657 leaq 8(%rsp),%rdi # &ptregs -> arg1 652 leaq 8(%rsp),%rdi # &ptregs -> arg1
658 call syscall_trace_leave 653 call syscall_trace_leave
659 popq %rdi 654 popq_cfi %rdi
660 CFI_ADJUST_CFA_OFFSET -8
661 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi 655 andl $~(_TIF_WORK_SYSCALL_EXIT|_TIF_SYSCALL_EMU),%edi
662 jmp int_restore_rest 656 jmp int_restore_rest
663 657
@@ -714,9 +708,8 @@ END(ptregscall_common)
714 708
715ENTRY(stub_execve) 709ENTRY(stub_execve)
716 CFI_STARTPROC 710 CFI_STARTPROC
717 popq %r11 711 addq $8, %rsp
718 CFI_ADJUST_CFA_OFFSET -8 712 PARTIAL_FRAME 0
719 CFI_REGISTER rip, r11
720 SAVE_REST 713 SAVE_REST
721 FIXUP_TOP_OF_STACK %r11 714 FIXUP_TOP_OF_STACK %r11
722 movq %rsp, %rcx 715 movq %rsp, %rcx
@@ -735,7 +728,7 @@ END(stub_execve)
735ENTRY(stub_rt_sigreturn) 728ENTRY(stub_rt_sigreturn)
736 CFI_STARTPROC 729 CFI_STARTPROC
737 addq $8, %rsp 730 addq $8, %rsp
738 CFI_ADJUST_CFA_OFFSET -8 731 PARTIAL_FRAME 0
739 SAVE_REST 732 SAVE_REST
740 movq %rsp,%rdi 733 movq %rsp,%rdi
741 FIXUP_TOP_OF_STACK %r11 734 FIXUP_TOP_OF_STACK %r11
@@ -753,7 +746,7 @@ END(stub_rt_sigreturn)
753 */ 746 */
754 .section .init.rodata,"a" 747 .section .init.rodata,"a"
755ENTRY(interrupt) 748ENTRY(interrupt)
756 .text 749 .section .entry.text
757 .p2align 5 750 .p2align 5
758 .p2align CONFIG_X86_L1_CACHE_SHIFT 751 .p2align CONFIG_X86_L1_CACHE_SHIFT
759ENTRY(irq_entries_start) 752ENTRY(irq_entries_start)
@@ -766,14 +759,13 @@ vector=FIRST_EXTERNAL_VECTOR
766 .if vector <> FIRST_EXTERNAL_VECTOR 759 .if vector <> FIRST_EXTERNAL_VECTOR
767 CFI_ADJUST_CFA_OFFSET -8 760 CFI_ADJUST_CFA_OFFSET -8
768 .endif 761 .endif
7691: pushq $(~vector+0x80) /* Note: always in signed byte range */ 7621: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
770 CFI_ADJUST_CFA_OFFSET 8
771 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 763 .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
772 jmp 2f 764 jmp 2f
773 .endif 765 .endif
774 .previous 766 .previous
775 .quad 1b 767 .quad 1b
776 .text 768 .section .entry.text
777vector=vector+1 769vector=vector+1
778 .endif 770 .endif
779 .endr 771 .endr
@@ -796,8 +788,9 @@ END(interrupt)
796 788
797/* 0(%rsp): ~(interrupt number) */ 789/* 0(%rsp): ~(interrupt number) */
798 .macro interrupt func 790 .macro interrupt func
799 subq $10*8, %rsp 791 /* reserve pt_regs for scratch regs and rbp */
800 CFI_ADJUST_CFA_OFFSET 10*8 792 subq $ORIG_RAX-RBP, %rsp
793 CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
801 call save_args 794 call save_args
802 PARTIAL_FRAME 0 795 PARTIAL_FRAME 0
803 call \func 796 call \func
@@ -822,8 +815,14 @@ ret_from_intr:
822 TRACE_IRQS_OFF 815 TRACE_IRQS_OFF
823 decl PER_CPU_VAR(irq_count) 816 decl PER_CPU_VAR(irq_count)
824 leaveq 817 leaveq
818
819 CFI_RESTORE rbp
825 CFI_DEF_CFA_REGISTER rsp 820 CFI_DEF_CFA_REGISTER rsp
826 CFI_ADJUST_CFA_OFFSET -8 821 CFI_ADJUST_CFA_OFFSET -8
822
823 /* we did not save rbx, restore only from ARGOFFSET */
824 addq $8, %rsp
825 CFI_ADJUST_CFA_OFFSET -8
827exit_intr: 826exit_intr:
828 GET_THREAD_INFO(%rcx) 827 GET_THREAD_INFO(%rcx)
829 testl $3,CS-ARGOFFSET(%rsp) 828 testl $3,CS-ARGOFFSET(%rsp)
@@ -903,11 +902,9 @@ retint_careful:
903 jnc retint_signal 902 jnc retint_signal
904 TRACE_IRQS_ON 903 TRACE_IRQS_ON
905 ENABLE_INTERRUPTS(CLBR_NONE) 904 ENABLE_INTERRUPTS(CLBR_NONE)
906 pushq %rdi 905 pushq_cfi %rdi
907 CFI_ADJUST_CFA_OFFSET 8
908 call schedule 906 call schedule
909 popq %rdi 907 popq_cfi %rdi
910 CFI_ADJUST_CFA_OFFSET -8
911 GET_THREAD_INFO(%rcx) 908 GET_THREAD_INFO(%rcx)
912 DISABLE_INTERRUPTS(CLBR_NONE) 909 DISABLE_INTERRUPTS(CLBR_NONE)
913 TRACE_IRQS_OFF 910 TRACE_IRQS_OFF
@@ -956,8 +953,7 @@ END(common_interrupt)
956.macro apicinterrupt num sym do_sym 953.macro apicinterrupt num sym do_sym
957ENTRY(\sym) 954ENTRY(\sym)
958 INTR_FRAME 955 INTR_FRAME
959 pushq $~(\num) 956 pushq_cfi $~(\num)
960 CFI_ADJUST_CFA_OFFSET 8
961 interrupt \do_sym 957 interrupt \do_sym
962 jmp ret_from_intr 958 jmp ret_from_intr
963 CFI_ENDPROC 959 CFI_ENDPROC
@@ -981,22 +977,13 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \
981 x86_platform_ipi smp_x86_platform_ipi 977 x86_platform_ipi smp_x86_platform_ipi
982 978
983#ifdef CONFIG_SMP 979#ifdef CONFIG_SMP
984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 980.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \
985 invalidate_interrupt0 smp_invalidate_interrupt 981 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
986apicinterrupt INVALIDATE_TLB_VECTOR_START+1 \ 982.if NUM_INVALIDATE_TLB_VECTORS > \idx
987 invalidate_interrupt1 smp_invalidate_interrupt 983apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \
988apicinterrupt INVALIDATE_TLB_VECTOR_START+2 \ 984 invalidate_interrupt\idx smp_invalidate_interrupt
989 invalidate_interrupt2 smp_invalidate_interrupt 985.endif
990apicinterrupt INVALIDATE_TLB_VECTOR_START+3 \ 986.endr
991 invalidate_interrupt3 smp_invalidate_interrupt
992apicinterrupt INVALIDATE_TLB_VECTOR_START+4 \
993 invalidate_interrupt4 smp_invalidate_interrupt
994apicinterrupt INVALIDATE_TLB_VECTOR_START+5 \
995 invalidate_interrupt5 smp_invalidate_interrupt
996apicinterrupt INVALIDATE_TLB_VECTOR_START+6 \
997 invalidate_interrupt6 smp_invalidate_interrupt
998apicinterrupt INVALIDATE_TLB_VECTOR_START+7 \
999 invalidate_interrupt7 smp_invalidate_interrupt
1000#endif 987#endif
1001 988
1002apicinterrupt THRESHOLD_APIC_VECTOR \ 989apicinterrupt THRESHOLD_APIC_VECTOR \
@@ -1025,9 +1012,9 @@ apicinterrupt ERROR_APIC_VECTOR \
1025apicinterrupt SPURIOUS_APIC_VECTOR \ 1012apicinterrupt SPURIOUS_APIC_VECTOR \
1026 spurious_interrupt smp_spurious_interrupt 1013 spurious_interrupt smp_spurious_interrupt
1027 1014
1028#ifdef CONFIG_PERF_EVENTS 1015#ifdef CONFIG_IRQ_WORK
1029apicinterrupt LOCAL_PENDING_VECTOR \ 1016apicinterrupt IRQ_WORK_VECTOR \
1030 perf_pending_interrupt smp_perf_pending_interrupt 1017 irq_work_interrupt smp_irq_work_interrupt
1031#endif 1018#endif
1032 1019
1033/* 1020/*
@@ -1038,8 +1025,8 @@ ENTRY(\sym)
1038 INTR_FRAME 1025 INTR_FRAME
1039 PARAVIRT_ADJUST_EXCEPTION_FRAME 1026 PARAVIRT_ADJUST_EXCEPTION_FRAME
1040 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ 1027 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1041 subq $15*8,%rsp 1028 subq $ORIG_RAX-R15, %rsp
1042 CFI_ADJUST_CFA_OFFSET 15*8 1029 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1043 call error_entry 1030 call error_entry
1044 DEFAULT_FRAME 0 1031 DEFAULT_FRAME 0
1045 movq %rsp,%rdi /* pt_regs pointer */ 1032 movq %rsp,%rdi /* pt_regs pointer */
@@ -1054,9 +1041,9 @@ END(\sym)
1054ENTRY(\sym) 1041ENTRY(\sym)
1055 INTR_FRAME 1042 INTR_FRAME
1056 PARAVIRT_ADJUST_EXCEPTION_FRAME 1043 PARAVIRT_ADJUST_EXCEPTION_FRAME
1057 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1044 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1058 CFI_ADJUST_CFA_OFFSET 8 1045 subq $ORIG_RAX-R15, %rsp
1059 subq $15*8, %rsp 1046 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1060 call save_paranoid 1047 call save_paranoid
1061 TRACE_IRQS_OFF 1048 TRACE_IRQS_OFF
1062 movq %rsp,%rdi /* pt_regs pointer */ 1049 movq %rsp,%rdi /* pt_regs pointer */
@@ -1072,9 +1059,9 @@ END(\sym)
1072ENTRY(\sym) 1059ENTRY(\sym)
1073 INTR_FRAME 1060 INTR_FRAME
1074 PARAVIRT_ADJUST_EXCEPTION_FRAME 1061 PARAVIRT_ADJUST_EXCEPTION_FRAME
1075 pushq $-1 /* ORIG_RAX: no syscall to restart */ 1062 pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
1076 CFI_ADJUST_CFA_OFFSET 8 1063 subq $ORIG_RAX-R15, %rsp
1077 subq $15*8, %rsp 1064 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1078 call save_paranoid 1065 call save_paranoid
1079 TRACE_IRQS_OFF 1066 TRACE_IRQS_OFF
1080 movq %rsp,%rdi /* pt_regs pointer */ 1067 movq %rsp,%rdi /* pt_regs pointer */
@@ -1091,8 +1078,8 @@ END(\sym)
1091ENTRY(\sym) 1078ENTRY(\sym)
1092 XCPT_FRAME 1079 XCPT_FRAME
1093 PARAVIRT_ADJUST_EXCEPTION_FRAME 1080 PARAVIRT_ADJUST_EXCEPTION_FRAME
1094 subq $15*8,%rsp 1081 subq $ORIG_RAX-R15, %rsp
1095 CFI_ADJUST_CFA_OFFSET 15*8 1082 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1096 call error_entry 1083 call error_entry
1097 DEFAULT_FRAME 0 1084 DEFAULT_FRAME 0
1098 movq %rsp,%rdi /* pt_regs pointer */ 1085 movq %rsp,%rdi /* pt_regs pointer */
@@ -1109,8 +1096,8 @@ END(\sym)
1109ENTRY(\sym) 1096ENTRY(\sym)
1110 XCPT_FRAME 1097 XCPT_FRAME
1111 PARAVIRT_ADJUST_EXCEPTION_FRAME 1098 PARAVIRT_ADJUST_EXCEPTION_FRAME
1112 subq $15*8,%rsp 1099 subq $ORIG_RAX-R15, %rsp
1113 CFI_ADJUST_CFA_OFFSET 15*8 1100 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1114 call save_paranoid 1101 call save_paranoid
1115 DEFAULT_FRAME 0 1102 DEFAULT_FRAME 0
1116 TRACE_IRQS_OFF 1103 TRACE_IRQS_OFF
@@ -1141,16 +1128,14 @@ zeroentry simd_coprocessor_error do_simd_coprocessor_error
1141 /* edi: new selector */ 1128 /* edi: new selector */
1142ENTRY(native_load_gs_index) 1129ENTRY(native_load_gs_index)
1143 CFI_STARTPROC 1130 CFI_STARTPROC
1144 pushf 1131 pushfq_cfi
1145 CFI_ADJUST_CFA_OFFSET 8
1146 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) 1132 DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI)
1147 SWAPGS 1133 SWAPGS
1148gs_change: 1134gs_change:
1149 movl %edi,%gs 1135 movl %edi,%gs
11502: mfence /* workaround */ 11362: mfence /* workaround */
1151 SWAPGS 1137 SWAPGS
1152 popf 1138 popfq_cfi
1153 CFI_ADJUST_CFA_OFFSET -8
1154 ret 1139 ret
1155 CFI_ENDPROC 1140 CFI_ENDPROC
1156END(native_load_gs_index) 1141END(native_load_gs_index)
@@ -1217,8 +1202,7 @@ END(kernel_execve)
1217/* Call softirq on interrupt stack. Interrupts are off. */ 1202/* Call softirq on interrupt stack. Interrupts are off. */
1218ENTRY(call_softirq) 1203ENTRY(call_softirq)
1219 CFI_STARTPROC 1204 CFI_STARTPROC
1220 push %rbp 1205 pushq_cfi %rbp
1221 CFI_ADJUST_CFA_OFFSET 8
1222 CFI_REL_OFFSET rbp,0 1206 CFI_REL_OFFSET rbp,0
1223 mov %rsp,%rbp 1207 mov %rsp,%rbp
1224 CFI_DEF_CFA_REGISTER rbp 1208 CFI_DEF_CFA_REGISTER rbp
@@ -1227,6 +1211,7 @@ ENTRY(call_softirq)
1227 push %rbp # backlink for old unwinder 1211 push %rbp # backlink for old unwinder
1228 call __do_softirq 1212 call __do_softirq
1229 leaveq 1213 leaveq
1214 CFI_RESTORE rbp
1230 CFI_DEF_CFA_REGISTER rsp 1215 CFI_DEF_CFA_REGISTER rsp
1231 CFI_ADJUST_CFA_OFFSET -8 1216 CFI_ADJUST_CFA_OFFSET -8
1232 decl PER_CPU_VAR(irq_count) 1217 decl PER_CPU_VAR(irq_count)
@@ -1270,7 +1255,7 @@ ENTRY(xen_do_hypervisor_callback) # do_hypervisor_callback(struct *pt_regs)
1270 decl PER_CPU_VAR(irq_count) 1255 decl PER_CPU_VAR(irq_count)
1271 jmp error_exit 1256 jmp error_exit
1272 CFI_ENDPROC 1257 CFI_ENDPROC
1273END(do_hypervisor_callback) 1258END(xen_do_hypervisor_callback)
1274 1259
1275/* 1260/*
1276 * Hypervisor uses this for application faults while it executes. 1261 * Hypervisor uses this for application faults while it executes.
@@ -1351,6 +1336,9 @@ errorentry xen_stack_segment do_stack_segment
1351#endif 1336#endif
1352errorentry general_protection do_general_protection 1337errorentry general_protection do_general_protection
1353errorentry page_fault do_page_fault 1338errorentry page_fault do_page_fault
1339#ifdef CONFIG_KVM_GUEST
1340errorentry async_page_fault do_async_page_fault
1341#endif
1354#ifdef CONFIG_X86_MCE 1342#ifdef CONFIG_X86_MCE
1355paranoidzeroentry machine_check *machine_check_vector(%rip) 1343paranoidzeroentry machine_check *machine_check_vector(%rip)
1356#endif 1344#endif
@@ -1370,7 +1358,7 @@ paranoidzeroentry machine_check *machine_check_vector(%rip)
1370 1358
1371 /* ebx: no swapgs flag */ 1359 /* ebx: no swapgs flag */
1372ENTRY(paranoid_exit) 1360ENTRY(paranoid_exit)
1373 INTR_FRAME 1361 DEFAULT_FRAME
1374 DISABLE_INTERRUPTS(CLBR_NONE) 1362 DISABLE_INTERRUPTS(CLBR_NONE)
1375 TRACE_IRQS_OFF 1363 TRACE_IRQS_OFF
1376 testl %ebx,%ebx /* swapgs needed? */ 1364 testl %ebx,%ebx /* swapgs needed? */
@@ -1447,7 +1435,6 @@ error_swapgs:
1447error_sti: 1435error_sti:
1448 TRACE_IRQS_OFF 1436 TRACE_IRQS_OFF
1449 ret 1437 ret
1450 CFI_ENDPROC
1451 1438
1452/* 1439/*
1453 * There are two places in the kernel that can potentially fault with 1440 * There are two places in the kernel that can potentially fault with
@@ -1472,6 +1459,7 @@ bstep_iret:
1472 /* Fix truncated RIP */ 1459 /* Fix truncated RIP */
1473 movq %rcx,RIP+8(%rsp) 1460 movq %rcx,RIP+8(%rsp)
1474 jmp error_swapgs 1461 jmp error_swapgs
1462 CFI_ENDPROC
1475END(error_entry) 1463END(error_entry)
1476 1464
1477 1465
@@ -1500,8 +1488,8 @@ ENTRY(nmi)
1500 INTR_FRAME 1488 INTR_FRAME
1501 PARAVIRT_ADJUST_EXCEPTION_FRAME 1489 PARAVIRT_ADJUST_EXCEPTION_FRAME
1502 pushq_cfi $-1 1490 pushq_cfi $-1
1503 subq $15*8, %rsp 1491 subq $ORIG_RAX-R15, %rsp
1504 CFI_ADJUST_CFA_OFFSET 15*8 1492 CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
1505 call save_paranoid 1493 call save_paranoid
1506 DEFAULT_FRAME 0 1494 DEFAULT_FRAME 0
1507 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ 1495 /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */
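The entry_64.S hunks above fold the open-coded pushq/popq + CFI_ADJUST_CFA_OFFSET pairs into the pushq_cfi/popq_cfi helpers and replace literal frame sizes such as 15*8 with differences of pt_regs save-slot offsets like ORIG_RAX-R15. The sketch below only illustrates why those expressions match; the offset values are an assumption taken from the asm/calling.h layout of this era, and the program itself is not kernel code.

/* Hypothetical check of the pt_regs save-slot offsets assumed by entry_64.S
 * (values as conventionally defined in asm/calling.h; assumed, not quoted). */
#include <assert.h>

enum { R15 = 0, R14 = 8, R13 = 16, R12 = 24, RBP = 32, RBX = 40,
       R11 = 48, R10 = 56, R9 = 64, R8 = 72, RAX = 80, RCX = 88,
       RDX = 96, RSI = 104, RDI = 112, ORIG_RAX = 120 };

int main(void)
{
	/* "subq $ORIG_RAX-R15, %rsp" reserves the same 15 slots as "subq $15*8". */
	assert(ORIG_RAX - R15 == 15 * 8);
	/* The reworked interrupt macro reserves ORIG_RAX-RBP, i.e. 11 slots,
	 * one more than the old 10*8, so rbp gets a pt_regs slot as well. */
	assert(ORIG_RAX - RBP == 11 * 8);
	return 0;
}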
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index cd37469b54ee..c9a281f272fd 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -19,6 +19,7 @@
19#include <linux/sched.h> 19#include <linux/sched.h>
20#include <linux/init.h> 20#include <linux/init.h>
21#include <linux/list.h> 21#include <linux/list.h>
22#include <linux/module.h>
22 23
23#include <trace/syscall.h> 24#include <trace/syscall.h>
24 25
@@ -49,6 +50,7 @@ static DEFINE_PER_CPU(int, save_modifying_code);
49int ftrace_arch_code_modify_prepare(void) 50int ftrace_arch_code_modify_prepare(void)
50{ 51{
51 set_kernel_text_rw(); 52 set_kernel_text_rw();
53 set_all_modules_text_rw();
52 modifying_code = 1; 54 modifying_code = 1;
53 return 0; 55 return 0;
54} 56}
@@ -56,6 +58,7 @@ int ftrace_arch_code_modify_prepare(void)
56int ftrace_arch_code_modify_post_process(void) 58int ftrace_arch_code_modify_post_process(void)
57{ 59{
58 modifying_code = 0; 60 modifying_code = 0;
61 set_all_modules_text_ro();
59 set_kernel_text_ro(); 62 set_kernel_text_ro();
60 return 0; 63 return 0;
61} 64}
@@ -120,7 +123,7 @@ static unsigned char *ftrace_call_replace(unsigned long ip, unsigned long addr)
120static atomic_t nmi_running = ATOMIC_INIT(0); 123static atomic_t nmi_running = ATOMIC_INIT(0);
121static int mod_code_status; /* holds return value of text write */ 124static int mod_code_status; /* holds return value of text write */
122static void *mod_code_ip; /* holds the IP to write to */ 125static void *mod_code_ip; /* holds the IP to write to */
123static void *mod_code_newcode; /* holds the text to write to the IP */ 126static const void *mod_code_newcode; /* holds the text to write to the IP */
124 127
125static unsigned nmi_wait_count; 128static unsigned nmi_wait_count;
126static atomic_t nmi_update_count = ATOMIC_INIT(0); 129static atomic_t nmi_update_count = ATOMIC_INIT(0);
@@ -167,9 +170,9 @@ static void ftrace_mod_code(void)
167 170
168void ftrace_nmi_enter(void) 171void ftrace_nmi_enter(void)
169{ 172{
170 __get_cpu_var(save_modifying_code) = modifying_code; 173 __this_cpu_write(save_modifying_code, modifying_code);
171 174
172 if (!__get_cpu_var(save_modifying_code)) 175 if (!__this_cpu_read(save_modifying_code))
173 return; 176 return;
174 177
175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 178 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
@@ -183,7 +186,7 @@ void ftrace_nmi_enter(void)
183 186
184void ftrace_nmi_exit(void) 187void ftrace_nmi_exit(void)
185{ 188{
186 if (!__get_cpu_var(save_modifying_code)) 189 if (!__this_cpu_read(save_modifying_code))
187 return; 190 return;
188 191
189 /* Finish all executions before clearing nmi_running */ 192 /* Finish all executions before clearing nmi_running */
@@ -222,7 +225,7 @@ within(unsigned long addr, unsigned long start, unsigned long end)
222} 225}
223 226
224static int 227static int
225do_ftrace_mod_code(unsigned long ip, void *new_code) 228do_ftrace_mod_code(unsigned long ip, const void *new_code)
226{ 229{
227 /* 230 /*
228 * On x86_64, kernel text mappings are mapped read-only with 231 * On x86_64, kernel text mappings are mapped read-only with
@@ -257,19 +260,14 @@ do_ftrace_mod_code(unsigned long ip, void *new_code)
257 return mod_code_status; 260 return mod_code_status;
258} 261}
259 262
260 263static const unsigned char *ftrace_nop_replace(void)
261
262
263static unsigned char ftrace_nop[MCOUNT_INSN_SIZE];
264
265static unsigned char *ftrace_nop_replace(void)
266{ 264{
267 return ftrace_nop; 265 return ideal_nops[NOP_ATOMIC5];
268} 266}
269 267
270static int 268static int
271ftrace_modify_code(unsigned long ip, unsigned char *old_code, 269ftrace_modify_code(unsigned long ip, unsigned const char *old_code,
272 unsigned char *new_code) 270 unsigned const char *new_code)
273{ 271{
274 unsigned char replaced[MCOUNT_INSN_SIZE]; 272 unsigned char replaced[MCOUNT_INSN_SIZE];
275 273
@@ -303,7 +301,7 @@ ftrace_modify_code(unsigned long ip, unsigned char *old_code,
303int ftrace_make_nop(struct module *mod, 301int ftrace_make_nop(struct module *mod,
304 struct dyn_ftrace *rec, unsigned long addr) 302 struct dyn_ftrace *rec, unsigned long addr)
305{ 303{
306 unsigned char *new, *old; 304 unsigned const char *new, *old;
307 unsigned long ip = rec->ip; 305 unsigned long ip = rec->ip;
308 306
309 old = ftrace_call_replace(ip, addr); 307 old = ftrace_call_replace(ip, addr);
@@ -314,7 +312,7 @@ int ftrace_make_nop(struct module *mod,
314 312
315int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr) 313int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr)
316{ 314{
317 unsigned char *new, *old; 315 unsigned const char *new, *old;
318 unsigned long ip = rec->ip; 316 unsigned long ip = rec->ip;
319 317
320 old = ftrace_nop_replace(); 318 old = ftrace_nop_replace();
@@ -338,62 +336,6 @@ int ftrace_update_ftrace_func(ftrace_func_t func)
338 336
339int __init ftrace_dyn_arch_init(void *data) 337int __init ftrace_dyn_arch_init(void *data)
340{ 338{
341 extern const unsigned char ftrace_test_p6nop[];
342 extern const unsigned char ftrace_test_nop5[];
343 extern const unsigned char ftrace_test_jmp[];
344 int faulted = 0;
345
346 /*
347 * There is no good nop for all x86 archs.
348 * We will default to using the P6_NOP5, but first we
349 * will test to make sure that the nop will actually
350 * work on this CPU. If it faults, we will then
351 * go to a lesser efficient 5 byte nop. If that fails
352 * we then just use a jmp as our nop. This isn't the most
353 * efficient nop, but we can not use a multi part nop
354 * since we would then risk being preempted in the middle
355 * of that nop, and if we enabled tracing then, it might
356 * cause a system crash.
357 *
358 * TODO: check the cpuid to determine the best nop.
359 */
360 asm volatile (
361 "ftrace_test_jmp:"
362 "jmp ftrace_test_p6nop\n"
363 "nop\n"
364 "nop\n"
365 "nop\n" /* 2 byte jmp + 3 bytes */
366 "ftrace_test_p6nop:"
367 P6_NOP5
368 "jmp 1f\n"
369 "ftrace_test_nop5:"
370 ".byte 0x66,0x66,0x66,0x66,0x90\n"
371 "1:"
372 ".section .fixup, \"ax\"\n"
373 "2: movl $1, %0\n"
374 " jmp ftrace_test_nop5\n"
375 "3: movl $2, %0\n"
376 " jmp 1b\n"
377 ".previous\n"
378 _ASM_EXTABLE(ftrace_test_p6nop, 2b)
379 _ASM_EXTABLE(ftrace_test_nop5, 3b)
380 : "=r"(faulted) : "0" (faulted));
381
382 switch (faulted) {
383 case 0:
384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
386 break;
387 case 1:
388 pr_info("converting mcount calls to 66 66 66 66 90\n");
389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
390 break;
391 case 2:
392 pr_info("converting mcount calls to jmp . + 5\n");
393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
394 break;
395 }
396
397 /* The return code is retured via data */ 339 /* The return code is retured via data */
398 *(unsigned long *)data = 0; 340 *(unsigned long *)data = 0;
399 341
@@ -495,18 +437,19 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
495 return; 437 return;
496 } 438 }
497 439
498 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
499 frame_pointer) == -EBUSY) {
500 *parent = old;
501 return;
502 }
503
504 trace.func = self_addr; 440 trace.func = self_addr;
441 trace.depth = current->curr_ret_stack + 1;
505 442
506 /* Only trace if the calling function expects to */ 443 /* Only trace if the calling function expects to */
507 if (!ftrace_graph_entry(&trace)) { 444 if (!ftrace_graph_entry(&trace)) {
508 current->curr_ret_stack--;
509 *parent = old; 445 *parent = old;
446 return;
447 }
448
449 if (ftrace_push_return_trace(old, self_addr, &trace.depth,
450 frame_pointer) == -EBUSY) {
451 *parent = old;
452 return;
510 } 453 }
511} 454}
512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 455#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
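Most of the ftrace.c delta is mechanical: __get_cpu_var() accesses become __this_cpu_read()/__this_cpu_write(), the boot-time nop probing is dropped in favour of ideal_nops[NOP_ATOMIC5] selected by the alternatives code, and prepare_ftrace_return() now asks ftrace_graph_entry() before pushing the return trace, so a rejected entry no longer has to be unwound. A minimal sketch of the per-CPU accessor conversion, using a made-up demo_flag variable in place of save_modifying_code:

#include <linux/percpu.h>

static DEFINE_PER_CPU(int, demo_flag);	/* illustrative only */

static void demo_store(int value)
{
	/* Old style: form the address of this CPU's slot, then store through it. */
	__get_cpu_var(demo_flag) = value;

	/* New style: a single segment-relative access, no address formed. */
	__this_cpu_write(demo_flag, value);
}

static int demo_load(void)
{
	return __this_cpu_read(demo_flag);
}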
diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c
index 3e66bd364a9d..af0699ba48cf 100644
--- a/arch/x86/kernel/head.c
+++ b/arch/x86/kernel/head.c
@@ -1,5 +1,6 @@
1#include <linux/kernel.h> 1#include <linux/kernel.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <linux/memblock.h>
3 4
4#include <asm/setup.h> 5#include <asm/setup.h>
5#include <asm/bios_ebda.h> 6#include <asm/bios_ebda.h>
@@ -51,5 +52,5 @@ void __init reserve_ebda_region(void)
51 lowmem = 0x9f000; 52 lowmem = 0x9f000;
52 53
53 /* reserve all memory between lowmem and the 1MB mark */ 54 /* reserve all memory between lowmem and the 1MB mark */
54 reserve_early_overlap_ok(lowmem, 0x100000, "BIOS reserved"); 55 memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
55} 56}
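head.c (like head32.c and head64.c below) moves the early reservations from reserve_early()/reserve_early_overlap_ok() to the memblock API; memblock_init() has to run first, which is exactly what the i386_start_kernel() and x86_64_start_reservations() hunks below add. A sketch of the converted EBDA reservation, assuming the memblock_x86_reserve_range() wrapper of this era, which takes an exclusive end address:

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/memblock.h>

/* Sketch only: reserve everything between low memory and the 1 MB mark. */
static void __init demo_reserve_ebda(void)
{
	unsigned long lowmem = 0x9f000;

	memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved");
}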
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 784360c0625c..3bb08509a7a1 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -8,6 +8,7 @@
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h> 10#include <linux/mm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/setup.h> 13#include <asm/setup.h>
13#include <asm/sections.h> 14#include <asm/sections.h>
@@ -17,11 +18,11 @@
17#include <asm/apic.h> 18#include <asm/apic.h>
18#include <asm/io_apic.h> 19#include <asm/io_apic.h>
19#include <asm/bios_ebda.h> 20#include <asm/bios_ebda.h>
21#include <asm/tlbflush.h>
20 22
21static void __init i386_default_early_setup(void) 23static void __init i386_default_early_setup(void)
22{ 24{
23 /* Initialize 32bit specific setup functions */ 25 /* Initialize 32bit specific setup functions */
24 x86_init.resources.probe_roms = probe_roms;
25 x86_init.resources.reserve_resources = i386_reserve_resources; 26 x86_init.resources.reserve_resources = i386_reserve_resources;
26 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc; 27 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc;
27 28
@@ -30,17 +31,9 @@ static void __init i386_default_early_setup(void)
30 31
31void __init i386_start_kernel(void) 32void __init i386_start_kernel(void)
32{ 33{
33#ifdef CONFIG_X86_TRAMPOLINE 34 memblock_init();
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
42 35
43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 36 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
44 37
45#ifdef CONFIG_BLK_DEV_INITRD 38#ifdef CONFIG_BLK_DEV_INITRD
46 /* Reserve INITRD */ 39 /* Reserve INITRD */
@@ -49,7 +42,7 @@ void __init i386_start_kernel(void)
49 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 42 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
50 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 43 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 44 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 45 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
53 } 46 }
54#endif 47#endif
55 48
@@ -58,6 +51,9 @@ void __init i386_start_kernel(void)
58 case X86_SUBARCH_MRST: 51 case X86_SUBARCH_MRST:
59 x86_mrst_early_setup(); 52 x86_mrst_early_setup();
60 break; 53 break;
54 case X86_SUBARCH_CE4100:
55 x86_ce4100_early_setup();
56 break;
61 default: 57 default:
62 i386_default_early_setup(); 58 i386_default_early_setup();
63 break; 59 break;
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 7147143fd614..5655c2272adb 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -12,6 +12,7 @@
12#include <linux/percpu.h> 12#include <linux/percpu.h>
13#include <linux/start_kernel.h> 13#include <linux/start_kernel.h>
14#include <linux/io.h> 14#include <linux/io.h>
15#include <linux/memblock.h>
15 16
16#include <asm/processor.h> 17#include <asm/processor.h>
17#include <asm/proto.h> 18#include <asm/proto.h>
@@ -76,8 +77,7 @@ void __init x86_64_start_kernel(char * real_mode_data)
76 /* Make NULL pointers segfault */ 77 /* Make NULL pointers segfault */
77 zap_identity_mappings(); 78 zap_identity_mappings();
78 79
79 /* Cleanup the over mapped high alias */ 80 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
80 cleanup_highmap();
81 81
82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) { 82 for (i = 0; i < NUM_EXCEPTION_VECTORS; i++) {
83#ifdef CONFIG_EARLY_PRINTK 83#ifdef CONFIG_EARLY_PRINTK
@@ -98,7 +98,9 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 memblock_init();
102
103 memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
102 104
103#ifdef CONFIG_BLK_DEV_INITRD 105#ifdef CONFIG_BLK_DEV_INITRD
104 /* Reserve INITRD */ 106 /* Reserve INITRD */
@@ -107,7 +109,7 @@ void __init x86_64_start_reservations(char *real_mode_data)
107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 109 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 110 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); 111 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 112 memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK");
111 } 113 }
112#endif 114#endif
113 115
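In head64.c the cleanup_highmap() call is dropped from this path and max_pfn_mapped is seeded directly from the fixed kernel-image mapping. As a rough worked number, assuming the usual KERNEL_IMAGE_SIZE of 512 MiB and 4 KiB pages (neither value appears in this hunk, so treat both as assumptions):

#include <stdio.h>

int main(void)
{
	unsigned long kernel_image_size = 512UL << 20;		/* assumed KERNEL_IMAGE_SIZE */
	unsigned long max_pfn_mapped    = kernel_image_size >> 12;	/* PAGE_SHIFT = 12 */

	/* 131072 page frames, i.e. the first 512 MiB of the kernel mapping. */
	printf("max_pfn_mapped = %lu\n", max_pfn_mapped);
	return 0;
}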
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index fa8c1b8e09fb..ce0be7cd085e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -60,18 +60,20 @@
60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD) 60#define PAGE_TABLE_SIZE(pages) ((pages) / PTRS_PER_PGD)
61#endif 61#endif
62 62
63/* Number of possible pages in the lowmem region */
64LOWMEM_PAGES = (((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT)
65
63/* Enough space to fit pagetables for the low memory linear map */ 66/* Enough space to fit pagetables for the low memory linear map */
64MAPPING_BEYOND_END = \ 67MAPPING_BEYOND_END = PAGE_TABLE_SIZE(LOWMEM_PAGES) << PAGE_SHIFT
65 PAGE_TABLE_SIZE(((1<<32) - __PAGE_OFFSET) >> PAGE_SHIFT) << PAGE_SHIFT
66 68
67/* 69/*
68 * Worst-case size of the kernel mapping we need to make: 70 * Worst-case size of the kernel mapping we need to make:
69 * the worst-case size of the kernel itself, plus the extra we need 71 * a relocatable kernel can live anywhere in lowmem, so we need to be able
70 * to map for the linear map. 72 * to map all of lowmem.
71 */ 73 */
72KERNEL_PAGES = (KERNEL_IMAGE_SIZE + MAPPING_BEYOND_END)>>PAGE_SHIFT 74KERNEL_PAGES = LOWMEM_PAGES
73 75
74INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE_asm 76INIT_MAP_SIZE = PAGE_TABLE_SIZE(KERNEL_PAGES) * PAGE_SIZE
75RESERVE_BRK(pagetables, INIT_MAP_SIZE) 77RESERVE_BRK(pagetables, INIT_MAP_SIZE)
76 78
77/* 79/*
@@ -83,6 +85,8 @@ RESERVE_BRK(pagetables, INIT_MAP_SIZE)
83 */ 85 */
84__HEAD 86__HEAD
85ENTRY(startup_32) 87ENTRY(startup_32)
88 movl pa(stack_start),%ecx
89
86 /* test KEEP_SEGMENTS flag to see if the bootloader is asking 90 /* test KEEP_SEGMENTS flag to see if the bootloader is asking
87 us to not reload segments */ 91 us to not reload segments */
88 testb $(1<<6), BP_loadflags(%esi) 92 testb $(1<<6), BP_loadflags(%esi)
@@ -97,7 +101,9 @@ ENTRY(startup_32)
97 movl %eax,%es 101 movl %eax,%es
98 movl %eax,%fs 102 movl %eax,%fs
99 movl %eax,%gs 103 movl %eax,%gs
104 movl %eax,%ss
1002: 1052:
106 leal -__PAGE_OFFSET(%ecx),%esp
101 107
102/* 108/*
103 * Clear BSS first so that there are no surprises... 109 * Clear BSS first so that there are no surprises...
@@ -124,72 +130,35 @@ ENTRY(startup_32)
124 movsl 130 movsl
125 movl pa(boot_params) + NEW_CL_POINTER,%esi 131 movl pa(boot_params) + NEW_CL_POINTER,%esi
126 andl %esi,%esi 132 andl %esi,%esi
127 jz 1f # No comand line 133 jz 1f # No command line
128 movl $pa(boot_command_line),%edi 134 movl $pa(boot_command_line),%edi
129 movl $(COMMAND_LINE_SIZE/4),%ecx 135 movl $(COMMAND_LINE_SIZE/4),%ecx
130 rep 136 rep
131 movsl 137 movsl
1321: 1381:
133 139
134#ifdef CONFIG_OLPC_OPENFIRMWARE 140#ifdef CONFIG_OLPC
135 /* save OFW's pgdir table for later use when calling into OFW */ 141 /* save OFW's pgdir table for later use when calling into OFW */
136 movl %cr3, %eax 142 movl %cr3, %eax
137 movl %eax, pa(olpc_ofw_pgd) 143 movl %eax, pa(olpc_ofw_pgd)
138#endif 144#endif
139 145
140#ifdef CONFIG_PARAVIRT
141 /* This is can only trip for a broken bootloader... */
142 cmpw $0x207, pa(boot_params + BP_version)
143 jb default_entry
144
145 /* Paravirt-compatible boot parameters. Look to see what architecture
146 we're booting under. */
147 movl pa(boot_params + BP_hardware_subarch), %eax
148 cmpl $num_subarch_entries, %eax
149 jae bad_subarch
150
151 movl pa(subarch_entries)(,%eax,4), %eax
152 subl $__PAGE_OFFSET, %eax
153 jmp *%eax
154
155bad_subarch:
156WEAK(lguest_entry)
157WEAK(xen_entry)
158 /* Unknown implementation; there's really
159 nothing we can do at this point. */
160 ud2a
161
162 __INITDATA
163
164subarch_entries:
165 .long default_entry /* normal x86/PC */
166 .long lguest_entry /* lguest hypervisor */
167 .long xen_entry /* Xen hypervisor */
168 .long default_entry /* Moorestown MID */
169num_subarch_entries = (. - subarch_entries) / 4
170.previous
171#endif /* CONFIG_PARAVIRT */
172
173/* 146/*
174 * Initialize page tables. This creates a PDE and a set of page 147 * Initialize page tables. This creates a PDE and a set of page
175 * tables, which are located immediately beyond __brk_base. The variable 148 * tables, which are located immediately beyond __brk_base. The variable
176 * _brk_end is set up to point to the first "safe" location. 149 * _brk_end is set up to point to the first "safe" location.
177 * Mappings are created both at virtual address 0 (identity mapping) 150 * Mappings are created both at virtual address 0 (identity mapping)
178 * and PAGE_OFFSET for up to _end. 151 * and PAGE_OFFSET for up to _end.
179 *
180 * Note that the stack is not yet set up!
181 */ 152 */
182default_entry:
183#ifdef CONFIG_X86_PAE 153#ifdef CONFIG_X86_PAE
184 154
185 /* 155 /*
186 * In PAE mode swapper_pg_dir is statically defined to contain enough 156 * In PAE mode initial_page_table is statically defined to contain
187 * entries to cover the VMSPLIT option (that is the top 1, 2 or 3 157 * enough entries to cover the VMSPLIT option (that is the top 1, 2 or 3
188 * entries). The identity mapping is handled by pointing two PGD 158 * entries). The identity mapping is handled by pointing two PGD entries
189 * entries to the first kernel PMD. 159 * to the first kernel PMD.
190 * 160 *
191 * Note the upper half of each PMD or PTE are always zero at 161 * Note the upper half of each PMD or PTE are always zero at this stage.
192 * this stage.
193 */ 162 */
194 163
195#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */ 164#define KPMDS (((-__PAGE_OFFSET) >> 30) & 3) /* Number of kernel PMDs */
@@ -197,7 +166,7 @@ default_entry:
197 xorl %ebx,%ebx /* %ebx is kept at zero */ 166 xorl %ebx,%ebx /* %ebx is kept at zero */
198 167
199 movl $pa(__brk_base), %edi 168 movl $pa(__brk_base), %edi
200 movl $pa(swapper_pg_pmd), %edx 169 movl $pa(initial_pg_pmd), %edx
201 movl $PTE_IDENT_ATTR, %eax 170 movl $PTE_IDENT_ATTR, %eax
20210: 17110:
203 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */ 172 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PMD entry */
@@ -226,14 +195,14 @@ default_entry:
226 movl %eax, pa(max_pfn_mapped) 195 movl %eax, pa(max_pfn_mapped)
227 196
228 /* Do early initialization of the fixmap area */ 197 /* Do early initialization of the fixmap area */
229 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 198 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
230 movl %eax,pa(swapper_pg_pmd+0x1000*KPMDS-8) 199 movl %eax,pa(initial_pg_pmd+0x1000*KPMDS-8)
231#else /* Not PAE */ 200#else /* Not PAE */
232 201
233page_pde_offset = (__PAGE_OFFSET >> 20); 202page_pde_offset = (__PAGE_OFFSET >> 20);
234 203
235 movl $pa(__brk_base), %edi 204 movl $pa(__brk_base), %edi
236 movl $pa(swapper_pg_dir), %edx 205 movl $pa(initial_page_table), %edx
237 movl $PTE_IDENT_ATTR, %eax 206 movl $PTE_IDENT_ATTR, %eax
23810: 20710:
239 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */ 208 leal PDE_IDENT_ATTR(%edi),%ecx /* Create PDE entry */
@@ -257,10 +226,45 @@ page_pde_offset = (__PAGE_OFFSET >> 20);
257 movl %eax, pa(max_pfn_mapped) 226 movl %eax, pa(max_pfn_mapped)
258 227
259 /* Do early initialization of the fixmap area */ 228 /* Do early initialization of the fixmap area */
260 movl $pa(swapper_pg_fixmap)+PDE_IDENT_ATTR,%eax 229 movl $pa(initial_pg_fixmap)+PDE_IDENT_ATTR,%eax
261 movl %eax,pa(swapper_pg_dir+0xffc) 230 movl %eax,pa(initial_page_table+0xffc)
262#endif 231#endif
263 jmp 3f 232
233#ifdef CONFIG_PARAVIRT
234 /* This is can only trip for a broken bootloader... */
235 cmpw $0x207, pa(boot_params + BP_version)
236 jb default_entry
237
238 /* Paravirt-compatible boot parameters. Look to see what architecture
239 we're booting under. */
240 movl pa(boot_params + BP_hardware_subarch), %eax
241 cmpl $num_subarch_entries, %eax
242 jae bad_subarch
243
244 movl pa(subarch_entries)(,%eax,4), %eax
245 subl $__PAGE_OFFSET, %eax
246 jmp *%eax
247
248bad_subarch:
249WEAK(lguest_entry)
250WEAK(xen_entry)
251 /* Unknown implementation; there's really
252 nothing we can do at this point. */
253 ud2a
254
255 __INITDATA
256
257subarch_entries:
258 .long default_entry /* normal x86/PC */
259 .long lguest_entry /* lguest hypervisor */
260 .long xen_entry /* Xen hypervisor */
261 .long default_entry /* Moorestown MID */
262num_subarch_entries = (. - subarch_entries) / 4
263.previous
264#else
265 jmp default_entry
266#endif /* CONFIG_PARAVIRT */
267
264/* 268/*
265 * Non-boot CPU entry point; entered from trampoline.S 269 * Non-boot CPU entry point; entered from trampoline.S
266 * We can't lgdt here, because lgdt itself uses a data segment, but 270 * We can't lgdt here, because lgdt itself uses a data segment, but
@@ -280,8 +284,11 @@ ENTRY(startup_32_smp)
280 movl %eax,%es 284 movl %eax,%es
281 movl %eax,%fs 285 movl %eax,%fs
282 movl %eax,%gs 286 movl %eax,%gs
287 movl pa(stack_start),%ecx
288 movl %eax,%ss
289 leal -__PAGE_OFFSET(%ecx),%esp
283#endif /* CONFIG_SMP */ 290#endif /* CONFIG_SMP */
2843: 291default_entry:
285 292
286/* 293/*
287 * New page tables may be in 4Mbyte page mode and may 294 * New page tables may be in 4Mbyte page mode and may
@@ -315,6 +322,10 @@ ENTRY(startup_32_smp)
315 subl $0x80000001, %eax 322 subl $0x80000001, %eax
316 cmpl $(0x8000ffff-0x80000001), %eax 323 cmpl $(0x8000ffff-0x80000001), %eax
317 ja 6f 324 ja 6f
325
326 /* Clear bogus XD_DISABLE bits */
327 call verify_cpu
328
318 mov $0x80000001, %eax 329 mov $0x80000001, %eax
319 cpuid 330 cpuid
320 /* Execute Disable bit supported? */ 331 /* Execute Disable bit supported? */
@@ -334,15 +345,15 @@ ENTRY(startup_32_smp)
334/* 345/*
335 * Enable paging 346 * Enable paging
336 */ 347 */
337 movl pa(initial_page_table), %eax 348 movl $pa(initial_page_table), %eax
338 movl %eax,%cr3 /* set the page table pointer.. */ 349 movl %eax,%cr3 /* set the page table pointer.. */
339 movl %cr0,%eax 350 movl %cr0,%eax
340 orl $X86_CR0_PG,%eax 351 orl $X86_CR0_PG,%eax
341 movl %eax,%cr0 /* ..and set paging (PG) bit */ 352 movl %eax,%cr0 /* ..and set paging (PG) bit */
342 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */ 353 ljmp $__BOOT_CS,$1f /* Clear prefetch and normalize %eip */
3431: 3541:
344 /* Set up the stack pointer */ 355 /* Shift the stack pointer to a virtual address */
345 lss stack_start,%esp 356 addl $__PAGE_OFFSET, %esp
346 357
347/* 358/*
348 * Initialize eflags. Some BIOS's leave bits like NT set. This would 359 * Initialize eflags. Some BIOS's leave bits like NT set. This would
@@ -354,9 +365,7 @@ ENTRY(startup_32_smp)
354 365
355#ifdef CONFIG_SMP 366#ifdef CONFIG_SMP
356 cmpb $0, ready 367 cmpb $0, ready
357 jz 1f /* Initial CPU cleans BSS */ 368 jnz checkCPUtype
358 jmp checkCPUtype
3591:
360#endif /* CONFIG_SMP */ 369#endif /* CONFIG_SMP */
361 370
362/* 371/*
@@ -464,14 +473,7 @@ is386: movl $2,%ecx # set MP
464 473
465 cld # gcc2 wants the direction flag cleared at all times 474 cld # gcc2 wants the direction flag cleared at all times
466 pushl $0 # fake return address for unwinder 475 pushl $0 # fake return address for unwinder
467#ifdef CONFIG_SMP
468 movb ready, %cl
469 movb $1, ready 476 movb $1, ready
470 cmpb $0,%cl # the first CPU calls start_kernel
471 je 1f
472 movl (stack_start), %esp
4731:
474#endif /* CONFIG_SMP */
475 jmp *(initial_code) 477 jmp *(initial_code)
476 478
477/* 479/*
@@ -610,33 +612,31 @@ ignore_int:
610#endif 612#endif
611 iret 613 iret
612 614
615#include "verify_cpu.S"
616
613 __REFDATA 617 __REFDATA
614.align 4 618.align 4
615ENTRY(initial_code) 619ENTRY(initial_code)
616 .long i386_start_kernel 620 .long i386_start_kernel
617ENTRY(initial_page_table)
618 .long pa(swapper_pg_dir)
619 621
620/* 622/*
621 * BSS section 623 * BSS section
622 */ 624 */
623__PAGE_ALIGNED_BSS 625__PAGE_ALIGNED_BSS
624 .align PAGE_SIZE_asm 626 .align PAGE_SIZE
625#ifdef CONFIG_X86_PAE 627#ifdef CONFIG_X86_PAE
626swapper_pg_pmd: 628initial_pg_pmd:
627 .fill 1024*KPMDS,4,0 629 .fill 1024*KPMDS,4,0
628#else 630#else
629ENTRY(swapper_pg_dir) 631ENTRY(initial_page_table)
630 .fill 1024,4,0 632 .fill 1024,4,0
631#endif 633#endif
632swapper_pg_fixmap: 634initial_pg_fixmap:
633 .fill 1024,4,0
634#ifdef CONFIG_X86_TRAMPOLINE
635ENTRY(trampoline_pg_dir)
636 .fill 1024,4,0 635 .fill 1024,4,0
637#endif
638ENTRY(empty_zero_page) 636ENTRY(empty_zero_page)
639 .fill 4096,1,0 637 .fill 4096,1,0
638ENTRY(swapper_pg_dir)
639 .fill 1024,4,0
640 640
641/* 641/*
642 * This starts the data section. 642 * This starts the data section.
@@ -644,37 +644,37 @@ ENTRY(empty_zero_page)
644#ifdef CONFIG_X86_PAE 644#ifdef CONFIG_X86_PAE
645__PAGE_ALIGNED_DATA 645__PAGE_ALIGNED_DATA
646 /* Page-aligned for the benefit of paravirt? */ 646 /* Page-aligned for the benefit of paravirt? */
647 .align PAGE_SIZE_asm 647 .align PAGE_SIZE
648ENTRY(swapper_pg_dir) 648ENTRY(initial_page_table)
649 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */ 649 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0 /* low identity map */
650# if KPMDS == 3 650# if KPMDS == 3
651 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 651 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
652 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 652 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
653 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x2000),0 653 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x2000),0
654# elif KPMDS == 2 654# elif KPMDS == 2
655 .long 0,0 655 .long 0,0
656 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 656 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
657 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR+0x1000),0 657 .long pa(initial_pg_pmd+PGD_IDENT_ATTR+0x1000),0
658# elif KPMDS == 1 658# elif KPMDS == 1
659 .long 0,0 659 .long 0,0
660 .long 0,0 660 .long 0,0
661 .long pa(swapper_pg_pmd+PGD_IDENT_ATTR),0 661 .long pa(initial_pg_pmd+PGD_IDENT_ATTR),0
662# else 662# else
663# error "Kernel PMDs should be 1, 2 or 3" 663# error "Kernel PMDs should be 1, 2 or 3"
664# endif 664# endif
665 .align PAGE_SIZE_asm /* needs to be page-sized too */ 665 .align PAGE_SIZE /* needs to be page-sized too */
666#endif 666#endif
667 667
668.data 668.data
669.balign 4
669ENTRY(stack_start) 670ENTRY(stack_start)
670 .long init_thread_union+THREAD_SIZE 671 .long init_thread_union+THREAD_SIZE
671 .long __BOOT_DS
672
673ready: .byte 0
674 672
675early_recursion_flag: 673early_recursion_flag:
676 .long 0 674 .long 0
677 675
676ready: .byte 0
677
678int_msg: 678int_msg:
679 .asciz "Unknown interrupt or fault at: %p %p %p\n" 679 .asciz "Unknown interrupt or fault at: %p %p %p\n"
680 680
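The new LOWMEM_PAGES / MAPPING_BEYOND_END definitions in head_32.S size the brk pagetable reservation for all of lowmem, since a relocatable kernel can sit anywhere in it. Working the arithmetic for the default 3G/1G split with 4 KiB pages and non-PAE paging (PTRS_PER_PGD = 1024); all three parameters are configuration-dependent assumptions:

#include <stdio.h>

int main(void)
{
	unsigned long long page_offset  = 0xC0000000ULL;		/* assumed __PAGE_OFFSET */
	unsigned long long lowmem_pages = ((1ULL << 32) - page_offset) >> 12;
	unsigned long long page_tables  = lowmem_pages / 1024;		/* PAGE_TABLE_SIZE() */
	unsigned long long init_map     = page_tables << 12;		/* RESERVE_BRK size */

	/* 262144 lowmem pages -> 256 page tables -> 1 MiB of brk space. */
	printf("%llu pages, %llu tables, %llu bytes\n",
	       lowmem_pages, page_tables, init_map);
	return 0;
}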
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 239046bd447f..e11e39478a49 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -136,10 +136,9 @@ ident_complete:
136 /* Fixup phys_base */ 136 /* Fixup phys_base */
137 addq %rbp, phys_base(%rip) 137 addq %rbp, phys_base(%rip)
138 138
139#ifdef CONFIG_X86_TRAMPOLINE 139 /* Fixup trampoline */
140 addq %rbp, trampoline_level4_pgt + 0(%rip) 140 addq %rbp, trampoline_level4_pgt + 0(%rip)
141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip) 141 addq %rbp, trampoline_level4_pgt + (511*8)(%rip)
142#endif
143 142
144 /* Due to ENTRY(), sometimes the empty space gets filled with 143 /* Due to ENTRY(), sometimes the empty space gets filled with
145 * zeros. Better take a jmp than relying on empty space being 144 * zeros. Better take a jmp than relying on empty space being
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index 7494999141b3..6781765b3a0d 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -27,6 +27,9 @@
27#define HPET_DEV_FSB_CAP 0x1000 27#define HPET_DEV_FSB_CAP 0x1000
28#define HPET_DEV_PERI_CAP 0x2000 28#define HPET_DEV_PERI_CAP 0x2000
29 29
30#define HPET_MIN_CYCLES 128
31#define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1))
32
30#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) 33#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt)
31 34
32/* 35/*
@@ -214,7 +217,7 @@ static void hpet_reserve_platform_timers(unsigned int id) { }
214/* 217/*
215 * Common hpet info 218 * Common hpet info
216 */ 219 */
217static unsigned long hpet_period; 220static unsigned long hpet_freq;
218 221
219static void hpet_legacy_set_mode(enum clock_event_mode mode, 222static void hpet_legacy_set_mode(enum clock_event_mode mode,
220 struct clock_event_device *evt); 223 struct clock_event_device *evt);
@@ -229,7 +232,6 @@ static struct clock_event_device hpet_clockevent = {
229 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 232 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
230 .set_mode = hpet_legacy_set_mode, 233 .set_mode = hpet_legacy_set_mode,
231 .set_next_event = hpet_legacy_next_event, 234 .set_next_event = hpet_legacy_next_event,
232 .shift = 32,
233 .irq = 0, 235 .irq = 0,
234 .rating = 50, 236 .rating = 50,
235}; 237};
@@ -287,27 +289,12 @@ static void hpet_legacy_clockevent_register(void)
287 hpet_enable_legacy_int(); 289 hpet_enable_legacy_int();
288 290
289 /* 291 /*
290 * The mult factor is defined as (include/linux/clockchips.h)
291 * mult/2^shift = cyc/ns (in contrast to ns/cyc in clocksource.h)
292 * hpet_period is in units of femtoseconds (per cycle), so
293 * mult/2^shift = cyc/ns = 10^6/hpet_period
294 * mult = (10^6 * 2^shift)/hpet_period
295 * mult = (FSEC_PER_NSEC << hpet_clockevent.shift)/hpet_period
296 */
297 hpet_clockevent.mult = div_sc((unsigned long) FSEC_PER_NSEC,
298 hpet_period, hpet_clockevent.shift);
299 /* Calculate the min / max delta */
300 hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
301 &hpet_clockevent);
302 /* 5 usec minimum reprogramming delta. */
303 hpet_clockevent.min_delta_ns = 5000;
304
305 /*
306 * Start hpet with the boot cpu mask and make it 292 * Start hpet with the boot cpu mask and make it
307 * global after the IO_APIC has been initialized. 293 * global after the IO_APIC has been initialized.
308 */ 294 */
309 hpet_clockevent.cpumask = cpumask_of(smp_processor_id()); 295 hpet_clockevent.cpumask = cpumask_of(smp_processor_id());
310 clockevents_register_device(&hpet_clockevent); 296 clockevents_config_and_register(&hpet_clockevent, hpet_freq,
297 HPET_MIN_PROG_DELTA, 0x7FFFFFFF);
311 global_clock_event = &hpet_clockevent; 298 global_clock_event = &hpet_clockevent;
312 printk(KERN_DEBUG "hpet clockevent registered\n"); 299 printk(KERN_DEBUG "hpet clockevent registered\n");
313} 300}
@@ -380,44 +367,37 @@ static int hpet_next_event(unsigned long delta,
380 struct clock_event_device *evt, int timer) 367 struct clock_event_device *evt, int timer)
381{ 368{
382 u32 cnt; 369 u32 cnt;
370 s32 res;
383 371
384 cnt = hpet_readl(HPET_COUNTER); 372 cnt = hpet_readl(HPET_COUNTER);
385 cnt += (u32) delta; 373 cnt += (u32) delta;
386 hpet_writel(cnt, HPET_Tn_CMP(timer)); 374 hpet_writel(cnt, HPET_Tn_CMP(timer));
387 375
388 /* 376 /*
389 * We need to read back the CMP register on certain HPET 377 * HPETs are a complete disaster. The compare register is
390 * implementations (ATI chipsets) which seem to delay the 378 * based on a equal comparison and neither provides a less
391 * transfer of the compare register into the internal compare 379 * than or equal functionality (which would require to take
392 * logic. With small deltas this might actually be too late as 380 * the wraparound into account) nor a simple count down event
393 * the counter could already be higher than the compare value 381 * mode. Further the write to the comparator register is
394 * at that point and we would wait for the next hpet interrupt 382 * delayed internally up to two HPET clock cycles in certain
395 * forever. We found out that reading the CMP register back 383 * chipsets (ATI, ICH9,10). Some newer AMD chipsets have even
396 * forces the transfer so we can rely on the comparison with 384 * longer delays. We worked around that by reading back the
397 * the counter register below. If the read back from the 385 * compare register, but that required another workaround for
398 * compare register does not match the value we programmed 386 * ICH9,10 chips where the first readout after write can
399 * then we might have a real hardware problem. We can not do 387 * return the old stale value. We already had a minimum
400 * much about it here, but at least alert the user/admin with 388 * programming delta of 5us enforced, but a NMI or SMI hitting
401 * a prominent warning. 389 * between the counter readout and the comparator write can
402 * 390 * move us behind that point easily. Now instead of reading
403 * An erratum on some chipsets (ICH9,..), results in 391 * the compare register back several times, we make the ETIME
404 * comparator read immediately following a write returning old 392 * decision based on the following: Return ETIME if the
405 * value. Workaround for this is to read this value second 393 * counter value after the write is less than HPET_MIN_CYCLES
406 * time, when first read returns old value. 394 * away from the event or if the counter is already ahead of
407 * 395 * the event. The minimum programming delta for the generic
408 * In fact the write to the comparator register is delayed up 396 * clockevents code is set to 1.5 * HPET_MIN_CYCLES.
409 * to two HPET cycles so the workaround we tried to restrict
410 * the readback to those known to be borked ATI chipsets
411 * failed miserably. So we give up on optimizations forever
412 * and penalize all HPET incarnations unconditionally.
413 */ 397 */
414 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) { 398 res = (s32)(cnt - hpet_readl(HPET_COUNTER));
415 if (hpet_readl(HPET_Tn_CMP(timer)) != cnt)
416 printk_once(KERN_WARNING
417 "hpet: compare register read back failed.\n");
418 }
419 399
420 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 400 return res < HPET_MIN_CYCLES ? -ETIME : 0;
421} 401}
422 402
423static void hpet_legacy_set_mode(enum clock_event_mode mode, 403static void hpet_legacy_set_mode(enum clock_event_mode mode,
@@ -440,9 +420,9 @@ static int hpet_legacy_next_event(unsigned long delta,
440static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev); 420static DEFINE_PER_CPU(struct hpet_dev *, cpu_hpet_dev);
441static struct hpet_dev *hpet_devs; 421static struct hpet_dev *hpet_devs;
442 422
443void hpet_msi_unmask(unsigned int irq) 423void hpet_msi_unmask(struct irq_data *data)
444{ 424{
445 struct hpet_dev *hdev = get_irq_data(irq); 425 struct hpet_dev *hdev = data->handler_data;
446 unsigned int cfg; 426 unsigned int cfg;
447 427
448 /* unmask it */ 428 /* unmask it */
@@ -451,10 +431,10 @@ void hpet_msi_unmask(unsigned int irq)
451 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 431 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
452} 432}
453 433
454void hpet_msi_mask(unsigned int irq) 434void hpet_msi_mask(struct irq_data *data)
455{ 435{
436 struct hpet_dev *hdev = data->handler_data;
456 unsigned int cfg; 437 unsigned int cfg;
457 struct hpet_dev *hdev = get_irq_data(irq);
458 438
459 /* mask it */ 439 /* mask it */
460 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 440 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -462,18 +442,14 @@ void hpet_msi_mask(unsigned int irq)
462 hpet_writel(cfg, HPET_Tn_CFG(hdev->num)); 442 hpet_writel(cfg, HPET_Tn_CFG(hdev->num));
463} 443}
464 444
465void hpet_msi_write(unsigned int irq, struct msi_msg *msg) 445void hpet_msi_write(struct hpet_dev *hdev, struct msi_msg *msg)
466{ 446{
467 struct hpet_dev *hdev = get_irq_data(irq);
468
469 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num)); 447 hpet_writel(msg->data, HPET_Tn_ROUTE(hdev->num));
470 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4); 448 hpet_writel(msg->address_lo, HPET_Tn_ROUTE(hdev->num) + 4);
471} 449}
472 450
473void hpet_msi_read(unsigned int irq, struct msi_msg *msg) 451void hpet_msi_read(struct hpet_dev *hdev, struct msi_msg *msg)
474{ 452{
475 struct hpet_dev *hdev = get_irq_data(irq);
476
477 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num)); 453 msg->data = hpet_readl(HPET_Tn_ROUTE(hdev->num));
478 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4); 454 msg->address_lo = hpet_readl(HPET_Tn_ROUTE(hdev->num) + 4);
479 msg->address_hi = 0; 455 msg->address_hi = 0;
@@ -510,7 +486,7 @@ static int hpet_assign_irq(struct hpet_dev *dev)
510 if (!irq) 486 if (!irq)
511 return -EINVAL; 487 return -EINVAL;
512 488
513 set_irq_data(irq, dev); 489 irq_set_handler_data(irq, dev);
514 490
515 if (hpet_setup_msi_irq(irq)) 491 if (hpet_setup_msi_irq(irq))
516 return -EINVAL; 492 return -EINVAL;
@@ -556,7 +532,6 @@ static int hpet_setup_irq(struct hpet_dev *dev)
556static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu) 532static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
557{ 533{
558 struct clock_event_device *evt = &hdev->evt; 534 struct clock_event_device *evt = &hdev->evt;
559 uint64_t hpet_freq;
560 535
561 WARN_ON(cpu != smp_processor_id()); 536 WARN_ON(cpu != smp_processor_id());
562 if (!(hdev->flags & HPET_DEV_VALID)) 537 if (!(hdev->flags & HPET_DEV_VALID))
@@ -578,24 +553,10 @@ static void init_one_hpet_msi_clockevent(struct hpet_dev *hdev, int cpu)
578 553
579 evt->set_mode = hpet_msi_set_mode; 554 evt->set_mode = hpet_msi_set_mode;
580 evt->set_next_event = hpet_msi_next_event; 555 evt->set_next_event = hpet_msi_next_event;
581 evt->shift = 32;
582
583 /*
584 * The period is a femto seconds value. We need to calculate the
585 * scaled math multiplication factor for nanosecond to hpet tick
586 * conversion.
587 */
588 hpet_freq = FSEC_PER_SEC;
589 do_div(hpet_freq, hpet_period);
590 evt->mult = div_sc((unsigned long) hpet_freq,
591 NSEC_PER_SEC, evt->shift);
592 /* Calculate the max delta */
593 evt->max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, evt);
594 /* 5 usec minimum reprogramming delta. */
595 evt->min_delta_ns = 5000;
596
597 evt->cpumask = cpumask_of(hdev->cpu); 556 evt->cpumask = cpumask_of(hdev->cpu);
598 clockevents_register_device(evt); 557
558 clockevents_config_and_register(evt, hpet_freq, HPET_MIN_PROG_DELTA,
559 0x7FFFFFFF);
599} 560}
600 561
601#ifdef CONFIG_HPET 562#ifdef CONFIG_HPET
@@ -726,7 +687,7 @@ static int hpet_cpuhp_notify(struct notifier_block *n,
726 687
727 switch (action & 0xf) { 688 switch (action & 0xf) {
728 case CPU_ONLINE: 689 case CPU_ONLINE:
729 INIT_DELAYED_WORK_ON_STACK(&work.work, hpet_work); 690 INIT_DELAYED_WORK_ONSTACK(&work.work, hpet_work);
730 init_completion(&work.complete); 691 init_completion(&work.complete);
731 /* FIXME: add schedule_work_on() */ 692 /* FIXME: add schedule_work_on() */
732 schedule_delayed_work_on(cpu, &work.work, 0); 693 schedule_delayed_work_on(cpu, &work.work, 0);
@@ -799,7 +760,6 @@ static struct clocksource clocksource_hpet = {
799static int hpet_clocksource_register(void) 760static int hpet_clocksource_register(void)
800{ 761{
801 u64 start, now; 762 u64 start, now;
802 u64 hpet_freq;
803 cycle_t t1; 763 cycle_t t1;
804 764
805 /* Start the counter */ 765 /* Start the counter */
@@ -826,24 +786,7 @@ static int hpet_clocksource_register(void)
826 return -ENODEV; 786 return -ENODEV;
827 } 787 }
828 788
829 /*
830 * The definition of mult is (include/linux/clocksource.h)
831 * mult/2^shift = ns/cyc and hpet_period is in units of fsec/cyc
832 * so we first need to convert hpet_period to ns/cyc units:
833 * mult/2^shift = ns/cyc = hpet_period/10^6
834 * mult = (hpet_period * 2^shift)/10^6
835 * mult = (hpet_period << shift)/FSEC_PER_NSEC
836 */
837
838 /* Need to convert hpet_period (fsec/cyc) to cyc/sec:
839 *
840 * cyc/sec = FSEC_PER_SEC/hpet_period(fsec/cyc)
841 * cyc/sec = (FSEC_PER_NSEC * NSEC_PER_SEC)/hpet_period
842 */
843 hpet_freq = FSEC_PER_SEC;
844 do_div(hpet_freq, hpet_period);
845 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq); 789 clocksource_register_hz(&clocksource_hpet, (u32)hpet_freq);
846
847 return 0; 790 return 0;
848} 791}
849 792
@@ -852,7 +795,9 @@ static int hpet_clocksource_register(void)
852 */ 795 */
853int __init hpet_enable(void) 796int __init hpet_enable(void)
854{ 797{
798 unsigned long hpet_period;
855 unsigned int id; 799 unsigned int id;
800 u64 freq;
856 int i; 801 int i;
857 802
858 if (!is_hpet_capable()) 803 if (!is_hpet_capable())
@@ -891,6 +836,14 @@ int __init hpet_enable(void)
891 goto out_nohpet; 836 goto out_nohpet;
892 837
893 /* 838 /*
839 * The period is a femto seconds value. Convert it to a
840 * frequency.
841 */
842 freq = FSEC_PER_SEC;
843 do_div(freq, hpet_period);
844 hpet_freq = freq;
845
846 /*
894 * Read the HPET ID register to retrieve the IRQ routing 847 * Read the HPET ID register to retrieve the IRQ routing
895 * information and the number of channels 848 * information and the number of channels
896 */ 849 */
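hpet.c now keeps a frequency (hpet_freq) instead of the raw femtosecond period and hands mult/shift and the delta limits to clockevents_config_and_register(), with the minimum programmable delta raised to HPET_MIN_PROG_DELTA (1.5 * HPET_MIN_CYCLES ticks); hpet_next_event() returns -ETIME whenever the counter is already within HPET_MIN_CYCLES of, or past, the written comparator value. A worked example of the period-to-frequency conversion for the common 14.31818 MHz HPET; the period value is illustrative, not taken from this patch:

#include <stdio.h>

#define FSEC_PER_SEC	1000000000000000ULL
#define HPET_MIN_CYCLES	128ULL

int main(void)
{
	unsigned long long hpet_period = 69841279ULL;	/* fs per tick, example value */
	unsigned long long hpet_freq   = FSEC_PER_SEC / hpet_period;			/* ~14318180 Hz */
	unsigned long long min_delta   = HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1);	/* 192 ticks */

	printf("freq ~= %llu Hz, min delta = %llu ticks (~%llu ns)\n",
	       hpet_freq, min_delta, min_delta * 1000000000ULL / hpet_freq);
	return 0;
}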
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index ff15c9dcc25d..02f07634d265 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -122,7 +122,7 @@ int arch_install_hw_breakpoint(struct perf_event *bp)
122 return -EBUSY; 122 return -EBUSY;
123 123
124 set_debugreg(info->address, i); 124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address; 125 __this_cpu_write(cpu_debugreg[i], info->address);
126 126
127 dr7 = &__get_cpu_var(cpu_dr7); 127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type); 128 *dr7 |= encode_dr7(i, info->len, info->type);
@@ -397,12 +397,12 @@ void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
397 397
398void hw_breakpoint_restore(void) 398void hw_breakpoint_restore(void)
399{ 399{
400 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0); 400 set_debugreg(__this_cpu_read(cpu_debugreg[0]), 0);
401 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1); 401 set_debugreg(__this_cpu_read(cpu_debugreg[1]), 1);
402 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2); 402 set_debugreg(__this_cpu_read(cpu_debugreg[2]), 2);
403 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3); 403 set_debugreg(__this_cpu_read(cpu_debugreg[3]), 3);
404 set_debugreg(current->thread.debugreg6, 6); 404 set_debugreg(current->thread.debugreg6, 6);
405 set_debugreg(__get_cpu_var(cpu_dr7), 7); 405 set_debugreg(__this_cpu_read(cpu_dr7), 7);
406} 406}
407EXPORT_SYMBOL_GPL(hw_breakpoint_restore); 407EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
408 408
@@ -433,6 +433,10 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args)
433 dr6_p = (unsigned long *)ERR_PTR(args->err); 433 dr6_p = (unsigned long *)ERR_PTR(args->err);
434 dr6 = *dr6_p; 434 dr6 = *dr6_p;
435 435
436 /* If it's a single step, TRAP bits are random */
437 if (dr6 & DR_STEP)
438 return NOTIFY_DONE;
439
436 /* Do an early return if no trap bits are set in DR6 */ 440 /* Do an early return if no trap bits are set in DR6 */
437 if ((dr6 & DR_TRAP_BITS) == 0) 441 if ((dr6 & DR_TRAP_BITS) == 0)
438 return NOTIFY_DONE; 442 return NOTIFY_DONE;
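
The hw_breakpoint changes above swap address-based __get_cpu_var() accesses for __this_cpu_read()/__this_cpu_write(), which the compiler can turn into a single segment-relative instruction on x86. A small illustration with a hypothetical per-cpu variable:

#include <linux/percpu.h>

static DEFINE_PER_CPU(unsigned long, demo_counter);	/* hypothetical */

static void demo_old_style(unsigned long val)
{
        __get_cpu_var(demo_counter) = val;	/* computes the per-cpu address first */
}

static void demo_new_style(unsigned long val)
{
        __this_cpu_write(demo_counter, val);	/* single per-cpu store */
}
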
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index a46cb3522c0c..12aff2537682 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -68,19 +68,22 @@ static void __cpuinit init_thread_xstate(void)
68 */ 68 */
69 69
70 if (!HAVE_HWFP) { 70 if (!HAVE_HWFP) {
71 /*
72 * Disable xsave as we do not support it if i387
73 * emulation is enabled.
74 */
75 setup_clear_cpu_cap(X86_FEATURE_XSAVE);
76 setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
71 xstate_size = sizeof(struct i387_soft_struct); 77 xstate_size = sizeof(struct i387_soft_struct);
72 return; 78 return;
73 } 79 }
74 80
75 if (cpu_has_fxsr) 81 if (cpu_has_fxsr)
76 xstate_size = sizeof(struct i387_fxsave_struct); 82 xstate_size = sizeof(struct i387_fxsave_struct);
77#ifdef CONFIG_X86_32
78 else 83 else
79 xstate_size = sizeof(struct i387_fsave_struct); 84 xstate_size = sizeof(struct i387_fsave_struct);
80#endif
81} 85}
82 86
83#ifdef CONFIG_X86_64
84/* 87/*
85 * Called at bootup to set up the initial FPU state that is later cloned 88 * Called at bootup to set up the initial FPU state that is later cloned
86 * into all processes. 89 * into all processes.
@@ -88,12 +91,21 @@ static void __cpuinit init_thread_xstate(void)
88 91
89void __cpuinit fpu_init(void) 92void __cpuinit fpu_init(void)
90{ 93{
91 unsigned long oldcr0 = read_cr0(); 94 unsigned long cr0;
92 95 unsigned long cr4_mask = 0;
93 set_in_cr4(X86_CR4_OSFXSR);
94 set_in_cr4(X86_CR4_OSXMMEXCPT);
95 96
96 write_cr0(oldcr0 & ~(X86_CR0_TS|X86_CR0_EM)); /* clear TS and EM */ 97 if (cpu_has_fxsr)
98 cr4_mask |= X86_CR4_OSFXSR;
99 if (cpu_has_xmm)
100 cr4_mask |= X86_CR4_OSXMMEXCPT;
101 if (cr4_mask)
102 set_in_cr4(cr4_mask);
103
104 cr0 = read_cr0();
105 cr0 &= ~(X86_CR0_TS|X86_CR0_EM); /* clear TS and EM */
106 if (!HAVE_HWFP)
107 cr0 |= X86_CR0_EM;
108 write_cr0(cr0);
97 109
98 if (!smp_processor_id()) 110 if (!smp_processor_id())
99 init_thread_xstate(); 111 init_thread_xstate();
@@ -104,24 +116,12 @@ void __cpuinit fpu_init(void)
104 clear_used_math(); 116 clear_used_math();
105} 117}
106 118
107#else /* CONFIG_X86_64 */
108
109void __cpuinit fpu_init(void)
110{
111 if (!smp_processor_id())
112 init_thread_xstate();
113}
114
115#endif /* CONFIG_X86_32 */
116
117void fpu_finit(struct fpu *fpu) 119void fpu_finit(struct fpu *fpu)
118{ 120{
119#ifdef CONFIG_X86_32
120 if (!HAVE_HWFP) { 121 if (!HAVE_HWFP) {
121 finit_soft_fpu(&fpu->state->soft); 122 finit_soft_fpu(&fpu->state->soft);
122 return; 123 return;
123 } 124 }
124#endif
125 125
126 if (cpu_has_fxsr) { 126 if (cpu_has_fxsr) {
127 struct i387_fxsave_struct *fx = &fpu->state->fxsave; 127 struct i387_fxsave_struct *fx = &fpu->state->fxsave;
@@ -145,7 +145,7 @@ EXPORT_SYMBOL_GPL(fpu_finit);
145 * The _current_ task is using the FPU for the first time 145 * The _current_ task is using the FPU for the first time
146 * so initialize it and set the mxcsr to its default 146 * so initialize it and set the mxcsr to its default
147 * value at reset if we support XMM instructions and then 147 * value at reset if we support XMM instructions and then
148 * remeber the current task has used the FPU. 148 * remember the current task has used the FPU.
149 */ 149 */
150int init_fpu(struct task_struct *tsk) 150int init_fpu(struct task_struct *tsk)
151{ 151{
@@ -169,6 +169,7 @@ int init_fpu(struct task_struct *tsk)
169 set_stopped_child_used_math(tsk); 169 set_stopped_child_used_math(tsk);
170 return 0; 170 return 0;
171} 171}
172EXPORT_SYMBOL_GPL(init_fpu);
172 173
173/* 174/*
174 * The xstateregs_active() routine is the same as the fpregs_active() routine, 175 * The xstateregs_active() routine is the same as the fpregs_active() routine,
@@ -386,19 +387,17 @@ convert_from_fxsr(struct user_i387_ia32_struct *env, struct task_struct *tsk)
386#ifdef CONFIG_X86_64 387#ifdef CONFIG_X86_64
387 env->fip = fxsave->rip; 388 env->fip = fxsave->rip;
388 env->foo = fxsave->rdp; 389 env->foo = fxsave->rdp;
390 /*
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 env->fcs = task_pt_regs(tsk)->cs;
389 if (tsk == current) { 395 if (tsk == current) {
390 /* 396 savesegment(ds, env->fos);
391 * should be actually ds/cs at fpu exception time, but
392 * that information is not available in 64bit mode.
393 */
394 asm("mov %%ds, %[fos]" : [fos] "=r" (env->fos));
395 asm("mov %%cs, %[fcs]" : [fcs] "=r" (env->fcs));
396 } else { 397 } else {
397 struct pt_regs *regs = task_pt_regs(tsk); 398 env->fos = tsk->thread.ds;
398
399 env->fos = 0xffff0000 | tsk->thread.ds;
400 env->fcs = regs->cs;
401 } 399 }
400 env->fos |= 0xffff0000;
402#else 401#else
403 env->fip = fxsave->fip; 402 env->fip = fxsave->fip;
404 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16); 403 env->fcs = (u16) fxsave->fcs | ((u32) fxsave->fop << 16);
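
Among the i387 cleanups, the open-coded segment-register asm in convert_from_fxsr() is replaced by the savesegment() helper. A sketch of what that helper does, assuming the macro lives in asm/system.h as it did in kernels of this era:

#include <asm/system.h>		/* savesegment() on x86 (assumed location) */

static unsigned int demo_read_ds(void)
{
        unsigned int sel;

        savesegment(ds, sel);	/* roughly asm("mov %%ds,%0" : "=r" (sel)) */
        return sel;
}
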
diff --git a/arch/x86/kernel/i8237.c b/arch/x86/kernel/i8237.c
index b42ca694dc68..8eeaa81de066 100644
--- a/arch/x86/kernel/i8237.c
+++ b/arch/x86/kernel/i8237.c
@@ -10,7 +10,7 @@
10 */ 10 */
11 11
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/sysdev.h> 13#include <linux/syscore_ops.h>
14 14
15#include <asm/dma.h> 15#include <asm/dma.h>
16 16
@@ -21,7 +21,7 @@
21 * in asm/dma.h. 21 * in asm/dma.h.
22 */ 22 */
23 23
24static int i8237A_resume(struct sys_device *dev) 24static void i8237A_resume(void)
25{ 25{
26 unsigned long flags; 26 unsigned long flags;
27 int i; 27 int i;
@@ -41,31 +41,15 @@ static int i8237A_resume(struct sys_device *dev)
41 enable_dma(4); 41 enable_dma(4);
42 42
43 release_dma_lock(flags); 43 release_dma_lock(flags);
44
45 return 0;
46} 44}
47 45
48static int i8237A_suspend(struct sys_device *dev, pm_message_t state) 46static struct syscore_ops i8237_syscore_ops = {
49{
50 return 0;
51}
52
53static struct sysdev_class i8237_sysdev_class = {
54 .name = "i8237",
55 .suspend = i8237A_suspend,
56 .resume = i8237A_resume, 47 .resume = i8237A_resume,
57}; 48};
58 49
59static struct sys_device device_i8237A = { 50static int __init i8237A_init_ops(void)
60 .id = 0,
61 .cls = &i8237_sysdev_class,
62};
63
64static int __init i8237A_init_sysfs(void)
65{ 51{
66 int error = sysdev_class_register(&i8237_sysdev_class); 52 register_syscore_ops(&i8237_syscore_ops);
67 if (!error) 53 return 0;
68 error = sysdev_register(&device_i8237A);
69 return error;
70} 54}
71device_initcall(i8237A_init_sysfs); 55device_initcall(i8237A_init_ops);
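
The i8237 conversion above is the standard sysdev-to-syscore_ops migration: the callbacks lose their struct sys_device argument, resume and shutdown become void, and a single register_syscore_ops() call replaces the class plus device registration. A generic sketch with hypothetical names:

#include <linux/syscore_ops.h>
#include <linux/init.h>

static int demo_suspend(void)		/* still returns int */
{
        return 0;
}

static void demo_resume(void)		/* now void, no device argument */
{
}

static struct syscore_ops demo_syscore_ops = {
        .suspend = demo_suspend,
        .resume  = demo_resume,
};

static int __init demo_init_ops(void)
{
        register_syscore_ops(&demo_syscore_ops);
        return 0;
}
device_initcall(demo_init_ops);
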
diff --git a/arch/x86/kernel/i8253.c b/arch/x86/kernel/i8253.c
index 2dfd31597443..fb66dc9e36cb 100644
--- a/arch/x86/kernel/i8253.c
+++ b/arch/x86/kernel/i8253.c
@@ -93,7 +93,6 @@ static struct clock_event_device pit_ce = {
93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, 93 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
94 .set_mode = init_pit_timer, 94 .set_mode = init_pit_timer,
95 .set_next_event = pit_next_event, 95 .set_next_event = pit_next_event,
96 .shift = 32,
97 .irq = 0, 96 .irq = 0,
98}; 97};
99 98
@@ -108,90 +107,12 @@ void __init setup_pit_timer(void)
108 * IO_APIC has been initialized. 107 * IO_APIC has been initialized.
109 */ 108 */
110 pit_ce.cpumask = cpumask_of(smp_processor_id()); 109 pit_ce.cpumask = cpumask_of(smp_processor_id());
111 pit_ce.mult = div_sc(CLOCK_TICK_RATE, NSEC_PER_SEC, pit_ce.shift);
112 pit_ce.max_delta_ns = clockevent_delta2ns(0x7FFF, &pit_ce);
113 pit_ce.min_delta_ns = clockevent_delta2ns(0xF, &pit_ce);
114 110
115 clockevents_register_device(&pit_ce); 111 clockevents_config_and_register(&pit_ce, CLOCK_TICK_RATE, 0xF, 0x7FFF);
116 global_clock_event = &pit_ce; 112 global_clock_event = &pit_ce;
117} 113}
118 114
119#ifndef CONFIG_X86_64 115#ifndef CONFIG_X86_64
120/*
121 * Since the PIT overflows every tick, its not very useful
122 * to just read by itself. So use jiffies to emulate a free
123 * running counter:
124 */
125static cycle_t pit_read(struct clocksource *cs)
126{
127 static int old_count;
128 static u32 old_jifs;
129 unsigned long flags;
130 int count;
131 u32 jifs;
132
133 raw_spin_lock_irqsave(&i8253_lock, flags);
134 /*
135 * Although our caller may have the read side of xtime_lock,
136 * this is now a seqlock, and we are cheating in this routine
137 * by having side effects on state that we cannot undo if
138 * there is a collision on the seqlock and our caller has to
139 * retry. (Namely, old_jifs and old_count.) So we must treat
140 * jiffies as volatile despite the lock. We read jiffies
141 * before latching the timer count to guarantee that although
142 * the jiffies value might be older than the count (that is,
143 * the counter may underflow between the last point where
144 * jiffies was incremented and the point where we latch the
145 * count), it cannot be newer.
146 */
147 jifs = jiffies;
148 outb_pit(0x00, PIT_MODE); /* latch the count ASAP */
149 count = inb_pit(PIT_CH0); /* read the latched count */
150 count |= inb_pit(PIT_CH0) << 8;
151
152 /* VIA686a test code... reset the latch if count > max + 1 */
153 if (count > LATCH) {
154 outb_pit(0x34, PIT_MODE);
155 outb_pit(LATCH & 0xff, PIT_CH0);
156 outb_pit(LATCH >> 8, PIT_CH0);
157 count = LATCH - 1;
158 }
159
160 /*
161 * It's possible for count to appear to go the wrong way for a
162 * couple of reasons:
163 *
164 * 1. The timer counter underflows, but we haven't handled the
165 * resulting interrupt and incremented jiffies yet.
166 * 2. Hardware problem with the timer, not giving us continuous time,
167 * the counter does small "jumps" upwards on some Pentium systems,
168 * (see c't 95/10 page 335 for Neptun bug.)
169 *
170 * Previous attempts to handle these cases intelligently were
171 * buggy, so we just do the simple thing now.
172 */
173 if (count > old_count && jifs == old_jifs)
174 count = old_count;
175
176 old_count = count;
177 old_jifs = jifs;
178
179 raw_spin_unlock_irqrestore(&i8253_lock, flags);
180
181 count = (LATCH - 1) - count;
182
183 return (cycle_t)(jifs * LATCH) + count;
184}
185
186static struct clocksource pit_cs = {
187 .name = "pit",
188 .rating = 110,
189 .read = pit_read,
190 .mask = CLOCKSOURCE_MASK(32),
191 .mult = 0,
192 .shift = 20,
193};
194
195static int __init init_pit_clocksource(void) 116static int __init init_pit_clocksource(void)
196{ 117{
197 /* 118 /*
@@ -205,10 +126,7 @@ static int __init init_pit_clocksource(void)
205 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC) 126 pit_ce.mode != CLOCK_EVT_MODE_PERIODIC)
206 return 0; 127 return 0;
207 128
208 pit_cs.mult = clocksource_hz2mult(CLOCK_TICK_RATE, pit_cs.shift); 129 return clocksource_i8253_init();
209
210 return clocksource_register(&pit_cs);
211} 130}
212arch_initcall(init_pit_clocksource); 131arch_initcall(init_pit_clocksource);
213
214#endif /* !CONFIG_X86_64 */ 132#endif /* !CONFIG_X86_64 */
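
The PIT changes drop the hand-computed mult/shift and delta limits in favour of clockevents_config_and_register(), which derives them from the tick rate and the raw counter bounds. A stripped-down example with an invented device and a 1 MHz rate:

#include <linux/clockchips.h>

static int demo_set_next_event(unsigned long delta, struct clock_event_device *ce)
{
        /* a real driver programs the hardware comparator with 'delta' ticks */
        return 0;
}

static void demo_set_mode(enum clock_event_mode mode, struct clock_event_device *ce)
{
}

static struct clock_event_device demo_ce = {
        .name		= "demo",
        .features	= CLOCK_EVT_FEAT_ONESHOT,
        .set_next_event	= demo_set_next_event,
        .set_mode	= demo_set_mode,
};

static void demo_register(void)
{
        /* 1 MHz clock, deltas limited to 0xf..0x7fff counter ticks */
        clockevents_config_and_register(&demo_ce, 1000000, 0xf, 0x7fff);
}
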
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index cafa7c80ac95..65b8f5c2eebf 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -8,7 +8,7 @@
8#include <linux/random.h> 8#include <linux/random.h>
9#include <linux/init.h> 9#include <linux/init.h>
10#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
11#include <linux/sysdev.h> 11#include <linux/syscore_ops.h>
12#include <linux/bitops.h> 12#include <linux/bitops.h>
13#include <linux/acpi.h> 13#include <linux/acpi.h>
14#include <linux/io.h> 14#include <linux/io.h>
@@ -29,24 +29,10 @@
29 * plus some generic x86 specific things if generic specifics makes 29 * plus some generic x86 specific things if generic specifics makes
30 * any sense at all. 30 * any sense at all.
31 */ 31 */
32static void init_8259A(int auto_eoi);
32 33
33static int i8259A_auto_eoi; 34static int i8259A_auto_eoi;
34DEFINE_RAW_SPINLOCK(i8259A_lock); 35DEFINE_RAW_SPINLOCK(i8259A_lock);
35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
42
43struct irq_chip i8259A_chip = {
44 .name = "XT-PIC",
45 .mask = disable_8259A_irq,
46 .disable = disable_8259A_irq,
47 .unmask = enable_8259A_irq,
48 .mask_ack = mask_and_ack_8259A,
49};
50 36
51/* 37/*
52 * 8259A PIC functions to handle ISA devices: 38 * 8259A PIC functions to handle ISA devices:
@@ -68,7 +54,7 @@ unsigned int cached_irq_mask = 0xffff;
68 */ 54 */
69unsigned long io_apic_irqs; 55unsigned long io_apic_irqs;
70 56
71static void disable_8259A_irq(unsigned int irq) 57static void mask_8259A_irq(unsigned int irq)
72{ 58{
73 unsigned int mask = 1 << irq; 59 unsigned int mask = 1 << irq;
74 unsigned long flags; 60 unsigned long flags;
@@ -82,7 +68,12 @@ static void disable_8259A_irq(unsigned int irq)
82 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 68 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
83} 69}
84 70
85static void enable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(struct irq_data *data)
72{
73 mask_8259A_irq(data->irq);
74}
75
76static void unmask_8259A_irq(unsigned int irq)
86{ 77{
87 unsigned int mask = ~(1 << irq); 78 unsigned int mask = ~(1 << irq);
88 unsigned long flags; 79 unsigned long flags;
@@ -96,6 +87,11 @@ static void enable_8259A_irq(unsigned int irq)
96 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 87 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
97} 88}
98 89
90static void enable_8259A_irq(struct irq_data *data)
91{
92 unmask_8259A_irq(data->irq);
93}
94
99static int i8259A_irq_pending(unsigned int irq) 95static int i8259A_irq_pending(unsigned int irq)
100{ 96{
101 unsigned int mask = 1<<irq; 97 unsigned int mask = 1<<irq;
@@ -116,8 +112,8 @@ static void make_8259A_irq(unsigned int irq)
116{ 112{
117 disable_irq_nosync(irq); 113 disable_irq_nosync(irq);
118 io_apic_irqs &= ~(1<<irq); 114 io_apic_irqs &= ~(1<<irq);
119 set_irq_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq, 115 irq_set_chip_and_handler_name(irq, &i8259A_chip, handle_level_irq,
120 "XT"); 116 i8259A_chip.name);
121 enable_irq(irq); 117 enable_irq(irq);
122} 118}
123 119
@@ -150,8 +146,9 @@ static inline int i8259A_irq_real(unsigned int irq)
150 * first, _then_ send the EOI, and the order of EOI 146 * first, _then_ send the EOI, and the order of EOI
151 * to the two 8259s is important! 147 * to the two 8259s is important!
152 */ 148 */
153static void mask_and_ack_8259A(unsigned int irq) 149static void mask_and_ack_8259A(struct irq_data *data)
154{ 150{
151 unsigned int irq = data->irq;
155 unsigned int irqmask = 1 << irq; 152 unsigned int irqmask = 1 << irq;
156 unsigned long flags; 153 unsigned long flags;
157 154
@@ -223,6 +220,14 @@ spurious_8259A_irq:
223 } 220 }
224} 221}
225 222
223struct irq_chip i8259A_chip = {
224 .name = "XT-PIC",
225 .irq_mask = disable_8259A_irq,
226 .irq_disable = disable_8259A_irq,
227 .irq_unmask = enable_8259A_irq,
228 .irq_mask_ack = mask_and_ack_8259A,
229};
230
226static char irq_trigger[2]; 231static char irq_trigger[2];
227/** 232/**
228 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ 233 * ELCR registers (0x4d0, 0x4d1) control edge/level of IRQ
@@ -240,20 +245,19 @@ static void save_ELCR(char *trigger)
240 trigger[1] = inb(0x4d1) & 0xDE; 245 trigger[1] = inb(0x4d1) & 0xDE;
241} 246}
242 247
243static int i8259A_resume(struct sys_device *dev) 248static void i8259A_resume(void)
244{ 249{
245 init_8259A(i8259A_auto_eoi); 250 init_8259A(i8259A_auto_eoi);
246 restore_ELCR(irq_trigger); 251 restore_ELCR(irq_trigger);
247 return 0;
248} 252}
249 253
250static int i8259A_suspend(struct sys_device *dev, pm_message_t state) 254static int i8259A_suspend(void)
251{ 255{
252 save_ELCR(irq_trigger); 256 save_ELCR(irq_trigger);
253 return 0; 257 return 0;
254} 258}
255 259
256static int i8259A_shutdown(struct sys_device *dev) 260static void i8259A_shutdown(void)
257{ 261{
258 /* Put the i8259A into a quiescent state that 262 /* Put the i8259A into a quiescent state that
259 * the kernel initialization code can get it 263 * the kernel initialization code can get it
@@ -261,21 +265,14 @@ static int i8259A_shutdown(struct sys_device *dev)
261 */ 265 */
262 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 266 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
263 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */ 267 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-1 */
264 return 0;
265} 268}
266 269
267static struct sysdev_class i8259_sysdev_class = { 270static struct syscore_ops i8259_syscore_ops = {
268 .name = "i8259",
269 .suspend = i8259A_suspend, 271 .suspend = i8259A_suspend,
270 .resume = i8259A_resume, 272 .resume = i8259A_resume,
271 .shutdown = i8259A_shutdown, 273 .shutdown = i8259A_shutdown,
272}; 274};
273 275
274static struct sys_device device_i8259A = {
275 .id = 0,
276 .cls = &i8259_sysdev_class,
277};
278
279static void mask_8259A(void) 276static void mask_8259A(void)
280{ 277{
281 unsigned long flags; 278 unsigned long flags;
@@ -342,9 +339,9 @@ static void init_8259A(int auto_eoi)
342 * In AEOI mode we just have to mask the interrupt 339 * In AEOI mode we just have to mask the interrupt
343 * when acking. 340 * when acking.
344 */ 341 */
345 i8259A_chip.mask_ack = disable_8259A_irq; 342 i8259A_chip.irq_mask_ack = disable_8259A_irq;
346 else 343 else
347 i8259A_chip.mask_ack = mask_and_ack_8259A; 344 i8259A_chip.irq_mask_ack = mask_and_ack_8259A;
348 345
349 udelay(100); /* wait for 8259A to initialize */ 346 udelay(100); /* wait for 8259A to initialize */
350 347
@@ -363,14 +360,6 @@ static void init_8259A(int auto_eoi)
363static void legacy_pic_noop(void) { }; 360static void legacy_pic_noop(void) { };
364static void legacy_pic_uint_noop(unsigned int unused) { }; 361static void legacy_pic_uint_noop(unsigned int unused) { };
365static void legacy_pic_int_noop(int unused) { }; 362static void legacy_pic_int_noop(int unused) { };
366
367static struct irq_chip dummy_pic_chip = {
368 .name = "dummy pic",
369 .mask = legacy_pic_uint_noop,
370 .unmask = legacy_pic_uint_noop,
371 .disable = legacy_pic_uint_noop,
372 .mask_ack = legacy_pic_uint_noop,
373};
374static int legacy_pic_irq_pending_noop(unsigned int irq) 363static int legacy_pic_irq_pending_noop(unsigned int irq)
375{ 364{
376 return 0; 365 return 0;
@@ -378,7 +367,9 @@ static int legacy_pic_irq_pending_noop(unsigned int irq)
378 367
379struct legacy_pic null_legacy_pic = { 368struct legacy_pic null_legacy_pic = {
380 .nr_legacy_irqs = 0, 369 .nr_legacy_irqs = 0,
381 .chip = &dummy_pic_chip, 370 .chip = &dummy_irq_chip,
371 .mask = legacy_pic_uint_noop,
372 .unmask = legacy_pic_uint_noop,
382 .mask_all = legacy_pic_noop, 373 .mask_all = legacy_pic_noop,
383 .restore_mask = legacy_pic_noop, 374 .restore_mask = legacy_pic_noop,
384 .init = legacy_pic_int_noop, 375 .init = legacy_pic_int_noop,
@@ -389,7 +380,9 @@ struct legacy_pic null_legacy_pic = {
389struct legacy_pic default_legacy_pic = { 380struct legacy_pic default_legacy_pic = {
390 .nr_legacy_irqs = NR_IRQS_LEGACY, 381 .nr_legacy_irqs = NR_IRQS_LEGACY,
391 .chip = &i8259A_chip, 382 .chip = &i8259A_chip,
392 .mask_all = mask_8259A, 383 .mask = mask_8259A_irq,
384 .unmask = unmask_8259A_irq,
385 .mask_all = mask_8259A,
393 .restore_mask = unmask_8259A, 386 .restore_mask = unmask_8259A,
394 .init = init_8259A, 387 .init = init_8259A,
395 .irq_pending = i8259A_irq_pending, 388 .irq_pending = i8259A_irq_pending,
@@ -398,17 +391,12 @@ struct legacy_pic default_legacy_pic = {
398 391
399struct legacy_pic *legacy_pic = &default_legacy_pic; 392struct legacy_pic *legacy_pic = &default_legacy_pic;
400 393
401static int __init i8259A_init_sysfs(void) 394static int __init i8259A_init_ops(void)
402{ 395{
403 int error; 396 if (legacy_pic == &default_legacy_pic)
404 397 register_syscore_ops(&i8259_syscore_ops);
405 if (legacy_pic != &default_legacy_pic)
406 return 0;
407 398
408 error = sysdev_class_register(&i8259_sysdev_class); 399 return 0;
409 if (!error)
410 error = sysdev_register(&device_i8259A);
411 return error;
412} 400}
413 401
414device_initcall(i8259A_init_sysfs); 402device_initcall(i8259A_init_ops);
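
The 8259 rework above follows the genirq conversion pattern: chip callbacks now take a struct irq_data pointer instead of a bare IRQ number, so the old .mask/.unmask/.mask_ack hooks become .irq_mask/.irq_unmask/.irq_mask_ack. The shape of such a conversion, shown on a hypothetical controller:

#include <linux/irq.h>

static void demo_hw_mask(unsigned int irq)
{
        /* poke the controller's mask register for this line */
}

static void demo_hw_unmask(unsigned int irq)
{
        /* clear the mask bit again */
}

static void demo_irq_mask(struct irq_data *data)
{
        demo_hw_mask(data->irq);	/* the IRQ number now comes from irq_data */
}

static void demo_irq_unmask(struct irq_data *data)
{
        demo_hw_unmask(data->irq);
}

static struct irq_chip demo_chip = {
        .name		= "demo",
        .irq_mask	= demo_irq_mask,	/* was .mask(unsigned int irq) */
        .irq_unmask	= demo_irq_unmask,	/* was .unmask(unsigned int irq) */
};
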
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 8eec0ec59af2..8c968974253d 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -14,22 +14,9 @@
14#include <linux/slab.h> 14#include <linux/slab.h>
15#include <linux/thread_info.h> 15#include <linux/thread_info.h>
16#include <linux/syscalls.h> 16#include <linux/syscalls.h>
17#include <linux/bitmap.h>
17#include <asm/syscalls.h> 18#include <asm/syscalls.h>
18 19
19/* Set EXTENT bits starting at BASE in BITMAP to value TURN_ON. */
20static void set_bitmap(unsigned long *bitmap, unsigned int base,
21 unsigned int extent, int new_value)
22{
23 unsigned int i;
24
25 for (i = base; i < base + extent; i++) {
26 if (new_value)
27 __set_bit(i, bitmap);
28 else
29 __clear_bit(i, bitmap);
30 }
31}
32
33/* 20/*
34 * this changes the io permissions bitmap in the current task. 21 * this changes the io permissions bitmap in the current task.
35 */ 22 */
@@ -69,7 +56,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
69 */ 56 */
70 tss = &per_cpu(init_tss, get_cpu()); 57 tss = &per_cpu(init_tss, get_cpu());
71 58
72 set_bitmap(t->io_bitmap_ptr, from, num, !turn_on); 59 if (turn_on)
60 bitmap_clear(t->io_bitmap_ptr, from, num);
61 else
62 bitmap_set(t->io_bitmap_ptr, from, num);
73 63
74 /* 64 /*
75 * Search for a (possibly new) maximum. This is simple and stupid, 65 * Search for a (possibly new) maximum. This is simple and stupid,
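
The ioperm() change above replaces a hand-rolled loop with bitmap_set()/bitmap_clear() from linux/bitmap.h, which set or clear 'num' bits starting at 'from'. A tiny illustration on a private bitmap (in the I/O bitmap a set bit means the port is denied):

#include <linux/bitmap.h>

#define DEMO_PORTS	1024

static DECLARE_BITMAP(demo_io_bitmap, DEMO_PORTS);

static void demo_ioperm(unsigned int from, unsigned int num, int turn_on)
{
        if (turn_on)
                bitmap_clear(demo_io_bitmap, from, num);	/* 0 = access allowed */
        else
                bitmap_set(demo_io_bitmap, from, num);		/* 1 = access denied  */
}
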
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 91fd0c70a18a..6c0802eb2f7f 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -4,9 +4,11 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/interrupt.h> 5#include <linux/interrupt.h>
6#include <linux/kernel_stat.h> 6#include <linux/kernel_stat.h>
7#include <linux/of.h>
7#include <linux/seq_file.h> 8#include <linux/seq_file.h>
8#include <linux/smp.h> 9#include <linux/smp.h>
9#include <linux/ftrace.h> 10#include <linux/ftrace.h>
11#include <linux/delay.h>
10 12
11#include <asm/apic.h> 13#include <asm/apic.h>
12#include <asm/io_apic.h> 14#include <asm/io_apic.h>
@@ -43,9 +45,9 @@ void ack_bad_irq(unsigned int irq)
43 45
44#define irq_stats(x) (&per_cpu(irq_stat, x)) 46#define irq_stats(x) (&per_cpu(irq_stat, x))
45/* 47/*
46 * /proc/interrupts printing: 48 * /proc/interrupts printing for arch specific interrupts
47 */ 49 */
48static int show_other_interrupts(struct seq_file *p, int prec) 50int arch_show_interrupts(struct seq_file *p, int prec)
49{ 51{
50 int j; 52 int j;
51 53
@@ -67,10 +69,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
67 for_each_online_cpu(j) 69 for_each_online_cpu(j)
68 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); 70 seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs);
69 seq_printf(p, " Performance monitoring interrupts\n"); 71 seq_printf(p, " Performance monitoring interrupts\n");
70 seq_printf(p, "%*s: ", prec, "PND"); 72 seq_printf(p, "%*s: ", prec, "IWI");
71 for_each_online_cpu(j) 73 for_each_online_cpu(j)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 74 seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs);
73 seq_printf(p, " Performance pending work\n"); 75 seq_printf(p, " IRQ work interrupts\n");
74#endif 76#endif
75 if (x86_platform_ipi_callback) { 77 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 78 seq_printf(p, "%*s: ", prec, "PLT");
@@ -121,59 +123,6 @@ static int show_other_interrupts(struct seq_file *p, int prec)
121 return 0; 123 return 0;
122} 124}
123 125
124int show_interrupts(struct seq_file *p, void *v)
125{
126 unsigned long flags, any_count = 0;
127 int i = *(loff_t *) v, j, prec;
128 struct irqaction *action;
129 struct irq_desc *desc;
130
131 if (i > nr_irqs)
132 return 0;
133
134 for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec)
135 j *= 10;
136
137 if (i == nr_irqs)
138 return show_other_interrupts(p, prec);
139
140 /* print header */
141 if (i == 0) {
142 seq_printf(p, "%*s", prec + 8, "");
143 for_each_online_cpu(j)
144 seq_printf(p, "CPU%-8d", j);
145 seq_putc(p, '\n');
146 }
147
148 desc = irq_to_desc(i);
149 if (!desc)
150 return 0;
151
152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action;
156 if (!action && !any_count)
157 goto out;
158
159 seq_printf(p, "%*d: ", prec, i);
160 for_each_online_cpu(j)
161 seq_printf(p, "%10u ", kstat_irqs_cpu(i, j));
162 seq_printf(p, " %8s", desc->chip->name);
163 seq_printf(p, "-%-8s", desc->name);
164
165 if (action) {
166 seq_printf(p, " %s", action->name);
167 while ((action = action->next) != NULL)
168 seq_printf(p, ", %s", action->name);
169 }
170
171 seq_putc(p, '\n');
172out:
173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0;
175}
176
177/* 126/*
178 * /proc/stat helpers 127 * /proc/stat helpers
179 */ 128 */
@@ -185,7 +134,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
185 sum += irq_stats(cpu)->apic_timer_irqs; 134 sum += irq_stats(cpu)->apic_timer_irqs;
186 sum += irq_stats(cpu)->irq_spurious_count; 135 sum += irq_stats(cpu)->irq_spurious_count;
187 sum += irq_stats(cpu)->apic_perf_irqs; 136 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 137 sum += irq_stats(cpu)->apic_irq_work_irqs;
189#endif 138#endif
190 if (x86_platform_ipi_callback) 139 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->x86_platform_ipis; 140 sum += irq_stats(cpu)->x86_platform_ipis;
@@ -234,7 +183,7 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
234 exit_idle(); 183 exit_idle();
235 irq_enter(); 184 irq_enter();
236 185
237 irq = __get_cpu_var(vector_irq)[vector]; 186 irq = __this_cpu_read(vector_irq[vector]);
238 187
239 if (!handle_irq(irq, regs)) { 188 if (!handle_irq(irq, regs)) {
240 ack_APIC_irq(); 189 ack_APIC_irq();
@@ -282,6 +231,8 @@ void fixup_irqs(void)
282 unsigned int irq, vector; 231 unsigned int irq, vector;
283 static int warned; 232 static int warned;
284 struct irq_desc *desc; 233 struct irq_desc *desc;
234 struct irq_data *data;
235 struct irq_chip *chip;
285 236
286 for_each_irq_desc(irq, desc) { 237 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0; 238 int break_affinity = 0;
@@ -296,9 +247,10 @@ void fixup_irqs(void)
296 /* interrupt's are disabled at this point */ 247 /* interrupt's are disabled at this point */
297 raw_spin_lock(&desc->lock); 248 raw_spin_lock(&desc->lock);
298 249
299 affinity = desc->affinity; 250 data = irq_desc_get_irq_data(desc);
300 if (!irq_has_action(irq) || 251 affinity = data->affinity;
301 cpumask_equal(affinity, cpu_online_mask)) { 252 if (!irq_has_action(irq) || irqd_is_per_cpu(data) ||
253 cpumask_subset(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock); 254 raw_spin_unlock(&desc->lock);
303 continue; 255 continue;
304 } 256 }
@@ -315,16 +267,18 @@ void fixup_irqs(void)
315 affinity = cpu_all_mask; 267 affinity = cpu_all_mask;
316 } 268 }
317 269
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask) 270 chip = irq_data_get_irq_chip(data);
319 desc->chip->mask(irq); 271 if (!irqd_can_move_in_process_context(data) && chip->irq_mask)
272 chip->irq_mask(data);
320 273
321 if (desc->chip->set_affinity) 274 if (chip->irq_set_affinity)
322 desc->chip->set_affinity(irq, affinity); 275 chip->irq_set_affinity(data, affinity, true);
323 else if (!(warned++)) 276 else if (!(warned++))
324 set_affinity = 0; 277 set_affinity = 0;
325 278
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask) 279 if (!irqd_can_move_in_process_context(data) &&
327 desc->chip->unmask(irq); 280 !irqd_irq_disabled(data) && chip->irq_unmask)
281 chip->irq_unmask(data);
328 282
329 raw_spin_unlock(&desc->lock); 283 raw_spin_unlock(&desc->lock);
330 284
@@ -348,17 +302,19 @@ void fixup_irqs(void)
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { 302 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr; 303 unsigned int irr;
350 304
351 if (__get_cpu_var(vector_irq)[vector] < 0) 305 if (__this_cpu_read(vector_irq[vector]) < 0)
352 continue; 306 continue;
353 307
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10)); 308 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) { 309 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector]; 310 irq = __this_cpu_read(vector_irq[vector]);
357 311
358 desc = irq_to_desc(irq); 312 desc = irq_to_desc(irq);
313 data = irq_desc_get_irq_data(desc);
314 chip = irq_data_get_irq_chip(data);
359 raw_spin_lock(&desc->lock); 315 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger) 316 if (chip->irq_retrigger)
361 desc->chip->retrigger(irq); 317 chip->irq_retrigger(data);
362 raw_spin_unlock(&desc->lock); 318 raw_spin_unlock(&desc->lock);
363 } 319 }
364 } 320 }
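
fixup_irqs() now reaches the chip through the descriptor's irq_data, and every callback receives that irq_data. The bare bones of the walk used above, minus locking and error handling:

#include <linux/irq.h>

static void demo_retrigger(struct irq_desc *desc)
{
        struct irq_data *data = irq_desc_get_irq_data(desc);
        struct irq_chip *chip = irq_data_get_irq_chip(data);

        if (chip->irq_retrigger)
                chip->irq_retrigger(data);	/* callbacks take irq_data, not an IRQ number */
}
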
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 10709f29d166..72090705a656 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -17,6 +17,7 @@
17#include <linux/delay.h> 17#include <linux/delay.h>
18#include <linux/uaccess.h> 18#include <linux/uaccess.h>
19#include <linux/percpu.h> 19#include <linux/percpu.h>
20#include <linux/mm.h>
20 21
21#include <asm/apic.h> 22#include <asm/apic.h>
22 23
@@ -49,21 +50,17 @@ static inline int check_stack_overflow(void) { return 0; }
49static inline void print_stack_overflow(void) { } 50static inline void print_stack_overflow(void) { }
50#endif 51#endif
51 52
52#ifdef CONFIG_4KSTACKS
53/* 53/*
54 * per-CPU IRQ handling contexts (thread information and stack) 54 * per-CPU IRQ handling contexts (thread information and stack)
55 */ 55 */
56union irq_ctx { 56union irq_ctx {
57 struct thread_info tinfo; 57 struct thread_info tinfo;
58 u32 stack[THREAD_SIZE/sizeof(u32)]; 58 u32 stack[THREAD_SIZE/sizeof(u32)];
59} __attribute__((aligned(PAGE_SIZE))); 59} __attribute__((aligned(THREAD_SIZE)));
60 60
61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx); 61static DEFINE_PER_CPU(union irq_ctx *, hardirq_ctx);
62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx); 62static DEFINE_PER_CPU(union irq_ctx *, softirq_ctx);
63 63
64static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, hardirq_stack);
65static DEFINE_PER_CPU_PAGE_ALIGNED(union irq_ctx, softirq_stack);
66
67static void call_on_stack(void *func, void *stack) 64static void call_on_stack(void *func, void *stack)
68{ 65{
69 asm volatile("xchgl %%ebx,%%esp \n" 66 asm volatile("xchgl %%ebx,%%esp \n"
@@ -82,7 +79,7 @@ execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq)
82 u32 *isp, arg1, arg2; 79 u32 *isp, arg1, arg2;
83 80
84 curctx = (union irq_ctx *) current_thread_info(); 81 curctx = (union irq_ctx *) current_thread_info();
85 irqctx = __get_cpu_var(hardirq_ctx); 82 irqctx = __this_cpu_read(hardirq_ctx);
86 83
87 /* 84 /*
88 * this is where we switch to the IRQ stack. However, if we are 85 * this is where we switch to the IRQ stack. However, if we are
@@ -129,20 +126,21 @@ void __cpuinit irq_ctx_init(int cpu)
129 if (per_cpu(hardirq_ctx, cpu)) 126 if (per_cpu(hardirq_ctx, cpu))
130 return; 127 return;
131 128
132 irqctx = &per_cpu(hardirq_stack, cpu); 129 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
133 irqctx->tinfo.task = NULL; 130 THREAD_FLAGS,
134 irqctx->tinfo.exec_domain = NULL; 131 THREAD_ORDER));
132 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
135 irqctx->tinfo.cpu = cpu; 133 irqctx->tinfo.cpu = cpu;
136 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET; 134 irqctx->tinfo.preempt_count = HARDIRQ_OFFSET;
137 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 135 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
138 136
139 per_cpu(hardirq_ctx, cpu) = irqctx; 137 per_cpu(hardirq_ctx, cpu) = irqctx;
140 138
141 irqctx = &per_cpu(softirq_stack, cpu); 139 irqctx = page_address(alloc_pages_node(cpu_to_node(cpu),
142 irqctx->tinfo.task = NULL; 140 THREAD_FLAGS,
143 irqctx->tinfo.exec_domain = NULL; 141 THREAD_ORDER));
142 memset(&irqctx->tinfo, 0, sizeof(struct thread_info));
144 irqctx->tinfo.cpu = cpu; 143 irqctx->tinfo.cpu = cpu;
145 irqctx->tinfo.preempt_count = 0;
146 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0); 144 irqctx->tinfo.addr_limit = MAKE_MM_SEG(0);
147 145
148 per_cpu(softirq_ctx, cpu) = irqctx; 146 per_cpu(softirq_ctx, cpu) = irqctx;
@@ -151,11 +149,6 @@ void __cpuinit irq_ctx_init(int cpu)
151 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu)); 149 cpu, per_cpu(hardirq_ctx, cpu), per_cpu(softirq_ctx, cpu));
152} 150}
153 151
154void irq_ctx_exit(int cpu)
155{
156 per_cpu(hardirq_ctx, cpu) = NULL;
157}
158
159asmlinkage void do_softirq(void) 152asmlinkage void do_softirq(void)
160{ 153{
161 unsigned long flags; 154 unsigned long flags;
@@ -170,7 +163,7 @@ asmlinkage void do_softirq(void)
170 163
171 if (local_softirq_pending()) { 164 if (local_softirq_pending()) {
172 curctx = current_thread_info(); 165 curctx = current_thread_info();
173 irqctx = __get_cpu_var(softirq_ctx); 166 irqctx = __this_cpu_read(softirq_ctx);
174 irqctx->tinfo.task = curctx->task; 167 irqctx->tinfo.task = curctx->task;
175 irqctx->tinfo.previous_esp = current_stack_pointer; 168 irqctx->tinfo.previous_esp = current_stack_pointer;
176 169
@@ -179,7 +172,7 @@ asmlinkage void do_softirq(void)
179 172
180 call_on_stack(__do_softirq, isp); 173 call_on_stack(__do_softirq, isp);
181 /* 174 /*
182 * Shouldnt happen, we returned above if in_interrupt(): 175 * Shouldn't happen, we returned above if in_interrupt():
183 */ 176 */
184 WARN_ON_ONCE(softirq_count()); 177 WARN_ON_ONCE(softirq_count());
185 } 178 }
@@ -187,11 +180,6 @@ asmlinkage void do_softirq(void)
187 local_irq_restore(flags); 180 local_irq_restore(flags);
188} 181}
189 182
190#else
191static inline int
192execute_on_irq_stack(int overflow, struct irq_desc *desc, int irq) { return 0; }
193#endif
194
195bool handle_irq(unsigned irq, struct pt_regs *regs) 183bool handle_irq(unsigned irq, struct pt_regs *regs)
196{ 184{
197 struct irq_desc *desc; 185 struct irq_desc *desc;
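
The 32-bit IRQ stacks above move from static per-cpu arrays to pages allocated on the CPU's own node and aligned to THREAD_SIZE. A sketch of that allocation pattern; the GFP flags and the order parameter are illustrative stand-ins for the kernel's THREAD_FLAGS/THREAD_ORDER:

#include <linux/gfp.h>
#include <linux/mm.h>
#include <linux/topology.h>

static void *demo_alloc_irq_stack(int cpu, unsigned int order)
{
        struct page *page;

        page = alloc_pages_node(cpu_to_node(cpu),	/* allocate near the CPU */
                                GFP_KERNEL | __GFP_ZERO, order);
        if (!page)
                return NULL;

        return page_address(page);	/* lowmem virtual address of the stack */
}
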
diff --git a/arch/x86/kernel/irq_work.c b/arch/x86/kernel/irq_work.c
new file mode 100644
index 000000000000..ca8f703a1e70
--- /dev/null
+++ b/arch/x86/kernel/irq_work.c
@@ -0,0 +1,30 @@
1/*
2 * x86 specific code for irq_work
3 *
4 * Copyright (C) 2010 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
5 */
6
7#include <linux/kernel.h>
8#include <linux/irq_work.h>
9#include <linux/hardirq.h>
10#include <asm/apic.h>
11
12void smp_irq_work_interrupt(struct pt_regs *regs)
13{
14 irq_enter();
15 ack_APIC_irq();
16 inc_irq_stat(apic_irq_work_irqs);
17 irq_work_run();
18 irq_exit();
19}
20
21void arch_irq_work_raise(void)
22{
23#ifdef CONFIG_X86_LOCAL_APIC
24 if (!cpu_has_apic)
25 return;
26
27 apic->send_IPI_self(IRQ_WORK_VECTOR);
28 apic_wait_icr_idle();
29#endif
30}
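
The new irq_work.c wires IRQ_WORK_VECTOR into the generic irq_work machinery: a callback queued from NMI or other hard-to-leave context is raised as a self-IPI and later runs from smp_irq_work_interrupt(). A hypothetical client, assuming the init_irq_work()/irq_work_queue() helpers of this era's API:

#include <linux/irq_work.h>
#include <linux/kernel.h>

static void demo_work_func(struct irq_work *work)
{
        pr_info("deferred out of NMI context\n");	/* runs from IRQ_WORK_VECTOR */
}

static struct irq_work demo_work;

static void demo_init(void)
{
        init_irq_work(&demo_work, demo_work_func);
}

static void demo_called_from_nmi(void)
{
        irq_work_queue(&demo_work);	/* ends up in arch_irq_work_raise() above */
}
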
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 9772b1a0f9a4..48acf71c6534 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -25,6 +25,7 @@
25#include <asm/setup.h> 25#include <asm/setup.h>
26#include <asm/i8259.h> 26#include <asm/i8259.h>
27#include <asm/traps.h> 27#include <asm/traps.h>
28#include <asm/prom.h>
28 29
29/* 30/*
30 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts: 31 * ISA PIC or low IO-APIC triggered (INTA-cycle or APIC) interrupts:
@@ -71,6 +72,7 @@ static irqreturn_t math_error_irq(int cpl, void *dev_id)
71static struct irqaction fpu_irq = { 72static struct irqaction fpu_irq = {
72 .handler = math_error_irq, 73 .handler = math_error_irq,
73 .name = "fpu", 74 .name = "fpu",
75 .flags = IRQF_NO_THREAD,
74}; 76};
75#endif 77#endif
76 78
@@ -80,6 +82,7 @@ static struct irqaction fpu_irq = {
80static struct irqaction irq2 = { 82static struct irqaction irq2 = {
81 .handler = no_action, 83 .handler = no_action,
82 .name = "cascade", 84 .name = "cascade",
85 .flags = IRQF_NO_THREAD,
83}; 86};
84 87
85DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 88DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
@@ -100,6 +103,8 @@ int vector_used_by_percpu_irq(unsigned int vector)
100 103
101void __init init_ISA_irqs(void) 104void __init init_ISA_irqs(void)
102{ 105{
106 struct irq_chip *chip = legacy_pic->chip;
107 const char *name = chip->name;
103 int i; 108 int i;
104 109
105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 110#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
@@ -107,19 +112,8 @@ void __init init_ISA_irqs(void)
107#endif 112#endif
108 legacy_pic->init(0); 113 legacy_pic->init(0);
109 114
110 /* 115 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
111 * 16 old-style INTA-cycle interrupts: 116 irq_set_chip_and_handler_name(i, chip, handle_level_irq, name);
112 */
113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
114 struct irq_desc *desc = irq_to_desc(i);
115
116 desc->status = IRQ_DISABLED;
117 desc->action = NULL;
118 desc->depth = 1;
119
120 set_irq_chip_and_handler_name(i, &i8259A_chip,
121 handle_level_irq, "XT");
122 }
123} 117}
124 118
125void __init init_IRQ(void) 119void __init init_IRQ(void)
@@ -127,6 +121,12 @@ void __init init_IRQ(void)
127 int i; 121 int i;
128 122
129 /* 123 /*
124 * We probably need a better place for this, but it works for
125 * now ...
126 */
127 x86_add_irq_domains();
128
129 /*
130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15. 130 * On cpu 0, Assign IRQ0_VECTOR..IRQ15_VECTOR's to IRQ 0..15.
131 * If these IRQ's are handled by legacy interrupt-controllers like PIC, 131 * If these IRQ's are handled by legacy interrupt-controllers like PIC,
132 * then this configuration will likely be static after the boot. If 132 * then this configuration will likely be static after the boot. If
@@ -173,14 +173,77 @@ static void __init smp_intr_init(void)
173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt); 173 alloc_intr_gate(RESCHEDULE_VECTOR, reschedule_interrupt);
174 174
175 /* IPIs for invalidation */ 175 /* IPIs for invalidation */
176 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+0, invalidate_interrupt0); 176#define ALLOC_INVTLB_VEC(NR) \
177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+1, invalidate_interrupt1); 177 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+NR, \
178 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+2, invalidate_interrupt2); 178 invalidate_interrupt##NR)
179 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+3, invalidate_interrupt3); 179
180 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+4, invalidate_interrupt4); 180 switch (NUM_INVALIDATE_TLB_VECTORS) {
181 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+5, invalidate_interrupt5); 181 default:
182 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+6, invalidate_interrupt6); 182 ALLOC_INVTLB_VEC(31);
183 alloc_intr_gate(INVALIDATE_TLB_VECTOR_START+7, invalidate_interrupt7); 183 case 31:
184 ALLOC_INVTLB_VEC(30);
185 case 30:
186 ALLOC_INVTLB_VEC(29);
187 case 29:
188 ALLOC_INVTLB_VEC(28);
189 case 28:
190 ALLOC_INVTLB_VEC(27);
191 case 27:
192 ALLOC_INVTLB_VEC(26);
193 case 26:
194 ALLOC_INVTLB_VEC(25);
195 case 25:
196 ALLOC_INVTLB_VEC(24);
197 case 24:
198 ALLOC_INVTLB_VEC(23);
199 case 23:
200 ALLOC_INVTLB_VEC(22);
201 case 22:
202 ALLOC_INVTLB_VEC(21);
203 case 21:
204 ALLOC_INVTLB_VEC(20);
205 case 20:
206 ALLOC_INVTLB_VEC(19);
207 case 19:
208 ALLOC_INVTLB_VEC(18);
209 case 18:
210 ALLOC_INVTLB_VEC(17);
211 case 17:
212 ALLOC_INVTLB_VEC(16);
213 case 16:
214 ALLOC_INVTLB_VEC(15);
215 case 15:
216 ALLOC_INVTLB_VEC(14);
217 case 14:
218 ALLOC_INVTLB_VEC(13);
219 case 13:
220 ALLOC_INVTLB_VEC(12);
221 case 12:
222 ALLOC_INVTLB_VEC(11);
223 case 11:
224 ALLOC_INVTLB_VEC(10);
225 case 10:
226 ALLOC_INVTLB_VEC(9);
227 case 9:
228 ALLOC_INVTLB_VEC(8);
229 case 8:
230 ALLOC_INVTLB_VEC(7);
231 case 7:
232 ALLOC_INVTLB_VEC(6);
233 case 6:
234 ALLOC_INVTLB_VEC(5);
235 case 5:
236 ALLOC_INVTLB_VEC(4);
237 case 4:
238 ALLOC_INVTLB_VEC(3);
239 case 3:
240 ALLOC_INVTLB_VEC(2);
241 case 2:
242 ALLOC_INVTLB_VEC(1);
243 case 1:
244 ALLOC_INVTLB_VEC(0);
245 break;
246 }
184 247
185 /* IPI for generic function call */ 248 /* IPI for generic function call */
186 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt); 249 alloc_intr_gate(CALL_FUNCTION_VECTOR, call_function_interrupt);
@@ -227,9 +290,9 @@ static void __init apic_intr_init(void)
227 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 290 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
228 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); 291 alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt);
229 292
230 /* Performance monitoring interrupts: */ 293 /* IRQ work interrupts: */
231# ifdef CONFIG_PERF_EVENTS 294# ifdef CONFIG_IRQ_WORK
232 alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); 295 alloc_intr_gate(IRQ_WORK_VECTOR, irq_work_interrupt);
233# endif 296# endif
234 297
235#endif 298#endif
@@ -255,7 +318,7 @@ void __init native_init_IRQ(void)
255 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); 318 set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]);
256 } 319 }
257 320
258 if (!acpi_ioapic) 321 if (!acpi_ioapic && !of_ioapic)
259 setup_irq(2, &irq2); 322 setup_irq(2, &irq2);
260 323
261#ifdef CONFIG_X86_32 324#ifdef CONFIG_X86_32
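
The ALLOC_INVTLB_VEC switch above relies on deliberate case fall-through: entering at NUM_INVALIDATE_TLB_VECTORS allocates gates for every lower-numbered vector as well, and the default label covers counts beyond the unrolled table. A toy with the same control flow (names invented):

#include <linux/kernel.h>

#define DEMO_STEP(n)	pr_info("allocate gate %d\n", (n))

static void demo_alloc_gates(int count)
{
        switch (count) {
        default:		/* more than the unrolled table covers */
                DEMO_STEP(3);
                /* fall through */
        case 3:
                DEMO_STEP(2);
        case 2:
                DEMO_STEP(1);
        case 1:
                DEMO_STEP(0);
                break;
        }
}
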
diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c
new file mode 100644
index 000000000000..3fee346ef545
--- /dev/null
+++ b/arch/x86/kernel/jump_label.c
@@ -0,0 +1,51 @@
1/*
2 * jump label x86 support
3 *
4 * Copyright (C) 2009 Jason Baron <jbaron@redhat.com>
5 *
6 */
7#include <linux/jump_label.h>
8#include <linux/memory.h>
9#include <linux/uaccess.h>
10#include <linux/module.h>
11#include <linux/list.h>
12#include <linux/jhash.h>
13#include <linux/cpu.h>
14#include <asm/kprobes.h>
15#include <asm/alternative.h>
16
17#ifdef HAVE_JUMP_LABEL
18
19union jump_code_union {
20 char code[JUMP_LABEL_NOP_SIZE];
21 struct {
22 char jump;
23 int offset;
24 } __attribute__((packed));
25};
26
27void arch_jump_label_transform(struct jump_entry *entry,
28 enum jump_label_type type)
29{
30 union jump_code_union code;
31
32 if (type == JUMP_LABEL_ENABLE) {
33 code.jump = 0xe9;
34 code.offset = entry->target -
35 (entry->code + JUMP_LABEL_NOP_SIZE);
36 } else
37 memcpy(&code, ideal_nops[NOP_ATOMIC5], JUMP_LABEL_NOP_SIZE);
38 get_online_cpus();
39 mutex_lock(&text_mutex);
40 text_poke_smp((void *)entry->code, &code, JUMP_LABEL_NOP_SIZE);
41 mutex_unlock(&text_mutex);
42 put_online_cpus();
43}
44
45void arch_jump_label_text_poke_early(jump_label_t addr)
46{
47 text_poke_early((void *)addr, ideal_nops[NOP_ATOMIC5],
48 JUMP_LABEL_NOP_SIZE);
49}
50
51#endif
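
arch_jump_label_transform() above rewrites a 5-byte site between an atomic NOP and a "jmp rel32". The encoding is an 0xe9 opcode followed by a displacement relative to the end of the instruction; a standalone sketch of that arithmetic, with invented names and addresses:

#include <linux/kernel.h>

union demo_jump_code {
        char code[5];			/* JUMP_LABEL_NOP_SIZE bytes */
        struct {
                char jump;		/* 0xe9 = jmp rel32 */
                int offset;		/* target - (site + 5) */
        } __attribute__((packed));
};

static void demo_encode_jump(unsigned long site, unsigned long target,
                             union demo_jump_code *out)
{
        out->jump   = 0xe9;
        out->offset = target - (site + sizeof(out->code));
}
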
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
deleted file mode 100644
index 0f7bc20cfcde..000000000000
--- a/arch/x86/kernel/k8.c
+++ /dev/null
@@ -1,137 +0,0 @@
1/*
2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */
5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h>
8#include <linux/errno.h>
9#include <linux/module.h>
10#include <linux/spinlock.h>
11#include <asm/k8.h>
12
13int num_k8_northbridges;
14EXPORT_SYMBOL(num_k8_northbridges);
15
16static u32 *flush_words;
17
18struct pci_device_id k8_nb_ids[] = {
19 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) },
20 { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) },
21 {}
22};
23EXPORT_SYMBOL(k8_nb_ids);
24
25struct pci_dev **k8_northbridges;
26EXPORT_SYMBOL(k8_northbridges);
27
28static struct pci_dev *next_k8_northbridge(struct pci_dev *dev)
29{
30 do {
31 dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev);
32 if (!dev)
33 break;
34 } while (!pci_match_id(&k8_nb_ids[0], dev));
35 return dev;
36}
37
38int cache_k8_northbridges(void)
39{
40 int i;
41 struct pci_dev *dev;
42
43 if (num_k8_northbridges)
44 return 0;
45
46 dev = NULL;
47 while ((dev = next_k8_northbridge(dev)) != NULL)
48 num_k8_northbridges++;
49
50 k8_northbridges = kmalloc((num_k8_northbridges + 1) * sizeof(void *),
51 GFP_KERNEL);
52 if (!k8_northbridges)
53 return -ENOMEM;
54
55 if (!num_k8_northbridges) {
56 k8_northbridges[0] = NULL;
57 return 0;
58 }
59
60 flush_words = kmalloc(num_k8_northbridges * sizeof(u32), GFP_KERNEL);
61 if (!flush_words) {
62 kfree(k8_northbridges);
63 return -ENOMEM;
64 }
65
66 dev = NULL;
67 i = 0;
68 while ((dev = next_k8_northbridge(dev)) != NULL) {
69 k8_northbridges[i] = dev;
70 pci_read_config_dword(dev, 0x9c, &flush_words[i++]);
71 }
72 k8_northbridges[i] = NULL;
73 return 0;
74}
75EXPORT_SYMBOL_GPL(cache_k8_northbridges);
76
77/* Ignores subdevice/subvendor but as far as I can figure out
78 they're useless anyways */
79int __init early_is_k8_nb(u32 device)
80{
81 struct pci_device_id *id;
82 u32 vendor = device & 0xffff;
83 device >>= 16;
84 for (id = k8_nb_ids; id->vendor; id++)
85 if (vendor == id->vendor && device == id->device)
86 return 1;
87 return 0;
88}
89
90void k8_flush_garts(void)
91{
92 int flushed, i;
93 unsigned long flags;
94 static DEFINE_SPINLOCK(gart_lock);
95
96 /* Avoid races between AGP and IOMMU. In theory it's not needed
97 but I'm not sure if the hardware won't lose flush requests
98 when another is pending. This whole thing is so expensive anyways
99 that it doesn't matter to serialize more. -AK */
100 spin_lock_irqsave(&gart_lock, flags);
101 flushed = 0;
102 for (i = 0; i < num_k8_northbridges; i++) {
103 pci_write_config_dword(k8_northbridges[i], 0x9c,
104 flush_words[i]|1);
105 flushed++;
106 }
107 for (i = 0; i < num_k8_northbridges; i++) {
108 u32 w;
109 /* Make sure the hardware actually executed the flush*/
110 for (;;) {
111 pci_read_config_dword(k8_northbridges[i],
112 0x9c, &w);
113 if (!(w & 1))
114 break;
115 cpu_relax();
116 }
117 }
118 spin_unlock_irqrestore(&gart_lock, flags);
119 if (!flushed)
120 printk("nothing to flush?\n");
121}
122EXPORT_SYMBOL_GPL(k8_flush_garts);
123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index 8afd9f321f10..90fcf62854bb 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -78,6 +78,7 @@ static int setup_data_open(struct inode *inode, struct file *file)
78static const struct file_operations fops_setup_data = { 78static const struct file_operations fops_setup_data = {
79 .read = setup_data_read, 79 .read = setup_data_read,
80 .open = setup_data_open, 80 .open = setup_data_open,
81 .llseek = default_llseek,
81}; 82};
82 83
83static int __init 84static int __init
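
The kdebugfs one-liner is part of the tree-wide effort to give every file_operations an explicit llseek method so the old implicit default can eventually go away. A hypothetical minimal example of the resulting pattern:

#include <linux/fs.h>
#include <linux/module.h>

static ssize_t demo_read(struct file *file, char __user *buf,
                         size_t count, loff_t *ppos)
{
        return 0;	/* EOF; a real driver copies data to 'buf' here */
}

static const struct file_operations demo_fops = {
        .owner	= THIS_MODULE,
        .read	= demo_read,
        .llseek	= default_llseek,	/* explicit, as in the hunk above */
};
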
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 852b81967a37..5f9ecff328b5 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -48,6 +48,7 @@
48#include <asm/apicdef.h> 48#include <asm/apicdef.h>
49#include <asm/system.h> 49#include <asm/system.h>
50#include <asm/apic.h> 50#include <asm/apic.h>
51#include <asm/nmi.h>
51 52
52struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = 53struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
53{ 54{
@@ -120,8 +121,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
120 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset, 121 memcpy(mem, (void *)regs + dbg_reg_def[regno].offset,
121 dbg_reg_def[regno].size); 122 dbg_reg_def[regno].size);
122 123
123 switch (regno) {
124#ifdef CONFIG_X86_32 124#ifdef CONFIG_X86_32
125 switch (regno) {
125 case GDB_SS: 126 case GDB_SS:
126 if (!user_mode_vm(regs)) 127 if (!user_mode_vm(regs))
127 *(unsigned long *)mem = __KERNEL_DS; 128 *(unsigned long *)mem = __KERNEL_DS;
@@ -134,8 +135,8 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
134 case GDB_FS: 135 case GDB_FS:
135 *(unsigned long *)mem = 0xFFFF; 136 *(unsigned long *)mem = 0xFFFF;
136 break; 137 break;
137#endif
138 } 138 }
139#endif
139 return dbg_reg_def[regno].name; 140 return dbg_reg_def[regno].name;
140} 141}
141 142
@@ -277,7 +278,7 @@ static int hw_break_release_slot(int breakno)
277 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu); 278 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
278 if (dbg_release_bp_slot(*pevent)) 279 if (dbg_release_bp_slot(*pevent))
279 /* 280 /*
280 * The debugger is responisble for handing the retry on 281 * The debugger is responsible for handing the retry on
281 * remove failure. 282 * remove failure.
282 */ 283 */
283 return -1; 284 return -1;
@@ -315,14 +316,18 @@ static void kgdb_remove_all_hw_break(void)
315 if (!breakinfo[i].enabled) 316 if (!breakinfo[i].enabled)
316 continue; 317 continue;
317 bp = *per_cpu_ptr(breakinfo[i].pev, cpu); 318 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
318 if (bp->attr.disabled == 1) 319 if (!bp->attr.disabled) {
320 arch_uninstall_hw_breakpoint(bp);
321 bp->attr.disabled = 1;
319 continue; 322 continue;
323 }
320 if (dbg_is_early) 324 if (dbg_is_early)
321 early_dr7 &= ~encode_dr7(i, breakinfo[i].len, 325 early_dr7 &= ~encode_dr7(i, breakinfo[i].len,
322 breakinfo[i].type); 326 breakinfo[i].type);
323 else 327 else if (hw_break_release_slot(i))
324 arch_uninstall_hw_breakpoint(bp); 328 printk(KERN_ERR "KGDB: hw bpt remove failed %lx\n",
325 bp->attr.disabled = 1; 329 breakinfo[i].addr);
330 breakinfo[i].enabled = 0;
326 } 331 }
327} 332}
328 333
@@ -387,7 +392,7 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
387 * disable hardware debugging while it is processing gdb packets or 392 * disable hardware debugging while it is processing gdb packets or
388 * handling exception. 393 * handling exception.
389 */ 394 */
390void kgdb_disable_hw_debug(struct pt_regs *regs) 395static void kgdb_disable_hw_debug(struct pt_regs *regs)
391{ 396{
392 int i; 397 int i;
393 int cpu = raw_smp_processor_id(); 398 int cpu = raw_smp_processor_id();
@@ -477,8 +482,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
477 raw_smp_processor_id()); 482 raw_smp_processor_id());
478 } 483 }
479 484
480 kgdb_correct_hw_break();
481
482 return 0; 485 return 0;
483 } 486 }
484 487
@@ -523,10 +526,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
523 } 526 }
524 return NOTIFY_DONE; 527 return NOTIFY_DONE;
525 528
526 case DIE_NMI_IPI:
527 /* Just ignore, we will handle the roundup on DIE_NMI. */
528 return NOTIFY_DONE;
529
530 case DIE_NMIUNKNOWN: 529 case DIE_NMIUNKNOWN:
531 if (was_in_debug_nmi[raw_smp_processor_id()]) { 530 if (was_in_debug_nmi[raw_smp_processor_id()]) {
532 was_in_debug_nmi[raw_smp_processor_id()] = 0; 531 was_in_debug_nmi[raw_smp_processor_id()] = 0;
@@ -534,15 +533,6 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
534 } 533 }
535 return NOTIFY_DONE; 534 return NOTIFY_DONE;
536 535
537 case DIE_NMIWATCHDOG:
538 if (atomic_read(&kgdb_active) != -1) {
539 /* KGDB CPU roundup: */
540 kgdb_nmicallback(raw_smp_processor_id(), regs);
541 return NOTIFY_STOP;
542 }
543 /* Enter debugger: */
544 break;
545
546 case DIE_DEBUG: 536 case DIE_DEBUG:
547 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) { 537 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
548 if (user_mode(regs)) 538 if (user_mode(regs))
@@ -604,7 +594,7 @@ static struct notifier_block kgdb_notifier = {
604 /* 594 /*
605 * Lowest-prio notifier priority, we want to be notified last: 595 * Lowest-prio notifier priority, we want to be notified last:
606 */ 596 */
607 .priority = -INT_MAX, 597 .priority = NMI_LOCAL_LOW_PRIOR,
608}; 598};
609 599
610/** 600/**
@@ -621,7 +611,12 @@ int kgdb_arch_init(void)
621static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi, 611static void kgdb_hw_overflow_handler(struct perf_event *event, int nmi,
622 struct perf_sample_data *data, struct pt_regs *regs) 612 struct perf_sample_data *data, struct pt_regs *regs)
623{ 613{
624 kgdb_ll_trap(DIE_DEBUG, "debug", regs, 0, 0, SIGTRAP); 614 struct task_struct *tsk = current;
615 int i;
616
617 for (i = 0; i < 4; i++)
618 if (breakinfo[i].enabled)
619 tsk->thread.debugreg6 |= (DR_TRAP0 << i);
625} 620}
626 621
627void kgdb_arch_late(void) 622void kgdb_arch_late(void)
@@ -644,7 +639,7 @@ void kgdb_arch_late(void)
644 if (breakinfo[i].pev) 639 if (breakinfo[i].pev)
645 continue; 640 continue;
646 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL); 641 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
647 if (IS_ERR(breakinfo[i].pev)) { 642 if (IS_ERR((void * __force)breakinfo[i].pev)) {
648 printk(KERN_ERR "kgdb: Could not allocate hw" 643 printk(KERN_ERR "kgdb: Could not allocate hw"
649 "breakpoints\nDisabling the kernel debugger\n"); 644 "breakpoints\nDisabling the kernel debugger\n");
650 breakinfo[i].pev = NULL; 645 breakinfo[i].pev = NULL;
@@ -721,6 +716,7 @@ struct kgdb_arch arch_kgdb_ops = {
721 .flags = KGDB_HW_BREAKPOINT, 716 .flags = KGDB_HW_BREAKPOINT,
722 .set_hw_breakpoint = kgdb_set_hw_break, 717 .set_hw_breakpoint = kgdb_set_hw_break,
723 .remove_hw_breakpoint = kgdb_remove_hw_break, 718 .remove_hw_breakpoint = kgdb_remove_hw_break,
719 .disable_hw_break = kgdb_disable_hw_debug,
724 .remove_all_hw_break = kgdb_remove_all_hw_break, 720 .remove_all_hw_break = kgdb_remove_all_hw_break,
725 .correct_hw_break = kgdb_correct_hw_break, 721 .correct_hw_break = kgdb_correct_hw_break,
726}; 722};
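
The kgdb notifier above changes only its priority, from the generic -INT_MAX to the x86 NMI_LOCAL_LOW_PRIOR constant. For reference, the general shape of registering a die notifier with an explicit priority; the handler and values here are invented:

#include <linux/kdebug.h>
#include <linux/kernel.h>
#include <linux/notifier.h>

static int demo_die_notify(struct notifier_block *nb, unsigned long cmd, void *ptr)
{
        return NOTIFY_DONE;		/* let other handlers see the event too */
}

static struct notifier_block demo_die_nb = {
        .notifier_call	= demo_die_notify,
        .priority	= -INT_MAX,	/* lowest priority: notified last */
};

static int demo_register(void)
{
        return register_die_notifier(&demo_die_nb);
}
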
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 770ebfb349e9..f1a6244d7d93 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -230,9 +230,6 @@ static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
230 return 0; 230 return 0;
231} 231}
232 232
233/* Dummy buffers for kallsyms_lookup */
234static char __dummy_buf[KSYM_NAME_LEN];
235
236/* Check if paddr is at an instruction boundary */ 233/* Check if paddr is at an instruction boundary */
237static int __kprobes can_probe(unsigned long paddr) 234static int __kprobes can_probe(unsigned long paddr)
238{ 235{
@@ -241,7 +238,7 @@ static int __kprobes can_probe(unsigned long paddr)
241 struct insn insn; 238 struct insn insn;
242 kprobe_opcode_t buf[MAX_INSN_SIZE]; 239 kprobe_opcode_t buf[MAX_INSN_SIZE];
243 240
244 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf)) 241 if (!kallsyms_lookup_size_offset(paddr, NULL, &offset))
245 return 0; 242 return 0;
246 243
247 /* Decode instructions */ 244 /* Decode instructions */
@@ -406,7 +403,7 @@ static void __kprobes save_previous_kprobe(struct kprobe_ctlblk *kcb)
406 403
407static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb) 404static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
408{ 405{
409 __get_cpu_var(current_kprobe) = kcb->prev_kprobe.kp; 406 __this_cpu_write(current_kprobe, kcb->prev_kprobe.kp);
410 kcb->kprobe_status = kcb->prev_kprobe.status; 407 kcb->kprobe_status = kcb->prev_kprobe.status;
411 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags; 408 kcb->kprobe_old_flags = kcb->prev_kprobe.old_flags;
412 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags; 409 kcb->kprobe_saved_flags = kcb->prev_kprobe.saved_flags;
@@ -415,7 +412,7 @@ static void __kprobes restore_previous_kprobe(struct kprobe_ctlblk *kcb)
415static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs, 412static void __kprobes set_current_kprobe(struct kprobe *p, struct pt_regs *regs,
416 struct kprobe_ctlblk *kcb) 413 struct kprobe_ctlblk *kcb)
417{ 414{
418 __get_cpu_var(current_kprobe) = p; 415 __this_cpu_write(current_kprobe, p);
419 kcb->kprobe_saved_flags = kcb->kprobe_old_flags 416 kcb->kprobe_saved_flags = kcb->kprobe_old_flags
420 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF)); 417 = (regs->flags & (X86_EFLAGS_TF | X86_EFLAGS_IF));
421 if (is_IF_modifier(p->ainsn.insn)) 418 if (is_IF_modifier(p->ainsn.insn))
@@ -589,7 +586,7 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
589 preempt_enable_no_resched(); 586 preempt_enable_no_resched();
590 return 1; 587 return 1;
591 } else if (kprobe_running()) { 588 } else if (kprobe_running()) {
592 p = __get_cpu_var(current_kprobe); 589 p = __this_cpu_read(current_kprobe);
593 if (p->break_handler && p->break_handler(p, regs)) { 590 if (p->break_handler && p->break_handler(p, regs)) {
594 setup_singlestep(p, regs, kcb, 0); 591 setup_singlestep(p, regs, kcb, 0);
595 return 1; 592 return 1;
@@ -762,11 +759,11 @@ static __used __kprobes void *trampoline_handler(struct pt_regs *regs)
762 759
763 orig_ret_address = (unsigned long)ri->ret_addr; 760 orig_ret_address = (unsigned long)ri->ret_addr;
764 if (ri->rp && ri->rp->handler) { 761 if (ri->rp && ri->rp->handler) {
765 __get_cpu_var(current_kprobe) = &ri->rp->kp; 762 __this_cpu_write(current_kprobe, &ri->rp->kp);
766 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE; 763 get_kprobe_ctlblk()->kprobe_status = KPROBE_HIT_ACTIVE;
767 ri->ret_addr = correct_ret_addr; 764 ri->ret_addr = correct_ret_addr;
768 ri->rp->handler(ri, regs); 765 ri->rp->handler(ri, regs);
769 __get_cpu_var(current_kprobe) = NULL; 766 __this_cpu_write(current_kprobe, NULL);
770 } 767 }
771 768
772 recycle_rp_inst(ri, &empty_rp); 769 recycle_rp_inst(ri, &empty_rp);
@@ -1129,7 +1126,7 @@ static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1129 *(unsigned long *)addr = val; 1126 *(unsigned long *)addr = val;
1130} 1127}
1131 1128
1132void __kprobes kprobes_optinsn_template_holder(void) 1129static void __used __kprobes kprobes_optinsn_template_holder(void)
1133{ 1130{
1134 asm volatile ( 1131 asm volatile (
1135 ".global optprobe_template_entry\n" 1132 ".global optprobe_template_entry\n"
@@ -1186,8 +1183,13 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1186 struct pt_regs *regs) 1183 struct pt_regs *regs)
1187{ 1184{
1188 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); 1185 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1186 unsigned long flags;
1189 1187
1190	preempt_disable();			 1188	/* This can happen if op is queued for delayed unoptimization */
1189 if (kprobe_disabled(&op->kp))
1190 return;
1191
1192 local_irq_save(flags);
1191 if (kprobe_running()) { 1193 if (kprobe_running()) {
1192 kprobes_inc_nmissed_count(&op->kp); 1194 kprobes_inc_nmissed_count(&op->kp);
1193 } else { 1195 } else {
@@ -1201,12 +1203,12 @@ static void __kprobes optimized_callback(struct optimized_kprobe *op,
1201 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE; 1203 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1202 regs->orig_ax = ~0UL; 1204 regs->orig_ax = ~0UL;
1203 1205
1204 __get_cpu_var(current_kprobe) = &op->kp; 1206 __this_cpu_write(current_kprobe, &op->kp);
1205 kcb->kprobe_status = KPROBE_HIT_ACTIVE; 1207 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1206 opt_pre_handler(&op->kp, regs); 1208 opt_pre_handler(&op->kp, regs);
1207 __get_cpu_var(current_kprobe) = NULL; 1209 __this_cpu_write(current_kprobe, NULL);
1208 } 1210 }
1209 preempt_enable_no_resched(); 1211 local_irq_restore(flags);
1210} 1212}
1211 1213
1212static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src) 1214static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
@@ -1221,7 +1223,8 @@ static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1221 } 1223 }
1222 /* Check whether the address range is reserved */ 1224 /* Check whether the address range is reserved */
1223 if (ftrace_text_reserved(src, src + len - 1) || 1225 if (ftrace_text_reserved(src, src + len - 1) ||
1224 alternatives_text_reserved(src, src + len - 1)) 1226 alternatives_text_reserved(src, src + len - 1) ||
1227 jump_label_text_reserved(src, src + len - 1))
1225 return -EBUSY; 1228 return -EBUSY;
1226 1229
1227 return len; 1230 return len;
@@ -1269,11 +1272,17 @@ static int __kprobes can_optimize(unsigned long paddr)
1269 unsigned long addr, size = 0, offset = 0; 1272 unsigned long addr, size = 0, offset = 0;
1270 struct insn insn; 1273 struct insn insn;
1271 kprobe_opcode_t buf[MAX_INSN_SIZE]; 1274 kprobe_opcode_t buf[MAX_INSN_SIZE];
1272 /* Dummy buffers for lookup_symbol_attrs */
1273 static char __dummy_buf[KSYM_NAME_LEN];
1274 1275
1275 /* Lookup symbol including addr */ 1276 /* Lookup symbol including addr */
1276 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf)) 1277 if (!kallsyms_lookup_size_offset(paddr, &size, &offset))
1278 return 0;
1279
1280 /*
1281 * Do not optimize in the entry code due to the unstable
1282 * stack handling.
1283 */
1284 if ((paddr >= (unsigned long )__entry_text_start) &&
1285 (paddr < (unsigned long )__entry_text_end))
1277 return 0; 1286 return 0;
1278 1287
1279 /* Check there is enough space for a relative jump. */ 1288 /* Check there is enough space for a relative jump. */
@@ -1405,10 +1414,16 @@ int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1405 return 0; 1414 return 0;
1406} 1415}
1407 1416
1408/* Replace a breakpoint (int3) with a relative jump. */ 1417#define MAX_OPTIMIZE_PROBES 256
1409int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op) 1418static struct text_poke_param *jump_poke_params;
1419static struct jump_poke_buffer {
1420 u8 buf[RELATIVEJUMP_SIZE];
1421} *jump_poke_bufs;
1422
1423static void __kprobes setup_optimize_kprobe(struct text_poke_param *tprm,
1424 u8 *insn_buf,
1425 struct optimized_kprobe *op)
1410{ 1426{
1411 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1412 s32 rel = (s32)((long)op->optinsn.insn - 1427 s32 rel = (s32)((long)op->optinsn.insn -
1413 ((long)op->kp.addr + RELATIVEJUMP_SIZE)); 1428 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1414 1429
@@ -1416,16 +1431,79 @@ int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1416 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE, 1431 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1417 RELATIVE_ADDR_SIZE); 1432 RELATIVE_ADDR_SIZE);
1418 1433
1419 jmp_code[0] = RELATIVEJUMP_OPCODE; 1434 insn_buf[0] = RELATIVEJUMP_OPCODE;
1420 *(s32 *)(&jmp_code[1]) = rel; 1435 *(s32 *)(&insn_buf[1]) = rel;
1436
1437 tprm->addr = op->kp.addr;
1438 tprm->opcode = insn_buf;
1439 tprm->len = RELATIVEJUMP_SIZE;
1440}
1441
1442/*
1443 * Replace breakpoints (int3) with relative jumps.
1444 * Caller must hold kprobe_mutex and text_mutex.
1445 */
1446void __kprobes arch_optimize_kprobes(struct list_head *oplist)
1447{
1448 struct optimized_kprobe *op, *tmp;
1449 int c = 0;
1450
1451 list_for_each_entry_safe(op, tmp, oplist, list) {
1452 WARN_ON(kprobe_disabled(&op->kp));
1453 /* Setup param */
1454 setup_optimize_kprobe(&jump_poke_params[c],
1455 jump_poke_bufs[c].buf, op);
1456 list_del_init(&op->list);
1457 if (++c >= MAX_OPTIMIZE_PROBES)
1458 break;
1459 }
1421 1460
1422 /* 1461 /*
1423 * text_poke_smp doesn't support NMI/MCE code modifying. 1462 * text_poke_smp doesn't support NMI/MCE code modifying.
1424 * However, since kprobes itself also doesn't support NMI/MCE 1463 * However, since kprobes itself also doesn't support NMI/MCE
1425 * code probing, it's not a problem. 1464 * code probing, it's not a problem.
1426 */ 1465 */
1427 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE); 1466 text_poke_smp_batch(jump_poke_params, c);
1428 return 0; 1467}
1468
1469static void __kprobes setup_unoptimize_kprobe(struct text_poke_param *tprm,
1470 u8 *insn_buf,
1471 struct optimized_kprobe *op)
1472{
1473 /* Set int3 to first byte for kprobes */
1474 insn_buf[0] = BREAKPOINT_INSTRUCTION;
1475 memcpy(insn_buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1476
1477 tprm->addr = op->kp.addr;
1478 tprm->opcode = insn_buf;
1479 tprm->len = RELATIVEJUMP_SIZE;
1480}
1481
1482/*
1483 * Recover original instructions and breakpoints from relative jumps.
1484 * Caller must hold kprobe_mutex.
1485 */
1486extern void arch_unoptimize_kprobes(struct list_head *oplist,
1487 struct list_head *done_list)
1488{
1489 struct optimized_kprobe *op, *tmp;
1490 int c = 0;
1491
1492 list_for_each_entry_safe(op, tmp, oplist, list) {
1493 /* Setup param */
1494 setup_unoptimize_kprobe(&jump_poke_params[c],
1495 jump_poke_bufs[c].buf, op);
1496 list_move(&op->list, done_list);
1497 if (++c >= MAX_OPTIMIZE_PROBES)
1498 break;
1499 }
1500
1501 /*
1502 * text_poke_smp doesn't support NMI/MCE code modifying.
1503 * However, since kprobes itself also doesn't support NMI/MCE
1504 * code probing, it's not a problem.
1505 */
1506 text_poke_smp_batch(jump_poke_params, c);
1429} 1507}
1430 1508
1431/* Replace a relative jump with a breakpoint (int3). */ 1509/* Replace a relative jump with a breakpoint (int3). */
@@ -1457,11 +1535,35 @@ static int __kprobes setup_detour_execution(struct kprobe *p,
1457 } 1535 }
1458 return 0; 1536 return 0;
1459} 1537}
1538
1539static int __kprobes init_poke_params(void)
1540{
1541 /* Allocate code buffer and parameter array */
1542 jump_poke_bufs = kmalloc(sizeof(struct jump_poke_buffer) *
1543 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1544 if (!jump_poke_bufs)
1545 return -ENOMEM;
1546
1547 jump_poke_params = kmalloc(sizeof(struct text_poke_param) *
1548 MAX_OPTIMIZE_PROBES, GFP_KERNEL);
1549 if (!jump_poke_params) {
1550 kfree(jump_poke_bufs);
1551 jump_poke_bufs = NULL;
1552 return -ENOMEM;
1553 }
1554
1555 return 0;
1556}
1557#else /* !CONFIG_OPTPROBES */
1558static int __kprobes init_poke_params(void)
1559{
1560 return 0;
1561}
1460#endif 1562#endif
1461 1563
1462int __init arch_init_kprobes(void) 1564int __init arch_init_kprobes(void)
1463{ 1565{
1464 return 0; 1566 return init_poke_params();
1465} 1567}
1466 1568
1467int __kprobes arch_trampoline_kprobe(struct kprobe *p) 1569int __kprobes arch_trampoline_kprobe(struct kprobe *p)
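Note on the kprobes.c hunk above: arch_optimize_kprobes() stops patching one probe at a time. It fills an array of up to MAX_OPTIMIZE_PROBES text_poke_param entries via setup_optimize_kprobe() and hands the whole array to text_poke_smp_batch(), so every int3-to-jump rewrite shares a single stop_machine pass; arch_unoptimize_kprobes() reuses the same buffers for the reverse restore. A minimal user-space sketch of that gather-then-apply pattern, with hypothetical names and plain memcpy standing in for the kernel's cross-CPU text poking:

#include <stdio.h>
#include <string.h>

#define MAX_BATCH 256
#define JUMP_SIZE 5

/* One pending code patch: where to write and what to write. */
struct poke_param {
	unsigned char *addr;
	unsigned char opcode[JUMP_SIZE];
	size_t len;
};

static struct poke_param batch[MAX_BATCH];

/* Gather phase: record a patch instead of applying it immediately. */
static int queue_poke(int n, unsigned char *addr,
		      const unsigned char *opcode, size_t len)
{
	if (n >= MAX_BATCH)
		return n;
	batch[n].addr = addr;
	memcpy(batch[n].opcode, opcode, len);
	batch[n].len = len;
	return n + 1;
}

/* Apply phase: one pass over the whole batch (the kernel applies it once
 * via text_poke_smp_batch() instead of one stop_machine per probe). */
static void apply_batch(int n)
{
	for (int i = 0; i < n; i++)
		memcpy(batch[i].addr, batch[i].opcode, batch[i].len);
}

int main(void)
{
	unsigned char text[2][JUMP_SIZE] = { {0xcc}, {0xcc} };  /* fake int3 sites */
	const unsigned char jmp[JUMP_SIZE] = {0xe9, 1, 2, 3, 4}; /* fake rel jump */
	int n = 0;

	n = queue_poke(n, text[0], jmp, JUMP_SIZE);
	n = queue_poke(n, text[1], jmp, JUMP_SIZE);
	apply_batch(n);

	printf("patched %d sites, first byte now %#x\n", n, text[0][0]);
	return 0;
}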
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 63b0ec8d3d4a..33c07b0b122e 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -27,16 +27,37 @@
27#include <linux/mm.h> 27#include <linux/mm.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/hardirq.h> 29#include <linux/hardirq.h>
30#include <linux/notifier.h>
31#include <linux/reboot.h>
32#include <linux/hash.h>
33#include <linux/sched.h>
34#include <linux/slab.h>
35#include <linux/kprobes.h>
30#include <asm/timer.h> 36#include <asm/timer.h>
37#include <asm/cpu.h>
38#include <asm/traps.h>
39#include <asm/desc.h>
40#include <asm/tlbflush.h>
31 41
32#define MMU_QUEUE_SIZE 1024 42#define MMU_QUEUE_SIZE 1024
33 43
44static int kvmapf = 1;
45
46static int parse_no_kvmapf(char *arg)
47{
48 kvmapf = 0;
49 return 0;
50}
51
52early_param("no-kvmapf", parse_no_kvmapf);
53
34struct kvm_para_state { 54struct kvm_para_state {
35 u8 mmu_queue[MMU_QUEUE_SIZE]; 55 u8 mmu_queue[MMU_QUEUE_SIZE];
36 int mmu_queue_len; 56 int mmu_queue_len;
37}; 57};
38 58
39static DEFINE_PER_CPU(struct kvm_para_state, para_state); 59static DEFINE_PER_CPU(struct kvm_para_state, para_state);
60static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
40 61
41static struct kvm_para_state *kvm_para_state(void) 62static struct kvm_para_state *kvm_para_state(void)
42{ 63{
@@ -50,6 +71,195 @@ static void kvm_io_delay(void)
50{ 71{
51} 72}
52 73
74#define KVM_TASK_SLEEP_HASHBITS 8
75#define KVM_TASK_SLEEP_HASHSIZE (1<<KVM_TASK_SLEEP_HASHBITS)
76
77struct kvm_task_sleep_node {
78 struct hlist_node link;
79 wait_queue_head_t wq;
80 u32 token;
81 int cpu;
82 bool halted;
83 struct mm_struct *mm;
84};
85
86static struct kvm_task_sleep_head {
87 spinlock_t lock;
88 struct hlist_head list;
89} async_pf_sleepers[KVM_TASK_SLEEP_HASHSIZE];
90
91static struct kvm_task_sleep_node *_find_apf_task(struct kvm_task_sleep_head *b,
92 u32 token)
93{
94 struct hlist_node *p;
95
96 hlist_for_each(p, &b->list) {
97 struct kvm_task_sleep_node *n =
98 hlist_entry(p, typeof(*n), link);
99 if (n->token == token)
100 return n;
101 }
102
103 return NULL;
104}
105
106void kvm_async_pf_task_wait(u32 token)
107{
108 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
109 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
110 struct kvm_task_sleep_node n, *e;
111 DEFINE_WAIT(wait);
112 int cpu, idle;
113
114 cpu = get_cpu();
115 idle = idle_cpu(cpu);
116 put_cpu();
117
118 spin_lock(&b->lock);
119 e = _find_apf_task(b, token);
120 if (e) {
121		/* dummy entry exists -> wake-up was delivered ahead of PF */
122 hlist_del(&e->link);
123 kfree(e);
124 spin_unlock(&b->lock);
125 return;
126 }
127
128 n.token = token;
129 n.cpu = smp_processor_id();
130 n.mm = current->active_mm;
131 n.halted = idle || preempt_count() > 1;
132 atomic_inc(&n.mm->mm_count);
133 init_waitqueue_head(&n.wq);
134 hlist_add_head(&n.link, &b->list);
135 spin_unlock(&b->lock);
136
137 for (;;) {
138 if (!n.halted)
139 prepare_to_wait(&n.wq, &wait, TASK_UNINTERRUPTIBLE);
140 if (hlist_unhashed(&n.link))
141 break;
142
143 if (!n.halted) {
144 local_irq_enable();
145 schedule();
146 local_irq_disable();
147 } else {
148 /*
149 * We cannot reschedule. So halt.
150 */
151 native_safe_halt();
152 local_irq_disable();
153 }
154 }
155 if (!n.halted)
156 finish_wait(&n.wq, &wait);
157
158 return;
159}
160EXPORT_SYMBOL_GPL(kvm_async_pf_task_wait);
161
162static void apf_task_wake_one(struct kvm_task_sleep_node *n)
163{
164 hlist_del_init(&n->link);
165 if (!n->mm)
166 return;
167 mmdrop(n->mm);
168 if (n->halted)
169 smp_send_reschedule(n->cpu);
170 else if (waitqueue_active(&n->wq))
171 wake_up(&n->wq);
172}
173
174static void apf_task_wake_all(void)
175{
176 int i;
177
178 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++) {
179 struct hlist_node *p, *next;
180 struct kvm_task_sleep_head *b = &async_pf_sleepers[i];
181 spin_lock(&b->lock);
182 hlist_for_each_safe(p, next, &b->list) {
183 struct kvm_task_sleep_node *n =
184 hlist_entry(p, typeof(*n), link);
185 if (n->cpu == smp_processor_id())
186 apf_task_wake_one(n);
187 }
188 spin_unlock(&b->lock);
189 }
190}
191
192void kvm_async_pf_task_wake(u32 token)
193{
194 u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS);
195 struct kvm_task_sleep_head *b = &async_pf_sleepers[key];
196 struct kvm_task_sleep_node *n;
197
198 if (token == ~0) {
199 apf_task_wake_all();
200 return;
201 }
202
203again:
204 spin_lock(&b->lock);
205 n = _find_apf_task(b, token);
206 if (!n) {
207 /*
208 * async PF was not yet handled.
209 * Add dummy entry for the token.
210 */
211 n = kmalloc(sizeof(*n), GFP_ATOMIC);
212 if (!n) {
213 /*
214 * Allocation failed! Busy wait while other cpu
215 * handles async PF.
216 */
217 spin_unlock(&b->lock);
218 cpu_relax();
219 goto again;
220 }
221 n->token = token;
222 n->cpu = smp_processor_id();
223 n->mm = NULL;
224 init_waitqueue_head(&n->wq);
225 hlist_add_head(&n->link, &b->list);
226 } else
227 apf_task_wake_one(n);
228 spin_unlock(&b->lock);
229 return;
230}
231EXPORT_SYMBOL_GPL(kvm_async_pf_task_wake);
232
233u32 kvm_read_and_reset_pf_reason(void)
234{
235 u32 reason = 0;
236
237 if (__get_cpu_var(apf_reason).enabled) {
238 reason = __get_cpu_var(apf_reason).reason;
239 __get_cpu_var(apf_reason).reason = 0;
240 }
241
242 return reason;
243}
244EXPORT_SYMBOL_GPL(kvm_read_and_reset_pf_reason);
245
246dotraplinkage void __kprobes
247do_async_page_fault(struct pt_regs *regs, unsigned long error_code)
248{
249 switch (kvm_read_and_reset_pf_reason()) {
250 default:
251 do_page_fault(regs, error_code);
252 break;
253 case KVM_PV_REASON_PAGE_NOT_PRESENT:
254 /* page is swapped out by the host. */
255 kvm_async_pf_task_wait((u32)read_cr2());
256 break;
257 case KVM_PV_REASON_PAGE_READY:
258 kvm_async_pf_task_wake((u32)read_cr2());
259 break;
260 }
261}
262
53static void kvm_mmu_op(void *buffer, unsigned len) 263static void kvm_mmu_op(void *buffer, unsigned len)
54{ 264{
55 int r; 265 int r;
@@ -231,10 +441,117 @@ static void __init paravirt_ops_setup(void)
231#endif 441#endif
232} 442}
233 443
444void __cpuinit kvm_guest_cpu_init(void)
445{
446 if (!kvm_para_available())
447 return;
448
449 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF) && kvmapf) {
450 u64 pa = __pa(&__get_cpu_var(apf_reason));
451
452#ifdef CONFIG_PREEMPT
453 pa |= KVM_ASYNC_PF_SEND_ALWAYS;
454#endif
455 wrmsrl(MSR_KVM_ASYNC_PF_EN, pa | KVM_ASYNC_PF_ENABLED);
456 __get_cpu_var(apf_reason).enabled = 1;
457 printk(KERN_INFO"KVM setup async PF for cpu %d\n",
458 smp_processor_id());
459 }
460}
461
462static void kvm_pv_disable_apf(void *unused)
463{
464 if (!__get_cpu_var(apf_reason).enabled)
465 return;
466
467 wrmsrl(MSR_KVM_ASYNC_PF_EN, 0);
468 __get_cpu_var(apf_reason).enabled = 0;
469
470 printk(KERN_INFO"Unregister pv shared memory for cpu %d\n",
471 smp_processor_id());
472}
473
474static int kvm_pv_reboot_notify(struct notifier_block *nb,
475 unsigned long code, void *unused)
476{
477 if (code == SYS_RESTART)
478 on_each_cpu(kvm_pv_disable_apf, NULL, 1);
479 return NOTIFY_DONE;
480}
481
482static struct notifier_block kvm_pv_reboot_nb = {
483 .notifier_call = kvm_pv_reboot_notify,
484};
485
486#ifdef CONFIG_SMP
487static void __init kvm_smp_prepare_boot_cpu(void)
488{
489#ifdef CONFIG_KVM_CLOCK
490 WARN_ON(kvm_register_clock("primary cpu clock"));
491#endif
492 kvm_guest_cpu_init();
493 native_smp_prepare_boot_cpu();
494}
495
496static void __cpuinit kvm_guest_cpu_online(void *dummy)
497{
498 kvm_guest_cpu_init();
499}
500
501static void kvm_guest_cpu_offline(void *dummy)
502{
503 kvm_pv_disable_apf(NULL);
504 apf_task_wake_all();
505}
506
507static int __cpuinit kvm_cpu_notify(struct notifier_block *self,
508 unsigned long action, void *hcpu)
509{
510 int cpu = (unsigned long)hcpu;
511 switch (action) {
512 case CPU_ONLINE:
513 case CPU_DOWN_FAILED:
514 case CPU_ONLINE_FROZEN:
515 smp_call_function_single(cpu, kvm_guest_cpu_online, NULL, 0);
516 break;
517 case CPU_DOWN_PREPARE:
518 case CPU_DOWN_PREPARE_FROZEN:
519 smp_call_function_single(cpu, kvm_guest_cpu_offline, NULL, 1);
520 break;
521 default:
522 break;
523 }
524 return NOTIFY_OK;
525}
526
527static struct notifier_block __cpuinitdata kvm_cpu_notifier = {
528 .notifier_call = kvm_cpu_notify,
529};
530#endif
531
532static void __init kvm_apf_trap_init(void)
533{
534 set_intr_gate(14, &async_page_fault);
535}
536
234void __init kvm_guest_init(void) 537void __init kvm_guest_init(void)
235{ 538{
539 int i;
540
236 if (!kvm_para_available()) 541 if (!kvm_para_available())
237 return; 542 return;
238 543
239 paravirt_ops_setup(); 544 paravirt_ops_setup();
545 register_reboot_notifier(&kvm_pv_reboot_nb);
546 for (i = 0; i < KVM_TASK_SLEEP_HASHSIZE; i++)
547 spin_lock_init(&async_pf_sleepers[i].lock);
548 if (kvm_para_has_feature(KVM_FEATURE_ASYNC_PF))
549 x86_init.irqs.trap_init = kvm_apf_trap_init;
550
551#ifdef CONFIG_SMP
552 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
553 register_cpu_notifier(&kvm_cpu_notifier);
554#else
555 kvm_guest_cpu_init();
556#endif
240} 557}
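Note on the kvm.c hunk above: the async page-fault support pairs a faulting task with its later PAGE_READY notification through a token. Both sides hash the token into async_pf_sleepers[] (hash_32() over KVM_TASK_SLEEP_HASHBITS buckets), and whichever side arrives first leaves a node for the other to find: either a queued waiter to release, or a dummy entry meaning the wake-up already came. A compact user-space sketch of that rendezvous, single-threaded and with assumed names (the real code sleeps or halts the vCPU and takes per-bucket spinlocks):

#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

#define HASHBITS 8
#define HASHSIZE (1u << HASHBITS)

struct node {
	struct node *next;
	uint32_t token;
	int is_dummy;            /* wake arrived before the fault handler */
};

static struct node *buckets[HASHSIZE];

static unsigned bucket_of(uint32_t token)
{
	/* stand-in for hash_32(): multiply by a 32-bit golden-ratio constant */
	return (token * 0x61C88647u) >> (32 - HASHBITS);
}

static struct node *find(unsigned b, uint32_t token, struct node ***prevp)
{
	struct node **pp = &buckets[b], *n;

	for (n = *pp; n; pp = &n->next, n = *pp)
		if (n->token == token)
			break;
	if (prevp)
		*prevp = pp;
	return n;
}

/* Fault side: either consume a pending wake-up or queue a wait node. */
static void apf_wait(uint32_t token)
{
	unsigned b = bucket_of(token);
	struct node **pp, *n = find(b, token, &pp);

	if (n && n->is_dummy) {          /* wake-up was delivered first */
		*pp = n->next;
		free(n);
		printf("token %u: wake was already pending\n", token);
		return;
	}
	n = calloc(1, sizeof(*n));
	n->token = token;
	n->next = buckets[b];
	buckets[b] = n;
	printf("token %u: task queued, waiting\n", token);
}

/* PAGE_READY side: release the waiter or leave a dummy marker. */
static void apf_wake(uint32_t token)
{
	unsigned b = bucket_of(token);
	struct node **pp, *n = find(b, token, &pp);

	if (n) {                         /* waiter is queued: release it */
		*pp = n->next;
		free(n);
		printf("token %u: waiter released\n", token);
		return;
	}
	n = calloc(1, sizeof(*n));       /* fault not seen yet: leave marker */
	n->token = token;
	n->is_dummy = 1;
	n->next = buckets[b];
	buckets[b] = n;
	printf("token %u: wake recorded ahead of fault\n", token);
}

int main(void)
{
	apf_wait(42); apf_wake(42);      /* normal order */
	apf_wake(7);  apf_wait(7);       /* wake delivered first */
	return 0;
}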
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index eb9b76c716c2..6389a6bca11b 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -26,8 +26,6 @@
26#include <asm/x86_init.h> 26#include <asm/x86_init.h>
27#include <asm/reboot.h> 27#include <asm/reboot.h>
28 28
29#define KVM_SCALE 22
30
31static int kvmclock = 1; 29static int kvmclock = 1;
32static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME; 30static int msr_kvm_system_time = MSR_KVM_SYSTEM_TIME;
33static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK; 31static int msr_kvm_wall_clock = MSR_KVM_WALL_CLOCK;
@@ -120,21 +118,21 @@ static struct clocksource kvm_clock = {
120 .read = kvm_clock_get_cycles, 118 .read = kvm_clock_get_cycles,
121 .rating = 400, 119 .rating = 400,
122 .mask = CLOCKSOURCE_MASK(64), 120 .mask = CLOCKSOURCE_MASK(64),
123 .mult = 1 << KVM_SCALE,
124 .shift = KVM_SCALE,
125 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 121 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
126}; 122};
127 123
128static int kvm_register_clock(char *txt) 124int kvm_register_clock(char *txt)
129{ 125{
130 int cpu = smp_processor_id(); 126 int cpu = smp_processor_id();
131 int low, high; 127 int low, high, ret;
128
132 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1; 129 low = (int)__pa(&per_cpu(hv_clock, cpu)) | 1;
133 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32); 130 high = ((u64)__pa(&per_cpu(hv_clock, cpu)) >> 32);
131 ret = native_write_msr_safe(msr_kvm_system_time, low, high);
134 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n", 132 printk(KERN_INFO "kvm-clock: cpu %d, msr %x:%x, %s\n",
135 cpu, high, low, txt); 133 cpu, high, low, txt);
136 134
137 return native_write_msr_safe(msr_kvm_system_time, low, high); 135 return ret;
138} 136}
139 137
140#ifdef CONFIG_X86_LOCAL_APIC 138#ifdef CONFIG_X86_LOCAL_APIC
@@ -150,14 +148,6 @@ static void __cpuinit kvm_setup_secondary_clock(void)
150} 148}
151#endif 149#endif
152 150
153#ifdef CONFIG_SMP
154static void __init kvm_smp_prepare_boot_cpu(void)
155{
156 WARN_ON(kvm_register_clock("primary cpu clock"));
157 native_smp_prepare_boot_cpu();
158}
159#endif
160
161/* 151/*
162 * After the clock is registered, the host will keep writing to the 152 * After the clock is registered, the host will keep writing to the
163 * registered memory location. If the guest happens to shutdown, this memory 153 * registered memory location. If the guest happens to shutdown, this memory
@@ -204,15 +194,12 @@ void __init kvmclock_init(void)
204 x86_cpuinit.setup_percpu_clockev = 194 x86_cpuinit.setup_percpu_clockev =
205 kvm_setup_secondary_clock; 195 kvm_setup_secondary_clock;
206#endif 196#endif
207#ifdef CONFIG_SMP
208 smp_ops.smp_prepare_boot_cpu = kvm_smp_prepare_boot_cpu;
209#endif
210 machine_ops.shutdown = kvm_shutdown; 197 machine_ops.shutdown = kvm_shutdown;
211#ifdef CONFIG_KEXEC 198#ifdef CONFIG_KEXEC
212 machine_ops.crash_shutdown = kvm_crash_shutdown; 199 machine_ops.crash_shutdown = kvm_crash_shutdown;
213#endif 200#endif
214 kvm_get_preset_lpj(); 201 kvm_get_preset_lpj();
215 clocksource_register(&kvm_clock); 202 clocksource_register_hz(&kvm_clock, NSEC_PER_SEC);
216 pv_info.paravirt_enabled = 1; 203 pv_info.paravirt_enabled = 1;
217 pv_info.name = "KVM"; 204 pv_info.name = "KVM";
218 205
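Note on the kvmclock.c hunk above: dropping KVM_SCALE means the fixed mult/shift pair is no longer hard-coded into the clocksource; clocksource_register_hz(&kvm_clock, NSEC_PER_SEC) lets the core derive those factors from the stated frequency (the kvmclock read path already returns nanoseconds, hence NSEC_PER_SEC). The conversion the core ends up doing is ns = (cycles * mult) >> shift. A simplified illustration of choosing mult for a given shift; this is not the kernel's clocks_calc_mult_shift(), and the 1 GHz counter is an assumption for the demo:

#include <stdio.h>
#include <stdint.h>

#define NSEC_PER_SEC 1000000000ULL

/* mult/shift such that ns = (cycles * mult) >> shift for a clock at freq_hz */
static uint32_t calc_mult(uint64_t freq_hz, unsigned shift)
{
	return (uint32_t)((NSEC_PER_SEC << shift) / freq_hz);
}

int main(void)
{
	uint64_t freq_hz = 1000000000;   /* assume a 1 GHz counter */
	unsigned shift = 22;             /* same order as the removed KVM_SCALE */
	uint32_t mult = calc_mult(freq_hz, shift);
	uint64_t cycles = 123456789;

	printf("mult=%u shift=%u\n", mult, shift);
	printf("%llu cycles ~= %llu ns\n",
	       (unsigned long long)cycles,
	       (unsigned long long)((cycles * (uint64_t)mult) >> shift));
	return 0;
}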
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 035c8c529181..b3ea9db39db6 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -36,7 +36,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
36 if (!page) 36 if (!page)
37 goto out; 37 goto out;
38 pud = (pud_t *)page_address(page); 38 pud = (pud_t *)page_address(page);
39 memset(pud, 0, PAGE_SIZE); 39 clear_page(pud);
40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE)); 40 set_pgd(pgd, __pgd(__pa(pud) | _KERNPG_TABLE));
41 } 41 }
42 pud = pud_offset(pgd, addr); 42 pud = pud_offset(pgd, addr);
@@ -45,7 +45,7 @@ static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
45 if (!page) 45 if (!page)
46 goto out; 46 goto out;
47 pmd = (pmd_t *)page_address(page); 47 pmd = (pmd_t *)page_address(page);
48 memset(pmd, 0, PAGE_SIZE); 48 clear_page(pmd);
49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE)); 49 set_pud(pud, __pud(__pa(pmd) | _KERNPG_TABLE));
50 } 50 }
51 pmd = pmd_offset(pud, addr); 51 pmd = pmd_offset(pud, addr);
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 63eaf6596233..177183cbb6ae 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -259,7 +259,7 @@ static int __init mca_init(void)
259 /* 259 /*
260 * WARNING: Be careful when making changes here. Putting an adapter 260 * WARNING: Be careful when making changes here. Putting an adapter
261 * and the motherboard simultaneously into setup mode may result in 261 * and the motherboard simultaneously into setup mode may result in
262 * damage to chips (according to The Indispensible PC Hardware Book 262 * damage to chips (according to The Indispensable PC Hardware Book
263 * by Hans-Peter Messmer). Also, we disable system interrupts (so 263 * by Hans-Peter Messmer). Also, we disable system interrupts (so
264 * that we are not disturbed in the middle of this). 264 * that we are not disturbed in the middle of this).
265 */ 265 */
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index e1af7c055c7d..c5610384ab16 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -66,7 +66,6 @@ struct microcode_amd {
66 unsigned int mpb[0]; 66 unsigned int mpb[0];
67}; 67};
68 68
69#define UCODE_MAX_SIZE 2048
70#define UCODE_CONTAINER_SECTION_HDR 8 69#define UCODE_CONTAINER_SECTION_HDR 8
71#define UCODE_CONTAINER_HEADER_SIZE 12 70#define UCODE_CONTAINER_HEADER_SIZE 12
72 71
@@ -77,20 +76,20 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
77 struct cpuinfo_x86 *c = &cpu_data(cpu); 76 struct cpuinfo_x86 *c = &cpu_data(cpu);
78 u32 dummy; 77 u32 dummy;
79 78
80 memset(csig, 0, sizeof(*csig));
81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 79 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not " 80 pr_warning("CPU%d: family %d not supported\n", cpu, c->x86);
83 "supported\n", cpu, c->x86);
84 return -1; 81 return -1;
85 } 82 }
83
86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 84 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev); 85 pr_info("CPU%d: patch_level=0x%08x\n", cpu, csig->rev);
86
88 return 0; 87 return 0;
89} 88}
90 89
91static int get_matching_microcode(int cpu, void *mc, int rev) 90static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr,
91 int rev)
92{ 92{
93 struct microcode_header_amd *mc_header = mc;
94 unsigned int current_cpu_id; 93 unsigned int current_cpu_id;
95 u16 equiv_cpu_id = 0; 94 u16 equiv_cpu_id = 0;
96 unsigned int i = 0; 95 unsigned int i = 0;
@@ -109,17 +108,17 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
109 if (!equiv_cpu_id) 108 if (!equiv_cpu_id)
110 return 0; 109 return 0;
111 110
112 if (mc_header->processor_rev_id != equiv_cpu_id) 111 if (mc_hdr->processor_rev_id != equiv_cpu_id)
113 return 0; 112 return 0;
114 113
115 /* ucode might be chipset specific -- currently we don't support this */ 114 /* ucode might be chipset specific -- currently we don't support this */
116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 115 if (mc_hdr->nb_dev_id || mc_hdr->sb_dev_id) {
117 pr_err("CPU%d: loading of chipset specific code not yet supported\n", 116 pr_err("CPU%d: chipset specific code not yet supported\n",
118 cpu); 117 cpu);
119 return 0; 118 return 0;
120 } 119 }
121 120
122 if (mc_header->patch_id <= rev) 121 if (mc_hdr->patch_id <= rev)
123 return 0; 122 return 0;
124 123
125 return 1; 124 return 1;
@@ -144,85 +143,93 @@ static int apply_microcode_amd(int cpu)
144 143
145 /* check current patch id and patch's id for match */ 144 /* check current patch id and patch's id for match */
146 if (rev != mc_amd->hdr.patch_id) { 145 if (rev != mc_amd->hdr.patch_id) {
147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n", 146 pr_err("CPU%d: update failed for patch_level=0x%08x\n",
148 cpu, mc_amd->hdr.patch_id); 147 cpu, mc_amd->hdr.patch_id);
149 return -1; 148 return -1;
150 } 149 }
151 150
152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); 151 pr_info("CPU%d: new patch_level=0x%08x\n", cpu, rev);
153 uci->cpu_sig.rev = rev; 152 uci->cpu_sig.rev = rev;
154 153
155 return 0; 154 return 0;
156} 155}
157 156
158static int get_ucode_data(void *to, const u8 *from, size_t n) 157static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size)
159{ 158{
160 memcpy(to, from, n); 159 struct cpuinfo_x86 *c = &cpu_data(cpu);
161 return 0; 160 unsigned int max_size, actual_size;
161
162#define F1XH_MPB_MAX_SIZE 2048
163#define F14H_MPB_MAX_SIZE 1824
164#define F15H_MPB_MAX_SIZE 4096
165
166 switch (c->x86) {
167 case 0x14:
168 max_size = F14H_MPB_MAX_SIZE;
169 break;
170 case 0x15:
171 max_size = F15H_MPB_MAX_SIZE;
172 break;
173 default:
174 max_size = F1XH_MPB_MAX_SIZE;
175 break;
176 }
177
178 actual_size = buf[4] + (buf[5] << 8);
179
180 if (actual_size > size || actual_size > max_size) {
181 pr_err("section size mismatch\n");
182 return 0;
183 }
184
185 return actual_size;
162} 186}
163 187
164static void * 188static struct microcode_header_amd *
165get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size) 189get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size)
166{ 190{
167 unsigned int total_size; 191 struct microcode_header_amd *mc = NULL;
168 u8 section_hdr[UCODE_CONTAINER_SECTION_HDR]; 192 unsigned int actual_size = 0;
169 void *mc;
170 193
171 if (get_ucode_data(section_hdr, buf, UCODE_CONTAINER_SECTION_HDR)) 194 if (buf[0] != UCODE_UCODE_TYPE) {
172 return NULL; 195 pr_err("invalid type field in container file section header\n");
173 196 goto out;
174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
175 pr_err("error: invalid type field in container file section header\n");
176 return NULL;
177 } 197 }
178 198
179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 199 actual_size = verify_ucode_size(cpu, buf, size);
200 if (!actual_size)
201 goto out;
180 202
181 if (total_size > size || total_size > UCODE_MAX_SIZE) { 203 mc = vzalloc(actual_size);
182 pr_err("error: size mismatch\n"); 204 if (!mc)
183 return NULL; 205 goto out;
184 }
185 206
186 mc = vmalloc(UCODE_MAX_SIZE); 207 get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, actual_size);
187 if (mc) { 208 *mc_size = actual_size + UCODE_CONTAINER_SECTION_HDR;
188 memset(mc, 0, UCODE_MAX_SIZE); 209
189 if (get_ucode_data(mc, buf + UCODE_CONTAINER_SECTION_HDR, 210out:
190 total_size)) {
191 vfree(mc);
192 mc = NULL;
193 } else
194 *mc_size = total_size + UCODE_CONTAINER_SECTION_HDR;
195 }
196 return mc; 211 return mc;
197} 212}
198 213
199static int install_equiv_cpu_table(const u8 *buf) 214static int install_equiv_cpu_table(const u8 *buf)
200{ 215{
201 u8 *container_hdr[UCODE_CONTAINER_HEADER_SIZE]; 216 unsigned int *ibuf = (unsigned int *)buf;
202 unsigned int *buf_pos = (unsigned int *)container_hdr; 217 unsigned int type = ibuf[1];
203 unsigned long size; 218 unsigned int size = ibuf[2];
204 219
205 if (get_ucode_data(&container_hdr, buf, UCODE_CONTAINER_HEADER_SIZE)) 220 if (type != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
206 return 0; 221 pr_err("empty section/"
207 222 "invalid type field in container file section header\n");
208 size = buf_pos[2]; 223 return -EINVAL;
209
210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
211 pr_err("error: invalid type field in container file section header\n");
212 return 0;
213 } 224 }
214 225
215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 226 equiv_cpu_table = vmalloc(size);
216 if (!equiv_cpu_table) { 227 if (!equiv_cpu_table) {
217 pr_err("failed to allocate equivalent CPU table\n"); 228 pr_err("failed to allocate equivalent CPU table\n");
218 return 0; 229 return -ENOMEM;
219 } 230 }
220 231
221 buf += UCODE_CONTAINER_HEADER_SIZE; 232 get_ucode_data(equiv_cpu_table, buf + UCODE_CONTAINER_HEADER_SIZE, size);
222 if (get_ucode_data(equiv_cpu_table, buf, size)) {
223 vfree(equiv_cpu_table);
224 return 0;
225 }
226 233
227 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */ 234 return size + UCODE_CONTAINER_HEADER_SIZE; /* add header length */
228} 235}
@@ -237,16 +244,16 @@ static enum ucode_state
237generic_load_microcode(int cpu, const u8 *data, size_t size) 244generic_load_microcode(int cpu, const u8 *data, size_t size)
238{ 245{
239 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 246 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
247 struct microcode_header_amd *mc_hdr = NULL;
248 unsigned int mc_size, leftover;
249 int offset;
240 const u8 *ucode_ptr = data; 250 const u8 *ucode_ptr = data;
241 void *new_mc = NULL; 251 void *new_mc = NULL;
242 void *mc; 252 unsigned int new_rev = uci->cpu_sig.rev;
243 int new_rev = uci->cpu_sig.rev;
244 unsigned int leftover;
245 unsigned long offset;
246 enum ucode_state state = UCODE_OK; 253 enum ucode_state state = UCODE_OK;
247 254
248 offset = install_equiv_cpu_table(ucode_ptr); 255 offset = install_equiv_cpu_table(ucode_ptr);
249 if (!offset) { 256 if (offset < 0) {
250 pr_err("failed to create equivalent cpu table\n"); 257 pr_err("failed to create equivalent cpu table\n");
251 return UCODE_ERROR; 258 return UCODE_ERROR;
252 } 259 }
@@ -255,64 +262,65 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
255 leftover = size - offset; 262 leftover = size - offset;
256 263
257 while (leftover) { 264 while (leftover) {
258 unsigned int uninitialized_var(mc_size); 265 mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size);
259 struct microcode_header_amd *mc_header; 266 if (!mc_hdr)
260
261 mc = get_next_ucode(ucode_ptr, leftover, &mc_size);
262 if (!mc)
263 break; 267 break;
264 268
265 mc_header = (struct microcode_header_amd *)mc; 269 if (get_matching_microcode(cpu, mc_hdr, new_rev)) {
266 if (get_matching_microcode(cpu, mc, new_rev)) {
267 vfree(new_mc); 270 vfree(new_mc);
268 new_rev = mc_header->patch_id; 271 new_rev = mc_hdr->patch_id;
269 new_mc = mc; 272 new_mc = mc_hdr;
270 } else 273 } else
271 vfree(mc); 274 vfree(mc_hdr);
272 275
273 ucode_ptr += mc_size; 276 ucode_ptr += mc_size;
274 leftover -= mc_size; 277 leftover -= mc_size;
275 } 278 }
276 279
277 if (new_mc) { 280 if (!new_mc) {
278 if (!leftover) {
279 vfree(uci->mc);
280 uci->mc = new_mc;
281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
282 cpu, new_rev, uci->cpu_sig.rev);
283 } else {
284 vfree(new_mc);
285 state = UCODE_ERROR;
286 }
287 } else
288 state = UCODE_NFOUND; 281 state = UCODE_NFOUND;
282 goto free_table;
283 }
289 284
285 if (!leftover) {
286 vfree(uci->mc);
287 uci->mc = new_mc;
288 pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n",
289 cpu, uci->cpu_sig.rev, new_rev);
290 } else {
291 vfree(new_mc);
292 state = UCODE_ERROR;
293 }
294
295free_table:
290 free_equiv_cpu_table(); 296 free_equiv_cpu_table();
291 297
292 return state; 298 return state;
293} 299}
294 300
295static enum ucode_state request_microcode_fw(int cpu, struct device *device) 301static enum ucode_state request_microcode_amd(int cpu, struct device *device)
296{ 302{
297 const char *fw_name = "amd-ucode/microcode_amd.bin"; 303 const char *fw_name = "amd-ucode/microcode_amd.bin";
298 const struct firmware *firmware; 304 const struct firmware *fw;
299 enum ucode_state ret; 305 enum ucode_state ret = UCODE_NFOUND;
300 306
301 if (request_firmware(&firmware, fw_name, device)) { 307 if (request_firmware(&fw, fw_name, device)) {
302 printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); 308 pr_err("failed to load file %s\n", fw_name);
303 return UCODE_NFOUND; 309 goto out;
304 } 310 }
305 311
306 if (*(u32 *)firmware->data != UCODE_MAGIC) { 312 ret = UCODE_ERROR;
307 pr_err("invalid UCODE_MAGIC (0x%08x)\n", 313 if (*(u32 *)fw->data != UCODE_MAGIC) {
308 *(u32 *)firmware->data); 314 pr_err("invalid magic value (0x%08x)\n", *(u32 *)fw->data);
309 return UCODE_ERROR; 315 goto fw_release;
310 } 316 }
311 317
312 ret = generic_load_microcode(cpu, firmware->data, firmware->size); 318 ret = generic_load_microcode(cpu, fw->data, fw->size);
313 319
314 release_firmware(firmware); 320fw_release:
321 release_firmware(fw);
315 322
323out:
316 return ret; 324 return ret;
317} 325}
318 326
@@ -333,7 +341,7 @@ static void microcode_fini_cpu_amd(int cpu)
333 341
334static struct microcode_ops microcode_amd_ops = { 342static struct microcode_ops microcode_amd_ops = {
335 .request_microcode_user = request_microcode_user, 343 .request_microcode_user = request_microcode_user,
336 .request_microcode_fw = request_microcode_fw, 344 .request_microcode_fw = request_microcode_amd,
337 .collect_cpu_info = collect_cpu_info_amd, 345 .collect_cpu_info = collect_cpu_info_amd,
338 .apply_microcode = apply_microcode_amd, 346 .apply_microcode = apply_microcode_amd,
339 .microcode_fini_cpu = microcode_fini_cpu_amd, 347 .microcode_fini_cpu = microcode_fini_cpu_amd,
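Note on the microcode_amd.c hunk above: verify_ucode_size() replaces the single UCODE_MAX_SIZE cap with per-family limits (2048 bytes generically, 1824 for family 0x14, 4096 for family 0x15) and takes the payload length from the two little-endian bytes at offsets 4 and 5 of the section header, rejecting anything larger than the remaining buffer or the family cap. A stand-alone sketch of that check, using the same constants but an assumed demo driver:

#include <stdio.h>
#include <stdint.h>

#define F1XH_MPB_MAX_SIZE 2048
#define F14H_MPB_MAX_SIZE 1824
#define F15H_MPB_MAX_SIZE 4096

/* Return the patch payload size, or 0 if it exceeds the buffer or family cap. */
static unsigned verify_ucode_size(unsigned family, const uint8_t *buf,
				  unsigned bufsize)
{
	unsigned max_size, actual_size;

	switch (family) {
	case 0x14: max_size = F14H_MPB_MAX_SIZE; break;
	case 0x15: max_size = F15H_MPB_MAX_SIZE; break;
	default:   max_size = F1XH_MPB_MAX_SIZE; break;
	}

	/* bytes 4..5 of the section header hold the size, little endian */
	actual_size = buf[4] | (buf[5] << 8);

	if (actual_size > bufsize || actual_size > max_size)
		return 0;

	return actual_size;
}

int main(void)
{
	uint8_t hdr[8] = { 0x01, 0, 0, 0, 0x00, 0x08, 0, 0 }; /* size = 2048 */

	printf("family 0x15: %u bytes\n", verify_ucode_size(0x15, hdr, 4096));
	printf("family 0x14: %u bytes\n", verify_ucode_size(0x14, hdr, 4096));
	return 0;
}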
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index fa6551d36c10..f9242800bc84 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -82,6 +82,7 @@
82#include <linux/cpu.h> 82#include <linux/cpu.h>
83#include <linux/fs.h> 83#include <linux/fs.h>
84#include <linux/mm.h> 84#include <linux/mm.h>
85#include <linux/syscore_ops.h>
85 86
86#include <asm/microcode.h> 87#include <asm/microcode.h>
87#include <asm/processor.h> 88#include <asm/processor.h>
@@ -232,6 +233,7 @@ static const struct file_operations microcode_fops = {
232 .owner = THIS_MODULE, 233 .owner = THIS_MODULE,
233 .write = microcode_write, 234 .write = microcode_write,
234 .open = microcode_open, 235 .open = microcode_open,
236 .llseek = no_llseek,
235}; 237};
236 238
237static struct miscdevice microcode_dev = { 239static struct miscdevice microcode_dev = {
@@ -416,8 +418,10 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
416 if (err) 418 if (err)
417 return err; 419 return err;
418 420
419 if (microcode_init_cpu(cpu) == UCODE_ERROR) 421 if (microcode_init_cpu(cpu) == UCODE_ERROR) {
420 err = -EINVAL; 422 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
423 return -EINVAL;
424 }
421 425
422 return err; 426 return err;
423} 427}
@@ -435,33 +439,25 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
435 return 0; 439 return 0;
436} 440}
437 441
438static int mc_sysdev_resume(struct sys_device *dev) 442static struct sysdev_driver mc_sysdev_driver = {
443 .add = mc_sysdev_add,
444 .remove = mc_sysdev_remove,
445};
446
447/**
448 * mc_bp_resume - Update boot CPU microcode during resume.
449 */
450static void mc_bp_resume(void)
439{ 451{
440 int cpu = dev->id; 452 int cpu = smp_processor_id();
441 struct ucode_cpu_info *uci = ucode_cpu_info + cpu; 453 struct ucode_cpu_info *uci = ucode_cpu_info + cpu;
442 454
443 if (!cpu_online(cpu))
444 return 0;
445
446 /*
447 * All non-bootup cpus are still disabled,
448 * so only CPU 0 will apply ucode here.
449 *
450 * Moreover, there can be no concurrent
451 * updates from any other places at this point.
452 */
453 WARN_ON(cpu != 0);
454
455 if (uci->valid && uci->mc) 455 if (uci->valid && uci->mc)
456 microcode_ops->apply_microcode(cpu); 456 microcode_ops->apply_microcode(cpu);
457
458 return 0;
459} 457}
460 458
461static struct sysdev_driver mc_sysdev_driver = { 459static struct syscore_ops mc_syscore_ops = {
462 .add = mc_sysdev_add, 460 .resume = mc_bp_resume,
463 .remove = mc_sysdev_remove,
464 .resume = mc_sysdev_resume,
465}; 461};
466 462
467static __cpuinit int 463static __cpuinit int
@@ -539,6 +535,7 @@ static int __init microcode_init(void)
539 if (error) 535 if (error)
540 return error; 536 return error;
541 537
538 register_syscore_ops(&mc_syscore_ops);
542 register_hotcpu_notifier(&mc_cpu_notifier); 539 register_hotcpu_notifier(&mc_cpu_notifier);
543 540
544 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 541 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
@@ -553,6 +550,7 @@ static void __exit microcode_exit(void)
553 microcode_dev_exit(); 550 microcode_dev_exit();
554 551
555 unregister_hotcpu_notifier(&mc_cpu_notifier); 552 unregister_hotcpu_notifier(&mc_cpu_notifier);
553 unregister_syscore_ops(&mc_syscore_ops);
556 554
557 get_online_cpus(); 555 get_online_cpus();
558 mutex_lock(&microcode_mutex); 556 mutex_lock(&microcode_mutex);
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 356170262a93..1a1b606d3e92 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -12,7 +12,7 @@
12 * Software Developer's Manual 12 * Software Developer's Manual
13 * Order Number 253668 or free download from: 13 * Order Number 253668 or free download from:
14 * 14 *
15 * http://developer.intel.com/design/pentium4/manuals/253668.htm 15 * http://developer.intel.com/Assets/PDF/manual/253668.pdf
16 * 16 *
17 * For more information, go to http://www.urbanmyth.org/microcode 17 * For more information, go to http://www.urbanmyth.org/microcode
18 * 18 *
@@ -364,8 +364,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
364 364
365 /* For performance reasons, reuse mc area when possible */ 365 /* For performance reasons, reuse mc area when possible */
366 if (!mc || mc_size > curr_mc_size) { 366 if (!mc || mc_size > curr_mc_size) {
367 if (mc) 367 vfree(mc);
368 vfree(mc);
369 mc = vmalloc(mc_size); 368 mc = vmalloc(mc_size);
370 if (!mc) 369 if (!mc)
371 break; 370 break;
@@ -374,13 +373,11 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
374 373
375 if (get_ucode_data(mc, ucode_ptr, mc_size) || 374 if (get_ucode_data(mc, ucode_ptr, mc_size) ||
376 microcode_sanity_check(mc) < 0) { 375 microcode_sanity_check(mc) < 0) {
377 vfree(mc);
378 break; 376 break;
379 } 377 }
380 378
381 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) { 379 if (get_matching_microcode(&uci->cpu_sig, mc, new_rev)) {
382 if (new_mc) 380 vfree(new_mc);
383 vfree(new_mc);
384 new_rev = mc_header.rev; 381 new_rev = mc_header.rev;
385 new_mc = mc; 382 new_mc = mc;
386 mc = NULL; /* trigger new vmalloc */ 383 mc = NULL; /* trigger new vmalloc */
@@ -390,12 +387,10 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
390 leftover -= mc_size; 387 leftover -= mc_size;
391 } 388 }
392 389
393 if (mc) 390 vfree(mc);
394 vfree(mc);
395 391
396 if (leftover) { 392 if (leftover) {
397 if (new_mc) 393 vfree(new_mc);
398 vfree(new_mc);
399 state = UCODE_ERROR; 394 state = UCODE_ERROR;
400 goto out; 395 goto out;
401 } 396 }
@@ -405,8 +400,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 goto out; 400 goto out;
406 } 401 }
407 402
408 if (uci->mc) 403 vfree(uci->mc);
409 vfree(uci->mc);
410 uci->mc = (struct microcode_intel *)new_mc; 404 uci->mc = (struct microcode_intel *)new_mc;
411 405
412 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n", 406 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 71825806cd44..ac861b8348e2 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -25,7 +25,6 @@ struct pci_hostbridge_probe {
25}; 25};
26 26
27static u64 __cpuinitdata fam10h_pci_mmconf_base; 27static u64 __cpuinitdata fam10h_pci_mmconf_base;
28static int __cpuinitdata fam10h_pci_mmconf_base_status;
29 28
30static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = { 29static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
31 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 }, 30 { 0, 0x18, PCI_VENDOR_ID_AMD, 0x1200 },
@@ -44,10 +43,12 @@ static int __cpuinit cmp_range(const void *x1, const void *x2)
44 return start1 - start2; 43 return start1 - start2;
45} 44}
46 45
47/*[47:0] */ 46#define MMCONF_UNIT (1ULL << FAM10H_MMIO_CONF_BASE_SHIFT)
48/* need to avoid (0xfd<<32) and (0xfe<<32), ht used space */ 47#define MMCONF_MASK (~(MMCONF_UNIT - 1))
48#define MMCONF_SIZE (MMCONF_UNIT << 8)
49/* need to avoid (0xfd<<32), (0xfe<<32), and (0xff<<32), ht used space */
49#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32) 50#define FAM10H_PCI_MMCONF_BASE (0xfcULL<<32)
50#define BASE_VALID(b) ((b != (0xfdULL << 32)) && (b != (0xfeULL << 32))) 51#define BASE_VALID(b) ((b) + MMCONF_SIZE <= (0xfdULL<<32) || (b) >= (1ULL<<40))
51static void __cpuinit get_fam10h_pci_mmconf_base(void) 52static void __cpuinit get_fam10h_pci_mmconf_base(void)
52{ 53{
53 int i; 54 int i;
@@ -64,12 +65,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
64 struct range range[8]; 65 struct range range[8];
65 66
66 /* only try to get setting from BSP */ 67 /* only try to get setting from BSP */
67 /* -1 or 1 */ 68 if (fam10h_pci_mmconf_base)
68 if (fam10h_pci_mmconf_base_status)
69 return; 69 return;
70 70
71 if (!early_pci_allowed()) 71 if (!early_pci_allowed())
72 goto fail; 72 return;
73 73
74 found = 0; 74 found = 0;
75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 75 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
@@ -91,7 +91,7 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
91 } 91 }
92 92
93 if (!found) 93 if (!found)
94 goto fail; 94 return;
95 95
96 /* SYS_CFG */ 96 /* SYS_CFG */
97 address = MSR_K8_SYSCFG; 97 address = MSR_K8_SYSCFG;
@@ -99,16 +99,16 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
99 99
100 /* TOP_MEM2 is not enabled? */ 100 /* TOP_MEM2 is not enabled? */
101 if (!(val & (1<<21))) { 101 if (!(val & (1<<21))) {
102 tom2 = 0; 102 tom2 = 1ULL << 32;
103 } else { 103 } else {
104 /* TOP_MEM2 */ 104 /* TOP_MEM2 */
105 address = MSR_K8_TOP_MEM2; 105 address = MSR_K8_TOP_MEM2;
106 rdmsrl(address, val); 106 rdmsrl(address, val);
107 tom2 = val & (0xffffULL<<32); 107 tom2 = max(val & 0xffffff800000ULL, 1ULL << 32);
108 } 108 }
109 109
110 if (base <= tom2) 110 if (base <= tom2)
111 base = tom2 + (1ULL<<32); 111 base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
112 112
113 /* 113 /*
114 * need to check if the range is in the high mmio range that is 114 * need to check if the range is in the high mmio range that is
@@ -123,11 +123,11 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
123 if (!(reg & 3)) 123 if (!(reg & 3))
124 continue; 124 continue;
125 125
126 start = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 126 start = (u64)(reg & 0xffffff00) << 8; /* 39:16 on 31:8*/
127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3)); 127 reg = read_pci_config(bus, slot, 1, 0x84 + (i << 3));
128 end = (((u64)reg) << 8) & (0xffULL << 32); /* 39:16 on 31:8*/ 128 end = ((u64)(reg & 0xffffff00) << 8) | 0xffff; /* 39:16 on 31:8*/
129 129
130 if (!end) 130 if (end < tom2)
131 continue; 131 continue;
132 132
133 range[hi_mmio_num].start = start; 133 range[hi_mmio_num].start = start;
@@ -143,32 +143,27 @@ static void __cpuinit get_fam10h_pci_mmconf_base(void)
143 143
144 if (range[hi_mmio_num - 1].end < base) 144 if (range[hi_mmio_num - 1].end < base)
145 goto out; 145 goto out;
146 if (range[0].start > base) 146 if (range[0].start > base + MMCONF_SIZE)
147 goto out; 147 goto out;
148 148
149 /* need to find one window */ 149 /* need to find one window */
150 base = range[0].start - (1ULL << 32); 150 base = (range[0].start & MMCONF_MASK) - MMCONF_UNIT;
151 if ((base > tom2) && BASE_VALID(base)) 151 if ((base > tom2) && BASE_VALID(base))
152 goto out; 152 goto out;
153 base = range[hi_mmio_num - 1].end + (1ULL << 32); 153 base = (range[hi_mmio_num - 1].end + MMCONF_UNIT) & MMCONF_MASK;
154 if ((base > tom2) && BASE_VALID(base)) 154 if (BASE_VALID(base))
155 goto out; 155 goto out;
156 /* need to find window between ranges */ 156 /* need to find window between ranges */
157 if (hi_mmio_num > 1) 157 for (i = 1; i < hi_mmio_num; i++) {
158 for (i = 0; i < hi_mmio_num - 1; i++) { 158 base = (range[i - 1].end + MMCONF_UNIT) & MMCONF_MASK;
159 if (range[i + 1].start > (range[i].end + (1ULL << 32))) { 159 val = range[i].start & MMCONF_MASK;
160 base = range[i].end + (1ULL << 32); 160 if (val >= base + MMCONF_SIZE && BASE_VALID(base))
161 if ((base > tom2) && BASE_VALID(base)) 161 goto out;
162 goto out;
163 }
164 } 162 }
165
166fail:
167 fam10h_pci_mmconf_base_status = -1;
168 return; 163 return;
164
169out: 165out:
170 fam10h_pci_mmconf_base = base; 166 fam10h_pci_mmconf_base = base;
171 fam10h_pci_mmconf_base_status = 1;
172} 167}
173 168
174void __cpuinit fam10h_check_enable_mmcfg(void) 169void __cpuinit fam10h_check_enable_mmcfg(void)
@@ -190,11 +185,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
190 185
191 /* only trust the one handle 256 buses, if acpi=off */ 186 /* only trust the one handle 256 buses, if acpi=off */
192 if (!acpi_pci_disabled || busnbits >= 8) { 187 if (!acpi_pci_disabled || busnbits >= 8) {
193 u64 base; 188 u64 base = val & MMCONF_MASK;
194 base = val & (0xffffULL << 32); 189
195 if (fam10h_pci_mmconf_base_status <= 0) { 190 if (!fam10h_pci_mmconf_base) {
196 fam10h_pci_mmconf_base = base; 191 fam10h_pci_mmconf_base = base;
197 fam10h_pci_mmconf_base_status = 1;
198 return; 192 return;
199 } else if (fam10h_pci_mmconf_base == base) 193 } else if (fam10h_pci_mmconf_base == base)
200 return; 194 return;
@@ -206,8 +200,10 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
206 * with 256 buses 200 * with 256 buses
207 */ 201 */
208 get_fam10h_pci_mmconf_base(); 202 get_fam10h_pci_mmconf_base();
209 if (fam10h_pci_mmconf_base_status <= 0) 203 if (!fam10h_pci_mmconf_base) {
204 pci_probe &= ~PCI_CHECK_ENABLE_AMD_MMCONF;
210 return; 205 return;
206 }
211 207
212 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n"); 208 printk(KERN_INFO "Enable MMCONFIG on AMD Family 10h\n");
213 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) | 209 val &= ~((FAM10H_MMIO_CONF_BASE_MASK<<FAM10H_MMIO_CONF_BASE_SHIFT) |
@@ -217,13 +213,13 @@ void __cpuinit fam10h_check_enable_mmcfg(void)
217 wrmsrl(address, val); 213 wrmsrl(address, val);
218} 214}
219 215
220static int __devinit set_check_enable_amd_mmconf(const struct dmi_system_id *d) 216static int __init set_check_enable_amd_mmconf(const struct dmi_system_id *d)
221{ 217{
222 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF; 218 pci_probe |= PCI_CHECK_ENABLE_AMD_MMCONF;
223 return 0; 219 return 0;
224} 220}
225 221
226static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = { 222static const struct dmi_system_id __initconst mmconf_dmi_table[] = {
227 { 223 {
228 .callback = set_check_enable_amd_mmconf, 224 .callback = set_check_enable_amd_mmconf,
229 .ident = "Sun Microsystems Machine", 225 .ident = "Sun Microsystems Machine",
@@ -234,7 +230,8 @@ static const struct dmi_system_id __cpuinitconst mmconf_dmi_table[] = {
234 {} 230 {}
235}; 231};
236 232
237void __cpuinit check_enable_amd_mmconf_dmi(void) 233/* Called from a __cpuinit function, but only on the BSP. */
234void __ref check_enable_amd_mmconf_dmi(void)
238{ 235{
239 dmi_check_system(mmconf_dmi_table); 236 dmi_check_system(mmconf_dmi_table);
240} 237}
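Note on the mmconf-fam10h_64.c hunk above: the rework expresses the MMCONFIG window in units of 1 << FAM10H_MMIO_CONF_BASE_SHIFT. MMCONF_UNIT is one such unit, MMCONF_SIZE covers 256 buses, and candidate bases are rounded with MMCONF_MASK, for example base = (tom2 + 2 * MMCONF_UNIT - 1) & MMCONF_MASK to land on an aligned address above TOP_MEM2. A small demo of that rounding; the shift value of 20 below is an assumption for illustration, not taken from the header:

#include <stdio.h>
#include <stdint.h>

/* Assumed stand-in for FAM10H_MMIO_CONF_BASE_SHIFT in this demo. */
#define CONF_BASE_SHIFT 20
#define MMCONF_UNIT (1ULL << CONF_BASE_SHIFT)
#define MMCONF_MASK (~(MMCONF_UNIT - 1))
#define MMCONF_SIZE (MMCONF_UNIT << 8)   /* room for 256 buses */

/* Pick a unit-aligned base above the limit, as the patch does when placing
 * the MMCONFIG window just past TOP_MEM2. */
static uint64_t base_above(uint64_t limit)
{
	return (limit + 2 * MMCONF_UNIT - 1) & MMCONF_MASK;
}

int main(void)
{
	uint64_t tom2 = 0x123456789ULL;
	uint64_t base = base_above(tom2);

	printf("tom2 = %#llx\n", (unsigned long long)tom2);
	printf("base = %#llx (unit %#llx, window %#llx)\n",
	       (unsigned long long)base,
	       (unsigned long long)MMCONF_UNIT,
	       (unsigned long long)MMCONF_SIZE);
	return 0;
}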
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 1c355c550960..52f256f2cc81 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -24,6 +24,7 @@
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h> 26#include <linux/gfp.h>
27#include <linux/jump_label.h>
27 28
28#include <asm/system.h> 29#include <asm/system.h>
29#include <asm/page.h> 30#include <asm/page.h>
@@ -37,20 +38,11 @@
37 38
38void *module_alloc(unsigned long size) 39void *module_alloc(unsigned long size)
39{ 40{
40 struct vm_struct *area; 41 if (PAGE_ALIGN(size) > MODULES_LEN)
41
42 if (!size)
43 return NULL;
44 size = PAGE_ALIGN(size);
45 if (size > MODULES_LEN)
46 return NULL;
47
48 area = __get_vm_area(size, VM_ALLOC, MODULES_VADDR, MODULES_END);
49 if (!area)
50 return NULL; 42 return NULL;
51 43 return __vmalloc_node_range(size, 1, MODULES_VADDR, MODULES_END,
52 return __vmalloc_area(area, GFP_KERNEL | __GFP_HIGHMEM, 44 GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC,
53 PAGE_KERNEL_EXEC); 45 -1, __builtin_return_address(0));
54} 46}
55 47
56/* Free memory returned from module_alloc */ 48/* Free memory returned from module_alloc */
@@ -239,6 +231,9 @@ int module_finalize(const Elf_Ehdr *hdr,
239 apply_paravirt(pseg, pseg + para->sh_size); 231 apply_paravirt(pseg, pseg + para->sh_size);
240 } 232 }
241 233
234 /* make jump label nops */
235 jump_label_apply_nops(me);
236
242 return 0; 237 return 0;
243} 238}
244 239
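Note on the module.c hunk above: module_alloc() collapses the old __get_vm_area() plus __vmalloc_area() pair into a single __vmalloc_node_range() call bounded by MODULES_VADDR..MODULES_END, keeping only the size guard up front. A user-space stand-in for that guard alone; PAGE_SIZE and the limit here are assumptions for the demo, not the kernel's values:

#include <stdio.h>

#define PAGE_SIZE     4096UL                    /* assumed for the demo */
#define PAGE_ALIGN(x) (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))
#define MODULES_LEN   (1536UL * 1024 * 1024)    /* stand-in for the real limit */

/* The size guard from the patch in isolation: oversized requests are
 * rejected before any allocation is attempted. */
static void *demo_module_alloc(unsigned long size)
{
	if (PAGE_ALIGN(size) > MODULES_LEN)
		return NULL;
	/* the kernel would call __vmalloc_node_range() here */
	return (void *)PAGE_ALIGN(size);
}

int main(void)
{
	printf("alloc(1)       -> %p\n", demo_module_alloc(1));
	printf("alloc(too big) -> %p\n", demo_module_alloc(~0UL / 2));
	return 0;
}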
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index d7b6f7fb4fec..9103b89c145a 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -11,6 +11,7 @@
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/delay.h> 12#include <linux/delay.h>
13#include <linux/bootmem.h> 13#include <linux/bootmem.h>
14#include <linux/memblock.h>
14#include <linux/kernel_stat.h> 15#include <linux/kernel_stat.h>
15#include <linux/mc146818rtc.h> 16#include <linux/mc146818rtc.h>
16#include <linux/bitops.h> 17#include <linux/bitops.h>
@@ -117,21 +118,8 @@ static void __init MP_bus_info(struct mpc_bus *m)
117 118
118static void __init MP_ioapic_info(struct mpc_ioapic *m) 119static void __init MP_ioapic_info(struct mpc_ioapic *m)
119{ 120{
120 if (!(m->flags & MPC_APIC_USABLE)) 121 if (m->flags & MPC_APIC_USABLE)
121 return; 122 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
122
123 printk(KERN_INFO "I/O APIC #%d Version %d at 0x%X.\n",
124 m->apicid, m->apicver, m->apicaddr);
125
126 mp_register_ioapic(m->apicid, m->apicaddr, gsi_top);
127}
128
129static void print_MP_intsrc_info(struct mpc_intsrc *m)
130{
131 apic_printk(APIC_VERBOSE, "Int: type %d, pol %d, trig %d, bus %02x,"
132 " IRQ %02x, APIC ID %x, APIC INT %02x\n",
133 m->irqtype, m->irqflag & 3, (m->irqflag >> 2) & 3, m->srcbus,
134 m->srcbusirq, m->dstapic, m->dstirq);
135} 123}
136 124
137static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq) 125static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
@@ -143,73 +131,11 @@ static void __init print_mp_irq_info(struct mpc_intsrc *mp_irq)
143 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq); 131 mp_irq->srcbusirq, mp_irq->dstapic, mp_irq->dstirq);
144} 132}
145 133
146static void __init assign_to_mp_irq(struct mpc_intsrc *m,
147 struct mpc_intsrc *mp_irq)
148{
149 mp_irq->dstapic = m->dstapic;
150 mp_irq->type = m->type;
151 mp_irq->irqtype = m->irqtype;
152 mp_irq->irqflag = m->irqflag;
153 mp_irq->srcbus = m->srcbus;
154 mp_irq->srcbusirq = m->srcbusirq;
155 mp_irq->dstirq = m->dstirq;
156}
157
158static void __init assign_to_mpc_intsrc(struct mpc_intsrc *mp_irq,
159 struct mpc_intsrc *m)
160{
161 m->dstapic = mp_irq->dstapic;
162 m->type = mp_irq->type;
163 m->irqtype = mp_irq->irqtype;
164 m->irqflag = mp_irq->irqflag;
165 m->srcbus = mp_irq->srcbus;
166 m->srcbusirq = mp_irq->srcbusirq;
167 m->dstirq = mp_irq->dstirq;
168}
169
170static int __init mp_irq_mpc_intsrc_cmp(struct mpc_intsrc *mp_irq,
171 struct mpc_intsrc *m)
172{
173 if (mp_irq->dstapic != m->dstapic)
174 return 1;
175 if (mp_irq->type != m->type)
176 return 2;
177 if (mp_irq->irqtype != m->irqtype)
178 return 3;
179 if (mp_irq->irqflag != m->irqflag)
180 return 4;
181 if (mp_irq->srcbus != m->srcbus)
182 return 5;
183 if (mp_irq->srcbusirq != m->srcbusirq)
184 return 6;
185 if (mp_irq->dstirq != m->dstirq)
186 return 7;
187
188 return 0;
189}
190
191static void __init MP_intsrc_info(struct mpc_intsrc *m)
192{
193 int i;
194
195 print_MP_intsrc_info(m);
196
197 for (i = 0; i < mp_irq_entries; i++) {
198 if (!mp_irq_mpc_intsrc_cmp(&mp_irqs[i], m))
199 return;
200 }
201
202 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
203 if (++mp_irq_entries == MAX_IRQ_SOURCES)
204 panic("Max # of irq sources exceeded!!\n");
205}
206#else /* CONFIG_X86_IO_APIC */ 134#else /* CONFIG_X86_IO_APIC */
207static inline void __init MP_bus_info(struct mpc_bus *m) {} 135static inline void __init MP_bus_info(struct mpc_bus *m) {}
208static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {} 136static inline void __init MP_ioapic_info(struct mpc_ioapic *m) {}
209static inline void __init MP_intsrc_info(struct mpc_intsrc *m) {}
210#endif /* CONFIG_X86_IO_APIC */ 137#endif /* CONFIG_X86_IO_APIC */
211 138
212
213static void __init MP_lintsrc_info(struct mpc_lintsrc *m) 139static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
214{ 140{
215 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x," 141 apic_printk(APIC_VERBOSE, "Lint: type %d, pol %d, trig %d, bus %02x,"
@@ -221,7 +147,6 @@ static void __init MP_lintsrc_info(struct mpc_lintsrc *m)
221/* 147/*
222 * Read/parse the MPC 148 * Read/parse the MPC
223 */ 149 */
224
225static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str) 150static int __init smp_check_mpc(struct mpc_table *mpc, char *oem, char *str)
226{ 151{
227 152
@@ -274,18 +199,6 @@ static void __init smp_dump_mptable(struct mpc_table *mpc, unsigned char *mpt)
274 199
275void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { } 200void __init default_smp_read_mpc_oem(struct mpc_table *mpc) { }
276 201
277static void __init smp_register_lapic_address(unsigned long address)
278{
279 mp_lapic_addr = address;
280
281 set_fixmap_nocache(FIX_APIC_BASE, address);
282 if (boot_cpu_physical_apicid == -1U) {
283 boot_cpu_physical_apicid = read_apic_id();
284 apic_version[boot_cpu_physical_apicid] =
285 GET_APIC_VERSION(apic_read(APIC_LVR));
286 }
287}
288
289static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early) 202static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
290{ 203{
291 char str[16]; 204 char str[16];
@@ -300,17 +213,13 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
300#ifdef CONFIG_X86_32 213#ifdef CONFIG_X86_32
301 generic_mps_oem_check(mpc, oem, str); 214 generic_mps_oem_check(mpc, oem, str);
302#endif 215#endif
303 /* save the local APIC address, it might be non-default */ 216 /* Initialize the lapic mapping */
304 if (!acpi_lapic) 217 if (!acpi_lapic)
305 mp_lapic_addr = mpc->lapic; 218 register_lapic_address(mpc->lapic);
306 219
307 if (early) 220 if (early)
308 return 1; 221 return 1;
309 222
310 /* Initialize the lapic mapping */
311 if (!acpi_lapic)
312 smp_register_lapic_address(mpc->lapic);
313
314 if (mpc->oemptr) 223 if (mpc->oemptr)
315 x86_init.mpparse.smp_read_mpc_oem(mpc); 224 x86_init.mpparse.smp_read_mpc_oem(mpc);
316 225
@@ -336,7 +245,7 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
336 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic)); 245 skip_entry(&mpt, &count, sizeof(struct mpc_ioapic));
337 break; 246 break;
338 case MP_INTSRC: 247 case MP_INTSRC:
339 MP_intsrc_info((struct mpc_intsrc *)mpt); 248 mp_save_irq((struct mpc_intsrc *)mpt);
340 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc)); 249 skip_entry(&mpt, &count, sizeof(struct mpc_intsrc));
341 break; 250 break;
342 case MP_LINTSRC: 251 case MP_LINTSRC:
@@ -376,7 +285,7 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
376 intsrc.type = MP_INTSRC; 285 intsrc.type = MP_INTSRC;
377 intsrc.irqflag = 0; /* conforming */ 286 intsrc.irqflag = 0; /* conforming */
378 intsrc.srcbus = 0; 287 intsrc.srcbus = 0;
379 intsrc.dstapic = mp_ioapics[0].apicid; 288 intsrc.dstapic = mpc_ioapic_id(0);
380 289
381 intsrc.irqtype = mp_INT; 290 intsrc.irqtype = mp_INT;
382 291
@@ -428,13 +337,13 @@ static void __init construct_default_ioirq_mptable(int mpc_default_type)
428 337
429 intsrc.srcbusirq = i; 338 intsrc.srcbusirq = i;
430 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */ 339 intsrc.dstirq = i ? i : 2; /* IRQ0 to INTIN2 */
431 MP_intsrc_info(&intsrc); 340 mp_save_irq(&intsrc);
432 } 341 }
433 342
434 intsrc.irqtype = mp_ExtINT; 343 intsrc.irqtype = mp_ExtINT;
435 intsrc.srcbusirq = 0; 344 intsrc.srcbusirq = 0;
436 intsrc.dstirq = 0; /* 8259A to INTIN0 */ 345 intsrc.dstirq = 0; /* 8259A to INTIN0 */
437 MP_intsrc_info(&intsrc); 346 mp_save_irq(&intsrc);
438} 347}
439 348
440 349
@@ -657,7 +566,7 @@ static void __init smp_reserve_memory(struct mpf_intel *mpf)
657{ 566{
658 unsigned long size = get_mpc_size(mpf->physptr); 567 unsigned long size = get_mpc_size(mpf->physptr);
659 568
660 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc"); 569 memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc");
661} 570}
662 571
663static int __init smp_scan_config(unsigned long base, unsigned long length) 572static int __init smp_scan_config(unsigned long base, unsigned long length)
@@ -686,7 +595,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length)
686 mpf, (u64)virt_to_phys(mpf)); 595 mpf, (u64)virt_to_phys(mpf));
687 596
688 mem = virt_to_phys(mpf); 597 mem = virt_to_phys(mpf);
689 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf"); 598 memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf");
690 if (mpf->physptr) 599 if (mpf->physptr)
691 smp_reserve_memory(mpf); 600 smp_reserve_memory(mpf);
692 601
@@ -783,11 +692,11 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
783 int i; 692 int i;
784 693
785 apic_printk(APIC_VERBOSE, "OLD "); 694 apic_printk(APIC_VERBOSE, "OLD ");
786 print_MP_intsrc_info(m); 695 print_mp_irq_info(m);
787 696
788 i = get_MP_intsrc_index(m); 697 i = get_MP_intsrc_index(m);
789 if (i > 0) { 698 if (i > 0) {
790 assign_to_mpc_intsrc(&mp_irqs[i], m); 699 memcpy(m, &mp_irqs[i], sizeof(*m));
791 apic_printk(APIC_VERBOSE, "NEW "); 700 apic_printk(APIC_VERBOSE, "NEW ");
792 print_mp_irq_info(&mp_irqs[i]); 701 print_mp_irq_info(&mp_irqs[i]);
793 return; 702 return;
@@ -805,23 +714,21 @@ static void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare)
805 *nr_m_spare += 1; 714 *nr_m_spare += 1;
806 } 715 }
807} 716}
808#else /* CONFIG_X86_IO_APIC */
809static
810inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
811#endif /* CONFIG_X86_IO_APIC */
812 717
813static int 718static int __init
814check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count) 719check_slot(unsigned long mpc_new_phys, unsigned long mpc_new_length, int count)
815{ 720{
816 int ret = 0;
817
818 if (!mpc_new_phys || count <= mpc_new_length) { 721 if (!mpc_new_phys || count <= mpc_new_length) {
819 WARN(1, "update_mptable: No spare slots (length: %x)\n", count); 722 WARN(1, "update_mptable: No spare slots (length: %x)\n", count);
820 return -1; 723 return -1;
821 } 724 }
822 725
823 return ret; 726 return 0;
824} 727}
728#else /* CONFIG_X86_IO_APIC */
729static
730inline void __init check_irq_src(struct mpc_intsrc *m, int *nr_m_spare) {}
731#endif /* CONFIG_X86_IO_APIC */
825 732
826static int __init replace_intsrc_all(struct mpc_table *mpc, 733static int __init replace_intsrc_all(struct mpc_table *mpc,
827 unsigned long mpc_new_phys, 734 unsigned long mpc_new_phys,
@@ -874,14 +781,14 @@ static int __init replace_intsrc_all(struct mpc_table *mpc,
874 if (nr_m_spare > 0) { 781 if (nr_m_spare > 0) {
875 apic_printk(APIC_VERBOSE, "*NEW* found\n"); 782 apic_printk(APIC_VERBOSE, "*NEW* found\n");
876 nr_m_spare--; 783 nr_m_spare--;
877 assign_to_mpc_intsrc(&mp_irqs[i], m_spare[nr_m_spare]); 784 memcpy(m_spare[nr_m_spare], &mp_irqs[i], sizeof(mp_irqs[i]));
878 m_spare[nr_m_spare] = NULL; 785 m_spare[nr_m_spare] = NULL;
879 } else { 786 } else {
880 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt; 787 struct mpc_intsrc *m = (struct mpc_intsrc *)mpt;
881 count += sizeof(struct mpc_intsrc); 788 count += sizeof(struct mpc_intsrc);
882 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0) 789 if (check_slot(mpc_new_phys, mpc_new_length, count) < 0)
883 goto out; 790 goto out;
884 assign_to_mpc_intsrc(&mp_irqs[i], m); 791 memcpy(m, &mp_irqs[i], sizeof(*m));
885 mpc->length = count; 792 mpc->length = count;
886 mpt += sizeof(struct mpc_intsrc); 793 mpt += sizeof(struct mpc_intsrc);
887 } 794 }
@@ -974,7 +881,7 @@ static int __init update_mp_table(void)
974 881
975 if (!mpc_new_phys) { 882 if (!mpc_new_phys) {
976 unsigned char old, new; 883 unsigned char old, new;
977 /* check if we can change the postion */ 884 /* check if we can change the position */
978 mpc->checksum = 0; 885 mpc->checksum = 0;
979 old = mpf_checksum((unsigned char *)mpc, mpc->length); 886 old = mpf_checksum((unsigned char *)mpc, mpc->length);
980 mpc->checksum = 0xff; 887 mpc->checksum = 0xff;
@@ -983,7 +890,7 @@ static int __init update_mp_table(void)
983 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n"); 890 printk(KERN_INFO "mpc is readonly, please try alloc_mptable instead\n");
984 return 0; 891 return 0;
985 } 892 }
986 printk(KERN_INFO "use in-positon replacing\n"); 893 printk(KERN_INFO "use in-position replacing\n");
987 } else { 894 } else {
988 mpf->physptr = mpc_new_phys; 895 mpf->physptr = mpc_new_phys;
989 mpc_new = phys_to_virt(mpc_new_phys); 896 mpc_new = phys_to_virt(mpc_new_phys);
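The mpparse.c hunks above drop the open-coded field-by-field helpers (assign_to_mp_irq(), assign_to_mpc_intsrc(), mp_irq_mpc_intsrc_cmp()) in favour of mp_save_irq() and plain memcpy(); since struct mpc_intsrc is flat data, the two forms are equivalent. A minimal sketch of the save-and-deduplicate pattern the callers now rely on (the body of mp_save_irq itself is not part of this hunk, so the sketch is an assumption based on the removed MP_intsrc_info()):

	/* Sketch: record an MP interrupt source once, skipping duplicates.
	 * Uses mp_irqs[], mp_irq_entries and MAX_IRQ_SOURCES as above. */
	static void __init save_irq_sketch(struct mpc_intsrc *m)
	{
		int i;

		for (i = 0; i < mp_irq_entries; i++)
			if (!memcmp(&mp_irqs[i], m, sizeof(*m)))
				return;		/* already recorded */

		memcpy(&mp_irqs[mp_irq_entries], m, sizeof(*m));
		if (++mp_irq_entries == MAX_IRQ_SOURCES)
			panic("Max # of irq sources exceeded!!\n");
	}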
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
deleted file mode 100644
index 79ae68154e87..000000000000
--- a/arch/x86/kernel/mrst.c
+++ /dev/null
@@ -1,311 +0,0 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
17
18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28/*
29 * the clockevent devices on Moorestown/Medfield can be APBT or LAPIC clock,
30 * cmdline option x86_mrst_timer can be used to override the configuration
31 * to prefer one or the other.
32 * at runtime, there are basically three timer configurations:
33 * 1. per cpu apbt clock only
34 * 2. per cpu always-on lapic clocks only, this is Penwell/Medfield only
35 * 3. per cpu lapic clock (C3STOP) and one apbt clock, with broadcast.
36 *
37 * by default (without cmdline option), platform code first detects cpu type
38 * to see if we are on lincroft or penwell, then set up both lapic or apbt
39 * clocks accordingly.
40 * i.e. by default, medfield uses configuration #2, moorestown uses #1.
41 * config #3 is supported but not recommended on medfield.
42 *
43 * rating and feature summary:
44 * lapic (with C3STOP) --------- 100
45 * apbt (always-on) ------------ 110
46 * lapic (always-on,ARAT) ------ 150
47 */
48
49__cpuinitdata enum mrst_timer_options mrst_timer_options;
50
51static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
52static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
53enum mrst_cpu_type __mrst_cpu_chip;
54EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
55
56int sfi_mtimer_num;
57
58struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
59EXPORT_SYMBOL_GPL(sfi_mrtc_array);
60int sfi_mrtc_num;
61
62static inline void assign_to_mp_irq(struct mpc_intsrc *m,
63 struct mpc_intsrc *mp_irq)
64{
65 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
66}
67
68static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
69 struct mpc_intsrc *m)
70{
71 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
72}
73
74static void save_mp_irq(struct mpc_intsrc *m)
75{
76 int i;
77
78 for (i = 0; i < mp_irq_entries; i++) {
79 if (!mp_irq_cmp(&mp_irqs[i], m))
80 return;
81 }
82
83 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
84 if (++mp_irq_entries == MAX_IRQ_SOURCES)
85 panic("Max # of irq sources exceeded!!\n");
86}
87
88/* parse all the mtimer info to a static mtimer array */
89static int __init sfi_parse_mtmr(struct sfi_table_header *table)
90{
91 struct sfi_table_simple *sb;
92 struct sfi_timer_table_entry *pentry;
93 struct mpc_intsrc mp_irq;
94 int totallen;
95
96 sb = (struct sfi_table_simple *)table;
97 if (!sfi_mtimer_num) {
98 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
99 struct sfi_timer_table_entry);
100 pentry = (struct sfi_timer_table_entry *) sb->pentry;
101 totallen = sfi_mtimer_num * sizeof(*pentry);
102 memcpy(sfi_mtimer_array, pentry, totallen);
103 }
104
105 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
106 pentry = sfi_mtimer_array;
107 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
108 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
109 " irq = %d\n", totallen, (u32)pentry->phys_addr,
110 pentry->freq_hz, pentry->irq);
111 if (!pentry->irq)
112 continue;
113 mp_irq.type = MP_IOAPIC;
114 mp_irq.irqtype = mp_INT;
115/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
116 mp_irq.irqflag = 5;
117 mp_irq.srcbus = 0;
118 mp_irq.srcbusirq = pentry->irq; /* IRQ */
119 mp_irq.dstapic = MP_APIC_ALL;
120 mp_irq.dstirq = pentry->irq;
121 save_mp_irq(&mp_irq);
122 }
123
124 return 0;
125}
126
127struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
128{
129 int i;
130 if (hint < sfi_mtimer_num) {
131 if (!sfi_mtimer_usage[hint]) {
132 pr_debug("hint taken for timer %d irq %d\n",\
133 hint, sfi_mtimer_array[hint].irq);
134 sfi_mtimer_usage[hint] = 1;
135 return &sfi_mtimer_array[hint];
136 }
137 }
138 /* take the first timer available */
139 for (i = 0; i < sfi_mtimer_num;) {
140 if (!sfi_mtimer_usage[i]) {
141 sfi_mtimer_usage[i] = 1;
142 return &sfi_mtimer_array[i];
143 }
144 i++;
145 }
146 return NULL;
147}
148
149void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
150{
151 int i;
152 for (i = 0; i < sfi_mtimer_num;) {
153 if (mtmr->irq == sfi_mtimer_array[i].irq) {
154 sfi_mtimer_usage[i] = 0;
155 return;
156 }
157 i++;
158 }
159}
160
161/* parse all the mrtc info to a global mrtc array */
162int __init sfi_parse_mrtc(struct sfi_table_header *table)
163{
164 struct sfi_table_simple *sb;
165 struct sfi_rtc_table_entry *pentry;
166 struct mpc_intsrc mp_irq;
167
168 int totallen;
169
170 sb = (struct sfi_table_simple *)table;
171 if (!sfi_mrtc_num) {
172 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
173 struct sfi_rtc_table_entry);
174 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
175 totallen = sfi_mrtc_num * sizeof(*pentry);
176 memcpy(sfi_mrtc_array, pentry, totallen);
177 }
178
179 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
180 pentry = sfi_mrtc_array;
181 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
182 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
183 totallen, (u32)pentry->phys_addr, pentry->irq);
184 mp_irq.type = MP_IOAPIC;
185 mp_irq.irqtype = mp_INT;
186 mp_irq.irqflag = 0;
187 mp_irq.srcbus = 0;
188 mp_irq.srcbusirq = pentry->irq; /* IRQ */
189 mp_irq.dstapic = MP_APIC_ALL;
190 mp_irq.dstirq = pentry->irq;
191 save_mp_irq(&mp_irq);
192 }
193 return 0;
194}
195
196static unsigned long __init mrst_calibrate_tsc(void)
197{
198 unsigned long flags, fast_calibrate;
199
200 local_irq_save(flags);
201 fast_calibrate = apbt_quick_calibrate();
202 local_irq_restore(flags);
203
204 if (fast_calibrate)
205 return fast_calibrate;
206
207 return 0;
208}
209
210void __init mrst_time_init(void)
211{
212 switch (mrst_timer_options) {
213 case MRST_TIMER_APBT_ONLY:
214 break;
215 case MRST_TIMER_LAPIC_APBT:
216 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
217 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
218 break;
219 default:
220 if (!boot_cpu_has(X86_FEATURE_ARAT))
221 break;
222 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
223 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
224 return;
225 }
226 /* we need at least one APB timer */
227 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
228 pre_init_apic_IRQ0();
229 apbt_time_init();
230}
231
232void __init mrst_rtc_init(void)
233{
234 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
235}
236
237void __cpuinit mrst_arch_setup(void)
238{
239 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
240 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
241 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
242 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
243 else {
244 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
245 boot_cpu_data.x86, boot_cpu_data.x86_model);
246 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
247 }
248 pr_debug("Moorestown CPU %s identified\n",
249 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
250 "Lincroft" : "Penwell");
251}
252
253/* MID systems don't have i8042 controller */
254static int mrst_i8042_detect(void)
255{
256 return 0;
257}
258
259/*
260 * Moorestown specific x86_init function overrides and early setup
261 * calls.
262 */
263void __init x86_mrst_early_setup(void)
264{
265 x86_init.resources.probe_roms = x86_init_noop;
266 x86_init.resources.reserve_resources = x86_init_noop;
267
268 x86_init.timers.timer_init = mrst_time_init;
269 x86_init.timers.setup_percpu_clockev = x86_init_noop;
270
271 x86_init.irqs.pre_vector_init = x86_init_noop;
272
273 x86_init.oem.arch_setup = mrst_arch_setup;
274
275 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
276
277 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
278 x86_platform.i8042_detect = mrst_i8042_detect;
279 x86_init.pci.init = pci_mrst_init;
280 x86_init.pci.fixup_irqs = x86_init_noop;
281
282 legacy_pic = &null_legacy_pic;
283
284 /* Avoid searching for BIOS MP tables */
285 x86_init.mpparse.find_smp_config = x86_init_noop;
286 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
287
288}
289
290/*
291 * if user does not want to use per CPU apb timer, just give it a lower rating
292 * than local apic timer and skip the late per cpu timer init.
293 */
294static inline int __init setup_x86_mrst_timer(char *arg)
295{
296 if (!arg)
297 return -EINVAL;
298
299 if (strcmp("apbt_only", arg) == 0)
300 mrst_timer_options = MRST_TIMER_APBT_ONLY;
301 else if (strcmp("lapic_and_apbt", arg) == 0)
302 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
303 else {
304 pr_warning("X86 MRST timer option %s not recognised"
305 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
306 arg);
307 return -EINVAL;
308 }
309 return 0;
310}
311__setup("x86_mrst_timer=", setup_x86_mrst_timer);
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 7bf2dc4c8f70..12fcbe2c143e 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -30,7 +30,6 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/poll.h> 31#include <linux/poll.h>
32#include <linux/smp.h> 32#include <linux/smp.h>
33#include <linux/smp_lock.h>
34#include <linux/major.h> 33#include <linux/major.h>
35#include <linux/fs.h> 34#include <linux/fs.h>
36#include <linux/device.h> 35#include <linux/device.h>
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1db183ed7c01..869e1aeeb71b 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -413,7 +413,6 @@ struct pv_mmu_ops pv_mmu_ops = {
413 413
414 .alloc_pte = paravirt_nop, 414 .alloc_pte = paravirt_nop,
415 .alloc_pmd = paravirt_nop, 415 .alloc_pmd = paravirt_nop,
416 .alloc_pmd_clone = paravirt_nop,
417 .alloc_pud = paravirt_nop, 416 .alloc_pud = paravirt_nop,
418 .release_pte = paravirt_nop, 417 .release_pte = paravirt_nop,
419 .release_pmd = paravirt_nop, 418 .release_pmd = paravirt_nop,
@@ -422,8 +421,11 @@ struct pv_mmu_ops pv_mmu_ops = {
422 .set_pte = native_set_pte, 421 .set_pte = native_set_pte,
423 .set_pte_at = native_set_pte_at, 422 .set_pte_at = native_set_pte_at,
424 .set_pmd = native_set_pmd, 423 .set_pmd = native_set_pmd,
424 .set_pmd_at = native_set_pmd_at,
425 .pte_update = paravirt_nop, 425 .pte_update = paravirt_nop,
426 .pte_update_defer = paravirt_nop, 426 .pte_update_defer = paravirt_nop,
427 .pmd_update = paravirt_nop,
428 .pmd_update_defer = paravirt_nop,
427 429
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 430 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 431 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 078d4ec1a9d9..e8c33a302006 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -47,6 +47,7 @@
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h> 49#include <asm/x86_init.h>
50#include <asm/iommu_table.h>
50 51
51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 52#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
52int use_calgary __read_mostly = 1; 53int use_calgary __read_mostly = 1;
@@ -1278,7 +1279,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1278 1279
1279 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) { 1280 if (pci_dev == PCI_DEVICE_ID_IBM_CALIOC2) {
1280 /* 1281 /*
1281 * FIXME: properly scan for devices accross the 1282 * FIXME: properly scan for devices across the
1282 * PCI-to-PCI bridge on every CalIOC2 port. 1283 * PCI-to-PCI bridge on every CalIOC2 port.
1283 */ 1284 */
1284 return 1; 1285 return 1;
@@ -1294,7 +1295,7 @@ static int __init calgary_bus_has_devices(int bus, unsigned short pci_dev)
1294 1295
1295/* 1296/*
1296 * calgary_init_bitmap_from_tce_table(): 1297 * calgary_init_bitmap_from_tce_table():
1297 * Funtion for kdump case. In the second/kdump kernel initialize 1298 * Function for kdump case. In the second/kdump kernel initialize
1298 * the bitmap based on the tce table entries obtained from first kernel 1299 * the bitmap based on the tce table entries obtained from first kernel
1299 */ 1300 */
1300static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl) 1301static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
@@ -1364,7 +1365,7 @@ static int __init calgary_iommu_init(void)
1364 return 0; 1365 return 0;
1365} 1366}
1366 1367
1367void __init detect_calgary(void) 1368int __init detect_calgary(void)
1368{ 1369{
1369 int bus; 1370 int bus;
1370 void *tbl; 1371 void *tbl;
@@ -1378,13 +1379,13 @@ void __init detect_calgary(void)
1378 * another HW IOMMU already, bail out. 1379 * another HW IOMMU already, bail out.
1379 */ 1380 */
1380 if (no_iommu || iommu_detected) 1381 if (no_iommu || iommu_detected)
1381 return; 1382 return -ENODEV;
1382 1383
1383 if (!use_calgary) 1384 if (!use_calgary)
1384 return; 1385 return -ENODEV;
1385 1386
1386 if (!early_pci_allowed()) 1387 if (!early_pci_allowed())
1387 return; 1388 return -ENODEV;
1388 1389
1389 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n"); 1390 printk(KERN_DEBUG "Calgary: detecting Calgary via BIOS EBDA area\n");
1390 1391
@@ -1410,13 +1411,13 @@ void __init detect_calgary(void)
1410 if (!rio_table_hdr) { 1411 if (!rio_table_hdr) {
1411 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table " 1412 printk(KERN_DEBUG "Calgary: Unable to locate Rio Grande table "
1412 "in EBDA - bailing!\n"); 1413 "in EBDA - bailing!\n");
1413 return; 1414 return -ENODEV;
1414 } 1415 }
1415 1416
1416 ret = build_detail_arrays(); 1417 ret = build_detail_arrays();
1417 if (ret) { 1418 if (ret) {
1418 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret); 1419 printk(KERN_DEBUG "Calgary: build_detail_arrays ret %d\n", ret);
1419 return; 1420 return -ENOMEM;
1420 } 1421 }
1421 1422
1422 specified_table_size = determine_tce_table_size((is_kdump_kernel() ? 1423 specified_table_size = determine_tce_table_size((is_kdump_kernel() ?
@@ -1464,7 +1465,7 @@ void __init detect_calgary(void)
1464 1465
1465 x86_init.iommu.iommu_init = calgary_iommu_init; 1466 x86_init.iommu.iommu_init = calgary_iommu_init;
1466 } 1467 }
1467 return; 1468 return calgary_found;
1468 1469
1469cleanup: 1470cleanup:
1470 for (--bus; bus >= 0; --bus) { 1471 for (--bus; bus >= 0; --bus) {
@@ -1473,6 +1474,7 @@ cleanup:
1473 if (info->tce_space) 1474 if (info->tce_space)
1474 free_tce_table(info->tce_space); 1475 free_tce_table(info->tce_space);
1475 } 1476 }
1477 return -ENOMEM;
1476} 1478}
1477 1479
1478static int __init calgary_parse_options(char *p) 1480static int __init calgary_parse_options(char *p)
@@ -1594,3 +1596,5 @@ static int __init calgary_fixup_tce_spaces(void)
1594 * and before device_initcall. 1596 * and before device_initcall.
1595 */ 1597 */
1596rootfs_initcall(calgary_fixup_tce_spaces); 1598rootfs_initcall(calgary_fixup_tce_spaces);
1599
1600IOMMU_INIT_POST(detect_calgary);
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index 9f07cfcbd3a5..b49d00da2aed 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -11,9 +11,8 @@
11#include <asm/iommu.h> 11#include <asm/iommu.h>
12#include <asm/gart.h> 12#include <asm/gart.h>
13#include <asm/calgary.h> 13#include <asm/calgary.h>
14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h> 14#include <asm/x86_init.h>
16#include <asm/xen/swiotlb-xen.h> 15#include <asm/iommu_table.h>
17 16
18static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
19 18
@@ -45,6 +44,8 @@ int iommu_detected __read_mostly = 0;
45 */ 44 */
46int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
47 46
47extern struct iommu_table_entry __iommu_table[], __iommu_table_end[];
48
48/* Dummy device used for NULL arguments (normally ISA). */ 49/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 50struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 51 .init_name = "fallback device",
@@ -67,89 +68,23 @@ int dma_set_mask(struct device *dev, u64 mask)
67} 68}
68EXPORT_SYMBOL(dma_set_mask); 69EXPORT_SYMBOL(dma_set_mask);
69 70
70#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
71static __initdata void *dma32_bootmem_ptr;
72static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
73
74static int __init parse_dma32_size_opt(char *p)
75{
76 if (!p)
77 return -EINVAL;
78 dma32_bootmem_size = memparse(p, &p);
79 return 0;
80}
81early_param("dma32_size", parse_dma32_size_opt);
82
83void __init dma32_reserve_bootmem(void)
84{
85 unsigned long size, align;
86 if (max_pfn <= MAX_DMA32_PFN)
87 return;
88
89 /*
90 * check aperture_64.c allocate_aperture() for reason about
91 * using 512M as goal
92 */
93 align = 64ULL<<20;
94 size = roundup(dma32_bootmem_size, align);
95 dma32_bootmem_ptr = __alloc_bootmem_nopanic(size, align,
96 512ULL<<20);
97 /*
98 * Kmemleak should not scan this block as it may not be mapped via the
99 * kernel direct mapping.
100 */
101 kmemleak_ignore(dma32_bootmem_ptr);
102 if (dma32_bootmem_ptr)
103 dma32_bootmem_size = size;
104 else
105 dma32_bootmem_size = 0;
106}
107static void __init dma32_free_bootmem(void)
108{
109
110 if (max_pfn <= MAX_DMA32_PFN)
111 return;
112
113 if (!dma32_bootmem_ptr)
114 return;
115
116 free_bootmem(__pa(dma32_bootmem_ptr), dma32_bootmem_size);
117
118 dma32_bootmem_ptr = NULL;
119 dma32_bootmem_size = 0;
120}
121#else
122void __init dma32_reserve_bootmem(void)
123{
124}
125static void __init dma32_free_bootmem(void)
126{
127}
128
129#endif
130
131void __init pci_iommu_alloc(void) 71void __init pci_iommu_alloc(void)
132{ 72{
133 /* free the range so iommu could get some range less than 4G */ 73 struct iommu_table_entry *p;
134 dma32_free_bootmem(); 74
135 75 sort_iommu_table(__iommu_table, __iommu_table_end);
136 if (pci_xen_swiotlb_detect() || pci_swiotlb_detect()) 76 check_iommu_entries(__iommu_table, __iommu_table_end);
137 goto out; 77
138 78 for (p = __iommu_table; p < __iommu_table_end; p++) {
139 gart_iommu_hole_init(); 79 if (p && p->detect && p->detect() > 0) {
140 80 p->flags |= IOMMU_DETECTED;
141 detect_calgary(); 81 if (p->early_init)
142 82 p->early_init();
143 detect_intel_iommu(); 83 if (p->flags & IOMMU_FINISH_IF_DETECTED)
144 84 break;
145 /* needs to be called after gart_iommu_hole_init */ 85 }
146 amd_iommu_detect(); 86 }
147out:
148 pci_xen_swiotlb_init();
149
150 pci_swiotlb_init();
151} 87}
152
153void *dma_generic_alloc_coherent(struct device *dev, size_t size, 88void *dma_generic_alloc_coherent(struct device *dev, size_t size,
154 dma_addr_t *dma_addr, gfp_t flag) 89 dma_addr_t *dma_addr, gfp_t flag)
155{ 90{
@@ -292,6 +227,7 @@ EXPORT_SYMBOL(dma_supported);
292 227
293static int __init pci_iommu_init(void) 228static int __init pci_iommu_init(void)
294{ 229{
230 struct iommu_table_entry *p;
295 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES); 231 dma_debug_init(PREALLOC_DMA_DEBUG_ENTRIES);
296 232
297#ifdef CONFIG_PCI 233#ifdef CONFIG_PCI
@@ -299,12 +235,10 @@ static int __init pci_iommu_init(void)
299#endif 235#endif
300 x86_init.iommu.iommu_init(); 236 x86_init.iommu.iommu_init();
301 237
302 if (swiotlb || xen_swiotlb) { 238 for (p = __iommu_table; p < __iommu_table_end; p++) {
303 printk(KERN_INFO "PCI-DMA: " 239 if (p && (p->flags & IOMMU_DETECTED) && p->late_init)
304 "Using software bounce buffering for IO (SWIOTLB)\n"); 240 p->late_init();
305 swiotlb_print_info(); 241 }
306 } else
307 swiotlb_free();
308 242
309 return 0; 243 return 0;
310} 244}
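Both loops above touch only a handful of fields of struct iommu_table_entry; a sketch of the entry they walk (the authoritative definition is in the new asm/iommu_table.h, so field order and comments here are assumptions inferred from the code in this series):

	struct iommu_table_entry {
		initcall_t	detect;		/* returns > 0 when this IOMMU is present */
		initcall_t	depend;		/* detect routine that must run first */
		void		(*early_init)(void);	/* run right after a successful detect */
		void		(*late_init)(void);	/* run from pci_iommu_init() */
		int		flags;		/* IOMMU_DETECTED, IOMMU_FINISH_IF_DETECTED */
	};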
diff --git a/arch/x86/kernel/pci-iommu_table.c b/arch/x86/kernel/pci-iommu_table.c
new file mode 100644
index 000000000000..35ccf75696eb
--- /dev/null
+++ b/arch/x86/kernel/pci-iommu_table.c
@@ -0,0 +1,79 @@
1#include <linux/dma-mapping.h>
2#include <asm/iommu_table.h>
3#include <linux/string.h>
4#include <linux/kallsyms.h>
5
6
7#define DEBUG 1
8
9static struct iommu_table_entry * __init
10find_dependents_of(struct iommu_table_entry *start,
11 struct iommu_table_entry *finish,
12 struct iommu_table_entry *q)
13{
14 struct iommu_table_entry *p;
15
16 if (!q)
17 return NULL;
18
19 for (p = start; p < finish; p++)
20 if (p->detect == q->depend)
21 return p;
22
23 return NULL;
24}
25
26
27void __init sort_iommu_table(struct iommu_table_entry *start,
28 struct iommu_table_entry *finish) {
29
30 struct iommu_table_entry *p, *q, tmp;
31
32 for (p = start; p < finish; p++) {
33again:
34 q = find_dependents_of(start, finish, p);
35 /* We are a bit sneaky here. We use the memory address to figure
36 * out if the node we depend on is past our point; if so, we swap.
37 */
38 if (q > p) {
39 tmp = *p;
40 memmove(p, q, sizeof(*p));
41 *q = tmp;
42 goto again;
43 }
44 }
45
46}
47
48#ifdef DEBUG
49void __init check_iommu_entries(struct iommu_table_entry *start,
50 struct iommu_table_entry *finish)
51{
52 struct iommu_table_entry *p, *q, *x;
53
54 /* Simple cyclic dependency checker. */
55 for (p = start; p < finish; p++) {
56 q = find_dependents_of(start, finish, p);
57 x = find_dependents_of(start, finish, q);
58 if (p == x) {
59 printk(KERN_ERR "CYCLIC DEPENDENCY FOUND! %pS depends on %pS and vice-versa. BREAKING IT.\n",
60 p->detect, q->detect);
61 /* Heavy handed way..*/
62 x->depend = 0;
63 }
64 }
65
66 for (p = start; p < finish; p++) {
67 q = find_dependents_of(p, finish, p);
68 if (q && q > p) {
69 printk(KERN_ERR "EXECUTION ORDER INVALID! %pS should be called before %pS!\n",
70 p->detect, q->detect);
71 }
72 }
73}
74#else
75inline void check_iommu_entries(struct iommu_table_entry *start,
76 struct iommu_table_entry *finish)
77{
78}
79#endif
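Concretely, the sort is a dependency-driven swap pass: whenever an entry's depend routine lives later in the table, the two entries are swapped and the scan restarts at the same slot. A throwaway illustration (names are made up; the real table lives in a linker section bounded by __iommu_table[] and __iommu_table_end[]):

	static int detect_a(void) { return 0; }
	static int detect_b(void) { return 0; }

	static struct iommu_table_entry demo[] = {
		{ .detect = detect_b, .depend = detect_a },	/* b must run after a */
		{ .detect = detect_a },
	};

	/* sort_iommu_table(demo, demo + 2) swaps the two entries: the entry
	 * for detect_a sits at a higher address than the one that depends on
	 * it, so find_dependents_of() triggers the swap and detect_a ends up
	 * being walked first by pci_iommu_alloc(). */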
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index a5bc528d4328..8f972cbddef0 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -10,7 +10,8 @@
10#include <asm/iommu.h> 10#include <asm/iommu.h>
11#include <asm/swiotlb.h> 11#include <asm/swiotlb.h>
12#include <asm/dma.h> 12#include <asm/dma.h>
13 13#include <asm/xen/swiotlb-xen.h>
14#include <asm/iommu_table.h>
14int swiotlb __read_mostly; 15int swiotlb __read_mostly;
15 16
16static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size, 17static void *x86_swiotlb_alloc_coherent(struct device *hwdev, size_t size,
@@ -41,25 +42,42 @@ static struct dma_map_ops swiotlb_dma_ops = {
41}; 42};
42 43
43/* 44/*
44 * pci_swiotlb_detect - set swiotlb to 1 if necessary 45 * pci_swiotlb_detect_override - set swiotlb to 1 if necessary
45 * 46 *
46 * This returns non-zero if we are forced to use swiotlb (by the boot 47 * This returns non-zero if we are forced to use swiotlb (by the boot
47 * option). 48 * option).
48 */ 49 */
49int __init pci_swiotlb_detect(void) 50int __init pci_swiotlb_detect_override(void)
50{ 51{
51 int use_swiotlb = swiotlb | swiotlb_force; 52 int use_swiotlb = swiotlb | swiotlb_force;
52 53
54 if (swiotlb_force)
55 swiotlb = 1;
56
57 return use_swiotlb;
58}
59IOMMU_INIT_FINISH(pci_swiotlb_detect_override,
60 pci_xen_swiotlb_detect,
61 pci_swiotlb_init,
62 pci_swiotlb_late_init);
63
64/*
65 * if 4GB or more detected (and iommu=off not set) return 1
66 * and set swiotlb to 1.
67 */
68int __init pci_swiotlb_detect_4gb(void)
69{
53 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 70 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
54#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
55 if (!no_iommu && max_pfn > MAX_DMA32_PFN) 72 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
56 swiotlb = 1; 73 swiotlb = 1;
57#endif 74#endif
58 if (swiotlb_force) 75 return swiotlb;
59 swiotlb = 1;
60
61 return use_swiotlb;
62} 76}
77IOMMU_INIT(pci_swiotlb_detect_4gb,
78 pci_swiotlb_detect_override,
79 pci_swiotlb_init,
80 pci_swiotlb_late_init);
63 81
64void __init pci_swiotlb_init(void) 82void __init pci_swiotlb_init(void)
65{ 83{
@@ -68,3 +86,15 @@ void __init pci_swiotlb_init(void)
68 dma_ops = &swiotlb_dma_ops; 86 dma_ops = &swiotlb_dma_ops;
69 } 87 }
70} 88}
89
90void __init pci_swiotlb_late_init(void)
91{
92 /* An IOMMU turned us off. */
93 if (!swiotlb)
94 swiotlb_free();
95 else {
96 printk(KERN_INFO "PCI-DMA: "
97 "Using software bounce buffering for IO (SWIOTLB)\n");
98 swiotlb_print_info();
99 }
100}
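The two macro invocations above are the registration side of the new table: IOMMU_INIT_FINISH() marks a detector whose success should end the walk (the swiotlb_force override), while IOMMU_INIT() declares an ordinary detector along with its dependency and init hooks. A hypothetical driver would plug in the same way; all names below are invented for illustration, only the call shape mirrors the swiotlb registrations above:

	#include <asm/iommu_table.h>

	int  detect_my_iommu(void);		/* should return > 0 when found */
	void my_iommu_early_init(void);		/* early hook, run right after detect */
	void my_iommu_late_init(void);		/* late hook, run from pci_iommu_init() */

	/* Dependency chosen purely for illustration. */
	IOMMU_INIT(detect_my_iommu, pci_swiotlb_detect_override,
		   my_iommu_early_init, my_iommu_late_init);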
diff --git a/arch/x86/kernel/pmtimer_64.c b/arch/x86/kernel/pmtimer_64.c
deleted file mode 100644
index b112406f1996..000000000000
--- a/arch/x86/kernel/pmtimer_64.c
+++ /dev/null
@@ -1,69 +0,0 @@
1/* Ported over from i386 by AK, original copyright was:
2 *
3 * (C) Dominik Brodowski <linux@brodo.de> 2003
4 *
5 * Driver to use the Power Management Timer (PMTMR) available in some
6 * southbridges as primary timing source for the Linux kernel.
7 *
8 * Based on parts of linux/drivers/acpi/hardware/hwtimer.c, timer_pit.c,
9 * timer_hpet.c, and on Arjan van de Ven's implementation for 2.4.
10 *
11 * This file is licensed under the GPL v2.
12 *
13 * Dropped all the hardware bug workarounds for now. Hopefully they
14 * are not needed on 64bit chipsets.
15 */
16
17#include <linux/jiffies.h>
18#include <linux/kernel.h>
19#include <linux/time.h>
20#include <linux/init.h>
21#include <linux/cpumask.h>
22#include <linux/acpi_pmtmr.h>
23
24#include <asm/io.h>
25#include <asm/proto.h>
26#include <asm/msr.h>
27#include <asm/vsyscall.h>
28
29static inline u32 cyc2us(u32 cycles)
30{
31 /* The Power Management Timer ticks at 3.579545 ticks per microsecond.
32 * 1 / PM_TIMER_FREQUENCY == 0.27936511 =~ 286/1024 [error: 0.024%]
33 *
34 * Even with HZ = 100, delta is at maximum 35796 ticks, so it can
35 * easily be multiplied with 286 (=0x11E) without having to fear
36 * u32 overflows.
37 */
38 cycles *= 286;
39 return (cycles >> 10);
40}
41
42static unsigned pmtimer_wait_tick(void)
43{
44 u32 a, b;
45 for (a = b = inl(pmtmr_ioport) & ACPI_PM_MASK;
46 a == b;
47 b = inl(pmtmr_ioport) & ACPI_PM_MASK)
48 cpu_relax();
49 return b;
50}
51
52/* note: wait time is rounded up to one tick */
53void pmtimer_wait(unsigned us)
54{
55 u32 a, b;
56 a = pmtimer_wait_tick();
57 do {
58 b = inl(pmtmr_ioport);
59 cpu_relax();
60 } while (cyc2us(b - a) < us);
61}
62
63static int __init nopmtimer_setup(char *s)
64{
65 pmtmr_ioport = 0;
66 return 1;
67}
68
69__setup("nopmtimer", nopmtimer_setup);
diff --git a/arch/x86/kernel/probe_roms_32.c b/arch/x86/kernel/probe_roms.c
index 071e7fea42e5..ba0a4cce53be 100644
--- a/arch/x86/kernel/probe_roms_32.c
+++ b/arch/x86/kernel/probe_roms.c
@@ -73,6 +73,107 @@ static struct resource video_rom_resource = {
73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM 73 .flags = IORESOURCE_BUSY | IORESOURCE_READONLY | IORESOURCE_MEM
74}; 74};
75 75
76/* does this oprom support the given pci device, or any of the devices
77 * that the driver supports?
78 */
79static bool match_id(struct pci_dev *pdev, unsigned short vendor, unsigned short device)
80{
81 struct pci_driver *drv = pdev->driver;
82 const struct pci_device_id *id;
83
84 if (pdev->vendor == vendor && pdev->device == device)
85 return true;
86
87 for (id = drv ? drv->id_table : NULL; id && id->vendor; id++)
88 if (id->vendor == vendor && id->device == device)
89 break;
90
91 return id && id->vendor;
92}
93
94static bool probe_list(struct pci_dev *pdev, unsigned short vendor,
95 const unsigned char *rom_list)
96{
97 unsigned short device;
98
99 do {
100 if (probe_kernel_address(rom_list, device) != 0)
101 device = 0;
102
103 if (device && match_id(pdev, vendor, device))
104 break;
105
106 rom_list += 2;
107 } while (device);
108
109 return !!device;
110}
111
112static struct resource *find_oprom(struct pci_dev *pdev)
113{
114 struct resource *oprom = NULL;
115 int i;
116
117 for (i = 0; i < ARRAY_SIZE(adapter_rom_resources); i++) {
118 struct resource *res = &adapter_rom_resources[i];
119 unsigned short offset, vendor, device, list, rev;
120 const unsigned char *rom;
121
122 if (res->end == 0)
123 break;
124
125 rom = isa_bus_to_virt(res->start);
126 if (probe_kernel_address(rom + 0x18, offset) != 0)
127 continue;
128
129 if (probe_kernel_address(rom + offset + 0x4, vendor) != 0)
130 continue;
131
132 if (probe_kernel_address(rom + offset + 0x6, device) != 0)
133 continue;
134
135 if (match_id(pdev, vendor, device)) {
136 oprom = res;
137 break;
138 }
139
140 if (probe_kernel_address(rom + offset + 0x8, list) == 0 &&
141 probe_kernel_address(rom + offset + 0xc, rev) == 0 &&
142 rev >= 3 && list &&
143 probe_list(pdev, vendor, rom + offset + list)) {
144 oprom = res;
145 break;
146 }
147 }
148
149 return oprom;
150}
151
152void *pci_map_biosrom(struct pci_dev *pdev)
153{
154 struct resource *oprom = find_oprom(pdev);
155
156 if (!oprom)
157 return NULL;
158
159 return ioremap(oprom->start, resource_size(oprom));
160}
161EXPORT_SYMBOL(pci_map_biosrom);
162
163void pci_unmap_biosrom(void __iomem *image)
164{
165 iounmap(image);
166}
167EXPORT_SYMBOL(pci_unmap_biosrom);
168
169size_t pci_biosrom_size(struct pci_dev *pdev)
170{
171 struct resource *oprom = find_oprom(pdev);
172
173 return oprom ? resource_size(oprom) : 0;
174}
175EXPORT_SYMBOL(pci_biosrom_size);
176
76#define ROMSIGNATURE 0xaa55 177#define ROMSIGNATURE 0xaa55
77 178
78static int __init romsignature(const unsigned char *rom) 179static int __init romsignature(const unsigned char *rom)
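The block added above exports a small option-ROM lookup API (pci_map_biosrom(), pci_biosrom_size(), pci_unmap_biosrom()). A driver-side usage sketch, with the surrounding helper and error handling invented for illustration:

	/* Hypothetical helper: copy a device's option ROM into a buffer. */
	static int demo_read_oprom(struct pci_dev *pdev, void *buf, size_t len)
	{
		void *rom = pci_map_biosrom(pdev);
		size_t size = pci_biosrom_size(pdev);

		if (!rom || !size)
			return -ENODEV;

		memcpy_fromio(buf, rom, min(len, size));
		pci_unmap_biosrom(rom);
		return 0;
	}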
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 57d1868a86aa..e1ba8cb24e4e 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -14,6 +14,7 @@
14#include <linux/utsname.h> 14#include <linux/utsname.h>
15#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h> 16#include <linux/hw_breakpoint.h>
17#include <asm/cpu.h>
17#include <asm/system.h> 18#include <asm/system.h>
18#include <asm/apic.h> 19#include <asm/apic.h>
19#include <asm/syscalls.h> 20#include <asm/syscalls.h>
@@ -22,11 +23,6 @@
22#include <asm/i387.h> 23#include <asm/i387.h>
23#include <asm/debugreg.h> 24#include <asm/debugreg.h>
24 25
25unsigned long idle_halt;
26EXPORT_SYMBOL(idle_halt);
27unsigned long idle_nomwait;
28EXPORT_SYMBOL(idle_nomwait);
29
30struct kmem_cache *task_xstate_cachep; 26struct kmem_cache *task_xstate_cachep;
31EXPORT_SYMBOL_GPL(task_xstate_cachep); 27EXPORT_SYMBOL_GPL(task_xstate_cachep);
32 28
@@ -91,27 +87,33 @@ void exit_thread(void)
91void show_regs(struct pt_regs *regs) 87void show_regs(struct pt_regs *regs)
92{ 88{
93 show_registers(regs); 89 show_registers(regs);
94 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 90 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs), 0);
95 regs->bp);
96} 91}
97 92
98void show_regs_common(void) 93void show_regs_common(void)
99{ 94{
100 const char *board, *product; 95 const char *vendor, *product, *board;
101 96
102 board = dmi_get_system_info(DMI_BOARD_NAME); 97 vendor = dmi_get_system_info(DMI_SYS_VENDOR);
103 if (!board) 98 if (!vendor)
104 board = ""; 99 vendor = "";
105 product = dmi_get_system_info(DMI_PRODUCT_NAME); 100 product = dmi_get_system_info(DMI_PRODUCT_NAME);
106 if (!product) 101 if (!product)
107 product = ""; 102 product = "";
108 103
104 /* Board Name is optional */
105 board = dmi_get_system_info(DMI_BOARD_NAME);
106
109 printk(KERN_CONT "\n"); 107 printk(KERN_CONT "\n");
110 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n", 108 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s",
111 current->pid, current->comm, print_tainted(), 109 current->pid, current->comm, print_tainted(),
112 init_utsname()->release, 110 init_utsname()->release,
113 (int)strcspn(init_utsname()->version, " "), 111 (int)strcspn(init_utsname()->version, " "),
114 init_utsname()->version, board, product); 112 init_utsname()->version);
113 printk(KERN_CONT " %s %s", vendor, product);
114 if (board)
115 printk(KERN_CONT "/%s", board);
116 printk(KERN_CONT "\n");
115} 117}
116 118
117void flush_thread(void) 119void flush_thread(void)
@@ -328,14 +330,16 @@ long sys_execve(const char __user *name,
328/* 330/*
329 * Idle related variables and functions 331 * Idle related variables and functions
330 */ 332 */
331unsigned long boot_option_idle_override = 0; 333unsigned long boot_option_idle_override = IDLE_NO_OVERRIDE;
332EXPORT_SYMBOL(boot_option_idle_override); 334EXPORT_SYMBOL(boot_option_idle_override);
333 335
334/* 336/*
335 * Powermanagement idle function, if any.. 337 * Powermanagement idle function, if any..
336 */ 338 */
337void (*pm_idle)(void); 339void (*pm_idle)(void);
340#ifdef CONFIG_APM_MODULE
338EXPORT_SYMBOL(pm_idle); 341EXPORT_SYMBOL(pm_idle);
342#endif
339 343
340#ifdef CONFIG_X86_32 344#ifdef CONFIG_X86_32
341/* 345/*
@@ -374,6 +378,7 @@ void default_idle(void)
374{ 378{
375 if (hlt_use_halt()) { 379 if (hlt_use_halt()) {
376 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 380 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
381 trace_cpu_idle(1, smp_processor_id());
377 current_thread_info()->status &= ~TS_POLLING; 382 current_thread_info()->status &= ~TS_POLLING;
378 /* 383 /*
379 * TS_POLLING-cleared state must be visible before we 384 * TS_POLLING-cleared state must be visible before we
@@ -386,6 +391,8 @@ void default_idle(void)
386 else 391 else
387 local_irq_enable(); 392 local_irq_enable();
388 current_thread_info()->status |= TS_POLLING; 393 current_thread_info()->status |= TS_POLLING;
394 trace_power_end(smp_processor_id());
395 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
389 } else { 396 } else {
390 local_irq_enable(); 397 local_irq_enable();
391 /* loop is done by the caller */ 398 /* loop is done by the caller */
@@ -443,9 +450,8 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
443 */ 450 */
444void mwait_idle_with_hints(unsigned long ax, unsigned long cx) 451void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
445{ 452{
446 trace_power_start(POWER_CSTATE, (ax>>4)+1, smp_processor_id());
447 if (!need_resched()) { 453 if (!need_resched()) {
448 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 454 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
449 clflush((void *)&current_thread_info()->flags); 455 clflush((void *)&current_thread_info()->flags);
450 456
451 __monitor((void *)&current_thread_info()->flags, 0, 0); 457 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -460,7 +466,8 @@ static void mwait_idle(void)
460{ 466{
461 if (!need_resched()) { 467 if (!need_resched()) {
462 trace_power_start(POWER_CSTATE, 1, smp_processor_id()); 468 trace_power_start(POWER_CSTATE, 1, smp_processor_id());
463 if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR)) 469 trace_cpu_idle(1, smp_processor_id());
470 if (this_cpu_has(X86_FEATURE_CLFLUSH_MONITOR))
464 clflush((void *)&current_thread_info()->flags); 471 clflush((void *)&current_thread_info()->flags);
465 472
466 __monitor((void *)&current_thread_info()->flags, 0, 0); 473 __monitor((void *)&current_thread_info()->flags, 0, 0);
@@ -469,6 +476,8 @@ static void mwait_idle(void)
469 __sti_mwait(0, 0); 476 __sti_mwait(0, 0);
470 else 477 else
471 local_irq_enable(); 478 local_irq_enable();
479 trace_power_end(smp_processor_id());
480 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
472 } else 481 } else
473 local_irq_enable(); 482 local_irq_enable();
474} 483}
@@ -481,10 +490,12 @@ static void mwait_idle(void)
481static void poll_idle(void) 490static void poll_idle(void)
482{ 491{
483 trace_power_start(POWER_CSTATE, 0, smp_processor_id()); 492 trace_power_start(POWER_CSTATE, 0, smp_processor_id());
493 trace_cpu_idle(0, smp_processor_id());
484 local_irq_enable(); 494 local_irq_enable();
485 while (!need_resched()) 495 while (!need_resched())
486 cpu_relax(); 496 cpu_relax();
487 trace_power_end(0); 497 trace_power_end(smp_processor_id());
498 trace_cpu_idle(PWR_EVENT_EXIT, smp_processor_id());
488} 499}
489 500
490/* 501/*
@@ -499,17 +510,16 @@ static void poll_idle(void)
499 * 510 *
500 * idle=mwait overrides this decision and forces the usage of mwait. 511 * idle=mwait overrides this decision and forces the usage of mwait.
501 */ 512 */
502static int __cpuinitdata force_mwait;
503 513
504#define MWAIT_INFO 0x05 514#define MWAIT_INFO 0x05
505#define MWAIT_ECX_EXTENDED_INFO 0x01 515#define MWAIT_ECX_EXTENDED_INFO 0x01
506#define MWAIT_EDX_C1 0xf0 516#define MWAIT_EDX_C1 0xf0
507 517
508static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c) 518int mwait_usable(const struct cpuinfo_x86 *c)
509{ 519{
510 u32 eax, ebx, ecx, edx; 520 u32 eax, ebx, ecx, edx;
511 521
512 if (force_mwait) 522 if (boot_option_idle_override == IDLE_FORCE_MWAIT)
513 return 1; 523 return 1;
514 524
515 if (c->cpuid_level < MWAIT_INFO) 525 if (c->cpuid_level < MWAIT_INFO)
@@ -527,45 +537,45 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
527 return (edx & MWAIT_EDX_C1); 537 return (edx & MWAIT_EDX_C1);
528} 538}
529 539
530bool c1e_detected; 540bool amd_e400_c1e_detected;
531EXPORT_SYMBOL(c1e_detected); 541EXPORT_SYMBOL(amd_e400_c1e_detected);
532 542
533static cpumask_var_t c1e_mask; 543static cpumask_var_t amd_e400_c1e_mask;
534 544
535void c1e_remove_cpu(int cpu) 545void amd_e400_remove_cpu(int cpu)
536{ 546{
537 if (c1e_mask != NULL) 547 if (amd_e400_c1e_mask != NULL)
538 cpumask_clear_cpu(cpu, c1e_mask); 548 cpumask_clear_cpu(cpu, amd_e400_c1e_mask);
539} 549}
540 550
541/* 551/*
542 * C1E aware idle routine. We check for C1E active in the interrupt 552 * AMD Erratum 400 aware idle routine. We check for C1E active in the interrupt
543 * pending message MSR. If we detect C1E, then we handle it the same 553 * pending message MSR. If we detect C1E, then we handle it the same
544 * way as C3 power states (local apic timer and TSC stop) 554 * way as C3 power states (local apic timer and TSC stop)
545 */ 555 */
546static void c1e_idle(void) 556static void amd_e400_idle(void)
547{ 557{
548 if (need_resched()) 558 if (need_resched())
549 return; 559 return;
550 560
551 if (!c1e_detected) { 561 if (!amd_e400_c1e_detected) {
552 u32 lo, hi; 562 u32 lo, hi;
553 563
554 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi); 564 rdmsr(MSR_K8_INT_PENDING_MSG, lo, hi);
555 565
556 if (lo & K8_INTP_C1E_ACTIVE_MASK) { 566 if (lo & K8_INTP_C1E_ACTIVE_MASK) {
557 c1e_detected = true; 567 amd_e400_c1e_detected = true;
558 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) 568 if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
559 mark_tsc_unstable("TSC halt in AMD C1E"); 569 mark_tsc_unstable("TSC halt in AMD C1E");
560 printk(KERN_INFO "System has AMD C1E enabled\n"); 570 printk(KERN_INFO "System has AMD C1E enabled\n");
561 } 571 }
562 } 572 }
563 573
564 if (c1e_detected) { 574 if (amd_e400_c1e_detected) {
565 int cpu = smp_processor_id(); 575 int cpu = smp_processor_id();
566 576
567 if (!cpumask_test_cpu(cpu, c1e_mask)) { 577 if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
568 cpumask_set_cpu(cpu, c1e_mask); 578 cpumask_set_cpu(cpu, amd_e400_c1e_mask);
569 /* 579 /*
570 * Force broadcast so ACPI can not interfere. 580 * Force broadcast so ACPI can not interfere.
571 */ 581 */
@@ -608,17 +618,17 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
608 pm_idle = mwait_idle; 618 pm_idle = mwait_idle;
609 } else if (cpu_has_amd_erratum(amd_erratum_400)) { 619 } else if (cpu_has_amd_erratum(amd_erratum_400)) {
610 /* E400: APIC timer interrupt does not wake up CPU from C1e */ 620 /* E400: APIC timer interrupt does not wake up CPU from C1e */
611 printk(KERN_INFO "using C1E aware idle routine\n"); 621 printk(KERN_INFO "using AMD E400 aware idle routine\n");
612 pm_idle = c1e_idle; 622 pm_idle = amd_e400_idle;
613 } else 623 } else
614 pm_idle = default_idle; 624 pm_idle = default_idle;
615} 625}
616 626
617void __init init_c1e_mask(void) 627void __init init_amd_e400_c1e_mask(void)
618{ 628{
619 /* If we're using c1e_idle, we need to allocate c1e_mask. */ 629 /* If we're using amd_e400_idle, we need to allocate amd_e400_c1e_mask. */
620 if (pm_idle == c1e_idle) 630 if (pm_idle == amd_e400_idle)
621 zalloc_cpumask_var(&c1e_mask, GFP_KERNEL); 631 zalloc_cpumask_var(&amd_e400_c1e_mask, GFP_KERNEL);
622} 632}
623 633
624static int __init idle_setup(char *str) 634static int __init idle_setup(char *str)
@@ -629,9 +639,11 @@ static int __init idle_setup(char *str)
629 if (!strcmp(str, "poll")) { 639 if (!strcmp(str, "poll")) {
630 printk("using polling idle threads.\n"); 640 printk("using polling idle threads.\n");
631 pm_idle = poll_idle; 641 pm_idle = poll_idle;
632 } else if (!strcmp(str, "mwait")) 642 boot_option_idle_override = IDLE_POLL;
633 force_mwait = 1; 643 } else if (!strcmp(str, "mwait")) {
634 else if (!strcmp(str, "halt")) { 644 boot_option_idle_override = IDLE_FORCE_MWAIT;
645 WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\n");
646 } else if (!strcmp(str, "halt")) {
635 /* 647 /*
636 * When the boot option of idle=halt is added, halt is 648 * When the boot option of idle=halt is added, halt is
637 * forced to be used for CPU idle. In such case CPU C2/C3 649 * forced to be used for CPU idle. In such case CPU C2/C3
@@ -640,8 +652,7 @@ static int __init idle_setup(char *str)
640 * the boot_option_idle_override. 652 * the boot_option_idle_override.
641 */ 653 */
642 pm_idle = default_idle; 654 pm_idle = default_idle;
643 idle_halt = 1; 655 boot_option_idle_override = IDLE_HALT;
644 return 0;
645 } else if (!strcmp(str, "nomwait")) { 656 } else if (!strcmp(str, "nomwait")) {
646 /* 657 /*
647 * If the boot option of "idle=nomwait" is added, 658 * If the boot option of "idle=nomwait" is added,
@@ -649,12 +660,10 @@ static int __init idle_setup(char *str)
649 * states. In such case it won't touch the variable 660 * states. In such case it won't touch the variable
650 * of boot_option_idle_override. 661 * of boot_option_idle_override.
651 */ 662 */
652 idle_nomwait = 1; 663 boot_option_idle_override = IDLE_NOMWAIT;
653 return 0;
654 } else 664 } else
655 return -1; 665 return -1;
656 666
657 boot_option_idle_override = 1;
658 return 0; 667 return 0;
659} 668}
660early_param("idle", idle_setup); 669early_param("idle", idle_setup);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 96586c3cbbbf..a3d0dc59067b 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -57,8 +57,6 @@
57#include <asm/syscalls.h> 57#include <asm/syscalls.h>
58#include <asm/debugreg.h> 58#include <asm/debugreg.h>
59 59
60#include <trace/events/power.h>
61
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 60asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 61
64/* 62/*
@@ -113,8 +111,6 @@ void cpu_idle(void)
113 stop_critical_timings(); 111 stop_critical_timings();
114 pm_idle(); 112 pm_idle();
115 start_critical_timings(); 113 start_critical_timings();
116
117 trace_power_end(smp_processor_id());
118 } 114 }
119 tick_nohz_restart_sched_tick(); 115 tick_nohz_restart_sched_tick();
120 preempt_enable_no_resched(); 116 preempt_enable_no_resched();
@@ -249,7 +245,6 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
249{ 245{
250 set_user_gs(regs, 0); 246 set_user_gs(regs, 0);
251 regs->fs = 0; 247 regs->fs = 0;
252 set_fs(USER_DS);
253 regs->ds = __USER_DS; 248 regs->ds = __USER_DS;
254 regs->es = __USER_DS; 249 regs->es = __USER_DS;
255 regs->ss = __USER_DS; 250 regs->ss = __USER_DS;
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 3d9ea531ddd1..ca6f7ab8df33 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -51,8 +51,6 @@
51#include <asm/syscalls.h> 51#include <asm/syscalls.h>
52#include <asm/debugreg.h> 52#include <asm/debugreg.h>
53 53
54#include <trace/events/power.h>
55
56asmlinkage extern void ret_from_fork(void); 54asmlinkage extern void ret_from_fork(void);
57 55
58DEFINE_PER_CPU(unsigned long, old_rsp); 56DEFINE_PER_CPU(unsigned long, old_rsp);
@@ -141,8 +139,6 @@ void cpu_idle(void)
141 pm_idle(); 139 pm_idle();
142 start_critical_timings(); 140 start_critical_timings();
143 141
144 trace_power_end(smp_processor_id());
145
146 /* In many cases the interrupt that ended idle 142 /* In many cases the interrupt that ended idle
147 has already called exit_idle. But some idle 143 has already called exit_idle. But some idle
148 loops can be woken up without interrupt. */ 144 loops can be woken up without interrupt. */
@@ -342,7 +338,6 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
342 regs->cs = _cs; 338 regs->cs = _cs;
343 regs->ss = _ss; 339 regs->ss = _ss;
344 regs->flags = X86_EFLAGS_IF; 340 regs->flags = X86_EFLAGS_IF;
345 set_fs(USER_DS);
346 /* 341 /*
347 * Free the old FP and other extended state 342 * Free the old FP and other extended state
348 */ 343 */
@@ -424,7 +419,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
424 load_TLS(next, cpu); 419 load_TLS(next, cpu);
425 420
426 /* Must be after DS reload */ 421 /* Must be after DS reload */
427 unlazy_fpu(prev_p); 422 __unlazy_fpu(prev_p);
428 423
429 /* Make sure cpu is ready for new context */ 424 /* Make sure cpu is ready for new context */
430 if (preload_fpu) 425 if (preload_fpu)
@@ -505,6 +500,10 @@ void set_personality_64bit(void)
505 /* Make sure to be in 64bit mode */ 500 /* Make sure to be in 64bit mode */
506 clear_thread_flag(TIF_IA32); 501 clear_thread_flag(TIF_IA32);
507 502
503 /* Ensure the corresponding mm is not marked. */
504 if (current->mm)
505 current->mm->context.ia32_compat = 0;
506
508 /* TBD: overwrites user setup. Should have two bits. 507 /* TBD: overwrites user setup. Should have two bits.
509 But 64bit processes have always behaved this way, 508 But 64bit processes have always behaved this way,
510 so it's not too bad. The main problem is just that 509 so it's not too bad. The main problem is just that
@@ -520,6 +519,10 @@ void set_personality_ia32(void)
520 set_thread_flag(TIF_IA32); 519 set_thread_flag(TIF_IA32);
521 current->personality |= force_personality32; 520 current->personality |= force_personality32;
522 521
522 /* Mark the associated mm as containing 32-bit tasks. */
523 if (current->mm)
524 current->mm->context.ia32_compat = 1;
525
523 /* Prepare the first "return" to user space */ 526 /* Prepare the first "return" to user space */
524 current_thread_info()->status |= TS_COMPAT; 527 current_thread_info()->status |= TS_COMPAT;
525} 528}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 70c4872cd8aa..807c2a2b80f1 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -608,6 +608,9 @@ static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608 unsigned len, type; 608 unsigned len, type;
609 struct perf_event *bp; 609 struct perf_event *bp;
610 610
611 if (ptrace_get_breakpoints(tsk) < 0)
612 return -ESRCH;
613
611 data &= ~DR_CONTROL_RESERVED; 614 data &= ~DR_CONTROL_RESERVED;
612 old_dr7 = ptrace_get_dr7(thread->ptrace_bps); 615 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
613restore: 616restore:
@@ -655,6 +658,9 @@ restore:
655 } 658 }
656 goto restore; 659 goto restore;
657 } 660 }
661
662 ptrace_put_breakpoints(tsk);
663
658 return ((orig_ret < 0) ? orig_ret : rc); 664 return ((orig_ret < 0) ? orig_ret : rc);
659} 665}
660 666
@@ -668,10 +674,17 @@ static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
668 674
669 if (n < HBP_NUM) { 675 if (n < HBP_NUM) {
670 struct perf_event *bp; 676 struct perf_event *bp;
677
678 if (ptrace_get_breakpoints(tsk) < 0)
679 return -ESRCH;
680
671 bp = thread->ptrace_bps[n]; 681 bp = thread->ptrace_bps[n];
672 if (!bp) 682 if (!bp)
673 return 0; 683 val = 0;
674 val = bp->hw.info.address; 684 else
685 val = bp->hw.info.address;
686
687 ptrace_put_breakpoints(tsk);
675 } else if (n == 6) { 688 } else if (n == 6) {
676 val = thread->debugreg6; 689 val = thread->debugreg6;
677 } else if (n == 7) { 690 } else if (n == 7) {
@@ -686,6 +699,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
686 struct perf_event *bp; 699 struct perf_event *bp;
687 struct thread_struct *t = &tsk->thread; 700 struct thread_struct *t = &tsk->thread;
688 struct perf_event_attr attr; 701 struct perf_event_attr attr;
702 int err = 0;
703
704 if (ptrace_get_breakpoints(tsk) < 0)
705 return -ESRCH;
689 706
690 if (!t->ptrace_bps[nr]) { 707 if (!t->ptrace_bps[nr]) {
691 ptrace_breakpoint_init(&attr); 708 ptrace_breakpoint_init(&attr);
@@ -709,24 +726,23 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
709 * writing for the user. And anyway this is the previous 726 * writing for the user. And anyway this is the previous
710 * behaviour. 727 * behaviour.
711 */ 728 */
712 if (IS_ERR(bp)) 729 if (IS_ERR(bp)) {
713 return PTR_ERR(bp); 730 err = PTR_ERR(bp);
731 goto put;
732 }
714 733
715 t->ptrace_bps[nr] = bp; 734 t->ptrace_bps[nr] = bp;
716 } else { 735 } else {
717 int err;
718
719 bp = t->ptrace_bps[nr]; 736 bp = t->ptrace_bps[nr];
720 737
721 attr = bp->attr; 738 attr = bp->attr;
722 attr.bp_addr = addr; 739 attr.bp_addr = addr;
723 err = modify_user_hw_breakpoint(bp, &attr); 740 err = modify_user_hw_breakpoint(bp, &attr);
724 if (err)
725 return err;
726 } 741 }
727 742
728 743put:
729 return 0; 744 ptrace_put_breakpoints(tsk);
745 return err;
730} 746}
731 747
732/* 748/*
@@ -801,7 +817,8 @@ void ptrace_disable(struct task_struct *child)
801static const struct user_regset_view user_x86_32_view; /* Initialized below. */ 817static const struct user_regset_view user_x86_32_view; /* Initialized below. */
802#endif 818#endif
803 819
804long arch_ptrace(struct task_struct *child, long request, long addr, long data) 820long arch_ptrace(struct task_struct *child, long request,
821 unsigned long addr, unsigned long data)
805{ 822{
806 int ret; 823 int ret;
807 unsigned long __user *datap = (unsigned long __user *)data; 824 unsigned long __user *datap = (unsigned long __user *)data;
@@ -812,8 +829,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
812 unsigned long tmp; 829 unsigned long tmp;
813 830
814 ret = -EIO; 831 ret = -EIO;
815 if ((addr & (sizeof(data) - 1)) || addr < 0 || 832 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
816 addr >= sizeof(struct user))
817 break; 833 break;
818 834
819 tmp = 0; /* Default return condition */ 835 tmp = 0; /* Default return condition */
@@ -830,8 +846,7 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
830 846
831 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */ 847 case PTRACE_POKEUSR: /* write the word at location addr in the USER area */
832 ret = -EIO; 848 ret = -EIO;
833 if ((addr & (sizeof(data) - 1)) || addr < 0 || 849 if ((addr & (sizeof(data) - 1)) || addr >= sizeof(struct user))
834 addr >= sizeof(struct user))
835 break; 850 break;
836 851
837 if (addr < sizeof(struct user_regs_struct)) 852 if (addr < sizeof(struct user_regs_struct))
@@ -888,17 +903,17 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data)
888 903
889#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 904#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
890 case PTRACE_GET_THREAD_AREA: 905 case PTRACE_GET_THREAD_AREA:
891 if (addr < 0) 906 if ((int) addr < 0)
892 return -EIO; 907 return -EIO;
893 ret = do_get_thread_area(child, addr, 908 ret = do_get_thread_area(child, addr,
894 (struct user_desc __user *) data); 909 (struct user_desc __user *)data);
895 break; 910 break;
896 911
897 case PTRACE_SET_THREAD_AREA: 912 case PTRACE_SET_THREAD_AREA:
898 if (addr < 0) 913 if ((int) addr < 0)
899 return -EIO; 914 return -EIO;
900 ret = do_set_thread_area(child, addr, 915 ret = do_set_thread_area(child, addr,
901 (struct user_desc __user *) data, 0); 916 (struct user_desc __user *)data, 0);
902 break; 917 break;
903#endif 918#endif
904 919
@@ -1348,7 +1363,7 @@ void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1348 * We must return the syscall number to actually look up in the table. 1363 * We must return the syscall number to actually look up in the table.
1349 * This can be -1L to skip running any syscall at all. 1364 * This can be -1L to skip running any syscall at all.
1350 */ 1365 */
1351asmregparm long syscall_trace_enter(struct pt_regs *regs) 1366long syscall_trace_enter(struct pt_regs *regs)
1352{ 1367{
1353 long ret = 0; 1368 long ret = 0;
1354 1369
@@ -1393,7 +1408,7 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1393 return ret ?: regs->orig_ax; 1408 return ret ?: regs->orig_ax;
1394} 1409}
1395 1410
1396asmregparm void syscall_trace_leave(struct pt_regs *regs) 1411void syscall_trace_leave(struct pt_regs *regs)
1397{ 1412{
1398 bool step; 1413 bool step;
1399 1414
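Every breakpoint path in the ptrace.c hunks above is now bracketed by ptrace_get_breakpoints() and ptrace_put_breakpoints(), with failures funnelled through a single release point instead of early returns. A reduced sketch of that acquire/early-exit/release shape; acquire() and release() here are stand-ins for the real helpers:

static int acquire(void)  { return 0; }  /* < 0 would mean the task is gone */
static void release(void) { }

static int set_breakpoint_sketch(int nr, unsigned long addr)
{
        int err = 0;

        if (acquire() < 0)
                return -3;               /* -ESRCH in the real code */

        if (nr < 0 || addr == 0) {       /* stand-in for a real failure */
                err = -22;               /* -EINVAL */
                goto put;
        }

        /* ... install or modify the breakpoint here ... */

put:
        release();                       /* single release point, as above */
        return err;
}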
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 239427ca02af..42eb3300dfc6 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -41,48 +41,11 @@ void pvclock_set_flags(u8 flags)
41 valid_flags = flags; 41 valid_flags = flags;
42} 42}
43 43
44/*
45 * Scale a 64-bit delta by scaling and multiplying by a 32-bit fraction,
46 * yielding a 64-bit result.
47 */
48static inline u64 scale_delta(u64 delta, u32 mul_frac, int shift)
49{
50 u64 product;
51#ifdef __i386__
52 u32 tmp1, tmp2;
53#endif
54
55 if (shift < 0)
56 delta >>= -shift;
57 else
58 delta <<= shift;
59
60#ifdef __i386__
61 __asm__ (
62 "mul %5 ; "
63 "mov %4,%%eax ; "
64 "mov %%edx,%4 ; "
65 "mul %5 ; "
66 "xor %5,%5 ; "
67 "add %4,%%eax ; "
68 "adc %5,%%edx ; "
69 : "=A" (product), "=r" (tmp1), "=r" (tmp2)
70 : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (mul_frac) );
71#elif defined(__x86_64__)
72 __asm__ (
73 "mul %%rdx ; shrd $32,%%rdx,%%rax"
74 : "=a" (product) : "0" (delta), "d" ((u64)mul_frac) );
75#else
76#error implement me!
77#endif
78
79 return product;
80}
81
82static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow) 44static u64 pvclock_get_nsec_offset(struct pvclock_shadow_time *shadow)
83{ 45{
84 u64 delta = native_read_tsc() - shadow->tsc_timestamp; 46 u64 delta = native_read_tsc() - shadow->tsc_timestamp;
85 return scale_delta(delta, shadow->tsc_to_nsec_mul, shadow->tsc_shift); 47 return pvclock_scale_delta(delta, shadow->tsc_to_nsec_mul,
48 shadow->tsc_shift);
86} 49}
87 50
88/* 51/*
@@ -120,6 +83,11 @@ unsigned long pvclock_tsc_khz(struct pvclock_vcpu_time_info *src)
120 83
121static atomic64_t last_value = ATOMIC64_INIT(0); 84static atomic64_t last_value = ATOMIC64_INIT(0);
122 85
86void pvclock_resume(void)
87{
88 atomic64_set(&last_value, 0);
89}
90
123cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src) 91cycle_t pvclock_clocksource_read(struct pvclock_vcpu_time_info *src)
124{ 92{
125 struct pvclock_shadow_time shadow; 93 struct pvclock_shadow_time shadow;
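The scale_delta() routine removed above computes (delta << shift) * mul_frac / 2^32 with inline assembly, and the call site now uses pvclock_scale_delta() instead. On a compiler with 128-bit integers the same fixed-point arithmetic can be sketched portably (an illustration of the math, not the kernel helper):

#include <stdint.h>

/* Shift the TSC delta, multiply by a 32.32 fraction, keep the high
 * 64 bits of the product.  Assumes unsigned __int128 support. */
static inline uint64_t scale_delta_sketch(uint64_t delta, uint32_t mul_frac,
                                          int shift)
{
        if (shift < 0)
                delta >>= -shift;
        else
                delta <<= shift;

        return (uint64_t)(((unsigned __int128)delta * mul_frac) >> 32);
}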
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 939b9e98245f..8bbe8c56916d 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -344,6 +344,8 @@ DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8235,
344 vt8237_force_enable_hpet); 344 vt8237_force_enable_hpet);
345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237, 345DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_8237,
346 vt8237_force_enable_hpet); 346 vt8237_force_enable_hpet);
347DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_VIA, PCI_DEVICE_ID_VIA_CX700,
348 vt8237_force_enable_hpet);
347 349
348static void ati_force_hpet_resume(void) 350static void ati_force_hpet_resume(void)
349{ 351{
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index e3af342fe83a..9242436e9937 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -6,6 +6,7 @@
6#include <linux/dmi.h> 6#include <linux/dmi.h>
7#include <linux/sched.h> 7#include <linux/sched.h>
8#include <linux/tboot.h> 8#include <linux/tboot.h>
9#include <linux/delay.h>
9#include <acpi/reboot.h> 10#include <acpi/reboot.h>
10#include <asm/io.h> 11#include <asm/io.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
@@ -18,6 +19,7 @@
18#include <asm/pci_x86.h> 19#include <asm/pci_x86.h>
19#include <asm/virtext.h> 20#include <asm/virtext.h>
20#include <asm/cpu.h> 21#include <asm/cpu.h>
22#include <asm/nmi.h>
21 23
22#ifdef CONFIG_X86_32 24#ifdef CONFIG_X86_32
23# include <linux/ctype.h> 25# include <linux/ctype.h>
@@ -34,7 +36,7 @@ EXPORT_SYMBOL(pm_power_off);
34 36
35static const struct desc_ptr no_idt = {}; 37static const struct desc_ptr no_idt = {};
36static int reboot_mode; 38static int reboot_mode;
37enum reboot_type reboot_type = BOOT_KBD; 39enum reboot_type reboot_type = BOOT_ACPI;
38int reboot_force; 40int reboot_force;
39 41
40#if defined(CONFIG_X86_32) && defined(CONFIG_SMP) 42#if defined(CONFIG_X86_32) && defined(CONFIG_SMP)
@@ -84,7 +86,7 @@ static int __init reboot_setup(char *str)
84 } 86 }
85 /* we will leave sorting out the final value 87 /* we will leave sorting out the final value
86 when we are ready to reboot, since we might not 88 when we are ready to reboot, since we might not
87 have set up boot_cpu_id or smp_num_cpu */ 89 have detected BSP APIC ID or smp_num_cpu */
88 break; 90 break;
89#endif /* CONFIG_SMP */ 91#endif /* CONFIG_SMP */
90 92
@@ -284,6 +286,22 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
284 DMI_MATCH(DMI_BOARD_NAME, "P4S800"), 286 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
285 }, 287 },
286 }, 288 },
289 { /* Handle problems with rebooting on VersaLogic Menlow boards */
290 .callback = set_bios_reboot,
291 .ident = "VersaLogic Menlow based board",
292 .matches = {
293 DMI_MATCH(DMI_BOARD_VENDOR, "VersaLogic Corporation"),
294 DMI_MATCH(DMI_BOARD_NAME, "VersaLogic Menlow board"),
295 },
296 },
297 { /* Handle reboot issue on Acer Aspire one */
298 .callback = set_bios_reboot,
299 .ident = "Acer Aspire One A110",
300 .matches = {
301 DMI_MATCH(DMI_SYS_VENDOR, "Acer"),
302 DMI_MATCH(DMI_PRODUCT_NAME, "AOA110"),
303 },
304 },
287 { } 305 { }
288}; 306};
289 307
@@ -294,68 +312,16 @@ static int __init reboot_init(void)
294} 312}
295core_initcall(reboot_init); 313core_initcall(reboot_init);
296 314
297/* The following code and data reboots the machine by switching to real 315extern const unsigned char machine_real_restart_asm[];
298 mode and jumping to the BIOS reset entry point, as if the CPU has 316extern const u64 machine_real_restart_gdt[3];
299 really been reset. The previous version asked the keyboard
300 controller to pulse the CPU reset line, which is more thorough, but
301 doesn't work with at least one type of 486 motherboard. It is easy
302 to stop this code working; hence the copious comments. */
303static const unsigned long long
304real_mode_gdt_entries [3] =
305{
306 0x0000000000000000ULL, /* Null descriptor */
307 0x00009b000000ffffULL, /* 16-bit real-mode 64k code at 0x00000000 */
308 0x000093000100ffffULL /* 16-bit real-mode 64k data at 0x00000100 */
309};
310 317
311static const struct desc_ptr 318void machine_real_restart(unsigned int type)
312real_mode_gdt = { sizeof (real_mode_gdt_entries) - 1, (long)real_mode_gdt_entries },
313real_mode_idt = { 0x3ff, 0 };
314
315/* This is 16-bit protected mode code to disable paging and the cache,
316 switch to real mode and jump to the BIOS reset code.
317
318 The instruction that switches to real mode by writing to CR0 must be
319 followed immediately by a far jump instruction, which set CS to a
320 valid value for real mode, and flushes the prefetch queue to avoid
321 running instructions that have already been decoded in protected
322 mode.
323
324 Clears all the flags except ET, especially PG (paging), PE
325 (protected-mode enable) and TS (task switch for coprocessor state
326 save). Flushes the TLB after paging has been disabled. Sets CD and
327 NW, to disable the cache on a 486, and invalidates the cache. This
328 is more like the state of a 486 after reset. I don't know if
329 something else should be done for other chips.
330
331 More could be done here to set up the registers as if a CPU reset had
332 occurred; hopefully real BIOSs don't assume much. */
333static const unsigned char real_mode_switch [] =
334{ 319{
335 0x66, 0x0f, 0x20, 0xc0, /* movl %cr0,%eax */ 320 void *restart_va;
336 0x66, 0x83, 0xe0, 0x11, /* andl $0x00000011,%eax */ 321 unsigned long restart_pa;
337 0x66, 0x0d, 0x00, 0x00, 0x00, 0x60, /* orl $0x60000000,%eax */ 322 void (*restart_lowmem)(unsigned int);
338 0x66, 0x0f, 0x22, 0xc0, /* movl %eax,%cr0 */ 323 u64 *lowmem_gdt;
339 0x66, 0x0f, 0x22, 0xd8, /* movl %eax,%cr3 */
340 0x66, 0x0f, 0x20, 0xc3, /* movl %cr0,%ebx */
341 0x66, 0x81, 0xe3, 0x00, 0x00, 0x00, 0x60, /* andl $0x60000000,%ebx */
342 0x74, 0x02, /* jz f */
343 0x0f, 0x09, /* wbinvd */
344 0x24, 0x10, /* f: andb $0x10,al */
345 0x66, 0x0f, 0x22, 0xc0 /* movl %eax,%cr0 */
346};
347static const unsigned char jump_to_bios [] =
348{
349 0xea, 0x00, 0x00, 0xff, 0xff /* ljmp $0xffff,$0x0000 */
350};
351 324
352/*
353 * Switch to real mode and then execute the code
354 * specified by the code and length parameters.
 355 * We assume that length will always be less than 100!
356 */
357void machine_real_restart(const unsigned char *code, int length)
358{
359 local_irq_disable(); 325 local_irq_disable();
360 326
361 /* Write zero to CMOS register number 0x0f, which the BIOS POST 327 /* Write zero to CMOS register number 0x0f, which the BIOS POST
@@ -371,16 +337,10 @@ void machine_real_restart(const unsigned char *code, int length)
371 CMOS_WRITE(0x00, 0x8f); 337 CMOS_WRITE(0x00, 0x8f);
372 spin_unlock(&rtc_lock); 338 spin_unlock(&rtc_lock);
373 339
374 /* Remap the kernel at virtual address zero, as well as offset zero
375 from the kernel segment. This assumes the kernel segment starts at
376 virtual address PAGE_OFFSET. */
377 memcpy(swapper_pg_dir, swapper_pg_dir + KERNEL_PGD_BOUNDARY,
378 sizeof(swapper_pg_dir [0]) * KERNEL_PGD_PTRS);
379
380 /* 340 /*
381 * Use `swapper_pg_dir' as our page directory. 341 * Switch back to the initial page table.
382 */ 342 */
383 load_cr3(swapper_pg_dir); 343 load_cr3(initial_page_table);
384 344
385 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads 345 /* Write 0x1234 to absolute memory location 0x472. The BIOS reads
386 this on booting to tell it to "Bypass memory test (also warm 346 this on booting to tell it to "Bypass memory test (also warm
@@ -389,41 +349,23 @@ void machine_real_restart(const unsigned char *code, int length)
389 too. */ 349 too. */
390 *((unsigned short *)0x472) = reboot_mode; 350 *((unsigned short *)0x472) = reboot_mode;
391 351
392 /* For the switch to real mode, copy some code to low memory. It has 352 /* Patch the GDT in the low memory trampoline */
393 to be in the first 64k because it is running in 16-bit mode, and it 353 lowmem_gdt = TRAMPOLINE_SYM(machine_real_restart_gdt);
394 has to have the same physical and virtual address, because it turns 354
395 off paging. Copy it near the end of the first page, out of the way 355 restart_va = TRAMPOLINE_SYM(machine_real_restart_asm);
396 of BIOS variables. */ 356 restart_pa = virt_to_phys(restart_va);
397 memcpy((void *)(0x1000 - sizeof(real_mode_switch) - 100), 357 restart_lowmem = (void (*)(unsigned int))restart_pa;
398 real_mode_switch, sizeof (real_mode_switch)); 358
399 memcpy((void *)(0x1000 - 100), code, length); 359 /* GDT[0]: GDT self-pointer */
400 360 lowmem_gdt[0] =
401 /* Set up the IDT for real mode. */ 361 (u64)(sizeof(machine_real_restart_gdt) - 1) +
402 load_idt(&real_mode_idt); 362 ((u64)virt_to_phys(lowmem_gdt) << 16);
403 363 /* GDT[1]: 64K real mode code segment */
404 /* Set up a GDT from which we can load segment descriptors for real 364 lowmem_gdt[1] =
405 mode. The GDT is not used in real mode; it is just needed here to 365 GDT_ENTRY(0x009b, restart_pa, 0xffff);
406 prepare the descriptors. */ 366
407 load_gdt(&real_mode_gdt); 367 /* Jump to the identity-mapped low memory code */
408 368 restart_lowmem(type);
409 /* Load the data segment registers, and thus the descriptors ready for
410 real mode. The base address of each segment is 0x100, 16 times the
411 selector value being loaded here. This is so that the segment
412 registers don't have to be reloaded after switching to real mode:
413 the values are consistent for real mode operation already. */
414 __asm__ __volatile__ ("movl $0x0010,%%eax\n"
415 "\tmovl %%eax,%%ds\n"
416 "\tmovl %%eax,%%es\n"
417 "\tmovl %%eax,%%fs\n"
418 "\tmovl %%eax,%%gs\n"
419 "\tmovl %%eax,%%ss" : : : "eax");
420
421 /* Jump to the 16-bit code that we copied earlier. It disables paging
422 and the cache, switches to real mode, and jumps to the BIOS reset
423 entry point. */
424 __asm__ __volatile__ ("ljmp $0x0008,%0"
425 :
426 : "i" ((void *)(0x1000 - sizeof (real_mode_switch) - 100)));
427} 369}
428#ifdef CONFIG_APM_MODULE 370#ifdef CONFIG_APM_MODULE
429EXPORT_SYMBOL(machine_real_restart); 371EXPORT_SYMBOL(machine_real_restart);
@@ -477,6 +419,30 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
477 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"), 419 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
478 }, 420 },
479 }, 421 },
422 { /* Handle problems with rebooting on the Latitude E6320. */
423 .callback = set_pci_reboot,
424 .ident = "Dell Latitude E6320",
425 .matches = {
426 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
427 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6320"),
428 },
429 },
430 { /* Handle problems with rebooting on the Latitude E5420. */
431 .callback = set_pci_reboot,
432 .ident = "Dell Latitude E5420",
433 .matches = {
434 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
435 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E5420"),
436 },
437 },
438 { /* Handle problems with rebooting on the Latitude E6420. */
439 .callback = set_pci_reboot,
440 .ident = "Dell Latitude E6420",
441 .matches = {
442 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
443 DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"),
444 },
445 },
480 { } 446 { }
481}; 447};
482 448
@@ -544,9 +510,24 @@ void __attribute__((weak)) mach_reboot_fixups(void)
544{ 510{
545} 511}
546 512
513/*
514 * Windows compatible x86 hardware expects the following on reboot:
515 *
516 * 1) If the FADT has the ACPI reboot register flag set, try it
517 * 2) If still alive, write to the keyboard controller
518 * 3) If still alive, write to the ACPI reboot register again
519 * 4) If still alive, write to the keyboard controller again
520 *
521 * If the machine is still alive at this stage, it gives up. We default to
522 * following the same pattern, except that if we're still alive after (4) we'll
 523 * try to force a triple fault, and then cycle between hitting the
 524 * keyboard controller and forcing the triple fault.
525 */
547static void native_machine_emergency_restart(void) 526static void native_machine_emergency_restart(void)
548{ 527{
549 int i; 528 int i;
529 int attempt = 0;
530 int orig_reboot_type = reboot_type;
550 531
551 if (reboot_emergency) 532 if (reboot_emergency)
552 emergency_vmx_disable_all(); 533 emergency_vmx_disable_all();
@@ -568,6 +549,13 @@ static void native_machine_emergency_restart(void)
568 outb(0xfe, 0x64); /* pulse reset low */ 549 outb(0xfe, 0x64); /* pulse reset low */
569 udelay(50); 550 udelay(50);
570 } 551 }
552 if (attempt == 0 && orig_reboot_type == BOOT_ACPI) {
553 attempt = 1;
554 reboot_type = BOOT_ACPI;
555 } else {
556 reboot_type = BOOT_TRIPLE;
557 }
558 break;
571 559
572 case BOOT_TRIPLE: 560 case BOOT_TRIPLE:
573 load_idt(&no_idt); 561 load_idt(&no_idt);
@@ -578,7 +566,7 @@ static void native_machine_emergency_restart(void)
578 566
579#ifdef CONFIG_X86_32 567#ifdef CONFIG_X86_32
580 case BOOT_BIOS: 568 case BOOT_BIOS:
581 machine_real_restart(jump_to_bios, sizeof(jump_to_bios)); 569 machine_real_restart(MRR_BIOS);
582 570
583 reboot_type = BOOT_KBD; 571 reboot_type = BOOT_KBD;
584 break; 572 break;
@@ -641,7 +629,7 @@ void native_machine_shutdown(void)
641 /* O.K Now that I'm on the appropriate processor, 629 /* O.K Now that I'm on the appropriate processor,
642 * stop all of the others. 630 * stop all of the others.
643 */ 631 */
644 smp_send_stop(); 632 stop_other_cpus();
645#endif 633#endif
646 634
647 lapic_shutdown(); 635 lapic_shutdown();
@@ -753,7 +741,7 @@ static int crash_nmi_callback(struct notifier_block *self,
753{ 741{
754 int cpu; 742 int cpu;
755 743
756 if (val != DIE_NMI_IPI) 744 if (val != DIE_NMI)
757 return NOTIFY_OK; 745 return NOTIFY_OK;
758 746
759 cpu = raw_smp_processor_id(); 747 cpu = raw_smp_processor_id();
@@ -784,6 +772,8 @@ static void smp_send_nmi_allbutself(void)
784 772
785static struct notifier_block crash_nmi_nb = { 773static struct notifier_block crash_nmi_nb = {
786 .notifier_call = crash_nmi_callback, 774 .notifier_call = crash_nmi_callback,
775 /* we want to be the first one called */
776 .priority = NMI_LOCAL_HIGH_PRIOR+1,
787}; 777};
788 778
789/* Halt all other CPUs, calling the specified function on each of them 779/* Halt all other CPUs, calling the specified function on each of them
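The comment added above describes the reboot fallback order: ACPI, keyboard controller, ACPI again, keyboard controller again, then a triple fault. The emergency-restart hunk implements it by mutating reboot_type inside its retry loop; the sequencing alone can be sketched as a small state function (names here are hypothetical, only the ordering follows the diff):

enum boot_type_sketch { BT_ACPI, BT_KBD, BT_TRIPLE };

/* Given the method that just failed, pick the next one to try for a
 * boot that started out as BOOT_ACPI. */
static enum boot_type_sketch next_reboot_method(enum boot_type_sketch cur,
                                                int *attempt,
                                                enum boot_type_sketch orig)
{
        if (cur == BT_ACPI)
                return BT_KBD;           /* 1) ACPI, 2) keyboard controller */
        if (cur == BT_KBD && *attempt == 0 && orig == BT_ACPI) {
                *attempt = 1;
                return BT_ACPI;          /* 3) ACPI again, then 4) keyboard */
        }
        return BT_TRIPLE;                /* finally force a triple fault */
}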
diff --git a/arch/x86/kernel/reboot_32.S b/arch/x86/kernel/reboot_32.S
new file mode 100644
index 000000000000..1d5c46df0d78
--- /dev/null
+++ b/arch/x86/kernel/reboot_32.S
@@ -0,0 +1,135 @@
1#include <linux/linkage.h>
2#include <linux/init.h>
3#include <asm/segment.h>
4#include <asm/page_types.h>
5
6/*
7 * The following code and data reboots the machine by switching to real
8 * mode and jumping to the BIOS reset entry point, as if the CPU has
9 * really been reset. The previous version asked the keyboard
10 * controller to pulse the CPU reset line, which is more thorough, but
11 * doesn't work with at least one type of 486 motherboard. It is easy
12 * to stop this code working; hence the copious comments.
13 *
14 * This code is called with the restart type (0 = BIOS, 1 = APM) in %eax.
15 */
16 .section ".x86_trampoline","a"
17 .balign 16
18 .code32
19ENTRY(machine_real_restart_asm)
20r_base = .
21 /* Get our own relocated address */
22 call 1f
231: popl %ebx
24 subl $(1b - r_base), %ebx
25
26 /* Compute the equivalent real-mode segment */
27 movl %ebx, %ecx
28 shrl $4, %ecx
29
30 /* Patch post-real-mode segment jump */
31 movw (dispatch_table - r_base)(%ebx,%eax,2),%ax
32 movw %ax, (101f - r_base)(%ebx)
33 movw %cx, (102f - r_base)(%ebx)
34
35 /* Set up the IDT for real mode. */
36 lidtl (machine_real_restart_idt - r_base)(%ebx)
37
38 /*
39 * Set up a GDT from which we can load segment descriptors for real
40 * mode. The GDT is not used in real mode; it is just needed here to
41 * prepare the descriptors.
42 */
43 lgdtl (machine_real_restart_gdt - r_base)(%ebx)
44
45 /*
46 * Load the data segment registers with 16-bit compatible values
47 */
48 movl $16, %ecx
49 movl %ecx, %ds
50 movl %ecx, %es
51 movl %ecx, %fs
52 movl %ecx, %gs
53 movl %ecx, %ss
54 ljmpl $8, $1f - r_base
55
56/*
57 * This is 16-bit protected mode code to disable paging and the cache,
58 * switch to real mode and jump to the BIOS reset code.
59 *
60 * The instruction that switches to real mode by writing to CR0 must be
 61 * followed immediately by a far jump instruction, which sets CS to a
62 * valid value for real mode, and flushes the prefetch queue to avoid
63 * running instructions that have already been decoded in protected
64 * mode.
65 *
66 * Clears all the flags except ET, especially PG (paging), PE
67 * (protected-mode enable) and TS (task switch for coprocessor state
68 * save). Flushes the TLB after paging has been disabled. Sets CD and
69 * NW, to disable the cache on a 486, and invalidates the cache. This
70 * is more like the state of a 486 after reset. I don't know if
71 * something else should be done for other chips.
72 *
73 * More could be done here to set up the registers as if a CPU reset had
74 * occurred; hopefully real BIOSs don't assume much. This is not the
75 * actual BIOS entry point, anyway (that is at 0xfffffff0).
76 *
77 * Most of this work is probably excessive, but it is what is tested.
78 */
79 .code16
801:
81 xorl %ecx, %ecx
82 movl %cr0, %eax
83 andl $0x00000011, %eax
84 orl $0x60000000, %eax
85 movl %eax, %cr0
86 movl %ecx, %cr3
87 movl %cr0, %edx
88 andl $0x60000000, %edx /* If no cache bits -> no wbinvd */
89 jz 2f
90 wbinvd
912:
92 andb $0x10, %al
93 movl %eax, %cr0
94 .byte 0xea /* ljmpw */
95101: .word 0 /* Offset */
96102: .word 0 /* Segment */
97
98bios:
99 ljmpw $0xf000, $0xfff0
100
101apm:
102 movw $0x1000, %ax
103 movw %ax, %ss
104 movw $0xf000, %sp
105 movw $0x5307, %ax
106 movw $0x0001, %bx
107 movw $0x0003, %cx
108 int $0x15
109
110END(machine_real_restart_asm)
111
112 .balign 16
 113	/* These must match <asm/reboot.h> */
114dispatch_table:
115 .word bios - r_base
116 .word apm - r_base
117END(dispatch_table)
118
119 .balign 16
120machine_real_restart_idt:
121 .word 0xffff /* Length - real mode default value */
122 .long 0 /* Base - real mode default value */
123END(machine_real_restart_idt)
124
125 .balign 16
126ENTRY(machine_real_restart_gdt)
127 .quad 0 /* Self-pointer, filled in by PM code */
128 .quad 0 /* 16-bit code segment, filled in by PM code */
129 /*
130 * 16-bit data segment with the selector value 16 = 0x10 and
131 * base value 0x100; since this is consistent with real mode
132 * semantics we don't have to reload the segments once CR0.PE = 0.
133 */
134 .quad GDT_ENTRY(0x0093, 0x100, 0xffff)
135END(machine_real_restart_gdt)
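Both the trampoline above and the patched-in entry in reboot.c build their segments with GDT_ENTRY(flags, base, limit). A sketch of how such a 64-bit descriptor is packed, following the architectural layout (check it against the kernel's own macro before relying on the exact masks):

#include <stdint.h>

static inline uint64_t gdt_entry_sketch(uint64_t flags, uint64_t base,
                                        uint64_t limit)
{
        return ((base  & 0xff000000ULL) << (56 - 24)) | /* base  31:24   */
               ((flags & 0x0000f0ffULL) << 40)        | /* type + flags  */
               ((limit & 0x000f0000ULL) << (48 - 16)) | /* limit 19:16   */
               ((base  & 0x00ffffffULL) << 16)        | /* base  23:0    */
                (limit & 0x0000ffffULL);                /* limit 15:0    */
}

/* e.g. gdt_entry_sketch(0x009b, restart_pa, 0xffff) yields a present,
 * byte-granular 64K 16-bit code segment at restart_pa, matching the
 * "16-bit real mode code segment" patched in above. */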
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index fda313ebbb03..c8e41e90f59c 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -43,17 +43,33 @@ static void rdc321x_reset(struct pci_dev *dev)
43 outb(1, 0x92); 43 outb(1, 0x92);
44} 44}
45 45
46static void ce4100_reset(struct pci_dev *dev)
47{
48 int i;
49
50 for (i = 0; i < 10; i++) {
51 outb(0x2, 0xcf9);
52 udelay(50);
53 }
54}
55
46struct device_fixup { 56struct device_fixup {
47 unsigned int vendor; 57 unsigned int vendor;
48 unsigned int device; 58 unsigned int device;
49 void (*reboot_fixup)(struct pci_dev *); 59 void (*reboot_fixup)(struct pci_dev *);
50}; 60};
51 61
62/*
63 * PCI ids solely used for fixups_table go here
64 */
65#define PCI_DEVICE_ID_INTEL_CE4100 0x0708
66
52static const struct device_fixup fixups_table[] = { 67static const struct device_fixup fixups_table[] = {
53{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset }, 68{ PCI_VENDOR_ID_CYRIX, PCI_DEVICE_ID_CYRIX_5530_LEGACY, cs5530a_warm_reset },
54{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset }, 69{ PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CS5536_ISA, cs5536_warm_reset },
55{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset }, 70{ PCI_VENDOR_ID_NS, PCI_DEVICE_ID_NS_SC1100_BRIDGE, cs5530a_warm_reset },
56{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset }, 71{ PCI_VENDOR_ID_RDC, PCI_DEVICE_ID_RDC_R6030, rdc321x_reset },
72{ PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_CE4100, ce4100_reset },
57}; 73};
58 74
59/* 75/*
diff --git a/arch/x86/kernel/resource.c b/arch/x86/kernel/resource.c
new file mode 100644
index 000000000000..2a26819bb6a8
--- /dev/null
+++ b/arch/x86/kernel/resource.c
@@ -0,0 +1,48 @@
1#include <linux/ioport.h>
2#include <asm/e820.h>
3
4static void resource_clip(struct resource *res, resource_size_t start,
5 resource_size_t end)
6{
7 resource_size_t low = 0, high = 0;
8
9 if (res->end < start || res->start > end)
10 return; /* no conflict */
11
12 if (res->start < start)
13 low = start - res->start;
14
15 if (res->end > end)
16 high = res->end - end;
17
18 /* Keep the area above or below the conflict, whichever is larger */
19 if (low > high)
20 res->end = start - 1;
21 else
22 res->start = end + 1;
23}
24
25static void remove_e820_regions(struct resource *avail)
26{
27 int i;
28 struct e820entry *entry;
29
30 for (i = 0; i < e820.nr_map; i++) {
31 entry = &e820.map[i];
32
33 resource_clip(avail, entry->addr,
34 entry->addr + entry->size - 1);
35 }
36}
37
38void arch_remove_reservations(struct resource *avail)
39{
40 /* Trim out BIOS areas (low 1MB and high 2MB) and E820 regions */
41 if (avail->flags & IORESOURCE_MEM) {
42 if (avail->start < BIOS_END)
43 avail->start = BIOS_END;
44 resource_clip(avail, BIOS_ROM_BASE, BIOS_ROM_END);
45
46 remove_e820_regions(avail);
47 }
48}
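resource_clip() above trims an available window against a conflicting range by keeping whichever side of the conflict is larger. A small self-contained example with made-up addresses shows the effect:

#include <stdio.h>

struct res_sketch { unsigned long long start, end; };

/* Same clipping rule as resource_clip() above. */
static void clip(struct res_sketch *r, unsigned long long start,
                 unsigned long long end)
{
        unsigned long long low = 0, high = 0;

        if (r->end < start || r->start > end)
                return;                 /* no conflict */
        if (r->start < start)
                low = start - r->start;
        if (r->end > end)
                high = r->end - end;
        if (low > high)
                r->end = start - 1;     /* keep the area below the conflict */
        else
                r->start = end + 1;     /* keep the area above the conflict */
}

int main(void)
{
        /* Window 0x1000-0xffff, conflict 0x2000-0x2fff: the region above
         * the conflict is larger, so the window becomes 0x3000-0xffff. */
        struct res_sketch avail = { 0x1000, 0xffff };

        clip(&avail, 0x2000, 0x2fff);
        printf("%#llx-%#llx\n", avail.start, avail.end);
        return 0;
}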
diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 1cfbbfc3ae26..3f2ad2640d85 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -6,6 +6,7 @@
6#include <linux/acpi.h> 6#include <linux/acpi.h>
7#include <linux/bcd.h> 7#include <linux/bcd.h>
8#include <linux/pnp.h> 8#include <linux/pnp.h>
9#include <linux/of.h>
9 10
10#include <asm/vsyscall.h> 11#include <asm/vsyscall.h>
11#include <asm/x86_init.h> 12#include <asm/x86_init.h>
@@ -76,7 +77,7 @@ int mach_set_rtc_mmss(unsigned long nowtime)
76 CMOS_WRITE(real_seconds, RTC_SECONDS); 77 CMOS_WRITE(real_seconds, RTC_SECONDS);
77 CMOS_WRITE(real_minutes, RTC_MINUTES); 78 CMOS_WRITE(real_minutes, RTC_MINUTES);
78 } else { 79 } else {
79 printk(KERN_WARNING 80 printk_once(KERN_NOTICE
80 "set_rtc_mmss: can't update from %d to %d\n", 81 "set_rtc_mmss: can't update from %d to %d\n",
81 cmos_minutes, real_minutes); 82 cmos_minutes, real_minutes);
82 retval = -1; 83 retval = -1;
@@ -236,6 +237,8 @@ static __init int add_rtc_cmos(void)
236 } 237 }
237 } 238 }
238#endif 239#endif
240 if (of_have_populated_dt())
241 return 0;
239 242
240 platform_device_register(&rtc_device); 243 platform_device_register(&rtc_device);
241 dev_info(&rtc_device.dev, 244 dev_info(&rtc_device.dev,
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index c3a4fbb2b996..afaf38447ef5 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -31,6 +31,7 @@
31#include <linux/apm_bios.h> 31#include <linux/apm_bios.h>
32#include <linux/initrd.h> 32#include <linux/initrd.h>
33#include <linux/bootmem.h> 33#include <linux/bootmem.h>
34#include <linux/memblock.h>
34#include <linux/seq_file.h> 35#include <linux/seq_file.h>
35#include <linux/console.h> 36#include <linux/console.h>
36#include <linux/mca.h> 37#include <linux/mca.h>
@@ -83,7 +84,6 @@
83#include <asm/dmi.h> 84#include <asm/dmi.h>
84#include <asm/io_apic.h> 85#include <asm/io_apic.h>
85#include <asm/ist.h> 86#include <asm/ist.h>
86#include <asm/vmi.h>
87#include <asm/setup_arch.h> 87#include <asm/setup_arch.h>
88#include <asm/bios_ebda.h> 88#include <asm/bios_ebda.h>
89#include <asm/cacheflush.h> 89#include <asm/cacheflush.h>
@@ -107,11 +107,13 @@
107#include <asm/percpu.h> 107#include <asm/percpu.h>
108#include <asm/topology.h> 108#include <asm/topology.h>
109#include <asm/apicdef.h> 109#include <asm/apicdef.h>
110#include <asm/k8.h> 110#include <asm/amd_nb.h>
111#ifdef CONFIG_X86_64 111#ifdef CONFIG_X86_64
112#include <asm/numa_64.h> 112#include <asm/numa_64.h>
113#endif 113#endif
114#include <asm/mce.h> 114#include <asm/mce.h>
115#include <asm/alternative.h>
116#include <asm/prom.h>
115 117
116/* 118/*
117 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 119 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -125,7 +127,6 @@ unsigned long max_pfn_mapped;
125RESERVE_BRK(dmi_alloc, 65536); 127RESERVE_BRK(dmi_alloc, 65536);
126#endif 128#endif
127 129
128unsigned int boot_cpu_id __read_mostly;
129 130
130static __initdata unsigned long _brk_start = (unsigned long)__brk_base; 131static __initdata unsigned long _brk_start = (unsigned long)__brk_base;
131unsigned long _brk_end = (unsigned long)__brk_base; 132unsigned long _brk_end = (unsigned long)__brk_base;
@@ -297,12 +298,15 @@ static void __init init_gbpages(void)
297static inline void init_gbpages(void) 298static inline void init_gbpages(void)
298{ 299{
299} 300}
301static void __init cleanup_highmap(void)
302{
303}
300#endif 304#endif
301 305
302static void __init reserve_brk(void) 306static void __init reserve_brk(void)
303{ 307{
304 if (_brk_end > _brk_start) 308 if (_brk_end > _brk_start)
305 reserve_early(__pa(_brk_start), __pa(_brk_end), "BRK"); 309 memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK");
306 310
307 /* Mark brk area as locked down and no longer taking any 311 /* Mark brk area as locked down and no longer taking any
308 new allocations */ 312 new allocations */
@@ -324,17 +328,16 @@ static void __init relocate_initrd(void)
324 char *p, *q; 328 char *p, *q;
325 329
326 /* We need to move the initrd down into lowmem */ 330 /* We need to move the initrd down into lowmem */
327 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size, 331 ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size,
328 PAGE_SIZE); 332 PAGE_SIZE);
329 333
330 if (ramdisk_here == -1ULL) 334 if (ramdisk_here == MEMBLOCK_ERROR)
331 panic("Cannot find place for new RAMDISK of size %lld\n", 335 panic("Cannot find place for new RAMDISK of size %lld\n",
332 ramdisk_size); 336 ramdisk_size);
333 337
334 /* Note: this includes all the lowmem currently occupied by 338 /* Note: this includes all the lowmem currently occupied by
335 the initrd, we rely on that fact to keep the data intact. */ 339 the initrd, we rely on that fact to keep the data intact. */
336 reserve_early(ramdisk_here, ramdisk_here + area_size, 340 memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK");
337 "NEW RAMDISK");
338 initrd_start = ramdisk_here + PAGE_OFFSET; 341 initrd_start = ramdisk_here + PAGE_OFFSET;
339 initrd_end = initrd_start + ramdisk_size; 342 initrd_end = initrd_start + ramdisk_size;
340 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", 343 printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n",
@@ -390,7 +393,7 @@ static void __init reserve_initrd(void)
390 initrd_start = 0; 393 initrd_start = 0;
391 394
392 if (ramdisk_size >= (end_of_lowmem>>1)) { 395 if (ramdisk_size >= (end_of_lowmem>>1)) {
393 free_early(ramdisk_image, ramdisk_end); 396 memblock_x86_free_range(ramdisk_image, ramdisk_end);
394 printk(KERN_ERR "initrd too large to handle, " 397 printk(KERN_ERR "initrd too large to handle, "
395 "disabling initrd\n"); 398 "disabling initrd\n");
396 return; 399 return;
@@ -413,7 +416,7 @@ static void __init reserve_initrd(void)
413 416
414 relocate_initrd(); 417 relocate_initrd();
415 418
416 free_early(ramdisk_image, ramdisk_end); 419 memblock_x86_free_range(ramdisk_image, ramdisk_end);
417} 420}
418#else 421#else
419static void __init reserve_initrd(void) 422static void __init reserve_initrd(void)
@@ -430,16 +433,30 @@ static void __init parse_setup_data(void)
430 return; 433 return;
431 pa_data = boot_params.hdr.setup_data; 434 pa_data = boot_params.hdr.setup_data;
432 while (pa_data) { 435 while (pa_data) {
433 data = early_memremap(pa_data, PAGE_SIZE); 436 u32 data_len, map_len;
437
438 map_len = max(PAGE_SIZE - (pa_data & ~PAGE_MASK),
439 (u64)sizeof(struct setup_data));
440 data = early_memremap(pa_data, map_len);
441 data_len = data->len + sizeof(struct setup_data);
442 if (data_len > map_len) {
443 early_iounmap(data, map_len);
444 data = early_memremap(pa_data, data_len);
445 map_len = data_len;
446 }
447
434 switch (data->type) { 448 switch (data->type) {
435 case SETUP_E820_EXT: 449 case SETUP_E820_EXT:
436 parse_e820_ext(data, pa_data); 450 parse_e820_ext(data);
451 break;
452 case SETUP_DTB:
453 add_dtb(pa_data);
437 break; 454 break;
438 default: 455 default:
439 break; 456 break;
440 } 457 }
441 pa_data = data->next; 458 pa_data = data->next;
442 early_iounmap(data, PAGE_SIZE); 459 early_iounmap(data, map_len);
443 } 460 }
444} 461}
445 462
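The parse_setup_data() rework above first maps a small window, reads the entry's declared length, and remaps with a larger window when the entry does not fit. The same peek-then-remap pattern, sketched with stand-in map()/unmap() helpers in place of early_memremap()/early_iounmap():

#include <stddef.h>

struct setup_hdr_sketch { unsigned int len; };

extern void *map(unsigned long pa, size_t len);
extern void  unmap(void *va, size_t len);

static void *map_whole_entry(unsigned long pa, size_t *map_len)
{
        struct setup_hdr_sketch *hdr;
        size_t need;

        *map_len = sizeof(*hdr);            /* enough to peek at the header */
        hdr = map(pa, *map_len);

        need = sizeof(*hdr) + hdr->len;     /* full size the entry announces */
        if (need > *map_len) {
                unmap(hdr, *map_len);       /* too small: remap at full size */
                *map_len = need;
                hdr = map(pa, *map_len);
        }
        return hdr;
}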
@@ -469,7 +486,7 @@ static void __init e820_reserve_setup_data(void)
469 e820_print_map("reserve setup_data"); 486 e820_print_map("reserve setup_data");
470} 487}
471 488
472static void __init reserve_early_setup_data(void) 489static void __init memblock_x86_reserve_range_setup_data(void)
473{ 490{
474 struct setup_data *data; 491 struct setup_data *data;
475 u64 pa_data; 492 u64 pa_data;
@@ -481,7 +498,7 @@ static void __init reserve_early_setup_data(void)
481 while (pa_data) { 498 while (pa_data) {
482 data = early_memremap(pa_data, sizeof(*data)); 499 data = early_memremap(pa_data, sizeof(*data));
483 sprintf(buf, "setup data %x", data->type); 500 sprintf(buf, "setup data %x", data->type);
484 reserve_early(pa_data, pa_data+sizeof(*data)+data->len, buf); 501 memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf);
485 pa_data = data->next; 502 pa_data = data->next;
486 early_iounmap(data, sizeof(*data)); 503 early_iounmap(data, sizeof(*data));
487 } 504 }
@@ -502,6 +519,18 @@ static inline unsigned long long get_total_mem(void)
502 return total << PAGE_SHIFT; 519 return total << PAGE_SHIFT;
503} 520}
504 521
522/*
523 * Keep the crash kernel below this limit. On 32 bits earlier kernels
524 * would limit the kernel to the low 512 MiB due to mapping restrictions.
525 * On 64 bits, kexec-tools currently limits us to 896 MiB; increase this
526 * limit once kexec-tools are fixed.
527 */
528#ifdef CONFIG_X86_32
529# define CRASH_KERNEL_ADDR_MAX (512 << 20)
530#else
531# define CRASH_KERNEL_ADDR_MAX (896 << 20)
532#endif
533
505static void __init reserve_crashkernel(void) 534static void __init reserve_crashkernel(void)
506{ 535{
507 unsigned long long total_mem; 536 unsigned long long total_mem;
@@ -519,23 +548,27 @@ static void __init reserve_crashkernel(void)
519 if (crash_base <= 0) { 548 if (crash_base <= 0) {
520 const unsigned long long alignment = 16<<20; /* 16M */ 549 const unsigned long long alignment = 16<<20; /* 16M */
521 550
522 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size, 551 /*
 523 alignment); 552 * kexec wants the bzImage below CRASH_KERNEL_ADDR_MAX
524 if (crash_base == -1ULL) { 553 */
554 crash_base = memblock_find_in_range(alignment,
555 CRASH_KERNEL_ADDR_MAX, crash_size, alignment);
556
557 if (crash_base == MEMBLOCK_ERROR) {
525 pr_info("crashkernel reservation failed - No suitable area found.\n"); 558 pr_info("crashkernel reservation failed - No suitable area found.\n");
526 return; 559 return;
527 } 560 }
528 } else { 561 } else {
529 unsigned long long start; 562 unsigned long long start;
530 563
531 start = find_e820_area(crash_base, ULONG_MAX, crash_size, 564 start = memblock_find_in_range(crash_base,
532 1<<20); 565 crash_base + crash_size, crash_size, 1<<20);
533 if (start != crash_base) { 566 if (start != crash_base) {
534 pr_info("crashkernel reservation failed - memory is in use.\n"); 567 pr_info("crashkernel reservation failed - memory is in use.\n");
535 return; 568 return;
536 } 569 }
537 } 570 }
538 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL"); 571 memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL");
539 572
540 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 573 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
541 "for crashkernel (System RAM: %ldMB)\n", 574 "for crashkernel (System RAM: %ldMB)\n",
@@ -586,28 +619,6 @@ void __init reserve_standard_io_resources(void)
586 619
587} 620}
588 621
589/*
590 * Note: elfcorehdr_addr is not just limited to vmcore. It is also used by
591 * is_kdump_kernel() to determine if we are booting after a panic. Hence
592 * ifdef it under CONFIG_CRASH_DUMP and not CONFIG_PROC_VMCORE.
593 */
594
595#ifdef CONFIG_CRASH_DUMP
596/* elfcorehdr= specifies the location of elf core header
597 * stored by the crashed kernel. This option will be passed
598 * by kexec loader to the capture kernel.
599 */
600static int __init setup_elfcorehdr(char *arg)
601{
602 char *end;
603 if (!arg)
604 return -EINVAL;
605 elfcorehdr_addr = memparse(arg, &end);
606 return end > arg ? 0 : -EINVAL;
607}
608early_param("elfcorehdr", setup_elfcorehdr);
609#endif
610
611static __init void reserve_ibft_region(void) 622static __init void reserve_ibft_region(void)
612{ 623{
613 unsigned long addr, size = 0; 624 unsigned long addr, size = 0;
@@ -615,82 +626,10 @@ static __init void reserve_ibft_region(void)
615 addr = find_ibft_region(&size); 626 addr = find_ibft_region(&size);
616 627
617 if (size) 628 if (size)
618 reserve_early_overlap_ok(addr, addr + size, "ibft"); 629 memblock_x86_reserve_range(addr, addr + size, "* ibft");
619} 630}
620 631
621#ifdef CONFIG_X86_RESERVE_LOW_64K 632static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10;
622static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
623{
624 printk(KERN_NOTICE
625 "%s detected: BIOS may corrupt low RAM, working around it.\n",
626 d->ident);
627
628 e820_update_range(0, 0x10000, E820_RAM, E820_RESERVED);
629 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
630
631 return 0;
632}
633#endif
634
635/* List of systems that have known low memory corruption BIOS problems */
636static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
637#ifdef CONFIG_X86_RESERVE_LOW_64K
638 {
639 .callback = dmi_low_memory_corruption,
640 .ident = "AMI BIOS",
641 .matches = {
642 DMI_MATCH(DMI_BIOS_VENDOR, "American Megatrends Inc."),
643 },
644 },
645 {
646 .callback = dmi_low_memory_corruption,
647 .ident = "Phoenix BIOS",
648 .matches = {
649 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix Technologies"),
650 },
651 },
652 {
653 .callback = dmi_low_memory_corruption,
654 .ident = "Phoenix/MSC BIOS",
655 .matches = {
656 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
657 },
658 },
659 /*
660 * AMI BIOS with low memory corruption was found on Intel DG45ID and
661 * DG45FC boards.
662 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
663 * match only DMI_BOARD_NAME and see if there is more bad products
664 * with this vendor.
665 */
666 {
667 .callback = dmi_low_memory_corruption,
668 .ident = "AMI BIOS",
669 .matches = {
670 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
671 },
672 },
673 {
674 .callback = dmi_low_memory_corruption,
675 .ident = "AMI BIOS",
676 .matches = {
677 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
678 },
679 },
680 /*
681 * The Dell Inspiron Mini 1012 has DMI_BIOS_VENDOR = "Dell Inc.", so
682 * match on the product name.
683 */
684 {
685 .callback = dmi_low_memory_corruption,
686 .ident = "Phoenix BIOS",
687 .matches = {
688 DMI_MATCH(DMI_PRODUCT_NAME, "Inspiron 1012"),
689 },
690 },
691#endif
692 {}
693};
694 633
695static void __init trim_bios_range(void) 634static void __init trim_bios_range(void)
696{ 635{
@@ -698,8 +637,14 @@ static void __init trim_bios_range(void)
698 * A special case is the first 4Kb of memory; 637 * A special case is the first 4Kb of memory;
699 * This is a BIOS owned area, not kernel ram, but generally 638 * This is a BIOS owned area, not kernel ram, but generally
700 * not listed as such in the E820 table. 639 * not listed as such in the E820 table.
640 *
641 * This typically reserves additional memory (64KiB by default)
642 * since some BIOSes are known to corrupt low memory. See the
643 * Kconfig help text for X86_RESERVE_LOW.
701 */ 644 */
702 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED); 645 e820_update_range(0, ALIGN(reserve_low, PAGE_SIZE),
646 E820_RAM, E820_RESERVED);
647
703 /* 648 /*
704 * special case: Some BIOSen report the PC BIOS 649 * special case: Some BIOSen report the PC BIOS
705 * area (640->1Mb) as ram even though it is not. 650 * area (640->1Mb) as ram even though it is not.
@@ -709,6 +654,28 @@ static void __init trim_bios_range(void)
709 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 654 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
710} 655}
711 656
657static int __init parse_reservelow(char *p)
658{
659 unsigned long long size;
660
661 if (!p)
662 return -EINVAL;
663
664 size = memparse(p, &p);
665
666 if (size < 4096)
667 size = 4096;
668
669 if (size > 640*1024)
670 size = 640*1024;
671
672 reserve_low = size;
673
674 return 0;
675}
676
677early_param("reservelow", parse_reservelow);
678
712/* 679/*
713 * Determine if we were loaded by an EFI loader. If so, then we have also been 680 * Determine if we were loaded by an EFI loader. If so, then we have also been
714 * passed the efi memmap, systab, etc., so we should use these data structures 681 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -724,20 +691,28 @@ static void __init trim_bios_range(void)
724 691
725void __init setup_arch(char **cmdline_p) 692void __init setup_arch(char **cmdline_p)
726{ 693{
727 int acpi = 0;
728 int k8 = 0;
729
730#ifdef CONFIG_X86_32 694#ifdef CONFIG_X86_32
731 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 695 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
732 visws_early_detect(); 696 visws_early_detect();
697
698 /*
699 * copy kernel address range established so far and switch
700 * to the proper swapper page table
701 */
702 clone_pgd_range(swapper_pg_dir + KERNEL_PGD_BOUNDARY,
703 initial_page_table + KERNEL_PGD_BOUNDARY,
704 KERNEL_PGD_PTRS);
705
706 load_cr3(swapper_pg_dir);
707 __flush_tlb_all();
733#else 708#else
734 printk(KERN_INFO "Command line: %s\n", boot_command_line); 709 printk(KERN_INFO "Command line: %s\n", boot_command_line);
735#endif 710#endif
736 711
737 /* VMI may relocate the fixmap; do this before touching ioremap area */ 712 /*
738 vmi_init(); 713 * If we have OLPC OFW, we might end up relocating the fixmap due to
739 714 * reserve_top(), so do this before touching the ioremap area.
740 /* OFW also may relocate the fixmap */ 715 */
741 olpc_ofw_detect(); 716 olpc_ofw_detect();
742 717
743 early_trap_init(); 718 early_trap_init();
@@ -782,12 +757,13 @@ void __init setup_arch(char **cmdline_p)
782#endif 757#endif
783 4)) { 758 4)) {
784 efi_enabled = 1; 759 efi_enabled = 1;
785 efi_reserve_early(); 760 efi_memblock_x86_reserve_range();
786 } 761 }
787#endif 762#endif
788 763
789 x86_init.oem.arch_setup(); 764 x86_init.oem.arch_setup();
790 765
766 iomem_resource.end = (1ULL << boot_cpu_data.x86_phys_bits) - 1;
791 setup_memory_map(); 767 setup_memory_map();
792 parse_setup_data(); 768 parse_setup_data();
793 /* update the e820_saved too */ 769 /* update the e820_saved too */
@@ -838,11 +814,8 @@ void __init setup_arch(char **cmdline_p)
838 814
839 x86_report_nx(); 815 x86_report_nx();
840 816
841 /* Must be before kernel pagetables are setup */
842 vmi_activate();
843
844 /* after early param, so could get panic from serial */ 817 /* after early param, so could get panic from serial */
845 reserve_early_setup_data(); 818 memblock_x86_reserve_range_setup_data();
846 819
847 if (acpi_mps_check()) { 820 if (acpi_mps_check()) {
848#ifdef CONFIG_X86_LOCAL_APIC 821#ifdef CONFIG_X86_LOCAL_APIC
@@ -863,8 +836,6 @@ void __init setup_arch(char **cmdline_p)
863 836
864 dmi_scan_machine(); 837 dmi_scan_machine();
865 838
866 dmi_check_system(bad_bios_dmi_table);
867
868 /* 839 /*
869 * VMware detection requires dmi to be available, so this 840 * VMware detection requires dmi to be available, so this
870 * needs to be done after dmi_scan_machine, for the BP. 841 * needs to be done after dmi_scan_machine, for the BP.
@@ -897,8 +868,6 @@ void __init setup_arch(char **cmdline_p)
897 */ 868 */
898 max_pfn = e820_end_of_ram_pfn(); 869 max_pfn = e820_end_of_ram_pfn();
899 870
900 /* preallocate 4k for mptable mpc */
901 early_reserve_e820_mpc_new();
902 /* update e820 for memory not covered by WB MTRRs */ 871 /* update e820 for memory not covered by WB MTRRs */
903 mtrr_bp_init(); 872 mtrr_bp_init();
904 if (mtrr_trim_uncached_memory(max_pfn)) 873 if (mtrr_trim_uncached_memory(max_pfn))
@@ -920,18 +889,8 @@ void __init setup_arch(char **cmdline_p)
920 max_low_pfn = max_pfn; 889 max_low_pfn = max_pfn;
921 890
922 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; 891 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
923 max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT;
924#endif 892#endif
925 893
926#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
927 setup_bios_corruption_check();
928#endif
929
930 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
931 max_pfn_mapped<<PAGE_SHIFT);
932
933 reserve_brk();
934
935 /* 894 /*
936 * Find and reserve possible boot-time SMP configuration: 895 * Find and reserve possible boot-time SMP configuration:
937 */ 896 */
@@ -939,15 +898,37 @@ void __init setup_arch(char **cmdline_p)
939 898
940 reserve_ibft_region(); 899 reserve_ibft_region();
941 900
942 reserve_trampoline_memory(); 901 /*
 902 * Need to finalize the brk area before memblock_x86_fill();
 903 * otherwise it could hand out ranges from memblock_find_in_range
 904 * that overlap the brk area.
905 */
906 reserve_brk();
907
908 cleanup_highmap();
909
910 memblock.current_limit = get_max_mapped();
911 memblock_x86_fill();
943 912
944#ifdef CONFIG_ACPI_SLEEP
945 /* 913 /*
946 * Reserve low memory region for sleep support. 914 * The EFI specification says that boot service code won't be called
947 * even before init_memory_mapping 915 * after ExitBootServices(). This is, in fact, a lie.
948 */ 916 */
949 acpi_reserve_wakeup_memory(); 917 if (efi_enabled)
918 efi_reserve_boot_services();
919
920 /* preallocate 4k for mptable mpc */
921 early_reserve_e820_mpc_new();
922
923#ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION
924 setup_bios_corruption_check();
950#endif 925#endif
926
927 printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n",
928 max_pfn_mapped<<PAGE_SHIFT);
929
930 setup_trampolines();
931
951 init_gbpages(); 932 init_gbpages();
952 933
953 /* max_pfn_mapped is updated here */ 934 /* max_pfn_mapped is updated here */
@@ -962,6 +943,7 @@ void __init setup_arch(char **cmdline_p)
962 max_low_pfn = max_pfn; 943 max_low_pfn = max_pfn;
963 } 944 }
964#endif 945#endif
946 memblock.current_limit = get_max_mapped();
965 947
966 /* 948 /*
967 * NOTE: On x86-32, only from this point on, fixmaps are ready for use. 949 * NOTE: On x86-32, only from this point on, fixmaps are ready for use.
@@ -971,6 +953,8 @@ void __init setup_arch(char **cmdline_p)
971 if (init_ohci1394_dma_early) 953 if (init_ohci1394_dma_early)
972 init_ohci1394_dma_on_all_controllers(); 954 init_ohci1394_dma_on_all_controllers();
973#endif 955#endif
956 /* Allocate bigger log buffer */
957 setup_log_buf(1);
974 958
975 reserve_initrd(); 959 reserve_initrd();
976 960
@@ -987,24 +971,8 @@ void __init setup_arch(char **cmdline_p)
987 971
988 early_acpi_boot_init(); 972 early_acpi_boot_init();
989 973
990#ifdef CONFIG_ACPI_NUMA 974 initmem_init();
991 /* 975 memblock_find_dma_reserve();
992 * Parse SRAT to discover nodes.
993 */
994 acpi = acpi_numa_init();
995#endif
996
997#ifdef CONFIG_K8_NUMA
998 if (!acpi)
999 k8 = !k8_numa_init(0, max_pfn);
1000#endif
1001
1002 initmem_init(0, max_pfn, acpi, k8);
1003#ifndef CONFIG_NO_BOOTMEM
1004 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
1005#endif
1006
1007 dma32_reserve_bootmem();
1008 976
1009#ifdef CONFIG_KVM_CLOCK 977#ifdef CONFIG_KVM_CLOCK
1010 kvmclock_init(); 978 kvmclock_init();
@@ -1014,7 +982,17 @@ void __init setup_arch(char **cmdline_p)
1014 paging_init(); 982 paging_init();
1015 x86_init.paging.pagetable_setup_done(swapper_pg_dir); 983 x86_init.paging.pagetable_setup_done(swapper_pg_dir);
1016 984
1017 setup_trampoline_page_table(); 985 if (boot_cpu_data.cpuid_level >= 0) {
986 /* A CPU has %cr4 if and only if it has CPUID */
987 mmu_cr4_features = read_cr4();
988 }
989
990#ifdef CONFIG_X86_32
991 /* sync back kernel address range */
992 clone_pgd_range(initial_page_table + KERNEL_PGD_BOUNDARY,
993 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
994 KERNEL_PGD_PTRS);
995#endif
1018 996
1019 tboot_probe(); 997 tboot_probe();
1020 998
@@ -1030,8 +1008,8 @@ void __init setup_arch(char **cmdline_p)
1030 * Read APIC and some other early information from ACPI tables. 1008 * Read APIC and some other early information from ACPI tables.
1031 */ 1009 */
1032 acpi_boot_init(); 1010 acpi_boot_init();
1033
1034 sfi_init(); 1011 sfi_init();
1012 x86_dtb_init();
1035 1013
1036 /* 1014 /*
1037 * get boot-time SMP configuration: 1015 * get boot-time SMP configuration:
@@ -1041,15 +1019,10 @@ void __init setup_arch(char **cmdline_p)
1041 1019
1042 prefill_possible_map(); 1020 prefill_possible_map();
1043 1021
1044#ifdef CONFIG_X86_64
1045 init_cpu_to_node(); 1022 init_cpu_to_node();
1046#endif
1047 1023
1048 init_apic_mappings(); 1024 init_apic_mappings();
1049 ioapic_init_mappings(); 1025 ioapic_and_gsi_init();
1050
1051 /* need to wait for io_apic is mapped */
1052 probe_nr_irqs_gsi();
1053 1026
1054 kvm_guest_init(); 1027 kvm_guest_init();
1055 1028
@@ -1070,7 +1043,11 @@ void __init setup_arch(char **cmdline_p)
1070#endif 1043#endif
1071 x86_init.oem.banner(); 1044 x86_init.oem.banner();
1072 1045
1046 x86_init.timers.wallclock_init();
1047
1073 mcheck_init(); 1048 mcheck_init();
1049
1050 arch_init_ideal_nops();
1074} 1051}
1075 1052
1076#ifdef CONFIG_X86_32 1053#ifdef CONFIG_X86_32
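Across the setup.c hunks the old early_res calls (find_e820_area, reserve_early, free_early) become memblock calls (memblock_find_in_range, memblock_x86_reserve_range, memblock_x86_free_range), and failure is now signalled by MEMBLOCK_ERROR. The common find-check-reserve idiom, sketched with the error sentinel and region label left as placeholders:

#define MEMBLOCK_ERROR_SKETCH 0ULL   /* stand-in for MEMBLOCK_ERROR */

extern unsigned long long memblock_find_in_range(unsigned long long start,
                                                 unsigned long long end,
                                                 unsigned long long size,
                                                 unsigned long long align);
extern void memblock_x86_reserve_range(unsigned long long start,
                                       unsigned long long end,
                                       const char *name);

static int reserve_region_sketch(unsigned long long size,
                                 unsigned long long align,
                                 unsigned long long limit)
{
        unsigned long long base;

        base = memblock_find_in_range(align, limit, size, align);
        if (base == MEMBLOCK_ERROR_SKETCH)
                return -1;              /* nothing suitable below the limit */

        memblock_x86_reserve_range(base, base + size, "SKETCH");
        return 0;
}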
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index a60df9ae6454..71f4727da373 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -131,13 +131,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
131 131
132static void __init pcpu_fc_free(void *ptr, size_t size) 132static void __init pcpu_fc_free(void *ptr, size_t size)
133{ 133{
134#ifdef CONFIG_NO_BOOTMEM
135 u64 start = __pa(ptr);
136 u64 end = start + size;
137 free_early_partial(start, end);
138#else
139 free_bootmem(__pa(ptr), size); 134 free_bootmem(__pa(ptr), size);
140#endif
141} 135}
142 136
143static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 137static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -231,10 +225,15 @@ void __init setup_per_cpu_areas(void)
231 per_cpu(x86_bios_cpu_apicid, cpu) = 225 per_cpu(x86_bios_cpu_apicid, cpu) =
232 early_per_cpu_map(x86_bios_cpu_apicid, cpu); 226 early_per_cpu_map(x86_bios_cpu_apicid, cpu);
233#endif 227#endif
228#ifdef CONFIG_X86_32
229 per_cpu(x86_cpu_to_logical_apicid, cpu) =
230 early_per_cpu_map(x86_cpu_to_logical_apicid, cpu);
231#endif
234#ifdef CONFIG_X86_64 232#ifdef CONFIG_X86_64
235 per_cpu(irq_stack_ptr, cpu) = 233 per_cpu(irq_stack_ptr, cpu) =
236 per_cpu(irq_stack_union.irq_stack, cpu) + 234 per_cpu(irq_stack_union.irq_stack, cpu) +
237 IRQ_STACK_SIZE - 64; 235 IRQ_STACK_SIZE - 64;
236#endif
238#ifdef CONFIG_NUMA 237#ifdef CONFIG_NUMA
239 per_cpu(x86_cpu_to_node_map, cpu) = 238 per_cpu(x86_cpu_to_node_map, cpu) =
240 early_per_cpu_map(x86_cpu_to_node_map, cpu); 239 early_per_cpu_map(x86_cpu_to_node_map, cpu);
@@ -248,12 +247,11 @@ void __init setup_per_cpu_areas(void)
248 */ 247 */
249 set_cpu_numa_node(cpu, early_cpu_to_node(cpu)); 248 set_cpu_numa_node(cpu, early_cpu_to_node(cpu));
250#endif 249#endif
251#endif
252 /* 250 /*
253 * Up to this point, the boot CPU has been using .init.data 251 * Up to this point, the boot CPU has been using .init.data
254 * area. Reload any changed state for the boot CPU. 252 * area. Reload any changed state for the boot CPU.
255 */ 253 */
256 if (cpu == boot_cpu_id) 254 if (!cpu)
257 switch_to_new_gdt(cpu); 255 switch_to_new_gdt(cpu);
258 } 256 }
259 257
@@ -262,7 +260,10 @@ void __init setup_per_cpu_areas(void)
262 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL; 260 early_per_cpu_ptr(x86_cpu_to_apicid) = NULL;
263 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL; 261 early_per_cpu_ptr(x86_bios_cpu_apicid) = NULL;
264#endif 262#endif
265#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) 263#ifdef CONFIG_X86_32
264 early_per_cpu_ptr(x86_cpu_to_logical_apicid) = NULL;
265#endif
266#ifdef CONFIG_NUMA
266 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; 267 early_per_cpu_ptr(x86_cpu_to_node_map) = NULL;
267#endif 268#endif
268 269
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 4fd173cd8e57..40a24932a8a1 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -601,10 +601,7 @@ long sys_rt_sigreturn(struct pt_regs *regs)
601 goto badframe; 601 goto badframe;
602 602
603 sigdelsetmask(&set, ~_BLOCKABLE); 603 sigdelsetmask(&set, ~_BLOCKABLE);
604 spin_lock_irq(&current->sighand->siglock); 604 set_current_blocked(&set);
605 current->blocked = set;
606 recalc_sigpending();
607 spin_unlock_irq(&current->sighand->siglock);
608 605
609 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) 606 if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
610 goto badframe; 607 goto badframe;
@@ -682,6 +679,7 @@ static int
682handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka, 679handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
683 sigset_t *oldset, struct pt_regs *regs) 680 sigset_t *oldset, struct pt_regs *regs)
684{ 681{
682 sigset_t blocked;
685 int ret; 683 int ret;
686 684
687 /* Are we from a system call? */ 685 /* Are we from a system call? */
@@ -741,12 +739,10 @@ handle_signal(unsigned long sig, siginfo_t *info, struct k_sigaction *ka,
741 */ 739 */
742 regs->flags &= ~X86_EFLAGS_TF; 740 regs->flags &= ~X86_EFLAGS_TF;
743 741
744 spin_lock_irq(&current->sighand->siglock); 742 sigorsets(&blocked, &current->blocked, &ka->sa.sa_mask);
745 sigorsets(&current->blocked, &current->blocked, &ka->sa.sa_mask);
746 if (!(ka->sa.sa_flags & SA_NODEFER)) 743 if (!(ka->sa.sa_flags & SA_NODEFER))
747 sigaddset(&current->blocked, sig); 744 sigaddset(&blocked, sig);
748 recalc_sigpending(); 745 set_current_blocked(&blocked);
749 spin_unlock_irq(&current->sighand->siglock);
750 746
751 tracehook_signal_handler(sig, info, ka, regs, 747 tracehook_signal_handler(sig, info, ka, regs,
752 test_thread_flag(TIF_SINGLESTEP)); 748 test_thread_flag(TIF_SINGLESTEP));
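The signal.c hunks above drop the open-coded siglock and recalc_sigpending() handling: the new mask is composed in a local sigset_t and installed with one set_current_blocked() call. The compose-then-install shape, sketched with a plain bitmask standing in for sigset_t:

struct task_sketch { unsigned long blocked; };

/* Stand-in for set_current_blocked(); the real helper takes the
 * siglock and recalculates pending signals. */
static void install_blocked(struct task_sketch *t, unsigned long newset)
{
        t->blocked = newset;
}

static void handle_signal_sketch(struct task_sketch *t, int sig,
                                 unsigned long sa_mask, int sa_nodefer)
{
        unsigned long blocked = t->blocked | sa_mask;  /* sigorsets() */

        if (!sa_nodefer)
                blocked |= 1UL << (sig - 1);           /* sigaddset() */

        install_blocked(t, blocked);                   /* one-shot install */
}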
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 74cca6014c0e..ed4c4f54e2ae 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -174,10 +174,10 @@ asmlinkage void smp_reboot_interrupt(void)
174 irq_exit(); 174 irq_exit();
175} 175}
176 176
177static void native_smp_send_stop(void) 177static void native_stop_other_cpus(int wait)
178{ 178{
179 unsigned long flags; 179 unsigned long flags;
180 unsigned long wait; 180 unsigned long timeout;
181 181
182 if (reboot_force) 182 if (reboot_force)
183 return; 183 return;
@@ -194,9 +194,12 @@ static void native_smp_send_stop(void)
194 if (num_online_cpus() > 1) { 194 if (num_online_cpus() > 1) {
195 apic->send_IPI_allbutself(REBOOT_VECTOR); 195 apic->send_IPI_allbutself(REBOOT_VECTOR);
196 196
197 /* Don't wait longer than a second */ 197 /*
198 wait = USEC_PER_SEC; 198 * Don't wait longer than a second if the caller
199 while (num_online_cpus() > 1 && wait--) 199 * didn't ask us to wait.
200 */
201 timeout = USEC_PER_SEC;
202 while (num_online_cpus() > 1 && (wait || timeout--))
200 udelay(1); 203 udelay(1);
201 } 204 }
202 205
@@ -206,9 +209,7 @@ static void native_smp_send_stop(void)
206} 209}
207 210
208/* 211/*
209 * Reschedule call back. Nothing to do, 212 * Reschedule call back.
210 * all the work is done automatically when
211 * we return from the interrupt.
212 */ 213 */
213void smp_reschedule_interrupt(struct pt_regs *regs) 214void smp_reschedule_interrupt(struct pt_regs *regs)
214{ 215{
@@ -216,6 +217,11 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
216 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */ 217 /* LITMUS^RT: this IPI might need to trigger the sched state machine. */
217 sched_state_ipi(); 218 sched_state_ipi();
218 inc_irq_stat(irq_resched_count); 219 inc_irq_stat(irq_resched_count);
220 /*
221 * LITMUS^RT: starting from 3.0 schedule_ipi() actually does something.
222 * This may increase IPI latencies compared with previous versions.
223 */
224 scheduler_ipi();
219 TS_SEND_RESCHED_END; 225 TS_SEND_RESCHED_END;
220 /* 226 /*
221 * KVM uses this interrupt to force a cpu out of guest mode 227 * KVM uses this interrupt to force a cpu out of guest mode
@@ -254,7 +260,7 @@ struct smp_ops smp_ops = {
254 .smp_prepare_cpus = native_smp_prepare_cpus, 260 .smp_prepare_cpus = native_smp_prepare_cpus,
255 .smp_cpus_done = native_smp_cpus_done, 261 .smp_cpus_done = native_smp_cpus_done,
256 262
257 .smp_send_stop = native_smp_send_stop, 263 .stop_other_cpus = native_stop_other_cpus,
258 .smp_send_reschedule = native_smp_send_reschedule, 264 .smp_send_reschedule = native_smp_send_reschedule,
259 265
260 .cpu_up = native_cpu_up, 266 .cpu_up = native_cpu_up,
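In the smp.c hunks above, smp_send_stop() becomes native_stop_other_cpus(int wait): the one-second bound on the spin now applies only when the caller did not ask to wait. A small, self-contained sketch of that policy; other_cpus_online() and the decrementing counter are stand-ins for the real num_online_cpus() check and the REBOOT_VECTOR IPI acknowledgements:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define USEC_PER_SEC 1000000UL

static int other_cpus = 3;              /* pretend three other CPUs are up */

static bool other_cpus_online(void)
{
	if (other_cpus)
		other_cpus--;           /* one CPU acks the stop IPI per poll */
	return other_cpus > 0;
}

static void stop_other_cpus(int wait)
{
	unsigned long timeout = USEC_PER_SEC;

	/* wait != 0: spin until every other CPU is gone;
	 * wait == 0: give up after roughly one second */
	while (other_cpus_online() && (wait || timeout--))
		usleep(1);              /* udelay(1) in the kernel */
}

int main(void)
{
	stop_other_cpus(0);
	printf("other CPUs remaining: %d\n", other_cpus);
	return 0;
}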
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 8b3bfc4dd708..9fd3137230d4 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -62,8 +62,9 @@
62#include <asm/pgtable.h> 62#include <asm/pgtable.h>
63#include <asm/tlbflush.h> 63#include <asm/tlbflush.h>
64#include <asm/mtrr.h> 64#include <asm/mtrr.h>
65#include <asm/vmi.h> 65#include <asm/mwait.h>
66#include <asm/apic.h> 66#include <asm/apic.h>
67#include <asm/io_apic.h>
67#include <asm/setup.h> 68#include <asm/setup.h>
68#include <asm/uv/uv.h> 69#include <asm/uv/uv.h>
69#include <linux/mc146818rtc.h> 70#include <linux/mc146818rtc.h>
@@ -71,10 +72,6 @@
71#include <asm/smpboot_hooks.h> 72#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h> 73#include <asm/i8259.h>
73 74
74#ifdef CONFIG_X86_32
75u8 apicid_2_node[MAX_APICID];
76#endif
77
78/* State of each CPU */ 75/* State of each CPU */
79DEFINE_PER_CPU(int, cpu_state) = { 0 }; 76DEFINE_PER_CPU(int, cpu_state) = { 0 };
80 77
@@ -97,12 +94,12 @@ static DEFINE_PER_CPU(struct task_struct *, idle_thread_array);
97 */ 94 */
98static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex); 95static DEFINE_MUTEX(x86_cpu_hotplug_driver_mutex);
99 96
100void cpu_hotplug_driver_lock() 97void cpu_hotplug_driver_lock(void)
101{ 98{
102 mutex_lock(&x86_cpu_hotplug_driver_mutex); 99 mutex_lock(&x86_cpu_hotplug_driver_mutex);
103} 100}
104 101
105void cpu_hotplug_driver_unlock() 102void cpu_hotplug_driver_unlock(void)
106{ 103{
107 mutex_unlock(&x86_cpu_hotplug_driver_mutex); 104 mutex_unlock(&x86_cpu_hotplug_driver_mutex);
108} 105}
@@ -130,68 +127,14 @@ EXPORT_PER_CPU_SYMBOL(cpu_sibling_map);
130DEFINE_PER_CPU(cpumask_var_t, cpu_core_map); 127DEFINE_PER_CPU(cpumask_var_t, cpu_core_map);
131EXPORT_PER_CPU_SYMBOL(cpu_core_map); 128EXPORT_PER_CPU_SYMBOL(cpu_core_map);
132 129
130DEFINE_PER_CPU(cpumask_var_t, cpu_llc_shared_map);
131
133/* Per CPU bogomips and other parameters */ 132/* Per CPU bogomips and other parameters */
134DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info); 133DEFINE_PER_CPU_SHARED_ALIGNED(struct cpuinfo_x86, cpu_info);
135EXPORT_PER_CPU_SYMBOL(cpu_info); 134EXPORT_PER_CPU_SYMBOL(cpu_info);
136 135
137atomic_t init_deasserted; 136atomic_t init_deasserted;
138 137
139#if defined(CONFIG_NUMA) && defined(CONFIG_X86_32)
140/* which node each logical CPU is on */
141int cpu_to_node_map[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = 0 };
142EXPORT_SYMBOL(cpu_to_node_map);
143
144/* set up a mapping between cpu and node. */
145static void map_cpu_to_node(int cpu, int node)
146{
147 printk(KERN_INFO "Mapping cpu %d to node %d\n", cpu, node);
148 cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
149 cpu_to_node_map[cpu] = node;
150}
151
152/* undo a mapping between cpu and node. */
153static void unmap_cpu_to_node(int cpu)
154{
155 int node;
156
157 printk(KERN_INFO "Unmapping cpu %d from all nodes\n", cpu);
158 for (node = 0; node < MAX_NUMNODES; node++)
159 cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
160 cpu_to_node_map[cpu] = 0;
161}
162#else /* !(CONFIG_NUMA && CONFIG_X86_32) */
163#define map_cpu_to_node(cpu, node) ({})
164#define unmap_cpu_to_node(cpu) ({})
165#endif
166
167#ifdef CONFIG_X86_32
168static int boot_cpu_logical_apicid;
169
170u8 cpu_2_logical_apicid[NR_CPUS] __read_mostly =
171 { [0 ... NR_CPUS-1] = BAD_APICID };
172
173static void map_cpu_to_logical_apicid(void)
174{
175 int cpu = smp_processor_id();
176 int apicid = logical_smp_processor_id();
177 int node = apic->apicid_to_node(apicid);
178
179 if (!node_online(node))
180 node = first_online_node;
181
182 cpu_2_logical_apicid[cpu] = apicid;
183 map_cpu_to_node(cpu, node);
184}
185
186void numa_remove_cpu(int cpu)
187{
188 cpu_2_logical_apicid[cpu] = BAD_APICID;
189 unmap_cpu_to_node(cpu);
190}
191#else
192#define map_cpu_to_logical_apicid() do {} while (0)
193#endif
194
195/* 138/*
196 * Report back to the Boot Processor. 139 * Report back to the Boot Processor.
197 * Running on AP. 140 * Running on AP.
@@ -259,7 +202,6 @@ static void __cpuinit smp_callin(void)
259 apic->smp_callin_clear_local_apic(); 202 apic->smp_callin_clear_local_apic();
260 setup_local_APIC(); 203 setup_local_APIC();
261 end_local_APIC_setup(); 204 end_local_APIC_setup();
262 map_cpu_to_logical_apicid();
263 205
264 /* 206 /*
265 * Need to setup vector mappings before we enable interrupts. 207 * Need to setup vector mappings before we enable interrupts.
@@ -281,6 +223,13 @@ static void __cpuinit smp_callin(void)
281 */ 223 */
282 smp_store_cpu_info(cpuid); 224 smp_store_cpu_info(cpuid);
283 225
226 /*
227 * This must be done before setting cpu_online_mask
228 * or calling notify_cpu_starting.
229 */
230 set_cpu_sibling_map(raw_smp_processor_id());
231 wmb();
232
284 notify_cpu_starting(cpuid); 233 notify_cpu_starting(cpuid);
285 234
286 /* 235 /*
@@ -299,23 +248,16 @@ notrace static void __cpuinit start_secondary(void *unused)
299 * fragile that we want to limit the things done here to the 248 * fragile that we want to limit the things done here to the
300 * most necessary things. 249 * most necessary things.
301 */ 250 */
251 cpu_init();
252 preempt_disable();
253 smp_callin();
302 254
303#ifdef CONFIG_X86_32 255#ifdef CONFIG_X86_32
304 /* 256 /* switch away from the initial page table */
305 * Switch away from the trampoline page-table
306 *
307 * Do this before cpu_init() because it needs to access per-cpu
308 * data which may not be mapped in the trampoline page-table.
309 */
310 load_cr3(swapper_pg_dir); 257 load_cr3(swapper_pg_dir);
311 __flush_tlb_all(); 258 __flush_tlb_all();
312#endif 259#endif
313 260
314 vmi_bringup();
315 cpu_init();
316 preempt_disable();
317 smp_callin();
318
319 /* otherwise gcc will move up smp_processor_id before the cpu_init */ 261 /* otherwise gcc will move up smp_processor_id before the cpu_init */
320 barrier(); 262 barrier();
321 /* 263 /*
@@ -323,16 +265,6 @@ notrace static void __cpuinit start_secondary(void *unused)
323 */ 265 */
324 check_tsc_sync_target(); 266 check_tsc_sync_target();
325 267
326 if (nmi_watchdog == NMI_IO_APIC) {
327 legacy_pic->chip->mask(0);
328 enable_NMI_through_LVT0();
329 legacy_pic->chip->unmask(0);
330 }
331
332 /* This must be done before setting cpu_online_mask */
333 set_cpu_sibling_map(raw_smp_processor_id());
334 wmb();
335
336 /* 268 /*
337 * We need to hold call_lock, so there is no inconsistency 269 * We need to hold call_lock, so there is no inconsistency
338 * between the time smp_call_function() determines number of 270 * between the time smp_call_function() determines number of
@@ -353,6 +285,19 @@ notrace static void __cpuinit start_secondary(void *unused)
353 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 285 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
354 x86_platform.nmi_init(); 286 x86_platform.nmi_init();
355 287
288 /*
289 * Wait until the cpu which brought this one up marked it
290 * online before enabling interrupts. If we don't do that then
291 * we can end up waking up the softirq thread before this cpu
292 * reached the active state, which makes the scheduler unhappy
293 * and schedule the softirq thread on the wrong cpu. This is
294 * only observable with forced threaded interrupts, but in
295 * theory it could also happen w/o them. It's just way harder
296 * to achieve.
297 */
298 while (!cpumask_test_cpu(smp_processor_id(), cpu_active_mask))
299 cpu_relax();
300
356 /* enable local interrupts */ 301 /* enable local interrupts */
357 local_irq_enable(); 302 local_irq_enable();
358 303
@@ -365,23 +310,6 @@ notrace static void __cpuinit start_secondary(void *unused)
365 cpu_idle(); 310 cpu_idle();
366} 311}
367 312
368#ifdef CONFIG_CPUMASK_OFFSTACK
369/* In this case, llc_shared_map is a pointer to a cpumask. */
370static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
371 const struct cpuinfo_x86 *src)
372{
373 struct cpumask *llc = dst->llc_shared_map;
374 *dst = *src;
375 dst->llc_shared_map = llc;
376}
377#else
378static inline void copy_cpuinfo_x86(struct cpuinfo_x86 *dst,
379 const struct cpuinfo_x86 *src)
380{
381 *dst = *src;
382}
383#endif /* CONFIG_CPUMASK_OFFSTACK */
384
385/* 313/*
386 * The bootstrap kernel entry code has set these up. Save them for 314 * The bootstrap kernel entry code has set these up. Save them for
387 * a given CPU 315 * a given CPU
@@ -391,12 +319,22 @@ void __cpuinit smp_store_cpu_info(int id)
391{ 319{
392 struct cpuinfo_x86 *c = &cpu_data(id); 320 struct cpuinfo_x86 *c = &cpu_data(id);
393 321
394 copy_cpuinfo_x86(c, &boot_cpu_data); 322 *c = boot_cpu_data;
395 c->cpu_index = id; 323 c->cpu_index = id;
396 if (id != 0) 324 if (id != 0)
397 identify_secondary_cpu(c); 325 identify_secondary_cpu(c);
398} 326}
399 327
328static void __cpuinit link_thread_siblings(int cpu1, int cpu2)
329{
330 cpumask_set_cpu(cpu1, cpu_sibling_mask(cpu2));
331 cpumask_set_cpu(cpu2, cpu_sibling_mask(cpu1));
332 cpumask_set_cpu(cpu1, cpu_core_mask(cpu2));
333 cpumask_set_cpu(cpu2, cpu_core_mask(cpu1));
334 cpumask_set_cpu(cpu1, cpu_llc_shared_mask(cpu2));
335 cpumask_set_cpu(cpu2, cpu_llc_shared_mask(cpu1));
336}
337
400 338
401void __cpuinit set_cpu_sibling_map(int cpu) 339void __cpuinit set_cpu_sibling_map(int cpu)
402{ 340{
@@ -409,23 +347,23 @@ void __cpuinit set_cpu_sibling_map(int cpu)
409 for_each_cpu(i, cpu_sibling_setup_mask) { 347 for_each_cpu(i, cpu_sibling_setup_mask) {
410 struct cpuinfo_x86 *o = &cpu_data(i); 348 struct cpuinfo_x86 *o = &cpu_data(i);
411 349
412 if (c->phys_proc_id == o->phys_proc_id && 350 if (cpu_has(c, X86_FEATURE_TOPOEXT)) {
413 c->cpu_core_id == o->cpu_core_id) { 351 if (c->phys_proc_id == o->phys_proc_id &&
414 cpumask_set_cpu(i, cpu_sibling_mask(cpu)); 352 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i) &&
415 cpumask_set_cpu(cpu, cpu_sibling_mask(i)); 353 c->compute_unit_id == o->compute_unit_id)
416 cpumask_set_cpu(i, cpu_core_mask(cpu)); 354 link_thread_siblings(cpu, i);
417 cpumask_set_cpu(cpu, cpu_core_mask(i)); 355 } else if (c->phys_proc_id == o->phys_proc_id &&
418 cpumask_set_cpu(i, c->llc_shared_map); 356 c->cpu_core_id == o->cpu_core_id) {
419 cpumask_set_cpu(cpu, o->llc_shared_map); 357 link_thread_siblings(cpu, i);
420 } 358 }
421 } 359 }
422 } else { 360 } else {
423 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu)); 361 cpumask_set_cpu(cpu, cpu_sibling_mask(cpu));
424 } 362 }
425 363
426 cpumask_set_cpu(cpu, c->llc_shared_map); 364 cpumask_set_cpu(cpu, cpu_llc_shared_mask(cpu));
427 365
428 if (current_cpu_data.x86_max_cores == 1) { 366 if (__this_cpu_read(cpu_info.x86_max_cores) == 1) {
429 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu)); 367 cpumask_copy(cpu_core_mask(cpu), cpu_sibling_mask(cpu));
430 c->booted_cores = 1; 368 c->booted_cores = 1;
431 return; 369 return;
@@ -434,8 +372,8 @@ void __cpuinit set_cpu_sibling_map(int cpu)
434 for_each_cpu(i, cpu_sibling_setup_mask) { 372 for_each_cpu(i, cpu_sibling_setup_mask) {
435 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID && 373 if (per_cpu(cpu_llc_id, cpu) != BAD_APICID &&
436 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) { 374 per_cpu(cpu_llc_id, cpu) == per_cpu(cpu_llc_id, i)) {
437 cpumask_set_cpu(i, c->llc_shared_map); 375 cpumask_set_cpu(i, cpu_llc_shared_mask(cpu));
438 cpumask_set_cpu(cpu, cpu_data(i).llc_shared_map); 376 cpumask_set_cpu(cpu, cpu_llc_shared_mask(i));
439 } 377 }
440 if (c->phys_proc_id == cpu_data(i).phys_proc_id) { 378 if (c->phys_proc_id == cpu_data(i).phys_proc_id) {
441 cpumask_set_cpu(i, cpu_core_mask(cpu)); 379 cpumask_set_cpu(i, cpu_core_mask(cpu));
@@ -474,7 +412,7 @@ const struct cpumask *cpu_coregroup_mask(int cpu)
474 !(cpu_has(c, X86_FEATURE_AMD_DCM))) 412 !(cpu_has(c, X86_FEATURE_AMD_DCM)))
475 return cpu_core_mask(cpu); 413 return cpu_core_mask(cpu);
476 else 414 else
477 return c->llc_shared_map; 415 return cpu_llc_shared_mask(cpu);
478} 416}
479 417
480static void impress_friends(void) 418static void impress_friends(void)
@@ -636,7 +574,7 @@ wakeup_secondary_cpu_via_init(int phys_apicid, unsigned long start_eip)
636 * target processor state. 574 * target processor state.
637 */ 575 */
638 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary, 576 startup_ipi_hook(phys_apicid, (unsigned long) start_secondary,
639 (unsigned long)stack_start.sp); 577 stack_start);
640 578
641 /* 579 /*
642 * Run STARTUP IPI loop. 580 * Run STARTUP IPI loop.
@@ -742,7 +680,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
742 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 680 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
743 }; 681 };
744 682
745 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle); 683 INIT_WORK_ONSTACK(&c_idle.work, do_fork_idle);
746 684
747 alternatives_smp_switch(1); 685 alternatives_smp_switch(1);
748 686
@@ -774,7 +712,6 @@ do_rest:
774#ifdef CONFIG_X86_32 712#ifdef CONFIG_X86_32
775 /* Stack for startup_32 can be just as for start_secondary onwards */ 713 /* Stack for startup_32 can be just as for start_secondary onwards */
776 irq_ctx_init(cpu); 714 irq_ctx_init(cpu);
777 initial_page_table = __pa(&trampoline_pg_dir);
778#else 715#else
779 clear_tsk_thread_flag(c_idle.idle, TIF_FORK); 716 clear_tsk_thread_flag(c_idle.idle, TIF_FORK);
780 initial_gs = per_cpu_offset(cpu); 717 initial_gs = per_cpu_offset(cpu);
@@ -784,10 +721,10 @@ do_rest:
784#endif 721#endif
785 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); 722 early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
786 initial_code = (unsigned long)start_secondary; 723 initial_code = (unsigned long)start_secondary;
787 stack_start.sp = (void *) c_idle.idle->thread.sp; 724 stack_start = c_idle.idle->thread.sp;
788 725
789 /* start_ip had better be page-aligned! */ 726 /* start_ip had better be page-aligned! */
790 start_ip = setup_trampoline(); 727 start_ip = trampoline_address();
791 728
792 /* So we see what's up */ 729 /* So we see what's up */
793 announce_cpu(cpu, apicid); 730 announce_cpu(cpu, apicid);
@@ -797,6 +734,8 @@ do_rest:
797 * the targeted processor. 734 * the targeted processor.
798 */ 735 */
799 736
737 printk(KERN_DEBUG "smpboot cpu %d: start_ip = %lx\n", cpu, start_ip);
738
800 atomic_set(&init_deasserted, 0); 739 atomic_set(&init_deasserted, 0);
801 740
802 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 741 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
@@ -850,8 +789,8 @@ do_rest:
850 pr_debug("CPU%d: has booted.\n", cpu); 789 pr_debug("CPU%d: has booted.\n", cpu);
851 else { 790 else {
852 boot_error = 1; 791 boot_error = 1;
853 if (*((volatile unsigned char *)trampoline_base) 792 if (*(volatile u32 *)TRAMPOLINE_SYM(trampoline_status)
854 == 0xA5) 793 == 0xA5A5A5A5)
855 /* trampoline started but...? */ 794 /* trampoline started but...? */
856 pr_err("CPU%d: Stuck ??\n", cpu); 795 pr_err("CPU%d: Stuck ??\n", cpu);
857 else 796 else
@@ -877,7 +816,7 @@ do_rest:
877 } 816 }
878 817
879 /* mark "stuck" area as not stuck */ 818 /* mark "stuck" area as not stuck */
880 *((volatile unsigned long *)trampoline_base) = 0; 819 *(volatile u32 *)TRAMPOLINE_SYM(trampoline_status) = 0;
881 820
882 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) { 821 if (get_uv_system_type() != UV_NON_UNIQUE_APIC) {
883 /* 822 /*
@@ -923,7 +862,6 @@ int __cpuinit native_cpu_up(unsigned int cpu)
923 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; 862 per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
924 863
925 err = do_boot_cpu(apicid, cpu); 864 err = do_boot_cpu(apicid, cpu);
926
927 if (err) { 865 if (err) {
928 pr_debug("do_boot_cpu failed %d\n", err); 866 pr_debug("do_boot_cpu failed %d\n", err);
929 return -EIO; 867 return -EIO;
@@ -945,6 +883,14 @@ int __cpuinit native_cpu_up(unsigned int cpu)
945 return 0; 883 return 0;
946} 884}
947 885
886/**
887 * arch_disable_smp_support() - disables SMP support for x86 at runtime
888 */
889void arch_disable_smp_support(void)
890{
891 disable_ioapic_support();
892}
893
948/* 894/*
949 * Fall back to non SMP mode after errors. 895 * Fall back to non SMP mode after errors.
950 * 896 *
@@ -960,7 +906,6 @@ static __init void disable_smp(void)
960 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map); 906 physid_set_mask_of_physid(boot_cpu_physical_apicid, &phys_cpu_present_map);
961 else 907 else
962 physid_set_mask_of_physid(0, &phys_cpu_present_map); 908 physid_set_mask_of_physid(0, &phys_cpu_present_map);
963 map_cpu_to_logical_apicid();
964 cpumask_set_cpu(0, cpu_sibling_mask(0)); 909 cpumask_set_cpu(0, cpu_sibling_mask(0));
965 cpumask_set_cpu(0, cpu_core_mask(0)); 910 cpumask_set_cpu(0, cpu_core_mask(0));
966} 911}
@@ -1045,7 +990,7 @@ static int __init smp_sanity_check(unsigned max_cpus)
1045 "(tell your hw vendor)\n"); 990 "(tell your hw vendor)\n");
1046 } 991 }
1047 smpboot_clear_io_apic(); 992 smpboot_clear_io_apic();
1048 arch_disable_smp_support(); 993 disable_ioapic_support();
1049 return -1; 994 return -1;
1050 } 995 }
1051 996
@@ -1058,11 +1003,9 @@ static int __init smp_sanity_check(unsigned max_cpus)
1058 printk(KERN_INFO "SMP mode deactivated.\n"); 1003 printk(KERN_INFO "SMP mode deactivated.\n");
1059 smpboot_clear_io_apic(); 1004 smpboot_clear_io_apic();
1060 1005
1061 localise_nmi_watchdog();
1062
1063 connect_bsp_APIC(); 1006 connect_bsp_APIC();
1064 setup_local_APIC(); 1007 setup_local_APIC();
1065 end_local_APIC_setup(); 1008 bsp_end_local_APIC_setup();
1066 return -1; 1009 return -1;
1067 } 1010 }
1068 1011
@@ -1091,26 +1034,22 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1091 1034
1092 preempt_disable(); 1035 preempt_disable();
1093 smp_cpu_index_default(); 1036 smp_cpu_index_default();
1094 current_cpu_data = boot_cpu_data; 1037
1095 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1096 mb();
1097 /* 1038 /*
1098 * Setup boot CPU information 1039 * Setup boot CPU information
1099 */ 1040 */
1100 smp_store_cpu_info(0); /* Final full version of the data */ 1041 smp_store_cpu_info(0); /* Final full version of the data */
1101#ifdef CONFIG_X86_32 1042 cpumask_copy(cpu_callin_mask, cpumask_of(0));
1102 boot_cpu_logical_apicid = logical_smp_processor_id(); 1043 mb();
1103#endif 1044
1104 current_thread_info()->cpu = 0; /* needed? */ 1045 current_thread_info()->cpu = 0; /* needed? */
1105 for_each_possible_cpu(i) { 1046 for_each_possible_cpu(i) {
1106 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL); 1047 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
1107 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); 1048 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
1108 zalloc_cpumask_var(&cpu_data(i).llc_shared_map, GFP_KERNEL); 1049 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
1109 } 1050 }
1110 set_cpu_sibling_map(0); 1051 set_cpu_sibling_map(0);
1111 1052
1112 enable_IR_x2apic();
1113 default_setup_apic_routing();
1114 1053
1115 if (smp_sanity_check(max_cpus) < 0) { 1054 if (smp_sanity_check(max_cpus) < 0) {
1116 printk(KERN_INFO "SMP disabled\n"); 1055 printk(KERN_INFO "SMP disabled\n");
@@ -1118,6 +1057,8 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1118 goto out; 1057 goto out;
1119 } 1058 }
1120 1059
1060 default_setup_apic_routing();
1061
1121 preempt_disable(); 1062 preempt_disable();
1122 if (read_apic_id() != boot_cpu_physical_apicid) { 1063 if (read_apic_id() != boot_cpu_physical_apicid) {
1123 panic("Boot APIC ID in local APIC unexpected (%d vs %d)", 1064 panic("Boot APIC ID in local APIC unexpected (%d vs %d)",
@@ -1139,9 +1080,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1139 if (!skip_ioapic_setup && nr_ioapics) 1080 if (!skip_ioapic_setup && nr_ioapics)
1140 enable_IO_APIC(); 1081 enable_IO_APIC();
1141 1082
1142 end_local_APIC_setup(); 1083 bsp_end_local_APIC_setup();
1143
1144 map_cpu_to_logical_apicid();
1145 1084
1146 if (apic->setup_portio_remap) 1085 if (apic->setup_portio_remap)
1147 apic->setup_portio_remap(); 1086 apic->setup_portio_remap();
@@ -1163,6 +1102,20 @@ out:
1163 preempt_enable(); 1102 preempt_enable();
1164} 1103}
1165 1104
1105void arch_disable_nonboot_cpus_begin(void)
1106{
1107 /*
1108 * Avoid the smp alternatives switch during the disable_nonboot_cpus().
1109 * In the suspend path, we will be back in the SMP mode shortly anyways.
1110 */
1111 skip_smp_alternatives = true;
1112}
1113
1114void arch_disable_nonboot_cpus_end(void)
1115{
1116 skip_smp_alternatives = false;
1117}
1118
1166void arch_enable_nonboot_cpus_begin(void) 1119void arch_enable_nonboot_cpus_begin(void)
1167{ 1120{
1168 set_mtrr_aps_delayed_init(); 1121 set_mtrr_aps_delayed_init();
@@ -1193,7 +1146,6 @@ void __init native_smp_cpus_done(unsigned int max_cpus)
1193#ifdef CONFIG_X86_IO_APIC 1146#ifdef CONFIG_X86_IO_APIC
1194 setup_ioapic_dest(); 1147 setup_ioapic_dest();
1195#endif 1148#endif
1196 check_nmi_watchdog();
1197 mtrr_aps_init(); 1149 mtrr_aps_init();
1198} 1150}
1199 1151
@@ -1338,8 +1290,6 @@ int native_cpu_disable(void)
1338 if (cpu == 0) 1290 if (cpu == 0)
1339 return -EBUSY; 1291 return -EBUSY;
1340 1292
1341 if (nmi_watchdog == NMI_LOCAL_APIC)
1342 stop_apic_nmi_watchdog(NULL);
1343 clear_local_APIC(); 1293 clear_local_APIC();
1344 1294
1345 cpu_disable_common(); 1295 cpu_disable_common();
@@ -1370,12 +1320,11 @@ void play_dead_common(void)
1370{ 1320{
1371 idle_task_exit(); 1321 idle_task_exit();
1372 reset_lazy_tlbstate(); 1322 reset_lazy_tlbstate();
1373 irq_ctx_exit(raw_smp_processor_id()); 1323 amd_e400_remove_cpu(raw_smp_processor_id());
1374 c1e_remove_cpu(raw_smp_processor_id());
1375 1324
1376 mb(); 1325 mb();
1377 /* Ack it */ 1326 /* Ack it */
1378 __get_cpu_var(cpu_state) = CPU_DEAD; 1327 __this_cpu_write(cpu_state, CPU_DEAD);
1379 1328
1380 /* 1329 /*
1381 * With physical CPU hotplug, we should halt the cpu 1330 * With physical CPU hotplug, we should halt the cpu
@@ -1383,11 +1332,89 @@ void play_dead_common(void)
1383 local_irq_disable(); 1332 local_irq_disable();
1384} 1333}
1385 1334
1335/*
1336 * We need to flush the caches before going to sleep, lest we have
1337 * dirty data in our caches when we come back up.
1338 */
1339static inline void mwait_play_dead(void)
1340{
1341 unsigned int eax, ebx, ecx, edx;
1342 unsigned int highest_cstate = 0;
1343 unsigned int highest_subcstate = 0;
1344 int i;
1345 void *mwait_ptr;
1346 struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info);
1347
1348 if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)))
1349 return;
1350 if (!this_cpu_has(X86_FEATURE_CLFLSH))
1351 return;
1352 if (__this_cpu_read(cpu_info.cpuid_level) < CPUID_MWAIT_LEAF)
1353 return;
1354
1355 eax = CPUID_MWAIT_LEAF;
1356 ecx = 0;
1357 native_cpuid(&eax, &ebx, &ecx, &edx);
1358
1359 /*
1360 * eax will be 0 if EDX enumeration is not valid.
1361 * Initialized below to cstate, sub_cstate value when EDX is valid.
1362 */
1363 if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
1364 eax = 0;
1365 } else {
1366 edx >>= MWAIT_SUBSTATE_SIZE;
1367 for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
1368 if (edx & MWAIT_SUBSTATE_MASK) {
1369 highest_cstate = i;
1370 highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
1371 }
1372 }
1373 eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
1374 (highest_subcstate - 1);
1375 }
1376
1377 /*
1378 * This should be a memory location in a cache line which is
1379 * unlikely to be touched by other processors. The actual
1380 * content is immaterial as it is not actually modified in any way.
1381 */
1382 mwait_ptr = &current_thread_info()->flags;
1383
1384 wbinvd();
1385
1386 while (1) {
1387 /*
1388 * The CLFLUSH is a workaround for erratum AAI65 for
1389 * the Xeon 7400 series. It's not clear it is actually
1390 * needed, but it should be harmless in either case.
1391 * The WBINVD is insufficient due to the spurious-wakeup
1392 * case where we return around the loop.
1393 */
1394 clflush(mwait_ptr);
1395 __monitor(mwait_ptr, 0, 0);
1396 mb();
1397 __mwait(eax, 0);
1398 }
1399}
1400
1401static inline void hlt_play_dead(void)
1402{
1403 if (__this_cpu_read(cpu_info.x86) >= 4)
1404 wbinvd();
1405
1406 while (1) {
1407 native_halt();
1408 }
1409}
1410
1386void native_play_dead(void) 1411void native_play_dead(void)
1387{ 1412{
1388 play_dead_common(); 1413 play_dead_common();
1389 tboot_shutdown(TB_SHUTDOWN_WFS); 1414 tboot_shutdown(TB_SHUTDOWN_WFS);
1390 wbinvd_halt(); 1415
1416 mwait_play_dead(); /* Only returns on failure */
1417 hlt_play_dead();
1391} 1418}
1392 1419
1393#else /* ... !CONFIG_HOTPLUG_CPU */ 1420#else /* ... !CONFIG_HOTPLUG_CPU */
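The new mwait_play_dead() above picks the deepest C-state by reading CPUID leaf 5 and scanning EDX one 4-bit sub-state count at a time, then encodes the MWAIT hint as (cstate << 4) | (substate - 1). A userspace sketch of the same decode, assuming the compiler-provided <cpuid.h>; the constants are redefined locally to mirror the kernel's names. The CLFLUSH/MONITOR/MWAIT loop itself is omitted, since those instructions are normally ring-0 only.

#include <cpuid.h>
#include <stdio.h>

#define CPUID_MWAIT_LEAF		5
#define CPUID5_ECX_EXTENSIONS_SUPPORTED	0x1
#define MWAIT_SUBSTATE_SIZE		4
#define MWAIT_SUBSTATE_MASK		0xf

int main(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0, highest_subcstate = 0;
	unsigned int hint;
	int i;

	if (!__get_cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx))
		return 1;	/* MWAIT leaf not reported by this CPU */

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED)) {
		hint = 0;	/* EDX enumeration not valid */
	} else {
		/* skip C0's nibble, then one sub-state count per C-state */
		edx >>= MWAIT_SUBSTATE_SIZE;
		for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
			if (edx & MWAIT_SUBSTATE_MASK) {
				highest_cstate = i;
				highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
			}
		}
		hint = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		       (highest_subcstate - 1);
	}

	printf("deepest MWAIT hint: 0x%x\n", hint);
	return 0;
}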
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index b53c525368a7..55d9bc03f696 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -9,15 +9,6 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/stacktrace.h> 10#include <asm/stacktrace.h>
11 11
12static void save_stack_warning(void *data, char *msg)
13{
14}
15
16static void
17save_stack_warning_symbol(void *data, char *msg, unsigned long symbol)
18{
19}
20
21static int save_stack_stack(void *data, char *name) 12static int save_stack_stack(void *data, char *name)
22{ 13{
23 return 0; 14 return 0;
@@ -53,16 +44,12 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 44}
54 45
55static const struct stacktrace_ops save_stack_ops = { 46static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 47 .stack = save_stack_stack,
59 .address = save_stack_address, 48 .address = save_stack_address,
60 .walk_stack = print_context_stack, 49 .walk_stack = print_context_stack,
61}; 50};
62 51
63static const struct stacktrace_ops save_stack_ops_nosched = { 52static const struct stacktrace_ops save_stack_ops_nosched = {
64 .warning = save_stack_warning,
65 .warning_symbol = save_stack_warning_symbol,
66 .stack = save_stack_stack, 53 .stack = save_stack_stack,
67 .address = save_stack_address_nosched, 54 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack, 55 .walk_stack = print_context_stack,
@@ -79,9 +66,9 @@ void save_stack_trace(struct stack_trace *trace)
79} 66}
80EXPORT_SYMBOL_GPL(save_stack_trace); 67EXPORT_SYMBOL_GPL(save_stack_trace);
81 68
82void save_stack_trace_bp(struct stack_trace *trace, unsigned long bp) 69void save_stack_trace_regs(struct stack_trace *trace, struct pt_regs *regs)
83{ 70{
84 dump_trace(current, NULL, NULL, bp, &save_stack_ops, trace); 71 dump_trace(current, regs, NULL, 0, &save_stack_ops, trace);
85 if (trace->nr_entries < trace->max_entries) 72 if (trace->nr_entries < trace->max_entries)
86 trace->entries[trace->nr_entries++] = ULONG_MAX; 73 trace->entries[trace->nr_entries++] = ULONG_MAX;
87} 74}
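Above, save_stack_trace_bp() becomes save_stack_trace_regs() and the no-op warning callbacks disappear from stacktrace_ops; what remains is filling a fixed array of return addresses and terminating it with ULONG_MAX when space allows. A userspace analogue using glibc's backtrace(3) in place of dump_trace(), with (void *)-1 standing in for the sentinel:

#include <execinfo.h>
#include <stdio.h>

#define MAX_ENTRIES 16
#define SENTINEL ((void *)-1L)        /* ULONG_MAX analogue */

int main(void)
{
	void *entries[MAX_ENTRIES];
	int nr = backtrace(entries, MAX_ENTRIES);

	if (nr < MAX_ENTRIES)
		entries[nr++] = SENTINEL; /* same convention as the hunk */

	for (int i = 0; i < nr; i++)
		printf("%p\n", entries[i]);
	return 0;
}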
diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c
index 58de45ee08b6..7977f0cfe339 100644
--- a/arch/x86/kernel/step.c
+++ b/arch/x86/kernel/step.c
@@ -166,7 +166,7 @@ static void enable_step(struct task_struct *child, bool block)
166 * Make sure block stepping (BTF) is not enabled unless it should be. 166 * Make sure block stepping (BTF) is not enabled unless it should be.
167 * Note that we don't try to worry about any is_setting_trap_flag() 167 * Note that we don't try to worry about any is_setting_trap_flag()
168 * instructions after the first when using block stepping. 168 * instructions after the first when using block stepping.
169 * So noone should try to use debugger block stepping in a program 169 * So no one should try to use debugger block stepping in a program
170 * that uses user-mode single stepping itself. 170 * that uses user-mode single stepping itself.
171 */ 171 */
172 if (enable_single_step(child) && block) { 172 if (enable_single_step(child) && block) {
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index d5e06624e34a..0b0cb5fede19 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -33,8 +33,8 @@ int kernel_execve(const char *filename,
33 const char *const envp[]) 33 const char *const envp[])
34{ 34{
35 long __res; 35 long __res;
36 asm volatile ("push %%ebx ; movl %2,%%ebx ; int $0x80 ; pop %%ebx" 36 asm volatile ("int $0x80"
37 : "=a" (__res) 37 : "=a" (__res)
38 : "0" (__NR_execve), "ri" (filename), "c" (argv), "d" (envp) : "memory"); 38 : "0" (__NR_execve), "b" (filename), "c" (argv), "d" (envp) : "memory");
39 return __res; 39 return __res;
40} 40}
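The kernel_execve() change above drops the manual push/pop of %ebx and loads the filename through the "b" constraint directly; the save/restore dance is the pattern userspace i386 PIC code needs because %ebx holds the GOT pointer there, a restriction that does not apply inside the kernel. From userspace the portable way to issue the same call is the syscall(2) wrapper, sketched below with an assumed /bin/true target:

#define _GNU_SOURCE
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	char *const argv[] = { "/bin/true", NULL };   /* assumed target */
	char *const envp[] = { NULL };

	/* same argument order as the kernel_execve() in the hunk:
	 * filename, argv, envp */
	syscall(SYS_execve, "/bin/true", argv, envp);
	return 1;   /* only reached if execve failed */
}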
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 37702905f658..d0126222b394 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -340,15 +340,21 @@ ENTRY(sys_call_table)
340 .long sys_fanotify_init 340 .long sys_fanotify_init
341 .long sys_fanotify_mark 341 .long sys_fanotify_mark
342 .long sys_prlimit64 /* 340 */ 342 .long sys_prlimit64 /* 340 */
343 .long sys_set_rt_task_param /* LITMUS^RT 341 */ 343 .long sys_name_to_handle_at
344 .long sys_open_by_handle_at
345 .long sys_clock_adjtime
346 .long sys_syncfs
347 .long sys_sendmmsg /* 345 */
348 .long sys_setns
349 .long sys_set_rt_task_param /* LITMUS^RT 347 */
344 .long sys_get_rt_task_param 350 .long sys_get_rt_task_param
345 .long sys_complete_job 351 .long sys_complete_job
346 .long sys_od_open 352 .long sys_od_open
347 .long sys_od_close 353 .long sys_od_close
348 .long sys_litmus_lock 354 .long sys_litmus_lock /* +5 */
349 .long sys_litmus_unlock 355 .long sys_litmus_unlock
350 .long sys_query_job_no 356 .long sys_query_job_no
351 .long sys_wait_for_job_release 357 .long sys_wait_for_job_release
352 .long sys_wait_for_ts_release 358 .long sys_wait_for_ts_release
353 .long sys_release_ts 359 .long sys_release_ts /* +10 */
354 .long sys_null_call 360 .long sys_null_call
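In the syscall_table_32.S hunk the LITMUS^RT entries slide from 341 to 347 because six new upstream syscalls (name_to_handle_at through setns) land ahead of them, so liblitmus or any userspace that hard-codes the old numbers must be rebuilt. A hypothetical probe for sys_null_call on 32-bit x86; 358 is simply read off the updated table (347 plus eleven entries) and should be regenerated from the installed headers in practice:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* assumed: LITMUS^RT 32-bit table in this tree, not a stable ABI */
#define __NR_null_call 358

int main(void)
{
	long ret = syscall(__NR_null_call);

	printf("sys_null_call returned %ld\n", ret);
	return 0;
}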
diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c
index c2f1b26141e2..30ac65df7d4e 100644
--- a/arch/x86/kernel/tboot.c
+++ b/arch/x86/kernel/tboot.c
@@ -110,7 +110,6 @@ static struct mm_struct tboot_mm = {
110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), 110 .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), 111 .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist), 112 .mmlist = LIST_HEAD_INIT(init_mm.mmlist),
113 .cpu_vm_mask = CPU_MASK_ALL,
114}; 113};
115 114
116static inline void switch_to_tboot_pt(void) 115static inline void switch_to_tboot_pt(void)
@@ -133,7 +132,7 @@ static int map_tboot_page(unsigned long vaddr, unsigned long pfn,
133 pmd = pmd_alloc(&tboot_mm, pud, vaddr); 132 pmd = pmd_alloc(&tboot_mm, pud, vaddr);
134 if (!pmd) 133 if (!pmd)
135 return -1; 134 return -1;
136 pte = pte_alloc_map(&tboot_mm, pmd, vaddr); 135 pte = pte_alloc_map(&tboot_mm, NULL, pmd, vaddr);
137 if (!pte) 136 if (!pte)
138 return -1; 137 return -1;
139 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot)); 138 set_pte_at(&tboot_mm, vaddr, pte, pfn_pte(pfn, prot));
diff --git a/arch/x86/kernel/test_nx.c b/arch/x86/kernel/test_nx.c
index 787a5e499dd1..3f92ce07e525 100644
--- a/arch/x86/kernel/test_nx.c
+++ b/arch/x86/kernel/test_nx.c
@@ -161,7 +161,7 @@ static int test_NX(void)
161 } 161 }
162 162
163#endif 163#endif
164 return 0; 164 return ret;
165} 165}
166 166
167static void test_exit(void) 167static void test_exit(void)
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index fb5cc5e14cfa..00cbb272627f 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -22,12 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/time.h> 23#include <asm/time.h>
24 24
25#if defined(CONFIG_X86_32) && defined(CONFIG_X86_IO_APIC)
26int timer_ack;
27#endif
28
29#ifdef CONFIG_X86_64 25#ifdef CONFIG_X86_64
30volatile unsigned long __jiffies __section_jiffies = INITIAL_JIFFIES; 26DEFINE_VVAR(volatile unsigned long, jiffies) = INITIAL_JIFFIES;
31#endif 27#endif
32 28
33unsigned long profile_pc(struct pt_regs *regs) 29unsigned long profile_pc(struct pt_regs *regs)
@@ -63,20 +59,6 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
63 /* Keep nmi watchdog up to date */ 59 /* Keep nmi watchdog up to date */
64 inc_irq_stat(irq0_irqs); 60 inc_irq_stat(irq0_irqs);
65 61
66 /* Optimized out for !IO_APIC and x86_64 */
67 if (timer_ack) {
68 /*
69 * Subtle, when I/O APICs are used we have to ack timer IRQ
70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system.
72 */
73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL);
77 raw_spin_unlock(&i8259A_lock);
78 }
79
80 global_clock_event->event_handler(global_clock_event); 62 global_clock_event->event_handler(global_clock_event);
81 63
82 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */ 64 /* MCA bus quirk: Acknowledge irq0 by setting bit 7 in port 0x61 */
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
deleted file mode 100644
index 312ef0292815..000000000000
--- a/arch/x86/kernel/tlb_uv.c
+++ /dev/null
@@ -1,1655 +0,0 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008-2010 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14
15#include <asm/mmu_context.h>
16#include <asm/uv/uv.h>
17#include <asm/uv/uv_mmrs.h>
18#include <asm/uv/uv_hub.h>
19#include <asm/uv/uv_bau.h>
20#include <asm/apic.h>
21#include <asm/idle.h>
22#include <asm/tsc.h>
23#include <asm/irq_vectors.h>
24#include <asm/timer.h>
25
26/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
27static int timeout_base_ns[] = {
28 20,
29 160,
30 1280,
31 10240,
32 81920,
33 655360,
34 5242880,
35 167772160
36};
37static int timeout_us;
38static int nobau;
39static int baudisabled;
40static spinlock_t disable_lock;
41static cycles_t congested_cycles;
42
43/* tunables: */
44static int max_bau_concurrent = MAX_BAU_CONCURRENT;
45static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT;
46static int plugged_delay = PLUGGED_DELAY;
47static int plugsb4reset = PLUGSB4RESET;
48static int timeoutsb4reset = TIMEOUTSB4RESET;
49static int ipi_reset_limit = IPI_RESET_LIMIT;
50static int complete_threshold = COMPLETE_THRESHOLD;
51static int congested_response_us = CONGESTED_RESPONSE_US;
52static int congested_reps = CONGESTED_REPS;
53static int congested_period = CONGESTED_PERIOD;
54static struct dentry *tunables_dir;
55static struct dentry *tunables_file;
56
57static int __init setup_nobau(char *arg)
58{
59 nobau = 1;
60 return 0;
61}
62early_param("nobau", setup_nobau);
63
64/* base pnode in this partition */
65static int uv_partition_base_pnode __read_mostly;
66/* position of pnode (which is nasid>>1): */
67static int uv_nshift __read_mostly;
68static unsigned long uv_mmask __read_mostly;
69
70static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
71static DEFINE_PER_CPU(struct bau_control, bau_control);
72static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
73
74/*
75 * Determine the first node on a uvhub. 'Nodes' are used for kernel
76 * memory allocation.
77 */
78static int __init uvhub_to_first_node(int uvhub)
79{
80 int node, b;
81
82 for_each_online_node(node) {
83 b = uv_node_to_blade_id(node);
84 if (uvhub == b)
85 return node;
86 }
87 return -1;
88}
89
90/*
91 * Determine the apicid of the first cpu on a uvhub.
92 */
93static int __init uvhub_to_first_apicid(int uvhub)
94{
95 int cpu;
96
97 for_each_present_cpu(cpu)
98 if (uvhub == uv_cpu_to_blade_id(cpu))
99 return per_cpu(x86_cpu_to_apicid, cpu);
100 return -1;
101}
102
103/*
104 * Free a software acknowledge hardware resource by clearing its Pending
105 * bit. This will return a reply to the sender.
106 * If the message has timed out, a reply has already been sent by the
107 * hardware but the resource has not been released. In that case our
108 * clear of the Timeout bit (as well) will free the resource. No reply will
109 * be sent (the hardware will only do one reply per message).
110 */
111static inline void uv_reply_to_message(struct msg_desc *mdp,
112 struct bau_control *bcp)
113{
114 unsigned long dw;
115 struct bau_payload_queue_entry *msg;
116
117 msg = mdp->msg;
118 if (!msg->canceled) {
119 dw = (msg->sw_ack_vector << UV_SW_ACK_NPENDING) |
120 msg->sw_ack_vector;
121 uv_write_local_mmr(
122 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, dw);
123 }
124 msg->replied_to = 1;
125 msg->sw_ack_vector = 0;
126}
127
128/*
129 * Process the receipt of a RETRY message
130 */
131static inline void uv_bau_process_retry_msg(struct msg_desc *mdp,
132 struct bau_control *bcp)
133{
134 int i;
135 int cancel_count = 0;
136 int slot2;
137 unsigned long msg_res;
138 unsigned long mmr = 0;
139 struct bau_payload_queue_entry *msg;
140 struct bau_payload_queue_entry *msg2;
141 struct ptc_stats *stat;
142
143 msg = mdp->msg;
144 stat = bcp->statp;
145 stat->d_retries++;
146 /*
147 * cancel any message from msg+1 to the retry itself
148 */
149 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
150 if (msg2 > mdp->va_queue_last)
151 msg2 = mdp->va_queue_first;
152 if (msg2 == msg)
153 break;
154
155 /* same conditions for cancellation as uv_do_reset */
156 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
157 (msg2->sw_ack_vector) && ((msg2->sw_ack_vector &
158 msg->sw_ack_vector) == 0) &&
159 (msg2->sending_cpu == msg->sending_cpu) &&
160 (msg2->msg_type != MSG_NOOP)) {
161 slot2 = msg2 - mdp->va_queue_first;
162 mmr = uv_read_local_mmr
163 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
164 msg_res = msg2->sw_ack_vector;
165 /*
166 * This is a message retry; clear the resources held
167 * by the previous message only if they timed out.
168 * If it has not timed out we have an unexpected
169 * situation to report.
170 */
171 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
172 /*
173 * is the resource timed out?
174 * make everyone ignore the cancelled message.
175 */
176 msg2->canceled = 1;
177 stat->d_canceled++;
178 cancel_count++;
179 uv_write_local_mmr(
180 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
181 (msg_res << UV_SW_ACK_NPENDING) |
182 msg_res);
183 }
184 }
185 }
186 if (!cancel_count)
187 stat->d_nocanceled++;
188}
189
190/*
191 * Do all the things a cpu should do for a TLB shootdown message.
192 * Other cpu's may come here at the same time for this message.
193 */
194static void uv_bau_process_message(struct msg_desc *mdp,
195 struct bau_control *bcp)
196{
197 int msg_ack_count;
198 short socket_ack_count = 0;
199 struct ptc_stats *stat;
200 struct bau_payload_queue_entry *msg;
201 struct bau_control *smaster = bcp->socket_master;
202
203 /*
204 * This must be a normal message, or retry of a normal message
205 */
206 msg = mdp->msg;
207 stat = bcp->statp;
208 if (msg->address == TLB_FLUSH_ALL) {
209 local_flush_tlb();
210 stat->d_alltlb++;
211 } else {
212 __flush_tlb_one(msg->address);
213 stat->d_onetlb++;
214 }
215 stat->d_requestee++;
216
217 /*
218 * One cpu on each uvhub has the additional job on a RETRY
219 * of releasing the resource held by the message that is
220 * being retried. That message is identified by sending
221 * cpu number.
222 */
223 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
224 uv_bau_process_retry_msg(mdp, bcp);
225
226 /*
227 * This is a sw_ack message, so we have to reply to it.
228 * Count each responding cpu on the socket. This avoids
229 * pinging the count's cache line back and forth between
230 * the sockets.
231 */
232 socket_ack_count = atomic_add_short_return(1, (struct atomic_short *)
233 &smaster->socket_acknowledge_count[mdp->msg_slot]);
234 if (socket_ack_count == bcp->cpus_in_socket) {
235 /*
236 * Both sockets dump their completed count total into
237 * the message's count.
238 */
239 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
240 msg_ack_count = atomic_add_short_return(socket_ack_count,
241 (struct atomic_short *)&msg->acknowledge_count);
242
243 if (msg_ack_count == bcp->cpus_in_uvhub) {
244 /*
245 * All cpus in uvhub saw it; reply
246 */
247 uv_reply_to_message(mdp, bcp);
248 }
249 }
250
251 return;
252}
253
254/*
255 * Determine the first cpu on a uvhub.
256 */
257static int uvhub_to_first_cpu(int uvhub)
258{
259 int cpu;
260 for_each_present_cpu(cpu)
261 if (uvhub == uv_cpu_to_blade_id(cpu))
262 return cpu;
263 return -1;
264}
265
266/*
267 * Last resort when we get a large number of destination timeouts is
268 * to clear resources held by a given cpu.
269 * Do this with IPI so that all messages in the BAU message queue
270 * can be identified by their nonzero sw_ack_vector field.
271 *
272 * This is entered for a single cpu on the uvhub.
273 * The sender want's this uvhub to free a specific message's
274 * sw_ack resources.
275 */
276static void
277uv_do_reset(void *ptr)
278{
279 int i;
280 int slot;
281 int count = 0;
282 unsigned long mmr;
283 unsigned long msg_res;
284 struct bau_control *bcp;
285 struct reset_args *rap;
286 struct bau_payload_queue_entry *msg;
287 struct ptc_stats *stat;
288
289 bcp = &per_cpu(bau_control, smp_processor_id());
290 rap = (struct reset_args *)ptr;
291 stat = bcp->statp;
292 stat->d_resets++;
293
294 /*
295 * We're looking for the given sender, and
296 * will free its sw_ack resource.
297 * If all cpu's finally responded after the timeout, its
298 * message 'replied_to' was set.
299 */
300 for (msg = bcp->va_queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
301 /* uv_do_reset: same conditions for cancellation as
302 uv_bau_process_retry_msg() */
303 if ((msg->replied_to == 0) &&
304 (msg->canceled == 0) &&
305 (msg->sending_cpu == rap->sender) &&
306 (msg->sw_ack_vector) &&
307 (msg->msg_type != MSG_NOOP)) {
308 /*
309 * make everyone else ignore this message
310 */
311 msg->canceled = 1;
312 slot = msg - bcp->va_queue_first;
313 count++;
314 /*
315 * only reset the resource if it is still pending
316 */
317 mmr = uv_read_local_mmr
318 (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE);
319 msg_res = msg->sw_ack_vector;
320 if (mmr & msg_res) {
321 stat->d_rcanceled++;
322 uv_write_local_mmr(
323 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS,
324 (msg_res << UV_SW_ACK_NPENDING) |
325 msg_res);
326 }
327 }
328 }
329 return;
330}
331
332/*
333 * Use IPI to get all target uvhubs to release resources held by
334 * a given sending cpu number.
335 */
336static void uv_reset_with_ipi(struct bau_target_uvhubmask *distribution,
337 int sender)
338{
339 int uvhub;
340 int cpu;
341 cpumask_t mask;
342 struct reset_args reset_args;
343
344 reset_args.sender = sender;
345
346 cpus_clear(mask);
347 /* find a single cpu for each uvhub in this distribution mask */
348 for (uvhub = 0;
349 uvhub < sizeof(struct bau_target_uvhubmask) * BITSPERBYTE;
350 uvhub++) {
351 if (!bau_uvhub_isset(uvhub, distribution))
352 continue;
353 /* find a cpu for this uvhub */
354 cpu = uvhub_to_first_cpu(uvhub);
355 cpu_set(cpu, mask);
356 }
357 /* IPI all cpus; Preemption is already disabled */
358 smp_call_function_many(&mask, uv_do_reset, (void *)&reset_args, 1);
359 return;
360}
361
362static inline unsigned long
363cycles_2_us(unsigned long long cyc)
364{
365 unsigned long long ns;
366 unsigned long us;
367 ns = (cyc * per_cpu(cyc2ns, smp_processor_id()))
368 >> CYC2NS_SCALE_FACTOR;
369 us = ns / 1000;
370 return us;
371}
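cycles_2_us() above (and sec_2_cycles() further down) converts between TSC cycles and wall time with the per-cpu cyc2ns fixed-point factor and the CYC2NS_SCALE_FACTOR shift. A standalone sketch of the arithmetic, assuming a 3 GHz clock and a shift of 10 purely for illustration:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10          /* assumed shift for this sketch */

int main(void)
{
	/* cyc2ns = (ns per cycle) << scale; 3 GHz => 1/3 ns per cycle */
	unsigned long long cyc2ns = (1ULL << CYC2NS_SCALE_FACTOR) / 3;
	unsigned long long cycles = 3000000ULL;     /* roughly 1 ms at 3 GHz */
	unsigned long long ns = (cycles * cyc2ns) >> CYC2NS_SCALE_FACTOR;

	printf("%llu cycles ~= %llu us\n", cycles, ns / 1000);
	return 0;
}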
372
373/*
374 * wait for all cpus on this hub to finish their sends and go quiet
375 * leaves uvhub_quiesce set so that no new broadcasts are started by
376 * bau_flush_send_and_wait()
377 */
378static inline void
379quiesce_local_uvhub(struct bau_control *hmaster)
380{
381 atomic_add_short_return(1, (struct atomic_short *)
382 &hmaster->uvhub_quiesce);
383}
384
385/*
386 * mark this quiet-requestor as done
387 */
388static inline void
389end_uvhub_quiesce(struct bau_control *hmaster)
390{
391 atomic_add_short_return(-1, (struct atomic_short *)
392 &hmaster->uvhub_quiesce);
393}
394
395/*
396 * Wait for completion of a broadcast software ack message
397 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
398 */
399static int uv_wait_completion(struct bau_desc *bau_desc,
400 unsigned long mmr_offset, int right_shift, int this_cpu,
401 struct bau_control *bcp, struct bau_control *smaster, long try)
402{
403 unsigned long descriptor_status;
404 cycles_t ttime;
405 struct ptc_stats *stat = bcp->statp;
406 struct bau_control *hmaster;
407
408 hmaster = bcp->uvhub_master;
409
410 /* spin on the status MMR, waiting for it to go idle */
411 while ((descriptor_status = (((unsigned long)
412 uv_read_local_mmr(mmr_offset) >>
413 right_shift) & UV_ACT_STATUS_MASK)) !=
414 DESC_STATUS_IDLE) {
415 /*
416 * Our software ack messages may be blocked because there are
417 * no swack resources available. As long as none of them
418 * has timed out hardware will NACK our message and its
419 * state will stay IDLE.
420 */
421 if (descriptor_status == DESC_STATUS_SOURCE_TIMEOUT) {
422 stat->s_stimeout++;
423 return FLUSH_GIVEUP;
424 } else if (descriptor_status ==
425 DESC_STATUS_DESTINATION_TIMEOUT) {
426 stat->s_dtimeout++;
427 ttime = get_cycles();
428
429 /*
430 * Our retries may be blocked by all destination
431 * swack resources being consumed, and a timeout
432 * pending. In that case hardware returns the
433 * ERROR that looks like a destination timeout.
434 */
435 if (cycles_2_us(ttime - bcp->send_message) <
436 timeout_us) {
437 bcp->conseccompletes = 0;
438 return FLUSH_RETRY_PLUGGED;
439 }
440
441 bcp->conseccompletes = 0;
442 return FLUSH_RETRY_TIMEOUT;
443 } else {
444 /*
445 * descriptor_status is still BUSY
446 */
447 cpu_relax();
448 }
449 }
450 bcp->conseccompletes++;
451 return FLUSH_COMPLETE;
452}
453
454static inline cycles_t
455sec_2_cycles(unsigned long sec)
456{
457 unsigned long ns;
458 cycles_t cyc;
459
460 ns = sec * 1000000000;
461 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
462 return cyc;
463}
464
465/*
466 * conditionally add 1 to *v, unless *v is >= u
467 * return 0 if we cannot add 1 to *v because it is >= u
468 * return 1 if we can add 1 to *v because it is < u
469 * the add is atomic
470 *
471 * This is close to atomic_add_unless(), but this allows the 'u' value
472 * to be lowered below the current 'v'. atomic_add_unless can only stop
473 * on equal.
474 */
475static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u)
476{
477 spin_lock(lock);
478 if (atomic_read(v) >= u) {
479 spin_unlock(lock);
480 return 0;
481 }
482 atomic_inc(v);
483 spin_unlock(lock);
484 return 1;
485}
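The comment above atomic_inc_unless_ge() explains how it differs from atomic_add_unless(): the bound u may be lowered below the current value, and the increment is simply refused in that case; uv_flush_send_and_wait() uses it to cap the number of in-flight BAU descriptors. A userspace sketch of the same bounded increment, with a pthread mutex standing in for the spinlock:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int active;                   /* active_descriptor_count analogue */

static int inc_unless_ge(int *v, int u)
{
	pthread_mutex_lock(&lock);
	if (*v >= u) {               /* unlike atomic_add_unless(), u may have
				      * been lowered below *v; just refuse */
		pthread_mutex_unlock(&lock);
		return 0;
	}
	(*v)++;
	pthread_mutex_unlock(&lock);
	return 1;
}

int main(void)
{
	int max_concurrent = 2;      /* assumed cap */

	while (!inc_unless_ge(&active, max_concurrent))
		;                    /* caller would cpu_relax() and retry */
	printf("slot acquired, active=%d\n", active);
	return 0;
}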
486
487/*
488 * Our retries are blocked by all destination swack resources being
489 * in use, and a timeout is pending. In that case hardware immediately
490 * returns the ERROR that looks like a destination timeout.
491 */
492static void
493destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp,
494 struct bau_control *hmaster, struct ptc_stats *stat)
495{
496 udelay(bcp->plugged_delay);
497 bcp->plugged_tries++;
498 if (bcp->plugged_tries >= bcp->plugsb4reset) {
499 bcp->plugged_tries = 0;
500 quiesce_local_uvhub(hmaster);
501 spin_lock(&hmaster->queue_lock);
502 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
503 spin_unlock(&hmaster->queue_lock);
504 end_uvhub_quiesce(hmaster);
505 bcp->ipi_attempts++;
506 stat->s_resets_plug++;
507 }
508}
509
510static void
511destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp,
512 struct bau_control *hmaster, struct ptc_stats *stat)
513{
514 hmaster->max_bau_concurrent = 1;
515 bcp->timeout_tries++;
516 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
517 bcp->timeout_tries = 0;
518 quiesce_local_uvhub(hmaster);
519 spin_lock(&hmaster->queue_lock);
520 uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu);
521 spin_unlock(&hmaster->queue_lock);
522 end_uvhub_quiesce(hmaster);
523 bcp->ipi_attempts++;
524 stat->s_resets_timeout++;
525 }
526}
527
528/*
529 * Completions are taking a very long time due to a congested numalink
530 * network.
531 */
532static void
533disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat)
534{
535 int tcpu;
536 struct bau_control *tbcp;
537
538 /* let only one cpu do this disabling */
539 spin_lock(&disable_lock);
540 if (!baudisabled && bcp->period_requests &&
541 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
542 /* it becomes this cpu's job to turn on the use of the
543 BAU again */
544 baudisabled = 1;
545 bcp->set_bau_off = 1;
546 bcp->set_bau_on_time = get_cycles() +
547 sec_2_cycles(bcp->congested_period);
548 stat->s_bau_disabled++;
549 for_each_present_cpu(tcpu) {
550 tbcp = &per_cpu(bau_control, tcpu);
551 tbcp->baudisabled = 1;
552 }
553 }
554 spin_unlock(&disable_lock);
555}
556
557/**
558 * uv_flush_send_and_wait
559 *
560 * Send a broadcast and wait for it to complete.
561 *
562 * The flush_mask contains the cpus the broadcast is to be sent to including
563 * cpus that are on the local uvhub.
564 *
565 * Returns 0 if all flushing represented in the mask was done.
566 * Returns 1 if it gives up entirely and the original cpu mask is to be
567 * returned to the kernel.
568 */
569int uv_flush_send_and_wait(struct bau_desc *bau_desc,
570 struct cpumask *flush_mask, struct bau_control *bcp)
571{
572 int right_shift;
573 int completion_status = 0;
574 int seq_number = 0;
575 long try = 0;
576 int cpu = bcp->uvhub_cpu;
577 int this_cpu = bcp->cpu;
578 unsigned long mmr_offset;
579 unsigned long index;
580 cycles_t time1;
581 cycles_t time2;
582 cycles_t elapsed;
583 struct ptc_stats *stat = bcp->statp;
584 struct bau_control *smaster = bcp->socket_master;
585 struct bau_control *hmaster = bcp->uvhub_master;
586
587 if (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
588 &hmaster->active_descriptor_count,
589 hmaster->max_bau_concurrent)) {
590 stat->s_throttles++;
591 do {
592 cpu_relax();
593 } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock,
594 &hmaster->active_descriptor_count,
595 hmaster->max_bau_concurrent));
596 }
597 while (hmaster->uvhub_quiesce)
598 cpu_relax();
599
600 if (cpu < UV_CPUS_PER_ACT_STATUS) {
601 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
602 right_shift = cpu * UV_ACT_STATUS_SIZE;
603 } else {
604 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
605 right_shift =
606 ((cpu - UV_CPUS_PER_ACT_STATUS) * UV_ACT_STATUS_SIZE);
607 }
608 time1 = get_cycles();
609 do {
610 if (try == 0) {
611 bau_desc->header.msg_type = MSG_REGULAR;
612 seq_number = bcp->message_number++;
613 } else {
614 bau_desc->header.msg_type = MSG_RETRY;
615 stat->s_retry_messages++;
616 }
617 bau_desc->header.sequence = seq_number;
618 index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) |
619 bcp->uvhub_cpu;
620 bcp->send_message = get_cycles();
621 uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index);
622 try++;
623 completion_status = uv_wait_completion(bau_desc, mmr_offset,
624 right_shift, this_cpu, bcp, smaster, try);
625
626 if (completion_status == FLUSH_RETRY_PLUGGED) {
627 destination_plugged(bau_desc, bcp, hmaster, stat);
628 } else if (completion_status == FLUSH_RETRY_TIMEOUT) {
629 destination_timeout(bau_desc, bcp, hmaster, stat);
630 }
631 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
632 bcp->ipi_attempts = 0;
633 completion_status = FLUSH_GIVEUP;
634 break;
635 }
636 cpu_relax();
637 } while ((completion_status == FLUSH_RETRY_PLUGGED) ||
638 (completion_status == FLUSH_RETRY_TIMEOUT));
639 time2 = get_cycles();
640 bcp->plugged_tries = 0;
641 bcp->timeout_tries = 0;
642 if ((completion_status == FLUSH_COMPLETE) &&
643 (bcp->conseccompletes > bcp->complete_threshold) &&
644 (hmaster->max_bau_concurrent <
645 hmaster->max_bau_concurrent_constant))
646 hmaster->max_bau_concurrent++;
647 while (hmaster->uvhub_quiesce)
648 cpu_relax();
649 atomic_dec(&hmaster->active_descriptor_count);
650 if (time2 > time1) {
651 elapsed = time2 - time1;
652 stat->s_time += elapsed;
653 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
654 bcp->period_requests++;
655 bcp->period_time += elapsed;
656 if ((elapsed > congested_cycles) &&
657 (bcp->period_requests > bcp->congested_reps)) {
658 disable_for_congestion(bcp, stat);
659 }
660 }
661 } else
662 stat->s_requestor--;
663 if (completion_status == FLUSH_COMPLETE && try > 1)
664 stat->s_retriesok++;
665 else if (completion_status == FLUSH_GIVEUP) {
666 stat->s_giveup++;
667 return 1;
668 }
669 return 0;
670}
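uv_flush_send_and_wait() above resends the broadcast while the completion status is FLUSH_RETRY_PLUGGED or FLUSH_RETRY_TIMEOUT and gives up once the IPI-reset budget is spent. A deliberately simplified sketch of that retry policy; the plugsb4reset/timeoutsb4reset escalation thresholds are collapsed into a single counter here, and fake_send() is a stand-in for the real descriptor activation:

#include <stdio.h>

enum { FLUSH_COMPLETE, FLUSH_RETRY_PLUGGED, FLUSH_RETRY_TIMEOUT, FLUSH_GIVEUP };

static int fake_send(int attempt)
{
	/* pretend the first two sends hit a plugged destination */
	return attempt < 3 ? FLUSH_RETRY_PLUGGED : FLUSH_COMPLETE;
}

int main(void)
{
	int status, try = 0, ipi_attempts = 0, ipi_reset_limit = 2;

	do {
		status = fake_send(++try);
		if (status == FLUSH_RETRY_PLUGGED ||
		    status == FLUSH_RETRY_TIMEOUT)
			ipi_attempts++;            /* a reset was escalated */
		if (ipi_attempts >= ipi_reset_limit) {
			status = FLUSH_GIVEUP;     /* fall back to plain IPIs */
			break;
		}
	} while (status == FLUSH_RETRY_PLUGGED || status == FLUSH_RETRY_TIMEOUT);

	printf("tries=%d status=%d\n", try, status);
	return 0;
}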
671
672/**
673 * uv_flush_tlb_others - globally purge translation cache of a virtual
674 * address or all TLB's
675 * @cpumask: mask of all cpu's in which the address is to be removed
676 * @mm: mm_struct containing virtual address range
677 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
678 * @cpu: the current cpu
679 *
680 * This is the entry point for initiating any UV global TLB shootdown.
681 *
682 * Purges the translation caches of all specified processors of the given
683 * virtual address, or purges all TLB's on specified processors.
684 *
685 * The caller has derived the cpumask from the mm_struct. This function
686 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
687 *
688 * The cpumask is converted into a uvhubmask of the uvhubs containing
689 * those cpus.
690 *
691 * Note that this function should be called with preemption disabled.
692 *
693 * Returns NULL if all remote flushing was done.
694 * Returns pointer to cpumask if some remote flushing remains to be
695 * done. The returned pointer is valid till preemption is re-enabled.
696 */
697const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
698 struct mm_struct *mm,
699 unsigned long va, unsigned int cpu)
700{
701 int tcpu;
702 int uvhub;
703 int locals = 0;
704 int remotes = 0;
705 int hubs = 0;
706 struct bau_desc *bau_desc;
707 struct cpumask *flush_mask;
708 struct ptc_stats *stat;
709 struct bau_control *bcp;
710 struct bau_control *tbcp;
711
712 /* kernel was booted 'nobau' */
713 if (nobau)
714 return cpumask;
715
716 bcp = &per_cpu(bau_control, cpu);
717 stat = bcp->statp;
718
719 /* bau was disabled due to slow response */
720 if (bcp->baudisabled) {
721 /* the cpu that disabled it must re-enable it */
722 if (bcp->set_bau_off) {
723 if (get_cycles() >= bcp->set_bau_on_time) {
724 stat->s_bau_reenabled++;
725 baudisabled = 0;
726 for_each_present_cpu(tcpu) {
727 tbcp = &per_cpu(bau_control, tcpu);
728 tbcp->baudisabled = 0;
729 tbcp->period_requests = 0;
730 tbcp->period_time = 0;
731 }
732 }
733 }
734 return cpumask;
735 }
736
737 /*
738 * Each sending cpu has a per-cpu mask which it fills from the caller's
739 * cpu mask. All cpus are converted to uvhubs and copied to the
740 * activation descriptor.
741 */
742 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
743 /* don't actually do a shootdown of the local cpu */
744 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
745 if (cpu_isset(cpu, *cpumask))
746 stat->s_ntargself++;
747
748 bau_desc = bcp->descriptor_base;
749 bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu;
750 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
751
752 /* cpu statistics */
753 for_each_cpu(tcpu, flush_mask) {
754 uvhub = uv_cpu_to_blade_id(tcpu);
755 bau_uvhub_set(uvhub, &bau_desc->distribution);
756 if (uvhub == bcp->uvhub)
757 locals++;
758 else
759 remotes++;
760 }
761 if ((locals + remotes) == 0)
762 return NULL;
763 stat->s_requestor++;
764 stat->s_ntargcpu += remotes + locals;
765 stat->s_ntargremotes += remotes;
766 stat->s_ntarglocals += locals;
767 remotes = bau_uvhub_weight(&bau_desc->distribution);
768
769 /* uvhub statistics */
770 hubs = bau_uvhub_weight(&bau_desc->distribution);
771 if (locals) {
772 stat->s_ntarglocaluvhub++;
773 stat->s_ntargremoteuvhub += (hubs - 1);
774 } else
775 stat->s_ntargremoteuvhub += hubs;
776 stat->s_ntarguvhub += hubs;
777 if (hubs >= 16)
778 stat->s_ntarguvhub16++;
779 else if (hubs >= 8)
780 stat->s_ntarguvhub8++;
781 else if (hubs >= 4)
782 stat->s_ntarguvhub4++;
783 else if (hubs >= 2)
784 stat->s_ntarguvhub2++;
785 else
786 stat->s_ntarguvhub1++;
787
788 bau_desc->payload.address = va;
789 bau_desc->payload.sending_cpu = cpu;
790
791 /*
792 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
793 * or 1 if it gave up and the original cpumask should be returned.
794 */
795 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
796 return NULL;
797 else
798 return cpumask;
799}
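
For context, a minimal sketch of how a caller is expected to consume this return value; the real call site lives in the x86 mm/tlb code rather than in this file, and flush_tlb_others_ipi() is assumed to be available there with this signature:

static void example_uv_flush(const struct cpumask *cpumask,
			     struct mm_struct *mm, unsigned long va)
{
	unsigned int cpu = get_cpu();		/* also disables preemption */
	const struct cpumask *rest;

	rest = uv_flush_tlb_others(cpumask, mm, va, cpu);
	if (rest)
		/* BAU declined ('nobau', disabled, or gave up): fall back to IPIs */
		flush_tlb_others_ipi(rest, mm, va);
	put_cpu();	/* 'rest' must not be used once preemption is re-enabled */
}
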
800
801/*
802 * The BAU message interrupt comes here. (registered by set_intr_gate)
803 * See entry_64.S
804 *
805 * We received a broadcast assist message.
806 *
807 * Interrupts are disabled; this interrupt could represent
808 * the receipt of several messages.
809 *
810 * All cores/threads on this hub get this interrupt.
811 * The last one to see it does the software ack.
812 * (the resource will not be freed until noninterruptable cpus see this
813 * interrupt; hardware may timeout the s/w ack and reply ERROR)
814 */
815void uv_bau_message_interrupt(struct pt_regs *regs)
816{
817 int count = 0;
818 cycles_t time_start;
819 struct bau_payload_queue_entry *msg;
820 struct bau_control *bcp;
821 struct ptc_stats *stat;
822 struct msg_desc msgdesc;
823
824 time_start = get_cycles();
825 bcp = &per_cpu(bau_control, smp_processor_id());
826 stat = bcp->statp;
827 msgdesc.va_queue_first = bcp->va_queue_first;
828 msgdesc.va_queue_last = bcp->va_queue_last;
829 msg = bcp->bau_msg_head;
830 while (msg->sw_ack_vector) {
831 count++;
832 msgdesc.msg_slot = msg - msgdesc.va_queue_first;
833 msgdesc.sw_ack_slot = ffs(msg->sw_ack_vector) - 1;
834 msgdesc.msg = msg;
835 uv_bau_process_message(&msgdesc, bcp);
836 msg++;
837 if (msg > msgdesc.va_queue_last)
838 msg = msgdesc.va_queue_first;
839 bcp->bau_msg_head = msg;
840 }
841 stat->d_time += (get_cycles() - time_start);
842 if (!count)
843 stat->d_nomsg++;
844 else if (count > 1)
845 stat->d_multmsg++;
846 ack_APIC_irq();
847}
848
849/*
850 * uv_enable_timeouts
851 *
 852 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
853 * shootdown message timeouts enabled. The timeout does not cause
854 * an interrupt, but causes an error message to be returned to
855 * the sender.
856 */
857static void uv_enable_timeouts(void)
858{
859 int uvhub;
860 int nuvhubs;
861 int pnode;
862 unsigned long mmr_image;
863
864 nuvhubs = uv_num_possible_blades();
865
866 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
867 if (!uv_blade_nr_possible_cpus(uvhub))
868 continue;
869
870 pnode = uv_blade_to_pnode(uvhub);
871 mmr_image =
872 uv_read_global_mmr64(pnode, UVH_LB_BAU_MISC_CONTROL);
873 /*
 874 * Set the timeout period and then lock it in; the three
 875 * MMR writes below capture and lock in the period.
876 *
877 * To program the period, the SOFT_ACK_MODE must be off.
878 */
879 mmr_image &= ~((unsigned long)1 <<
880 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
881 uv_write_global_mmr64
882 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
883 /*
884 * Set the 4-bit period.
885 */
886 mmr_image &= ~((unsigned long)0xf <<
887 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
888 mmr_image |= (UV_INTD_SOFT_ACK_TIMEOUT_PERIOD <<
889 UVH_LB_BAU_MISC_CONTROL_INTD_SOFT_ACK_TIMEOUT_PERIOD_SHFT);
890 uv_write_global_mmr64
891 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
892 /*
893 * Subsequent reversals of the timebase bit (3) cause an
894 * immediate timeout of one or all INTD resources as
895 * indicated in bits 2:0 (7 causes all of them to timeout).
896 */
897 mmr_image |= ((unsigned long)1 <<
898 UVH_LB_BAU_MISC_CONTROL_ENABLE_INTD_SOFT_ACK_MODE_SHFT);
899 uv_write_global_mmr64
900 (pnode, UVH_LB_BAU_MISC_CONTROL, mmr_image);
901 }
902}
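
The three writes above are the usual read-modify-write sequence for an MMR bit field: clear the field, insert the new value, write the register back. A standalone illustration of that pattern follows; the helper name and the field position used in main() are made up for the example and are not kernel APIs.

#include <stdio.h>

/* Clear a 'bits'-wide field at 'shift', then insert 'val' into it. */
static unsigned long set_field(unsigned long reg, int shift, int bits,
			       unsigned long val)
{
	unsigned long mask = ((1UL << bits) - 1) << shift;

	reg &= ~mask;
	reg |= (val << shift) & mask;
	return reg;
}

int main(void)
{
	unsigned long mmr = 0xffffffffUL;

	/* e.g. a hypothetical 4-bit timeout-period field at bit 12, set to 0xa */
	printf("0x%lx\n", set_field(mmr, 12, 4, 0xa));	/* prints 0xffffafff */
	return 0;
}
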
903
904static void *uv_ptc_seq_start(struct seq_file *file, loff_t *offset)
905{
906 if (*offset < num_possible_cpus())
907 return offset;
908 return NULL;
909}
910
911static void *uv_ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
912{
913 (*offset)++;
914 if (*offset < num_possible_cpus())
915 return offset;
916 return NULL;
917}
918
919static void uv_ptc_seq_stop(struct seq_file *file, void *data)
920{
921}
922
923static inline unsigned long long
924microsec_2_cycles(unsigned long microsec)
925{
926 unsigned long ns;
927 unsigned long long cyc;
928
929 ns = microsec * 1000;
930 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
931 return cyc;
932}
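
To make the conversion above concrete: per_cpu(cyc2ns) holds nanoseconds-per-cycle scaled up by 2^CYC2NS_SCALE_FACTOR, so dividing scaled nanoseconds by it yields cycles. A userspace re-implementation, assuming CYC2NS_SCALE_FACTOR is 10 (its definition is not in this excerpt) and a hypothetical 2 GHz CPU, for which cyc2ns would be 0.5 ns/cycle * 1024 = 512:

#include <stdio.h>

#define CYC2NS_SCALE_FACTOR 10	/* assumed value; defined elsewhere in the tree */

static unsigned long long usec_to_cycles(unsigned long usec,
					 unsigned long cyc2ns)
{
	unsigned long long ns = (unsigned long long)usec * 1000;

	return (ns << CYC2NS_SCALE_FACTOR) / cyc2ns;
}

int main(void)
{
	/* 1 us on a 2 GHz part should come out to ~2000 cycles */
	printf("%llu\n", usec_to_cycles(1, 512));
	return 0;
}
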
933
934/*
935 * Display the statistics thru /proc.
936 * 'data' points to the cpu number
937 */
938static int uv_ptc_seq_show(struct seq_file *file, void *data)
939{
940 struct ptc_stats *stat;
941 int cpu;
942
943 cpu = *(loff_t *)data;
944
945 if (!cpu) {
946 seq_printf(file,
947 "# cpu sent stime self locals remotes ncpus localhub ");
948 seq_printf(file,
949 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
950 seq_printf(file,
951 "numuvhubs4 numuvhubs2 numuvhubs1 dto ");
952 seq_printf(file,
953 "retries rok resetp resett giveup sto bz throt ");
954 seq_printf(file,
955 "sw_ack recv rtime all ");
956 seq_printf(file,
957 "one mult none retry canc nocan reset rcan ");
958 seq_printf(file,
959 "disable enable\n");
960 }
961 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
962 stat = &per_cpu(ptcstats, cpu);
963 /* source side statistics */
964 seq_printf(file,
965 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
966 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
967 stat->s_ntargself, stat->s_ntarglocals,
968 stat->s_ntargremotes, stat->s_ntargcpu,
969 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
970 stat->s_ntarguvhub, stat->s_ntarguvhub16);
971 seq_printf(file, "%ld %ld %ld %ld %ld ",
972 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
973 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
974 stat->s_dtimeout);
975 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
976 stat->s_retry_messages, stat->s_retriesok,
977 stat->s_resets_plug, stat->s_resets_timeout,
978 stat->s_giveup, stat->s_stimeout,
979 stat->s_busy, stat->s_throttles);
980
981 /* destination side statistics */
982 seq_printf(file,
983 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
984 uv_read_global_mmr64(uv_cpu_to_pnode(cpu),
985 UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE),
986 stat->d_requestee, cycles_2_us(stat->d_time),
987 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
988 stat->d_nomsg, stat->d_retries, stat->d_canceled,
989 stat->d_nocanceled, stat->d_resets,
990 stat->d_rcanceled);
991 seq_printf(file, "%ld %ld\n",
992 stat->s_bau_disabled, stat->s_bau_reenabled);
993 }
994
995 return 0;
996}
997
998/*
999 * Display the tunables thru debugfs
1000 */
1001static ssize_t tunables_read(struct file *file, char __user *userbuf,
1002 size_t count, loff_t *ppos)
1003{
1004 char buf[300];
1005 int ret;
1006
1007 ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1008 "max_bau_concurrent plugged_delay plugsb4reset",
1009 "timeoutsb4reset ipi_reset_limit complete_threshold",
1010 "congested_response_us congested_reps congested_period",
1011 max_bau_concurrent, plugged_delay, plugsb4reset,
1012 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1013 congested_response_us, congested_reps, congested_period);
1014
1015 return simple_read_from_buffer(userbuf, count, ppos, buf, ret);
1016}
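
For illustration, a minimal userspace reader for this debugfs file. The path assumes debugfs is mounted at /sys/kernel/debug and that UV_BAU_TUNABLES_DIR and UV_BAU_TUNABLES_FILE expand to "sgi_uv" and "bau_tunables"; neither definition appears in this excerpt, so treat the path as an assumption.

#include <stdio.h>

int main(void)
{
	char line[512];
	/* assumed path; see lead-in */
	FILE *f = fopen("/sys/kernel/debug/sgi_uv/bau_tunables", "r");

	if (!f) {
		perror("open bau_tunables");
		return 1;
	}
	/* first line: the nine tunable names; second line: their current values */
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout);
	fclose(f);
	return 0;
}
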
1017
1018/*
1019 * -1: reset the statistics
1020 * 0: display meaning of the statistics
1021 */
1022static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user,
1023 size_t count, loff_t *data)
1024{
1025 int cpu;
1026 long input_arg;
1027 char optstr[64];
1028 struct ptc_stats *stat;
1029
1030 if (count == 0 || count > sizeof(optstr))
1031 return -EINVAL;
1032 if (copy_from_user(optstr, user, count))
1033 return -EFAULT;
1034 optstr[count - 1] = '\0';
1035 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1036 printk(KERN_DEBUG "%s is invalid\n", optstr);
1037 return -EINVAL;
1038 }
1039
1040 if (input_arg == 0) {
1041 printk(KERN_DEBUG "# cpu: cpu number\n");
1042 printk(KERN_DEBUG "Sender statistics:\n");
1043 printk(KERN_DEBUG
1044 "sent: number of shootdown messages sent\n");
1045 printk(KERN_DEBUG
1046 "stime: time spent sending messages\n");
1047 printk(KERN_DEBUG
1048 "numuvhubs: number of hubs targeted with shootdown\n");
1049 printk(KERN_DEBUG
1050 "numuvhubs16: number times 16 or more hubs targeted\n");
1051 printk(KERN_DEBUG
1052 "numuvhubs8: number times 8 or more hubs targeted\n");
1053 printk(KERN_DEBUG
1054 "numuvhubs4: number times 4 or more hubs targeted\n");
1055 printk(KERN_DEBUG
1056 "numuvhubs2: number times 2 or more hubs targeted\n");
1057 printk(KERN_DEBUG
1058 "numuvhubs1: number times 1 hub targeted\n");
1059 printk(KERN_DEBUG
1060 "numcpus: number of cpus targeted with shootdown\n");
1061 printk(KERN_DEBUG
1062 "dto: number of destination timeouts\n");
1063 printk(KERN_DEBUG
1064 "retries: destination timeout retries sent\n");
1065 printk(KERN_DEBUG
1066 "rok: : destination timeouts successfully retried\n");
1067 printk(KERN_DEBUG
1068 "resetp: ipi-style resource resets for plugs\n");
1069 printk(KERN_DEBUG
1070 "resett: ipi-style resource resets for timeouts\n");
1071 printk(KERN_DEBUG
1072 "giveup: fall-backs to ipi-style shootdowns\n");
1073 printk(KERN_DEBUG
1074 "sto: number of source timeouts\n");
1075 printk(KERN_DEBUG
1076 "bz: number of stay-busy's\n");
1077 printk(KERN_DEBUG
1078 "throt: number times spun in throttle\n");
1079 printk(KERN_DEBUG "Destination side statistics:\n");
1080 printk(KERN_DEBUG
1081 "sw_ack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE\n");
1082 printk(KERN_DEBUG
1083 "recv: shootdown messages received\n");
1084 printk(KERN_DEBUG
1085 "rtime: time spent processing messages\n");
1086 printk(KERN_DEBUG
1087 "all: shootdown all-tlb messages\n");
1088 printk(KERN_DEBUG
1089 "one: shootdown one-tlb messages\n");
1090 printk(KERN_DEBUG
1091 "mult: interrupts that found multiple messages\n");
1092 printk(KERN_DEBUG
1093 "none: interrupts that found no messages\n");
1094 printk(KERN_DEBUG
1095 "retry: number of retry messages processed\n");
1096 printk(KERN_DEBUG
1097 "canc: number messages canceled by retries\n");
1098 printk(KERN_DEBUG
1099 "nocan: number retries that found nothing to cancel\n");
1100 printk(KERN_DEBUG
1101 "reset: number of ipi-style reset requests processed\n");
1102 printk(KERN_DEBUG
1103 "rcan: number messages canceled by reset requests\n");
1104 printk(KERN_DEBUG
1105 "disable: number times use of the BAU was disabled\n");
1106 printk(KERN_DEBUG
1107 "enable: number times use of the BAU was re-enabled\n");
1108 } else if (input_arg == -1) {
1109 for_each_present_cpu(cpu) {
1110 stat = &per_cpu(ptcstats, cpu);
1111 memset(stat, 0, sizeof(struct ptc_stats));
1112 }
1113 }
1114
1115 return count;
1116}
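
A small userspace sketch of the reset path, assuming UV_PTC_BASENAME expands to "sgi_uv/ptc_statistics" under /proc (the macro's value is not shown in this excerpt). Writing "0" instead of "-1" makes the kernel print the field legend above to the log.

#include <stdio.h>

int main(void)
{
	/* assumed path; see lead-in */
	FILE *f = fopen("/proc/sgi_uv/ptc_statistics", "w");

	if (!f) {
		perror("open ptc_statistics");
		return 1;
	}
	fprintf(f, "-1\n");	/* -1: zero every cpu's ptc_stats */
	fclose(f);
	return 0;
}
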
1117
1118static int local_atoi(const char *name)
1119{
1120 int val = 0;
1121
1122 for (;; name++) {
1123 switch (*name) {
1124 case '0' ... '9':
1125 val = 10*val+(*name-'0');
1126 break;
1127 default:
1128 return val;
1129 }
1130 }
1131}
1132
1133/*
1134 * set the tunables
1135 * 0 values reset them to defaults
1136 */
1137static ssize_t tunables_write(struct file *file, const char __user *user,
1138 size_t count, loff_t *data)
1139{
1140 int cpu;
1141 int cnt = 0;
1142 int val;
1143 char *p;
1144 char *q;
1145 char instr[64];
1146 struct bau_control *bcp;
1147
1148 if (count == 0 || count > sizeof(instr)-1)
1149 return -EINVAL;
1150 if (copy_from_user(instr, user, count))
1151 return -EFAULT;
1152
1153 instr[count] = '\0';
1154 /* count the fields */
1155 p = instr + strspn(instr, WHITESPACE);
1156 q = p;
1157 for (; *p; p = q + strspn(q, WHITESPACE)) {
1158 q = p + strcspn(p, WHITESPACE);
1159 cnt++;
1160 if (q == p)
1161 break;
1162 }
1163 if (cnt != 9) {
1164 printk(KERN_INFO "bau tunable error: should be 9 numbers\n");
1165 return -EINVAL;
1166 }
1167
1168 p = instr + strspn(instr, WHITESPACE);
1169 q = p;
1170 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1171 q = p + strcspn(p, WHITESPACE);
1172 val = local_atoi(p);
1173 switch (cnt) {
1174 case 0:
1175 if (val == 0) {
1176 max_bau_concurrent = MAX_BAU_CONCURRENT;
1177 max_bau_concurrent_constant =
1178 MAX_BAU_CONCURRENT;
1179 continue;
1180 }
1181 bcp = &per_cpu(bau_control, smp_processor_id());
1182 if (val < 1 || val > bcp->cpus_in_uvhub) {
1183 printk(KERN_DEBUG
1184 "Error: BAU max concurrent %d is invalid\n",
1185 val);
1186 return -EINVAL;
1187 }
1188 max_bau_concurrent = val;
1189 max_bau_concurrent_constant = val;
1190 continue;
1191 case 1:
1192 if (val == 0)
1193 plugged_delay = PLUGGED_DELAY;
1194 else
1195 plugged_delay = val;
1196 continue;
1197 case 2:
1198 if (val == 0)
1199 plugsb4reset = PLUGSB4RESET;
1200 else
1201 plugsb4reset = val;
1202 continue;
1203 case 3:
1204 if (val == 0)
1205 timeoutsb4reset = TIMEOUTSB4RESET;
1206 else
1207 timeoutsb4reset = val;
1208 continue;
1209 case 4:
1210 if (val == 0)
1211 ipi_reset_limit = IPI_RESET_LIMIT;
1212 else
1213 ipi_reset_limit = val;
1214 continue;
1215 case 5:
1216 if (val == 0)
1217 complete_threshold = COMPLETE_THRESHOLD;
1218 else
1219 complete_threshold = val;
1220 continue;
1221 case 6:
1222 if (val == 0)
1223 congested_response_us = CONGESTED_RESPONSE_US;
1224 else
1225 congested_response_us = val;
1226 continue;
1227 case 7:
1228 if (val == 0)
1229 congested_reps = CONGESTED_REPS;
1230 else
1231 congested_reps = val;
1232 continue;
1233 case 8:
1234 if (val == 0)
1235 congested_period = CONGESTED_PERIOD;
1236 else
1237 congested_period = val;
1238 continue;
1239 }
1240 if (q == p)
1241 break;
1242 }
1243 for_each_present_cpu(cpu) {
1244 bcp = &per_cpu(bau_control, cpu);
1245 bcp->max_bau_concurrent = max_bau_concurrent;
1246 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1247 bcp->plugged_delay = plugged_delay;
1248 bcp->plugsb4reset = plugsb4reset;
1249 bcp->timeoutsb4reset = timeoutsb4reset;
1250 bcp->ipi_reset_limit = ipi_reset_limit;
1251 bcp->complete_threshold = complete_threshold;
1252 bcp->congested_response_us = congested_response_us;
1253 bcp->congested_reps = congested_reps;
1254 bcp->congested_period = congested_period;
1255 }
1256 return count;
1257}
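
And the matching write side: the routine above insists on exactly nine whitespace-separated integers, with 0 meaning "restore the compiled-in default" for that tunable. A sketch reusing the assumed debugfs path from the read example above:

#include <stdio.h>

int main(void)
{
	/* assumed path; see the tunables_read example */
	FILE *f = fopen("/sys/kernel/debug/sgi_uv/bau_tunables", "w");

	if (!f) {
		perror("open bau_tunables");
		return 1;
	}
	/* max_bau_concurrent .. congested_period; 0 == use the default */
	fprintf(f, "0 0 0 0 0 0 0 0 0\n");
	fclose(f);
	return 0;
}
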
1258
1259static const struct seq_operations uv_ptc_seq_ops = {
1260 .start = uv_ptc_seq_start,
1261 .next = uv_ptc_seq_next,
1262 .stop = uv_ptc_seq_stop,
1263 .show = uv_ptc_seq_show
1264};
1265
1266static int uv_ptc_proc_open(struct inode *inode, struct file *file)
1267{
1268 return seq_open(file, &uv_ptc_seq_ops);
1269}
1270
1271static int tunables_open(struct inode *inode, struct file *file)
1272{
1273 return 0;
1274}
1275
1276static const struct file_operations proc_uv_ptc_operations = {
1277 .open = uv_ptc_proc_open,
1278 .read = seq_read,
1279 .write = uv_ptc_proc_write,
1280 .llseek = seq_lseek,
1281 .release = seq_release,
1282};
1283
1284static const struct file_operations tunables_fops = {
1285 .open = tunables_open,
1286 .read = tunables_read,
1287 .write = tunables_write,
1288};
1289
1290static int __init uv_ptc_init(void)
1291{
1292 struct proc_dir_entry *proc_uv_ptc;
1293
1294 if (!is_uv_system())
1295 return 0;
1296
1297 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1298 &proc_uv_ptc_operations);
1299 if (!proc_uv_ptc) {
1300 printk(KERN_ERR "unable to create %s proc entry\n",
1301 UV_PTC_BASENAME);
1302 return -EINVAL;
1303 }
1304
1305 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1306 if (!tunables_dir) {
1307 printk(KERN_ERR "unable to create debugfs directory %s\n",
1308 UV_BAU_TUNABLES_DIR);
1309 return -EINVAL;
1310 }
1311 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1312 tunables_dir, NULL, &tunables_fops);
1313 if (!tunables_file) {
1314 printk(KERN_ERR "unable to create debugfs file %s\n",
1315 UV_BAU_TUNABLES_FILE);
1316 return -EINVAL;
1317 }
1318 return 0;
1319}
1320
1321/*
1322 * initialize the sending side's sending buffers
1323 */
1324static void
1325uv_activation_descriptor_init(int node, int pnode)
1326{
1327 int i;
1328 int cpu;
1329 unsigned long pa;
1330 unsigned long m;
1331 unsigned long n;
1332 struct bau_desc *bau_desc;
1333 struct bau_desc *bd2;
1334 struct bau_control *bcp;
1335
1336 /*
1337 * each bau_desc is 64 bytes; there are 8 (UV_ITEMS_PER_DESCRIPTOR)
1338 * per cpu; and up to 32 (UV_ADP_SIZE) cpu's per uvhub
1339 */
1340 bau_desc = (struct bau_desc *)kmalloc_node(sizeof(struct bau_desc)*
1341 UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR, GFP_KERNEL, node);
1342 BUG_ON(!bau_desc);
1343
1344	pa = uv_gpa(bau_desc); /* need the real nasid */
1345 n = pa >> uv_nshift;
1346 m = pa & uv_mmask;
1347
1348 uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_DESCRIPTOR_BASE,
1349 (n << UV_DESC_BASE_PNODE_SHIFT | m));
1350
1351 /*
1352 * initializing all 8 (UV_ITEMS_PER_DESCRIPTOR) descriptors for each
1353 * cpu even though we only use the first one; one descriptor can
1354 * describe a broadcast to 256 uv hubs.
1355 */
1356 for (i = 0, bd2 = bau_desc; i < (UV_ADP_SIZE*UV_ITEMS_PER_DESCRIPTOR);
1357 i++, bd2++) {
1358 memset(bd2, 0, sizeof(struct bau_desc));
1359 bd2->header.sw_ack_flag = 1;
1360 /*
1361 * base_dest_nodeid is the nasid (pnode<<1) of the first uvhub
1362 * in the partition. The bit map will indicate uvhub numbers,
1363 * which are 0-N in a partition. Pnodes are unique system-wide.
1364 */
1365 bd2->header.base_dest_nodeid = uv_partition_base_pnode << 1;
1366 bd2->header.dest_subnodeid = 0x10; /* the LB */
1367 bd2->header.command = UV_NET_ENDPOINT_INTD;
1368 bd2->header.int_both = 1;
1369 /*
1370 * all others need to be set to zero:
1371 * fairness chaining multilevel count replied_to
1372 */
1373 }
1374 for_each_present_cpu(cpu) {
1375 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1376 continue;
1377 bcp = &per_cpu(bau_control, cpu);
1378 bcp->descriptor_base = bau_desc;
1379 }
1380}
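
Working the allocation out from the comment's own numbers: 64 bytes per bau_desc * 8 descriptors per cpu (UV_ITEMS_PER_DESCRIPTOR) * 32 cpus per uvhub (UV_ADP_SIZE) = 16 KiB of activation-descriptor space per uvhub, and every bau_control whose cpu sits on this pnode ends up with descriptor_base pointing into that one block.
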
1381
1382/*
1383 * initialize the destination side's receiving buffers
1384 * entered for each uvhub in the partition
1385 * - node is first node (kernel memory notion) on the uvhub
1386 * - pnode is the uvhub's physical identifier
1387 */
1388static void
1389uv_payload_queue_init(int node, int pnode)
1390{
1391 int pn;
1392 int cpu;
1393 char *cp;
1394 unsigned long pa;
1395 struct bau_payload_queue_entry *pqp;
1396 struct bau_payload_queue_entry *pqp_malloc;
1397 struct bau_control *bcp;
1398
1399 pqp = (struct bau_payload_queue_entry *) kmalloc_node(
1400 (DEST_Q_SIZE + 1) * sizeof(struct bau_payload_queue_entry),
1401 GFP_KERNEL, node);
1402 BUG_ON(!pqp);
1403 pqp_malloc = pqp;
1404
1405 cp = (char *)pqp + 31;
1406 pqp = (struct bau_payload_queue_entry *)(((unsigned long)cp >> 5) << 5);
1407
1408 for_each_present_cpu(cpu) {
1409 if (pnode != uv_cpu_to_pnode(cpu))
1410 continue;
1411 /* for every cpu on this pnode: */
1412 bcp = &per_cpu(bau_control, cpu);
1413 bcp->va_queue_first = pqp;
1414 bcp->bau_msg_head = pqp;
1415 bcp->va_queue_last = pqp + (DEST_Q_SIZE - 1);
1416 }
1417 /*
1418 * need the pnode of where the memory was really allocated
1419 */
1420 pa = uv_gpa(pqp);
1421 pn = pa >> uv_nshift;
1422 uv_write_global_mmr64(pnode,
1423 UVH_LB_BAU_INTD_PAYLOAD_QUEUE_FIRST,
1424 ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) |
1425 uv_physnodeaddr(pqp));
1426 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_TAIL,
1427 uv_physnodeaddr(pqp));
1428 uv_write_global_mmr64(pnode, UVH_LB_BAU_INTD_PAYLOAD_QUEUE_LAST,
1429 (unsigned long)
1430 uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1)));
1431 /* in effect, all msg_type's are set to MSG_NOOP */
1432 memset(pqp, 0, sizeof(struct bau_payload_queue_entry) * DEST_Q_SIZE);
1433}
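
The pointer arithmetic near the top of this function (add 31, then shift right and left by 5) rounds the kmalloc'd queue up to a 32-byte boundary, with the unaligned original kept in pqp_malloc. A standalone sketch of the idiom and its equivalent bitmask form:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static uintptr_t align32(uintptr_t addr)
{
	return ((addr + 31) >> 5) << 5;		/* same as (addr + 31) & ~31 */
}

int main(void)
{
	uintptr_t a;

	for (a = 0; a < 4096; a++)
		assert(align32(a) == ((a + 31) & ~(uintptr_t)31));
	printf("align32(0x1001) = 0x%lx\n", (unsigned long)align32(0x1001));
	return 0;
}
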
1434
1435/*
1436 * Initialization of each UV hub's structures
1437 */
1438static void __init uv_init_uvhub(int uvhub, int vector)
1439{
1440 int node;
1441 int pnode;
1442 unsigned long apicid;
1443
1444 node = uvhub_to_first_node(uvhub);
1445 pnode = uv_blade_to_pnode(uvhub);
1446 uv_activation_descriptor_init(node, pnode);
1447 uv_payload_queue_init(node, pnode);
1448 /*
1449 * the below initialization can't be in firmware because the
1450 * messaging IRQ will be determined by the OS
1451 */
1452 apicid = uvhub_to_first_apicid(uvhub);
1453 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
1454 ((apicid << 32) | vector));
1455}
1456
1457/*
1458 * We will set BAU_MISC_CONTROL with a timeout period.
1459 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1460 * So the destination timeout period has to be calculated from them.
1461 */
1462static int
1463calculate_destination_timeout(void)
1464{
1465 unsigned long mmr_image;
1466 int mult1;
1467 int mult2;
1468 int index;
1469 int base;
1470 int ret;
1471 unsigned long ts_ns;
1472
1473 mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1474 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1475 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1476 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1477 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1478 base = timeout_base_ns[index];
1479 ts_ns = base * mult1 * mult2;
1480 ret = ts_ns / 1000;
1481 return ret;
1482}
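
A worked example with purely hypothetical register contents, since timeout_base_ns[] and the MMR fields are not visible in this excerpt: with base = 800 ns, mult1 = 10 and mult2 = 16, ts_ns = 800 * 10 * 16 = 128,000 ns, so the routine would return 128 microseconds.
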
1483
1484/*
1485 * initialize the bau_control structure for each cpu
1486 */
1487static void __init uv_init_per_cpu(int nuvhubs)
1488{
1489 int i;
1490 int cpu;
1491 int pnode;
1492 int uvhub;
1493 int have_hmaster;
1494 short socket = 0;
1495 unsigned short socket_mask;
1496 unsigned char *uvhub_mask;
1497 struct bau_control *bcp;
1498 struct uvhub_desc *bdp;
1499 struct socket_desc *sdp;
1500 struct bau_control *hmaster = NULL;
1501 struct bau_control *smaster = NULL;
1502 struct socket_desc {
1503 short num_cpus;
1504 short cpu_number[16];
1505 };
1506 struct uvhub_desc {
1507 unsigned short socket_mask;
1508 short num_cpus;
1509 short uvhub;
1510 short pnode;
1511 struct socket_desc socket[2];
1512 };
1513 struct uvhub_desc *uvhub_descs;
1514
1515 timeout_us = calculate_destination_timeout();
1516
1517 uvhub_descs = (struct uvhub_desc *)
1518 kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1519 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1520 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1521 for_each_present_cpu(cpu) {
1522 bcp = &per_cpu(bau_control, cpu);
1523 memset(bcp, 0, sizeof(struct bau_control));
1524 pnode = uv_cpu_hub_info(cpu)->pnode;
1525 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1526 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1527 bdp = &uvhub_descs[uvhub];
1528 bdp->num_cpus++;
1529 bdp->uvhub = uvhub;
1530 bdp->pnode = pnode;
1531 /* kludge: 'assuming' one node per socket, and assuming that
1532 disabling a socket just leaves a gap in node numbers */
1533 socket = (cpu_to_node(cpu) & 1);
1534 bdp->socket_mask |= (1 << socket);
1535 sdp = &bdp->socket[socket];
1536 sdp->cpu_number[sdp->num_cpus] = cpu;
1537 sdp->num_cpus++;
1538 }
1539 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1540 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1541 continue;
1542 have_hmaster = 0;
1543 bdp = &uvhub_descs[uvhub];
1544 socket_mask = bdp->socket_mask;
1545 socket = 0;
1546 while (socket_mask) {
1547 if (!(socket_mask & 1))
1548 goto nextsocket;
1549 sdp = &bdp->socket[socket];
1550 for (i = 0; i < sdp->num_cpus; i++) {
1551 cpu = sdp->cpu_number[i];
1552 bcp = &per_cpu(bau_control, cpu);
1553 bcp->cpu = cpu;
1554 if (i == 0) {
1555 smaster = bcp;
1556 if (!have_hmaster) {
1557 have_hmaster++;
1558 hmaster = bcp;
1559 }
1560 }
1561 bcp->cpus_in_uvhub = bdp->num_cpus;
1562 bcp->cpus_in_socket = sdp->num_cpus;
1563 bcp->socket_master = smaster;
1564 bcp->uvhub = bdp->uvhub;
1565 bcp->uvhub_master = hmaster;
1566 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->
1567 blade_processor_id;
1568 }
1569nextsocket:
1570 socket++;
1571 socket_mask = (socket_mask >> 1);
1572 }
1573 }
1574 kfree(uvhub_descs);
1575 kfree(uvhub_mask);
1576 for_each_present_cpu(cpu) {
1577 bcp = &per_cpu(bau_control, cpu);
1578 bcp->baudisabled = 0;
1579 bcp->statp = &per_cpu(ptcstats, cpu);
1580 /* time interval to catch a hardware stay-busy bug */
1581 bcp->timeout_interval = microsec_2_cycles(2*timeout_us);
1582 bcp->max_bau_concurrent = max_bau_concurrent;
1583 bcp->max_bau_concurrent_constant = max_bau_concurrent;
1584 bcp->plugged_delay = plugged_delay;
1585 bcp->plugsb4reset = plugsb4reset;
1586 bcp->timeoutsb4reset = timeoutsb4reset;
1587 bcp->ipi_reset_limit = ipi_reset_limit;
1588 bcp->complete_threshold = complete_threshold;
1589 bcp->congested_response_us = congested_response_us;
1590 bcp->congested_reps = congested_reps;
1591 bcp->congested_period = congested_period;
1592 }
1593}
1594
1595/*
1596 * Initialization of BAU-related structures
1597 */
1598static int __init uv_bau_init(void)
1599{
1600 int uvhub;
1601 int pnode;
1602 int nuvhubs;
1603 int cur_cpu;
1604 int vector;
1605 unsigned long mmr;
1606
1607 if (!is_uv_system())
1608 return 0;
1609
1610 if (nobau)
1611 return 0;
1612
1613 for_each_possible_cpu(cur_cpu)
1614 zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu),
1615 GFP_KERNEL, cpu_to_node(cur_cpu));
1616
1617 uv_nshift = uv_hub_info->m_val;
1618 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1619 nuvhubs = uv_num_possible_blades();
1620 spin_lock_init(&disable_lock);
1621 congested_cycles = microsec_2_cycles(congested_response_us);
1622
1623 uv_init_per_cpu(nuvhubs);
1624
1625 uv_partition_base_pnode = 0x7fffffff;
1626 for (uvhub = 0; uvhub < nuvhubs; uvhub++)
1627 if (uv_blade_nr_possible_cpus(uvhub) &&
1628 (uv_blade_to_pnode(uvhub) < uv_partition_base_pnode))
1629 uv_partition_base_pnode = uv_blade_to_pnode(uvhub);
1630
1631 vector = UV_BAU_MESSAGE;
1632 for_each_possible_blade(uvhub)
1633 if (uv_blade_nr_possible_cpus(uvhub))
1634 uv_init_uvhub(uvhub, vector);
1635
1636 uv_enable_timeouts();
1637 alloc_intr_gate(vector, uv_bau_message_intr1);
1638
1639 for_each_possible_blade(uvhub) {
1640 if (uv_blade_nr_possible_cpus(uvhub)) {
1641 pnode = uv_blade_to_pnode(uvhub);
1642 /* INIT the bau */
1643 uv_write_global_mmr64(pnode,
1644 UVH_LB_BAU_SB_ACTIVATION_CONTROL,
1645 ((unsigned long)1 << 63));
1646 mmr = 1; /* should be 1 to broadcast to both sockets */
1647 uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST,
1648 mmr);
1649 }
1650 }
1651
1652 return 0;
1653}
1654core_initcall(uv_bau_init);
1655fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/topology.c b/arch/x86/kernel/topology.c
index 7e4515957a1c..8927486a4649 100644
--- a/arch/x86/kernel/topology.c
+++ b/arch/x86/kernel/topology.c
@@ -39,7 +39,7 @@ int __ref arch_register_cpu(int num)
39 /* 39 /*
40 * CPU0 cannot be offlined due to several 40 * CPU0 cannot be offlined due to several
41 * restrictions and assumptions in kernel. This basically 41 * restrictions and assumptions in kernel. This basically
42 * doesnt add a control file, one cannot attempt to offline 42 * doesn't add a control file, one cannot attempt to offline
43 * BSP. 43 * BSP.
44 * 44 *
45 * Also certain PCI quirks require not to enable hotplug control 45 * Also certain PCI quirks require not to enable hotplug control
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index e2a595257390..a91ae7709b49 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -1,56 +1,42 @@
1#include <linux/io.h> 1#include <linux/io.h>
2#include <linux/memblock.h>
2 3
3#include <asm/trampoline.h> 4#include <asm/trampoline.h>
5#include <asm/cacheflush.h>
4#include <asm/pgtable.h> 6#include <asm/pgtable.h>
5#include <asm/e820.h>
6 7
7#if defined(CONFIG_X86_64) && defined(CONFIG_ACPI_SLEEP) 8unsigned char *x86_trampoline_base;
8#define __trampinit
9#define __trampinitdata
10#else
11#define __trampinit __cpuinit
12#define __trampinitdata __cpuinitdata
13#endif
14 9
15/* ready for x86_64 and x86 */ 10void __init setup_trampolines(void)
16unsigned char *__trampinitdata trampoline_base;
17
18void __init reserve_trampoline_memory(void)
19{ 11{
20 unsigned long mem; 12 phys_addr_t mem;
13 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
21 14
22 /* Has to be in very low memory so we can execute real-mode AP code. */ 15 /* Has to be in very low memory so we can execute real-mode AP code. */
23 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE); 16 mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE);
24 if (mem == -1L) 17 if (mem == MEMBLOCK_ERROR)
25 panic("Cannot allocate trampoline\n"); 18 panic("Cannot allocate trampoline\n");
26 19
27 trampoline_base = __va(mem); 20 x86_trampoline_base = __va(mem);
28 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE"); 21 memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE");
22
23 printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n",
24 x86_trampoline_base, (unsigned long long)mem, size);
25
26 memcpy(x86_trampoline_base, x86_trampoline_start, size);
29} 27}
30 28
31/* 29/*
32 * Currently trivial. Write the real->protected mode 30 * setup_trampolines() gets called very early, to guarantee the
33 * bootstrap into the page concerned. The caller 31 * availability of low memory. This is before the proper kernel page
34 * has made sure it's suitably aligned. 32 * tables are set up, so we cannot set page permissions in that
33 * function. Thus, we use an arch_initcall instead.
35 */ 34 */
36unsigned long __trampinit setup_trampoline(void) 35static int __init configure_trampolines(void)
37{ 36{
38 memcpy(trampoline_base, trampoline_data, TRAMPOLINE_SIZE); 37 size_t size = PAGE_ALIGN(x86_trampoline_end - x86_trampoline_start);
39 return virt_to_phys(trampoline_base);
40}
41 38
42void __init setup_trampoline_page_table(void) 39 set_memory_x((unsigned long)x86_trampoline_base, size >> PAGE_SHIFT);
43{ 40 return 0;
44#ifdef CONFIG_X86_32
45 /* Copy kernel address range */
46 clone_pgd_range(trampoline_pg_dir + KERNEL_PGD_BOUNDARY,
47 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
48 KERNEL_PGD_PTRS);
49
50 /* Initialize low mappings */
51 clone_pgd_range(trampoline_pg_dir,
52 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
53 min_t(unsigned long, KERNEL_PGD_PTRS,
54 KERNEL_PGD_BOUNDARY));
55#endif
56} 41}
42arch_initcall(configure_trampolines);
diff --git a/arch/x86/kernel/trampoline_32.S b/arch/x86/kernel/trampoline_32.S
index 8508237e8e43..451c0a7ef7fd 100644
--- a/arch/x86/kernel/trampoline_32.S
+++ b/arch/x86/kernel/trampoline_32.S
@@ -32,9 +32,11 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/page_types.h> 33#include <asm/page_types.h>
34 34
35/* We can free up trampoline after bootup if cpu hotplug is not supported. */ 35#ifdef CONFIG_SMP
36__CPUINITRODATA 36
37.code16 37 .section ".x86_trampoline","a"
38 .balign PAGE_SIZE
39 .code16
38 40
39ENTRY(trampoline_data) 41ENTRY(trampoline_data)
40r_base = . 42r_base = .
@@ -44,7 +46,7 @@ r_base = .
44 46
45 cli # We should be safe anyway 47 cli # We should be safe anyway
46 48
47 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
48 # write marker for master knows we're running 50 # write marker for master knows we're running
49 51
50 /* GDT tables in non default location kernel can be beyond 16MB and 52 /* GDT tables in non default location kernel can be beyond 16MB and
@@ -72,5 +74,10 @@ boot_idt_descr:
72 .word 0 # idt limit = 0 74 .word 0 # idt limit = 0
73 .long 0 # idt base = 0L 75 .long 0 # idt base = 0L
74 76
77ENTRY(trampoline_status)
78 .long 0
79
75.globl trampoline_end 80.globl trampoline_end
76trampoline_end: 81trampoline_end:
82
83#endif /* CONFIG_SMP */
diff --git a/arch/x86/kernel/trampoline_64.S b/arch/x86/kernel/trampoline_64.S
index 3af2dff58b21..09ff51799e96 100644
--- a/arch/x86/kernel/trampoline_64.S
+++ b/arch/x86/kernel/trampoline_64.S
@@ -32,13 +32,9 @@
32#include <asm/segment.h> 32#include <asm/segment.h>
33#include <asm/processor-flags.h> 33#include <asm/processor-flags.h>
34 34
35#ifdef CONFIG_ACPI_SLEEP 35 .section ".x86_trampoline","a"
36.section .rodata, "a", @progbits 36 .balign PAGE_SIZE
37#else 37 .code16
38/* We can free up the trampoline after bootup if cpu hotplug is not supported. */
39__CPUINITRODATA
40#endif
41.code16
42 38
43ENTRY(trampoline_data) 39ENTRY(trampoline_data)
44r_base = . 40r_base = .
@@ -50,7 +46,7 @@ r_base = .
50 mov %ax, %ss 46 mov %ax, %ss
51 47
52 48
53 movl $0xA5A5A5A5, trampoline_data - r_base 49 movl $0xA5A5A5A5, trampoline_status - r_base
54 # write marker for master knows we're running 50 # write marker for master knows we're running
55 51
56 # Setup stack 52 # Setup stack
@@ -64,10 +60,13 @@ r_base = .
64 movzx %ax, %esi # Find the 32bit trampoline location 60 movzx %ax, %esi # Find the 32bit trampoline location
65 shll $4, %esi 61 shll $4, %esi
66 62
67 # Fixup the vectors 63 # Fixup the absolute vectors
68 addl %esi, startup_32_vector - r_base 64 leal (startup_32 - r_base)(%esi), %eax
69 addl %esi, startup_64_vector - r_base 65 movl %eax, startup_32_vector - r_base
70 addl %esi, tgdt + 2 - r_base # Fixup the gdt pointer 66 leal (startup_64 - r_base)(%esi), %eax
67 movl %eax, startup_64_vector - r_base
68 leal (tgdt - r_base)(%esi), %eax
69 movl %eax, (tgdt + 2 - r_base)
71 70
72 /* 71 /*
73 * GDT tables in non default location kernel can be beyond 16MB and 72 * GDT tables in non default location kernel can be beyond 16MB and
@@ -127,8 +126,9 @@ startup_64:
127no_longmode: 126no_longmode:
128 hlt 127 hlt
129 jmp no_longmode 128 jmp no_longmode
130#include "verify_cpu_64.S" 129#include "verify_cpu.S"
131 130
131 .balign 4
132 # Careful these need to be in the same 64K segment as the above; 132 # Careful these need to be in the same 64K segment as the above;
133tidt: 133tidt:
134 .word 0 # idt limit = 0 134 .word 0 # idt limit = 0
@@ -156,6 +156,10 @@ startup_64_vector:
156 .long startup_64 - r_base 156 .long startup_64 - r_base
157 .word __KERNEL_CS, 0 157 .word __KERNEL_CS, 0
158 158
159 .balign 4
160ENTRY(trampoline_status)
161 .long 0
162
159trampoline_stack: 163trampoline_stack:
160 .org 0x1000 164 .org 0x1000
161trampoline_stack_end: 165trampoline_stack_end:
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 60788dee0f8a..b9b67166f9de 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -83,6 +83,13 @@ EXPORT_SYMBOL_GPL(used_vectors);
83 83
84static int ignore_nmis; 84static int ignore_nmis;
85 85
86int unknown_nmi_panic;
87/*
88 * Prevent NMI reason port (0x61) being accessed simultaneously, can
89 * only be used in NMI handler.
90 */
91static DEFINE_RAW_SPINLOCK(nmi_reason_lock);
92
86static inline void conditional_sti(struct pt_regs *regs) 93static inline void conditional_sti(struct pt_regs *regs)
87{ 94{
88 if (regs->flags & X86_EFLAGS_IF) 95 if (regs->flags & X86_EFLAGS_IF)
@@ -300,16 +307,23 @@ gp_in_kernel:
300 die("general protection fault", regs, error_code); 307 die("general protection fault", regs, error_code);
301} 308}
302 309
303static notrace __kprobes void 310static int __init setup_unknown_nmi_panic(char *str)
304mem_parity_error(unsigned char reason, struct pt_regs *regs)
305{ 311{
306 printk(KERN_EMERG 312 unknown_nmi_panic = 1;
307 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 313 return 1;
308 reason, smp_processor_id()); 314}
315__setup("unknown_nmi_panic", setup_unknown_nmi_panic);
309 316
310 printk(KERN_EMERG 317static notrace __kprobes void
311 "You have some hardware problem, likely on the PCI bus.\n"); 318pci_serr_error(unsigned char reason, struct pt_regs *regs)
319{
320 pr_emerg("NMI: PCI system error (SERR) for reason %02x on CPU %d.\n",
321 reason, smp_processor_id());
312 322
323 /*
324 * On some machines, PCI SERR line is used to report memory
325 * errors. EDAC makes use of it.
326 */
313#if defined(CONFIG_EDAC) 327#if defined(CONFIG_EDAC)
314 if (edac_handler_set()) { 328 if (edac_handler_set()) {
315 edac_atomic_assert_error(); 329 edac_atomic_assert_error();
@@ -320,11 +334,11 @@ mem_parity_error(unsigned char reason, struct pt_regs *regs)
320 if (panic_on_unrecovered_nmi) 334 if (panic_on_unrecovered_nmi)
321 panic("NMI: Not continuing"); 335 panic("NMI: Not continuing");
322 336
323 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 337 pr_emerg("Dazed and confused, but trying to continue\n");
324 338
325 /* Clear and disable the memory parity error line. */ 339 /* Clear and disable the PCI SERR error line. */
326 reason = (reason & 0xf) | 4; 340 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_SERR;
327 outb(reason, 0x61); 341 outb(reason, NMI_REASON_PORT);
328} 342}
329 343
330static notrace __kprobes void 344static notrace __kprobes void
@@ -332,22 +346,26 @@ io_check_error(unsigned char reason, struct pt_regs *regs)
332{ 346{
333 unsigned long i; 347 unsigned long i;
334 348
335 printk(KERN_EMERG "NMI: IOCK error (debug interrupt?)\n"); 349 pr_emerg(
350 "NMI: IOCK error (debug interrupt?) for reason %02x on CPU %d.\n",
351 reason, smp_processor_id());
336 show_registers(regs); 352 show_registers(regs);
337 353
338 if (panic_on_io_nmi) 354 if (panic_on_io_nmi)
339 panic("NMI IOCK error: Not continuing"); 355 panic("NMI IOCK error: Not continuing");
340 356
341 /* Re-enable the IOCK line, wait for a few seconds */ 357 /* Re-enable the IOCK line, wait for a few seconds */
342 reason = (reason & 0xf) | 8; 358 reason = (reason & NMI_REASON_CLEAR_MASK) | NMI_REASON_CLEAR_IOCHK;
343 outb(reason, 0x61); 359 outb(reason, NMI_REASON_PORT);
344 360
345 i = 2000; 361 i = 20000;
346 while (--i) 362 while (--i) {
347 udelay(1000); 363 touch_nmi_watchdog();
364 udelay(100);
365 }
348 366
349 reason &= ~8; 367 reason &= ~NMI_REASON_CLEAR_IOCHK;
350 outb(reason, 0x61); 368 outb(reason, NMI_REASON_PORT);
351} 369}
352 370
353static notrace __kprobes void 371static notrace __kprobes void
@@ -366,69 +384,50 @@ unknown_nmi_error(unsigned char reason, struct pt_regs *regs)
366 return; 384 return;
367 } 385 }
368#endif 386#endif
369 printk(KERN_EMERG 387 pr_emerg("Uhhuh. NMI received for unknown reason %02x on CPU %d.\n",
370 "Uhhuh. NMI received for unknown reason %02x on CPU %d.\n", 388 reason, smp_processor_id());
371 reason, smp_processor_id());
372 389
373 printk(KERN_EMERG "Do you have a strange power saving mode enabled?\n"); 390 pr_emerg("Do you have a strange power saving mode enabled?\n");
374 if (panic_on_unrecovered_nmi) 391 if (unknown_nmi_panic || panic_on_unrecovered_nmi)
375 panic("NMI: Not continuing"); 392 panic("NMI: Not continuing");
376 393
377 printk(KERN_EMERG "Dazed and confused, but trying to continue\n"); 394 pr_emerg("Dazed and confused, but trying to continue\n");
378} 395}
379 396
380static notrace __kprobes void default_do_nmi(struct pt_regs *regs) 397static notrace __kprobes void default_do_nmi(struct pt_regs *regs)
381{ 398{
382 unsigned char reason = 0; 399 unsigned char reason = 0;
383 int cpu;
384 400
385 cpu = smp_processor_id(); 401 /*
386 402 * CPU-specific NMI must be processed before non-CPU-specific
387 /* Only the BSP gets external NMIs from the system. */ 403 * NMI, otherwise we may lose it, because the CPU-specific
388 if (!cpu) 404 * NMI can not be detected/processed on other CPUs.
389 reason = get_nmi_reason(); 405 */
390 406 if (notify_die(DIE_NMI, "nmi", regs, 0, 2, SIGINT) == NOTIFY_STOP)
391 if (!(reason & 0xc0)) { 407 return;
392 if (notify_die(DIE_NMI_IPI, "nmi_ipi", regs, reason, 2, SIGINT)
393 == NOTIFY_STOP)
394 return;
395 408
396#ifdef CONFIG_X86_LOCAL_APIC 409 /* Non-CPU-specific NMI: NMI sources can be processed on any CPU */
397 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) 410 raw_spin_lock(&nmi_reason_lock);
398 == NOTIFY_STOP) 411 reason = get_nmi_reason();
399 return;
400 412
401#ifndef CONFIG_LOCKUP_DETECTOR 413 if (reason & NMI_REASON_MASK) {
414 if (reason & NMI_REASON_SERR)
415 pci_serr_error(reason, regs);
416 else if (reason & NMI_REASON_IOCHK)
417 io_check_error(reason, regs);
418#ifdef CONFIG_X86_32
402 /* 419 /*
403 * Ok, so this is none of the documented NMI sources, 420 * Reassert NMI in case it became active
404 * so it must be the NMI watchdog. 421 * meanwhile as it's edge-triggered:
405 */ 422 */
406 if (nmi_watchdog_tick(regs, reason)) 423 reassert_nmi();
407 return;
408 if (!do_nmi_callback(regs, cpu))
409#endif /* !CONFIG_LOCKUP_DETECTOR */
410 unknown_nmi_error(reason, regs);
411#else
412 unknown_nmi_error(reason, regs);
413#endif 424#endif
414 425 raw_spin_unlock(&nmi_reason_lock);
415 return; 426 return;
416 } 427 }
417 if (notify_die(DIE_NMI, "nmi", regs, reason, 2, SIGINT) == NOTIFY_STOP) 428 raw_spin_unlock(&nmi_reason_lock);
418 return;
419 429
420 /* AK: following checks seem to be broken on modern chipsets. FIXME */ 430 unknown_nmi_error(reason, regs);
421 if (reason & 0x80)
422 mem_parity_error(reason, regs);
423 if (reason & 0x40)
424 io_check_error(reason, regs);
425#ifdef CONFIG_X86_32
426 /*
427 * Reassert NMI in case it became active meanwhile
428 * as it's edge-triggered:
429 */
430 reassert_nmi();
431#endif
432} 431}
433 432
434dotraplinkage notrace __kprobes void 433dotraplinkage notrace __kprobes void
@@ -446,14 +445,12 @@ do_nmi(struct pt_regs *regs, long error_code)
446 445
447void stop_nmi(void) 446void stop_nmi(void)
448{ 447{
449 acpi_nmi_disable();
450 ignore_nmis++; 448 ignore_nmis++;
451} 449}
452 450
453void restart_nmi(void) 451void restart_nmi(void)
454{ 452{
455 ignore_nmis--; 453 ignore_nmis--;
456 acpi_nmi_enable();
457} 454}
458 455
459/* May run on IST stack. */ 456/* May run on IST stack. */
@@ -575,6 +572,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
575 if (regs->flags & X86_VM_MASK) { 572 if (regs->flags & X86_VM_MASK) {
576 handle_vm86_trap((struct kernel_vm86_regs *) regs, 573 handle_vm86_trap((struct kernel_vm86_regs *) regs,
577 error_code, 1); 574 error_code, 1);
575 preempt_conditional_cli(regs);
578 return; 576 return;
579 } 577 }
580 578
@@ -776,21 +774,10 @@ asmlinkage void math_state_restore(void)
776} 774}
777EXPORT_SYMBOL_GPL(math_state_restore); 775EXPORT_SYMBOL_GPL(math_state_restore);
778 776
779#ifndef CONFIG_MATH_EMULATION
780void math_emulate(struct math_emu_info *info)
781{
782 printk(KERN_EMERG
783 "math-emulation not enabled and no coprocessor found.\n");
784 printk(KERN_EMERG "killing %s.\n", current->comm);
785 force_sig(SIGFPE, current);
786 schedule();
787}
788#endif /* CONFIG_MATH_EMULATION */
789
790dotraplinkage void __kprobes 777dotraplinkage void __kprobes
791do_device_not_available(struct pt_regs *regs, long error_code) 778do_device_not_available(struct pt_regs *regs, long error_code)
792{ 779{
793#ifdef CONFIG_X86_32 780#ifdef CONFIG_MATH_EMULATION
794 if (read_cr0() & X86_CR0_EM) { 781 if (read_cr0() & X86_CR0_EM) {
795 struct math_emu_info info = { }; 782 struct math_emu_info info = { };
796 783
@@ -798,12 +785,12 @@ do_device_not_available(struct pt_regs *regs, long error_code)
798 785
799 info.regs = regs; 786 info.regs = regs;
800 math_emulate(&info); 787 math_emulate(&info);
801 } else { 788 return;
802 math_state_restore(); /* interrupts still off */
803 conditional_sti(regs);
804 } 789 }
805#else 790#endif
806 math_state_restore(); 791 math_state_restore(); /* interrupts still off */
792#ifdef CONFIG_X86_32
793 conditional_sti(regs);
807#endif 794#endif
808} 795}
809 796
@@ -881,18 +868,6 @@ void __init trap_init(void)
881#endif 868#endif
882 869
883#ifdef CONFIG_X86_32 870#ifdef CONFIG_X86_32
884 if (cpu_has_fxsr) {
885 printk(KERN_INFO "Enabling fast FPU save and restore... ");
886 set_in_cr4(X86_CR4_OSFXSR);
887 printk("done.\n");
888 }
889 if (cpu_has_xmm) {
890 printk(KERN_INFO
891 "Enabling unmasked SIMD FPU exception support... ");
892 set_in_cr4(X86_CR4_OSXMMEXCPT);
893 printk("done.\n");
894 }
895
896 set_system_trap_gate(SYSCALL_VECTOR, &system_call); 871 set_system_trap_gate(SYSCALL_VECTOR, &system_call);
897 set_bit(SYSCALL_VECTOR, used_vectors); 872 set_bit(SYSCALL_VECTOR, used_vectors);
898#endif 873#endif
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index 26a863a9c2a8..6cc6922262af 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -104,10 +104,14 @@ int __init notsc_setup(char *str)
104 104
105__setup("notsc", notsc_setup); 105__setup("notsc", notsc_setup);
106 106
107static int no_sched_irq_time;
108
107static int __init tsc_setup(char *str) 109static int __init tsc_setup(char *str)
108{ 110{
109 if (!strcmp(str, "reliable")) 111 if (!strcmp(str, "reliable"))
110 tsc_clocksource_reliable = 1; 112 tsc_clocksource_reliable = 1;
113 if (!strncmp(str, "noirqtime", 9))
114 no_sched_irq_time = 1;
111 return 1; 115 return 1;
112} 116}
113 117
@@ -423,7 +427,7 @@ unsigned long native_calibrate_tsc(void)
423 * the delta to the previous read. We keep track of the min 427 * the delta to the previous read. We keep track of the min
424 * and max values of that delta. The delta is mostly defined 428 * and max values of that delta. The delta is mostly defined
425 * by the IO time of the PIT access, so we can detect when a 429 * by the IO time of the PIT access, so we can detect when a
426 * SMI/SMM disturbance happend between the two reads. If the 430 * SMI/SMM disturbance happened between the two reads. If the
427 * maximum time is significantly larger than the minimum time, 431 * maximum time is significantly larger than the minimum time,
428 * then we discard the result and have another try. 432 * then we discard the result and have another try.
429 * 433 *
@@ -460,7 +464,7 @@ unsigned long native_calibrate_tsc(void)
460 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz); 464 tsc_pit_min = min(tsc_pit_min, tsc_pit_khz);
461 465
462 /* hpet or pmtimer available ? */ 466 /* hpet or pmtimer available ? */
463 if (!hpet && !ref1 && !ref2) 467 if (ref1 == ref2)
464 continue; 468 continue;
465 469
466 /* Check, whether the sampling was disturbed by an SMI */ 470 /* Check, whether the sampling was disturbed by an SMI */
@@ -655,7 +659,7 @@ void restore_sched_clock_state(void)
655 659
656 local_irq_save(flags); 660 local_irq_save(flags);
657 661
658 __get_cpu_var(cyc2ns_offset) = 0; 662 __this_cpu_write(cyc2ns_offset, 0);
659 offset = cyc2ns_suspend - sched_clock(); 663 offset = cyc2ns_suspend - sched_clock();
660 664
661 for_each_possible_cpu(cpu) 665 for_each_possible_cpu(cpu)
@@ -759,25 +763,6 @@ static cycle_t read_tsc(struct clocksource *cs)
759 ret : clocksource_tsc.cycle_last; 763 ret : clocksource_tsc.cycle_last;
760} 764}
761 765
762#ifdef CONFIG_X86_64
763static cycle_t __vsyscall_fn vread_tsc(void)
764{
765 cycle_t ret;
766
767 /*
768 * Surround the RDTSC by barriers, to make sure it's not
769 * speculated to outside the seqlock critical section and
770 * does not cause time warps:
771 */
772 rdtsc_barrier();
773 ret = (cycle_t)vget_cycles();
774 rdtsc_barrier();
775
776 return ret >= __vsyscall_gtod_data.clock.cycle_last ?
777 ret : __vsyscall_gtod_data.clock.cycle_last;
778}
779#endif
780
781static void resume_tsc(struct clocksource *cs) 766static void resume_tsc(struct clocksource *cs)
782{ 767{
783 clocksource_tsc.cycle_last = 0; 768 clocksource_tsc.cycle_last = 0;
@@ -801,6 +786,7 @@ void mark_tsc_unstable(char *reason)
801 if (!tsc_unstable) { 786 if (!tsc_unstable) {
802 tsc_unstable = 1; 787 tsc_unstable = 1;
803 sched_clock_stable = 0; 788 sched_clock_stable = 0;
789 disable_sched_clock_irqtime();
804 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 790 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
805 /* Change only the rating, when not registered */ 791 /* Change only the rating, when not registered */
806 if (clocksource_tsc.mult) 792 if (clocksource_tsc.mult)
@@ -867,6 +853,9 @@ __cpuinit int unsynchronized_tsc(void)
867 853
868 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 854 if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
869 return 0; 855 return 0;
856
857 if (tsc_clocksource_reliable)
858 return 0;
870 /* 859 /*
871 * Intel systems are normally all synchronized. 860 * Intel systems are normally all synchronized.
872 * Exceptions must mark TSC as unstable: 861 * Exceptions must mark TSC as unstable:
@@ -874,14 +863,92 @@ __cpuinit int unsynchronized_tsc(void)
874 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) { 863 if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) {
875 /* assume multi socket systems are not synchronized: */ 864 /* assume multi socket systems are not synchronized: */
876 if (num_possible_cpus() > 1) 865 if (num_possible_cpus() > 1)
877 tsc_unstable = 1; 866 return 1;
878 } 867 }
879 868
880 return tsc_unstable; 869 return 0;
881} 870}
882 871
883static void __init init_tsc_clocksource(void) 872
873static void tsc_refine_calibration_work(struct work_struct *work);
874static DECLARE_DELAYED_WORK(tsc_irqwork, tsc_refine_calibration_work);
875/**
876 * tsc_refine_calibration_work - Further refine tsc freq calibration
877 * @work - ignored.
878 *
879 * This functions uses delayed work over a period of a
880 * second to further refine the TSC freq value. Since this is
881 * timer based, instead of loop based, we don't block the boot
882 * process while this longer calibration is done.
883 *
884 * If there are any calibration anomalies (too many SMIs, etc),
885 * or the refined calibration is off by 1% of the fast early
886 * calibration, we throw out the new calibration and use the
887 * early calibration.
888 */
889static void tsc_refine_calibration_work(struct work_struct *work)
884{ 890{
891 static u64 tsc_start = -1, ref_start;
892 static int hpet;
893 u64 tsc_stop, ref_stop, delta;
894 unsigned long freq;
895
896 /* Don't bother refining TSC on unstable systems */
897 if (check_tsc_unstable())
898 goto out;
899
900 /*
901 * Since the work is started early in boot, we may be
902 * delayed the first time we expire. So set the workqueue
903 * again once we know timers are working.
904 */
905 if (tsc_start == -1) {
906 /*
907 * Only set hpet once, to avoid mixing hardware
908 * if the hpet becomes enabled later.
909 */
910 hpet = is_hpet_enabled();
911 schedule_delayed_work(&tsc_irqwork, HZ);
912 tsc_start = tsc_read_refs(&ref_start, hpet);
913 return;
914 }
915
916 tsc_stop = tsc_read_refs(&ref_stop, hpet);
917
918 /* hpet or pmtimer available ? */
919 if (ref_start == ref_stop)
920 goto out;
921
922 /* Check, whether the sampling was disturbed by an SMI */
923 if (tsc_start == ULLONG_MAX || tsc_stop == ULLONG_MAX)
924 goto out;
925
926 delta = tsc_stop - tsc_start;
927 delta *= 1000000LL;
928 if (hpet)
929 freq = calc_hpet_ref(delta, ref_start, ref_stop);
930 else
931 freq = calc_pmtimer_ref(delta, ref_start, ref_stop);
932
933 /* Make sure we're within 1% */
934 if (abs(tsc_khz - freq) > tsc_khz/100)
935 goto out;
936
937 tsc_khz = freq;
938 printk(KERN_INFO "Refined TSC clocksource calibration: "
939 "%lu.%03lu MHz.\n", (unsigned long)tsc_khz / 1000,
940 (unsigned long)tsc_khz % 1000);
941
942out:
943 clocksource_register_khz(&clocksource_tsc, tsc_khz);
944}
945
946
947static int __init init_tsc_clocksource(void)
948{
949 if (!cpu_has_tsc || tsc_disabled > 0 || !tsc_khz)
950 return 0;
951
885 if (tsc_clocksource_reliable) 952 if (tsc_clocksource_reliable)
886 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY; 953 clocksource_tsc.flags &= ~CLOCK_SOURCE_MUST_VERIFY;
887 /* lower the rating if we already know its unstable: */ 954 /* lower the rating if we already know its unstable: */
@@ -889,62 +956,14 @@ static void __init init_tsc_clocksource(void)
889 clocksource_tsc.rating = 0; 956 clocksource_tsc.rating = 0;
890 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS; 957 clocksource_tsc.flags &= ~CLOCK_SOURCE_IS_CONTINUOUS;
891 } 958 }
892 clocksource_register_khz(&clocksource_tsc, tsc_khz); 959 schedule_delayed_work(&tsc_irqwork, 0);
960 return 0;
893} 961}
894
895#ifdef CONFIG_X86_64
896/* 962/*
897 * calibrate_cpu is used on systems with fixed rate TSCs to determine 963 * We use device_initcall here, to ensure we run after the hpet
898 * processor frequency 964 * is fully initialized, which may occur at fs_initcall time.
899 */ 965 */
900#define TICK_COUNT 100000000 966device_initcall(init_tsc_clocksource);
901static unsigned long __init calibrate_cpu(void)
902{
903 int tsc_start, tsc_now;
904 int i, no_ctr_free;
905 unsigned long evntsel3 = 0, pmc3 = 0, pmc_now = 0;
906 unsigned long flags;
907
908 for (i = 0; i < 4; i++)
909 if (avail_to_resrv_perfctr_nmi_bit(i))
910 break;
911 no_ctr_free = (i == 4);
912 if (no_ctr_free) {
913 WARN(1, KERN_WARNING "Warning: AMD perfctrs busy ... "
914 "cpu_khz value may be incorrect.\n");
915 i = 3;
916 rdmsrl(MSR_K7_EVNTSEL3, evntsel3);
917 wrmsrl(MSR_K7_EVNTSEL3, 0);
918 rdmsrl(MSR_K7_PERFCTR3, pmc3);
919 } else {
920 reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i);
921 reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
922 }
923 local_irq_save(flags);
924 /* start measuring cycles, incrementing from 0 */
925 wrmsrl(MSR_K7_PERFCTR0 + i, 0);
926 wrmsrl(MSR_K7_EVNTSEL0 + i, 1 << 22 | 3 << 16 | 0x76);
927 rdtscl(tsc_start);
928 do {
929 rdmsrl(MSR_K7_PERFCTR0 + i, pmc_now);
930 tsc_now = get_cycles();
931 } while ((tsc_now - tsc_start) < TICK_COUNT);
932
933 local_irq_restore(flags);
934 if (no_ctr_free) {
935 wrmsrl(MSR_K7_EVNTSEL3, 0);
936 wrmsrl(MSR_K7_PERFCTR3, pmc3);
937 wrmsrl(MSR_K7_EVNTSEL3, evntsel3);
938 } else {
939 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
940 release_evntsel_nmi(MSR_K7_EVNTSEL0 + i);
941 }
942
943 return pmc_now * tsc_khz / (tsc_now - tsc_start);
944}
945#else
946static inline unsigned long calibrate_cpu(void) { return cpu_khz; }
947#endif
948 967
949void __init tsc_init(void) 968void __init tsc_init(void)
950{ 969{
@@ -964,10 +983,6 @@ void __init tsc_init(void)
964 return; 983 return;
965 } 984 }
966 985
967 if (cpu_has(&boot_cpu_data, X86_FEATURE_CONSTANT_TSC) &&
968 (boot_cpu_data.x86_vendor == X86_VENDOR_AMD))
969 cpu_khz = calibrate_cpu();
970
971 printk("Detected %lu.%03lu MHz processor.\n", 986 printk("Detected %lu.%03lu MHz processor.\n",
972 (unsigned long)cpu_khz / 1000, 987 (unsigned long)cpu_khz / 1000,
973 (unsigned long)cpu_khz % 1000); 988 (unsigned long)cpu_khz % 1000);
@@ -987,6 +1002,9 @@ void __init tsc_init(void)
987 /* now allow native_sched_clock() to use rdtsc */ 1002 /* now allow native_sched_clock() to use rdtsc */
988 tsc_disabled = 0; 1003 tsc_disabled = 0;
989 1004
1005 if (!no_sched_irq_time)
1006 enable_sched_clock_irqtime();
1007
990 lpj = ((u64)tsc_khz * 1000); 1008 lpj = ((u64)tsc_khz * 1000);
991 do_div(lpj, HZ); 1009 do_div(lpj, HZ);
992 lpj_fine = lpj; 1010 lpj_fine = lpj;
@@ -999,6 +1017,5 @@ void __init tsc_init(void)
999 mark_tsc_unstable("TSCs unsynchronized"); 1017 mark_tsc_unstable("TSCs unsynchronized");
1000 1018
1001 check_system_tsc_reliable(); 1019 check_system_tsc_reliable();
1002 init_tsc_clocksource();
1003} 1020}
1004 1021
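
The refinement added above is only accepted when it agrees with the early calibration to within 1% (abs(tsc_khz - freq) > tsc_khz/100 discards it). For example, with a hypothetical early value of tsc_khz = 2,933,000 kHz, a refined freq is kept only if it falls within +/- 29,330 kHz of that figure; otherwise the boot-time value stands and the clocksource is registered with it.
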
diff --git a/arch/x86/kernel/verify_cpu_64.S b/arch/x86/kernel/verify_cpu.S
index 56a8c2a867d9..b9242bacbe59 100644
--- a/arch/x86/kernel/verify_cpu_64.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -7,6 +7,7 @@
7 * Copyright (c) 2007 Andi Kleen (ak@suse.de) 7 * Copyright (c) 2007 Andi Kleen (ak@suse.de)
8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com) 8 * Copyright (c) 2007 Eric Biederman (ebiederm@xmission.com)
9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com) 9 * Copyright (c) 2007 Vivek Goyal (vgoyal@in.ibm.com)
10 * Copyright (c) 2010 Kees Cook (kees.cook@canonical.com)
10 * 11 *
11 * This source code is licensed under the GNU General Public License, 12 * This source code is licensed under the GNU General Public License,
12 * Version 2. See the file COPYING for more details. 13 * Version 2. See the file COPYING for more details.
@@ -14,18 +15,17 @@
14 * This is a common code for verification whether CPU supports 15 * This is a common code for verification whether CPU supports
15 * long mode and SSE or not. It is not called directly instead this 16 * long mode and SSE or not. It is not called directly instead this
16 * file is included at various places and compiled in that context. 17 * file is included at various places and compiled in that context.
17 * Following are the current usage. 18 * This file is expected to run in 32bit code. Currently:
18 * 19 *
19 * This file is included by both 16bit and 32bit code. 20 * arch/x86/boot/compressed/head_64.S: Boot cpu verification
21 * arch/x86/kernel/trampoline_64.S: secondary processor verification
22 * arch/x86/kernel/head_32.S: processor startup
20 * 23 *
21 * arch/x86_64/boot/setup.S : Boot cpu verification (16bit) 24 * verify_cpu, returns the status of longmode and SSE in register %eax.
22 * arch/x86_64/boot/compressed/head.S: Boot cpu verification (32bit)
23 * arch/x86_64/kernel/trampoline.S: secondary processor verfication (16bit)
24 * arch/x86_64/kernel/acpi/wakeup.S:Verfication at resume (16bit)
25 *
26 * verify_cpu, returns the status of cpu check in register %eax.
27 * 0: Success 1: Failure 25 * 0: Success 1: Failure
28 * 26 *
27 * On Intel, the XD_DISABLE flag will be cleared as a side-effect.
28 *
29 * The caller needs to check for the error code and take the action 29 * The caller needs to check for the error code and take the action
30 * appropriately. Either display a message or halt. 30 * appropriately. Either display a message or halt.
31 */ 31 */
@@ -62,8 +62,41 @@ verify_cpu:
62 cmpl $0x444d4163,%ecx 62 cmpl $0x444d4163,%ecx
63 jnz verify_cpu_noamd 63 jnz verify_cpu_noamd
64 mov $1,%di # cpu is from AMD 64 mov $1,%di # cpu is from AMD
65 jmp verify_cpu_check
65 66
66verify_cpu_noamd: 67verify_cpu_noamd:
68 cmpl $0x756e6547,%ebx # GenuineIntel?
69 jnz verify_cpu_check
70 cmpl $0x49656e69,%edx
71 jnz verify_cpu_check
72 cmpl $0x6c65746e,%ecx
73 jnz verify_cpu_check
74
75 # only call IA32_MISC_ENABLE when:
76 # family > 6 || (family == 6 && model >= 0xd)
77 movl $0x1, %eax # check CPU family and model
78 cpuid
79 movl %eax, %ecx
80
81 andl $0x0ff00f00, %eax # mask family and extended family
82 shrl $8, %eax
83 cmpl $6, %eax
84 ja verify_cpu_clear_xd # family > 6, ok
85 jb verify_cpu_check # family < 6, skip
86
87 andl $0x000f00f0, %ecx # mask model and extended model
88 shrl $4, %ecx
89 cmpl $0xd, %ecx
90 jb verify_cpu_check # family == 6, model < 0xd, skip
91
92verify_cpu_clear_xd:
93 movl $MSR_IA32_MISC_ENABLE, %ecx
94 rdmsr
95 btrl $2, %edx # clear MSR_IA32_MISC_ENABLE_XD_DISABLE
96 jnc verify_cpu_check # only write MSR if bit was changed
97 wrmsr
98
99verify_cpu_check:
67 movl $0x1,%eax # Does the cpu have what it takes 100 movl $0x1,%eax # Does the cpu have what it takes
68 cpuid 101 cpuid
69 andl $REQUIRED_MASK0,%edx 102 andl $REQUIRED_MASK0,%edx
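The family/model gate added above decides whether it is safe to touch IA32_MISC_ENABLE and clear the XD_DISABLE bit. For readability, here is the same predicate rendered in C (illustrative only; the real check stays in assembly because it runs before any C environment exists):

#include <stdbool.h>
#include <stdint.h>

/*
 * eax is CPUID leaf 1, EAX: family in bits 11:8, extended family in
 * bits 27:20, model in bits 7:4, extended model in bits 19:16.
 */
static bool may_clear_xd_disable(uint32_t eax)
{
	uint32_t fam = (eax & 0x0ff00f00) >> 8;	/* family + extended family */
	uint32_t mod = (eax & 0x000f00f0) >> 4;	/* model + extended model   */

	if (fam > 6)
		return true;	/* family > 6: always safe to write the MSR */
	if (fam < 6)
		return false;	/* family < 6: skip the MSR entirely        */
	return mod >= 0xd;	/* family 6: only for model >= 0xd          */
}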
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 5ffb5622f793..863f8753ab0a 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -179,6 +179,7 @@ static void mark_screen_rdonly(struct mm_struct *mm)
179 if (pud_none_or_clear_bad(pud)) 179 if (pud_none_or_clear_bad(pud))
180 goto out; 180 goto out;
181 pmd = pmd_offset(pud, 0xA0000); 181 pmd = pmd_offset(pud, 0xA0000);
182 split_huge_page_pmd(mm, pmd);
182 if (pmd_none_or_clear_bad(pmd)) 183 if (pmd_none_or_clear_bad(pmd))
183 goto out; 184 goto out;
184 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl); 185 pte = pte_offset_map_lock(mm, pmd, 0xA0000, &ptl);
@@ -551,8 +552,14 @@ cannot_handle:
551int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno) 552int handle_vm86_trap(struct kernel_vm86_regs *regs, long error_code, int trapno)
552{ 553{
553 if (VMPI.is_vm86pus) { 554 if (VMPI.is_vm86pus) {
554 if ((trapno == 3) || (trapno == 1)) 555 if ((trapno == 3) || (trapno == 1)) {
555 return_to_32bit(regs, VM86_TRAP + (trapno << 8)); 556 KVM86->regs32->ax = VM86_TRAP + (trapno << 8);
557 /* setting this flag forces the code in entry_32.S to
558 call save_v86_state() and change the stack pointer
559 to KVM86->regs32 */
560 set_thread_flag(TIF_IRET);
561 return 0;
562 }
556 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs)); 563 do_int(regs, trapno, (unsigned char __user *) (regs->pt.ss << 4), SP(regs));
557 return 0; 564 return 0;
558 } 565 }
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
deleted file mode 100644
index ce9fbacb7526..000000000000
--- a/arch/x86/kernel/vmi_32.c
+++ /dev/null
@@ -1,893 +0,0 @@
1/*
2 * VMI specific paravirt-ops implementation
3 *
4 * Copyright (C) 2005, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 * Send feedback to zach@vmware.com
22 *
23 */
24
25#include <linux/module.h>
26#include <linux/cpu.h>
27#include <linux/bootmem.h>
28#include <linux/mm.h>
29#include <linux/highmem.h>
30#include <linux/sched.h>
31#include <linux/gfp.h>
32#include <asm/vmi.h>
33#include <asm/io.h>
34#include <asm/fixmap.h>
35#include <asm/apicdef.h>
36#include <asm/apic.h>
37#include <asm/pgalloc.h>
38#include <asm/processor.h>
39#include <asm/timer.h>
40#include <asm/vmi_time.h>
41#include <asm/kmap_types.h>
42#include <asm/setup.h>
43
44/* Convenient for calling VMI functions indirectly in the ROM */
45typedef u32 __attribute__((regparm(1))) (VROMFUNC)(void);
46typedef u64 __attribute__((regparm(2))) (VROMLONGFUNC)(int);
47
48#define call_vrom_func(rom,func) \
49 (((VROMFUNC *)(rom->func))())
50
51#define call_vrom_long_func(rom,func,arg) \
52 (((VROMLONGFUNC *)(rom->func)) (arg))
53
54static struct vrom_header *vmi_rom;
55static int disable_pge;
56static int disable_pse;
57static int disable_sep;
58static int disable_tsc;
59static int disable_mtrr;
60static int disable_noidle;
61static int disable_vmi_timer;
62
63/* Cached VMI operations */
64static struct {
65 void (*cpuid)(void /* non-c */);
66 void (*_set_ldt)(u32 selector);
67 void (*set_tr)(u32 selector);
68 void (*write_idt_entry)(struct desc_struct *, int, u32, u32);
69 void (*write_gdt_entry)(struct desc_struct *, int, u32, u32);
70 void (*write_ldt_entry)(struct desc_struct *, int, u32, u32);
71 void (*set_kernel_stack)(u32 selector, u32 sp0);
72 void (*allocate_page)(u32, u32, u32, u32, u32);
73 void (*release_page)(u32, u32);
74 void (*set_pte)(pte_t, pte_t *, unsigned);
75 void (*update_pte)(pte_t *, unsigned);
76 void (*set_linear_mapping)(int, void *, u32, u32);
77 void (*_flush_tlb)(int);
78 void (*set_initial_ap_state)(int, int);
79 void (*halt)(void);
80 void (*set_lazy_mode)(int mode);
81} vmi_ops;
82
83/* Cached VMI operations */
84struct vmi_timer_ops vmi_timer_ops;
85
86/*
87 * VMI patching routines.
88 */
89#define MNEM_CALL 0xe8
90#define MNEM_JMP 0xe9
91#define MNEM_RET 0xc3
92
93#define IRQ_PATCH_INT_MASK 0
94#define IRQ_PATCH_DISABLE 5
95
96static inline void patch_offset(void *insnbuf,
97 unsigned long ip, unsigned long dest)
98{
99 *(unsigned long *)(insnbuf+1) = dest-ip-5;
100}
101
102static unsigned patch_internal(int call, unsigned len, void *insnbuf,
103 unsigned long ip)
104{
105 u64 reloc;
106 struct vmi_relocation_info *const rel = (struct vmi_relocation_info *)&reloc;
107 reloc = call_vrom_long_func(vmi_rom, get_reloc, call);
108 switch(rel->type) {
109 case VMI_RELOCATION_CALL_REL:
110 BUG_ON(len < 5);
111 *(char *)insnbuf = MNEM_CALL;
112 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
113 return 5;
114
115 case VMI_RELOCATION_JUMP_REL:
116 BUG_ON(len < 5);
117 *(char *)insnbuf = MNEM_JMP;
118 patch_offset(insnbuf, ip, (unsigned long)rel->eip);
119 return 5;
120
121 case VMI_RELOCATION_NOP:
122 /* obliterate the whole thing */
123 return 0;
124
125 case VMI_RELOCATION_NONE:
126 /* leave native code in place */
127 break;
128
129 default:
130 BUG();
131 }
132 return len;
133}
134
135/*
136 * Apply patch if appropriate, return length of new instruction
137 * sequence. The callee does nop padding for us.
138 */
139static unsigned vmi_patch(u8 type, u16 clobbers, void *insns,
140 unsigned long ip, unsigned len)
141{
142 switch (type) {
143 case PARAVIRT_PATCH(pv_irq_ops.irq_disable):
144 return patch_internal(VMI_CALL_DisableInterrupts, len,
145 insns, ip);
146 case PARAVIRT_PATCH(pv_irq_ops.irq_enable):
147 return patch_internal(VMI_CALL_EnableInterrupts, len,
148 insns, ip);
149 case PARAVIRT_PATCH(pv_irq_ops.restore_fl):
150 return patch_internal(VMI_CALL_SetInterruptMask, len,
151 insns, ip);
152 case PARAVIRT_PATCH(pv_irq_ops.save_fl):
153 return patch_internal(VMI_CALL_GetInterruptMask, len,
154 insns, ip);
155 case PARAVIRT_PATCH(pv_cpu_ops.iret):
156 return patch_internal(VMI_CALL_IRET, len, insns, ip);
157 case PARAVIRT_PATCH(pv_cpu_ops.irq_enable_sysexit):
158 return patch_internal(VMI_CALL_SYSEXIT, len, insns, ip);
159 default:
160 break;
161 }
162 return len;
163}
164
165/* CPUID has non-C semantics, and paravirt-ops API doesn't match hardware ISA */
166static void vmi_cpuid(unsigned int *ax, unsigned int *bx,
167 unsigned int *cx, unsigned int *dx)
168{
169 int override = 0;
170 if (*ax == 1)
171 override = 1;
172 asm volatile ("call *%6"
173 : "=a" (*ax),
174 "=b" (*bx),
175 "=c" (*cx),
176 "=d" (*dx)
177 : "0" (*ax), "2" (*cx), "r" (vmi_ops.cpuid));
178 if (override) {
179 if (disable_pse)
180 *dx &= ~X86_FEATURE_PSE;
181 if (disable_pge)
182 *dx &= ~X86_FEATURE_PGE;
183 if (disable_sep)
184 *dx &= ~X86_FEATURE_SEP;
185 if (disable_tsc)
186 *dx &= ~X86_FEATURE_TSC;
187 if (disable_mtrr)
188 *dx &= ~X86_FEATURE_MTRR;
189 }
190}
191
192static inline void vmi_maybe_load_tls(struct desc_struct *gdt, int nr, struct desc_struct *new)
193{
194 if (gdt[nr].a != new->a || gdt[nr].b != new->b)
195 write_gdt_entry(gdt, nr, new, 0);
196}
197
198static void vmi_load_tls(struct thread_struct *t, unsigned int cpu)
199{
200 struct desc_struct *gdt = get_cpu_gdt_table(cpu);
201 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 0, &t->tls_array[0]);
202 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 1, &t->tls_array[1]);
203 vmi_maybe_load_tls(gdt, GDT_ENTRY_TLS_MIN + 2, &t->tls_array[2]);
204}
205
206static void vmi_set_ldt(const void *addr, unsigned entries)
207{
208 unsigned cpu = smp_processor_id();
209 struct desc_struct desc;
210
211 pack_descriptor(&desc, (unsigned long)addr,
212 entries * sizeof(struct desc_struct) - 1,
213 DESC_LDT, 0);
214 write_gdt_entry(get_cpu_gdt_table(cpu), GDT_ENTRY_LDT, &desc, DESC_LDT);
215 vmi_ops._set_ldt(entries ? GDT_ENTRY_LDT*sizeof(struct desc_struct) : 0);
216}
217
218static void vmi_set_tr(void)
219{
220 vmi_ops.set_tr(GDT_ENTRY_TSS*sizeof(struct desc_struct));
221}
222
223static void vmi_write_idt_entry(gate_desc *dt, int entry, const gate_desc *g)
224{
225 u32 *idt_entry = (u32 *)g;
226 vmi_ops.write_idt_entry(dt, entry, idt_entry[0], idt_entry[1]);
227}
228
229static void vmi_write_gdt_entry(struct desc_struct *dt, int entry,
230 const void *desc, int type)
231{
232 u32 *gdt_entry = (u32 *)desc;
233 vmi_ops.write_gdt_entry(dt, entry, gdt_entry[0], gdt_entry[1]);
234}
235
236static void vmi_write_ldt_entry(struct desc_struct *dt, int entry,
237 const void *desc)
238{
239 u32 *ldt_entry = (u32 *)desc;
240 vmi_ops.write_ldt_entry(dt, entry, ldt_entry[0], ldt_entry[1]);
241}
242
243static void vmi_load_sp0(struct tss_struct *tss,
244 struct thread_struct *thread)
245{
246 tss->x86_tss.sp0 = thread->sp0;
247
248 /* This can only happen when SEP is enabled, no need to test "SEP"arately */
249 if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
250 tss->x86_tss.ss1 = thread->sysenter_cs;
251 wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
252 }
253 vmi_ops.set_kernel_stack(__KERNEL_DS, tss->x86_tss.sp0);
254}
255
256static void vmi_flush_tlb_user(void)
257{
258 vmi_ops._flush_tlb(VMI_FLUSH_TLB);
259}
260
261static void vmi_flush_tlb_kernel(void)
262{
263 vmi_ops._flush_tlb(VMI_FLUSH_TLB | VMI_FLUSH_GLOBAL);
264}
265
266/* Stub to do nothing at all; used for delays and unimplemented calls */
267static void vmi_nop(void)
268{
269}
270
271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
272{
273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
274}
275
276static void vmi_allocate_pmd(struct mm_struct *mm, unsigned long pfn)
277{
278 /*
279 * This call comes in very early, before mem_map is setup.
280 * It is called only for swapper_pg_dir, which already has
281 * data on it.
282 */
283 vmi_ops.allocate_page(pfn, VMI_PAGE_L2, 0, 0, 0);
284}
285
286static void vmi_allocate_pmd_clone(unsigned long pfn, unsigned long clonepfn, unsigned long start, unsigned long count)
287{
288 vmi_ops.allocate_page(pfn, VMI_PAGE_L2 | VMI_PAGE_CLONE, clonepfn, start, count);
289}
290
291static void vmi_release_pte(unsigned long pfn)
292{
293 vmi_ops.release_page(pfn, VMI_PAGE_L1);
294}
295
296static void vmi_release_pmd(unsigned long pfn)
297{
298 vmi_ops.release_page(pfn, VMI_PAGE_L2);
299}
300
301/*
302 * We use the pgd_free hook for releasing the pgd page:
303 */
304static void vmi_pgd_free(struct mm_struct *mm, pgd_t *pgd)
305{
306 unsigned long pfn = __pa(pgd) >> PAGE_SHIFT;
307
308 vmi_ops.release_page(pfn, VMI_PAGE_L2);
309}
310
311/*
312 * Helper macros for MMU update flags. We can defer updates until a flush
313 * or page invalidation only if the update is to the current address space
314 * (otherwise, there is no flush). We must check against init_mm, since
315 * this could be a kernel update, which usually passes init_mm, although
316 * sometimes this check can be skipped if we know the particular function
317 * is only called on user mode PTEs. We could change the kernel to pass
318 * current->active_mm here, but in particular, I was unsure if changing
319 * mm/highmem.c to do this would still be correct on other architectures.
320 */
321#define is_current_as(mm, mustbeuser) ((mm) == current->active_mm || \
322 (!mustbeuser && (mm) == &init_mm))
323#define vmi_flags_addr(mm, addr, level, user) \
324 ((level) | (is_current_as(mm, user) ? \
325 (VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
326#define vmi_flags_addr_defer(mm, addr, level, user) \
327 ((level) | (is_current_as(mm, user) ? \
328 (VMI_PAGE_DEFER | VMI_PAGE_CURRENT_AS | ((addr) & VMI_PAGE_VA_MASK)) : 0))
329
330static void vmi_update_pte(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
331{
332 vmi_ops.update_pte(ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
333}
334
335static void vmi_update_pte_defer(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
336{
337 vmi_ops.update_pte(ptep, vmi_flags_addr_defer(mm, addr, VMI_PAGE_PT, 0));
338}
339
340static void vmi_set_pte(pte_t *ptep, pte_t pte)
341{
342 /* XXX because of set_pmd_pte, this can be called on PT or PD layers */
343 vmi_ops.set_pte(pte, ptep, VMI_PAGE_PT);
344}
345
346static void vmi_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte)
347{
348 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
349}
350
351static void vmi_set_pmd(pmd_t *pmdp, pmd_t pmdval)
352{
353#ifdef CONFIG_X86_PAE
354 const pte_t pte = { .pte = pmdval.pmd };
355#else
356 const pte_t pte = { pmdval.pud.pgd.pgd };
357#endif
358 vmi_ops.set_pte(pte, (pte_t *)pmdp, VMI_PAGE_PD);
359}
360
361#ifdef CONFIG_X86_PAE
362
363static void vmi_set_pte_atomic(pte_t *ptep, pte_t pteval)
364{
365 /*
366 * XXX This is called from set_pmd_pte, but at both PT
367 * and PD layers so the VMI_PAGE_PT flag is wrong. But
368 * it is only called for large page mapping changes,
 369 * the Xen backend doesn't support large pages, and the
370 * ESX backend doesn't depend on the flag.
371 */
372 set_64bit((unsigned long long *)ptep,pte_val(pteval));
373 vmi_ops.update_pte(ptep, VMI_PAGE_PT);
374}
375
376static void vmi_set_pud(pud_t *pudp, pud_t pudval)
377{
378 /* Um, eww */
379 const pte_t pte = { .pte = pudval.pgd.pgd };
380 vmi_ops.set_pte(pte, (pte_t *)pudp, VMI_PAGE_PDP);
381}
382
383static void vmi_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
384{
385 const pte_t pte = { .pte = 0 };
386 vmi_ops.set_pte(pte, ptep, vmi_flags_addr(mm, addr, VMI_PAGE_PT, 0));
387}
388
389static void vmi_pmd_clear(pmd_t *pmd)
390{
391 const pte_t pte = { .pte = 0 };
392 vmi_ops.set_pte(pte, (pte_t *)pmd, VMI_PAGE_PD);
393}
394#endif
395
396#ifdef CONFIG_SMP
397static void __devinit
398vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip,
399 unsigned long start_esp)
400{
401 struct vmi_ap_state ap;
402
403 /* Default everything to zero. This is fine for most GPRs. */
404 memset(&ap, 0, sizeof(struct vmi_ap_state));
405
406 ap.gdtr_limit = GDT_SIZE - 1;
407 ap.gdtr_base = (unsigned long) get_cpu_gdt_table(phys_apicid);
408
409 ap.idtr_limit = IDT_ENTRIES * 8 - 1;
410 ap.idtr_base = (unsigned long) idt_table;
411
412 ap.ldtr = 0;
413
414 ap.cs = __KERNEL_CS;
415 ap.eip = (unsigned long) start_eip;
416 ap.ss = __KERNEL_DS;
417 ap.esp = (unsigned long) start_esp;
418
419 ap.ds = __USER_DS;
420 ap.es = __USER_DS;
421 ap.fs = __KERNEL_PERCPU;
422 ap.gs = __KERNEL_STACK_CANARY;
423
424 ap.eflags = 0;
425
426#ifdef CONFIG_X86_PAE
427 /* efer should match BSP efer. */
428 if (cpu_has_nx) {
429 unsigned l, h;
430 rdmsr(MSR_EFER, l, h);
431 ap.efer = (unsigned long long) h << 32 | l;
432 }
433#endif
434
435 ap.cr3 = __pa(swapper_pg_dir);
436 /* Protected mode, paging, AM, WP, NE, MP. */
437 ap.cr0 = 0x80050023;
438 ap.cr4 = mmu_cr4_features;
439 vmi_ops.set_initial_ap_state((u32)&ap, phys_apicid);
440}
441#endif
442
443static void vmi_start_context_switch(struct task_struct *prev)
444{
445 paravirt_start_context_switch(prev);
446 vmi_ops.set_lazy_mode(2);
447}
448
449static void vmi_end_context_switch(struct task_struct *next)
450{
451 vmi_ops.set_lazy_mode(0);
452 paravirt_end_context_switch(next);
453}
454
455static void vmi_enter_lazy_mmu(void)
456{
457 paravirt_enter_lazy_mmu();
458 vmi_ops.set_lazy_mode(1);
459}
460
461static void vmi_leave_lazy_mmu(void)
462{
463 vmi_ops.set_lazy_mode(0);
464 paravirt_leave_lazy_mmu();
465}
466
467static inline int __init check_vmi_rom(struct vrom_header *rom)
468{
469 struct pci_header *pci;
470 struct pnp_header *pnp;
471 const char *manufacturer = "UNKNOWN";
472 const char *product = "UNKNOWN";
473 const char *license = "unspecified";
474
475 if (rom->rom_signature != 0xaa55)
476 return 0;
477 if (rom->vrom_signature != VMI_SIGNATURE)
478 return 0;
479 if (rom->api_version_maj != VMI_API_REV_MAJOR ||
480 rom->api_version_min+1 < VMI_API_REV_MINOR+1) {
481 printk(KERN_WARNING "VMI: Found mismatched rom version %d.%d\n",
482 rom->api_version_maj,
483 rom->api_version_min);
484 return 0;
485 }
486
487 /*
488 * Relying on the VMI_SIGNATURE field is not 100% safe, so check
489 * the PCI header and device type to make sure this is really a
490 * VMI device.
491 */
492 if (!rom->pci_header_offs) {
493 printk(KERN_WARNING "VMI: ROM does not contain PCI header.\n");
494 return 0;
495 }
496
497 pci = (struct pci_header *)((char *)rom+rom->pci_header_offs);
498 if (pci->vendorID != PCI_VENDOR_ID_VMWARE ||
499 pci->deviceID != PCI_DEVICE_ID_VMWARE_VMI) {
500 /* Allow it to run... anyways, but warn */
501 printk(KERN_WARNING "VMI: ROM from unknown manufacturer\n");
502 }
503
504 if (rom->pnp_header_offs) {
505 pnp = (struct pnp_header *)((char *)rom+rom->pnp_header_offs);
506 if (pnp->manufacturer_offset)
507 manufacturer = (const char *)rom+pnp->manufacturer_offset;
508 if (pnp->product_offset)
509 product = (const char *)rom+pnp->product_offset;
510 }
511
512 if (rom->license_offs)
513 license = (char *)rom+rom->license_offs;
514
515 printk(KERN_INFO "VMI: Found %s %s, API version %d.%d, ROM version %d.%d\n",
516 manufacturer, product,
517 rom->api_version_maj, rom->api_version_min,
518 pci->rom_version_maj, pci->rom_version_min);
519
520 /* Don't allow BSD/MIT here for now because we don't want to end up
521 with any binary only shim layers */
522 if (strcmp(license, "GPL") && strcmp(license, "GPL v2")) {
523 printk(KERN_WARNING "VMI: Non GPL license `%s' found for ROM. Not used.\n",
524 license);
525 return 0;
526 }
527
528 return 1;
529}
530
531/*
532 * Probe for the VMI option ROM
533 */
534static inline int __init probe_vmi_rom(void)
535{
536 unsigned long base;
537
538 /* VMI ROM is in option ROM area, check signature */
539 for (base = 0xC0000; base < 0xE0000; base += 2048) {
540 struct vrom_header *romstart;
541 romstart = (struct vrom_header *)isa_bus_to_virt(base);
542 if (check_vmi_rom(romstart)) {
543 vmi_rom = romstart;
544 return 1;
545 }
546 }
547 return 0;
548}
549
550/*
551 * VMI setup common to all processors
552 */
553void vmi_bringup(void)
554{
555 /* We must establish the lowmem mapping for MMU ops to work */
556 if (vmi_ops.set_linear_mapping)
557 vmi_ops.set_linear_mapping(0, (void *)__PAGE_OFFSET, MAXMEM_PFN, 0);
558}
559
560/*
561 * Return a pointer to a VMI function or NULL if unimplemented
562 */
563static void *vmi_get_function(int vmicall)
564{
565 u64 reloc;
566 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
567 reloc = call_vrom_long_func(vmi_rom, get_reloc, vmicall);
568 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL);
569 if (rel->type == VMI_RELOCATION_CALL_REL)
570 return (void *)rel->eip;
571 else
572 return NULL;
573}
574
575/*
576 * Helper macro for making the VMI paravirt-ops fill code readable.
577 * For unimplemented operations, fall back to default, unless nop
578 * is returned by the ROM.
579 */
580#define para_fill(opname, vmicall) \
581do { \
582 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
583 VMI_CALL_##vmicall); \
584 if (rel->type == VMI_RELOCATION_CALL_REL) \
585 opname = (void *)rel->eip; \
586 else if (rel->type == VMI_RELOCATION_NOP) \
587 opname = (void *)vmi_nop; \
588 else if (rel->type != VMI_RELOCATION_NONE) \
589 printk(KERN_WARNING "VMI: Unknown relocation " \
590 "type %d for " #vmicall"\n",\
591 rel->type); \
592} while (0)
593
594/*
595 * Helper macro for making the VMI paravirt-ops fill code readable.
596 * For cached operations which do not match the VMI ROM ABI and must
 597 * go through a translation stub. Ignore NOPs, since it is not clear
 598 * a NOP VMI function corresponds to a NOP paravirt-op when the
599 * functions are not in 1-1 correspondence.
600 */
601#define para_wrap(opname, wrapper, cache, vmicall) \
602do { \
603 reloc = call_vrom_long_func(vmi_rom, get_reloc, \
604 VMI_CALL_##vmicall); \
605 BUG_ON(rel->type == VMI_RELOCATION_JUMP_REL); \
606 if (rel->type == VMI_RELOCATION_CALL_REL) { \
607 opname = wrapper; \
608 vmi_ops.cache = (void *)rel->eip; \
609 } \
610} while (0)
611
612/*
613 * Activate the VMI interface and switch into paravirtualized mode
614 */
615static inline int __init activate_vmi(void)
616{
617 short kernel_cs;
618 u64 reloc;
619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
628 printk(KERN_ERR "VMI ROM failed to initialize!");
629 return 0;
630 }
631 savesegment(cs, kernel_cs);
632
633 pv_info.paravirt_enabled = 1;
634 pv_info.kernel_rpl = kernel_cs & SEGMENT_RPL_MASK;
635 pv_info.name = "vmi [deprecated]";
636
637 pv_init_ops.patch = vmi_patch;
638
639 /*
640 * Many of these operations are ABI compatible with VMI.
641 * This means we can fill in the paravirt-ops with direct
642 * pointers into the VMI ROM. If the calling convention for
643 * these operations changes, this code needs to be updated.
644 *
645 * Exceptions
646 * CPUID paravirt-op uses pointers, not the native ISA
647 * halt has no VMI equivalent; all VMI halts are "safe"
648 * no MSR support yet - just trap and emulate. VMI uses the
649 * same ABI as the native ISA, but Linux wants exceptions
650 * from bogus MSR read / write handled
651 * rdpmc is not yet used in Linux
652 */
653
654 /* CPUID is special, so very special it gets wrapped like a present */
655 para_wrap(pv_cpu_ops.cpuid, vmi_cpuid, cpuid, CPUID);
656
657 para_fill(pv_cpu_ops.clts, CLTS);
658 para_fill(pv_cpu_ops.get_debugreg, GetDR);
659 para_fill(pv_cpu_ops.set_debugreg, SetDR);
660 para_fill(pv_cpu_ops.read_cr0, GetCR0);
661 para_fill(pv_mmu_ops.read_cr2, GetCR2);
662 para_fill(pv_mmu_ops.read_cr3, GetCR3);
663 para_fill(pv_cpu_ops.read_cr4, GetCR4);
664 para_fill(pv_cpu_ops.write_cr0, SetCR0);
665 para_fill(pv_mmu_ops.write_cr2, SetCR2);
666 para_fill(pv_mmu_ops.write_cr3, SetCR3);
667 para_fill(pv_cpu_ops.write_cr4, SetCR4);
668
669 para_fill(pv_irq_ops.save_fl.func, GetInterruptMask);
670 para_fill(pv_irq_ops.restore_fl.func, SetInterruptMask);
671 para_fill(pv_irq_ops.irq_disable.func, DisableInterrupts);
672 para_fill(pv_irq_ops.irq_enable.func, EnableInterrupts);
673
674 para_fill(pv_cpu_ops.wbinvd, WBINVD);
675 para_fill(pv_cpu_ops.read_tsc, RDTSC);
676
677 /* The following we emulate with trap and emulate for now */
678 /* paravirt_ops.read_msr = vmi_rdmsr */
679 /* paravirt_ops.write_msr = vmi_wrmsr */
680 /* paravirt_ops.rdpmc = vmi_rdpmc */
681
682 /* TR interface doesn't pass TR value, wrap */
683 para_wrap(pv_cpu_ops.load_tr_desc, vmi_set_tr, set_tr, SetTR);
684
685 /* LDT is special, too */
686 para_wrap(pv_cpu_ops.set_ldt, vmi_set_ldt, _set_ldt, SetLDT);
687
688 para_fill(pv_cpu_ops.load_gdt, SetGDT);
689 para_fill(pv_cpu_ops.load_idt, SetIDT);
690 para_fill(pv_cpu_ops.store_gdt, GetGDT);
691 para_fill(pv_cpu_ops.store_idt, GetIDT);
692 para_fill(pv_cpu_ops.store_tr, GetTR);
693 pv_cpu_ops.load_tls = vmi_load_tls;
694 para_wrap(pv_cpu_ops.write_ldt_entry, vmi_write_ldt_entry,
695 write_ldt_entry, WriteLDTEntry);
696 para_wrap(pv_cpu_ops.write_gdt_entry, vmi_write_gdt_entry,
697 write_gdt_entry, WriteGDTEntry);
698 para_wrap(pv_cpu_ops.write_idt_entry, vmi_write_idt_entry,
699 write_idt_entry, WriteIDTEntry);
700 para_wrap(pv_cpu_ops.load_sp0, vmi_load_sp0, set_kernel_stack, UpdateKernelStack);
701 para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask);
702 para_fill(pv_cpu_ops.io_delay, IODelay);
703
704 para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch,
705 set_lazy_mode, SetLazyMode);
706 para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch,
707 set_lazy_mode, SetLazyMode);
708
709 para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu,
710 set_lazy_mode, SetLazyMode);
711 para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu,
712 set_lazy_mode, SetLazyMode);
713
714 /* user and kernel flush are just handled with different flags to FlushTLB */
715 para_wrap(pv_mmu_ops.flush_tlb_user, vmi_flush_tlb_user, _flush_tlb, FlushTLB);
716 para_wrap(pv_mmu_ops.flush_tlb_kernel, vmi_flush_tlb_kernel, _flush_tlb, FlushTLB);
717 para_fill(pv_mmu_ops.flush_tlb_single, InvalPage);
718
719 /*
720 * Until a standard flag format can be agreed on, we need to
721 * implement these as wrappers in Linux. Get the VMI ROM
722 * function pointers for the two backend calls.
723 */
724#ifdef CONFIG_X86_PAE
725 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxELong);
726 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxELong);
727#else
728 vmi_ops.set_pte = vmi_get_function(VMI_CALL_SetPxE);
729 vmi_ops.update_pte = vmi_get_function(VMI_CALL_UpdatePxE);
730#endif
731
732 if (vmi_ops.set_pte) {
733 pv_mmu_ops.set_pte = vmi_set_pte;
734 pv_mmu_ops.set_pte_at = vmi_set_pte_at;
735 pv_mmu_ops.set_pmd = vmi_set_pmd;
736#ifdef CONFIG_X86_PAE
737 pv_mmu_ops.set_pte_atomic = vmi_set_pte_atomic;
738 pv_mmu_ops.set_pud = vmi_set_pud;
739 pv_mmu_ops.pte_clear = vmi_pte_clear;
740 pv_mmu_ops.pmd_clear = vmi_pmd_clear;
741#endif
742 }
743
744 if (vmi_ops.update_pte) {
745 pv_mmu_ops.pte_update = vmi_update_pte;
746 pv_mmu_ops.pte_update_defer = vmi_update_pte_defer;
747 }
748
749 vmi_ops.allocate_page = vmi_get_function(VMI_CALL_AllocatePage);
750 if (vmi_ops.allocate_page) {
751 pv_mmu_ops.alloc_pte = vmi_allocate_pte;
752 pv_mmu_ops.alloc_pmd = vmi_allocate_pmd;
753 pv_mmu_ops.alloc_pmd_clone = vmi_allocate_pmd_clone;
754 }
755
756 vmi_ops.release_page = vmi_get_function(VMI_CALL_ReleasePage);
757 if (vmi_ops.release_page) {
758 pv_mmu_ops.release_pte = vmi_release_pte;
759 pv_mmu_ops.release_pmd = vmi_release_pmd;
760 pv_mmu_ops.pgd_free = vmi_pgd_free;
761 }
762
763 /* Set linear is needed in all cases */
764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
765
766 /*
767 * These MUST always be patched. Don't support indirect jumps
768 * through these operations, as the VMI interface may use either
769 * a jump or a call to get to these operations, depending on
770 * the backend. They are performance critical anyway, so requiring
771 * a patch is not a big problem.
772 */
773 pv_cpu_ops.irq_enable_sysexit = (void *)0xfeedbab0;
774 pv_cpu_ops.iret = (void *)0xbadbab0;
775
776#ifdef CONFIG_SMP
777 para_wrap(pv_apic_ops.startup_ipi_hook, vmi_startup_ipi_hook, set_initial_ap_state, SetInitialAPState);
778#endif
779
780#ifdef CONFIG_X86_LOCAL_APIC
781 para_fill(apic->read, APICRead);
782 para_fill(apic->write, APICWrite);
783#endif
784
785 /*
786 * Check for VMI timer functionality by probing for a cycle frequency method
787 */
788 reloc = call_vrom_long_func(vmi_rom, get_reloc, VMI_CALL_GetCycleFrequency);
789 if (!disable_vmi_timer && rel->type != VMI_RELOCATION_NONE) {
790 vmi_timer_ops.get_cycle_frequency = (void *)rel->eip;
791 vmi_timer_ops.get_cycle_counter =
792 vmi_get_function(VMI_CALL_GetCycleCounter);
793 vmi_timer_ops.get_wallclock =
794 vmi_get_function(VMI_CALL_GetWallclockTime);
795 vmi_timer_ops.wallclock_updated =
796 vmi_get_function(VMI_CALL_WallclockUpdated);
797 vmi_timer_ops.set_alarm = vmi_get_function(VMI_CALL_SetAlarm);
798 vmi_timer_ops.cancel_alarm =
799 vmi_get_function(VMI_CALL_CancelAlarm);
800 x86_init.timers.timer_init = vmi_time_init;
801#ifdef CONFIG_X86_LOCAL_APIC
802 x86_init.timers.setup_percpu_clockev = vmi_time_bsp_init;
803 x86_cpuinit.setup_percpu_clockev = vmi_time_ap_init;
804#endif
805 pv_time_ops.sched_clock = vmi_sched_clock;
806 x86_platform.calibrate_tsc = vmi_tsc_khz;
807 x86_platform.get_wallclock = vmi_get_wallclock;
808 x86_platform.set_wallclock = vmi_set_wallclock;
809
810 /* We have true wallclock functions; disable CMOS clock sync */
811 no_sync_cmos_clock = 1;
812 } else {
813 disable_noidle = 1;
814 disable_vmi_timer = 1;
815 }
816
817 para_fill(pv_irq_ops.safe_halt, Halt);
818
819 /*
820 * Alternative instruction rewriting doesn't happen soon enough
821 * to convert VMI_IRET to a call instead of a jump; so we have
822 * to do this before IRQs get reenabled. Fortunately, it is
823 * idempotent.
824 */
825 apply_paravirt(__parainstructions, __parainstructions_end);
826
827 vmi_bringup();
828
829 return 1;
830}
831
832#undef para_fill
833
834void __init vmi_init(void)
835{
836 if (!vmi_rom)
837 probe_vmi_rom();
838 else
839 check_vmi_rom(vmi_rom);
840
 841 /* In case probing for or validating the ROM failed, bail */
842 if (!vmi_rom)
843 return;
844
845 reserve_top_address(-vmi_rom->virtual_top);
846
847#ifdef CONFIG_X86_IO_APIC
848 /* This is virtual hardware; timer routing is wired correctly */
849 no_timer_check = 1;
850#endif
851}
852
853void __init vmi_activate(void)
854{
855 unsigned long flags;
856
857 if (!vmi_rom)
858 return;
859
860 local_irq_save(flags);
861 activate_vmi();
862 local_irq_restore(flags & X86_EFLAGS_IF);
863}
864
865static int __init parse_vmi(char *arg)
866{
867 if (!arg)
868 return -EINVAL;
869
870 if (!strcmp(arg, "disable_pge")) {
871 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE);
872 disable_pge = 1;
873 } else if (!strcmp(arg, "disable_pse")) {
874 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PSE);
875 disable_pse = 1;
876 } else if (!strcmp(arg, "disable_sep")) {
877 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_SEP);
878 disable_sep = 1;
879 } else if (!strcmp(arg, "disable_tsc")) {
880 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_TSC);
881 disable_tsc = 1;
882 } else if (!strcmp(arg, "disable_mtrr")) {
883 clear_cpu_cap(&boot_cpu_data, X86_FEATURE_MTRR);
884 disable_mtrr = 1;
885 } else if (!strcmp(arg, "disable_timer")) {
886 disable_vmi_timer = 1;
887 disable_noidle = 1;
888 } else if (!strcmp(arg, "disable_noidle"))
889 disable_noidle = 1;
890 return 0;
891}
892
893early_param("vmi", parse_vmi);
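Two ideas carry most of the file removed above: patch_internal()/patch_offset() rewrite a call site into a 5-byte relative CALL or JMP into the ROM, and para_fill() picks between the ROM entry point, a nop and the native default according to the relocation type. A condensed C sketch of both, with simplified types (the names here are illustrative, not the VMI ABI):

#include <stdint.h>
#include <string.h>

enum reloc_type { RELOC_NONE, RELOC_CALL, RELOC_JUMP, RELOC_NOP };

struct reloc_info {
	enum reloc_type type;
	uint32_t target;		/* entry point inside the option ROM */
};

/*
 * Emit a 5-byte relative CALL (0xe8) or JMP (0xe9); the rel32 operand is
 * the target minus the address of the *next* instruction (ip + 5), which
 * is exactly what patch_offset() computed.
 */
static unsigned int patch_rel32(uint8_t *buf, uint8_t opcode,
				uint32_t ip, uint32_t target)
{
	int32_t rel = (int32_t)(target - ip - 5);

	buf[0] = opcode;
	memcpy(buf + 1, &rel, sizeof(rel));
	return 5;
}

static void nop_op(void) { }

/*
 * Fill one paravirt slot: use the ROM entry point when one exists, a nop
 * when the ROM says the operation can be elided, and otherwise keep the
 * native default that is already installed.
 */
static void para_fill_slot(void **slot, struct reloc_info rel)
{
	if (rel.type == RELOC_CALL)
		*slot = (void *)(uintptr_t)rel.target;
	else if (rel.type == RELOC_NOP)
		*slot = (void *)nop_op;
	/* RELOC_NONE: leave the native implementation in place */
}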
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
deleted file mode 100644
index 5e1ff66ecd73..000000000000
--- a/arch/x86/kernel/vmiclock_32.c
+++ /dev/null
@@ -1,317 +0,0 @@
1/*
2 * VMI paravirtual timer support routines.
3 *
4 * Copyright (C) 2007, VMware, Inc.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful, but
12 * WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
14 * NON INFRINGEMENT. See the GNU General Public License for more
15 * details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 *
21 */
22
23#include <linux/smp.h>
24#include <linux/interrupt.h>
25#include <linux/cpumask.h>
26#include <linux/clocksource.h>
27#include <linux/clockchips.h>
28
29#include <asm/vmi.h>
30#include <asm/vmi_time.h>
31#include <asm/apicdef.h>
32#include <asm/apic.h>
33#include <asm/timer.h>
34#include <asm/i8253.h>
35#include <asm/irq_vectors.h>
36
37#define VMI_ONESHOT (VMI_ALARM_IS_ONESHOT | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
38#define VMI_PERIODIC (VMI_ALARM_IS_PERIODIC | VMI_CYCLES_REAL | vmi_get_alarm_wiring())
39
40static DEFINE_PER_CPU(struct clock_event_device, local_events);
41
42static inline u32 vmi_counter(u32 flags)
43{
44 /* Given VMI_ONESHOT or VMI_PERIODIC, return the corresponding
45 * cycle counter. */
46 return flags & VMI_ALARM_COUNTER_MASK;
47}
48
49/* paravirt_ops.get_wallclock = vmi_get_wallclock */
50unsigned long vmi_get_wallclock(void)
51{
52 unsigned long long wallclock;
53 wallclock = vmi_timer_ops.get_wallclock(); // nsec
54 (void)do_div(wallclock, 1000000000); // sec
55
56 return wallclock;
57}
58
59/* paravirt_ops.set_wallclock = vmi_set_wallclock */
60int vmi_set_wallclock(unsigned long now)
61{
62 return 0;
63}
64
65/* paravirt_ops.sched_clock = vmi_sched_clock */
66unsigned long long vmi_sched_clock(void)
67{
68 return cycles_2_ns(vmi_timer_ops.get_cycle_counter(VMI_CYCLES_AVAILABLE));
69}
70
71/* x86_platform.calibrate_tsc = vmi_tsc_khz */
72unsigned long vmi_tsc_khz(void)
73{
74 unsigned long long khz;
75 khz = vmi_timer_ops.get_cycle_frequency();
76 (void)do_div(khz, 1000);
77 return khz;
78}
79
80static inline unsigned int vmi_get_timer_vector(void)
81{
82 return IRQ0_VECTOR;
83}
84
85/** vmi clockchip */
86#ifdef CONFIG_X86_LOCAL_APIC
87static unsigned int startup_timer_irq(unsigned int irq)
88{
89 unsigned long val = apic_read(APIC_LVTT);
90 apic_write(APIC_LVTT, vmi_get_timer_vector());
91
92 return (val & APIC_SEND_PENDING);
93}
94
95static void mask_timer_irq(unsigned int irq)
96{
97 unsigned long val = apic_read(APIC_LVTT);
98 apic_write(APIC_LVTT, val | APIC_LVT_MASKED);
99}
100
101static void unmask_timer_irq(unsigned int irq)
102{
103 unsigned long val = apic_read(APIC_LVTT);
104 apic_write(APIC_LVTT, val & ~APIC_LVT_MASKED);
105}
106
107static void ack_timer_irq(unsigned int irq)
108{
109 ack_APIC_irq();
110}
111
112static struct irq_chip vmi_chip __read_mostly = {
113 .name = "VMI-LOCAL",
114 .startup = startup_timer_irq,
115 .mask = mask_timer_irq,
116 .unmask = unmask_timer_irq,
117 .ack = ack_timer_irq
118};
119#endif
120
121/** vmi clockevent */
122#define VMI_ALARM_WIRED_IRQ0 0x00000000
123#define VMI_ALARM_WIRED_LVTT 0x00010000
124static int vmi_wiring = VMI_ALARM_WIRED_IRQ0;
125
126static inline int vmi_get_alarm_wiring(void)
127{
128 return vmi_wiring;
129}
130
131static void vmi_timer_set_mode(enum clock_event_mode mode,
132 struct clock_event_device *evt)
133{
134 cycle_t now, cycles_per_hz;
135 BUG_ON(!irqs_disabled());
136
137 switch (mode) {
138 case CLOCK_EVT_MODE_ONESHOT:
139 case CLOCK_EVT_MODE_RESUME:
140 break;
141 case CLOCK_EVT_MODE_PERIODIC:
142 cycles_per_hz = vmi_timer_ops.get_cycle_frequency();
143 (void)do_div(cycles_per_hz, HZ);
144 now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_PERIODIC));
145 vmi_timer_ops.set_alarm(VMI_PERIODIC, now, cycles_per_hz);
146 break;
147 case CLOCK_EVT_MODE_UNUSED:
148 case CLOCK_EVT_MODE_SHUTDOWN:
149 switch (evt->mode) {
150 case CLOCK_EVT_MODE_ONESHOT:
151 vmi_timer_ops.cancel_alarm(VMI_ONESHOT);
152 break;
153 case CLOCK_EVT_MODE_PERIODIC:
154 vmi_timer_ops.cancel_alarm(VMI_PERIODIC);
155 break;
156 default:
157 break;
158 }
159 break;
160 default:
161 break;
162 }
163}
164
165static int vmi_timer_next_event(unsigned long delta,
166 struct clock_event_device *evt)
167{
 168 /* Unfortunately, the set_next_event interface only passes a relative
 169 * expiry, but we want an absolute expiry. It'd be better if we
170 * were passed an absolute expiry, since a bunch of time may
171 * have been stolen between the time the delta is computed and
172 * when we set the alarm below. */
173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
174
175 BUG_ON(evt->mode != CLOCK_EVT_MODE_ONESHOT);
176 vmi_timer_ops.set_alarm(VMI_ONESHOT, now + delta, 0);
177 return 0;
178}
179
180static struct clock_event_device vmi_clockevent = {
181 .name = "vmi-timer",
182 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
183 .shift = 22,
184 .set_mode = vmi_timer_set_mode,
185 .set_next_event = vmi_timer_next_event,
186 .rating = 1000,
187 .irq = 0,
188};
189
190static irqreturn_t vmi_timer_interrupt(int irq, void *dev_id)
191{
192 struct clock_event_device *evt = &__get_cpu_var(local_events);
193 evt->event_handler(evt);
194 return IRQ_HANDLED;
195}
196
197static struct irqaction vmi_clock_action = {
198 .name = "vmi-timer",
199 .handler = vmi_timer_interrupt,
200 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
201};
202
203static void __devinit vmi_time_init_clockevent(void)
204{
205 cycle_t cycles_per_msec;
206 struct clock_event_device *evt;
207
208 int cpu = smp_processor_id();
209 evt = &__get_cpu_var(local_events);
210
211 /* Use cycles_per_msec since div_sc params are 32-bits. */
212 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
213 (void)do_div(cycles_per_msec, 1000);
214
215 memcpy(evt, &vmi_clockevent, sizeof(*evt));
216 /* Must pick .shift such that .mult fits in 32-bits. Choosing
 217 * .shift to be 22 allows 2^(32-22) cycles per nanosecond
218 * before overflow. */
219 evt->mult = div_sc(cycles_per_msec, NSEC_PER_MSEC, evt->shift);
220 /* Upper bound is clockevent's use of ulong for cycle deltas. */
221 evt->max_delta_ns = clockevent_delta2ns(ULONG_MAX, evt);
222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
223 evt->cpumask = cpumask_of(cpu);
224
225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
226 evt->name, evt->mult, evt->shift);
227 clockevents_register_device(evt);
228}
229
230void __init vmi_time_init(void)
231{
232 unsigned int cpu;
 233 /* Disable PIT: BIOSes start PIT CH0 with an 18.2 Hz periodic tick. */
234 outb_pit(0x3a, PIT_MODE); /* binary, mode 5, LSB/MSB, ch 0 */
235
236 vmi_time_init_clockevent();
237 setup_irq(0, &vmi_clock_action);
238 for_each_possible_cpu(cpu)
239 per_cpu(vector_irq, cpu)[vmi_get_timer_vector()] = 0;
240}
241
242#ifdef CONFIG_X86_LOCAL_APIC
243void __devinit vmi_time_bsp_init(void)
244{
245 /*
246 * On APIC systems, we want local timers to fire on each cpu. We do
247 * this by programming LVTT to deliver timer events to the IRQ handler
248 * for IRQ-0, since we can't re-use the APIC local timer handler
249 * without interfering with that code.
250 */
251 clockevents_notify(CLOCK_EVT_NOTIFY_SUSPEND, NULL);
252 local_irq_disable();
253#ifdef CONFIG_SMP
254 /*
255 * XXX handle_percpu_irq only defined for SMP; we need to switch over
256 * to using it, since this is a local interrupt, which each CPU must
257 * handle individually without locking out or dropping simultaneous
258 * local timers on other CPUs. We also don't want to trigger the
259 * quirk workaround code for interrupts which gets invoked from
260 * handle_percpu_irq via eoi, so we use our own IRQ chip.
261 */
262 set_irq_chip_and_handler_name(0, &vmi_chip, handle_percpu_irq, "lvtt");
263#else
264 set_irq_chip_and_handler_name(0, &vmi_chip, handle_edge_irq, "lvtt");
265#endif
266 vmi_wiring = VMI_ALARM_WIRED_LVTT;
267 apic_write(APIC_LVTT, vmi_get_timer_vector());
268 local_irq_enable();
269 clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL);
270}
271
272void __devinit vmi_time_ap_init(void)
273{
274 vmi_time_init_clockevent();
275 apic_write(APIC_LVTT, vmi_get_timer_vector());
276}
277#endif
278
279/** vmi clocksource */
280static struct clocksource clocksource_vmi;
281
282static cycle_t read_real_cycles(struct clocksource *cs)
283{
284 cycle_t ret = (cycle_t)vmi_timer_ops.get_cycle_counter(VMI_CYCLES_REAL);
285 return max(ret, clocksource_vmi.cycle_last);
286}
287
288static struct clocksource clocksource_vmi = {
289 .name = "vmi-timer",
290 .rating = 450,
291 .read = read_real_cycles,
292 .mask = CLOCKSOURCE_MASK(64),
293 .mult = 0, /* to be set */
294 .shift = 22,
295 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
296};
297
298static int __init init_vmi_clocksource(void)
299{
300 cycle_t cycles_per_msec;
301
302 if (!vmi_timer_ops.get_cycle_frequency)
303 return 0;
304 /* Use khz2mult rather than hz2mult since hz arg is only 32-bits. */
305 cycles_per_msec = vmi_timer_ops.get_cycle_frequency();
306 (void)do_div(cycles_per_msec, 1000);
307
308 /* Note that clocksource.{mult, shift} converts in the opposite direction
 309 * from clockevents. */
310 clocksource_vmi.mult = clocksource_khz2mult(cycles_per_msec,
311 clocksource_vmi.shift);
312
313 printk(KERN_WARNING "vmi: registering clock source khz=%lld\n", cycles_per_msec);
314 return clocksource_register(&clocksource_vmi);
315
316}
317module_init(init_vmi_clocksource);
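Both halves of the removed timer code reduce to the same fixed-point conversion, just pointed in opposite directions: the clocksource mult/shift turn cycles into nanoseconds, while the clockevent mult/shift turn a nanosecond delta into cycles to program into the alarm. A small sketch of the two conversions (constants illustrative):

#include <stdint.h>

/*
 * clocksource direction: ns = (cycles * mult) >> shift, where mult
 * encodes the duration of one cycle in units of 2^-shift nanoseconds.
 */
static uint64_t cycles_to_ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

/*
 * clockevent direction: cycles = (ns * mult) >> shift, where mult encodes
 * the event frequency; shift (22 in the code above) is chosen so that
 * mult still fits in 32 bits.
 */
static uint64_t ns_to_cycles(uint64_t ns, uint32_t mult, uint32_t shift)
{
	return (ns * mult) >> shift;
}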
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index d0bb52296fa3..89aed99aafce 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -69,7 +69,7 @@ jiffies_64 = jiffies;
69 69
70PHDRS { 70PHDRS {
71 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
72 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(6); /* RW_ */
73#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
74 user PT_LOAD FLAGS(5); /* R_E */ 74 user PT_LOAD FLAGS(5); /* R_E */
75#ifdef CONFIG_SMP 75#ifdef CONFIG_SMP
@@ -105,6 +105,7 @@ SECTIONS
105 SCHED_TEXT 105 SCHED_TEXT
106 LOCK_TEXT 106 LOCK_TEXT
107 KPROBES_TEXT 107 KPROBES_TEXT
108 ENTRY_TEXT
108 IRQENTRY_TEXT 109 IRQENTRY_TEXT
109 *(.fixup) 110 *(.fixup)
110 *(.gnu.warning) 111 *(.gnu.warning)
@@ -116,6 +117,10 @@ SECTIONS
116 117
117 EXCEPTION_TABLE(16) :text = 0x9090 118 EXCEPTION_TABLE(16) :text = 0x9090
118 119
120#if defined(CONFIG_DEBUG_RODATA)
121 /* .text should occupy whole number of pages */
122 . = ALIGN(PAGE_SIZE);
123#endif
119 X64_ALIGN_DEBUG_RODATA_BEGIN 124 X64_ALIGN_DEBUG_RODATA_BEGIN
120 RO_DATA(PAGE_SIZE) 125 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END 126 X64_ALIGN_DEBUG_RODATA_END
@@ -156,6 +161,12 @@ SECTIONS
156 161
157#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0) 162#define VVIRT_OFFSET (VSYSCALL_ADDR - __vsyscall_0)
158#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) 163#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET)
164#define EMIT_VVAR(x, offset) .vsyscall_var_ ## x \
165 ADDR(.vsyscall_0) + offset \
166 : AT(VLOAD(.vsyscall_var_ ## x)) { \
167 *(.vsyscall_var_ ## x) \
168 } \
169 x = VVIRT(.vsyscall_var_ ## x);
159 170
160 . = ALIGN(4096); 171 . = ALIGN(4096);
161 __vsyscall_0 = .; 172 __vsyscall_0 = .;
@@ -170,18 +181,6 @@ SECTIONS
170 *(.vsyscall_fn) 181 *(.vsyscall_fn)
171 } 182 }
172 183
173 . = ALIGN(L1_CACHE_BYTES);
174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
175 *(.vsyscall_gtod_data)
176 }
177
178 vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data);
179 .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) {
180 *(.vsyscall_clock)
181 }
182 vsyscall_clock = VVIRT(.vsyscall_clock);
183
184
185 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { 184 .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) {
186 *(.vsyscall_1) 185 *(.vsyscall_1)
187 } 186 }
@@ -189,21 +188,14 @@ SECTIONS
189 *(.vsyscall_2) 188 *(.vsyscall_2)
190 } 189 }
191 190
192 .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) {
193 *(.vgetcpu_mode)
194 }
195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
196
197 . = ALIGN(L1_CACHE_BYTES);
198 .jiffies : AT(VLOAD(.jiffies)) {
199 *(.jiffies)
200 }
201 jiffies = VVIRT(.jiffies);
202
203 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { 191 .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) {
204 *(.vsyscall_3) 192 *(.vsyscall_3)
205 } 193 }
206 194
195#define __VVAR_KERNEL_LDS
196#include <asm/vvar.h>
197#undef __VVAR_KERNEL_LDS
198
207 . = __vsyscall_0 + PAGE_SIZE; 199 . = __vsyscall_0 + PAGE_SIZE;
208 200
209#undef VSYSCALL_ADDR 201#undef VSYSCALL_ADDR
@@ -211,6 +203,7 @@ SECTIONS
211#undef VLOAD 203#undef VLOAD
212#undef VVIRT_OFFSET 204#undef VVIRT_OFFSET
213#undef VVIRT 205#undef VVIRT
206#undef EMIT_VVAR
214 207
215#endif /* CONFIG_X86_64 */ 208#endif /* CONFIG_X86_64 */
216 209
@@ -226,7 +219,7 @@ SECTIONS
226 * output PHDR, so the next output section - .init.text - should 219 * output PHDR, so the next output section - .init.text - should
227 * start another segment - init. 220 * start another segment - init.
228 */ 221 */
229 PERCPU_VADDR(0, :percpu) 222 PERCPU_VADDR(INTERNODE_CACHE_BYTES, 0, :percpu)
230#endif 223#endif
231 224
232 INIT_TEXT_SECTION(PAGE_SIZE) 225 INIT_TEXT_SECTION(PAGE_SIZE)
@@ -236,12 +229,30 @@ SECTIONS
236 229
237 INIT_DATA_SECTION(16) 230 INIT_DATA_SECTION(16)
238 231
232 /*
233 * Code and data for a variety of lowlevel trampolines, to be
234 * copied into base memory (< 1 MiB) during initialization.
235 * Since it is copied early, the main copy can be discarded
236 * afterwards.
237 */
238 .x86_trampoline : AT(ADDR(.x86_trampoline) - LOAD_OFFSET) {
239 x86_trampoline_start = .;
240 *(.x86_trampoline)
241 x86_trampoline_end = .;
242 }
243
239 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { 244 .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) {
240 __x86_cpu_dev_start = .; 245 __x86_cpu_dev_start = .;
241 *(.x86_cpu_dev.init) 246 *(.x86_cpu_dev.init)
242 __x86_cpu_dev_end = .; 247 __x86_cpu_dev_end = .;
243 } 248 }
244 249
250 /*
251 * start address and size of operations which during runtime
252 * can be patched with virtualization friendly instructions or
253 * baremetal native ones. Think page table operations.
254 * Details in paravirt_types.h
255 */
245 . = ALIGN(8); 256 . = ALIGN(8);
246 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { 257 .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) {
247 __parainstructions = .; 258 __parainstructions = .;
@@ -249,6 +260,11 @@ SECTIONS
249 __parainstructions_end = .; 260 __parainstructions_end = .;
250 } 261 }
251 262
263 /*
264 * struct alt_inst entries. From the header (alternative.h):
265 * "Alternative instructions for different CPU types or capabilities"
266 * Think locking instructions on spinlocks.
267 */
252 . = ALIGN(8); 268 . = ALIGN(8);
253 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { 269 .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) {
254 __alt_instructions = .; 270 __alt_instructions = .;
@@ -256,11 +272,36 @@ SECTIONS
256 __alt_instructions_end = .; 272 __alt_instructions_end = .;
257 } 273 }
258 274
275 /*
276 * And here are the replacement instructions. The linker sticks
277 * them as binary blobs. The .altinstructions has enough data to
278 * get the address and the length of them to patch the kernel safely.
279 */
259 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { 280 .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) {
260 *(.altinstr_replacement) 281 *(.altinstr_replacement)
261 } 282 }
262 283
263 /* 284 /*
285 * struct iommu_table_entry entries are injected in this section.
286 * It is an array of IOMMUs which during run time gets sorted depending
287 * on its dependency order. After rootfs_initcall is complete
288 * this section can be safely removed.
289 */
290 .iommu_table : AT(ADDR(.iommu_table) - LOAD_OFFSET) {
291 __iommu_table = .;
292 *(.iommu_table)
293 __iommu_table_end = .;
294 }
295
296 . = ALIGN(8);
297 .apicdrivers : AT(ADDR(.apicdrivers) - LOAD_OFFSET) {
298 __apicdrivers = .;
299 *(.apicdrivers);
300 __apicdrivers_end = .;
301 }
302
303 . = ALIGN(8);
304 /*
264 * .exit.text is discard at runtime, not link time, to deal with 305 * .exit.text is discard at runtime, not link time, to deal with
265 * references from .altinstructions and .eh_frame 306 * references from .altinstructions and .eh_frame
266 */ 307 */
@@ -273,7 +314,7 @@ SECTIONS
273 } 314 }
274 315
275#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP) 316#if !defined(CONFIG_X86_64) || !defined(CONFIG_SMP)
276 PERCPU(PAGE_SIZE) 317 PERCPU_SECTION(INTERNODE_CACHE_BYTES)
277#endif 318#endif
278 319
279 . = ALIGN(PAGE_SIZE); 320 . = ALIGN(PAGE_SIZE);
@@ -307,7 +348,7 @@ SECTIONS
307 __bss_start = .; 348 __bss_start = .;
308 *(.bss..page_aligned) 349 *(.bss..page_aligned)
309 *(.bss) 350 *(.bss)
310 . = ALIGN(4); 351 . = ALIGN(PAGE_SIZE);
311 __bss_stop = .; 352 __bss_stop = .;
312 } 353 }
313 354
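The __start/__end symbol pairs that the linker script defines around these packed sections (.iommu_table, .apicdrivers, .altinstructions and so on) are consumed from C as array bounds. A generic sketch of the pattern; the entry layout is a placeholder, not any of the real section formats, and the symbols only exist once the linker script emits them:

struct table_entry {			/* placeholder entry layout */
	void (*init)(void);
};

/* Bracketing symbols provided by the linker script. */
extern struct table_entry __example_table[], __example_table_end[];

static void run_example_table(void)
{
	struct table_entry *p;

	for (p = __example_table; p < __example_table_end; p++)
		p->init();
}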
diff --git a/arch/x86/kernel/vread_tsc_64.c b/arch/x86/kernel/vread_tsc_64.c
new file mode 100644
index 000000000000..a81aa9e9894c
--- /dev/null
+++ b/arch/x86/kernel/vread_tsc_64.c
@@ -0,0 +1,36 @@
1/* This code runs in userspace. */
2
3#define DISABLE_BRANCH_PROFILING
4#include <asm/vgtod.h>
5
6notrace cycle_t __vsyscall_fn vread_tsc(void)
7{
8 cycle_t ret;
9 u64 last;
10
11 /*
12 * Empirically, a fence (of type that depends on the CPU)
13 * before rdtsc is enough to ensure that rdtsc is ordered
14 * with respect to loads. The various CPU manuals are unclear
15 * as to whether rdtsc can be reordered with later loads,
16 * but no one has ever seen it happen.
17 */
18 rdtsc_barrier();
19 ret = (cycle_t)vget_cycles();
20
21 last = VVAR(vsyscall_gtod_data).clock.cycle_last;
22
23 if (likely(ret >= last))
24 return ret;
25
26 /*
27 * GCC likes to generate cmov here, but this branch is extremely
 28 * predictable (it's just a function of time and the likely is
29 * very likely) and there's a data dependence, so force GCC
30 * to generate a branch instead. I don't barrier() because
31 * we don't actually need a barrier, and if this function
32 * ever gets inlined it will generate worse code.
33 */
34 asm volatile ("");
35 return last;
36}
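A userspace analogue of the same read, using compiler intrinsics in place of the kernel's rdtsc_barrier()/vget_cycles() helpers. This is only a sketch of the ordering-plus-clamp idea, not the vDSO code itself, and last_seen stands in for the VVAR copy of clock.cycle_last:

#include <stdint.h>
#include <x86intrin.h>

static uint64_t last_seen;	/* stand-in for clock.cycle_last */

static uint64_t monotonic_tsc(void)
{
	uint64_t now;

	_mm_lfence();		/* keep rdtsc from being hoisted above earlier loads */
	now = __rdtsc();

	/*
	 * Clamp to the last published value so callers never observe time
	 * running backwards across CPUs with slightly offset TSCs.
	 */
	return now >= last_seen ? now : last_seen;
}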
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index dcbb28c4b694..3e682184d76c 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -49,17 +49,10 @@
49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace 49 __attribute__ ((unused, __section__(".vsyscall_" #nr))) notrace
50#define __syscall_clobber "r11","cx","memory" 50#define __syscall_clobber "r11","cx","memory"
51 51
52/* 52DEFINE_VVAR(int, vgetcpu_mode);
53 * vsyscall_gtod_data contains data that is : 53DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) =
54 * - readonly from vsyscalls
55 * - written by timer interrupt or systcl (/proc/sys/kernel/vsyscall64)
56 * Try to keep this structure as small as possible to avoid cache line ping pongs
57 */
58int __vgetcpu_mode __section_vgetcpu_mode;
59
60struct vsyscall_gtod_data __vsyscall_gtod_data __section_vsyscall_gtod_data =
61{ 54{
62 .lock = SEQLOCK_UNLOCKED, 55 .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock),
63 .sysctl_enabled = 1, 56 .sysctl_enabled = 1,
64}; 57};
65 58
@@ -97,7 +90,7 @@ void update_vsyscall(struct timespec *wall_time, struct timespec *wtm,
97 */ 90 */
98static __always_inline void do_get_tz(struct timezone * tz) 91static __always_inline void do_get_tz(struct timezone * tz)
99{ 92{
100 *tz = __vsyscall_gtod_data.sys_tz; 93 *tz = VVAR(vsyscall_gtod_data).sys_tz;
101} 94}
102 95
103static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz) 96static __always_inline int gettimeofday(struct timeval *tv, struct timezone *tz)
@@ -126,23 +119,24 @@ static __always_inline void do_vgettimeofday(struct timeval * tv)
126 unsigned long mult, shift, nsec; 119 unsigned long mult, shift, nsec;
127 cycle_t (*vread)(void); 120 cycle_t (*vread)(void);
128 do { 121 do {
129 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 122 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
130 123
131 vread = __vsyscall_gtod_data.clock.vread; 124 vread = VVAR(vsyscall_gtod_data).clock.vread;
132 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled || !vread)) { 125 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled ||
126 !vread)) {
133 gettimeofday(tv,NULL); 127 gettimeofday(tv,NULL);
134 return; 128 return;
135 } 129 }
136 130
137 now = vread(); 131 now = vread();
138 base = __vsyscall_gtod_data.clock.cycle_last; 132 base = VVAR(vsyscall_gtod_data).clock.cycle_last;
139 mask = __vsyscall_gtod_data.clock.mask; 133 mask = VVAR(vsyscall_gtod_data).clock.mask;
140 mult = __vsyscall_gtod_data.clock.mult; 134 mult = VVAR(vsyscall_gtod_data).clock.mult;
141 shift = __vsyscall_gtod_data.clock.shift; 135 shift = VVAR(vsyscall_gtod_data).clock.shift;
142 136
143 tv->tv_sec = __vsyscall_gtod_data.wall_time_sec; 137 tv->tv_sec = VVAR(vsyscall_gtod_data).wall_time_sec;
144 nsec = __vsyscall_gtod_data.wall_time_nsec; 138 nsec = VVAR(vsyscall_gtod_data).wall_time_nsec;
145 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 139 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
146 140
147 /* calculate interval: */ 141 /* calculate interval: */
148 cycle_delta = (now - base) & mask; 142 cycle_delta = (now - base) & mask;
@@ -171,15 +165,15 @@ time_t __vsyscall(1) vtime(time_t *t)
171{ 165{
172 unsigned seq; 166 unsigned seq;
173 time_t result; 167 time_t result;
174 if (unlikely(!__vsyscall_gtod_data.sysctl_enabled)) 168 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
175 return time_syscall(t); 169 return time_syscall(t);
176 170
177 do { 171 do {
178 seq = read_seqbegin(&__vsyscall_gtod_data.lock); 172 seq = read_seqbegin(&VVAR(vsyscall_gtod_data).lock);
179 173
180 result = __vsyscall_gtod_data.wall_time_sec; 174 result = VVAR(vsyscall_gtod_data).wall_time_sec;
181 175
182 } while (read_seqretry(&__vsyscall_gtod_data.lock, seq)); 176 } while (read_seqretry(&VVAR(vsyscall_gtod_data).lock, seq));
183 177
184 if (t) 178 if (t)
185 *t = result; 179 *t = result;
@@ -208,9 +202,9 @@ vgetcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *tcache)
208 We do this here because otherwise user space would do it on 202 We do this here because otherwise user space would do it on
209 its own in a likely inferior way (no access to jiffies). 203 its own in a likely inferior way (no access to jiffies).
210 If you don't like it pass NULL. */ 204 If you don't like it pass NULL. */
211 if (tcache && tcache->blob[0] == (j = __jiffies)) { 205 if (tcache && tcache->blob[0] == (j = VVAR(jiffies))) {
212 p = tcache->blob[1]; 206 p = tcache->blob[1];
213 } else if (__vgetcpu_mode == VGETCPU_RDTSCP) { 207 } else if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
214 /* Load per CPU data from RDTSCP */ 208 /* Load per CPU data from RDTSCP */
215 native_read_tscp(&p); 209 native_read_tscp(&p);
216 } else { 210 } else {
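Every reader converted above follows the same seqlock pattern: snapshot the shared data between read_seqbegin() and read_seqretry(), and retry if a writer updated it in between. A generic in-kernel sketch of that loop (the snapshot struct is an illustrative subset, not the real vsyscall_gtod_data layout):

#include <linux/seqlock.h>
#include <linux/types.h>

struct gtod_snapshot {			/* illustrative subset of the shared data */
	u64 cycle_last;
	u64 wall_time_sec;
	u64 wall_time_nsec;
};

static struct gtod_snapshot read_gtod(const seqlock_t *lock,
				      const struct gtod_snapshot *shared)
{
	struct gtod_snapshot snap;
	unsigned int seq;

	do {
		seq = read_seqbegin(lock);	/* even count: no writer active */
		snap = *shared;			/* copy every field, tear-free  */
	} while (read_seqretry(lock, seq));	/* a writer raced us: retry     */

	return snap;
}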
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 1b950d151e58..9796c2f3d074 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -52,6 +52,7 @@ extern void *__memcpy(void *, const void *, __kernel_size_t);
52EXPORT_SYMBOL(memset); 52EXPORT_SYMBOL(memset);
53EXPORT_SYMBOL(memcpy); 53EXPORT_SYMBOL(memcpy);
54EXPORT_SYMBOL(__memcpy); 54EXPORT_SYMBOL(__memcpy);
55EXPORT_SYMBOL(memmove);
55 56
56EXPORT_SYMBOL(empty_zero_page); 57EXPORT_SYMBOL(empty_zero_page);
57#ifndef CONFIG_PARAVIRT 58#ifndef CONFIG_PARAVIRT
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index cd6da6bf3eca..6f164bd5e14d 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -6,10 +6,12 @@
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h> 7#include <linux/ioport.h>
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/pci.h>
9 10
10#include <asm/bios_ebda.h> 11#include <asm/bios_ebda.h>
11#include <asm/paravirt.h> 12#include <asm/paravirt.h>
12#include <asm/pci_x86.h> 13#include <asm/pci_x86.h>
14#include <asm/pci.h>
13#include <asm/mpspec.h> 15#include <asm/mpspec.h>
14#include <asm/setup.h> 16#include <asm/setup.h>
15#include <asm/apic.h> 17#include <asm/apic.h>
@@ -33,7 +35,7 @@ void iommu_shutdown_noop(void) { }
33struct x86_init_ops x86_init __initdata = { 35struct x86_init_ops x86_init __initdata = {
34 36
35 .resources = { 37 .resources = {
36 .probe_roms = x86_init_noop, 38 .probe_roms = probe_roms,
37 .reserve_resources = reserve_standard_io_resources, 39 .reserve_resources = reserve_standard_io_resources,
38 .memory_setup = default_machine_specific_memory_setup, 40 .memory_setup = default_machine_specific_memory_setup,
39 }, 41 },
@@ -59,6 +61,10 @@ struct x86_init_ops x86_init __initdata = {
59 .banner = default_banner, 61 .banner = default_banner,
60 }, 62 },
61 63
64 .mapping = {
65 .pagetable_reserve = native_pagetable_reserve,
66 },
67
62 .paging = { 68 .paging = {
63 .pagetable_setup_start = native_pagetable_setup_start, 69 .pagetable_setup_start = native_pagetable_setup_start,
64 .pagetable_setup_done = native_pagetable_setup_done, 70 .pagetable_setup_done = native_pagetable_setup_done,
@@ -68,6 +74,7 @@ struct x86_init_ops x86_init __initdata = {
68 .setup_percpu_clockev = setup_boot_APIC_clock, 74 .setup_percpu_clockev = setup_boot_APIC_clock,
69 .tsc_pre_init = x86_init_noop, 75 .tsc_pre_init = x86_init_noop,
70 .timer_init = hpet_time_init, 76 .timer_init = hpet_time_init,
77 .wallclock_init = x86_init_noop,
71 }, 78 },
72 79
73 .iommu = { 80 .iommu = {
@@ -99,3 +106,8 @@ struct x86_platform_ops x86_platform = {
99}; 106};
100 107
101EXPORT_SYMBOL_GPL(x86_platform); 108EXPORT_SYMBOL_GPL(x86_platform);
109struct x86_msi_ops x86_msi = {
110 .setup_msi_irqs = native_setup_msi_irqs,
111 .teardown_msi_irq = native_teardown_msi_irq,
112 .teardown_msi_irqs = default_teardown_msi_irqs,
113};
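The new x86_msi_ops structure turns MSI setup/teardown into replaceable hooks with native_* defaults, so a platform or hypervisor layer can override them at init time instead of patching the PCI core. A hedged sketch of such an override; the example_* names are placeholders, not real kernel functions, and the hook signatures are assumed from the defaults shown above.

#include <linux/init.h>
#include <linux/pci.h>
#include <asm/x86_init.h>

/* Placeholder backend: a paravirtual platform could route MSI allocation
 * through its own channel here instead of the native APIC path. */
static int example_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
{
        return -ENOSYS;         /* sketch only */
}

static void __init example_platform_msi_init(void)
{
        x86_msi.setup_msi_irqs = example_setup_msi_irqs;
        /* .teardown_msi_irq / .teardown_msi_irqs can be replaced the same way */
}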
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index 9c253bd65e24..a3911343976b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -53,7 +53,7 @@ void __sanitize_i387_state(struct task_struct *tsk)
53 53
54 /* 54 /*
55 * None of the feature bits are in init state. So nothing else 55 * None of the feature bits are in init state. So nothing else
56 * to do for us, as the memory layout is upto date. 56 * to do for us, as the memory layout is up to date.
57 */ 57 */
58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask) 58 if ((xstate_bv & pcntxt_mask) == pcntxt_mask)
59 return; 59 return;
@@ -394,7 +394,8 @@ static void __init setup_xstate_init(void)
394 * Setup init_xstate_buf to represent the init state of 394 * Setup init_xstate_buf to represent the init state of
395 * all the features managed by the xsave 395 * all the features managed by the xsave
396 */ 396 */
397 init_xstate_buf = alloc_bootmem(xstate_size); 397 init_xstate_buf = alloc_bootmem_align(xstate_size,
398 __alignof__(struct xsave_struct));
398 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT; 399 init_xstate_buf->i387.mxcsr = MXCSR_DEFAULT;
399 400
400 clts(); 401 clts();
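The xsave.c hunk switches init_xstate_buf to an explicitly aligned bootmem allocation because XSAVE/XRSTOR require a 64-byte-aligned save area; struct xsave_struct carries that alignment, but the allocator has to be told about it. A small sketch of the idea, for illustration only (header locations assumed):

#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/bootmem.h>
#include <asm/i387.h>                   /* struct xsave_struct, assumed location */

static struct xsave_struct *example_buf;

static void __init example_alloc_xstate(unsigned int xstate_size)
{
        example_buf = alloc_bootmem_align(xstate_size,
                                          __alignof__(struct xsave_struct));
        /* XSAVE raises #GP if the save area is not 64-byte aligned */
        BUG_ON((unsigned long)example_buf & 0x3f);
}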
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index 970bbd479516..50f63648ce1b 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,7 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select KVM_ASYNC_PF
31 select USER_RETURN_NOTIFIER 32 select USER_RETURN_NOTIFIER
32 select KVM_MMIO 33 select KVM_MMIO
33 ---help--- 34 ---help---
@@ -64,6 +65,13 @@ config KVM_AMD
64 To compile this as a module, choose M here: the module 65 To compile this as a module, choose M here: the module
65 will be called kvm-amd. 66 will be called kvm-amd.
66 67
68config KVM_MMU_AUDIT
69 bool "Audit KVM MMU"
70 depends on KVM && TRACEPOINTS
71 ---help---
 72 This option adds a R/W KVM module parameter 'mmu_audit', which allows
 73 auditing of the KVM MMU at runtime.
74
67# OK, it's a little counter-intuitive to do this, but it puts it neatly under 75# OK, it's a little counter-intuitive to do this, but it puts it neatly under
68# the virtualization menu. 76# the virtualization menu.
69source drivers/vhost/Kconfig 77source drivers/vhost/Kconfig
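The new KVM_MMU_AUDIT option describes a runtime-writable 'mmu_audit' module parameter. The real implementation hooks tracepoints (hence the TRACEPOINTS dependency), which is not shown in this hunk; below is only a minimal sketch of a boolean parameter gating audit calls, purely for illustration.

#include <linux/module.h>

static bool mmu_audit;
module_param(mmu_audit, bool, 0644);    /* writable via /sys/module/.../parameters/mmu_audit */

static inline void example_audit_point(const char *what)
{
        if (mmu_audit)
                pr_debug("kvm mmu audit: %s\n", what);
}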
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31a7035c4bd9..f15501f431c8 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -1,5 +1,5 @@
1 1
2EXTRA_CFLAGS += -Ivirt/kvm -Iarch/x86/kvm 2ccflags-y += -Ivirt/kvm -Iarch/x86/kvm
3 3
4CFLAGS_x86.o := -I. 4CFLAGS_x86.o := -I.
5CFLAGS_svm.o := -I. 5CFLAGS_svm.o := -I.
@@ -9,6 +9,7 @@ kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o \ 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o) 10 assigned-dev.o)
11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
12kvm-$(CONFIG_KVM_ASYNC_PF) += $(addprefix ../../../virt/kvm/, async_pf.o)
12 13
13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 14kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
14 i8254.o timer.o 15 i8254.o timer.o
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 66ca98aafdd6..adc98675cda0 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -9,7 +9,7 @@
9 * privileged instructions: 9 * privileged instructions:
10 * 10 *
11 * Copyright (C) 2006 Qumranet 11 * Copyright (C) 2006 Qumranet
12 * Copyright 2010 Red Hat, Inc. and/or its affilates. 12 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
13 * 13 *
14 * Avi Kivity <avi@qumranet.com> 14 * Avi Kivity <avi@qumranet.com>
15 * Yaniv Kamay <yaniv@qumranet.com> 15 * Yaniv Kamay <yaniv@qumranet.com>
@@ -20,16 +20,8 @@
20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4 20 * From: xen-unstable 10676:af9809f51f81a3c43f276f00c81a52ef558afda4
21 */ 21 */
22 22
23#ifndef __KERNEL__
24#include <stdio.h>
25#include <stdint.h>
26#include <public/xen.h>
27#define DPRINTF(_f, _a ...) printf(_f , ## _a)
28#else
29#include <linux/kvm_host.h> 23#include <linux/kvm_host.h>
30#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
31#define DPRINTF(x...) do {} while (0)
32#endif
33#include <linux/module.h> 25#include <linux/module.h>
34#include <asm/kvm_emulate.h> 26#include <asm/kvm_emulate.h>
35 27
@@ -51,39 +43,50 @@
51#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */ 43#define ImplicitOps (1<<1) /* Implicit in opcode. No generic decode. */
52#define DstReg (2<<1) /* Register operand. */ 44#define DstReg (2<<1) /* Register operand. */
53#define DstMem (3<<1) /* Memory operand. */ 45#define DstMem (3<<1) /* Memory operand. */
54#define DstAcc (4<<1) /* Destination Accumulator */ 46#define DstAcc (4<<1) /* Destination Accumulator */
55#define DstDI (5<<1) /* Destination is in ES:(E)DI */ 47#define DstDI (5<<1) /* Destination is in ES:(E)DI */
56#define DstMem64 (6<<1) /* 64bit memory operand */ 48#define DstMem64 (6<<1) /* 64bit memory operand */
57#define DstMask (7<<1) 49#define DstImmUByte (7<<1) /* 8-bit unsigned immediate operand */
50#define DstDX (8<<1) /* Destination is in DX register */
51#define DstMask (0xf<<1)
58/* Source operand type. */ 52/* Source operand type. */
59#define SrcNone (0<<4) /* No source operand. */ 53#define SrcNone (0<<5) /* No source operand. */
60#define SrcImplicit (0<<4) /* Source operand is implicit in the opcode. */ 54#define SrcReg (1<<5) /* Register operand. */
61#define SrcReg (1<<4) /* Register operand. */ 55#define SrcMem (2<<5) /* Memory operand. */
62#define SrcMem (2<<4) /* Memory operand. */ 56#define SrcMem16 (3<<5) /* Memory operand (16-bit). */
63#define SrcMem16 (3<<4) /* Memory operand (16-bit). */ 57#define SrcMem32 (4<<5) /* Memory operand (32-bit). */
64#define SrcMem32 (4<<4) /* Memory operand (32-bit). */ 58#define SrcImm (5<<5) /* Immediate operand. */
65#define SrcImm (5<<4) /* Immediate operand. */ 59#define SrcImmByte (6<<5) /* 8-bit sign-extended immediate operand. */
66#define SrcImmByte (6<<4) /* 8-bit sign-extended immediate operand. */ 60#define SrcOne (7<<5) /* Implied '1' */
67#define SrcOne (7<<4) /* Implied '1' */ 61#define SrcImmUByte (8<<5) /* 8-bit unsigned immediate operand. */
68#define SrcImmUByte (8<<4) /* 8-bit unsigned immediate operand. */ 62#define SrcImmU (9<<5) /* Immediate operand, unsigned */
69#define SrcImmU (9<<4) /* Immediate operand, unsigned */ 63#define SrcSI (0xa<<5) /* Source is in the DS:RSI */
70#define SrcSI (0xa<<4) /* Source is in the DS:RSI */ 64#define SrcImmFAddr (0xb<<5) /* Source is immediate far address */
71#define SrcImmFAddr (0xb<<4) /* Source is immediate far address */ 65#define SrcMemFAddr (0xc<<5) /* Source is far address in memory */
72#define SrcMemFAddr (0xc<<4) /* Source is far address in memory */ 66#define SrcAcc (0xd<<5) /* Source Accumulator */
73#define SrcAcc (0xd<<4) /* Source Accumulator */ 67#define SrcImmU16 (0xe<<5) /* Immediate operand, unsigned, 16 bits */
74#define SrcMask (0xf<<4) 68#define SrcDX (0xf<<5) /* Source is in DX register */
69#define SrcMask (0xf<<5)
75/* Generic ModRM decode. */ 70/* Generic ModRM decode. */
76#define ModRM (1<<8) 71#define ModRM (1<<9)
77/* Destination is only written; never read. */ 72/* Destination is only written; never read. */
78#define Mov (1<<9) 73#define Mov (1<<10)
79#define BitOp (1<<10) 74#define BitOp (1<<11)
80#define MemAbs (1<<11) /* Memory operand is absolute displacement */ 75#define MemAbs (1<<12) /* Memory operand is absolute displacement */
81#define String (1<<12) /* String instruction (rep capable) */ 76#define String (1<<13) /* String instruction (rep capable) */
82#define Stack (1<<13) /* Stack instruction (push/pop) */ 77#define Stack (1<<14) /* Stack instruction (push/pop) */
83#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 78#define GroupMask (7<<15) /* Opcode uses one of the group mechanisms */
84#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 79#define Group (1<<15) /* Bits 3:5 of modrm byte extend opcode */
85#define GroupMask 0xff /* Group number stored in bits 0:7 */ 80#define GroupDual (2<<15) /* Alternate decoding of mod == 3 */
81#define Prefix (3<<15) /* Instruction varies with 66/f2/f3 prefix */
82#define RMExt (4<<15) /* Opcode extension in ModRM r/m if mod == 3 */
83#define Sse (1<<18) /* SSE Vector instruction */
86/* Misc flags */ 84/* Misc flags */
85#define Prot (1<<21) /* instruction generates #UD if not in prot-mode */
86#define VendorSpecific (1<<22) /* Vendor specific instruction */
87#define NoAccess (1<<23) /* Don't access memory (lea/invlpg/verr etc) */
88#define Op3264 (1<<24) /* Operand is 64b in long mode, 32b otherwise */
89#define Undefined (1<<25) /* No Such Instruction */
87#define Lock (1<<26) /* lock prefix is allowed for the instruction */ 90#define Lock (1<<26) /* lock prefix is allowed for the instruction */
88#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */ 91#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
89#define No64 (1<<28) 92#define No64 (1<<28)
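This hunk widens the destination field to four bits (DstMask), moves the source field up to bits 5-8 and the group field to bits 15-17, and adds new decode properties (Sse, Prot, NoAccess, Op3264, Undefined, ...). A short sketch of how a decoder pulls the packed fields back out of the u32, matching the #defines above; the helper itself is made up for illustration.

#include <linux/kernel.h>

static void example_describe_flags(u32 d)
{
        switch (d & SrcMask) {          /* bits 5-8 after this patch */
        case SrcImmByte:
                pr_debug("source: sign-extended byte immediate\n");
                break;
        case SrcDX:
                pr_debug("source: DX register\n");
                break;
        }

        if ((d & DstMask) == DstReg)    /* bits 1-4 */
                pr_debug("destination: general register\n");
        if (d & ModRM)                  /* now bit 9 */
                pr_debug("instruction carries a ModRM byte\n");
}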
@@ -92,285 +95,40 @@
92#define Src2CL (1<<29) 95#define Src2CL (1<<29)
93#define Src2ImmByte (2<<29) 96#define Src2ImmByte (2<<29)
94#define Src2One (3<<29) 97#define Src2One (3<<29)
98#define Src2Imm (4<<29)
95#define Src2Mask (7<<29) 99#define Src2Mask (7<<29)
96 100
97enum { 101#define X2(x...) x, x
98 Group1_80, Group1_81, Group1_82, Group1_83, 102#define X3(x...) X2(x), x
99 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 103#define X4(x...) X2(x), X2(x)
100 Group8, Group9, 104#define X5(x...) X4(x), x
101}; 105#define X6(x...) X4(x), X2(x)
102 106#define X7(x...) X4(x), X3(x)
103static u32 opcode_table[256] = { 107#define X8(x...) X4(x), X4(x)
104 /* 0x00 - 0x07 */ 108#define X16(x...) X8(x), X8(x)
105 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 109
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 110struct opcode {
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 111 u32 flags;
108 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64, 112 u8 intercept;
109 /* 0x08 - 0x0F */ 113 union {
110 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 114 int (*execute)(struct x86_emulate_ctxt *ctxt);
111 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 115 struct opcode *group;
112 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 116 struct group_dual *gdual;
113 ImplicitOps | Stack | No64, 0, 117 struct gprefix *gprefix;
114 /* 0x10 - 0x17 */ 118 } u;
115 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock, 119 int (*check_perm)(struct x86_emulate_ctxt *ctxt);
116 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
117 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
118 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
119 /* 0x18 - 0x1F */
120 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
121 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
122 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
123 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
124 /* 0x20 - 0x27 */
125 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
126 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
127 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
128 /* 0x28 - 0x2F */
129 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
130 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
131 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
132 /* 0x30 - 0x37 */
133 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
134 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
135 ByteOp | DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
136 /* 0x38 - 0x3F */
137 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
138 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
139 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
140 0, 0,
141 /* 0x40 - 0x47 */
142 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
143 /* 0x48 - 0x4F */
144 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
145 /* 0x50 - 0x57 */
146 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
147 SrcReg | Stack, SrcReg | Stack, SrcReg | Stack, SrcReg | Stack,
148 /* 0x58 - 0x5F */
149 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
150 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
151 /* 0x60 - 0x67 */
152 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
153 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
154 0, 0, 0, 0,
155 /* 0x68 - 0x6F */
156 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
157 DstDI | ByteOp | Mov | String, DstDI | Mov | String, /* insb, insw/insd */
158 SrcSI | ByteOp | ImplicitOps | String, SrcSI | ImplicitOps | String, /* outsb, outsw/outsd */
159 /* 0x70 - 0x77 */
160 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
161 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
162 /* 0x78 - 0x7F */
163 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
164 SrcImmByte, SrcImmByte, SrcImmByte, SrcImmByte,
165 /* 0x80 - 0x87 */
166 Group | Group1_80, Group | Group1_81,
167 Group | Group1_82, Group | Group1_83,
168 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
169 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
170 /* 0x88 - 0x8F */
171 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
172 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
173 DstMem | SrcNone | ModRM | Mov, ModRM | DstReg,
174 ImplicitOps | SrcMem16 | ModRM, Group | Group1A,
175 /* 0x90 - 0x97 */
176 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
177 /* 0x98 - 0x9F */
178 0, 0, SrcImmFAddr | No64, 0,
179 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
180 /* 0xA0 - 0xA7 */
181 ByteOp | DstAcc | SrcMem | Mov | MemAbs, DstAcc | SrcMem | Mov | MemAbs,
182 ByteOp | DstMem | SrcAcc | Mov | MemAbs, DstMem | SrcAcc | Mov | MemAbs,
183 ByteOp | SrcSI | DstDI | Mov | String, SrcSI | DstDI | Mov | String,
184 ByteOp | SrcSI | DstDI | String, SrcSI | DstDI | String,
185 /* 0xA8 - 0xAF */
186 DstAcc | SrcImmByte | ByteOp, DstAcc | SrcImm, ByteOp | DstDI | Mov | String, DstDI | Mov | String,
187 ByteOp | SrcSI | DstAcc | Mov | String, SrcSI | DstAcc | Mov | String,
188 ByteOp | DstDI | String, DstDI | String,
189 /* 0xB0 - 0xB7 */
190 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
191 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
192 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
193 ByteOp | DstReg | SrcImm | Mov, ByteOp | DstReg | SrcImm | Mov,
194 /* 0xB8 - 0xBF */
195 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
196 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
197 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
198 DstReg | SrcImm | Mov, DstReg | SrcImm | Mov,
199 /* 0xC0 - 0xC7 */
200 ByteOp | DstMem | SrcImm | ModRM, DstMem | SrcImmByte | ModRM,
201 0, ImplicitOps | Stack, 0, 0,
202 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
203 /* 0xC8 - 0xCF */
204 0, 0, 0, ImplicitOps | Stack,
205 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
206 /* 0xD0 - 0xD7 */
207 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
208 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
209 0, 0, 0, 0,
210 /* 0xD8 - 0xDF */
211 0, 0, 0, 0, 0, 0, 0, 0,
212 /* 0xE0 - 0xE7 */
213 0, 0, 0, 0,
214 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
215 ByteOp | SrcImmUByte | DstAcc, SrcImmUByte | DstAcc,
216 /* 0xE8 - 0xEF */
217 SrcImm | Stack, SrcImm | ImplicitOps,
218 SrcImmFAddr | No64, SrcImmByte | ImplicitOps,
219 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
220 SrcNone | ByteOp | DstAcc, SrcNone | DstAcc,
221 /* 0xF0 - 0xF7 */
222 0, 0, 0, 0,
223 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
224 /* 0xF8 - 0xFF */
225 ImplicitOps, 0, ImplicitOps, ImplicitOps,
226 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
227};
228
229static u32 twobyte_table[256] = {
230 /* 0x00 - 0x0F */
231 0, Group | GroupDual | Group7, 0, 0,
232 0, ImplicitOps, ImplicitOps | Priv, 0,
233 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
234 0, ImplicitOps | ModRM, 0, 0,
235 /* 0x10 - 0x1F */
236 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
237 /* 0x20 - 0x2F */
238 ModRM | ImplicitOps | Priv, ModRM | Priv,
239 ModRM | ImplicitOps | Priv, ModRM | Priv,
240 0, 0, 0, 0,
241 0, 0, 0, 0, 0, 0, 0, 0,
242 /* 0x30 - 0x3F */
243 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
244 ImplicitOps, ImplicitOps | Priv, 0, 0,
245 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0x40 - 0x47 */
247 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
248 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
249 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
250 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
251 /* 0x48 - 0x4F */
252 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
253 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
254 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
255 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
256 /* 0x50 - 0x5F */
257 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
258 /* 0x60 - 0x6F */
259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
260 /* 0x70 - 0x7F */
261 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
262 /* 0x80 - 0x8F */
263 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
264 SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm, SrcImm,
265 /* 0x90 - 0x9F */
266 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xA0 - 0xA7 */
268 ImplicitOps | Stack, ImplicitOps | Stack,
269 0, DstMem | SrcReg | ModRM | BitOp,
270 DstMem | SrcReg | Src2ImmByte | ModRM,
271 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
272 /* 0xA8 - 0xAF */
273 ImplicitOps | Stack, ImplicitOps | Stack,
274 0, DstMem | SrcReg | ModRM | BitOp | Lock,
275 DstMem | SrcReg | Src2ImmByte | ModRM,
276 DstMem | SrcReg | Src2CL | ModRM,
277 ModRM, 0,
278 /* 0xB0 - 0xB7 */
279 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
280 0, DstMem | SrcReg | ModRM | BitOp | Lock,
281 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
282 DstReg | SrcMem16 | ModRM | Mov,
283 /* 0xB8 - 0xBF */
284 0, 0,
285 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
286 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
287 DstReg | SrcMem16 | ModRM | Mov,
288 /* 0xC0 - 0xCF */
289 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
290 0, 0, 0, Group | GroupDual | Group9,
291 0, 0, 0, 0, 0, 0, 0, 0,
292 /* 0xD0 - 0xDF */
293 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
294 /* 0xE0 - 0xEF */
295 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
296 /* 0xF0 - 0xFF */
297 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
298}; 120};
299 121
300static u32 group_table[] = { 122struct group_dual {
301 [Group1_80*8] = 123 struct opcode mod012[8];
302 ByteOp | DstMem | SrcImm | ModRM | Lock, 124 struct opcode mod3[8];
303 ByteOp | DstMem | SrcImm | ModRM | Lock,
304 ByteOp | DstMem | SrcImm | ModRM | Lock,
305 ByteOp | DstMem | SrcImm | ModRM | Lock,
306 ByteOp | DstMem | SrcImm | ModRM | Lock,
307 ByteOp | DstMem | SrcImm | ModRM | Lock,
308 ByteOp | DstMem | SrcImm | ModRM | Lock,
309 ByteOp | DstMem | SrcImm | ModRM,
310 [Group1_81*8] =
311 DstMem | SrcImm | ModRM | Lock,
312 DstMem | SrcImm | ModRM | Lock,
313 DstMem | SrcImm | ModRM | Lock,
314 DstMem | SrcImm | ModRM | Lock,
315 DstMem | SrcImm | ModRM | Lock,
316 DstMem | SrcImm | ModRM | Lock,
317 DstMem | SrcImm | ModRM | Lock,
318 DstMem | SrcImm | ModRM,
319 [Group1_82*8] =
320 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
321 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
322 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
323 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
324 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
325 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
326 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
327 ByteOp | DstMem | SrcImm | ModRM | No64,
328 [Group1_83*8] =
329 DstMem | SrcImmByte | ModRM | Lock,
330 DstMem | SrcImmByte | ModRM | Lock,
331 DstMem | SrcImmByte | ModRM | Lock,
332 DstMem | SrcImmByte | ModRM | Lock,
333 DstMem | SrcImmByte | ModRM | Lock,
334 DstMem | SrcImmByte | ModRM | Lock,
335 DstMem | SrcImmByte | ModRM | Lock,
336 DstMem | SrcImmByte | ModRM,
337 [Group1A*8] =
338 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
339 [Group3_Byte*8] =
340 ByteOp | SrcImm | DstMem | ModRM, ByteOp | SrcImm | DstMem | ModRM,
341 ByteOp | DstMem | SrcNone | ModRM, ByteOp | DstMem | SrcNone | ModRM,
342 0, 0, 0, 0,
343 [Group3*8] =
344 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM,
345 DstMem | SrcNone | ModRM, DstMem | SrcNone | ModRM,
346 0, 0, 0, 0,
347 [Group4*8] =
348 ByteOp | DstMem | SrcNone | ModRM | Lock, ByteOp | DstMem | SrcNone | ModRM | Lock,
349 0, 0, 0, 0, 0, 0,
350 [Group5*8] =
351 DstMem | SrcNone | ModRM | Lock, DstMem | SrcNone | ModRM | Lock,
352 SrcMem | ModRM | Stack, 0,
353 SrcMem | ModRM | Stack, SrcMemFAddr | ModRM | ImplicitOps,
354 SrcMem | ModRM | Stack, 0,
355 [Group7*8] =
356 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
357 SrcNone | ModRM | DstMem | Mov, 0,
358 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
359 [Group8*8] =
360 0, 0, 0, 0,
361 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
362 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
363 [Group9*8] =
364 0, DstMem64 | ModRM | Lock, 0, 0, 0, 0, 0, 0,
365}; 125};
366 126
367static u32 group2_table[] = { 127struct gprefix {
368 [Group7*8] = 128 struct opcode pfx_no;
369 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM | Priv, 129 struct opcode pfx_66;
370 SrcNone | ModRM | DstMem | Mov, 0, 130 struct opcode pfx_f2;
371 SrcMem16 | ModRM | Mov | Priv, 0, 131 struct opcode pfx_f3;
372 [Group9*8] =
373 0, 0, 0, 0, 0, 0, 0, 0,
374}; 132};
375 133
376/* EFLAGS bit definitions. */ 134/* EFLAGS bit definitions. */
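The flat u32 opcode_table/twobyte_table/group_table arrays are replaced by struct opcode entries that can carry an execute handler, a group, a mod==3 dual group, or a prefix-selected set, with the X2..X16 macros repeating an initializer to fill table rows. The kernel builds its real tables with its own constructor macros; the sketch below only shows the shape of the data (em_example and OPC are invented for illustration).

/* Hypothetical handler: does nothing, just fills the execute slot. */
static int em_example(struct x86_emulate_ctxt *ctxt)
{
        return X86EMUL_CONTINUE;
}

#define OPC(f)  { .flags = (f), .u.execute = em_example }

/* Sixteen identical entries, e.g. one per register-encoded opcode byte. */
static const struct opcode example_row[16] = {
        X16(OPC(DstReg)),
};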
@@ -392,6 +150,9 @@ static u32 group2_table[] = {
392#define EFLG_PF (1<<2) 150#define EFLG_PF (1<<2)
393#define EFLG_CF (1<<0) 151#define EFLG_CF (1<<0)
394 152
153#define EFLG_RESERVED_ZEROS_MASK 0xffc0802a
154#define EFLG_RESERVED_ONE_MASK 2
155
395/* 156/*
396 * Instruction emulation: 157 * Instruction emulation:
397 * Most instructions are emulated directly via a fragment of inline assembly 158 * Most instructions are emulated directly via a fragment of inline assembly
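The two new masks capture which EFLAGS bits are architecturally fixed: the bits in EFLG_RESERVED_ZEROS_MASK must read as zero and bit 1 must read as one. A one-line sanitiser in the spirit of what an emulated flags load would do (sketch only, not the kernel's code):

static unsigned long example_sanitize_eflags(unsigned long val)
{
        return (val & ~EFLG_RESERVED_ZEROS_MASK) | EFLG_RESERVED_ONE_MASK;
}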
@@ -444,13 +205,13 @@ static u32 group2_table[] = {
444#define ON64(x) 205#define ON64(x)
445#endif 206#endif
446 207
447#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix) \ 208#define ____emulate_2op(_op, _src, _dst, _eflags, _x, _y, _suffix, _dsttype) \
448 do { \ 209 do { \
449 __asm__ __volatile__ ( \ 210 __asm__ __volatile__ ( \
450 _PRE_EFLAGS("0", "4", "2") \ 211 _PRE_EFLAGS("0", "4", "2") \
451 _op _suffix " %"_x"3,%1; " \ 212 _op _suffix " %"_x"3,%1; " \
452 _POST_EFLAGS("0", "4", "2") \ 213 _POST_EFLAGS("0", "4", "2") \
453 : "=m" (_eflags), "=m" ((_dst).val), \ 214 : "=m" (_eflags), "+q" (*(_dsttype*)&(_dst).val),\
454 "=&r" (_tmp) \ 215 "=&r" (_tmp) \
455 : _y ((_src).val), "i" (EFLAGS_MASK)); \ 216 : _y ((_src).val), "i" (EFLAGS_MASK)); \
456 } while (0) 217 } while (0)
@@ -463,13 +224,13 @@ static u32 group2_table[] = {
463 \ 224 \
464 switch ((_dst).bytes) { \ 225 switch ((_dst).bytes) { \
465 case 2: \ 226 case 2: \
466 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w"); \ 227 ____emulate_2op(_op,_src,_dst,_eflags,_wx,_wy,"w",u16);\
467 break; \ 228 break; \
468 case 4: \ 229 case 4: \
469 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l"); \ 230 ____emulate_2op(_op,_src,_dst,_eflags,_lx,_ly,"l",u32);\
470 break; \ 231 break; \
471 case 8: \ 232 case 8: \
472 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q")); \ 233 ON64(____emulate_2op(_op,_src,_dst,_eflags,_qx,_qy,"q",u64)); \
473 break; \ 234 break; \
474 } \ 235 } \
475 } while (0) 236 } while (0)
@@ -479,7 +240,7 @@ static u32 group2_table[] = {
479 unsigned long _tmp; \ 240 unsigned long _tmp; \
480 switch ((_dst).bytes) { \ 241 switch ((_dst).bytes) { \
481 case 1: \ 242 case 1: \
482 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b"); \ 243 ____emulate_2op(_op,_src,_dst,_eflags,_bx,_by,"b",u8); \
483 break; \ 244 break; \
484 default: \ 245 default: \
485 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \ 246 __emulate_2op_nobyte(_op, _src, _dst, _eflags, \
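____emulate_2op() now takes a destination type and uses a typed "+q" register operand, so the ALU instruction reads and writes the destination at its real width instead of going through a write-only memory operand. A standalone, simplified version of the same pattern (run an x86 ALU op on a typed destination and capture the flags it produces); this is a sketch, not the kernel macro.

#include <linux/types.h>

static unsigned long example_add32_with_flags(u32 *dst, u32 src)
{
        unsigned long flags;

        asm volatile("addl %2, %1\n\t"
                     "pushf\n\t"
                     "pop %0"
                     : "=&r" (flags), "+q" (*dst)
                     : "r" (src)
                     : "cc");
        return flags;
}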
@@ -504,42 +265,42 @@ static u32 group2_table[] = {
504 "w", "r", _LO32, "r", "", "r") 265 "w", "r", _LO32, "r", "", "r")
505 266
506/* Instruction has three operands and one operand is stored in ECX register */ 267/* Instruction has three operands and one operand is stored in ECX register */
507#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \ 268#define __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, _suffix, _type) \
508 do { \ 269 do { \
509 unsigned long _tmp; \ 270 unsigned long _tmp; \
510 _type _clv = (_cl).val; \ 271 _type _clv = (_cl).val; \
511 _type _srcv = (_src).val; \ 272 _type _srcv = (_src).val; \
512 _type _dstv = (_dst).val; \ 273 _type _dstv = (_dst).val; \
513 \ 274 \
514 __asm__ __volatile__ ( \ 275 __asm__ __volatile__ ( \
515 _PRE_EFLAGS("0", "5", "2") \ 276 _PRE_EFLAGS("0", "5", "2") \
516 _op _suffix " %4,%1 \n" \ 277 _op _suffix " %4,%1 \n" \
517 _POST_EFLAGS("0", "5", "2") \ 278 _POST_EFLAGS("0", "5", "2") \
518 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \ 279 : "=m" (_eflags), "+r" (_dstv), "=&r" (_tmp) \
519 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \ 280 : "c" (_clv) , "r" (_srcv), "i" (EFLAGS_MASK) \
520 ); \ 281 ); \
521 \ 282 \
522 (_cl).val = (unsigned long) _clv; \ 283 (_cl).val = (unsigned long) _clv; \
523 (_src).val = (unsigned long) _srcv; \ 284 (_src).val = (unsigned long) _srcv; \
524 (_dst).val = (unsigned long) _dstv; \ 285 (_dst).val = (unsigned long) _dstv; \
525 } while (0) 286 } while (0)
526 287
527#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \ 288#define emulate_2op_cl(_op, _cl, _src, _dst, _eflags) \
528 do { \ 289 do { \
529 switch ((_dst).bytes) { \ 290 switch ((_dst).bytes) { \
530 case 2: \ 291 case 2: \
531 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 292 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
532 "w", unsigned short); \ 293 "w", unsigned short); \
533 break; \ 294 break; \
534 case 4: \ 295 case 4: \
535 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 296 __emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
536 "l", unsigned int); \ 297 "l", unsigned int); \
537 break; \ 298 break; \
538 case 8: \ 299 case 8: \
539 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \ 300 ON64(__emulate_2op_cl(_op, _cl, _src, _dst, _eflags, \
540 "q", unsigned long)); \ 301 "q", unsigned long)); \
541 break; \ 302 break; \
542 } \ 303 } \
543 } while (0) 304 } while (0)
544 305
545#define __emulate_1op(_op, _dst, _eflags, _suffix) \ 306#define __emulate_1op(_op, _dst, _eflags, _suffix) \
@@ -566,6 +327,86 @@ static u32 group2_table[] = {
566 } \ 327 } \
567 } while (0) 328 } while (0)
568 329
330#define __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags, _suffix) \
331 do { \
332 unsigned long _tmp; \
333 \
334 __asm__ __volatile__ ( \
335 _PRE_EFLAGS("0", "4", "1") \
336 _op _suffix " %5; " \
337 _POST_EFLAGS("0", "4", "1") \
338 : "=m" (_eflags), "=&r" (_tmp), \
339 "+a" (_rax), "+d" (_rdx) \
340 : "i" (EFLAGS_MASK), "m" ((_src).val), \
341 "a" (_rax), "d" (_rdx)); \
342 } while (0)
343
344#define __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _suffix, _ex) \
345 do { \
346 unsigned long _tmp; \
347 \
348 __asm__ __volatile__ ( \
349 _PRE_EFLAGS("0", "5", "1") \
350 "1: \n\t" \
351 _op _suffix " %6; " \
352 "2: \n\t" \
353 _POST_EFLAGS("0", "5", "1") \
354 ".pushsection .fixup,\"ax\" \n\t" \
355 "3: movb $1, %4 \n\t" \
356 "jmp 2b \n\t" \
357 ".popsection \n\t" \
358 _ASM_EXTABLE(1b, 3b) \
359 : "=m" (_eflags), "=&r" (_tmp), \
360 "+a" (_rax), "+d" (_rdx), "+qm"(_ex) \
361 : "i" (EFLAGS_MASK), "m" ((_src).val), \
362 "a" (_rax), "d" (_rdx)); \
363 } while (0)
364
365/* instruction has only one source operand, destination is implicit (e.g. mul, div, imul, idiv) */
366#define emulate_1op_rax_rdx(_op, _src, _rax, _rdx, _eflags) \
367 do { \
368 switch((_src).bytes) { \
369 case 1: \
370 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
371 _eflags, "b"); \
372 break; \
373 case 2: \
374 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
375 _eflags, "w"); \
376 break; \
377 case 4: \
378 __emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
379 _eflags, "l"); \
380 break; \
381 case 8: \
382 ON64(__emulate_1op_rax_rdx(_op, _src, _rax, _rdx, \
383 _eflags, "q")); \
384 break; \
385 } \
386 } while (0)
387
388#define emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, _eflags, _ex) \
389 do { \
390 switch((_src).bytes) { \
391 case 1: \
392 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
393 _eflags, "b", _ex); \
394 break; \
395 case 2: \
396 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
397 _eflags, "w", _ex); \
398 break; \
399 case 4: \
400 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
401 _eflags, "l", _ex); \
402 break; \
403 case 8: ON64( \
404 __emulate_1op_rax_rdx_ex(_op, _src, _rax, _rdx, \
405 _eflags, "q", _ex)); \
406 break; \
407 } \
408 } while (0)
409
569/* Fetch next part of the instruction being emulated. */ 410/* Fetch next part of the instruction being emulated. */
570#define insn_fetch(_type, _size, _eip) \ 411#define insn_fetch(_type, _size, _eip) \
571({ unsigned long _x; \ 412({ unsigned long _x; \
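The new _ex variants wrap the instruction in an exception-table fixup so a faulting div/idiv sets the caller's _ex flag instead of oopsing the host; the emulator can then inject #DE into the guest. Roughly how a divide handler would be expected to use it; the wrapper function is hypothetical, while the macro and emulate_de() come from the hunks above.

static int example_em_div(struct x86_emulate_ctxt *ctxt)
{
        struct decode_cache *c = &ctxt->decode;
        u8 de = 0;

        emulate_1op_rax_rdx_ex("div", c->src, c->regs[VCPU_REGS_RAX],
                               c->regs[VCPU_REGS_RDX], ctxt->eflags, de);
        if (de)
                return emulate_de(ctxt);        /* surfaces as a guest #DE */
        return X86EMUL_CONTINUE;
}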
@@ -576,13 +417,33 @@ static u32 group2_table[] = {
576 (_type)_x; \ 417 (_type)_x; \
577}) 418})
578 419
579#define insn_fetch_arr(_arr, _size, _eip) \ 420#define insn_fetch_arr(_arr, _size, _eip) \
580({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \ 421({ rc = do_insn_fetch(ctxt, ops, (_eip), _arr, (_size)); \
581 if (rc != X86EMUL_CONTINUE) \ 422 if (rc != X86EMUL_CONTINUE) \
582 goto done; \ 423 goto done; \
583 (_eip) += (_size); \ 424 (_eip) += (_size); \
584}) 425})
585 426
427static int emulator_check_intercept(struct x86_emulate_ctxt *ctxt,
428 enum x86_intercept intercept,
429 enum x86_intercept_stage stage)
430{
431 struct x86_instruction_info info = {
432 .intercept = intercept,
433 .rep_prefix = ctxt->decode.rep_prefix,
434 .modrm_mod = ctxt->decode.modrm_mod,
435 .modrm_reg = ctxt->decode.modrm_reg,
436 .modrm_rm = ctxt->decode.modrm_rm,
437 .src_val = ctxt->decode.src.val64,
438 .src_bytes = ctxt->decode.src.bytes,
439 .dst_bytes = ctxt->decode.dst.bytes,
440 .ad_bytes = ctxt->decode.ad_bytes,
441 .next_rip = ctxt->eip,
442 };
443
444 return ctxt->ops->intercept(ctxt, &info, stage);
445}
446
586static inline unsigned long ad_mask(struct decode_cache *c) 447static inline unsigned long ad_mask(struct decode_cache *c)
587{ 448{
588 return (1UL << (c->ad_bytes << 3)) - 1; 449 return (1UL << (c->ad_bytes << 3)) - 1;
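emulator_check_intercept() packages the decoded instruction into x86_instruction_info and asks the backend (e.g. nested SVM) whether this stage of emulation is intercepted. A hedged sketch of a call site inside the emulation loop; the intercept and stage enumerator names are placeholders for whatever asm/kvm_emulate.h actually defines.

/* Sketch: bail out of emulation if the instruction is intercepted
 * before exception checks would run. */
rc = emulator_check_intercept(ctxt, x86_intercept_rdtsc,
                              X86_ICPT_PRE_EXCEPT);
if (rc != X86EMUL_CONTINUE)
        goto done;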
@@ -599,9 +460,9 @@ address_mask(struct decode_cache *c, unsigned long reg)
599} 460}
600 461
601static inline unsigned long 462static inline unsigned long
602register_address(struct decode_cache *c, unsigned long base, unsigned long reg) 463register_address(struct decode_cache *c, unsigned long reg)
603{ 464{
604 return base + address_mask(c, reg); 465 return address_mask(c, reg);
605} 466}
606 467
607static inline void 468static inline void
@@ -618,6 +479,13 @@ static inline void jmp_rel(struct decode_cache *c, int rel)
618 register_address_increment(c, &c->eip, rel); 479 register_address_increment(c, &c->eip, rel);
619} 480}
620 481
482static u32 desc_limit_scaled(struct desc_struct *desc)
483{
484 u32 limit = get_desc_limit(desc);
485
486 return desc->g ? (limit << 12) | 0xfff : limit;
487}
488
621static void set_seg_override(struct decode_cache *c, int seg) 489static void set_seg_override(struct decode_cache *c, int seg)
622{ 490{
623 c->has_seg_override = true; 491 c->has_seg_override = true;
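desc_limit_scaled() above expands a descriptor's 20-bit limit by the granularity bit: with G set, the limit is in 4 KiB units, so a raw limit of 0xfffff scales to 0xffffffff. A quick numeric check, with field names assumed to match asm/desc_defs.h of this era:

static void __maybe_unused example_limit_check(void)
{
        struct desc_struct d = { };

        d.limit0 = 0xffff;      /* limit bits 15:0  */
        d.limit  = 0xf;         /* limit bits 19:16 */
        d.g      = 1;           /* granularity: 4 KiB units */

        BUG_ON(desc_limit_scaled(&d) != 0xffffffffU);
}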
@@ -630,60 +498,177 @@ static unsigned long seg_base(struct x86_emulate_ctxt *ctxt,
630 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS) 498 if (ctxt->mode == X86EMUL_MODE_PROT64 && seg < VCPU_SREG_FS)
631 return 0; 499 return 0;
632 500
633 return ops->get_cached_segment_base(seg, ctxt->vcpu); 501 return ops->get_cached_segment_base(ctxt, seg);
634} 502}
635 503
636static unsigned long seg_override_base(struct x86_emulate_ctxt *ctxt, 504static unsigned seg_override(struct x86_emulate_ctxt *ctxt,
637 struct x86_emulate_ops *ops, 505 struct decode_cache *c)
638 struct decode_cache *c)
639{ 506{
640 if (!c->has_seg_override) 507 if (!c->has_seg_override)
641 return 0; 508 return 0;
642 509
643 return seg_base(ctxt, ops, c->seg_override); 510 return c->seg_override;
644} 511}
645 512
646static unsigned long es_base(struct x86_emulate_ctxt *ctxt, 513static int emulate_exception(struct x86_emulate_ctxt *ctxt, int vec,
647 struct x86_emulate_ops *ops) 514 u32 error, bool valid)
648{ 515{
649 return seg_base(ctxt, ops, VCPU_SREG_ES); 516 ctxt->exception.vector = vec;
517 ctxt->exception.error_code = error;
518 ctxt->exception.error_code_valid = valid;
519 return X86EMUL_PROPAGATE_FAULT;
650} 520}
651 521
652static unsigned long ss_base(struct x86_emulate_ctxt *ctxt, 522static int emulate_db(struct x86_emulate_ctxt *ctxt)
653 struct x86_emulate_ops *ops) 523{
524 return emulate_exception(ctxt, DB_VECTOR, 0, false);
525}
526
527static int emulate_gp(struct x86_emulate_ctxt *ctxt, int err)
528{
529 return emulate_exception(ctxt, GP_VECTOR, err, true);
530}
531
532static int emulate_ss(struct x86_emulate_ctxt *ctxt, int err)
533{
534 return emulate_exception(ctxt, SS_VECTOR, err, true);
535}
536
537static int emulate_ud(struct x86_emulate_ctxt *ctxt)
538{
539 return emulate_exception(ctxt, UD_VECTOR, 0, false);
540}
541
542static int emulate_ts(struct x86_emulate_ctxt *ctxt, int err)
543{
544 return emulate_exception(ctxt, TS_VECTOR, err, true);
545}
546
547static int emulate_de(struct x86_emulate_ctxt *ctxt)
654{ 548{
655 return seg_base(ctxt, ops, VCPU_SREG_SS); 549 return emulate_exception(ctxt, DE_VECTOR, 0, false);
656} 550}
657 551
658static void emulate_exception(struct x86_emulate_ctxt *ctxt, int vec, 552static int emulate_nm(struct x86_emulate_ctxt *ctxt)
659 u32 error, bool valid)
660{ 553{
661 ctxt->exception = vec; 554 return emulate_exception(ctxt, NM_VECTOR, 0, false);
662 ctxt->error_code = error;
663 ctxt->error_code_valid = valid;
664 ctxt->restart = false;
665} 555}
666 556
667static void emulate_gp(struct x86_emulate_ctxt *ctxt, int err) 557static u16 get_segment_selector(struct x86_emulate_ctxt *ctxt, unsigned seg)
668{ 558{
669 emulate_exception(ctxt, GP_VECTOR, err, true); 559 u16 selector;
560 struct desc_struct desc;
561
562 ctxt->ops->get_segment(ctxt, &selector, &desc, NULL, seg);
563 return selector;
670} 564}
671 565
672static void emulate_pf(struct x86_emulate_ctxt *ctxt, unsigned long addr, 566static void set_segment_selector(struct x86_emulate_ctxt *ctxt, u16 selector,
673 int err) 567 unsigned seg)
674{ 568{
675 ctxt->cr2 = addr; 569 u16 dummy;
676 emulate_exception(ctxt, PF_VECTOR, err, true); 570 u32 base3;
571 struct desc_struct desc;
572
573 ctxt->ops->get_segment(ctxt, &dummy, &desc, &base3, seg);
574 ctxt->ops->set_segment(ctxt, selector, &desc, base3, seg);
677} 575}
678 576
679static void emulate_ud(struct x86_emulate_ctxt *ctxt) 577static int __linearize(struct x86_emulate_ctxt *ctxt,
578 struct segmented_address addr,
579 unsigned size, bool write, bool fetch,
580 ulong *linear)
680{ 581{
681 emulate_exception(ctxt, UD_VECTOR, 0, false); 582 struct decode_cache *c = &ctxt->decode;
583 struct desc_struct desc;
584 bool usable;
585 ulong la;
586 u32 lim;
587 u16 sel;
588 unsigned cpl, rpl;
589
590 la = seg_base(ctxt, ctxt->ops, addr.seg) + addr.ea;
591 switch (ctxt->mode) {
592 case X86EMUL_MODE_REAL:
593 break;
594 case X86EMUL_MODE_PROT64:
595 if (((signed long)la << 16) >> 16 != la)
596 return emulate_gp(ctxt, 0);
597 break;
598 default:
599 usable = ctxt->ops->get_segment(ctxt, &sel, &desc, NULL,
600 addr.seg);
601 if (!usable)
602 goto bad;
603 /* code segment or read-only data segment */
604 if (((desc.type & 8) || !(desc.type & 2)) && write)
605 goto bad;
606 /* unreadable code segment */
607 if (!fetch && (desc.type & 8) && !(desc.type & 2))
608 goto bad;
609 lim = desc_limit_scaled(&desc);
610 if ((desc.type & 8) || !(desc.type & 4)) {
611 /* expand-up segment */
612 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
613 goto bad;
614 } else {
 615 /* expand-down segment */
616 if (addr.ea <= lim || (u32)(addr.ea + size - 1) <= lim)
617 goto bad;
618 lim = desc.d ? 0xffffffff : 0xffff;
619 if (addr.ea > lim || (u32)(addr.ea + size - 1) > lim)
620 goto bad;
621 }
622 cpl = ctxt->ops->cpl(ctxt);
623 rpl = sel & 3;
624 cpl = max(cpl, rpl);
625 if (!(desc.type & 8)) {
626 /* data segment */
627 if (cpl > desc.dpl)
628 goto bad;
629 } else if ((desc.type & 8) && !(desc.type & 4)) {
630 /* nonconforming code segment */
631 if (cpl != desc.dpl)
632 goto bad;
633 } else if ((desc.type & 8) && (desc.type & 4)) {
634 /* conforming code segment */
635 if (cpl < desc.dpl)
636 goto bad;
637 }
638 break;
639 }
640 if (fetch ? ctxt->mode != X86EMUL_MODE_PROT64 : c->ad_bytes != 8)
641 la &= (u32)-1;
642 *linear = la;
643 return X86EMUL_CONTINUE;
644bad:
645 if (addr.seg == VCPU_SREG_SS)
646 return emulate_ss(ctxt, addr.seg);
647 else
648 return emulate_gp(ctxt, addr.seg);
682} 649}
683 650
684static void emulate_ts(struct x86_emulate_ctxt *ctxt, int err) 651static int linearize(struct x86_emulate_ctxt *ctxt,
652 struct segmented_address addr,
653 unsigned size, bool write,
654 ulong *linear)
685{ 655{
686 emulate_exception(ctxt, TS_VECTOR, err, true); 656 return __linearize(ctxt, addr, size, write, false, linear);
657}
658
659
660static int segmented_read_std(struct x86_emulate_ctxt *ctxt,
661 struct segmented_address addr,
662 void *data,
663 unsigned size)
664{
665 int rc;
666 ulong linear;
667
668 rc = linearize(ctxt, addr, size, false, &linear);
669 if (rc != X86EMUL_CONTINUE)
670 return rc;
671 return ctxt->ops->read_std(ctxt, linear, data, size, &ctxt->exception);
687} 672}
688 673
689static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt, 674static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
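For 64-bit mode, __linearize() above enforces that the computed linear address is canonical: bits 63:48 must all equal bit 47 (48-bit virtual addresses), which is what the shift-left-then-arithmetic-shift-right-by-16 test checks. The same test in standalone form, for illustration:

static bool example_is_canonical(unsigned long la)
{
        /* sign-extend from bit 47 and compare with the original */
        return ((signed long)la << 16) >> 16 == la;
}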
@@ -695,10 +680,15 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
695 int size, cur_size; 680 int size, cur_size;
696 681
697 if (eip == fc->end) { 682 if (eip == fc->end) {
683 unsigned long linear;
684 struct segmented_address addr = { .seg=VCPU_SREG_CS, .ea=eip};
698 cur_size = fc->end - fc->start; 685 cur_size = fc->end - fc->start;
699 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip)); 686 size = min(15UL - cur_size, PAGE_SIZE - offset_in_page(eip));
700 rc = ops->fetch(ctxt->cs_base + eip, fc->data + cur_size, 687 rc = __linearize(ctxt, addr, size, false, true, &linear);
701 size, ctxt->vcpu, NULL); 688 if (rc != X86EMUL_CONTINUE)
689 return rc;
690 rc = ops->fetch(ctxt, linear, fc->data + cur_size,
691 size, &ctxt->exception);
702 if (rc != X86EMUL_CONTINUE) 692 if (rc != X86EMUL_CONTINUE)
703 return rc; 693 return rc;
704 fc->end += size; 694 fc->end += size;
@@ -741,8 +731,7 @@ static void *decode_register(u8 modrm_reg, unsigned long *regs,
741} 731}
742 732
743static int read_descriptor(struct x86_emulate_ctxt *ctxt, 733static int read_descriptor(struct x86_emulate_ctxt *ctxt,
744 struct x86_emulate_ops *ops, 734 struct segmented_address addr,
745 void *ptr,
746 u16 *size, unsigned long *address, int op_bytes) 735 u16 *size, unsigned long *address, int op_bytes)
747{ 736{
748 int rc; 737 int rc;
@@ -750,12 +739,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
750 if (op_bytes == 2) 739 if (op_bytes == 2)
751 op_bytes = 3; 740 op_bytes = 3;
752 *address = 0; 741 *address = 0;
753 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 742 rc = segmented_read_std(ctxt, addr, size, 2);
754 ctxt->vcpu, NULL);
755 if (rc != X86EMUL_CONTINUE) 743 if (rc != X86EMUL_CONTINUE)
756 return rc; 744 return rc;
757 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 745 addr.ea += 2;
758 ctxt->vcpu, NULL); 746 rc = segmented_read_std(ctxt, addr, address, op_bytes);
759 return rc; 747 return rc;
760} 748}
761 749
@@ -794,7 +782,81 @@ static int test_cc(unsigned int condition, unsigned int flags)
794 return (!!rc ^ (condition & 1)); 782 return (!!rc ^ (condition & 1));
795} 783}
796 784
797static void decode_register_operand(struct operand *op, 785static void fetch_register_operand(struct operand *op)
786{
787 switch (op->bytes) {
788 case 1:
789 op->val = *(u8 *)op->addr.reg;
790 break;
791 case 2:
792 op->val = *(u16 *)op->addr.reg;
793 break;
794 case 4:
795 op->val = *(u32 *)op->addr.reg;
796 break;
797 case 8:
798 op->val = *(u64 *)op->addr.reg;
799 break;
800 }
801}
802
803static void read_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data, int reg)
804{
805 ctxt->ops->get_fpu(ctxt);
806 switch (reg) {
807 case 0: asm("movdqu %%xmm0, %0" : "=m"(*data)); break;
808 case 1: asm("movdqu %%xmm1, %0" : "=m"(*data)); break;
809 case 2: asm("movdqu %%xmm2, %0" : "=m"(*data)); break;
810 case 3: asm("movdqu %%xmm3, %0" : "=m"(*data)); break;
811 case 4: asm("movdqu %%xmm4, %0" : "=m"(*data)); break;
812 case 5: asm("movdqu %%xmm5, %0" : "=m"(*data)); break;
813 case 6: asm("movdqu %%xmm6, %0" : "=m"(*data)); break;
814 case 7: asm("movdqu %%xmm7, %0" : "=m"(*data)); break;
815#ifdef CONFIG_X86_64
816 case 8: asm("movdqu %%xmm8, %0" : "=m"(*data)); break;
817 case 9: asm("movdqu %%xmm9, %0" : "=m"(*data)); break;
818 case 10: asm("movdqu %%xmm10, %0" : "=m"(*data)); break;
819 case 11: asm("movdqu %%xmm11, %0" : "=m"(*data)); break;
820 case 12: asm("movdqu %%xmm12, %0" : "=m"(*data)); break;
821 case 13: asm("movdqu %%xmm13, %0" : "=m"(*data)); break;
822 case 14: asm("movdqu %%xmm14, %0" : "=m"(*data)); break;
823 case 15: asm("movdqu %%xmm15, %0" : "=m"(*data)); break;
824#endif
825 default: BUG();
826 }
827 ctxt->ops->put_fpu(ctxt);
828}
829
830static void write_sse_reg(struct x86_emulate_ctxt *ctxt, sse128_t *data,
831 int reg)
832{
833 ctxt->ops->get_fpu(ctxt);
834 switch (reg) {
835 case 0: asm("movdqu %0, %%xmm0" : : "m"(*data)); break;
836 case 1: asm("movdqu %0, %%xmm1" : : "m"(*data)); break;
837 case 2: asm("movdqu %0, %%xmm2" : : "m"(*data)); break;
838 case 3: asm("movdqu %0, %%xmm3" : : "m"(*data)); break;
839 case 4: asm("movdqu %0, %%xmm4" : : "m"(*data)); break;
840 case 5: asm("movdqu %0, %%xmm5" : : "m"(*data)); break;
841 case 6: asm("movdqu %0, %%xmm6" : : "m"(*data)); break;
842 case 7: asm("movdqu %0, %%xmm7" : : "m"(*data)); break;
843#ifdef CONFIG_X86_64
844 case 8: asm("movdqu %0, %%xmm8" : : "m"(*data)); break;
845 case 9: asm("movdqu %0, %%xmm9" : : "m"(*data)); break;
846 case 10: asm("movdqu %0, %%xmm10" : : "m"(*data)); break;
847 case 11: asm("movdqu %0, %%xmm11" : : "m"(*data)); break;
848 case 12: asm("movdqu %0, %%xmm12" : : "m"(*data)); break;
849 case 13: asm("movdqu %0, %%xmm13" : : "m"(*data)); break;
850 case 14: asm("movdqu %0, %%xmm14" : : "m"(*data)); break;
851 case 15: asm("movdqu %0, %%xmm15" : : "m"(*data)); break;
852#endif
853 default: BUG();
854 }
855 ctxt->ops->put_fpu(ctxt);
856}
857
858static void decode_register_operand(struct x86_emulate_ctxt *ctxt,
859 struct operand *op,
798 struct decode_cache *c, 860 struct decode_cache *c,
799 int inhibit_bytereg) 861 int inhibit_bytereg)
800{ 862{
@@ -803,36 +865,36 @@ static void decode_register_operand(struct operand *op,
803 865
804 if (!(c->d & ModRM)) 866 if (!(c->d & ModRM))
805 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3); 867 reg = (c->b & 7) | ((c->rex_prefix & 1) << 3);
868
869 if (c->d & Sse) {
870 op->type = OP_XMM;
871 op->bytes = 16;
872 op->addr.xmm = reg;
873 read_sse_reg(ctxt, &op->vec_val, reg);
874 return;
875 }
876
806 op->type = OP_REG; 877 op->type = OP_REG;
807 if ((c->d & ByteOp) && !inhibit_bytereg) { 878 if ((c->d & ByteOp) && !inhibit_bytereg) {
808 op->ptr = decode_register(reg, c->regs, highbyte_regs); 879 op->addr.reg = decode_register(reg, c->regs, highbyte_regs);
809 op->val = *(u8 *)op->ptr;
810 op->bytes = 1; 880 op->bytes = 1;
811 } else { 881 } else {
812 op->ptr = decode_register(reg, c->regs, 0); 882 op->addr.reg = decode_register(reg, c->regs, 0);
813 op->bytes = c->op_bytes; 883 op->bytes = c->op_bytes;
814 switch (op->bytes) {
815 case 2:
816 op->val = *(u16 *)op->ptr;
817 break;
818 case 4:
819 op->val = *(u32 *)op->ptr;
820 break;
821 case 8:
822 op->val = *(u64 *) op->ptr;
823 break;
824 }
825 } 884 }
885 fetch_register_operand(op);
826 op->orig_val = op->val; 886 op->orig_val = op->val;
827} 887}
828 888
829static int decode_modrm(struct x86_emulate_ctxt *ctxt, 889static int decode_modrm(struct x86_emulate_ctxt *ctxt,
830 struct x86_emulate_ops *ops) 890 struct x86_emulate_ops *ops,
891 struct operand *op)
831{ 892{
832 struct decode_cache *c = &ctxt->decode; 893 struct decode_cache *c = &ctxt->decode;
833 u8 sib; 894 u8 sib;
834 int index_reg = 0, base_reg = 0, scale; 895 int index_reg = 0, base_reg = 0, scale;
835 int rc = X86EMUL_CONTINUE; 896 int rc = X86EMUL_CONTINUE;
897 ulong modrm_ea = 0;
836 898
837 if (c->rex_prefix) { 899 if (c->rex_prefix) {
838 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */ 900 c->modrm_reg = (c->rex_prefix & 4) << 1; /* REX.R */
@@ -844,16 +906,26 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
844 c->modrm_mod |= (c->modrm & 0xc0) >> 6; 906 c->modrm_mod |= (c->modrm & 0xc0) >> 6;
845 c->modrm_reg |= (c->modrm & 0x38) >> 3; 907 c->modrm_reg |= (c->modrm & 0x38) >> 3;
846 c->modrm_rm |= (c->modrm & 0x07); 908 c->modrm_rm |= (c->modrm & 0x07);
847 c->modrm_ea = 0; 909 c->modrm_seg = VCPU_SREG_DS;
848 c->use_modrm_ea = 1;
849 910
850 if (c->modrm_mod == 3) { 911 if (c->modrm_mod == 3) {
851 c->modrm_ptr = decode_register(c->modrm_rm, 912 op->type = OP_REG;
913 op->bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
914 op->addr.reg = decode_register(c->modrm_rm,
852 c->regs, c->d & ByteOp); 915 c->regs, c->d & ByteOp);
853 c->modrm_val = *(unsigned long *)c->modrm_ptr; 916 if (c->d & Sse) {
917 op->type = OP_XMM;
918 op->bytes = 16;
919 op->addr.xmm = c->modrm_rm;
920 read_sse_reg(ctxt, &op->vec_val, c->modrm_rm);
921 return rc;
922 }
923 fetch_register_operand(op);
854 return rc; 924 return rc;
855 } 925 }
856 926
927 op->type = OP_MEM;
928
857 if (c->ad_bytes == 2) { 929 if (c->ad_bytes == 2) {
858 unsigned bx = c->regs[VCPU_REGS_RBX]; 930 unsigned bx = c->regs[VCPU_REGS_RBX];
859 unsigned bp = c->regs[VCPU_REGS_RBP]; 931 unsigned bp = c->regs[VCPU_REGS_RBP];
@@ -864,47 +936,46 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
864 switch (c->modrm_mod) { 936 switch (c->modrm_mod) {
865 case 0: 937 case 0:
866 if (c->modrm_rm == 6) 938 if (c->modrm_rm == 6)
867 c->modrm_ea += insn_fetch(u16, 2, c->eip); 939 modrm_ea += insn_fetch(u16, 2, c->eip);
868 break; 940 break;
869 case 1: 941 case 1:
870 c->modrm_ea += insn_fetch(s8, 1, c->eip); 942 modrm_ea += insn_fetch(s8, 1, c->eip);
871 break; 943 break;
872 case 2: 944 case 2:
873 c->modrm_ea += insn_fetch(u16, 2, c->eip); 945 modrm_ea += insn_fetch(u16, 2, c->eip);
874 break; 946 break;
875 } 947 }
876 switch (c->modrm_rm) { 948 switch (c->modrm_rm) {
877 case 0: 949 case 0:
878 c->modrm_ea += bx + si; 950 modrm_ea += bx + si;
879 break; 951 break;
880 case 1: 952 case 1:
881 c->modrm_ea += bx + di; 953 modrm_ea += bx + di;
882 break; 954 break;
883 case 2: 955 case 2:
884 c->modrm_ea += bp + si; 956 modrm_ea += bp + si;
885 break; 957 break;
886 case 3: 958 case 3:
887 c->modrm_ea += bp + di; 959 modrm_ea += bp + di;
888 break; 960 break;
889 case 4: 961 case 4:
890 c->modrm_ea += si; 962 modrm_ea += si;
891 break; 963 break;
892 case 5: 964 case 5:
893 c->modrm_ea += di; 965 modrm_ea += di;
894 break; 966 break;
895 case 6: 967 case 6:
896 if (c->modrm_mod != 0) 968 if (c->modrm_mod != 0)
897 c->modrm_ea += bp; 969 modrm_ea += bp;
898 break; 970 break;
899 case 7: 971 case 7:
900 c->modrm_ea += bx; 972 modrm_ea += bx;
901 break; 973 break;
902 } 974 }
903 if (c->modrm_rm == 2 || c->modrm_rm == 3 || 975 if (c->modrm_rm == 2 || c->modrm_rm == 3 ||
904 (c->modrm_rm == 6 && c->modrm_mod != 0)) 976 (c->modrm_rm == 6 && c->modrm_mod != 0))
905 if (!c->has_seg_override) 977 c->modrm_seg = VCPU_SREG_SS;
906 set_seg_override(c, VCPU_SREG_SS); 978 modrm_ea = (u16)modrm_ea;
907 c->modrm_ea = (u16)c->modrm_ea;
908 } else { 979 } else {
909 /* 32/64-bit ModR/M decode. */ 980 /* 32/64-bit ModR/M decode. */
910 if ((c->modrm_rm & 7) == 4) { 981 if ((c->modrm_rm & 7) == 4) {
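The 16-bit ModRM decode above sums the classic base/index pairs (BX+SI, BX+DI, BP+SI, BP+DI, ...) with the displacement, truncates to 16 bits, and defaults the segment to SS whenever BP is involved (rm 2, 3, and 6 with mod != 0). A tiny worked example of one form, mod=1 rm=2, i.e. [BP+SI+disp8]; the helper is illustrative only.

static u16 example_ea_bp_si(u16 bp, u16 si, s8 disp8)
{
        /* wraps modulo 64 KiB, matching modrm_ea = (u16)modrm_ea above */
        return (u16)(bp + si + disp8);
}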
@@ -914,410 +985,74 @@ static int decode_modrm(struct x86_emulate_ctxt *ctxt,
914 scale = sib >> 6; 985 scale = sib >> 6;
915 986
916 if ((base_reg & 7) == 5 && c->modrm_mod == 0) 987 if ((base_reg & 7) == 5 && c->modrm_mod == 0)
917 c->modrm_ea += insn_fetch(s32, 4, c->eip); 988 modrm_ea += insn_fetch(s32, 4, c->eip);
918 else 989 else
919 c->modrm_ea += c->regs[base_reg]; 990 modrm_ea += c->regs[base_reg];
920 if (index_reg != 4) 991 if (index_reg != 4)
921 c->modrm_ea += c->regs[index_reg] << scale; 992 modrm_ea += c->regs[index_reg] << scale;
922 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) { 993 } else if ((c->modrm_rm & 7) == 5 && c->modrm_mod == 0) {
923 if (ctxt->mode == X86EMUL_MODE_PROT64) 994 if (ctxt->mode == X86EMUL_MODE_PROT64)
924 c->rip_relative = 1; 995 c->rip_relative = 1;
925 } else 996 } else
926 c->modrm_ea += c->regs[c->modrm_rm]; 997 modrm_ea += c->regs[c->modrm_rm];
927 switch (c->modrm_mod) { 998 switch (c->modrm_mod) {
928 case 0: 999 case 0:
929 if (c->modrm_rm == 5) 1000 if (c->modrm_rm == 5)
930 c->modrm_ea += insn_fetch(s32, 4, c->eip); 1001 modrm_ea += insn_fetch(s32, 4, c->eip);
931 break; 1002 break;
932 case 1: 1003 case 1:
933 c->modrm_ea += insn_fetch(s8, 1, c->eip); 1004 modrm_ea += insn_fetch(s8, 1, c->eip);
934 break; 1005 break;
935 case 2: 1006 case 2:
936 c->modrm_ea += insn_fetch(s32, 4, c->eip); 1007 modrm_ea += insn_fetch(s32, 4, c->eip);
937 break; 1008 break;
938 } 1009 }
939 } 1010 }
1011 op->addr.mem.ea = modrm_ea;
940done: 1012done:
941 return rc; 1013 return rc;
942} 1014}
943 1015
944static int decode_abs(struct x86_emulate_ctxt *ctxt, 1016static int decode_abs(struct x86_emulate_ctxt *ctxt,
945 struct x86_emulate_ops *ops) 1017 struct x86_emulate_ops *ops,
1018 struct operand *op)
946{ 1019{
947 struct decode_cache *c = &ctxt->decode; 1020 struct decode_cache *c = &ctxt->decode;
948 int rc = X86EMUL_CONTINUE; 1021 int rc = X86EMUL_CONTINUE;
949 1022
1023 op->type = OP_MEM;
950 switch (c->ad_bytes) { 1024 switch (c->ad_bytes) {
951 case 2: 1025 case 2:
952 c->modrm_ea = insn_fetch(u16, 2, c->eip); 1026 op->addr.mem.ea = insn_fetch(u16, 2, c->eip);
953 break; 1027 break;
954 case 4: 1028 case 4:
955 c->modrm_ea = insn_fetch(u32, 4, c->eip); 1029 op->addr.mem.ea = insn_fetch(u32, 4, c->eip);
956 break; 1030 break;
957 case 8: 1031 case 8:
958 c->modrm_ea = insn_fetch(u64, 8, c->eip); 1032 op->addr.mem.ea = insn_fetch(u64, 8, c->eip);
959 break; 1033 break;
960 } 1034 }
961done: 1035done:
962 return rc; 1036 return rc;
963} 1037}
964 1038
965int 1039static void fetch_bit_operand(struct decode_cache *c)
966x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
967{ 1040{
968 struct decode_cache *c = &ctxt->decode; 1041 long sv = 0, mask;
969 int rc = X86EMUL_CONTINUE;
970 int mode = ctxt->mode;
971 int def_op_bytes, def_ad_bytes, group;
972
973
974 /* we cannot decode insn before we complete previous rep insn */
975 WARN_ON(ctxt->restart);
976
977 c->eip = ctxt->eip;
978 c->fetch.start = c->fetch.end = c->eip;
979 ctxt->cs_base = seg_base(ctxt, ops, VCPU_SREG_CS);
980
981 switch (mode) {
982 case X86EMUL_MODE_REAL:
983 case X86EMUL_MODE_VM86:
984 case X86EMUL_MODE_PROT16:
985 def_op_bytes = def_ad_bytes = 2;
986 break;
987 case X86EMUL_MODE_PROT32:
988 def_op_bytes = def_ad_bytes = 4;
989 break;
990#ifdef CONFIG_X86_64
991 case X86EMUL_MODE_PROT64:
992 def_op_bytes = 4;
993 def_ad_bytes = 8;
994 break;
995#endif
996 default:
997 return -1;
998 }
999
1000 c->op_bytes = def_op_bytes;
1001 c->ad_bytes = def_ad_bytes;
1002
1003 /* Legacy prefixes. */
1004 for (;;) {
1005 switch (c->b = insn_fetch(u8, 1, c->eip)) {
1006 case 0x66: /* operand-size override */
1007 /* switch between 2/4 bytes */
1008 c->op_bytes = def_op_bytes ^ 6;
1009 break;
1010 case 0x67: /* address-size override */
1011 if (mode == X86EMUL_MODE_PROT64)
1012 /* switch between 4/8 bytes */
1013 c->ad_bytes = def_ad_bytes ^ 12;
1014 else
1015 /* switch between 2/4 bytes */
1016 c->ad_bytes = def_ad_bytes ^ 6;
1017 break;
1018 case 0x26: /* ES override */
1019 case 0x2e: /* CS override */
1020 case 0x36: /* SS override */
1021 case 0x3e: /* DS override */
1022 set_seg_override(c, (c->b >> 3) & 3);
1023 break;
1024 case 0x64: /* FS override */
1025 case 0x65: /* GS override */
1026 set_seg_override(c, c->b & 7);
1027 break;
1028 case 0x40 ... 0x4f: /* REX */
1029 if (mode != X86EMUL_MODE_PROT64)
1030 goto done_prefixes;
1031 c->rex_prefix = c->b;
1032 continue;
1033 case 0xf0: /* LOCK */
1034 c->lock_prefix = 1;
1035 break;
1036 case 0xf2: /* REPNE/REPNZ */
1037 c->rep_prefix = REPNE_PREFIX;
1038 break;
1039 case 0xf3: /* REP/REPE/REPZ */
1040 c->rep_prefix = REPE_PREFIX;
1041 break;
1042 default:
1043 goto done_prefixes;
1044 }
1045
1046 /* Any legacy prefix after a REX prefix nullifies its effect. */
1047 1042
1048 c->rex_prefix = 0; 1043 if (c->dst.type == OP_MEM && c->src.type == OP_REG) {
1049 } 1044 mask = ~(c->dst.bytes * 8 - 1);
1050
1051done_prefixes:
1052
1053 /* REX prefix. */
1054 if (c->rex_prefix)
1055 if (c->rex_prefix & 8)
1056 c->op_bytes = 8; /* REX.W */
1057 1045
1058 /* Opcode byte(s). */ 1046 if (c->src.bytes == 2)
1059 c->d = opcode_table[c->b]; 1047 sv = (s16)c->src.val & (s16)mask;
1060 if (c->d == 0) { 1048 else if (c->src.bytes == 4)
1061 /* Two-byte opcode? */ 1049 sv = (s32)c->src.val & (s32)mask;
1062 if (c->b == 0x0f) {
1063 c->twobyte = 1;
1064 c->b = insn_fetch(u8, 1, c->eip);
1065 c->d = twobyte_table[c->b];
1066 }
1067 }
1068
1069 if (c->d & Group) {
1070 group = c->d & GroupMask;
1071 c->modrm = insn_fetch(u8, 1, c->eip);
1072 --c->eip;
1073
1074 group = (group << 3) + ((c->modrm >> 3) & 7);
1075 if ((c->d & GroupDual) && (c->modrm >> 6) == 3)
1076 c->d = group2_table[group];
1077 else
1078 c->d = group_table[group];
1079 }
1080 1050
1081 /* Unrecognised? */ 1051 c->dst.addr.mem.ea += (sv >> 3);
1082 if (c->d == 0) {
1083 DPRINTF("Cannot emulate %02x\n", c->b);
1084 return -1;
1085 } 1052 }
1086 1053
1087 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack)) 1054 /* only subword offset */
1088 c->op_bytes = 8; 1055 c->src.val &= (c->dst.bytes << 3) - 1;
1089
1090 /* ModRM and SIB bytes. */
1091 if (c->d & ModRM)
1092 rc = decode_modrm(ctxt, ops);
1093 else if (c->d & MemAbs)
1094 rc = decode_abs(ctxt, ops);
1095 if (rc != X86EMUL_CONTINUE)
1096 goto done;
1097
1098 if (!c->has_seg_override)
1099 set_seg_override(c, VCPU_SREG_DS);
1100
1101 if (!(!c->twobyte && c->b == 0x8d))
1102 c->modrm_ea += seg_override_base(ctxt, ops, c);
1103
1104 if (c->ad_bytes != 8)
1105 c->modrm_ea = (u32)c->modrm_ea;
1106
1107 if (c->rip_relative)
1108 c->modrm_ea += c->eip;
1109
1110 /*
1111 * Decode and fetch the source operand: register, memory
1112 * or immediate.
1113 */
1114 switch (c->d & SrcMask) {
1115 case SrcNone:
1116 break;
1117 case SrcReg:
1118 decode_register_operand(&c->src, c, 0);
1119 break;
1120 case SrcMem16:
1121 c->src.bytes = 2;
1122 goto srcmem_common;
1123 case SrcMem32:
1124 c->src.bytes = 4;
1125 goto srcmem_common;
1126 case SrcMem:
1127 c->src.bytes = (c->d & ByteOp) ? 1 :
1128 c->op_bytes;
1129 /* Don't fetch the address for invlpg: it could be unmapped. */
1130 if (c->twobyte && c->b == 0x01 && c->modrm_reg == 7)
1131 break;
1132 srcmem_common:
1133 /*
1134 * For instructions with a ModR/M byte, switch to register
1135 * access if Mod = 3.
1136 */
1137 if ((c->d & ModRM) && c->modrm_mod == 3) {
1138 c->src.type = OP_REG;
1139 c->src.val = c->modrm_val;
1140 c->src.ptr = c->modrm_ptr;
1141 break;
1142 }
1143 c->src.type = OP_MEM;
1144 c->src.ptr = (unsigned long *)c->modrm_ea;
1145 c->src.val = 0;
1146 break;
1147 case SrcImm:
1148 case SrcImmU:
1149 c->src.type = OP_IMM;
1150 c->src.ptr = (unsigned long *)c->eip;
1151 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1152 if (c->src.bytes == 8)
1153 c->src.bytes = 4;
1154 /* NB. Immediates are sign-extended as necessary. */
1155 switch (c->src.bytes) {
1156 case 1:
1157 c->src.val = insn_fetch(s8, 1, c->eip);
1158 break;
1159 case 2:
1160 c->src.val = insn_fetch(s16, 2, c->eip);
1161 break;
1162 case 4:
1163 c->src.val = insn_fetch(s32, 4, c->eip);
1164 break;
1165 }
1166 if ((c->d & SrcMask) == SrcImmU) {
1167 switch (c->src.bytes) {
1168 case 1:
1169 c->src.val &= 0xff;
1170 break;
1171 case 2:
1172 c->src.val &= 0xffff;
1173 break;
1174 case 4:
1175 c->src.val &= 0xffffffff;
1176 break;
1177 }
1178 }
1179 break;
1180 case SrcImmByte:
1181 case SrcImmUByte:
1182 c->src.type = OP_IMM;
1183 c->src.ptr = (unsigned long *)c->eip;
1184 c->src.bytes = 1;
1185 if ((c->d & SrcMask) == SrcImmByte)
1186 c->src.val = insn_fetch(s8, 1, c->eip);
1187 else
1188 c->src.val = insn_fetch(u8, 1, c->eip);
1189 break;
1190 case SrcAcc:
1191 c->src.type = OP_REG;
1192 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1193 c->src.ptr = &c->regs[VCPU_REGS_RAX];
1194 switch (c->src.bytes) {
1195 case 1:
1196 c->src.val = *(u8 *)c->src.ptr;
1197 break;
1198 case 2:
1199 c->src.val = *(u16 *)c->src.ptr;
1200 break;
1201 case 4:
1202 c->src.val = *(u32 *)c->src.ptr;
1203 break;
1204 case 8:
1205 c->src.val = *(u64 *)c->src.ptr;
1206 break;
1207 }
1208 break;
1209 case SrcOne:
1210 c->src.bytes = 1;
1211 c->src.val = 1;
1212 break;
1213 case SrcSI:
1214 c->src.type = OP_MEM;
1215 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1216 c->src.ptr = (unsigned long *)
1217 register_address(c, seg_override_base(ctxt, ops, c),
1218 c->regs[VCPU_REGS_RSI]);
1219 c->src.val = 0;
1220 break;
1221 case SrcImmFAddr:
1222 c->src.type = OP_IMM;
1223 c->src.ptr = (unsigned long *)c->eip;
1224 c->src.bytes = c->op_bytes + 2;
1225 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
1226 break;
1227 case SrcMemFAddr:
1228 c->src.type = OP_MEM;
1229 c->src.ptr = (unsigned long *)c->modrm_ea;
1230 c->src.bytes = c->op_bytes + 2;
1231 break;
1232 }
1233
1234 /*
1235 * Decode and fetch the second source operand: register, memory
1236 * or immediate.
1237 */
1238 switch (c->d & Src2Mask) {
1239 case Src2None:
1240 break;
1241 case Src2CL:
1242 c->src2.bytes = 1;
1243 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
1244 break;
1245 case Src2ImmByte:
1246 c->src2.type = OP_IMM;
1247 c->src2.ptr = (unsigned long *)c->eip;
1248 c->src2.bytes = 1;
1249 c->src2.val = insn_fetch(u8, 1, c->eip);
1250 break;
1251 case Src2One:
1252 c->src2.bytes = 1;
1253 c->src2.val = 1;
1254 break;
1255 }
1256
1257 /* Decode and fetch the destination operand: register or memory. */
1258 switch (c->d & DstMask) {
1259 case ImplicitOps:
1260 /* Special instructions do their own operand decoding. */
1261 return 0;
1262 case DstReg:
1263 decode_register_operand(&c->dst, c,
1264 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
1265 break;
1266 case DstMem:
1267 case DstMem64:
1268 if ((c->d & ModRM) && c->modrm_mod == 3) {
1269 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1270 c->dst.type = OP_REG;
1271 c->dst.val = c->dst.orig_val = c->modrm_val;
1272 c->dst.ptr = c->modrm_ptr;
1273 break;
1274 }
1275 c->dst.type = OP_MEM;
1276 c->dst.ptr = (unsigned long *)c->modrm_ea;
1277 if ((c->d & DstMask) == DstMem64)
1278 c->dst.bytes = 8;
1279 else
1280 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1281 c->dst.val = 0;
1282 if (c->d & BitOp) {
1283 unsigned long mask = ~(c->dst.bytes * 8 - 1);
1284
1285 c->dst.ptr = (void *)c->dst.ptr +
1286 (c->src.val & mask) / 8;
1287 }
1288 break;
1289 case DstAcc:
1290 c->dst.type = OP_REG;
1291 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1292 c->dst.ptr = &c->regs[VCPU_REGS_RAX];
1293 switch (c->dst.bytes) {
1294 case 1:
1295 c->dst.val = *(u8 *)c->dst.ptr;
1296 break;
1297 case 2:
1298 c->dst.val = *(u16 *)c->dst.ptr;
1299 break;
1300 case 4:
1301 c->dst.val = *(u32 *)c->dst.ptr;
1302 break;
1303 case 8:
1304 c->dst.val = *(u64 *)c->dst.ptr;
1305 break;
1306 }
1307 c->dst.orig_val = c->dst.val;
1308 break;
1309 case DstDI:
1310 c->dst.type = OP_MEM;
1311 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1312 c->dst.ptr = (unsigned long *)
1313 register_address(c, es_base(ctxt, ops),
1314 c->regs[VCPU_REGS_RDI]);
1315 c->dst.val = 0;
1316 break;
1317 }
1318
1319done:
1320 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0;
1321} 1056}
1322 1057
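/*
 * Illustrative sketch, not part of the patch: the arithmetic the new
 * fetch_bit_operand() above applies when a bit-test instruction (BT/BTS/
 * BTR/BTC) has a memory destination and a register bit offset -- the
 * offset is split into a byte displacement added to the effective address
 * and a sub-word bit index.  Plain C, no emulator types; names are local.
 */
#include <stdint.h>
#include <stdio.h>

static void split_bit_offset(int64_t bit_off, unsigned op_bytes,
                             int64_t *byte_disp, unsigned *bit_idx)
{
        int64_t mask = ~(int64_t)(op_bytes * 8 - 1);    /* e.g. ~15 for 16-bit ops */

        *byte_disp = (bit_off & mask) >> 3;             /* whole bytes to skip */
        *bit_idx = (unsigned)(bit_off & (op_bytes * 8 - 1)); /* bit inside the word */
}

int main(void)
{
        int64_t disp;
        unsigned bit;

        split_bit_offset(100, 2, &disp, &bit);  /* e.g. bt word [mem], 100 */
        printf("disp=%lld bytes, bit=%u\n", (long long)disp, bit); /* 12, 4 */
        return 0;
}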
1323static int read_emulated(struct x86_emulate_ctxt *ctxt, 1058static int read_emulated(struct x86_emulate_ctxt *ctxt,
@@ -1326,7 +1061,6 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1326{ 1061{
1327 int rc; 1062 int rc;
1328 struct read_cache *mc = &ctxt->decode.mem_read; 1063 struct read_cache *mc = &ctxt->decode.mem_read;
1329 u32 err;
1330 1064
1331 while (size) { 1065 while (size) {
1332 int n = min(size, 8u); 1066 int n = min(size, 8u);
@@ -1334,10 +1068,8 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1334 if (mc->pos < mc->end) 1068 if (mc->pos < mc->end)
1335 goto read_cached; 1069 goto read_cached;
1336 1070
1337 rc = ops->read_emulated(addr, mc->data + mc->end, n, &err, 1071 rc = ops->read_emulated(ctxt, addr, mc->data + mc->end, n,
1338 ctxt->vcpu); 1072 &ctxt->exception);
1339 if (rc == X86EMUL_PROPAGATE_FAULT)
1340 emulate_pf(ctxt, addr, err);
1341 if (rc != X86EMUL_CONTINUE) 1073 if (rc != X86EMUL_CONTINUE)
1342 return rc; 1074 return rc;
1343 mc->end += n; 1075 mc->end += n;
@@ -1351,6 +1083,50 @@ static int read_emulated(struct x86_emulate_ctxt *ctxt,
1351 return X86EMUL_CONTINUE; 1083 return X86EMUL_CONTINUE;
1352} 1084}
1353 1085
1086static int segmented_read(struct x86_emulate_ctxt *ctxt,
1087 struct segmented_address addr,
1088 void *data,
1089 unsigned size)
1090{
1091 int rc;
1092 ulong linear;
1093
1094 rc = linearize(ctxt, addr, size, false, &linear);
1095 if (rc != X86EMUL_CONTINUE)
1096 return rc;
1097 return read_emulated(ctxt, ctxt->ops, linear, data, size);
1098}
1099
1100static int segmented_write(struct x86_emulate_ctxt *ctxt,
1101 struct segmented_address addr,
1102 const void *data,
1103 unsigned size)
1104{
1105 int rc;
1106 ulong linear;
1107
1108 rc = linearize(ctxt, addr, size, true, &linear);
1109 if (rc != X86EMUL_CONTINUE)
1110 return rc;
1111 return ctxt->ops->write_emulated(ctxt, linear, data, size,
1112 &ctxt->exception);
1113}
1114
1115static int segmented_cmpxchg(struct x86_emulate_ctxt *ctxt,
1116 struct segmented_address addr,
1117 const void *orig_data, const void *data,
1118 unsigned size)
1119{
1120 int rc;
1121 ulong linear;
1122
1123 rc = linearize(ctxt, addr, size, true, &linear);
1124 if (rc != X86EMUL_CONTINUE)
1125 return rc;
1126 return ctxt->ops->cmpxchg_emulated(ctxt, linear, orig_data, data,
1127 size, &ctxt->exception);
1128}
1129
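/*
 * Illustrative sketch, not part of the patch: the shape of the new
 * segmented_read/segmented_write/segmented_cmpxchg helpers -- resolve a
 * (segment, effective address) pair to a linear address once, then pass
 * any fault back through a single exception slot.  The structs and the
 * trivial linearize below are stand-ins, not the emulator's real ones.
 */
#include <stdint.h>
#include <string.h>

struct seg { uint64_t base; uint32_t limit; };
struct exc { int vector; int valid; };

static int linearize_sketch(const struct seg *s, uint64_t ea, unsigned size,
                            uint64_t *linear, struct exc *e)
{
        if (ea + size - 1 > s->limit) {         /* past the limit: record #GP */
                e->vector = 13;
                e->valid = 1;
                return -1;
        }
        *linear = s->base + ea;
        return 0;
}

static int segmented_read_sketch(const struct seg *s, uint64_t ea, void *data,
                                 unsigned size, const uint8_t *mem, struct exc *e)
{
        uint64_t lin;

        if (linearize_sketch(s, ea, size, &lin, e))
                return -1;
        memcpy(data, mem + lin, size);          /* "mem" models guest memory */
        return 0;
}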
1354static int pio_in_emulated(struct x86_emulate_ctxt *ctxt, 1130static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1355 struct x86_emulate_ops *ops, 1131 struct x86_emulate_ops *ops,
1356 unsigned int size, unsigned short port, 1132 unsigned int size, unsigned short port,
@@ -1371,7 +1147,7 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1371 if (n == 0) 1147 if (n == 0)
1372 n = 1; 1148 n = 1;
1373 rc->pos = rc->end = 0; 1149 rc->pos = rc->end = 0;
1374 if (!ops->pio_in_emulated(size, port, rc->data, n, ctxt->vcpu)) 1150 if (!ops->pio_in_emulated(ctxt, size, port, rc->data, n))
1375 return 0; 1151 return 0;
1376 rc->end = n * size; 1152 rc->end = n * size;
1377 } 1153 }
@@ -1381,27 +1157,22 @@ static int pio_in_emulated(struct x86_emulate_ctxt *ctxt,
1381 return 1; 1157 return 1;
1382} 1158}
1383 1159
1384static u32 desc_limit_scaled(struct desc_struct *desc)
1385{
1386 u32 limit = get_desc_limit(desc);
1387
1388 return desc->g ? (limit << 12) | 0xfff : limit;
1389}
1390
1391static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt, 1160static void get_descriptor_table_ptr(struct x86_emulate_ctxt *ctxt,
1392 struct x86_emulate_ops *ops, 1161 struct x86_emulate_ops *ops,
1393 u16 selector, struct desc_ptr *dt) 1162 u16 selector, struct desc_ptr *dt)
1394{ 1163{
1395 if (selector & 1 << 2) { 1164 if (selector & 1 << 2) {
1396 struct desc_struct desc; 1165 struct desc_struct desc;
1166 u16 sel;
1167
1397 memset (dt, 0, sizeof *dt); 1168 memset (dt, 0, sizeof *dt);
1398 if (!ops->get_cached_descriptor(&desc, VCPU_SREG_LDTR, ctxt->vcpu)) 1169 if (!ops->get_segment(ctxt, &sel, &desc, NULL, VCPU_SREG_LDTR))
1399 return; 1170 return;
1400 1171
1401 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */ 1172 dt->size = desc_limit_scaled(&desc); /* what if limit > 65535? */
1402 dt->address = get_desc_base(&desc); 1173 dt->address = get_desc_base(&desc);
1403 } else 1174 } else
1404 ops->get_gdt(dt, ctxt->vcpu); 1175 ops->get_gdt(ctxt, dt);
1405} 1176}
1406 1177
1407/* allowed just for 8 bytes segments */ 1178/* allowed just for 8 bytes segments */
@@ -1412,19 +1183,14 @@ static int read_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1412 struct desc_ptr dt; 1183 struct desc_ptr dt;
1413 u16 index = selector >> 3; 1184 u16 index = selector >> 3;
1414 int ret; 1185 int ret;
1415 u32 err;
1416 ulong addr; 1186 ulong addr;
1417 1187
1418 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1188 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1419 1189
1420 if (dt.size < index * 8 + 7) { 1190 if (dt.size < index * 8 + 7)
1421 emulate_gp(ctxt, selector & 0xfffc); 1191 return emulate_gp(ctxt, selector & 0xfffc);
1422 return X86EMUL_PROPAGATE_FAULT;
1423 }
1424 addr = dt.address + index * 8; 1192 addr = dt.address + index * 8;
1425 ret = ops->read_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1193 ret = ops->read_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1426 if (ret == X86EMUL_PROPAGATE_FAULT)
1427 emulate_pf(ctxt, addr, err);
1428 1194
1429 return ret; 1195 return ret;
1430} 1196}
@@ -1436,25 +1202,21 @@ static int write_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1436{ 1202{
1437 struct desc_ptr dt; 1203 struct desc_ptr dt;
1438 u16 index = selector >> 3; 1204 u16 index = selector >> 3;
1439 u32 err;
1440 ulong addr; 1205 ulong addr;
1441 int ret; 1206 int ret;
1442 1207
1443 get_descriptor_table_ptr(ctxt, ops, selector, &dt); 1208 get_descriptor_table_ptr(ctxt, ops, selector, &dt);
1444 1209
1445 if (dt.size < index * 8 + 7) { 1210 if (dt.size < index * 8 + 7)
1446 emulate_gp(ctxt, selector & 0xfffc); 1211 return emulate_gp(ctxt, selector & 0xfffc);
1447 return X86EMUL_PROPAGATE_FAULT;
1448 }
1449 1212
1450 addr = dt.address + index * 8; 1213 addr = dt.address + index * 8;
1451 ret = ops->write_std(addr, desc, sizeof *desc, ctxt->vcpu, &err); 1214 ret = ops->write_std(ctxt, addr, desc, sizeof *desc, &ctxt->exception);
1452 if (ret == X86EMUL_PROPAGATE_FAULT)
1453 emulate_pf(ctxt, addr, err);
1454 1215
1455 return ret; 1216 return ret;
1456} 1217}
1457 1218
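/*
 * Illustrative sketch, not part of the patch: the selector-to-descriptor
 * address arithmetic used by read/write_segment_descriptor() above.  Bit 2
 * of the selector picks the LDT over the GDT, the upper 13 bits index
 * 8-byte entries, and an index past dt.size means #GP(selector & ~3).
 * "struct dtr" is a local stand-in for the descriptor-table register.
 */
#include <stdint.h>

static int descriptor_addr(const struct dtr { uint64_t address; uint16_t size; } *gdt,
                           const struct dtr *ldt, uint16_t selector, uint64_t *addr);

struct dtr { uint64_t address; uint16_t size; };

static int descriptor_addr(const struct dtr *gdt, const struct dtr *ldt,
                           uint16_t selector, uint64_t *addr)
{
        const struct dtr *dt = (selector & (1 << 2)) ? ldt : gdt;
        uint16_t index = selector >> 3;

        if (dt->size < index * 8 + 7)   /* same limit check as the patch */
                return -1;              /* caller raises #GP(sel & 0xfffc) */
        *addr = dt->address + index * 8;
        return 0;
}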
1219/* Does not support long mode */
1458static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt, 1220static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1459 struct x86_emulate_ops *ops, 1221 struct x86_emulate_ops *ops,
1460 u16 selector, int seg) 1222 u16 selector, int seg)
@@ -1509,7 +1271,7 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1509 1271
1510 rpl = selector & 3; 1272 rpl = selector & 3;
1511 dpl = seg_desc.dpl; 1273 dpl = seg_desc.dpl;
1512 cpl = ops->cpl(ctxt->vcpu); 1274 cpl = ops->cpl(ctxt);
1513 1275
1514 switch (seg) { 1276 switch (seg) {
1515 case VCPU_SREG_SS: 1277 case VCPU_SREG_SS:
@@ -1565,63 +1327,59 @@ static int load_segment_descriptor(struct x86_emulate_ctxt *ctxt,
1565 return ret; 1327 return ret;
1566 } 1328 }
1567load: 1329load:
1568 ops->set_segment_selector(selector, seg, ctxt->vcpu); 1330 ops->set_segment(ctxt, selector, &seg_desc, 0, seg);
1569 ops->set_cached_descriptor(&seg_desc, seg, ctxt->vcpu);
1570 return X86EMUL_CONTINUE; 1331 return X86EMUL_CONTINUE;
1571exception: 1332exception:
1572 emulate_exception(ctxt, err_vec, err_code, true); 1333 emulate_exception(ctxt, err_vec, err_code, true);
1573 return X86EMUL_PROPAGATE_FAULT; 1334 return X86EMUL_PROPAGATE_FAULT;
1574} 1335}
1575 1336
1576static inline int writeback(struct x86_emulate_ctxt *ctxt, 1337static void write_register_operand(struct operand *op)
1577 struct x86_emulate_ops *ops) 1338{
1339 /* The 4-byte case *is* correct: in 64-bit mode we zero-extend. */
1340 switch (op->bytes) {
1341 case 1:
1342 *(u8 *)op->addr.reg = (u8)op->val;
1343 break;
1344 case 2:
1345 *(u16 *)op->addr.reg = (u16)op->val;
1346 break;
1347 case 4:
1348 *op->addr.reg = (u32)op->val;
1349 break; /* 64b: zero-extend */
1350 case 8:
1351 *op->addr.reg = op->val;
1352 break;
1353 }
1354}
1355
1356static int writeback(struct x86_emulate_ctxt *ctxt)
1578{ 1357{
1579 int rc; 1358 int rc;
1580 struct decode_cache *c = &ctxt->decode; 1359 struct decode_cache *c = &ctxt->decode;
1581 u32 err;
1582 1360
1583 switch (c->dst.type) { 1361 switch (c->dst.type) {
1584 case OP_REG: 1362 case OP_REG:
1585 /* The 4-byte case *is* correct: 1363 write_register_operand(&c->dst);
1586 * in 64-bit mode we zero-extend.
1587 */
1588 switch (c->dst.bytes) {
1589 case 1:
1590 *(u8 *)c->dst.ptr = (u8)c->dst.val;
1591 break;
1592 case 2:
1593 *(u16 *)c->dst.ptr = (u16)c->dst.val;
1594 break;
1595 case 4:
1596 *c->dst.ptr = (u32)c->dst.val;
1597 break; /* 64b: zero-ext */
1598 case 8:
1599 *c->dst.ptr = c->dst.val;
1600 break;
1601 }
1602 break; 1364 break;
1603 case OP_MEM: 1365 case OP_MEM:
1604 if (c->lock_prefix) 1366 if (c->lock_prefix)
1605 rc = ops->cmpxchg_emulated( 1367 rc = segmented_cmpxchg(ctxt,
1606 (unsigned long)c->dst.ptr, 1368 c->dst.addr.mem,
1607 &c->dst.orig_val, 1369 &c->dst.orig_val,
1608 &c->dst.val, 1370 &c->dst.val,
1609 c->dst.bytes, 1371 c->dst.bytes);
1610 &err,
1611 ctxt->vcpu);
1612 else 1372 else
1613 rc = ops->write_emulated( 1373 rc = segmented_write(ctxt,
1614 (unsigned long)c->dst.ptr, 1374 c->dst.addr.mem,
1615 &c->dst.val, 1375 &c->dst.val,
1616 c->dst.bytes, 1376 c->dst.bytes);
1617 &err,
1618 ctxt->vcpu);
1619 if (rc == X86EMUL_PROPAGATE_FAULT)
1620 emulate_pf(ctxt,
1621 (unsigned long)c->dst.ptr, err);
1622 if (rc != X86EMUL_CONTINUE) 1377 if (rc != X86EMUL_CONTINUE)
1623 return rc; 1378 return rc;
1624 break; 1379 break;
1380 case OP_XMM:
1381 write_sse_reg(ctxt, &c->dst.vec_val, c->dst.addr.xmm);
1382 break;
1625 case OP_NONE: 1383 case OP_NONE:
1626 /* no writeback */ 1384 /* no writeback */
1627 break; 1385 break;
@@ -1631,29 +1389,30 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1631 return X86EMUL_CONTINUE; 1389 return X86EMUL_CONTINUE;
1632} 1390}
1633 1391
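/*
 * Illustrative sketch, not part of the patch: why the 4-byte case of the
 * new write_register_operand() stores through the full-width pointer.  On
 * x86-64 a 32-bit register write clears bits 63:32, while 8- and 16-bit
 * writes leave the upper bytes alone, and the emulator has to match that.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t reg = 0x1122334455667788ULL;

        reg = (reg & ~0xffffULL) | 0xbeef;      /* 16-bit write keeps bits 63:16 */
        printf("%016llx\n", (unsigned long long)reg);   /* 112233445566beef */

        reg = (uint32_t)0xdeadbeefu;            /* 32-bit write zero-extends */
        printf("%016llx\n", (unsigned long long)reg);   /* 00000000deadbeef */
        return 0;
}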
1634static inline void emulate_push(struct x86_emulate_ctxt *ctxt, 1392static int em_push(struct x86_emulate_ctxt *ctxt)
1635 struct x86_emulate_ops *ops)
1636{ 1393{
1637 struct decode_cache *c = &ctxt->decode; 1394 struct decode_cache *c = &ctxt->decode;
1395 struct segmented_address addr;
1638 1396
1639 c->dst.type = OP_MEM;
1640 c->dst.bytes = c->op_bytes;
1641 c->dst.val = c->src.val;
1642 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes); 1397 register_address_increment(c, &c->regs[VCPU_REGS_RSP], -c->op_bytes);
1643 c->dst.ptr = (void *) register_address(c, ss_base(ctxt, ops), 1398 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1644 c->regs[VCPU_REGS_RSP]); 1399 addr.seg = VCPU_SREG_SS;
1400
1401 /* Disable writeback. */
1402 c->dst.type = OP_NONE;
1403 return segmented_write(ctxt, addr, &c->src.val, c->op_bytes);
1645} 1404}
1646 1405
1647static int emulate_pop(struct x86_emulate_ctxt *ctxt, 1406static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1648 struct x86_emulate_ops *ops,
1649 void *dest, int len) 1407 void *dest, int len)
1650{ 1408{
1651 struct decode_cache *c = &ctxt->decode; 1409 struct decode_cache *c = &ctxt->decode;
1652 int rc; 1410 int rc;
1411 struct segmented_address addr;
1653 1412
1654 rc = read_emulated(ctxt, ops, register_address(c, ss_base(ctxt, ops), 1413 addr.ea = register_address(c, c->regs[VCPU_REGS_RSP]);
1655 c->regs[VCPU_REGS_RSP]), 1414 addr.seg = VCPU_SREG_SS;
1656 dest, len); 1415 rc = segmented_read(ctxt, addr, dest, len);
1657 if (rc != X86EMUL_CONTINUE) 1416 if (rc != X86EMUL_CONTINUE)
1658 return rc; 1417 return rc;
1659 1418
@@ -1661,6 +1420,13 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1661 return rc; 1420 return rc;
1662} 1421}
1663 1422
1423static int em_pop(struct x86_emulate_ctxt *ctxt)
1424{
1425 struct decode_cache *c = &ctxt->decode;
1426
1427 return emulate_pop(ctxt, &c->dst.val, c->op_bytes);
1428}
1429
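/*
 * Illustrative sketch, not part of the patch: the stack discipline that
 * em_push()/em_pop() above express through segmented SS accesses, reduced
 * to plain C over a flat byte array.  "stack" and "rsp" are local
 * stand-ins, not emulator state.
 */
#include <stdint.h>
#include <string.h>

static void push(uint8_t *stack, uint64_t *rsp, const void *val, unsigned op_bytes)
{
        *rsp -= op_bytes;                       /* adjust RSP first ... */
        memcpy(stack + *rsp, val, op_bytes);    /* ... then store at SS:RSP */
}

static void pop(const uint8_t *stack, uint64_t *rsp, void *val, unsigned op_bytes)
{
        memcpy(val, stack + *rsp, op_bytes);    /* load from SS:RSP ... */
        *rsp += op_bytes;                       /* ... then release the slot */
}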
1664static int emulate_popf(struct x86_emulate_ctxt *ctxt, 1430static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1665 struct x86_emulate_ops *ops, 1431 struct x86_emulate_ops *ops,
1666 void *dest, int len) 1432 void *dest, int len)
@@ -1668,9 +1434,9 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1668 int rc; 1434 int rc;
1669 unsigned long val, change_mask; 1435 unsigned long val, change_mask;
1670 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 1436 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1671 int cpl = ops->cpl(ctxt->vcpu); 1437 int cpl = ops->cpl(ctxt);
1672 1438
1673 rc = emulate_pop(ctxt, ops, &val, len); 1439 rc = emulate_pop(ctxt, &val, len);
1674 if (rc != X86EMUL_CONTINUE) 1440 if (rc != X86EMUL_CONTINUE)
1675 return rc; 1441 return rc;
1676 1442
@@ -1687,10 +1453,8 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1687 change_mask |= EFLG_IF; 1453 change_mask |= EFLG_IF;
1688 break; 1454 break;
1689 case X86EMUL_MODE_VM86: 1455 case X86EMUL_MODE_VM86:
1690 if (iopl < 3) { 1456 if (iopl < 3)
1691 emulate_gp(ctxt, 0); 1457 return emulate_gp(ctxt, 0);
1692 return X86EMUL_PROPAGATE_FAULT;
1693 }
1694 change_mask |= EFLG_IF; 1458 change_mask |= EFLG_IF;
1695 break; 1459 break;
1696 default: /* real mode */ 1460 default: /* real mode */
@@ -1704,14 +1468,24 @@ static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1704 return rc; 1468 return rc;
1705} 1469}
1706 1470
1707static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, 1471static int em_popf(struct x86_emulate_ctxt *ctxt)
1708 struct x86_emulate_ops *ops, int seg) 1472{
1473 struct decode_cache *c = &ctxt->decode;
1474
1475 c->dst.type = OP_REG;
1476 c->dst.addr.reg = &ctxt->eflags;
1477 c->dst.bytes = c->op_bytes;
1478 return emulate_popf(ctxt, ctxt->ops, &c->dst.val, c->op_bytes);
1479}
1480
1481static int emulate_push_sreg(struct x86_emulate_ctxt *ctxt,
1482 struct x86_emulate_ops *ops, int seg)
1709{ 1483{
1710 struct decode_cache *c = &ctxt->decode; 1484 struct decode_cache *c = &ctxt->decode;
1711 1485
1712 c->src.val = ops->get_segment_selector(seg, ctxt->vcpu); 1486 c->src.val = get_segment_selector(ctxt, seg);
1713 1487
1714 emulate_push(ctxt, ops); 1488 return em_push(ctxt);
1715} 1489}
1716 1490
1717static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt, 1491static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
@@ -1721,7 +1495,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1721 unsigned long selector; 1495 unsigned long selector;
1722 int rc; 1496 int rc;
1723 1497
1724 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes); 1498 rc = emulate_pop(ctxt, &selector, c->op_bytes);
1725 if (rc != X86EMUL_CONTINUE) 1499 if (rc != X86EMUL_CONTINUE)
1726 return rc; 1500 return rc;
1727 1501
@@ -1729,8 +1503,7 @@ static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1729 return rc; 1503 return rc;
1730} 1504}
1731 1505
1732static int emulate_pusha(struct x86_emulate_ctxt *ctxt, 1506static int em_pusha(struct x86_emulate_ctxt *ctxt)
1733 struct x86_emulate_ops *ops)
1734{ 1507{
1735 struct decode_cache *c = &ctxt->decode; 1508 struct decode_cache *c = &ctxt->decode;
1736 unsigned long old_esp = c->regs[VCPU_REGS_RSP]; 1509 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
@@ -1741,23 +1514,25 @@ static int emulate_pusha(struct x86_emulate_ctxt *ctxt,
1741 (reg == VCPU_REGS_RSP) ? 1514 (reg == VCPU_REGS_RSP) ?
1742 (c->src.val = old_esp) : (c->src.val = c->regs[reg]); 1515 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1743 1516
1744 emulate_push(ctxt, ops); 1517 rc = em_push(ctxt);
1745
1746 rc = writeback(ctxt, ops);
1747 if (rc != X86EMUL_CONTINUE) 1518 if (rc != X86EMUL_CONTINUE)
1748 return rc; 1519 return rc;
1749 1520
1750 ++reg; 1521 ++reg;
1751 } 1522 }
1752 1523
1753 /* Disable writeback. */
1754 c->dst.type = OP_NONE;
1755
1756 return rc; 1524 return rc;
1757} 1525}
1758 1526
1759static int emulate_popa(struct x86_emulate_ctxt *ctxt, 1527static int em_pushf(struct x86_emulate_ctxt *ctxt)
1760 struct x86_emulate_ops *ops) 1528{
1529 struct decode_cache *c = &ctxt->decode;
1530
1531 c->src.val = (unsigned long)ctxt->eflags;
1532 return em_push(ctxt);
1533}
1534
1535static int em_popa(struct x86_emulate_ctxt *ctxt)
1761{ 1536{
1762 struct decode_cache *c = &ctxt->decode; 1537 struct decode_cache *c = &ctxt->decode;
1763 int rc = X86EMUL_CONTINUE; 1538 int rc = X86EMUL_CONTINUE;
@@ -1770,7 +1545,7 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1770 --reg; 1545 --reg;
1771 } 1546 }
1772 1547
1773 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes); 1548 rc = emulate_pop(ctxt, &c->regs[reg], c->op_bytes);
1774 if (rc != X86EMUL_CONTINUE) 1549 if (rc != X86EMUL_CONTINUE)
1775 break; 1550 break;
1776 --reg; 1551 --reg;
@@ -1778,15 +1553,167 @@ static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1778 return rc; 1553 return rc;
1779} 1554}
1780 1555
1781static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1556int emulate_int_real(struct x86_emulate_ctxt *ctxt,
1782 struct x86_emulate_ops *ops) 1557 struct x86_emulate_ops *ops, int irq)
1783{ 1558{
1784 struct decode_cache *c = &ctxt->decode; 1559 struct decode_cache *c = &ctxt->decode;
1560 int rc;
1561 struct desc_ptr dt;
1562 gva_t cs_addr;
1563 gva_t eip_addr;
1564 u16 cs, eip;
1785 1565
1786 return emulate_pop(ctxt, ops, &c->dst.val, c->dst.bytes); 1566 /* TODO: Add limit checks */
1567 c->src.val = ctxt->eflags;
1568 rc = em_push(ctxt);
1569 if (rc != X86EMUL_CONTINUE)
1570 return rc;
1571
1572 ctxt->eflags &= ~(EFLG_IF | EFLG_TF | EFLG_AC);
1573
1574 c->src.val = get_segment_selector(ctxt, VCPU_SREG_CS);
1575 rc = em_push(ctxt);
1576 if (rc != X86EMUL_CONTINUE)
1577 return rc;
1578
1579 c->src.val = c->eip;
1580 rc = em_push(ctxt);
1581 if (rc != X86EMUL_CONTINUE)
1582 return rc;
1583
1584 ops->get_idt(ctxt, &dt);
1585
1586 eip_addr = dt.address + (irq << 2);
1587 cs_addr = dt.address + (irq << 2) + 2;
1588
1589 rc = ops->read_std(ctxt, cs_addr, &cs, 2, &ctxt->exception);
1590 if (rc != X86EMUL_CONTINUE)
1591 return rc;
1592
1593 rc = ops->read_std(ctxt, eip_addr, &eip, 2, &ctxt->exception);
1594 if (rc != X86EMUL_CONTINUE)
1595 return rc;
1596
1597 rc = load_segment_descriptor(ctxt, ops, cs, VCPU_SREG_CS);
1598 if (rc != X86EMUL_CONTINUE)
1599 return rc;
1600
1601 c->eip = eip;
1602
1603 return rc;
1787} 1604}
1788 1605
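/*
 * Illustrative sketch, not part of the patch: the real-mode IVT lookup
 * performed by the new emulate_int_real() -- each vector is four bytes at
 * idt.base + irq * 4, low word = new IP, high word = new CS.  "ivt" is a
 * stand-in for guest memory at the IDT base.
 */
#include <stdint.h>

static void ivt_entry(const uint8_t *ivt, unsigned irq, uint16_t *cs, uint16_t *ip)
{
        unsigned off = irq << 2;                                /* 4 bytes per vector */

        *ip = (uint16_t)(ivt[off] | (ivt[off + 1] << 8));       /* +0: IP */
        *cs = (uint16_t)(ivt[off + 2] | (ivt[off + 3] << 8));   /* +2: CS */
}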
1789static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt) 1606static int emulate_int(struct x86_emulate_ctxt *ctxt,
1607 struct x86_emulate_ops *ops, int irq)
1608{
1609 switch(ctxt->mode) {
1610 case X86EMUL_MODE_REAL:
1611 return emulate_int_real(ctxt, ops, irq);
1612 case X86EMUL_MODE_VM86:
1613 case X86EMUL_MODE_PROT16:
1614 case X86EMUL_MODE_PROT32:
1615 case X86EMUL_MODE_PROT64:
1616 default:
1617 /* Protected mode interrupts unimplemented yet */
1618 return X86EMUL_UNHANDLEABLE;
1619 }
1620}
1621
1622static int emulate_iret_real(struct x86_emulate_ctxt *ctxt,
1623 struct x86_emulate_ops *ops)
1624{
1625 struct decode_cache *c = &ctxt->decode;
1626 int rc = X86EMUL_CONTINUE;
1627 unsigned long temp_eip = 0;
1628 unsigned long temp_eflags = 0;
1629 unsigned long cs = 0;
1630 unsigned long mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_TF |
1631 EFLG_IF | EFLG_DF | EFLG_OF | EFLG_IOPL | EFLG_NT | EFLG_RF |
1632 EFLG_AC | EFLG_ID | (1 << 1); /* Last one is the reserved bit */
1633 unsigned long vm86_mask = EFLG_VM | EFLG_VIF | EFLG_VIP;
1634
1635 /* TODO: Add stack limit check */
1636
1637 rc = emulate_pop(ctxt, &temp_eip, c->op_bytes);
1638
1639 if (rc != X86EMUL_CONTINUE)
1640 return rc;
1641
1642 if (temp_eip & ~0xffff)
1643 return emulate_gp(ctxt, 0);
1644
1645 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1646
1647 if (rc != X86EMUL_CONTINUE)
1648 return rc;
1649
1650 rc = emulate_pop(ctxt, &temp_eflags, c->op_bytes);
1651
1652 if (rc != X86EMUL_CONTINUE)
1653 return rc;
1654
1655 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1656
1657 if (rc != X86EMUL_CONTINUE)
1658 return rc;
1659
1660 c->eip = temp_eip;
1661
1662
1663 if (c->op_bytes == 4)
1664 ctxt->eflags = ((temp_eflags & mask) | (ctxt->eflags & vm86_mask));
1665 else if (c->op_bytes == 2) {
1666 ctxt->eflags &= ~0xffff;
1667 ctxt->eflags |= temp_eflags;
1668 }
1669
1670 ctxt->eflags &= ~EFLG_RESERVED_ZEROS_MASK; /* Clear reserved zeros */
1671 ctxt->eflags |= EFLG_RESERVED_ONE_MASK;
1672
1673 return rc;
1674}
1675
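/*
 * Illustrative sketch, not part of the patch: the EFLAGS merge done by
 * emulate_iret_real() above.  Only the bits in the poppable mask may come
 * from the stack image, VM/VIF/VIP are kept from the current flags, and
 * reserved bit 1 is forced on.  Slightly simplified (the patch replaces
 * the low word wholesale for a 16-bit IRET); the constants are local.
 */
#include <stdint.h>

#define FLAGS_POPPABLE  0x00257fd5u  /* CF PF AF ZF SF TF IF DF OF IOPL NT RF AC ID */
#define FLAGS_VM86      0x001a0000u  /* VM VIF VIP: never taken from the stack */

static uint32_t iret_merge_flags(uint32_t cur, uint32_t popped, unsigned op_bytes)
{
        if (op_bytes == 2)              /* 16-bit IRET: keep the high half */
                popped = (cur & 0xffff0000u) | (popped & 0xffffu);

        return (popped & FLAGS_POPPABLE) | (cur & FLAGS_VM86) | 0x2u;
}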
1676static inline int emulate_iret(struct x86_emulate_ctxt *ctxt,
1677 struct x86_emulate_ops* ops)
1678{
1679 switch(ctxt->mode) {
1680 case X86EMUL_MODE_REAL:
1681 return emulate_iret_real(ctxt, ops);
1682 case X86EMUL_MODE_VM86:
1683 case X86EMUL_MODE_PROT16:
1684 case X86EMUL_MODE_PROT32:
1685 case X86EMUL_MODE_PROT64:
1686 default:
1687 /* iret from protected mode unimplemented yet */
1688 return X86EMUL_UNHANDLEABLE;
1689 }
1690}
1691
1692static int em_jmp_far(struct x86_emulate_ctxt *ctxt)
1693{
1694 struct decode_cache *c = &ctxt->decode;
1695 int rc;
1696 unsigned short sel;
1697
1698 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1699
1700 rc = load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS);
1701 if (rc != X86EMUL_CONTINUE)
1702 return rc;
1703
1704 c->eip = 0;
1705 memcpy(&c->eip, c->src.valptr, c->op_bytes);
1706 return X86EMUL_CONTINUE;
1707}
1708
1709static int em_grp1a(struct x86_emulate_ctxt *ctxt)
1710{
1711 struct decode_cache *c = &ctxt->decode;
1712
1713 return emulate_pop(ctxt, &c->dst.val, c->dst.bytes);
1714}
1715
1716static int em_grp2(struct x86_emulate_ctxt *ctxt)
1790{ 1717{
1791 struct decode_cache *c = &ctxt->decode; 1718 struct decode_cache *c = &ctxt->decode;
1792 switch (c->modrm_reg) { 1719 switch (c->modrm_reg) {
@@ -1813,12 +1740,15 @@ static inline void emulate_grp2(struct x86_emulate_ctxt *ctxt)
1813 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags); 1740 emulate_2op_SrcB("sar", c->src, c->dst, ctxt->eflags);
1814 break; 1741 break;
1815 } 1742 }
1743 return X86EMUL_CONTINUE;
1816} 1744}
1817 1745
1818static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt, 1746static int em_grp3(struct x86_emulate_ctxt *ctxt)
1819 struct x86_emulate_ops *ops)
1820{ 1747{
1821 struct decode_cache *c = &ctxt->decode; 1748 struct decode_cache *c = &ctxt->decode;
1749 unsigned long *rax = &c->regs[VCPU_REGS_RAX];
1750 unsigned long *rdx = &c->regs[VCPU_REGS_RDX];
1751 u8 de = 0;
1822 1752
1823 switch (c->modrm_reg) { 1753 switch (c->modrm_reg) {
1824 case 0 ... 1: /* test */ 1754 case 0 ... 1: /* test */
@@ -1830,16 +1760,32 @@ static inline int emulate_grp3(struct x86_emulate_ctxt *ctxt,
1830 case 3: /* neg */ 1760 case 3: /* neg */
1831 emulate_1op("neg", c->dst, ctxt->eflags); 1761 emulate_1op("neg", c->dst, ctxt->eflags);
1832 break; 1762 break;
1763 case 4: /* mul */
1764 emulate_1op_rax_rdx("mul", c->src, *rax, *rdx, ctxt->eflags);
1765 break;
1766 case 5: /* imul */
1767 emulate_1op_rax_rdx("imul", c->src, *rax, *rdx, ctxt->eflags);
1768 break;
1769 case 6: /* div */
1770 emulate_1op_rax_rdx_ex("div", c->src, *rax, *rdx,
1771 ctxt->eflags, de);
1772 break;
1773 case 7: /* idiv */
1774 emulate_1op_rax_rdx_ex("idiv", c->src, *rax, *rdx,
1775 ctxt->eflags, de);
1776 break;
1833 default: 1777 default:
1834 return 0; 1778 return X86EMUL_UNHANDLEABLE;
1835 } 1779 }
1836 return 1; 1780 if (de)
1781 return emulate_de(ctxt);
1782 return X86EMUL_CONTINUE;
1837} 1783}
1838 1784
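/*
 * Illustrative sketch, not part of the patch: what the new div case in
 * em_grp3() has to guard against.  Hardware DIV raises #DE both for a zero
 * divisor and for a quotient that does not fit the destination; the patch
 * mirrors that through the "de" flag and emulate_de().  32-bit form only.
 */
#include <stdint.h>

/* 64/32 -> 32 unsigned divide, EDX:EAX style.  Returns 0, or -1 for #DE. */
static int div_edx_eax(uint32_t *eax, uint32_t *edx, uint32_t divisor)
{
        uint64_t dividend = ((uint64_t)*edx << 32) | *eax;
        uint64_t q;

        if (divisor == 0)
                return -1;                      /* #DE: divide by zero */
        q = dividend / divisor;
        if (q > 0xffffffffull)
                return -1;                      /* #DE: quotient overflow */
        *eax = (uint32_t)q;                     /* quotient  -> EAX */
        *edx = (uint32_t)(dividend % divisor);  /* remainder -> EDX */
        return 0;
}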
1839static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt, 1785static int em_grp45(struct x86_emulate_ctxt *ctxt)
1840 struct x86_emulate_ops *ops)
1841{ 1786{
1842 struct decode_cache *c = &ctxt->decode; 1787 struct decode_cache *c = &ctxt->decode;
1788 int rc = X86EMUL_CONTINUE;
1843 1789
1844 switch (c->modrm_reg) { 1790 switch (c->modrm_reg) {
1845 case 0: /* inc */ 1791 case 0: /* inc */
@@ -1853,21 +1799,23 @@ static inline int emulate_grp45(struct x86_emulate_ctxt *ctxt,
1853 old_eip = c->eip; 1799 old_eip = c->eip;
1854 c->eip = c->src.val; 1800 c->eip = c->src.val;
1855 c->src.val = old_eip; 1801 c->src.val = old_eip;
1856 emulate_push(ctxt, ops); 1802 rc = em_push(ctxt);
1857 break; 1803 break;
1858 } 1804 }
1859 case 4: /* jmp abs */ 1805 case 4: /* jmp abs */
1860 c->eip = c->src.val; 1806 c->eip = c->src.val;
1861 break; 1807 break;
1808 case 5: /* jmp far */
1809 rc = em_jmp_far(ctxt);
1810 break;
1862 case 6: /* push */ 1811 case 6: /* push */
1863 emulate_push(ctxt, ops); 1812 rc = em_push(ctxt);
1864 break; 1813 break;
1865 } 1814 }
1866 return X86EMUL_CONTINUE; 1815 return rc;
1867} 1816}
1868 1817
1869static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt, 1818static int em_grp9(struct x86_emulate_ctxt *ctxt)
1870 struct x86_emulate_ops *ops)
1871{ 1819{
1872 struct decode_cache *c = &ctxt->decode; 1820 struct decode_cache *c = &ctxt->decode;
1873 u64 old = c->dst.orig_val64; 1821 u64 old = c->dst.orig_val64;
@@ -1893,25 +1841,44 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1893 int rc; 1841 int rc;
1894 unsigned long cs; 1842 unsigned long cs;
1895 1843
1896 rc = emulate_pop(ctxt, ops, &c->eip, c->op_bytes); 1844 rc = emulate_pop(ctxt, &c->eip, c->op_bytes);
1897 if (rc != X86EMUL_CONTINUE) 1845 if (rc != X86EMUL_CONTINUE)
1898 return rc; 1846 return rc;
1899 if (c->op_bytes == 4) 1847 if (c->op_bytes == 4)
1900 c->eip = (u32)c->eip; 1848 c->eip = (u32)c->eip;
1901 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1849 rc = emulate_pop(ctxt, &cs, c->op_bytes);
1902 if (rc != X86EMUL_CONTINUE) 1850 if (rc != X86EMUL_CONTINUE)
1903 return rc; 1851 return rc;
1904 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS); 1852 rc = load_segment_descriptor(ctxt, ops, (u16)cs, VCPU_SREG_CS);
1905 return rc; 1853 return rc;
1906} 1854}
1907 1855
1856static int emulate_load_segment(struct x86_emulate_ctxt *ctxt,
1857 struct x86_emulate_ops *ops, int seg)
1858{
1859 struct decode_cache *c = &ctxt->decode;
1860 unsigned short sel;
1861 int rc;
1862
1863 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
1864
1865 rc = load_segment_descriptor(ctxt, ops, sel, seg);
1866 if (rc != X86EMUL_CONTINUE)
1867 return rc;
1868
1869 c->dst.val = c->src.val;
1870 return rc;
1871}
1872
1908static inline void 1873static inline void
1909setup_syscalls_segments(struct x86_emulate_ctxt *ctxt, 1874setup_syscalls_segments(struct x86_emulate_ctxt *ctxt,
1910 struct x86_emulate_ops *ops, struct desc_struct *cs, 1875 struct x86_emulate_ops *ops, struct desc_struct *cs,
1911 struct desc_struct *ss) 1876 struct desc_struct *ss)
1912{ 1877{
1878 u16 selector;
1879
1913 memset(cs, 0, sizeof(struct desc_struct)); 1880 memset(cs, 0, sizeof(struct desc_struct));
1914 ops->get_cached_descriptor(cs, VCPU_SREG_CS, ctxt->vcpu); 1881 ops->get_segment(ctxt, &selector, cs, NULL, VCPU_SREG_CS);
1915 memset(ss, 0, sizeof(struct desc_struct)); 1882 memset(ss, 0, sizeof(struct desc_struct));
1916 1883
1917 cs->l = 0; /* will be adjusted later */ 1884 cs->l = 0; /* will be adjusted later */
@@ -1941,46 +1908,44 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1941 struct desc_struct cs, ss; 1908 struct desc_struct cs, ss;
1942 u64 msr_data; 1909 u64 msr_data;
1943 u16 cs_sel, ss_sel; 1910 u16 cs_sel, ss_sel;
1911 u64 efer = 0;
1944 1912
1945 /* syscall is not available in real mode */ 1913 /* syscall is not available in real mode */
1946 if (ctxt->mode == X86EMUL_MODE_REAL || 1914 if (ctxt->mode == X86EMUL_MODE_REAL ||
1947 ctxt->mode == X86EMUL_MODE_VM86) { 1915 ctxt->mode == X86EMUL_MODE_VM86)
1948 emulate_ud(ctxt); 1916 return emulate_ud(ctxt);
1949 return X86EMUL_PROPAGATE_FAULT;
1950 }
1951 1917
1918 ops->get_msr(ctxt, MSR_EFER, &efer);
1952 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1919 setup_syscalls_segments(ctxt, ops, &cs, &ss);
1953 1920
1954 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1921 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1955 msr_data >>= 32; 1922 msr_data >>= 32;
1956 cs_sel = (u16)(msr_data & 0xfffc); 1923 cs_sel = (u16)(msr_data & 0xfffc);
1957 ss_sel = (u16)(msr_data + 8); 1924 ss_sel = (u16)(msr_data + 8);
1958 1925
1959 if (is_long_mode(ctxt->vcpu)) { 1926 if (efer & EFER_LMA) {
1960 cs.d = 0; 1927 cs.d = 0;
1961 cs.l = 1; 1928 cs.l = 1;
1962 } 1929 }
1963 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 1930 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
1964 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 1931 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
1965 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
1966 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
1967 1932
1968 c->regs[VCPU_REGS_RCX] = c->eip; 1933 c->regs[VCPU_REGS_RCX] = c->eip;
1969 if (is_long_mode(ctxt->vcpu)) { 1934 if (efer & EFER_LMA) {
1970#ifdef CONFIG_X86_64 1935#ifdef CONFIG_X86_64
1971 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF; 1936 c->regs[VCPU_REGS_R11] = ctxt->eflags & ~EFLG_RF;
1972 1937
1973 ops->get_msr(ctxt->vcpu, 1938 ops->get_msr(ctxt,
1974 ctxt->mode == X86EMUL_MODE_PROT64 ? 1939 ctxt->mode == X86EMUL_MODE_PROT64 ?
1975 MSR_LSTAR : MSR_CSTAR, &msr_data); 1940 MSR_LSTAR : MSR_CSTAR, &msr_data);
1976 c->eip = msr_data; 1941 c->eip = msr_data;
1977 1942
1978 ops->get_msr(ctxt->vcpu, MSR_SYSCALL_MASK, &msr_data); 1943 ops->get_msr(ctxt, MSR_SYSCALL_MASK, &msr_data);
1979 ctxt->eflags &= ~(msr_data | EFLG_RF); 1944 ctxt->eflags &= ~(msr_data | EFLG_RF);
1980#endif 1945#endif
1981 } else { 1946 } else {
1982 /* legacy mode */ 1947 /* legacy mode */
1983 ops->get_msr(ctxt->vcpu, MSR_STAR, &msr_data); 1948 ops->get_msr(ctxt, MSR_STAR, &msr_data);
1984 c->eip = (u32)msr_data; 1949 c->eip = (u32)msr_data;
1985 1950
1986 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1951 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
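/*
 * Illustrative sketch, not part of the patch: how the SYSCALL target
 * selectors fall out of MSR_STAR in the emulate_syscall() hunk above.
 * Bits 47:32 give the kernel CS (RPL cleared); SS is the next GDT entry.
 */
#include <stdint.h>

static void star_to_selectors(uint64_t star, uint16_t *cs_sel, uint16_t *ss_sel)
{
        uint16_t sel = (uint16_t)(star >> 32);

        *cs_sel = sel & 0xfffc;         /* clear RPL, as the patch does */
        *ss_sel = sel + 8;              /* SS descriptor follows CS in the GDT */
}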
@@ -1996,36 +1961,30 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1996 struct desc_struct cs, ss; 1961 struct desc_struct cs, ss;
1997 u64 msr_data; 1962 u64 msr_data;
1998 u16 cs_sel, ss_sel; 1963 u16 cs_sel, ss_sel;
1964 u64 efer = 0;
1999 1965
1966 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2000 /* inject #GP if in real mode */ 1967 /* inject #GP if in real mode */
2001 if (ctxt->mode == X86EMUL_MODE_REAL) { 1968 if (ctxt->mode == X86EMUL_MODE_REAL)
2002 emulate_gp(ctxt, 0); 1969 return emulate_gp(ctxt, 0);
2003 return X86EMUL_PROPAGATE_FAULT;
2004 }
2005 1970
2006 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1971 /* XXX sysenter/sysexit have not been tested in 64bit mode.
2007 * Therefore, we inject an #UD. 1972 * Therefore, we inject an #UD.
2008 */ 1973 */
2009 if (ctxt->mode == X86EMUL_MODE_PROT64) { 1974 if (ctxt->mode == X86EMUL_MODE_PROT64)
2010 emulate_ud(ctxt); 1975 return emulate_ud(ctxt);
2011 return X86EMUL_PROPAGATE_FAULT;
2012 }
2013 1976
2014 setup_syscalls_segments(ctxt, ops, &cs, &ss); 1977 setup_syscalls_segments(ctxt, ops, &cs, &ss);
2015 1978
2016 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 1979 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
2017 switch (ctxt->mode) { 1980 switch (ctxt->mode) {
2018 case X86EMUL_MODE_PROT32: 1981 case X86EMUL_MODE_PROT32:
2019 if ((msr_data & 0xfffc) == 0x0) { 1982 if ((msr_data & 0xfffc) == 0x0)
2020 emulate_gp(ctxt, 0); 1983 return emulate_gp(ctxt, 0);
2021 return X86EMUL_PROPAGATE_FAULT;
2022 }
2023 break; 1984 break;
2024 case X86EMUL_MODE_PROT64: 1985 case X86EMUL_MODE_PROT64:
2025 if (msr_data == 0x0) { 1986 if (msr_data == 0x0)
2026 emulate_gp(ctxt, 0); 1987 return emulate_gp(ctxt, 0);
2027 return X86EMUL_PROPAGATE_FAULT;
2028 }
2029 break; 1988 break;
2030 } 1989 }
2031 1990
@@ -2034,21 +1993,18 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2034 cs_sel &= ~SELECTOR_RPL_MASK; 1993 cs_sel &= ~SELECTOR_RPL_MASK;
2035 ss_sel = cs_sel + 8; 1994 ss_sel = cs_sel + 8;
2036 ss_sel &= ~SELECTOR_RPL_MASK; 1995 ss_sel &= ~SELECTOR_RPL_MASK;
2037 if (ctxt->mode == X86EMUL_MODE_PROT64 1996 if (ctxt->mode == X86EMUL_MODE_PROT64 || (efer & EFER_LMA)) {
2038 || is_long_mode(ctxt->vcpu)) {
2039 cs.d = 0; 1997 cs.d = 0;
2040 cs.l = 1; 1998 cs.l = 1;
2041 } 1999 }
2042 2000
2043 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 2001 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2044 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2002 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2045 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2046 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2047 2003
2048 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_EIP, &msr_data); 2004 ops->get_msr(ctxt, MSR_IA32_SYSENTER_EIP, &msr_data);
2049 c->eip = msr_data; 2005 c->eip = msr_data;
2050 2006
2051 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 2007 ops->get_msr(ctxt, MSR_IA32_SYSENTER_ESP, &msr_data);
2052 c->regs[VCPU_REGS_RSP] = msr_data; 2008 c->regs[VCPU_REGS_RSP] = msr_data;
2053 2009
2054 return X86EMUL_CONTINUE; 2010 return X86EMUL_CONTINUE;
@@ -2065,10 +2021,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2065 2021
2066 /* inject #GP if in real mode or Virtual 8086 mode */ 2022 /* inject #GP if in real mode or Virtual 8086 mode */
2067 if (ctxt->mode == X86EMUL_MODE_REAL || 2023 if (ctxt->mode == X86EMUL_MODE_REAL ||
2068 ctxt->mode == X86EMUL_MODE_VM86) { 2024 ctxt->mode == X86EMUL_MODE_VM86)
2069 emulate_gp(ctxt, 0); 2025 return emulate_gp(ctxt, 0);
2070 return X86EMUL_PROPAGATE_FAULT;
2071 }
2072 2026
2073 setup_syscalls_segments(ctxt, ops, &cs, &ss); 2027 setup_syscalls_segments(ctxt, ops, &cs, &ss);
2074 2028
@@ -2079,22 +2033,18 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2079 2033
2080 cs.dpl = 3; 2034 cs.dpl = 3;
2081 ss.dpl = 3; 2035 ss.dpl = 3;
2082 ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_CS, &msr_data); 2036 ops->get_msr(ctxt, MSR_IA32_SYSENTER_CS, &msr_data);
2083 switch (usermode) { 2037 switch (usermode) {
2084 case X86EMUL_MODE_PROT32: 2038 case X86EMUL_MODE_PROT32:
2085 cs_sel = (u16)(msr_data + 16); 2039 cs_sel = (u16)(msr_data + 16);
2086 if ((msr_data & 0xfffc) == 0x0) { 2040 if ((msr_data & 0xfffc) == 0x0)
2087 emulate_gp(ctxt, 0); 2041 return emulate_gp(ctxt, 0);
2088 return X86EMUL_PROPAGATE_FAULT;
2089 }
2090 ss_sel = (u16)(msr_data + 24); 2042 ss_sel = (u16)(msr_data + 24);
2091 break; 2043 break;
2092 case X86EMUL_MODE_PROT64: 2044 case X86EMUL_MODE_PROT64:
2093 cs_sel = (u16)(msr_data + 32); 2045 cs_sel = (u16)(msr_data + 32);
2094 if (msr_data == 0x0) { 2046 if (msr_data == 0x0)
2095 emulate_gp(ctxt, 0); 2047 return emulate_gp(ctxt, 0);
2096 return X86EMUL_PROPAGATE_FAULT;
2097 }
2098 ss_sel = cs_sel + 8; 2048 ss_sel = cs_sel + 8;
2099 cs.d = 0; 2049 cs.d = 0;
2100 cs.l = 1; 2050 cs.l = 1;
@@ -2103,10 +2053,8 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2103 cs_sel |= SELECTOR_RPL_MASK; 2053 cs_sel |= SELECTOR_RPL_MASK;
2104 ss_sel |= SELECTOR_RPL_MASK; 2054 ss_sel |= SELECTOR_RPL_MASK;
2105 2055
2106 ops->set_cached_descriptor(&cs, VCPU_SREG_CS, ctxt->vcpu); 2056 ops->set_segment(ctxt, cs_sel, &cs, 0, VCPU_SREG_CS);
2107 ops->set_segment_selector(cs_sel, VCPU_SREG_CS, ctxt->vcpu); 2057 ops->set_segment(ctxt, ss_sel, &ss, 0, VCPU_SREG_SS);
2108 ops->set_cached_descriptor(&ss, VCPU_SREG_SS, ctxt->vcpu);
2109 ops->set_segment_selector(ss_sel, VCPU_SREG_SS, ctxt->vcpu);
2110 2058
2111 c->eip = c->regs[VCPU_REGS_RDX]; 2059 c->eip = c->regs[VCPU_REGS_RDX];
2112 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX]; 2060 c->regs[VCPU_REGS_RSP] = c->regs[VCPU_REGS_RCX];
@@ -2123,7 +2071,7 @@ static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt,
2123 if (ctxt->mode == X86EMUL_MODE_VM86) 2071 if (ctxt->mode == X86EMUL_MODE_VM86)
2124 return true; 2072 return true;
2125 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT; 2073 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
2126 return ops->cpl(ctxt->vcpu) > iopl; 2074 return ops->cpl(ctxt) > iopl;
2127} 2075}
2128 2076
2129static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt, 2077static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
@@ -2131,24 +2079,27 @@ static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
2131 u16 port, u16 len) 2079 u16 port, u16 len)
2132{ 2080{
2133 struct desc_struct tr_seg; 2081 struct desc_struct tr_seg;
2082 u32 base3;
2134 int r; 2083 int r;
2135 u16 io_bitmap_ptr; 2084 u16 tr, io_bitmap_ptr, perm, bit_idx = port & 0x7;
2136 u8 perm, bit_idx = port & 0x7;
2137 unsigned mask = (1 << len) - 1; 2085 unsigned mask = (1 << len) - 1;
2086 unsigned long base;
2138 2087
2139 ops->get_cached_descriptor(&tr_seg, VCPU_SREG_TR, ctxt->vcpu); 2088 ops->get_segment(ctxt, &tr, &tr_seg, &base3, VCPU_SREG_TR);
2140 if (!tr_seg.p) 2089 if (!tr_seg.p)
2141 return false; 2090 return false;
2142 if (desc_limit_scaled(&tr_seg) < 103) 2091 if (desc_limit_scaled(&tr_seg) < 103)
2143 return false; 2092 return false;
2144 r = ops->read_std(get_desc_base(&tr_seg) + 102, &io_bitmap_ptr, 2, 2093 base = get_desc_base(&tr_seg);
2145 ctxt->vcpu, NULL); 2094#ifdef CONFIG_X86_64
2095 base |= ((u64)base3) << 32;
2096#endif
2097 r = ops->read_std(ctxt, base + 102, &io_bitmap_ptr, 2, NULL);
2146 if (r != X86EMUL_CONTINUE) 2098 if (r != X86EMUL_CONTINUE)
2147 return false; 2099 return false;
2148 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg)) 2100 if (io_bitmap_ptr + port/8 > desc_limit_scaled(&tr_seg))
2149 return false; 2101 return false;
2150 r = ops->read_std(get_desc_base(&tr_seg) + io_bitmap_ptr + port/8, 2102 r = ops->read_std(ctxt, base + io_bitmap_ptr + port/8, &perm, 2, NULL);
2151 &perm, 1, ctxt->vcpu, NULL);
2152 if (r != X86EMUL_CONTINUE) 2103 if (r != X86EMUL_CONTINUE)
2153 return false; 2104 return false;
2154 if ((perm >> bit_idx) & mask) 2105 if ((perm >> bit_idx) & mask)
@@ -2160,9 +2111,15 @@ static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
2160 struct x86_emulate_ops *ops, 2111 struct x86_emulate_ops *ops,
2161 u16 port, u16 len) 2112 u16 port, u16 len)
2162{ 2113{
2114 if (ctxt->perm_ok)
2115 return true;
2116
2163 if (emulator_bad_iopl(ctxt, ops)) 2117 if (emulator_bad_iopl(ctxt, ops))
2164 if (!emulator_io_port_access_allowed(ctxt, ops, port, len)) 2118 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
2165 return false; 2119 return false;
2120
2121 ctxt->perm_ok = true;
2122
2166 return true; 2123 return true;
2167} 2124}
2168 2125
@@ -2183,11 +2140,11 @@ static void save_state_to_tss16(struct x86_emulate_ctxt *ctxt,
2183 tss->si = c->regs[VCPU_REGS_RSI]; 2140 tss->si = c->regs[VCPU_REGS_RSI];
2184 tss->di = c->regs[VCPU_REGS_RDI]; 2141 tss->di = c->regs[VCPU_REGS_RDI];
2185 2142
2186 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2143 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2187 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2144 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2188 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2145 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
2189 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2146 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
2190 tss->ldt = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2147 tss->ldt = get_segment_selector(ctxt, VCPU_SREG_LDTR);
2191} 2148}
2192 2149
2193static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt, 2150static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
@@ -2212,11 +2169,11 @@ static int load_state_from_tss16(struct x86_emulate_ctxt *ctxt,
2212 * SDM says that segment selectors are loaded before segment 2169 * SDM says that segment selectors are loaded before segment
2213 * descriptors 2170 * descriptors
2214 */ 2171 */
2215 ops->set_segment_selector(tss->ldt, VCPU_SREG_LDTR, ctxt->vcpu); 2172 set_segment_selector(ctxt, tss->ldt, VCPU_SREG_LDTR);
2216 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2173 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
2217 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2174 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
2218 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2175 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
2219 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2176 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2220 2177
2221 /* 2178 /*
 2222 * Now load segment descriptors. If a fault happens at this stage 2179
@@ -2248,46 +2205,38 @@ static int task_switch_16(struct x86_emulate_ctxt *ctxt,
2248{ 2205{
2249 struct tss_segment_16 tss_seg; 2206 struct tss_segment_16 tss_seg;
2250 int ret; 2207 int ret;
2251 u32 err, new_tss_base = get_desc_base(new_desc); 2208 u32 new_tss_base = get_desc_base(new_desc);
2252 2209
2253 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2210 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2254 &err); 2211 &ctxt->exception);
2255 if (ret == X86EMUL_PROPAGATE_FAULT) { 2212 if (ret != X86EMUL_CONTINUE)
2256 /* FIXME: need to provide precise fault address */ 2213 /* FIXME: need to provide precise fault address */
2257 emulate_pf(ctxt, old_tss_base, err);
2258 return ret; 2214 return ret;
2259 }
2260 2215
2261 save_state_to_tss16(ctxt, ops, &tss_seg); 2216 save_state_to_tss16(ctxt, ops, &tss_seg);
2262 2217
2263 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2218 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2264 &err); 2219 &ctxt->exception);
2265 if (ret == X86EMUL_PROPAGATE_FAULT) { 2220 if (ret != X86EMUL_CONTINUE)
2266 /* FIXME: need to provide precise fault address */ 2221 /* FIXME: need to provide precise fault address */
2267 emulate_pf(ctxt, old_tss_base, err);
2268 return ret; 2222 return ret;
2269 }
2270 2223
2271 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2224 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2272 &err); 2225 &ctxt->exception);
2273 if (ret == X86EMUL_PROPAGATE_FAULT) { 2226 if (ret != X86EMUL_CONTINUE)
2274 /* FIXME: need to provide precise fault address */ 2227 /* FIXME: need to provide precise fault address */
2275 emulate_pf(ctxt, new_tss_base, err);
2276 return ret; 2228 return ret;
2277 }
2278 2229
2279 if (old_tss_sel != 0xffff) { 2230 if (old_tss_sel != 0xffff) {
2280 tss_seg.prev_task_link = old_tss_sel; 2231 tss_seg.prev_task_link = old_tss_sel;
2281 2232
2282 ret = ops->write_std(new_tss_base, 2233 ret = ops->write_std(ctxt, new_tss_base,
2283 &tss_seg.prev_task_link, 2234 &tss_seg.prev_task_link,
2284 sizeof tss_seg.prev_task_link, 2235 sizeof tss_seg.prev_task_link,
2285 ctxt->vcpu, &err); 2236 &ctxt->exception);
2286 if (ret == X86EMUL_PROPAGATE_FAULT) { 2237 if (ret != X86EMUL_CONTINUE)
2287 /* FIXME: need to provide precise fault address */ 2238 /* FIXME: need to provide precise fault address */
2288 emulate_pf(ctxt, new_tss_base, err);
2289 return ret; 2239 return ret;
2290 }
2291 } 2240 }
2292 2241
2293 return load_state_from_tss16(ctxt, ops, &tss_seg); 2242 return load_state_from_tss16(ctxt, ops, &tss_seg);
@@ -2299,7 +2248,7 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2299{ 2248{
2300 struct decode_cache *c = &ctxt->decode; 2249 struct decode_cache *c = &ctxt->decode;
2301 2250
2302 tss->cr3 = ops->get_cr(3, ctxt->vcpu); 2251 tss->cr3 = ops->get_cr(ctxt, 3);
2303 tss->eip = c->eip; 2252 tss->eip = c->eip;
2304 tss->eflags = ctxt->eflags; 2253 tss->eflags = ctxt->eflags;
2305 tss->eax = c->regs[VCPU_REGS_RAX]; 2254 tss->eax = c->regs[VCPU_REGS_RAX];
@@ -2311,13 +2260,13 @@ static void save_state_to_tss32(struct x86_emulate_ctxt *ctxt,
2311 tss->esi = c->regs[VCPU_REGS_RSI]; 2260 tss->esi = c->regs[VCPU_REGS_RSI];
2312 tss->edi = c->regs[VCPU_REGS_RDI]; 2261 tss->edi = c->regs[VCPU_REGS_RDI];
2313 2262
2314 tss->es = ops->get_segment_selector(VCPU_SREG_ES, ctxt->vcpu); 2263 tss->es = get_segment_selector(ctxt, VCPU_SREG_ES);
2315 tss->cs = ops->get_segment_selector(VCPU_SREG_CS, ctxt->vcpu); 2264 tss->cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2316 tss->ss = ops->get_segment_selector(VCPU_SREG_SS, ctxt->vcpu); 2265 tss->ss = get_segment_selector(ctxt, VCPU_SREG_SS);
2317 tss->ds = ops->get_segment_selector(VCPU_SREG_DS, ctxt->vcpu); 2266 tss->ds = get_segment_selector(ctxt, VCPU_SREG_DS);
2318 tss->fs = ops->get_segment_selector(VCPU_SREG_FS, ctxt->vcpu); 2267 tss->fs = get_segment_selector(ctxt, VCPU_SREG_FS);
2319 tss->gs = ops->get_segment_selector(VCPU_SREG_GS, ctxt->vcpu); 2268 tss->gs = get_segment_selector(ctxt, VCPU_SREG_GS);
2320 tss->ldt_selector = ops->get_segment_selector(VCPU_SREG_LDTR, ctxt->vcpu); 2269 tss->ldt_selector = get_segment_selector(ctxt, VCPU_SREG_LDTR);
2321} 2270}
2322 2271
2323static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt, 2272static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
@@ -2327,10 +2276,8 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2327 struct decode_cache *c = &ctxt->decode; 2276 struct decode_cache *c = &ctxt->decode;
2328 int ret; 2277 int ret;
2329 2278
2330 if (ops->set_cr(3, tss->cr3, ctxt->vcpu)) { 2279 if (ops->set_cr(ctxt, 3, tss->cr3))
2331 emulate_gp(ctxt, 0); 2280 return emulate_gp(ctxt, 0);
2332 return X86EMUL_PROPAGATE_FAULT;
2333 }
2334 c->eip = tss->eip; 2281 c->eip = tss->eip;
2335 ctxt->eflags = tss->eflags | 2; 2282 ctxt->eflags = tss->eflags | 2;
2336 c->regs[VCPU_REGS_RAX] = tss->eax; 2283 c->regs[VCPU_REGS_RAX] = tss->eax;
@@ -2346,13 +2293,13 @@ static int load_state_from_tss32(struct x86_emulate_ctxt *ctxt,
2346 * SDM says that segment selectors are loaded before segment 2293 * SDM says that segment selectors are loaded before segment
2347 * descriptors 2294 * descriptors
2348 */ 2295 */
2349 ops->set_segment_selector(tss->ldt_selector, VCPU_SREG_LDTR, ctxt->vcpu); 2296 set_segment_selector(ctxt, tss->ldt_selector, VCPU_SREG_LDTR);
2350 ops->set_segment_selector(tss->es, VCPU_SREG_ES, ctxt->vcpu); 2297 set_segment_selector(ctxt, tss->es, VCPU_SREG_ES);
2351 ops->set_segment_selector(tss->cs, VCPU_SREG_CS, ctxt->vcpu); 2298 set_segment_selector(ctxt, tss->cs, VCPU_SREG_CS);
2352 ops->set_segment_selector(tss->ss, VCPU_SREG_SS, ctxt->vcpu); 2299 set_segment_selector(ctxt, tss->ss, VCPU_SREG_SS);
2353 ops->set_segment_selector(tss->ds, VCPU_SREG_DS, ctxt->vcpu); 2300 set_segment_selector(ctxt, tss->ds, VCPU_SREG_DS);
2354 ops->set_segment_selector(tss->fs, VCPU_SREG_FS, ctxt->vcpu); 2301 set_segment_selector(ctxt, tss->fs, VCPU_SREG_FS);
2355 ops->set_segment_selector(tss->gs, VCPU_SREG_GS, ctxt->vcpu); 2302 set_segment_selector(ctxt, tss->gs, VCPU_SREG_GS);
2356 2303
2357 /* 2304 /*
 2358 * Now load segment descriptors. If a fault happens at this stage 2305
@@ -2390,46 +2337,38 @@ static int task_switch_32(struct x86_emulate_ctxt *ctxt,
2390{ 2337{
2391 struct tss_segment_32 tss_seg; 2338 struct tss_segment_32 tss_seg;
2392 int ret; 2339 int ret;
2393 u32 err, new_tss_base = get_desc_base(new_desc); 2340 u32 new_tss_base = get_desc_base(new_desc);
2394 2341
2395 ret = ops->read_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2342 ret = ops->read_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2396 &err); 2343 &ctxt->exception);
2397 if (ret == X86EMUL_PROPAGATE_FAULT) { 2344 if (ret != X86EMUL_CONTINUE)
2398 /* FIXME: need to provide precise fault address */ 2345 /* FIXME: need to provide precise fault address */
2399 emulate_pf(ctxt, old_tss_base, err);
2400 return ret; 2346 return ret;
2401 }
2402 2347
2403 save_state_to_tss32(ctxt, ops, &tss_seg); 2348 save_state_to_tss32(ctxt, ops, &tss_seg);
2404 2349
2405 ret = ops->write_std(old_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2350 ret = ops->write_std(ctxt, old_tss_base, &tss_seg, sizeof tss_seg,
2406 &err); 2351 &ctxt->exception);
2407 if (ret == X86EMUL_PROPAGATE_FAULT) { 2352 if (ret != X86EMUL_CONTINUE)
2408 /* FIXME: need to provide precise fault address */ 2353 /* FIXME: need to provide precise fault address */
2409 emulate_pf(ctxt, old_tss_base, err);
2410 return ret; 2354 return ret;
2411 }
2412 2355
2413 ret = ops->read_std(new_tss_base, &tss_seg, sizeof tss_seg, ctxt->vcpu, 2356 ret = ops->read_std(ctxt, new_tss_base, &tss_seg, sizeof tss_seg,
2414 &err); 2357 &ctxt->exception);
2415 if (ret == X86EMUL_PROPAGATE_FAULT) { 2358 if (ret != X86EMUL_CONTINUE)
2416 /* FIXME: need to provide precise fault address */ 2359 /* FIXME: need to provide precise fault address */
2417 emulate_pf(ctxt, new_tss_base, err);
2418 return ret; 2360 return ret;
2419 }
2420 2361
2421 if (old_tss_sel != 0xffff) { 2362 if (old_tss_sel != 0xffff) {
2422 tss_seg.prev_task_link = old_tss_sel; 2363 tss_seg.prev_task_link = old_tss_sel;
2423 2364
2424 ret = ops->write_std(new_tss_base, 2365 ret = ops->write_std(ctxt, new_tss_base,
2425 &tss_seg.prev_task_link, 2366 &tss_seg.prev_task_link,
2426 sizeof tss_seg.prev_task_link, 2367 sizeof tss_seg.prev_task_link,
2427 ctxt->vcpu, &err); 2368 &ctxt->exception);
2428 if (ret == X86EMUL_PROPAGATE_FAULT) { 2369 if (ret != X86EMUL_CONTINUE)
2429 /* FIXME: need to provide precise fault address */ 2370 /* FIXME: need to provide precise fault address */
2430 emulate_pf(ctxt, new_tss_base, err);
2431 return ret; 2371 return ret;
2432 }
2433 } 2372 }
2434 2373
2435 return load_state_from_tss32(ctxt, ops, &tss_seg); 2374 return load_state_from_tss32(ctxt, ops, &tss_seg);
@@ -2442,9 +2381,9 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2442{ 2381{
2443 struct desc_struct curr_tss_desc, next_tss_desc; 2382 struct desc_struct curr_tss_desc, next_tss_desc;
2444 int ret; 2383 int ret;
2445 u16 old_tss_sel = ops->get_segment_selector(VCPU_SREG_TR, ctxt->vcpu); 2384 u16 old_tss_sel = get_segment_selector(ctxt, VCPU_SREG_TR);
2446 ulong old_tss_base = 2385 ulong old_tss_base =
2447 ops->get_cached_segment_base(VCPU_SREG_TR, ctxt->vcpu); 2386 ops->get_cached_segment_base(ctxt, VCPU_SREG_TR);
2448 u32 desc_limit; 2387 u32 desc_limit;
2449 2388
2450 /* FIXME: old_tss_base == ~0 ? */ 2389 /* FIXME: old_tss_base == ~0 ? */
@@ -2460,10 +2399,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2460 2399
2461 if (reason != TASK_SWITCH_IRET) { 2400 if (reason != TASK_SWITCH_IRET) {
2462 if ((tss_selector & 3) > next_tss_desc.dpl || 2401 if ((tss_selector & 3) > next_tss_desc.dpl ||
2463 ops->cpl(ctxt->vcpu) > next_tss_desc.dpl) { 2402 ops->cpl(ctxt) > next_tss_desc.dpl)
2464 emulate_gp(ctxt, 0); 2403 return emulate_gp(ctxt, 0);
2465 return X86EMUL_PROPAGATE_FAULT;
2466 }
2467 } 2404 }
2468 2405
2469 desc_limit = desc_limit_scaled(&next_tss_desc); 2406 desc_limit = desc_limit_scaled(&next_tss_desc);
@@ -2506,9 +2443,8 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2506 &next_tss_desc); 2443 &next_tss_desc);
2507 } 2444 }
2508 2445
2509 ops->set_cr(0, ops->get_cr(0, ctxt->vcpu) | X86_CR0_TS, ctxt->vcpu); 2446 ops->set_cr(ctxt, 0, ops->get_cr(ctxt, 0) | X86_CR0_TS);
2510 ops->set_cached_descriptor(&next_tss_desc, VCPU_SREG_TR, ctxt->vcpu); 2447 ops->set_segment(ctxt, tss_selector, &next_tss_desc, 0, VCPU_SREG_TR);
2511 ops->set_segment_selector(tss_selector, VCPU_SREG_TR, ctxt->vcpu);
2512 2448
2513 if (has_error_code) { 2449 if (has_error_code) {
2514 struct decode_cache *c = &ctxt->decode; 2450 struct decode_cache *c = &ctxt->decode;
@@ -2516,17 +2452,17 @@ static int emulator_do_task_switch(struct x86_emulate_ctxt *ctxt,
2516 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2; 2452 c->op_bytes = c->ad_bytes = (next_tss_desc.type & 8) ? 4 : 2;
2517 c->lock_prefix = 0; 2453 c->lock_prefix = 0;
2518 c->src.val = (unsigned long) error_code; 2454 c->src.val = (unsigned long) error_code;
2519 emulate_push(ctxt, ops); 2455 ret = em_push(ctxt);
2520 } 2456 }
2521 2457
2522 return ret; 2458 return ret;
2523} 2459}
2524 2460
2525int emulator_task_switch(struct x86_emulate_ctxt *ctxt, 2461int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2526 struct x86_emulate_ops *ops,
2527 u16 tss_selector, int reason, 2462 u16 tss_selector, int reason,
2528 bool has_error_code, u32 error_code) 2463 bool has_error_code, u32 error_code)
2529{ 2464{
2465 struct x86_emulate_ops *ops = ctxt->ops;
2530 struct decode_cache *c = &ctxt->decode; 2466 struct decode_cache *c = &ctxt->decode;
2531 int rc; 2467 int rc;
2532 2468
@@ -2536,91 +2472,1357 @@ int emulator_task_switch(struct x86_emulate_ctxt *ctxt,
2536 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason, 2472 rc = emulator_do_task_switch(ctxt, ops, tss_selector, reason,
2537 has_error_code, error_code); 2473 has_error_code, error_code);
2538 2474
2539 if (rc == X86EMUL_CONTINUE) { 2475 if (rc == X86EMUL_CONTINUE)
2540 rc = writeback(ctxt, ops); 2476 ctxt->eip = c->eip;
2541 if (rc == X86EMUL_CONTINUE)
2542 ctxt->eip = c->eip;
2543 }
2544 2477
2545 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 2478 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
2546} 2479}
2547 2480
2548static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned long base, 2481static void string_addr_inc(struct x86_emulate_ctxt *ctxt, unsigned seg,
2549 int reg, struct operand *op) 2482 int reg, struct operand *op)
2550{ 2483{
2551 struct decode_cache *c = &ctxt->decode; 2484 struct decode_cache *c = &ctxt->decode;
2552 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1; 2485 int df = (ctxt->eflags & EFLG_DF) ? -1 : 1;
2553 2486
2554 register_address_increment(c, &c->regs[reg], df * op->bytes); 2487 register_address_increment(c, &c->regs[reg], df * op->bytes);
2555 op->ptr = (unsigned long *)register_address(c, base, c->regs[reg]); 2488 op->addr.mem.ea = register_address(c, c->regs[reg]);
2489 op->addr.mem.seg = seg;
2490}
2491
2492static int em_das(struct x86_emulate_ctxt *ctxt)
2493{
2494 struct decode_cache *c = &ctxt->decode;
2495 u8 al, old_al;
2496 bool af, cf, old_cf;
2497
2498 cf = ctxt->eflags & X86_EFLAGS_CF;
2499 al = c->dst.val;
2500
2501 old_al = al;
2502 old_cf = cf;
2503 cf = false;
2504 af = ctxt->eflags & X86_EFLAGS_AF;
2505 if ((al & 0x0f) > 9 || af) {
2506 al -= 6;
2507 cf = old_cf | (al >= 250);
2508 af = true;
2509 } else {
2510 af = false;
2511 }
2512 if (old_al > 0x99 || old_cf) {
2513 al -= 0x60;
2514 cf = true;
2515 }
2516
2517 c->dst.val = al;
2518 /* Set PF, ZF, SF */
2519 c->src.type = OP_IMM;
2520 c->src.val = 0;
2521 c->src.bytes = 1;
2522 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2523 ctxt->eflags &= ~(X86_EFLAGS_AF | X86_EFLAGS_CF);
2524 if (cf)
2525 ctxt->eflags |= X86_EFLAGS_CF;
2526 if (af)
2527 ctxt->eflags |= X86_EFLAGS_AF;
2528 return X86EMUL_CONTINUE;
2529}
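
A note on the BCD adjustment above: em_das() mirrors the hardware DAS rule, fixing the low nibble first and then the high byte while recomputing CF and AF. Below is a minimal user-space sketch of the same two-step adjustment (hypothetical helper name, plain C, no emulator types); it only illustrates how the flags are derived and is not the emulator code itself.

#include <stdbool.h>
#include <stdint.h>

/* Sketch: DAS-style adjust of AL after a packed-BCD subtraction.
 * cf_in/af_in are the incoming CF/AF; the adjusted AL is returned and
 * the resulting CF/AF are reported through the out-parameters. */
static uint8_t das_adjust(uint8_t al, bool cf_in, bool af_in,
			  bool *cf_out, bool *af_out)
{
	uint8_t old_al = al;
	bool cf = false, af = false;

	if ((al & 0x0f) > 9 || af_in) {
		al -= 6;			/* fix the low nibble */
		cf = cf_in || (al >= 250);	/* the subtraction wrapped */
		af = true;
	}
	if (old_al > 0x99 || cf_in) {
		al -= 0x60;			/* fix the high byte */
		cf = true;
	}

	*cf_out = cf;
	*af_out = af;
	return al;
}

The emulator version additionally refreshes PF, ZF and SF by OR-ing the result with zero, which the sketch leaves out.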
2530
2531static int em_call_far(struct x86_emulate_ctxt *ctxt)
2532{
2533 struct decode_cache *c = &ctxt->decode;
2534 u16 sel, old_cs;
2535 ulong old_eip;
2536 int rc;
2537
2538 old_cs = get_segment_selector(ctxt, VCPU_SREG_CS);
2539 old_eip = c->eip;
2540
2541 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2542 if (load_segment_descriptor(ctxt, ctxt->ops, sel, VCPU_SREG_CS))
2543 return X86EMUL_CONTINUE;
2544
2545 c->eip = 0;
2546 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2547
2548 c->src.val = old_cs;
2549 rc = em_push(ctxt);
2550 if (rc != X86EMUL_CONTINUE)
2551 return rc;
2552
2553 c->src.val = old_eip;
2554 return em_push(ctxt);
2555}
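
For readers less familiar with far calls, em_call_far() follows the usual CALL ptr16:32 sequence: load the new CS first (skipping the pushes if the CS load does not succeed), then push the old CS followed by the old IP so a far return can restore both. A toy sketch of that sequence is shown below; the struct and field names are invented for illustration and nothing here is emulator code.

#include <stdint.h>

struct toy_cpu {
	uint16_t cs;
	uint32_t ip;
	unsigned int sp;		/* index into stack[]; assumed
					 * initialised to the array size */
	uint32_t stack[64];
};

/* Sketch of CALL FAR semantics on a toy machine. */
static void toy_call_far(struct toy_cpu *cpu, uint16_t new_cs, uint32_t new_ip)
{
	uint16_t old_cs = cpu->cs;
	uint32_t old_ip = cpu->ip;

	cpu->cs = new_cs;			/* new code segment first */
	cpu->ip = new_ip;

	cpu->stack[--cpu->sp] = old_cs;		/* push return CS ... */
	cpu->stack[--cpu->sp] = old_ip;		/* ... then return IP */
}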
2556
2557static int em_ret_near_imm(struct x86_emulate_ctxt *ctxt)
2558{
2559 struct decode_cache *c = &ctxt->decode;
2560 int rc;
2561
2562 c->dst.type = OP_REG;
2563 c->dst.addr.reg = &c->eip;
2564 c->dst.bytes = c->op_bytes;
2565 rc = emulate_pop(ctxt, &c->dst.val, c->op_bytes);
2566 if (rc != X86EMUL_CONTINUE)
2567 return rc;
2568 register_address_increment(c, &c->regs[VCPU_REGS_RSP], c->src.val);
2569 return X86EMUL_CONTINUE;
2570}
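
em_ret_near_imm() is the RET imm16 form: pop the return address into EIP through the normal pop path, then bump RSP by the immediate to discard the callee-cleaned argument area. A tiny standalone sketch of that rule (hypothetical names, little-endian host assumed, SP kept as a byte offset):

#include <stdint.h>
#include <string.h>

/* Sketch of RET imm16: pop the return address, then release 'imm'
 * extra bytes of stack. */
static void toy_ret_imm(uint32_t *ip, uint32_t *sp,
			const uint8_t *stack_bytes, uint16_t imm)
{
	memcpy(ip, stack_bytes + *sp, sizeof(*ip));	/* pop return EIP */
	*sp += sizeof(*ip);				/* account for the pop */
	*sp += imm;					/* drop the argument bytes */
}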
2571
2572static int em_add(struct x86_emulate_ctxt *ctxt)
2573{
2574 struct decode_cache *c = &ctxt->decode;
2575
2576 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2577 return X86EMUL_CONTINUE;
2578}
2579
2580static int em_or(struct x86_emulate_ctxt *ctxt)
2581{
2582 struct decode_cache *c = &ctxt->decode;
2583
2584 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2585 return X86EMUL_CONTINUE;
2586}
2587
2588static int em_adc(struct x86_emulate_ctxt *ctxt)
2589{
2590 struct decode_cache *c = &ctxt->decode;
2591
2592 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2593 return X86EMUL_CONTINUE;
2594}
2595
2596static int em_sbb(struct x86_emulate_ctxt *ctxt)
2597{
2598 struct decode_cache *c = &ctxt->decode;
2599
2600 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2601 return X86EMUL_CONTINUE;
2602}
2603
2604static int em_and(struct x86_emulate_ctxt *ctxt)
2605{
2606 struct decode_cache *c = &ctxt->decode;
2607
2608 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2609 return X86EMUL_CONTINUE;
2610}
2611
2612static int em_sub(struct x86_emulate_ctxt *ctxt)
2613{
2614 struct decode_cache *c = &ctxt->decode;
2615
2616 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2617 return X86EMUL_CONTINUE;
2618}
2619
2620static int em_xor(struct x86_emulate_ctxt *ctxt)
2621{
2622 struct decode_cache *c = &ctxt->decode;
2623
2624 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2625 return X86EMUL_CONTINUE;
2626}
2627
2628static int em_cmp(struct x86_emulate_ctxt *ctxt)
2629{
2630 struct decode_cache *c = &ctxt->decode;
2631
2632 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2633 /* Disable writeback. */
2634 c->dst.type = OP_NONE;
2635 return X86EMUL_CONTINUE;
2636}
2637
2638static int em_imul(struct x86_emulate_ctxt *ctxt)
2639{
2640 struct decode_cache *c = &ctxt->decode;
2641
2642 emulate_2op_SrcV_nobyte("imul", c->src, c->dst, ctxt->eflags);
2643 return X86EMUL_CONTINUE;
2644}
2645
2646static int em_imul_3op(struct x86_emulate_ctxt *ctxt)
2647{
2648 struct decode_cache *c = &ctxt->decode;
2649
2650 c->dst.val = c->src2.val;
2651 return em_imul(ctxt);
2652}
2653
2654static int em_cwd(struct x86_emulate_ctxt *ctxt)
2655{
2656 struct decode_cache *c = &ctxt->decode;
2657
2658 c->dst.type = OP_REG;
2659 c->dst.bytes = c->src.bytes;
2660 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
2661 c->dst.val = ~((c->src.val >> (c->src.bytes * 8 - 1)) - 1);
2662
2663 return X86EMUL_CONTINUE;
2664}
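
The expression in em_cwd() is a branch-free way of replicating the sign bit of the source across the whole destination, which is what CWD/CDQ/CQO need. A small sketch of the same trick (hypothetical helper, plain C), assuming the bits above the operand width are already clear as the emulator's operand masking guarantees:

#include <stdint.h>

/* All ones if the top bit of 'val' within 'bytes' bytes is set, else 0. */
static uint64_t sign_fill(uint64_t val, unsigned int bytes)
{
	uint64_t sign_bit = val >> (bytes * 8 - 1);	/* 0 or 1 */

	return ~(sign_bit - 1);		/* sign 1 -> all ones, sign 0 -> 0 */
}

In the emulator the result lands in RDX, producing the DX:AX, EDX:EAX or RDX:RAX pair.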
2665
2666static int em_rdtsc(struct x86_emulate_ctxt *ctxt)
2667{
2668 struct decode_cache *c = &ctxt->decode;
2669 u64 tsc = 0;
2670
2671 ctxt->ops->get_msr(ctxt, MSR_IA32_TSC, &tsc);
2672 c->regs[VCPU_REGS_RAX] = (u32)tsc;
2673 c->regs[VCPU_REGS_RDX] = tsc >> 32;
2674 return X86EMUL_CONTINUE;
2675}
2676
2677static int em_mov(struct x86_emulate_ctxt *ctxt)
2678{
2679 struct decode_cache *c = &ctxt->decode;
2680 c->dst.val = c->src.val;
2681 return X86EMUL_CONTINUE;
2682}
2683
2684static int em_movdqu(struct x86_emulate_ctxt *ctxt)
2685{
2686 struct decode_cache *c = &ctxt->decode;
2687 memcpy(&c->dst.vec_val, &c->src.vec_val, c->op_bytes);
2688 return X86EMUL_CONTINUE;
2689}
2690
2691static int em_invlpg(struct x86_emulate_ctxt *ctxt)
2692{
2693 struct decode_cache *c = &ctxt->decode;
2694 int rc;
2695 ulong linear;
2696
2697 rc = linearize(ctxt, c->src.addr.mem, 1, false, &linear);
2698 if (rc == X86EMUL_CONTINUE)
2699 ctxt->ops->invlpg(ctxt, linear);
2700 /* Disable writeback. */
2701 c->dst.type = OP_NONE;
2702 return X86EMUL_CONTINUE;
2703}
2704
2705static int em_clts(struct x86_emulate_ctxt *ctxt)
2706{
2707 ulong cr0;
2708
2709 cr0 = ctxt->ops->get_cr(ctxt, 0);
2710 cr0 &= ~X86_CR0_TS;
2711 ctxt->ops->set_cr(ctxt, 0, cr0);
2712 return X86EMUL_CONTINUE;
2713}
2714
2715static int em_vmcall(struct x86_emulate_ctxt *ctxt)
2716{
2717 struct decode_cache *c = &ctxt->decode;
2718 int rc;
2719
2720 if (c->modrm_mod != 3 || c->modrm_rm != 1)
2721 return X86EMUL_UNHANDLEABLE;
2722
2723 rc = ctxt->ops->fix_hypercall(ctxt);
2724 if (rc != X86EMUL_CONTINUE)
2725 return rc;
2726
2727 /* Let the processor re-execute the fixed hypercall */
2728 c->eip = ctxt->eip;
2729 /* Disable writeback. */
2730 c->dst.type = OP_NONE;
2731 return X86EMUL_CONTINUE;
2732}
2733
2734static int em_lgdt(struct x86_emulate_ctxt *ctxt)
2735{
2736 struct decode_cache *c = &ctxt->decode;
2737 struct desc_ptr desc_ptr;
2738 int rc;
2739
2740 rc = read_descriptor(ctxt, c->src.addr.mem,
2741 &desc_ptr.size, &desc_ptr.address,
2742 c->op_bytes);
2743 if (rc != X86EMUL_CONTINUE)
2744 return rc;
2745 ctxt->ops->set_gdt(ctxt, &desc_ptr);
2746 /* Disable writeback. */
2747 c->dst.type = OP_NONE;
2748 return X86EMUL_CONTINUE;
2749}
2750
2751static int em_vmmcall(struct x86_emulate_ctxt *ctxt)
2752{
2753 struct decode_cache *c = &ctxt->decode;
2754 int rc;
2755
2756 rc = ctxt->ops->fix_hypercall(ctxt);
2757
2758 /* Disable writeback. */
2759 c->dst.type = OP_NONE;
2760 return rc;
2761}
2762
2763static int em_lidt(struct x86_emulate_ctxt *ctxt)
2764{
2765 struct decode_cache *c = &ctxt->decode;
2766 struct desc_ptr desc_ptr;
2767 int rc;
2768
2769 rc = read_descriptor(ctxt, c->src.addr.mem,
2770 &desc_ptr.size, &desc_ptr.address,
2771 c->op_bytes);
2772 if (rc != X86EMUL_CONTINUE)
2773 return rc;
2774 ctxt->ops->set_idt(ctxt, &desc_ptr);
2775 /* Disable writeback. */
2776 c->dst.type = OP_NONE;
2777 return X86EMUL_CONTINUE;
2778}
2779
2780static int em_smsw(struct x86_emulate_ctxt *ctxt)
2781{
2782 struct decode_cache *c = &ctxt->decode;
2783
2784 c->dst.bytes = 2;
2785 c->dst.val = ctxt->ops->get_cr(ctxt, 0);
2786 return X86EMUL_CONTINUE;
2787}
2788
2789static int em_lmsw(struct x86_emulate_ctxt *ctxt)
2790{
2791 struct decode_cache *c = &ctxt->decode;
2792 ctxt->ops->set_cr(ctxt, 0, (ctxt->ops->get_cr(ctxt, 0) & ~0x0eul)
2793 | (c->src.val & 0x0f));
2794 c->dst.type = OP_NONE;
2795 return X86EMUL_CONTINUE;
2796}
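
The masking in em_lmsw() encodes the architectural LMSW rule: only the low four CR0 bits (PE, MP, EM, TS) are written, and PE can be set but never cleared, because the old PE bit is deliberately left in place before the OR. The same update rule as a one-line sketch (placeholder name, not emulator code):

#include <stdint.h>

/* LMSW update rule: write CR0 bits 0-3 only; PE may be set, never cleared. */
static uint64_t lmsw_new_cr0(uint64_t cr0, uint16_t src)
{
	return (cr0 & ~0x0eULL)		/* clear MP/EM/TS only; PE survives */
	       | (src & 0x0f);		/* fold in the source's low nibble */
}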
2797
2798static bool valid_cr(int nr)
2799{
2800 switch (nr) {
2801 case 0:
2802 case 2 ... 4:
2803 case 8:
2804 return true;
2805 default:
2806 return false;
2807 }
2808}
2809
2810static int check_cr_read(struct x86_emulate_ctxt *ctxt)
2811{
2812 struct decode_cache *c = &ctxt->decode;
2813
2814 if (!valid_cr(c->modrm_reg))
2815 return emulate_ud(ctxt);
2816
2817 return X86EMUL_CONTINUE;
2818}
2819
2820static int check_cr_write(struct x86_emulate_ctxt *ctxt)
2821{
2822 struct decode_cache *c = &ctxt->decode;
2823 u64 new_val = c->src.val64;
2824 int cr = c->modrm_reg;
2825 u64 efer = 0;
2826
2827 static u64 cr_reserved_bits[] = {
2828 0xffffffff00000000ULL,
2829 0, 0, 0, /* CR3 checked later */
2830 CR4_RESERVED_BITS,
2831 0, 0, 0,
2832 CR8_RESERVED_BITS,
2833 };
2834
2835 if (!valid_cr(cr))
2836 return emulate_ud(ctxt);
2837
2838 if (new_val & cr_reserved_bits[cr])
2839 return emulate_gp(ctxt, 0);
2840
2841 switch (cr) {
2842 case 0: {
2843 u64 cr4;
2844 if (((new_val & X86_CR0_PG) && !(new_val & X86_CR0_PE)) ||
2845 ((new_val & X86_CR0_NW) && !(new_val & X86_CR0_CD)))
2846 return emulate_gp(ctxt, 0);
2847
2848 cr4 = ctxt->ops->get_cr(ctxt, 4);
2849 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2850
2851 if ((new_val & X86_CR0_PG) && (efer & EFER_LME) &&
2852 !(cr4 & X86_CR4_PAE))
2853 return emulate_gp(ctxt, 0);
2854
2855 break;
2856 }
2857 case 3: {
2858 u64 rsvd = 0;
2859
2860 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2861 if (efer & EFER_LMA)
2862 rsvd = CR3_L_MODE_RESERVED_BITS;
2863 else if (ctxt->ops->get_cr(ctxt, 4) & X86_CR4_PAE)
2864 rsvd = CR3_PAE_RESERVED_BITS;
2865 else if (ctxt->ops->get_cr(ctxt, 0) & X86_CR0_PG)
2866 rsvd = CR3_NONPAE_RESERVED_BITS;
2867
2868 if (new_val & rsvd)
2869 return emulate_gp(ctxt, 0);
2870
2871 break;
2872 }
2873 case 4: {
2874 u64 cr4;
2875
2876 cr4 = ctxt->ops->get_cr(ctxt, 4);
2877 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2878
2879 if ((efer & EFER_LMA) && !(new_val & X86_CR4_PAE))
2880 return emulate_gp(ctxt, 0);
2881
2882 break;
2883 }
2884 }
2885
2886 return X86EMUL_CONTINUE;
2887}
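
check_cr_write() folds several architectural rules into one pre-execute hook: a 1 in any reserved bit of the new value raises #GP(0), and CR0, CR3 and CR4 get extra consistency checks on top of that. The sketch below restates only the two CR0 consistency rules, with placeholder bit names rather than the kernel's X86_CR0_* constants; it is an illustration of the checks, not the hook itself.

#include <stdbool.h>
#include <stdint.h>

#define SK_CR0_PE (1ULL << 0)
#define SK_CR0_NW (1ULL << 29)
#define SK_CR0_CD (1ULL << 30)
#define SK_CR0_PG (1ULL << 31)

/* Would this CR0 value fault on a MOV to CR0? */
static bool cr0_write_faults(uint64_t new_val)
{
	if ((new_val & SK_CR0_PG) && !(new_val & SK_CR0_PE))
		return true;	/* paging without protected mode */
	if ((new_val & SK_CR0_NW) && !(new_val & SK_CR0_CD))
		return true;	/* not-write-through without cache-disable */
	return false;
}

The additional test in the hook (PG being set while EFER.LME is on requires CR4.PAE) covers the transition into long mode.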
2888
2889static int check_dr7_gd(struct x86_emulate_ctxt *ctxt)
2890{
2891 unsigned long dr7;
2892
2893 ctxt->ops->get_dr(ctxt, 7, &dr7);
2894
2895 /* Check if DR7.Global_Enable is set */
2896 return dr7 & (1 << 13);
2897}
2898
2899static int check_dr_read(struct x86_emulate_ctxt *ctxt)
2900{
2901 struct decode_cache *c = &ctxt->decode;
2902 int dr = c->modrm_reg;
2903 u64 cr4;
2904
2905 if (dr > 7)
2906 return emulate_ud(ctxt);
2907
2908 cr4 = ctxt->ops->get_cr(ctxt, 4);
2909 if ((cr4 & X86_CR4_DE) && (dr == 4 || dr == 5))
2910 return emulate_ud(ctxt);
2911
2912 if (check_dr7_gd(ctxt))
2913 return emulate_db(ctxt);
2914
2915 return X86EMUL_CONTINUE;
2916}
2917
2918static int check_dr_write(struct x86_emulate_ctxt *ctxt)
2919{
2920 struct decode_cache *c = &ctxt->decode;
2921 u64 new_val = c->src.val64;
2922 int dr = c->modrm_reg;
2923
2924 if ((dr == 6 || dr == 7) && (new_val & 0xffffffff00000000ULL))
2925 return emulate_gp(ctxt, 0);
2926
2927 return check_dr_read(ctxt);
2928}
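
The two debug-register hooks above encode the access rules: an out-of-range register or a DR4/DR5 access with CR4.DE set is #UD, DR7.GD turns any debug-register access into #DB, and writes to DR6/DR7 with any of the upper 32 bits set are #GP(0). A compact restatement as a single helper (hypothetical enum and parameter names; the write-only test comes first, as in check_dr_write above):

#include <stdbool.h>
#include <stdint.h>

enum dr_fault { DR_OK, DR_UD, DR_DB, DR_GP };

static enum dr_fault dr_access_check(int dr, bool cr4_de, bool dr7_gd,
				     bool is_write, uint64_t new_val)
{
	if (is_write && (dr == 6 || dr == 7) &&
	    (new_val & 0xffffffff00000000ULL))
		return DR_GP;		/* upper 32 bits of DR6/DR7 are reserved */
	if (dr > 7)
		return DR_UD;
	if (cr4_de && (dr == 4 || dr == 5))
		return DR_UD;		/* legacy DR4/DR5 aliases disabled */
	if (dr7_gd)
		return DR_DB;		/* general-detect debug exception */
	return DR_OK;
}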
2929
2930static int check_svme(struct x86_emulate_ctxt *ctxt)
2931{
2932 u64 efer;
2933
2934 ctxt->ops->get_msr(ctxt, MSR_EFER, &efer);
2935
2936 if (!(efer & EFER_SVME))
2937 return emulate_ud(ctxt);
2938
2939 return X86EMUL_CONTINUE;
2940}
2941
2942static int check_svme_pa(struct x86_emulate_ctxt *ctxt)
2943{
2944 u64 rax = ctxt->decode.regs[VCPU_REGS_RAX];
2945
2946 /* Valid physical address? */
2947 if (rax & 0xffff000000000000ULL)
2948 return emulate_gp(ctxt, 0);
2949
2950 return check_svme(ctxt);
2951}
2952
2953static int check_rdtsc(struct x86_emulate_ctxt *ctxt)
2954{
2955 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2956
2957 if (cr4 & X86_CR4_TSD && ctxt->ops->cpl(ctxt))
2958 return emulate_ud(ctxt);
2959
2960 return X86EMUL_CONTINUE;
2961}
2962
2963static int check_rdpmc(struct x86_emulate_ctxt *ctxt)
2964{
2965 u64 cr4 = ctxt->ops->get_cr(ctxt, 4);
2966 u64 rcx = ctxt->decode.regs[VCPU_REGS_RCX];
2967
2968 if ((!(cr4 & X86_CR4_PCE) && ctxt->ops->cpl(ctxt)) ||
2969 (rcx > 3))
2970 return emulate_gp(ctxt, 0);
2971
2972 return X86EMUL_CONTINUE;
2973}
2974
2975static int check_perm_in(struct x86_emulate_ctxt *ctxt)
2976{
2977 struct decode_cache *c = &ctxt->decode;
2978
2979 c->dst.bytes = min(c->dst.bytes, 4u);
2980 if (!emulator_io_permited(ctxt, ctxt->ops, c->src.val, c->dst.bytes))
2981 return emulate_gp(ctxt, 0);
2982
2983 return X86EMUL_CONTINUE;
2984}
2985
2986static int check_perm_out(struct x86_emulate_ctxt *ctxt)
2987{
2988 struct decode_cache *c = &ctxt->decode;
2989
2990 c->src.bytes = min(c->src.bytes, 4u);
2991 if (!emulator_io_permited(ctxt, ctxt->ops, c->dst.val, c->src.bytes))
2992 return emulate_gp(ctxt, 0);
2993
2994 return X86EMUL_CONTINUE;
2995}
2996
2997#define D(_y) { .flags = (_y) }
2998#define DI(_y, _i) { .flags = (_y), .intercept = x86_intercept_##_i }
2999#define DIP(_y, _i, _p) { .flags = (_y), .intercept = x86_intercept_##_i, \
3000 .check_perm = (_p) }
3001#define N D(0)
3002#define EXT(_f, _e) { .flags = ((_f) | RMExt), .u.group = (_e) }
3003#define G(_f, _g) { .flags = ((_f) | Group), .u.group = (_g) }
3004#define GD(_f, _g) { .flags = ((_f) | GroupDual), .u.gdual = (_g) }
3005#define I(_f, _e) { .flags = (_f), .u.execute = (_e) }
3006#define II(_f, _e, _i) \
3007 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i }
3008#define IIP(_f, _e, _i, _p) \
3009 { .flags = (_f), .u.execute = (_e), .intercept = x86_intercept_##_i, \
3010 .check_perm = (_p) }
3011#define GP(_f, _g) { .flags = ((_f) | Prefix), .u.gprefix = (_g) }
3012
3013#define D2bv(_f) D((_f) | ByteOp), D(_f)
3014#define D2bvIP(_f, _i, _p) DIP((_f) | ByteOp, _i, _p), DIP(_f, _i, _p)
3015#define I2bv(_f, _e) I((_f) | ByteOp, _e), I(_f, _e)
3016
3017#define I6ALU(_f, _e) I2bv((_f) | DstMem | SrcReg | ModRM, _e), \
3018 I2bv(((_f) | DstReg | SrcMem | ModRM) & ~Lock, _e), \
3019 I2bv(((_f) & ~Lock) | DstAcc | SrcImm, _e)
3020
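
The block of macros above is what keeps the decode tables below readable: each macro expands to a struct opcode initializer, with D() carrying decode flags only, I()/II() adding an execute callback, N marking an undefined slot, and G/GD/EXT/GP redirecting to a nested table. The self-contained sketch below shows the shape of such a table-driven dispatcher with made-up names; it is not the emulator's struct opcode.

struct sk_opcode {
	unsigned int flags;
	int (*execute)(void *ctxt);
};

#define SK_D(f)		{ .flags = (f) }
#define SK_I(f, e)	{ .flags = (f), .execute = (e) }
#define SK_N		SK_D(0)

static int sk_nop(void *ctxt) { (void)ctxt; return 0; }

static const struct sk_opcode sk_table[4] = {
	SK_I(0x1, sk_nop),	/* slot 0: flags plus a callback */
	SK_D(0x2),		/* slot 1: flags only, handled by a switch */
	SK_N,			/* slot 2: undefined opcode */
	SK_N,
};

static int sk_dispatch(unsigned char b, void *ctxt)
{
	const struct sk_opcode *op = &sk_table[b & 3];

	if (op->flags == 0)
		return -1;		/* undefined: refuse to emulate */
	if (op->execute)
		return op->execute(ctxt);
	return 0;			/* fall back to the big switch */
}

In the real tables the group forms select a further eight-entry table with the ModRM reg or rm bits, and GP() selects by SIMD prefix.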
3021static struct opcode group7_rm1[] = {
3022 DI(SrcNone | ModRM | Priv, monitor),
3023 DI(SrcNone | ModRM | Priv, mwait),
3024 N, N, N, N, N, N,
3025};
3026
3027static struct opcode group7_rm3[] = {
3028 DIP(SrcNone | ModRM | Prot | Priv, vmrun, check_svme_pa),
3029 II(SrcNone | ModRM | Prot | VendorSpecific, em_vmmcall, vmmcall),
3030 DIP(SrcNone | ModRM | Prot | Priv, vmload, check_svme_pa),
3031 DIP(SrcNone | ModRM | Prot | Priv, vmsave, check_svme_pa),
3032 DIP(SrcNone | ModRM | Prot | Priv, stgi, check_svme),
3033 DIP(SrcNone | ModRM | Prot | Priv, clgi, check_svme),
3034 DIP(SrcNone | ModRM | Prot | Priv, skinit, check_svme),
3035 DIP(SrcNone | ModRM | Prot | Priv, invlpga, check_svme),
3036};
3037
3038static struct opcode group7_rm7[] = {
3039 N,
3040 DIP(SrcNone | ModRM, rdtscp, check_rdtsc),
3041 N, N, N, N, N, N,
3042};
3043
3044static struct opcode group1[] = {
3045 I(Lock, em_add),
3046 I(Lock, em_or),
3047 I(Lock, em_adc),
3048 I(Lock, em_sbb),
3049 I(Lock, em_and),
3050 I(Lock, em_sub),
3051 I(Lock, em_xor),
3052 I(0, em_cmp),
3053};
3054
3055static struct opcode group1A[] = {
3056 D(DstMem | SrcNone | ModRM | Mov | Stack), N, N, N, N, N, N, N,
3057};
3058
3059static struct opcode group3[] = {
3060 D(DstMem | SrcImm | ModRM), D(DstMem | SrcImm | ModRM),
3061 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
3062 X4(D(SrcMem | ModRM)),
3063};
3064
3065static struct opcode group4[] = {
3066 D(ByteOp | DstMem | SrcNone | ModRM | Lock), D(ByteOp | DstMem | SrcNone | ModRM | Lock),
3067 N, N, N, N, N, N,
3068};
3069
3070static struct opcode group5[] = {
3071 D(DstMem | SrcNone | ModRM | Lock), D(DstMem | SrcNone | ModRM | Lock),
3072 D(SrcMem | ModRM | Stack),
3073 I(SrcMemFAddr | ModRM | ImplicitOps | Stack, em_call_far),
3074 D(SrcMem | ModRM | Stack), D(SrcMemFAddr | ModRM | ImplicitOps),
3075 D(SrcMem | ModRM | Stack), N,
3076};
3077
3078static struct opcode group6[] = {
3079 DI(ModRM | Prot, sldt),
3080 DI(ModRM | Prot, str),
3081 DI(ModRM | Prot | Priv, lldt),
3082 DI(ModRM | Prot | Priv, ltr),
3083 N, N, N, N,
3084};
3085
3086static struct group_dual group7 = { {
3087 DI(ModRM | Mov | DstMem | Priv, sgdt),
3088 DI(ModRM | Mov | DstMem | Priv, sidt),
3089 II(ModRM | SrcMem | Priv, em_lgdt, lgdt),
3090 II(ModRM | SrcMem | Priv, em_lidt, lidt),
3091 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3092 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw),
3093 II(SrcMem | ModRM | ByteOp | Priv | NoAccess, em_invlpg, invlpg),
3094}, {
3095 I(SrcNone | ModRM | Priv | VendorSpecific, em_vmcall),
3096 EXT(0, group7_rm1),
3097 N, EXT(0, group7_rm3),
3098 II(SrcNone | ModRM | DstMem | Mov, em_smsw, smsw), N,
3099 II(SrcMem16 | ModRM | Mov | Priv, em_lmsw, lmsw), EXT(0, group7_rm7),
3100} };
3101
3102static struct opcode group8[] = {
3103 N, N, N, N,
3104 D(DstMem | SrcImmByte | ModRM), D(DstMem | SrcImmByte | ModRM | Lock),
3105 D(DstMem | SrcImmByte | ModRM | Lock), D(DstMem | SrcImmByte | ModRM | Lock),
3106};
3107
3108static struct group_dual group9 = { {
3109 N, D(DstMem64 | ModRM | Lock), N, N, N, N, N, N,
3110}, {
3111 N, N, N, N, N, N, N, N,
3112} };
3113
3114static struct opcode group11[] = {
3115 I(DstMem | SrcImm | ModRM | Mov, em_mov), X7(D(Undefined)),
3116};
3117
3118static struct gprefix pfx_0f_6f_0f_7f = {
3119 N, N, N, I(Sse, em_movdqu),
3120};
3121
3122static struct opcode opcode_table[256] = {
3123 /* 0x00 - 0x07 */
3124 I6ALU(Lock, em_add),
3125 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3126 /* 0x08 - 0x0F */
3127 I6ALU(Lock, em_or),
3128 D(ImplicitOps | Stack | No64), N,
3129 /* 0x10 - 0x17 */
3130 I6ALU(Lock, em_adc),
3131 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3132 /* 0x18 - 0x1F */
3133 I6ALU(Lock, em_sbb),
3134 D(ImplicitOps | Stack | No64), D(ImplicitOps | Stack | No64),
3135 /* 0x20 - 0x27 */
3136 I6ALU(Lock, em_and), N, N,
3137 /* 0x28 - 0x2F */
3138 I6ALU(Lock, em_sub), N, I(ByteOp | DstAcc | No64, em_das),
3139 /* 0x30 - 0x37 */
3140 I6ALU(Lock, em_xor), N, N,
3141 /* 0x38 - 0x3F */
3142 I6ALU(0, em_cmp), N, N,
3143 /* 0x40 - 0x4F */
3144 X16(D(DstReg)),
3145 /* 0x50 - 0x57 */
3146 X8(I(SrcReg | Stack, em_push)),
3147 /* 0x58 - 0x5F */
3148 X8(I(DstReg | Stack, em_pop)),
3149 /* 0x60 - 0x67 */
3150 I(ImplicitOps | Stack | No64, em_pusha),
3151 I(ImplicitOps | Stack | No64, em_popa),
3152 N, D(DstReg | SrcMem32 | ModRM | Mov) /* movsxd (x86/64) */ ,
3153 N, N, N, N,
3154 /* 0x68 - 0x6F */
3155 I(SrcImm | Mov | Stack, em_push),
3156 I(DstReg | SrcMem | ModRM | Src2Imm, em_imul_3op),
3157 I(SrcImmByte | Mov | Stack, em_push),
3158 I(DstReg | SrcMem | ModRM | Src2ImmByte, em_imul_3op),
3159 D2bvIP(DstDI | SrcDX | Mov | String, ins, check_perm_in), /* insb, insw/insd */
3160 D2bvIP(SrcSI | DstDX | String, outs, check_perm_out), /* outsb, outsw/outsd */
3161 /* 0x70 - 0x7F */
3162 X16(D(SrcImmByte)),
3163 /* 0x80 - 0x87 */
3164 G(ByteOp | DstMem | SrcImm | ModRM | Group, group1),
3165 G(DstMem | SrcImm | ModRM | Group, group1),
3166 G(ByteOp | DstMem | SrcImm | ModRM | No64 | Group, group1),
3167 G(DstMem | SrcImmByte | ModRM | Group, group1),
3168 D2bv(DstMem | SrcReg | ModRM), D2bv(DstMem | SrcReg | ModRM | Lock),
3169 /* 0x88 - 0x8F */
3170 I2bv(DstMem | SrcReg | ModRM | Mov, em_mov),
3171 I2bv(DstReg | SrcMem | ModRM | Mov, em_mov),
3172 D(DstMem | SrcNone | ModRM | Mov), D(ModRM | SrcMem | NoAccess | DstReg),
3173 D(ImplicitOps | SrcMem16 | ModRM), G(0, group1A),
3174 /* 0x90 - 0x97 */
3175 DI(SrcAcc | DstReg, pause), X7(D(SrcAcc | DstReg)),
3176 /* 0x98 - 0x9F */
3177 D(DstAcc | SrcNone), I(ImplicitOps | SrcAcc, em_cwd),
3178 I(SrcImmFAddr | No64, em_call_far), N,
3179 II(ImplicitOps | Stack, em_pushf, pushf),
3180 II(ImplicitOps | Stack, em_popf, popf), N, N,
3181 /* 0xA0 - 0xA7 */
3182 I2bv(DstAcc | SrcMem | Mov | MemAbs, em_mov),
3183 I2bv(DstMem | SrcAcc | Mov | MemAbs, em_mov),
3184 I2bv(SrcSI | DstDI | Mov | String, em_mov),
3185 I2bv(SrcSI | DstDI | String, em_cmp),
3186 /* 0xA8 - 0xAF */
3187 D2bv(DstAcc | SrcImm),
3188 I2bv(SrcAcc | DstDI | Mov | String, em_mov),
3189 I2bv(SrcSI | DstAcc | Mov | String, em_mov),
3190 I2bv(SrcAcc | DstDI | String, em_cmp),
3191 /* 0xB0 - 0xB7 */
3192 X8(I(ByteOp | DstReg | SrcImm | Mov, em_mov)),
3193 /* 0xB8 - 0xBF */
3194 X8(I(DstReg | SrcImm | Mov, em_mov)),
3195 /* 0xC0 - 0xC7 */
3196 D2bv(DstMem | SrcImmByte | ModRM),
3197 I(ImplicitOps | Stack | SrcImmU16, em_ret_near_imm),
3198 D(ImplicitOps | Stack),
3199 D(DstReg | SrcMemFAddr | ModRM | No64), D(DstReg | SrcMemFAddr | ModRM | No64),
3200 G(ByteOp, group11), G(0, group11),
3201 /* 0xC8 - 0xCF */
3202 N, N, N, D(ImplicitOps | Stack),
3203 D(ImplicitOps), DI(SrcImmByte, intn),
3204 D(ImplicitOps | No64), DI(ImplicitOps, iret),
3205 /* 0xD0 - 0xD7 */
3206 D2bv(DstMem | SrcOne | ModRM), D2bv(DstMem | ModRM),
3207 N, N, N, N,
3208 /* 0xD8 - 0xDF */
3209 N, N, N, N, N, N, N, N,
3210 /* 0xE0 - 0xE7 */
3211 X4(D(SrcImmByte)),
3212 D2bvIP(SrcImmUByte | DstAcc, in, check_perm_in),
3213 D2bvIP(SrcAcc | DstImmUByte, out, check_perm_out),
3214 /* 0xE8 - 0xEF */
3215 D(SrcImm | Stack), D(SrcImm | ImplicitOps),
3216 D(SrcImmFAddr | No64), D(SrcImmByte | ImplicitOps),
3217 D2bvIP(SrcDX | DstAcc, in, check_perm_in),
3218 D2bvIP(SrcAcc | DstDX, out, check_perm_out),
3219 /* 0xF0 - 0xF7 */
3220 N, DI(ImplicitOps, icebp), N, N,
3221 DI(ImplicitOps | Priv, hlt), D(ImplicitOps),
3222 G(ByteOp, group3), G(0, group3),
3223 /* 0xF8 - 0xFF */
3224 D(ImplicitOps), D(ImplicitOps), D(ImplicitOps), D(ImplicitOps),
3225 D(ImplicitOps), D(ImplicitOps), G(0, group4), G(0, group5),
3226};
3227
3228static struct opcode twobyte_table[256] = {
3229 /* 0x00 - 0x0F */
3230 G(0, group6), GD(0, &group7), N, N,
3231 N, D(ImplicitOps | VendorSpecific), DI(ImplicitOps | Priv, clts), N,
3232 DI(ImplicitOps | Priv, invd), DI(ImplicitOps | Priv, wbinvd), N, N,
3233 N, D(ImplicitOps | ModRM), N, N,
3234 /* 0x10 - 0x1F */
3235 N, N, N, N, N, N, N, N, D(ImplicitOps | ModRM), N, N, N, N, N, N, N,
3236 /* 0x20 - 0x2F */
3237 DIP(ModRM | DstMem | Priv | Op3264, cr_read, check_cr_read),
3238 DIP(ModRM | DstMem | Priv | Op3264, dr_read, check_dr_read),
3239 DIP(ModRM | SrcMem | Priv | Op3264, cr_write, check_cr_write),
3240 DIP(ModRM | SrcMem | Priv | Op3264, dr_write, check_dr_write),
3241 N, N, N, N,
3242 N, N, N, N, N, N, N, N,
3243 /* 0x30 - 0x3F */
3244 DI(ImplicitOps | Priv, wrmsr),
3245 IIP(ImplicitOps, em_rdtsc, rdtsc, check_rdtsc),
3246 DI(ImplicitOps | Priv, rdmsr),
3247 DIP(ImplicitOps | Priv, rdpmc, check_rdpmc),
3248 D(ImplicitOps | VendorSpecific), D(ImplicitOps | Priv | VendorSpecific),
3249 N, N,
3250 N, N, N, N, N, N, N, N,
3251 /* 0x40 - 0x4F */
3252 X16(D(DstReg | SrcMem | ModRM | Mov)),
3253 /* 0x50 - 0x5F */
3254 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3255 /* 0x60 - 0x6F */
3256 N, N, N, N,
3257 N, N, N, N,
3258 N, N, N, N,
3259 N, N, N, GP(SrcMem | DstReg | ModRM | Mov, &pfx_0f_6f_0f_7f),
3260 /* 0x70 - 0x7F */
3261 N, N, N, N,
3262 N, N, N, N,
3263 N, N, N, N,
3264 N, N, N, GP(SrcReg | DstMem | ModRM | Mov, &pfx_0f_6f_0f_7f),
3265 /* 0x80 - 0x8F */
3266 X16(D(SrcImm)),
3267 /* 0x90 - 0x9F */
3268 X16(D(ByteOp | DstMem | SrcNone | ModRM| Mov)),
3269 /* 0xA0 - 0xA7 */
3270 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
3271 DI(ImplicitOps, cpuid), D(DstMem | SrcReg | ModRM | BitOp),
3272 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3273 D(DstMem | SrcReg | Src2CL | ModRM), N, N,
3274 /* 0xA8 - 0xAF */
3275 D(ImplicitOps | Stack), D(ImplicitOps | Stack),
3276 DI(ImplicitOps, rsm), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3277 D(DstMem | SrcReg | Src2ImmByte | ModRM),
3278 D(DstMem | SrcReg | Src2CL | ModRM),
3279 D(ModRM), I(DstReg | SrcMem | ModRM, em_imul),
3280 /* 0xB0 - 0xB7 */
3281 D2bv(DstMem | SrcReg | ModRM | Lock),
3282 D(DstReg | SrcMemFAddr | ModRM), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3283 D(DstReg | SrcMemFAddr | ModRM), D(DstReg | SrcMemFAddr | ModRM),
3284 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3285 /* 0xB8 - 0xBF */
3286 N, N,
3287 G(BitOp, group8), D(DstMem | SrcReg | ModRM | BitOp | Lock),
3288 D(DstReg | SrcMem | ModRM), D(DstReg | SrcMem | ModRM),
3289 D(ByteOp | DstReg | SrcMem | ModRM | Mov), D(DstReg | SrcMem16 | ModRM | Mov),
3290 /* 0xC0 - 0xCF */
3291 D2bv(DstMem | SrcReg | ModRM | Lock),
3292 N, D(DstMem | SrcReg | ModRM | Mov),
3293 N, N, N, GD(0, &group9),
3294 N, N, N, N, N, N, N, N,
3295 /* 0xD0 - 0xDF */
3296 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3297 /* 0xE0 - 0xEF */
3298 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N,
3299 /* 0xF0 - 0xFF */
3300 N, N, N, N, N, N, N, N, N, N, N, N, N, N, N, N
3301};
3302
3303#undef D
3304#undef N
3305#undef G
3306#undef GD
3307#undef I
3308#undef GP
3309#undef EXT
3310
3311#undef D2bv
3312#undef D2bvIP
3313#undef I2bv
3314#undef I6ALU
3315
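
Both 256-entry tables above are indexed directly by the opcode byte, with 0x0f acting as an escape into the two-byte table; the decoder below performs exactly that lookup before walking any group redirections. A minimal sketch of the two-level lookup (names invented for illustration):

struct sk_op { unsigned int flags; };

static const struct sk_op sk_onebyte[256];	/* placeholder tables */
static const struct sk_op sk_twobyte[256];

/* Return the table entry for the opcode at 'insn', or 0 if the bytes
 * run out; *twobyte reports whether the 0x0f escape was taken. */
static const struct sk_op *sk_lookup(const unsigned char *insn,
				     unsigned int len, int *twobyte)
{
	if (len == 0)
		return 0;
	if (insn[0] != 0x0f) {
		*twobyte = 0;
		return &sk_onebyte[insn[0]];
	}
	if (len < 2)
		return 0;
	*twobyte = 1;
	return &sk_twobyte[insn[1]];
}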
3316static unsigned imm_size(struct decode_cache *c)
3317{
3318 unsigned size;
3319
3320 size = (c->d & ByteOp) ? 1 : c->op_bytes;
3321 if (size == 8)
3322 size = 4;
3323 return size;
3324}
3325
3326static int decode_imm(struct x86_emulate_ctxt *ctxt, struct operand *op,
3327 unsigned size, bool sign_extension)
3328{
3329 struct decode_cache *c = &ctxt->decode;
3330 struct x86_emulate_ops *ops = ctxt->ops;
3331 int rc = X86EMUL_CONTINUE;
3332
3333 op->type = OP_IMM;
3334 op->bytes = size;
3335 op->addr.mem.ea = c->eip;
3336 /* NB. Immediates are sign-extended as necessary. */
3337 switch (op->bytes) {
3338 case 1:
3339 op->val = insn_fetch(s8, 1, c->eip);
3340 break;
3341 case 2:
3342 op->val = insn_fetch(s16, 2, c->eip);
3343 break;
3344 case 4:
3345 op->val = insn_fetch(s32, 4, c->eip);
3346 break;
3347 }
3348 if (!sign_extension) {
3349 switch (op->bytes) {
3350 case 1:
3351 op->val &= 0xff;
3352 break;
3353 case 2:
3354 op->val &= 0xffff;
3355 break;
3356 case 4:
3357 op->val &= 0xffffffff;
3358 break;
3359 }
3360 }
3361done:
3362 return rc;
2556} 3363}
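
decode_imm() above always fetches the immediate as a signed quantity and then strips the sign extension again for the unsigned SrcImmU/SrcImmUByte forms. The same rule restated as a standalone helper (hypothetical name, little-endian host assumed; the emulator caps 8-byte immediates at 4 bytes via imm_size(), so only 1, 2 and 4 appear here):

#include <stdint.h>
#include <string.h>

static uint64_t sk_decode_imm(const uint8_t *p, unsigned int size,
			      int sign_extend)
{
	int64_t val = 0;

	switch (size) {
	case 1: { int8_t  v; memcpy(&v, p, 1); val = v; break; }
	case 2: { int16_t v; memcpy(&v, p, 2); val = v; break; }
	case 4: { int32_t v; memcpy(&v, p, 4); val = v; break; }
	}

	if (!sign_extend)		/* unsigned form: drop the extension */
		val &= (size == 1) ? 0xff :
		       (size == 2) ? 0xffff : 0xffffffffULL;

	return (uint64_t)val;
}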
2557 3364
2558int 3365int
2559x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops) 3366x86_decode_insn(struct x86_emulate_ctxt *ctxt, void *insn, int insn_len)
2560{ 3367{
3368 struct x86_emulate_ops *ops = ctxt->ops;
3369 struct decode_cache *c = &ctxt->decode;
3370 int rc = X86EMUL_CONTINUE;
3371 int mode = ctxt->mode;
3372 int def_op_bytes, def_ad_bytes, goffset, simd_prefix;
3373 bool op_prefix = false;
3374 struct opcode opcode;
3375 struct operand memop = { .type = OP_NONE }, *memopp = NULL;
3376
3377 c->eip = ctxt->eip;
3378 c->fetch.start = c->eip;
3379 c->fetch.end = c->fetch.start + insn_len;
3380 if (insn_len > 0)
3381 memcpy(c->fetch.data, insn, insn_len);
3382
3383 switch (mode) {
3384 case X86EMUL_MODE_REAL:
3385 case X86EMUL_MODE_VM86:
3386 case X86EMUL_MODE_PROT16:
3387 def_op_bytes = def_ad_bytes = 2;
3388 break;
3389 case X86EMUL_MODE_PROT32:
3390 def_op_bytes = def_ad_bytes = 4;
3391 break;
3392#ifdef CONFIG_X86_64
3393 case X86EMUL_MODE_PROT64:
3394 def_op_bytes = 4;
3395 def_ad_bytes = 8;
3396 break;
3397#endif
3398 default:
3399 return -1;
3400 }
3401
3402 c->op_bytes = def_op_bytes;
3403 c->ad_bytes = def_ad_bytes;
3404
3405 /* Legacy prefixes. */
3406 for (;;) {
3407 switch (c->b = insn_fetch(u8, 1, c->eip)) {
3408 case 0x66: /* operand-size override */
3409 op_prefix = true;
3410 /* switch between 2/4 bytes */
3411 c->op_bytes = def_op_bytes ^ 6;
3412 break;
3413 case 0x67: /* address-size override */
3414 if (mode == X86EMUL_MODE_PROT64)
3415 /* switch between 4/8 bytes */
3416 c->ad_bytes = def_ad_bytes ^ 12;
3417 else
3418 /* switch between 2/4 bytes */
3419 c->ad_bytes = def_ad_bytes ^ 6;
3420 break;
3421 case 0x26: /* ES override */
3422 case 0x2e: /* CS override */
3423 case 0x36: /* SS override */
3424 case 0x3e: /* DS override */
3425 set_seg_override(c, (c->b >> 3) & 3);
3426 break;
3427 case 0x64: /* FS override */
3428 case 0x65: /* GS override */
3429 set_seg_override(c, c->b & 7);
3430 break;
3431 case 0x40 ... 0x4f: /* REX */
3432 if (mode != X86EMUL_MODE_PROT64)
3433 goto done_prefixes;
3434 c->rex_prefix = c->b;
3435 continue;
3436 case 0xf0: /* LOCK */
3437 c->lock_prefix = 1;
3438 break;
3439 case 0xf2: /* REPNE/REPNZ */
3440 case 0xf3: /* REP/REPE/REPZ */
3441 c->rep_prefix = c->b;
3442 break;
3443 default:
3444 goto done_prefixes;
3445 }
3446
3447 /* Any legacy prefix after a REX prefix nullifies its effect. */
3448
3449 c->rex_prefix = 0;
3450 }
3451
3452done_prefixes:
3453
3454 /* REX prefix. */
3455 if (c->rex_prefix & 8)
3456 c->op_bytes = 8; /* REX.W */
3457
3458 /* Opcode byte(s). */
3459 opcode = opcode_table[c->b];
3460 /* Two-byte opcode? */
3461 if (c->b == 0x0f) {
3462 c->twobyte = 1;
3463 c->b = insn_fetch(u8, 1, c->eip);
3464 opcode = twobyte_table[c->b];
3465 }
3466 c->d = opcode.flags;
3467
3468 while (c->d & GroupMask) {
3469 switch (c->d & GroupMask) {
3470 case Group:
3471 c->modrm = insn_fetch(u8, 1, c->eip);
3472 --c->eip;
3473 goffset = (c->modrm >> 3) & 7;
3474 opcode = opcode.u.group[goffset];
3475 break;
3476 case GroupDual:
3477 c->modrm = insn_fetch(u8, 1, c->eip);
3478 --c->eip;
3479 goffset = (c->modrm >> 3) & 7;
3480 if ((c->modrm >> 6) == 3)
3481 opcode = opcode.u.gdual->mod3[goffset];
3482 else
3483 opcode = opcode.u.gdual->mod012[goffset];
3484 break;
3485 case RMExt:
3486 goffset = c->modrm & 7;
3487 opcode = opcode.u.group[goffset];
3488 break;
3489 case Prefix:
3490 if (c->rep_prefix && op_prefix)
3491 return X86EMUL_UNHANDLEABLE;
3492 simd_prefix = op_prefix ? 0x66 : c->rep_prefix;
3493 switch (simd_prefix) {
3494 case 0x00: opcode = opcode.u.gprefix->pfx_no; break;
3495 case 0x66: opcode = opcode.u.gprefix->pfx_66; break;
3496 case 0xf2: opcode = opcode.u.gprefix->pfx_f2; break;
3497 case 0xf3: opcode = opcode.u.gprefix->pfx_f3; break;
3498 }
3499 break;
3500 default:
3501 return X86EMUL_UNHANDLEABLE;
3502 }
3503
3504 c->d &= ~GroupMask;
3505 c->d |= opcode.flags;
3506 }
3507
3508 c->execute = opcode.u.execute;
3509 c->check_perm = opcode.check_perm;
3510 c->intercept = opcode.intercept;
3511
3512 /* Unrecognised? */
3513 if (c->d == 0 || (c->d & Undefined))
3514 return -1;
3515
3516 if (!(c->d & VendorSpecific) && ctxt->only_vendor_specific_insn)
3517 return -1;
3518
3519 if (mode == X86EMUL_MODE_PROT64 && (c->d & Stack))
3520 c->op_bytes = 8;
3521
3522 if (c->d & Op3264) {
3523 if (mode == X86EMUL_MODE_PROT64)
3524 c->op_bytes = 8;
3525 else
3526 c->op_bytes = 4;
3527 }
3528
3529 if (c->d & Sse)
3530 c->op_bytes = 16;
3531
3532 /* ModRM and SIB bytes. */
3533 if (c->d & ModRM) {
3534 rc = decode_modrm(ctxt, ops, &memop);
3535 if (!c->has_seg_override)
3536 set_seg_override(c, c->modrm_seg);
3537 } else if (c->d & MemAbs)
3538 rc = decode_abs(ctxt, ops, &memop);
3539 if (rc != X86EMUL_CONTINUE)
3540 goto done;
3541
3542 if (!c->has_seg_override)
3543 set_seg_override(c, VCPU_SREG_DS);
3544
3545 memop.addr.mem.seg = seg_override(ctxt, c);
3546
3547 if (memop.type == OP_MEM && c->ad_bytes != 8)
3548 memop.addr.mem.ea = (u32)memop.addr.mem.ea;
3549
3550 /*
3551 * Decode and fetch the source operand: register, memory
3552 * or immediate.
3553 */
3554 switch (c->d & SrcMask) {
3555 case SrcNone:
3556 break;
3557 case SrcReg:
3558 decode_register_operand(ctxt, &c->src, c, 0);
3559 break;
3560 case SrcMem16:
3561 memop.bytes = 2;
3562 goto srcmem_common;
3563 case SrcMem32:
3564 memop.bytes = 4;
3565 goto srcmem_common;
3566 case SrcMem:
3567 memop.bytes = (c->d & ByteOp) ? 1 :
3568 c->op_bytes;
3569 srcmem_common:
3570 c->src = memop;
3571 memopp = &c->src;
3572 break;
3573 case SrcImmU16:
3574 rc = decode_imm(ctxt, &c->src, 2, false);
3575 break;
3576 case SrcImm:
3577 rc = decode_imm(ctxt, &c->src, imm_size(c), true);
3578 break;
3579 case SrcImmU:
3580 rc = decode_imm(ctxt, &c->src, imm_size(c), false);
3581 break;
3582 case SrcImmByte:
3583 rc = decode_imm(ctxt, &c->src, 1, true);
3584 break;
3585 case SrcImmUByte:
3586 rc = decode_imm(ctxt, &c->src, 1, false);
3587 break;
3588 case SrcAcc:
3589 c->src.type = OP_REG;
3590 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3591 c->src.addr.reg = &c->regs[VCPU_REGS_RAX];
3592 fetch_register_operand(&c->src);
3593 break;
3594 case SrcOne:
3595 c->src.bytes = 1;
3596 c->src.val = 1;
3597 break;
3598 case SrcSI:
3599 c->src.type = OP_MEM;
3600 c->src.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3601 c->src.addr.mem.ea =
3602 register_address(c, c->regs[VCPU_REGS_RSI]);
3603 c->src.addr.mem.seg = seg_override(ctxt, c);
3604 c->src.val = 0;
3605 break;
3606 case SrcImmFAddr:
3607 c->src.type = OP_IMM;
3608 c->src.addr.mem.ea = c->eip;
3609 c->src.bytes = c->op_bytes + 2;
3610 insn_fetch_arr(c->src.valptr, c->src.bytes, c->eip);
3611 break;
3612 case SrcMemFAddr:
3613 memop.bytes = c->op_bytes + 2;
3614 goto srcmem_common;
3615 break;
3616 case SrcDX:
3617 c->src.type = OP_REG;
3618 c->src.bytes = 2;
3619 c->src.addr.reg = &c->regs[VCPU_REGS_RDX];
3620 fetch_register_operand(&c->src);
3621 break;
3622 }
3623
3624 if (rc != X86EMUL_CONTINUE)
3625 goto done;
3626
3627 /*
3628 * Decode and fetch the second source operand: register, memory
3629 * or immediate.
3630 */
3631 switch (c->d & Src2Mask) {
3632 case Src2None:
3633 break;
3634 case Src2CL:
3635 c->src2.bytes = 1;
3636 c->src2.val = c->regs[VCPU_REGS_RCX] & 0x8;
3637 break;
3638 case Src2ImmByte:
3639 rc = decode_imm(ctxt, &c->src2, 1, true);
3640 break;
3641 case Src2One:
3642 c->src2.bytes = 1;
3643 c->src2.val = 1;
3644 break;
3645 case Src2Imm:
3646 rc = decode_imm(ctxt, &c->src2, imm_size(c), true);
3647 break;
3648 }
3649
3650 if (rc != X86EMUL_CONTINUE)
3651 goto done;
3652
3653 /* Decode and fetch the destination operand: register or memory. */
3654 switch (c->d & DstMask) {
3655 case DstReg:
3656 decode_register_operand(ctxt, &c->dst, c,
3657 c->twobyte && (c->b == 0xb6 || c->b == 0xb7));
3658 break;
3659 case DstImmUByte:
3660 c->dst.type = OP_IMM;
3661 c->dst.addr.mem.ea = c->eip;
3662 c->dst.bytes = 1;
3663 c->dst.val = insn_fetch(u8, 1, c->eip);
3664 break;
3665 case DstMem:
3666 case DstMem64:
3667 c->dst = memop;
3668 memopp = &c->dst;
3669 if ((c->d & DstMask) == DstMem64)
3670 c->dst.bytes = 8;
3671 else
3672 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3673 if (c->d & BitOp)
3674 fetch_bit_operand(c);
3675 c->dst.orig_val = c->dst.val;
3676 break;
3677 case DstAcc:
3678 c->dst.type = OP_REG;
3679 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3680 c->dst.addr.reg = &c->regs[VCPU_REGS_RAX];
3681 fetch_register_operand(&c->dst);
3682 c->dst.orig_val = c->dst.val;
3683 break;
3684 case DstDI:
3685 c->dst.type = OP_MEM;
3686 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
3687 c->dst.addr.mem.ea =
3688 register_address(c, c->regs[VCPU_REGS_RDI]);
3689 c->dst.addr.mem.seg = VCPU_SREG_ES;
3690 c->dst.val = 0;
3691 break;
3692 case DstDX:
3693 c->dst.type = OP_REG;
3694 c->dst.bytes = 2;
3695 c->dst.addr.reg = &c->regs[VCPU_REGS_RDX];
3696 fetch_register_operand(&c->dst);
3697 break;
3698 case ImplicitOps:
3699 /* Special instructions do their own operand decoding. */
3700 default:
3701 c->dst.type = OP_NONE; /* Disable writeback. */
3702 break;
3703 }
3704
3705done:
3706 if (memopp && memopp->type == OP_MEM && c->rip_relative)
3707 memopp->addr.mem.ea += c->eip;
3708
3709 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3710}
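
One detail worth calling out in the decoder above is the fixup at the done label: a RIP-relative operand encodes its displacement relative to the end of the instruction, so the effective address can only be completed once c->eip has advanced past the last immediate byte. Stated on its own (hypothetical helper, 64-bit mode):

#include <stdint.h>

/* RIP-relative effective address: the displacement is taken from the
 * address of the *next* instruction. */
static uint64_t rip_relative_ea(uint64_t insn_start, unsigned int insn_len,
				int32_t disp)
{
	return insn_start + insn_len + (int64_t)disp;
}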
3711
3712static bool string_insn_completed(struct x86_emulate_ctxt *ctxt)
3713{
3714 struct decode_cache *c = &ctxt->decode;
3715
3716 /* The second termination condition only applies for REPE
3717 * and REPNE. Test if the repeat string operation prefix is
3718 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
3719 * corresponding termination condition according to:
3720 * - if REPE/REPZ and ZF = 0 then done
3721 * - if REPNE/REPNZ and ZF = 1 then done
3722 */
3723 if (((c->b == 0xa6) || (c->b == 0xa7) ||
3724 (c->b == 0xae) || (c->b == 0xaf))
3725 && (((c->rep_prefix == REPE_PREFIX) &&
3726 ((ctxt->eflags & EFLG_ZF) == 0))
3727 || ((c->rep_prefix == REPNE_PREFIX) &&
3728 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))))
3729 return true;
3730
3731 return false;
3732}
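
string_insn_completed() pulls the ZF-based termination test out of x86_emulate_insn(); the count-exhausted test stays inline where the repeat loop is driven. Taken together, the rules are: every repeated string op stops when the count register reaches zero, and CMPS/SCAS additionally stop on the ZF condition selected by the prefix. The combined rules as a standalone predicate (hypothetical enum; compare_insn is true for CMPS/SCAS):

#include <stdbool.h>
#include <stdint.h>

enum sk_rep { SK_REP_NONE, SK_REPE, SK_REPNE };

static bool sk_rep_done(uint64_t count, bool zf, enum sk_rep prefix,
			bool compare_insn)
{
	if (prefix == SK_REP_NONE)
		return true;		/* not a repeated string op */
	if (count == 0)
		return true;		/* first condition: count exhausted */
	if (!compare_insn)
		return false;		/* MOVS/STOS/LODS/INS/OUTS: count only */
	if (prefix == SK_REPE && !zf)
		return true;		/* REPE/REPZ stops when ZF = 0 */
	if (prefix == SK_REPNE && zf)
		return true;		/* REPNE/REPNZ stops when ZF = 1 */
	return false;
}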
3733
3734int
3735x86_emulate_insn(struct x86_emulate_ctxt *ctxt)
3736{
3737 struct x86_emulate_ops *ops = ctxt->ops;
2561 u64 msr_data; 3738 u64 msr_data;
2562 struct decode_cache *c = &ctxt->decode; 3739 struct decode_cache *c = &ctxt->decode;
2563 int rc = X86EMUL_CONTINUE; 3740 int rc = X86EMUL_CONTINUE;
2564 int saved_dst_type = c->dst.type; 3741 int saved_dst_type = c->dst.type;
3742 int irq; /* Used for int 3, int, and into */
2565 3743
2566 ctxt->decode.mem_read.pos = 0; 3744 ctxt->decode.mem_read.pos = 0;
2567 3745
2568 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) { 3746 if (ctxt->mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
2569 emulate_ud(ctxt); 3747 rc = emulate_ud(ctxt);
2570 goto done; 3748 goto done;
2571 } 3749 }
2572 3750
2573 /* LOCK prefix is allowed only with some instructions */ 3751 /* LOCK prefix is allowed only with some instructions */
2574 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) { 3752 if (c->lock_prefix && (!(c->d & Lock) || c->dst.type != OP_MEM)) {
2575 emulate_ud(ctxt); 3753 rc = emulate_ud(ctxt);
3754 goto done;
3755 }
3756
3757 if ((c->d & SrcMask) == SrcMemFAddr && c->src.type != OP_MEM) {
3758 rc = emulate_ud(ctxt);
3759 goto done;
3760 }
3761
3762 if ((c->d & Sse)
3763 && ((ops->get_cr(ctxt, 0) & X86_CR0_EM)
3764 || !(ops->get_cr(ctxt, 4) & X86_CR4_OSFXSR))) {
3765 rc = emulate_ud(ctxt);
3766 goto done;
3767 }
3768
3769 if ((c->d & Sse) && (ops->get_cr(ctxt, 0) & X86_CR0_TS)) {
3770 rc = emulate_nm(ctxt);
2576 goto done; 3771 goto done;
2577 } 3772 }
2578 3773
3774 if (unlikely(ctxt->guest_mode) && c->intercept) {
3775 rc = emulator_check_intercept(ctxt, c->intercept,
3776 X86_ICPT_PRE_EXCEPT);
3777 if (rc != X86EMUL_CONTINUE)
3778 goto done;
3779 }
3780
2579 /* Privileged instruction can be executed only in CPL=0 */ 3781 /* Privileged instruction can be executed only in CPL=0 */
2580 if ((c->d & Priv) && ops->cpl(ctxt->vcpu)) { 3782 if ((c->d & Priv) && ops->cpl(ctxt)) {
2581 emulate_gp(ctxt, 0); 3783 rc = emulate_gp(ctxt, 0);
2582 goto done; 3784 goto done;
2583 } 3785 }
2584 3786
3787 /* Instruction can only be executed in protected mode */
3788 if ((c->d & Prot) && !(ctxt->mode & X86EMUL_MODE_PROT)) {
3789 rc = emulate_ud(ctxt);
3790 goto done;
3791 }
3792
3793 /* Do instruction specific permission checks */
3794 if (c->check_perm) {
3795 rc = c->check_perm(ctxt);
3796 if (rc != X86EMUL_CONTINUE)
3797 goto done;
3798 }
3799
3800 if (unlikely(ctxt->guest_mode) && c->intercept) {
3801 rc = emulator_check_intercept(ctxt, c->intercept,
3802 X86_ICPT_POST_EXCEPT);
3803 if (rc != X86EMUL_CONTINUE)
3804 goto done;
3805 }
3806
2585 if (c->rep_prefix && (c->d & String)) { 3807 if (c->rep_prefix && (c->d & String)) {
2586 ctxt->restart = true;
2587 /* All REP prefixes have the same first termination condition */ 3808 /* All REP prefixes have the same first termination condition */
2588 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) { 3809 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0) {
2589 string_done:
2590 ctxt->restart = false;
2591 ctxt->eip = c->eip; 3810 ctxt->eip = c->eip;
2592 goto done; 3811 goto done;
2593 } 3812 }
2594 /* The second termination condition only applies for REPE
2595 * and REPNE. Test if the repeat string operation prefix is
2596 * REPE/REPZ or REPNE/REPNZ and if it's the case it tests the
2597 * corresponding termination condition according to:
2598 * - if REPE/REPZ and ZF = 0 then done
2599 * - if REPNE/REPNZ and ZF = 1 then done
2600 */
2601 if ((c->b == 0xa6) || (c->b == 0xa7) ||
2602 (c->b == 0xae) || (c->b == 0xaf)) {
2603 if ((c->rep_prefix == REPE_PREFIX) &&
2604 ((ctxt->eflags & EFLG_ZF) == 0))
2605 goto string_done;
2606 if ((c->rep_prefix == REPNE_PREFIX) &&
2607 ((ctxt->eflags & EFLG_ZF) == EFLG_ZF))
2608 goto string_done;
2609 }
2610 c->eip = ctxt->eip;
2611 } 3813 }
2612 3814
2613 if (c->src.type == OP_MEM) { 3815 if ((c->src.type == OP_MEM) && !(c->d & NoAccess)) {
2614 rc = read_emulated(ctxt, ops, (unsigned long)c->src.ptr, 3816 rc = segmented_read(ctxt, c->src.addr.mem,
2615 c->src.valptr, c->src.bytes); 3817 c->src.valptr, c->src.bytes);
2616 if (rc != X86EMUL_CONTINUE) 3818 if (rc != X86EMUL_CONTINUE)
2617 goto done; 3819 goto done;
2618 c->src.orig_val64 = c->src.val64; 3820 c->src.orig_val64 = c->src.val64;
2619 } 3821 }
2620 3822
2621 if (c->src2.type == OP_MEM) { 3823 if (c->src2.type == OP_MEM) {
2622 rc = read_emulated(ctxt, ops, (unsigned long)c->src2.ptr, 3824 rc = segmented_read(ctxt, c->src2.addr.mem,
2623 &c->src2.val, c->src2.bytes); 3825 &c->src2.val, c->src2.bytes);
2624 if (rc != X86EMUL_CONTINUE) 3826 if (rc != X86EMUL_CONTINUE)
2625 goto done; 3827 goto done;
2626 } 3828 }
@@ -2631,7 +3833,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2631 3833
2632 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) { 3834 if ((c->dst.type == OP_MEM) && !(c->d & Mov)) {
2633 /* optimisation - avoid slow emulated read if Mov */ 3835 /* optimisation - avoid slow emulated read if Mov */
2634 rc = read_emulated(ctxt, ops, (unsigned long)c->dst.ptr, 3836 rc = segmented_read(ctxt, c->dst.addr.mem,
2635 &c->dst.val, c->dst.bytes); 3837 &c->dst.val, c->dst.bytes);
2636 if (rc != X86EMUL_CONTINUE) 3838 if (rc != X86EMUL_CONTINUE)
2637 goto done; 3839 goto done;
@@ -2640,68 +3842,44 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
2640 3842
2641special_insn: 3843special_insn:
2642 3844
3845 if (unlikely(ctxt->guest_mode) && c->intercept) {
3846 rc = emulator_check_intercept(ctxt, c->intercept,
3847 X86_ICPT_POST_MEMACCESS);
3848 if (rc != X86EMUL_CONTINUE)
3849 goto done;
3850 }
3851
3852 if (c->execute) {
3853 rc = c->execute(ctxt);
3854 if (rc != X86EMUL_CONTINUE)
3855 goto done;
3856 goto writeback;
3857 }
3858
2643 if (c->twobyte) 3859 if (c->twobyte)
2644 goto twobyte_insn; 3860 goto twobyte_insn;
2645 3861
2646 switch (c->b) { 3862 switch (c->b) {
2647 case 0x00 ... 0x05:
2648 add: /* add */
2649 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
2650 break;
2651 case 0x06: /* push es */ 3863 case 0x06: /* push es */
2652 emulate_push_sreg(ctxt, ops, VCPU_SREG_ES); 3864 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_ES);
2653 break; 3865 break;
2654 case 0x07: /* pop es */ 3866 case 0x07: /* pop es */
2655 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES); 3867 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
2656 if (rc != X86EMUL_CONTINUE)
2657 goto done;
2658 break;
2659 case 0x08 ... 0x0d:
2660 or: /* or */
2661 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
2662 break; 3868 break;
2663 case 0x0e: /* push cs */ 3869 case 0x0e: /* push cs */
2664 emulate_push_sreg(ctxt, ops, VCPU_SREG_CS); 3870 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_CS);
2665 break;
2666 case 0x10 ... 0x15:
2667 adc: /* adc */
2668 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
2669 break; 3871 break;
2670 case 0x16: /* push ss */ 3872 case 0x16: /* push ss */
2671 emulate_push_sreg(ctxt, ops, VCPU_SREG_SS); 3873 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_SS);
2672 break; 3874 break;
2673 case 0x17: /* pop ss */ 3875 case 0x17: /* pop ss */
2674 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS); 3876 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
2675 if (rc != X86EMUL_CONTINUE)
2676 goto done;
2677 break;
2678 case 0x18 ... 0x1d:
2679 sbb: /* sbb */
2680 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
2681 break; 3877 break;
2682 case 0x1e: /* push ds */ 3878 case 0x1e: /* push ds */
2683 emulate_push_sreg(ctxt, ops, VCPU_SREG_DS); 3879 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_DS);
2684 break; 3880 break;
2685 case 0x1f: /* pop ds */ 3881 case 0x1f: /* pop ds */
2686 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS); 3882 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
2687 if (rc != X86EMUL_CONTINUE)
2688 goto done;
2689 break;
2690 case 0x20 ... 0x25:
2691 and: /* and */
2692 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
2693 break;
2694 case 0x28 ... 0x2d:
2695 sub: /* sub */
2696 emulate_2op_SrcV("sub", c->src, c->dst, ctxt->eflags);
2697 break;
2698 case 0x30 ... 0x35:
2699 xor: /* xor */
2700 emulate_2op_SrcV("xor", c->src, c->dst, ctxt->eflags);
2701 break;
2702 case 0x38 ... 0x3d:
2703 cmp: /* cmp */
2704 emulate_2op_SrcV("cmp", c->src, c->dst, ctxt->eflags);
2705 break; 3883 break;
2706 case 0x40 ... 0x47: /* inc r16/r32 */ 3884 case 0x40 ... 0x47: /* inc r16/r32 */
2707 emulate_1op("inc", c->dst, ctxt->eflags); 3885 emulate_1op("inc", c->dst, ctxt->eflags);
@@ -2709,83 +3887,24 @@ special_insn:
2709 case 0x48 ... 0x4f: /* dec r16/r32 */ 3887 case 0x48 ... 0x4f: /* dec r16/r32 */
2710 emulate_1op("dec", c->dst, ctxt->eflags); 3888 emulate_1op("dec", c->dst, ctxt->eflags);
2711 break; 3889 break;
2712 case 0x50 ... 0x57: /* push reg */
2713 emulate_push(ctxt, ops);
2714 break;
2715 case 0x58 ... 0x5f: /* pop reg */
2716 pop_instruction:
2717 rc = emulate_pop(ctxt, ops, &c->dst.val, c->op_bytes);
2718 if (rc != X86EMUL_CONTINUE)
2719 goto done;
2720 break;
2721 case 0x60: /* pusha */
2722 rc = emulate_pusha(ctxt, ops);
2723 if (rc != X86EMUL_CONTINUE)
2724 goto done;
2725 break;
2726 case 0x61: /* popa */
2727 rc = emulate_popa(ctxt, ops);
2728 if (rc != X86EMUL_CONTINUE)
2729 goto done;
2730 break;
2731 case 0x63: /* movsxd */ 3890 case 0x63: /* movsxd */
2732 if (ctxt->mode != X86EMUL_MODE_PROT64) 3891 if (ctxt->mode != X86EMUL_MODE_PROT64)
2733 goto cannot_emulate; 3892 goto cannot_emulate;
2734 c->dst.val = (s32) c->src.val; 3893 c->dst.val = (s32) c->src.val;
2735 break; 3894 break;
2736 case 0x68: /* push imm */
2737 case 0x6a: /* push imm8 */
2738 emulate_push(ctxt, ops);
2739 break;
2740 case 0x6c: /* insb */ 3895 case 0x6c: /* insb */
2741 case 0x6d: /* insw/insd */ 3896 case 0x6d: /* insw/insd */
2742 c->dst.bytes = min(c->dst.bytes, 4u); 3897 c->src.val = c->regs[VCPU_REGS_RDX];
2743 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3898 goto do_io_in;
2744 c->dst.bytes)) {
2745 emulate_gp(ctxt, 0);
2746 goto done;
2747 }
2748 if (!pio_in_emulated(ctxt, ops, c->dst.bytes,
2749 c->regs[VCPU_REGS_RDX], &c->dst.val))
2750 goto done; /* IO is needed, skip writeback */
2751 break;
2752 case 0x6e: /* outsb */ 3899 case 0x6e: /* outsb */
2753 case 0x6f: /* outsw/outsd */ 3900 case 0x6f: /* outsw/outsd */
2754 c->src.bytes = min(c->src.bytes, 4u); 3901 c->dst.val = c->regs[VCPU_REGS_RDX];
2755 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX], 3902 goto do_io_out;
2756 c->src.bytes)) {
2757 emulate_gp(ctxt, 0);
2758 goto done;
2759 }
2760 ops->pio_out_emulated(c->src.bytes, c->regs[VCPU_REGS_RDX],
2761 &c->src.val, 1, ctxt->vcpu);
2762
2763 c->dst.type = OP_NONE; /* nothing to writeback */
2764 break; 3903 break;
2765 case 0x70 ... 0x7f: /* jcc (short) */ 3904 case 0x70 ... 0x7f: /* jcc (short) */
2766 if (test_cc(c->b, ctxt->eflags)) 3905 if (test_cc(c->b, ctxt->eflags))
2767 jmp_rel(c, c->src.val); 3906 jmp_rel(c, c->src.val);
2768 break; 3907 break;
2769 case 0x80 ... 0x83: /* Grp1 */
2770 switch (c->modrm_reg) {
2771 case 0:
2772 goto add;
2773 case 1:
2774 goto or;
2775 case 2:
2776 goto adc;
2777 case 3:
2778 goto sbb;
2779 case 4:
2780 goto and;
2781 case 5:
2782 goto sub;
2783 case 6:
2784 goto xor;
2785 case 7:
2786 goto cmp;
2787 }
2788 break;
2789 case 0x84 ... 0x85: 3908 case 0x84 ... 0x85:
2790 test: 3909 test:
2791 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags); 3910 emulate_2op_SrcV("test", c->src, c->dst, ctxt->eflags);
@@ -2793,38 +3912,24 @@ special_insn:
2793 case 0x86 ... 0x87: /* xchg */ 3912 case 0x86 ... 0x87: /* xchg */
2794 xchg: 3913 xchg:
2795 /* Write back the register source. */ 3914 /* Write back the register source. */
2796 switch (c->dst.bytes) { 3915 c->src.val = c->dst.val;
2797 case 1: 3916 write_register_operand(&c->src);
2798 *(u8 *) c->src.ptr = (u8) c->dst.val;
2799 break;
2800 case 2:
2801 *(u16 *) c->src.ptr = (u16) c->dst.val;
2802 break;
2803 case 4:
2804 *c->src.ptr = (u32) c->dst.val;
2805 break; /* 64b reg: zero-extend */
2806 case 8:
2807 *c->src.ptr = c->dst.val;
2808 break;
2809 }
2810 /* 3917 /*
2811 * Write back the memory destination with implicit LOCK 3918 * Write back the memory destination with implicit LOCK
2812 * prefix. 3919 * prefix.
2813 */ 3920 */
2814 c->dst.val = c->src.val; 3921 c->dst.val = c->src.orig_val;
2815 c->lock_prefix = 1; 3922 c->lock_prefix = 1;
2816 break; 3923 break;
2817 case 0x88 ... 0x8b: /* mov */
2818 goto mov;
2819 case 0x8c: /* mov r/m, sreg */ 3924 case 0x8c: /* mov r/m, sreg */
2820 if (c->modrm_reg > VCPU_SREG_GS) { 3925 if (c->modrm_reg > VCPU_SREG_GS) {
2821 emulate_ud(ctxt); 3926 rc = emulate_ud(ctxt);
2822 goto done; 3927 goto done;
2823 } 3928 }
2824 c->dst.val = ops->get_segment_selector(c->modrm_reg, ctxt->vcpu); 3929 c->dst.val = get_segment_selector(ctxt, c->modrm_reg);
2825 break; 3930 break;
2826 case 0x8d: /* lea r16/r32, m */ 3931 case 0x8d: /* lea r16/r32, m */
2827 c->dst.val = c->modrm_ea; 3932 c->dst.val = c->src.addr.mem.ea;
2828 break; 3933 break;
2829 case 0x8e: { /* mov seg, r/m16 */ 3934 case 0x8e: { /* mov seg, r/m16 */
2830 uint16_t sel; 3935 uint16_t sel;
@@ -2833,7 +3938,7 @@ special_insn:
2833 3938
2834 if (c->modrm_reg == VCPU_SREG_CS || 3939 if (c->modrm_reg == VCPU_SREG_CS ||
2835 c->modrm_reg > VCPU_SREG_GS) { 3940 c->modrm_reg > VCPU_SREG_GS) {
2836 emulate_ud(ctxt); 3941 rc = emulate_ud(ctxt);
2837 goto done; 3942 goto done;
2838 } 3943 }
2839 3944
@@ -2846,76 +3951,72 @@ special_insn:
2846 break; 3951 break;
2847 } 3952 }
2848 case 0x8f: /* pop (sole member of Grp1a) */ 3953 case 0x8f: /* pop (sole member of Grp1a) */
2849 rc = emulate_grp1a(ctxt, ops); 3954 rc = em_grp1a(ctxt);
2850 if (rc != X86EMUL_CONTINUE)
2851 goto done;
2852 break; 3955 break;
2853 case 0x90: /* nop / xchg r8,rax */ 3956 case 0x90 ... 0x97: /* nop / xchg reg, rax */
2854 if (c->dst.ptr == (unsigned long *)&c->regs[VCPU_REGS_RAX]) { 3957 if (c->dst.addr.reg == &c->regs[VCPU_REGS_RAX])
2855 c->dst.type = OP_NONE; /* nop */
2856 break; 3958 break;
2857 }
2858 case 0x91 ... 0x97: /* xchg reg,rax */
2859 c->src.type = OP_REG;
2860 c->src.bytes = c->op_bytes;
2861 c->src.ptr = (unsigned long *) &c->regs[VCPU_REGS_RAX];
2862 c->src.val = *(c->src.ptr);
2863 goto xchg; 3959 goto xchg;
2864 case 0x9c: /* pushf */ 3960 case 0x98: /* cbw/cwde/cdqe */
2865 c->src.val = (unsigned long) ctxt->eflags; 3961 switch (c->op_bytes) {
2866 emulate_push(ctxt, ops); 3962 case 2: c->dst.val = (s8)c->dst.val; break;
2867 break; 3963 case 4: c->dst.val = (s16)c->dst.val; break;
2868 case 0x9d: /* popf */ 3964 case 8: c->dst.val = (s32)c->dst.val; break;
2869 c->dst.type = OP_REG; 3965 }
2870 c->dst.ptr = (unsigned long *) &ctxt->eflags;
2871 c->dst.bytes = c->op_bytes;
2872 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2873 if (rc != X86EMUL_CONTINUE)
2874 goto done;
2875 break; 3966 break;
2876 case 0xa0 ... 0xa3: /* mov */
2877 case 0xa4 ... 0xa5: /* movs */
2878 goto mov;
2879 case 0xa6 ... 0xa7: /* cmps */
2880 c->dst.type = OP_NONE; /* Disable writeback. */
2881 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
2882 goto cmp;
2883 case 0xa8 ... 0xa9: /* test ax, imm */ 3967 case 0xa8 ... 0xa9: /* test ax, imm */
2884 goto test; 3968 goto test;
2885 case 0xaa ... 0xab: /* stos */
2886 c->dst.val = c->regs[VCPU_REGS_RAX];
2887 break;
2888 case 0xac ... 0xad: /* lods */
2889 goto mov;
2890 case 0xae ... 0xaf: /* scas */
2891 DPRINTF("Urk! I don't handle SCAS.\n");
2892 goto cannot_emulate;
2893 case 0xb0 ... 0xbf: /* mov r, imm */
2894 goto mov;
2895 case 0xc0 ... 0xc1: 3969 case 0xc0 ... 0xc1:
2896 emulate_grp2(ctxt); 3970 rc = em_grp2(ctxt);
2897 break; 3971 break;
2898 case 0xc3: /* ret */ 3972 case 0xc3: /* ret */
2899 c->dst.type = OP_REG; 3973 c->dst.type = OP_REG;
2900 c->dst.ptr = &c->eip; 3974 c->dst.addr.reg = &c->eip;
2901 c->dst.bytes = c->op_bytes; 3975 c->dst.bytes = c->op_bytes;
2902 goto pop_instruction; 3976 rc = em_pop(ctxt);
2903 case 0xc6 ... 0xc7: /* mov (sole member of Grp11) */ 3977 break;
2904 mov: 3978 case 0xc4: /* les */
2905 c->dst.val = c->src.val; 3979 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_ES);
3980 break;
3981 case 0xc5: /* lds */
3982 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_DS);
2906 break; 3983 break;
2907 case 0xcb: /* ret far */ 3984 case 0xcb: /* ret far */
2908 rc = emulate_ret_far(ctxt, ops); 3985 rc = emulate_ret_far(ctxt, ops);
2909 if (rc != X86EMUL_CONTINUE) 3986 break;
2910 goto done; 3987 case 0xcc: /* int3 */
3988 irq = 3;
3989 goto do_interrupt;
3990 case 0xcd: /* int n */
3991 irq = c->src.val;
3992 do_interrupt:
3993 rc = emulate_int(ctxt, ops, irq);
3994 break;
3995 case 0xce: /* into */
3996 if (ctxt->eflags & EFLG_OF) {
3997 irq = 4;
3998 goto do_interrupt;
3999 }
4000 break;
4001 case 0xcf: /* iret */
4002 rc = emulate_iret(ctxt, ops);
2911 break; 4003 break;
2912 case 0xd0 ... 0xd1: /* Grp2 */ 4004 case 0xd0 ... 0xd1: /* Grp2 */
2913 c->src.val = 1; 4005 rc = em_grp2(ctxt);
2914 emulate_grp2(ctxt);
2915 break; 4006 break;
2916 case 0xd2 ... 0xd3: /* Grp2 */ 4007 case 0xd2 ... 0xd3: /* Grp2 */
2917 c->src.val = c->regs[VCPU_REGS_RCX]; 4008 c->src.val = c->regs[VCPU_REGS_RCX];
2918 emulate_grp2(ctxt); 4009 rc = em_grp2(ctxt);
4010 break;
4011 case 0xe0 ... 0xe2: /* loop/loopz/loopnz */
4012 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
4013 if (address_mask(c, c->regs[VCPU_REGS_RCX]) != 0 &&
4014 (c->b == 0xe2 || test_cc(c->b ^ 0x5, ctxt->eflags)))
4015 jmp_rel(c, c->src.val);
4016 break;
4017 case 0xe3: /* jcxz/jecxz/jrcxz */
4018 if (address_mask(c, c->regs[VCPU_REGS_RCX]) == 0)
4019 jmp_rel(c, c->src.val);
2919 break; 4020 break;
2920 case 0xe4: /* inb */ 4021 case 0xe4: /* inb */
2921 case 0xe5: /* in */ 4022 case 0xe5: /* in */
@@ -2927,23 +4028,14 @@ special_insn:
2927 long int rel = c->src.val; 4028 long int rel = c->src.val;
2928 c->src.val = (unsigned long) c->eip; 4029 c->src.val = (unsigned long) c->eip;
2929 jmp_rel(c, rel); 4030 jmp_rel(c, rel);
2930 emulate_push(ctxt, ops); 4031 rc = em_push(ctxt);
2931 break; 4032 break;
2932 } 4033 }
2933 case 0xe9: /* jmp rel */ 4034 case 0xe9: /* jmp rel */
2934 goto jmp; 4035 goto jmp;
2935 case 0xea: { /* jmp far */ 4036 case 0xea: /* jmp far */
2936 unsigned short sel; 4037 rc = em_jmp_far(ctxt);
2937 jump_far:
2938 memcpy(&sel, c->src.valptr + c->op_bytes, 2);
2939
2940 if (load_segment_descriptor(ctxt, ops, sel, VCPU_SREG_CS))
2941 goto done;
2942
2943 c->eip = 0;
2944 memcpy(&c->eip, c->src.valptr, c->op_bytes);
2945 break; 4038 break;
2946 }
2947 case 0xeb: 4039 case 0xeb:
2948 jmp: /* jmp rel short */ 4040 jmp: /* jmp rel short */
2949 jmp_rel(c, c->src.val); 4041 jmp_rel(c, c->src.val);
@@ -2951,87 +4043,71 @@ special_insn:
2951 break; 4043 break;
2952 case 0xec: /* in al,dx */ 4044 case 0xec: /* in al,dx */
2953 case 0xed: /* in (e/r)ax,dx */ 4045 case 0xed: /* in (e/r)ax,dx */
2954 c->src.val = c->regs[VCPU_REGS_RDX];
2955 do_io_in: 4046 do_io_in:
2956 c->dst.bytes = min(c->dst.bytes, 4u);
2957 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) {
2958 emulate_gp(ctxt, 0);
2959 goto done;
2960 }
2961 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val, 4047 if (!pio_in_emulated(ctxt, ops, c->dst.bytes, c->src.val,
2962 &c->dst.val)) 4048 &c->dst.val))
2963 goto done; /* IO is needed */ 4049 goto done; /* IO is needed */
2964 break; 4050 break;
2965 case 0xee: /* out dx,al */ 4051 case 0xee: /* out dx,al */
2966 case 0xef: /* out dx,(e/r)ax */ 4052 case 0xef: /* out dx,(e/r)ax */
2967 c->src.val = c->regs[VCPU_REGS_RDX];
2968 do_io_out: 4053 do_io_out:
2969 c->dst.bytes = min(c->dst.bytes, 4u); 4054 ops->pio_out_emulated(ctxt, c->src.bytes, c->dst.val,
2970 if (!emulator_io_permited(ctxt, ops, c->src.val, c->dst.bytes)) { 4055 &c->src.val, 1);
2971 emulate_gp(ctxt, 0);
2972 goto done;
2973 }
2974 ops->pio_out_emulated(c->dst.bytes, c->src.val, &c->dst.val, 1,
2975 ctxt->vcpu);
2976 c->dst.type = OP_NONE; /* Disable writeback. */ 4056 c->dst.type = OP_NONE; /* Disable writeback. */
2977 break; 4057 break;
2978 case 0xf4: /* hlt */ 4058 case 0xf4: /* hlt */
2979 ctxt->vcpu->arch.halt_request = 1; 4059 ctxt->ops->halt(ctxt);
2980 break; 4060 break;
2981 case 0xf5: /* cmc */ 4061 case 0xf5: /* cmc */
2982 /* complement carry flag from eflags reg */ 4062 /* complement carry flag from eflags reg */
2983 ctxt->eflags ^= EFLG_CF; 4063 ctxt->eflags ^= EFLG_CF;
2984 c->dst.type = OP_NONE; /* Disable writeback. */
2985 break; 4064 break;
2986 case 0xf6 ... 0xf7: /* Grp3 */ 4065 case 0xf6 ... 0xf7: /* Grp3 */
2987 if (!emulate_grp3(ctxt, ops)) 4066 rc = em_grp3(ctxt);
2988 goto cannot_emulate;
2989 break; 4067 break;
2990 case 0xf8: /* clc */ 4068 case 0xf8: /* clc */
2991 ctxt->eflags &= ~EFLG_CF; 4069 ctxt->eflags &= ~EFLG_CF;
2992 c->dst.type = OP_NONE; /* Disable writeback. */ 4070 break;
4071 case 0xf9: /* stc */
4072 ctxt->eflags |= EFLG_CF;
2993 break; 4073 break;
2994 case 0xfa: /* cli */ 4074 case 0xfa: /* cli */
2995 if (emulator_bad_iopl(ctxt, ops)) { 4075 if (emulator_bad_iopl(ctxt, ops)) {
2996 emulate_gp(ctxt, 0); 4076 rc = emulate_gp(ctxt, 0);
2997 goto done; 4077 goto done;
2998 } else { 4078 } else
2999 ctxt->eflags &= ~X86_EFLAGS_IF; 4079 ctxt->eflags &= ~X86_EFLAGS_IF;
3000 c->dst.type = OP_NONE; /* Disable writeback. */
3001 }
3002 break; 4080 break;
3003 case 0xfb: /* sti */ 4081 case 0xfb: /* sti */
3004 if (emulator_bad_iopl(ctxt, ops)) { 4082 if (emulator_bad_iopl(ctxt, ops)) {
3005 emulate_gp(ctxt, 0); 4083 rc = emulate_gp(ctxt, 0);
3006 goto done; 4084 goto done;
3007 } else { 4085 } else {
3008 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI; 4086 ctxt->interruptibility = KVM_X86_SHADOW_INT_STI;
3009 ctxt->eflags |= X86_EFLAGS_IF; 4087 ctxt->eflags |= X86_EFLAGS_IF;
3010 c->dst.type = OP_NONE; /* Disable writeback. */
3011 } 4088 }
3012 break; 4089 break;
3013 case 0xfc: /* cld */ 4090 case 0xfc: /* cld */
3014 ctxt->eflags &= ~EFLG_DF; 4091 ctxt->eflags &= ~EFLG_DF;
3015 c->dst.type = OP_NONE; /* Disable writeback. */
3016 break; 4092 break;
3017 case 0xfd: /* std */ 4093 case 0xfd: /* std */
3018 ctxt->eflags |= EFLG_DF; 4094 ctxt->eflags |= EFLG_DF;
3019 c->dst.type = OP_NONE; /* Disable writeback. */
3020 break; 4095 break;
3021 case 0xfe: /* Grp4 */ 4096 case 0xfe: /* Grp4 */
3022 grp45: 4097 rc = em_grp45(ctxt);
3023 rc = emulate_grp45(ctxt, ops);
3024 if (rc != X86EMUL_CONTINUE)
3025 goto done;
3026 break; 4098 break;
3027 case 0xff: /* Grp5 */ 4099 case 0xff: /* Grp5 */
3028 if (c->modrm_reg == 5) 4100 rc = em_grp45(ctxt);
3029 goto jump_far; 4101 break;
3030 goto grp45; 4102 default:
4103 goto cannot_emulate;
3031 } 4104 }
3032 4105
4106 if (rc != X86EMUL_CONTINUE)
4107 goto done;
4108
3033writeback: 4109writeback:
3034 rc = writeback(ctxt, ops); 4110 rc = writeback(ctxt);
3035 if (rc != X86EMUL_CONTINUE) 4111 if (rc != X86EMUL_CONTINUE)
3036 goto done; 4112 goto done;
3037 4113
@@ -3042,165 +4118,82 @@ writeback:
3042 c->dst.type = saved_dst_type; 4118 c->dst.type = saved_dst_type;
3043 4119
3044 if ((c->d & SrcMask) == SrcSI) 4120 if ((c->d & SrcMask) == SrcSI)
3045 string_addr_inc(ctxt, seg_override_base(ctxt, ops, c), 4121 string_addr_inc(ctxt, seg_override(ctxt, c),
3046 VCPU_REGS_RSI, &c->src); 4122 VCPU_REGS_RSI, &c->src);
3047 4123
3048 if ((c->d & DstMask) == DstDI) 4124 if ((c->d & DstMask) == DstDI)
3049 string_addr_inc(ctxt, es_base(ctxt, ops), VCPU_REGS_RDI, 4125 string_addr_inc(ctxt, VCPU_SREG_ES, VCPU_REGS_RDI,
3050 &c->dst); 4126 &c->dst);
3051 4127
3052 if (c->rep_prefix && (c->d & String)) { 4128 if (c->rep_prefix && (c->d & String)) {
3053 struct read_cache *rc = &ctxt->decode.io_read; 4129 struct read_cache *r = &ctxt->decode.io_read;
3054 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1); 4130 register_address_increment(c, &c->regs[VCPU_REGS_RCX], -1);
3055 /* 4131
3056 * Re-enter guest when pio read ahead buffer is empty or, 4132 if (!string_insn_completed(ctxt)) {
3057 * if it is not used, after each 1024 iteration. 4133 /*
3058 */ 4134 * Re-enter guest when pio read ahead buffer is empty
3059 if ((rc->end == 0 && !(c->regs[VCPU_REGS_RCX] & 0x3ff)) || 4135 * or, if it is not used, after each 1024 iteration.
3060 (rc->end != 0 && rc->end == rc->pos)) 4136 */
3061 ctxt->restart = false; 4137 if ((r->end != 0 || c->regs[VCPU_REGS_RCX] & 0x3ff) &&
4138 (r->end == 0 || r->end != r->pos)) {
4139 /*
4140 * Reset read cache. Usually happens before
4141 * decode, but since instruction is restarted
4142 * we have to do it here.
4143 */
4144 ctxt->decode.mem_read.end = 0;
4145 return EMULATION_RESTART;
4146 }
4147 goto done; /* skip rip writeback */
4148 }
3062 } 4149 }
3063 /* 4150
3064 * reset read cache here in case string instruction is restared
3065 * without decoding
3066 */
3067 ctxt->decode.mem_read.end = 0;
3068 ctxt->eip = c->eip; 4151 ctxt->eip = c->eip;
3069 4152
3070done: 4153done:
3071 return (rc == X86EMUL_UNHANDLEABLE) ? -1 : 0; 4154 if (rc == X86EMUL_PROPAGATE_FAULT)
4155 ctxt->have_exception = true;
4156 if (rc == X86EMUL_INTERCEPTED)
4157 return EMULATION_INTERCEPTED;
4158
4159 return (rc == X86EMUL_UNHANDLEABLE) ? EMULATION_FAILED : EMULATION_OK;
3072 4160
3073twobyte_insn: 4161twobyte_insn:
3074 switch (c->b) { 4162 switch (c->b) {
3075 case 0x01: /* lgdt, lidt, lmsw */
3076 switch (c->modrm_reg) {
3077 u16 size;
3078 unsigned long address;
3079
3080 case 0: /* vmcall */
3081 if (c->modrm_mod != 3 || c->modrm_rm != 1)
3082 goto cannot_emulate;
3083
3084 rc = kvm_fix_hypercall(ctxt->vcpu);
3085 if (rc != X86EMUL_CONTINUE)
3086 goto done;
3087
3088 /* Let the processor re-execute the fixed hypercall */
3089 c->eip = ctxt->eip;
3090 /* Disable writeback. */
3091 c->dst.type = OP_NONE;
3092 break;
3093 case 2: /* lgdt */
3094 rc = read_descriptor(ctxt, ops, c->src.ptr,
3095 &size, &address, c->op_bytes);
3096 if (rc != X86EMUL_CONTINUE)
3097 goto done;
3098 realmode_lgdt(ctxt->vcpu, size, address);
3099 /* Disable writeback. */
3100 c->dst.type = OP_NONE;
3101 break;
3102 case 3: /* lidt/vmmcall */
3103 if (c->modrm_mod == 3) {
3104 switch (c->modrm_rm) {
3105 case 1:
3106 rc = kvm_fix_hypercall(ctxt->vcpu);
3107 if (rc != X86EMUL_CONTINUE)
3108 goto done;
3109 break;
3110 default:
3111 goto cannot_emulate;
3112 }
3113 } else {
3114 rc = read_descriptor(ctxt, ops, c->src.ptr,
3115 &size, &address,
3116 c->op_bytes);
3117 if (rc != X86EMUL_CONTINUE)
3118 goto done;
3119 realmode_lidt(ctxt->vcpu, size, address);
3120 }
3121 /* Disable writeback. */
3122 c->dst.type = OP_NONE;
3123 break;
3124 case 4: /* smsw */
3125 c->dst.bytes = 2;
3126 c->dst.val = ops->get_cr(0, ctxt->vcpu);
3127 break;
3128 case 6: /* lmsw */
3129 ops->set_cr(0, (ops->get_cr(0, ctxt->vcpu) & ~0x0ful) |
3130 (c->src.val & 0x0f), ctxt->vcpu);
3131 c->dst.type = OP_NONE;
3132 break;
3133 case 5: /* not defined */
3134 emulate_ud(ctxt);
3135 goto done;
3136 case 7: /* invlpg*/
3137 emulate_invlpg(ctxt->vcpu, c->modrm_ea);
3138 /* Disable writeback. */
3139 c->dst.type = OP_NONE;
3140 break;
3141 default:
3142 goto cannot_emulate;
3143 }
3144 break;
3145 case 0x05: /* syscall */ 4163 case 0x05: /* syscall */
3146 rc = emulate_syscall(ctxt, ops); 4164 rc = emulate_syscall(ctxt, ops);
3147 if (rc != X86EMUL_CONTINUE)
3148 goto done;
3149 else
3150 goto writeback;
3151 break; 4165 break;
3152 case 0x06: 4166 case 0x06:
3153 emulate_clts(ctxt->vcpu); 4167 rc = em_clts(ctxt);
3154 c->dst.type = OP_NONE;
3155 break; 4168 break;
3156 case 0x09: /* wbinvd */ 4169 case 0x09: /* wbinvd */
3157 kvm_emulate_wbinvd(ctxt->vcpu); 4170 (ctxt->ops->wbinvd)(ctxt);
3158 c->dst.type = OP_NONE;
3159 break; 4171 break;
3160 case 0x08: /* invd */ 4172 case 0x08: /* invd */
3161 case 0x0d: /* GrpP (prefetch) */ 4173 case 0x0d: /* GrpP (prefetch) */
3162 case 0x18: /* Grp16 (prefetch/nop) */ 4174 case 0x18: /* Grp16 (prefetch/nop) */
3163 c->dst.type = OP_NONE;
3164 break; 4175 break;
3165 case 0x20: /* mov cr, reg */ 4176 case 0x20: /* mov cr, reg */
3166 switch (c->modrm_reg) { 4177 c->dst.val = ops->get_cr(ctxt, c->modrm_reg);
3167 case 1:
3168 case 5 ... 7:
3169 case 9 ... 15:
3170 emulate_ud(ctxt);
3171 goto done;
3172 }
3173 c->regs[c->modrm_rm] = ops->get_cr(c->modrm_reg, ctxt->vcpu);
3174 c->dst.type = OP_NONE; /* no writeback */
3175 break; 4178 break;
3176 case 0x21: /* mov from dr to reg */ 4179 case 0x21: /* mov from dr to reg */
3177 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4180 ops->get_dr(ctxt, c->modrm_reg, &c->dst.val);
3178 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3179 emulate_ud(ctxt);
3180 goto done;
3181 }
3182 ops->get_dr(c->modrm_reg, &c->regs[c->modrm_rm], ctxt->vcpu);
3183 c->dst.type = OP_NONE; /* no writeback */
3184 break; 4181 break;
3185 case 0x22: /* mov reg, cr */ 4182 case 0x22: /* mov reg, cr */
3186 if (ops->set_cr(c->modrm_reg, c->modrm_val, ctxt->vcpu)) { 4183 if (ops->set_cr(ctxt, c->modrm_reg, c->src.val)) {
3187 emulate_gp(ctxt, 0); 4184 emulate_gp(ctxt, 0);
4185 rc = X86EMUL_PROPAGATE_FAULT;
3188 goto done; 4186 goto done;
3189 } 4187 }
3190 c->dst.type = OP_NONE; 4188 c->dst.type = OP_NONE;
3191 break; 4189 break;
3192 case 0x23: /* mov from reg to dr */ 4190 case 0x23: /* mov from reg to dr */
3193 if ((ops->get_cr(4, ctxt->vcpu) & X86_CR4_DE) && 4191 if (ops->set_dr(ctxt, c->modrm_reg, c->src.val &
3194 (c->modrm_reg == 4 || c->modrm_reg == 5)) {
3195 emulate_ud(ctxt);
3196 goto done;
3197 }
3198
3199 if (ops->set_dr(c->modrm_reg, c->regs[c->modrm_rm] &
3200 ((ctxt->mode == X86EMUL_MODE_PROT64) ? 4192 ((ctxt->mode == X86EMUL_MODE_PROT64) ?
3201 ~0ULL : ~0U), ctxt->vcpu) < 0) { 4193 ~0ULL : ~0U)) < 0) {
3202 /* #UD condition is already handled by the code above */ 4194 /* #UD condition is already handled by the code above */
3203 emulate_gp(ctxt, 0); 4195 emulate_gp(ctxt, 0);
4196 rc = X86EMUL_PROPAGATE_FAULT;
3204 goto done; 4197 goto done;
3205 } 4198 }
3206 4199
@@ -3210,38 +4203,30 @@ twobyte_insn:
3210 /* wrmsr */ 4203 /* wrmsr */
3211 msr_data = (u32)c->regs[VCPU_REGS_RAX] 4204 msr_data = (u32)c->regs[VCPU_REGS_RAX]
3212 | ((u64)c->regs[VCPU_REGS_RDX] << 32); 4205 | ((u64)c->regs[VCPU_REGS_RDX] << 32);
3213 if (ops->set_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], msr_data)) { 4206 if (ops->set_msr(ctxt, c->regs[VCPU_REGS_RCX], msr_data)) {
3214 emulate_gp(ctxt, 0); 4207 emulate_gp(ctxt, 0);
4208 rc = X86EMUL_PROPAGATE_FAULT;
3215 goto done; 4209 goto done;
3216 } 4210 }
3217 rc = X86EMUL_CONTINUE; 4211 rc = X86EMUL_CONTINUE;
3218 c->dst.type = OP_NONE;
3219 break; 4212 break;
3220 case 0x32: 4213 case 0x32:
3221 /* rdmsr */ 4214 /* rdmsr */
3222 if (ops->get_msr(ctxt->vcpu, c->regs[VCPU_REGS_RCX], &msr_data)) { 4215 if (ops->get_msr(ctxt, c->regs[VCPU_REGS_RCX], &msr_data)) {
3223 emulate_gp(ctxt, 0); 4216 emulate_gp(ctxt, 0);
4217 rc = X86EMUL_PROPAGATE_FAULT;
3224 goto done; 4218 goto done;
3225 } else { 4219 } else {
3226 c->regs[VCPU_REGS_RAX] = (u32)msr_data; 4220 c->regs[VCPU_REGS_RAX] = (u32)msr_data;
3227 c->regs[VCPU_REGS_RDX] = msr_data >> 32; 4221 c->regs[VCPU_REGS_RDX] = msr_data >> 32;
3228 } 4222 }
3229 rc = X86EMUL_CONTINUE; 4223 rc = X86EMUL_CONTINUE;
3230 c->dst.type = OP_NONE;
3231 break; 4224 break;
3232 case 0x34: /* sysenter */ 4225 case 0x34: /* sysenter */
3233 rc = emulate_sysenter(ctxt, ops); 4226 rc = emulate_sysenter(ctxt, ops);
3234 if (rc != X86EMUL_CONTINUE)
3235 goto done;
3236 else
3237 goto writeback;
3238 break; 4227 break;
3239 case 0x35: /* sysexit */ 4228 case 0x35: /* sysexit */
3240 rc = emulate_sysexit(ctxt, ops); 4229 rc = emulate_sysexit(ctxt, ops);
3241 if (rc != X86EMUL_CONTINUE)
3242 goto done;
3243 else
3244 goto writeback;
3245 break; 4230 break;
3246 case 0x40 ... 0x4f: /* cmov */ 4231 case 0x40 ... 0x4f: /* cmov */
3247 c->dst.val = c->dst.orig_val = c->src.val; 4232 c->dst.val = c->dst.orig_val = c->src.val;
@@ -3251,15 +4236,15 @@ twobyte_insn:
3251 case 0x80 ... 0x8f: /* jnz rel, etc*/ 4236 case 0x80 ... 0x8f: /* jnz rel, etc*/
3252 if (test_cc(c->b, ctxt->eflags)) 4237 if (test_cc(c->b, ctxt->eflags))
3253 jmp_rel(c, c->src.val); 4238 jmp_rel(c, c->src.val);
3254 c->dst.type = OP_NONE; 4239 break;
4240 case 0x90 ... 0x9f: /* setcc r/m8 */
4241 c->dst.val = test_cc(c->b, ctxt->eflags);
3255 break; 4242 break;
3256 case 0xa0: /* push fs */ 4243 case 0xa0: /* push fs */
3257 emulate_push_sreg(ctxt, ops, VCPU_SREG_FS); 4244 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_FS);
3258 break; 4245 break;
3259 case 0xa1: /* pop fs */ 4246 case 0xa1: /* pop fs */
3260 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS); 4247 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
3261 if (rc != X86EMUL_CONTINUE)
3262 goto done;
3263 break; 4248 break;
3264 case 0xa3: 4249 case 0xa3:
3265 bt: /* bt */ 4250 bt: /* bt */
@@ -3273,17 +4258,13 @@ twobyte_insn:
3273 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 4258 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
3274 break; 4259 break;
3275 case 0xa8: /* push gs */ 4260 case 0xa8: /* push gs */
3276 emulate_push_sreg(ctxt, ops, VCPU_SREG_GS); 4261 rc = emulate_push_sreg(ctxt, ops, VCPU_SREG_GS);
3277 break; 4262 break;
3278 case 0xa9: /* pop gs */ 4263 case 0xa9: /* pop gs */
3279 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS); 4264 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
3280 if (rc != X86EMUL_CONTINUE)
3281 goto done;
3282 break; 4265 break;
3283 case 0xab: 4266 case 0xab:
3284 bts: /* bts */ 4267 bts: /* bts */
3285 /* only subword offset */
3286 c->src.val &= (c->dst.bytes << 3) - 1;
3287 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags); 4268 emulate_2op_SrcV_nobyte("bts", c->src, c->dst, ctxt->eflags);
3288 break; 4269 break;
3289 case 0xac: /* shrd imm8, r, r/m */ 4270 case 0xac: /* shrd imm8, r, r/m */
@@ -3306,15 +4287,22 @@ twobyte_insn:
3306 } else { 4287 } else {
3307 /* Failure: write the value we saw to EAX. */ 4288 /* Failure: write the value we saw to EAX. */
3308 c->dst.type = OP_REG; 4289 c->dst.type = OP_REG;
3309 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 4290 c->dst.addr.reg = (unsigned long *)&c->regs[VCPU_REGS_RAX];
3310 } 4291 }
3311 break; 4292 break;
4293 case 0xb2: /* lss */
4294 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_SS);
4295 break;
3312 case 0xb3: 4296 case 0xb3:
3313 btr: /* btr */ 4297 btr: /* btr */
3314 /* only subword offset */
3315 c->src.val &= (c->dst.bytes << 3) - 1;
3316 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags); 4298 emulate_2op_SrcV_nobyte("btr", c->src, c->dst, ctxt->eflags);
3317 break; 4299 break;
4300 case 0xb4: /* lfs */
4301 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_FS);
4302 break;
4303 case 0xb5: /* lgs */
4304 rc = emulate_load_segment(ctxt, ops, VCPU_SREG_GS);
4305 break;
3318 case 0xb6 ... 0xb7: /* movzx */ 4306 case 0xb6 ... 0xb7: /* movzx */
3319 c->dst.bytes = c->op_bytes; 4307 c->dst.bytes = c->op_bytes;
3320 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val 4308 c->dst.val = (c->d & ByteOp) ? (u8) c->src.val
@@ -3334,29 +4322,60 @@ twobyte_insn:
3334 break; 4322 break;
3335 case 0xbb: 4323 case 0xbb:
3336 btc: /* btc */ 4324 btc: /* btc */
3337 /* only subword offset */
3338 c->src.val &= (c->dst.bytes << 3) - 1;
3339 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags); 4325 emulate_2op_SrcV_nobyte("btc", c->src, c->dst, ctxt->eflags);
3340 break; 4326 break;
4327 case 0xbc: { /* bsf */
4328 u8 zf;
4329 __asm__ ("bsf %2, %0; setz %1"
4330 : "=r"(c->dst.val), "=q"(zf)
4331 : "r"(c->src.val));
4332 ctxt->eflags &= ~X86_EFLAGS_ZF;
4333 if (zf) {
4334 ctxt->eflags |= X86_EFLAGS_ZF;
4335 c->dst.type = OP_NONE; /* Disable writeback. */
4336 }
4337 break;
4338 }
4339 case 0xbd: { /* bsr */
4340 u8 zf;
4341 __asm__ ("bsr %2, %0; setz %1"
4342 : "=r"(c->dst.val), "=q"(zf)
4343 : "r"(c->src.val));
4344 ctxt->eflags &= ~X86_EFLAGS_ZF;
4345 if (zf) {
4346 ctxt->eflags |= X86_EFLAGS_ZF;
4347 c->dst.type = OP_NONE; /* Disable writeback. */
4348 }
4349 break;
4350 }
3341 case 0xbe ... 0xbf: /* movsx */ 4351 case 0xbe ... 0xbf: /* movsx */
3342 c->dst.bytes = c->op_bytes; 4352 c->dst.bytes = c->op_bytes;
3343 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val : 4353 c->dst.val = (c->d & ByteOp) ? (s8) c->src.val :
3344 (s16) c->src.val; 4354 (s16) c->src.val;
3345 break; 4355 break;
4356 case 0xc0 ... 0xc1: /* xadd */
4357 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
4358 /* Write back the register source. */
4359 c->src.val = c->dst.orig_val;
4360 write_register_operand(&c->src);
4361 break;
3346 case 0xc3: /* movnti */ 4362 case 0xc3: /* movnti */
3347 c->dst.bytes = c->op_bytes; 4363 c->dst.bytes = c->op_bytes;
3348 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val : 4364 c->dst.val = (c->op_bytes == 4) ? (u32) c->src.val :
3349 (u64) c->src.val; 4365 (u64) c->src.val;
3350 break; 4366 break;
3351 case 0xc7: /* Grp9 (cmpxchg8b) */ 4367 case 0xc7: /* Grp9 (cmpxchg8b) */
3352 rc = emulate_grp9(ctxt, ops); 4368 rc = em_grp9(ctxt);
3353 if (rc != X86EMUL_CONTINUE)
3354 goto done;
3355 break; 4369 break;
4370 default:
4371 goto cannot_emulate;
3356 } 4372 }
4373
4374 if (rc != X86EMUL_CONTINUE)
4375 goto done;
4376
3357 goto writeback; 4377 goto writeback;
3358 4378
3359cannot_emulate: 4379cannot_emulate:
3360 DPRINTF("Cannot emulate %02x\n", c->b); 4380 return EMULATION_FAILED;
3361 return -1;
3362} 4381}
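
The emulate.c hunks above converge on one error-propagation shape: each opcode case records its result in rc, and a single check against X86EMUL_CONTINUE after the switch decides whether to jump to done, instead of every case testing and branching on its own. A minimal stand-alone C sketch of that dispatch shape (illustrative only; emul_rc, handle_ret, handle_int3 and the EMUL_* names are invented for the example and are not the kernel's):

/* Illustrative sketch, not kernel code: handlers return a status and a
 * single post-switch check propagates any failure, mirroring the
 * "rc = em_xxx(ctxt); break;" pattern the patch introduces. */
#include <stdio.h>

enum emul_rc { EMUL_CONTINUE, EMUL_FAILED };

static enum emul_rc handle_ret(void)  { return EMUL_CONTINUE; }
static enum emul_rc handle_int3(void) { return EMUL_FAILED; }

static int dispatch(unsigned char opcode)
{
	enum emul_rc rc = EMUL_CONTINUE;

	switch (opcode) {
	case 0xc3:			/* ret */
		rc = handle_ret();
		break;
	case 0xcc:			/* int3 */
		rc = handle_int3();
		break;
	default:
		return -1;		/* cannot emulate */
	}

	if (rc != EMUL_CONTINUE)	/* one centralized check, as in the new code */
		return -1;		/* done */

	return 0;			/* fall through to writeback */
}

int main(void)
{
	printf("ret -> %d, int3 -> %d\n", dispatch(0xc3), dispatch(0xcc));
	return 0;
}
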
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index ddeb2314b522..efad72385058 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -5,7 +5,7 @@
5 * Copyright (c) 2006 Intel Corporation 5 * Copyright (c) 2006 Intel Corporation
6 * Copyright (c) 2007 Keir Fraser, XenSource Inc 6 * Copyright (c) 2007 Keir Fraser, XenSource Inc
7 * Copyright (c) 2008 Intel Corporation 7 * Copyright (c) 2008 Intel Corporation
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Permission is hereby granted, free of charge, to any person obtaining a copy 10 * Permission is hereby granted, free of charge, to any person obtaining a copy
11 * of this software and associated documentation files (the "Software"), to deal 11 * of this software and associated documentation files (the "Software"), to deal
@@ -232,15 +232,6 @@ static void pit_latch_status(struct kvm *kvm, int channel)
232 } 232 }
233} 233}
234 234
235int pit_has_pending_timer(struct kvm_vcpu *vcpu)
236{
237 struct kvm_pit *pit = vcpu->kvm->arch.vpit;
238
239 if (pit && kvm_vcpu_is_bsp(vcpu) && pit->pit_state.irq_ack)
240 return atomic_read(&pit->pit_state.pit_timer.pending);
241 return 0;
242}
243
244static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian) 235static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
245{ 236{
246 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 237 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index 46d08ca0b48f..51a97426e791 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -33,7 +33,6 @@ struct kvm_kpit_state {
33}; 33};
34 34
35struct kvm_pit { 35struct kvm_pit {
36 unsigned long base_addresss;
37 struct kvm_io_device dev; 36 struct kvm_io_device dev;
38 struct kvm_io_device speaker_dev; 37 struct kvm_io_device speaker_dev;
39 struct kvm *kvm; 38 struct kvm *kvm;
@@ -51,7 +50,6 @@ struct kvm_pit {
51#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100 50#define KVM_MAX_PIT_INTR_INTERVAL HZ / 100
52#define KVM_PIT_CHANNEL_MASK 0x3 51#define KVM_PIT_CHANNEL_MASK 0x3
53 52
54void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu);
55void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start); 53void kvm_pit_load_count(struct kvm *kvm, int channel, u32 val, int hpet_legacy_start);
56struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags); 54struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags);
57void kvm_free_pit(struct kvm *kvm); 55void kvm_free_pit(struct kvm *kvm);
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 4b7b73ce2098..19fe855e7953 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -3,7 +3,7 @@
3 * 3 *
4 * Copyright (c) 2003-2004 Fabrice Bellard 4 * Copyright (c) 2003-2004 Fabrice Bellard
5 * Copyright (c) 2007 Intel Corporation 5 * Copyright (c) 2007 Intel Corporation
6 * Copyright 2009 Red Hat, Inc. and/or its affilates. 6 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
7 * 7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal 9 * of this software and associated documentation files (the "Software"), to deal
@@ -39,7 +39,7 @@ static void pic_irq_request(struct kvm *kvm, int level);
39static void pic_lock(struct kvm_pic *s) 39static void pic_lock(struct kvm_pic *s)
40 __acquires(&s->lock) 40 __acquires(&s->lock)
41{ 41{
42 raw_spin_lock(&s->lock); 42 spin_lock(&s->lock);
43} 43}
44 44
45static void pic_unlock(struct kvm_pic *s) 45static void pic_unlock(struct kvm_pic *s)
@@ -51,7 +51,7 @@ static void pic_unlock(struct kvm_pic *s)
51 51
52 s->wakeup_needed = false; 52 s->wakeup_needed = false;
53 53
54 raw_spin_unlock(&s->lock); 54 spin_unlock(&s->lock);
55 55
56 if (wakeup) { 56 if (wakeup) {
57 kvm_for_each_vcpu(i, vcpu, s->kvm) { 57 kvm_for_each_vcpu(i, vcpu, s->kvm) {
@@ -62,11 +62,9 @@ static void pic_unlock(struct kvm_pic *s)
62 } 62 }
63 63
64 if (!found) 64 if (!found)
65 found = s->kvm->bsp_vcpu;
66
67 if (!found)
68 return; 65 return;
69 66
67 kvm_make_request(KVM_REQ_EVENT, found);
70 kvm_vcpu_kick(found); 68 kvm_vcpu_kick(found);
71 } 69 }
72} 70}
@@ -74,7 +72,6 @@ static void pic_unlock(struct kvm_pic *s)
74static void pic_clear_isr(struct kvm_kpic_state *s, int irq) 72static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
75{ 73{
76 s->isr &= ~(1 << irq); 74 s->isr &= ~(1 << irq);
77 s->isr_ack |= (1 << irq);
78 if (s != &s->pics_state->pics[0]) 75 if (s != &s->pics_state->pics[0])
79 irq += 8; 76 irq += 8;
80 /* 77 /*
@@ -88,16 +85,6 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
88 pic_lock(s->pics_state); 85 pic_lock(s->pics_state);
89} 86}
90 87
91void kvm_pic_clear_isr_ack(struct kvm *kvm)
92{
93 struct kvm_pic *s = pic_irqchip(kvm);
94
95 pic_lock(s);
96 s->pics[0].isr_ack = 0xff;
97 s->pics[1].isr_ack = 0xff;
98 pic_unlock(s);
99}
100
101/* 88/*
102 * set irq level. If an edge is detected, then the IRR is set to 1 89 * set irq level. If an edge is detected, then the IRR is set to 1
103 */ 90 */
@@ -280,7 +267,6 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
280 s->irr = 0; 267 s->irr = 0;
281 s->imr = 0; 268 s->imr = 0;
282 s->isr = 0; 269 s->isr = 0;
283 s->isr_ack = 0xff;
284 s->priority_add = 0; 270 s->priority_add = 0;
285 s->irq_base = 0; 271 s->irq_base = 0;
286 s->read_reg_select = 0; 272 s->read_reg_select = 0;
@@ -308,13 +294,17 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
308 addr &= 1; 294 addr &= 1;
309 if (addr == 0) { 295 if (addr == 0) {
310 if (val & 0x10) { 296 if (val & 0x10) {
311 kvm_pic_reset(s); /* init */
312 /*
313 * deassert a pending interrupt
314 */
315 pic_irq_request(s->pics_state->kvm, 0);
316 s->init_state = 1;
317 s->init4 = val & 1; 297 s->init4 = val & 1;
298 s->last_irr = 0;
299 s->imr = 0;
300 s->priority_add = 0;
301 s->special_mask = 0;
302 s->read_reg_select = 0;
303 if (!s->init4) {
304 s->special_fully_nested_mode = 0;
305 s->auto_eoi = 0;
306 }
307 s->init_state = 1;
318 if (val & 0x02) 308 if (val & 0x02)
319 printk(KERN_ERR "single mode not supported"); 309 printk(KERN_ERR "single mode not supported");
320 if (val & 0x08) 310 if (val & 0x08)
@@ -540,15 +530,11 @@ static int picdev_read(struct kvm_io_device *this,
540 */ 530 */
541static void pic_irq_request(struct kvm *kvm, int level) 531static void pic_irq_request(struct kvm *kvm, int level)
542{ 532{
543 struct kvm_vcpu *vcpu = kvm->bsp_vcpu;
544 struct kvm_pic *s = pic_irqchip(kvm); 533 struct kvm_pic *s = pic_irqchip(kvm);
545 int irq = pic_get_irq(&s->pics[0]);
546 534
547 s->output = level; 535 if (!s->output)
548 if (vcpu && level && (s->pics[0].isr_ack & (1 << irq))) {
549 s->pics[0].isr_ack &= ~(1 << irq);
550 s->wakeup_needed = true; 536 s->wakeup_needed = true;
551 } 537 s->output = level;
552} 538}
553 539
554static const struct kvm_io_device_ops picdev_ops = { 540static const struct kvm_io_device_ops picdev_ops = {
@@ -564,7 +550,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
564 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 550 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
565 if (!s) 551 if (!s)
566 return NULL; 552 return NULL;
567 raw_spin_lock_init(&s->lock); 553 spin_lock_init(&s->lock);
568 s->kvm = kvm; 554 s->kvm = kvm;
569 s->pics[0].elcr_mask = 0xf8; 555 s->pics[0].elcr_mask = 0xf8;
570 s->pics[1].elcr_mask = 0xde; 556 s->pics[1].elcr_mask = 0xde;
diff --git a/arch/x86/kvm/irq.c b/arch/x86/kvm/irq.c
index 2095a049835e..7e06ba1618bd 100644
--- a/arch/x86/kvm/irq.c
+++ b/arch/x86/kvm/irq.c
@@ -1,7 +1,7 @@
1/* 1/*
2 * irq.c: API for in kernel interrupt controller 2 * irq.c: API for in kernel interrupt controller
3 * Copyright (c) 2007, Intel Corporation. 3 * Copyright (c) 2007, Intel Corporation.
4 * Copyright 2009 Red Hat, Inc. and/or its affilates. 4 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
5 * 5 *
6 * This program is free software; you can redistribute it and/or modify it 6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms and conditions of the GNU General Public License, 7 * under the terms and conditions of the GNU General Public License,
@@ -33,12 +33,7 @@
33 */ 33 */
34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu) 34int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
35{ 35{
36 int ret; 36 return apic_has_pending_timer(vcpu);
37
38 ret = pit_has_pending_timer(vcpu);
39 ret |= apic_has_pending_timer(vcpu);
40
41 return ret;
42} 37}
43EXPORT_SYMBOL(kvm_cpu_has_pending_timer); 38EXPORT_SYMBOL(kvm_cpu_has_pending_timer);
44 39
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 63c314502993..53e2d084bffb 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -60,7 +60,7 @@ struct kvm_kpic_state {
60}; 60};
61 61
62struct kvm_pic { 62struct kvm_pic {
63 raw_spinlock_t lock; 63 spinlock_t lock;
64 bool wakeup_needed; 64 bool wakeup_needed;
65 unsigned pending_acks; 65 unsigned pending_acks;
66 struct kvm *kvm; 66 struct kvm *kvm;
@@ -75,7 +75,6 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm);
75void kvm_destroy_pic(struct kvm *kvm); 75void kvm_destroy_pic(struct kvm *kvm);
76int kvm_pic_read_irq(struct kvm *kvm); 76int kvm_pic_read_irq(struct kvm *kvm);
77void kvm_pic_update_irq(struct kvm_pic *s); 77void kvm_pic_update_irq(struct kvm_pic *s);
78void kvm_pic_clear_isr_ack(struct kvm *kvm);
79 78
80static inline struct kvm_pic *pic_irqchip(struct kvm *kvm) 79static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
81{ 80{
@@ -100,7 +99,6 @@ void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu);
100void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu); 99void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu);
101void __kvm_migrate_timers(struct kvm_vcpu *vcpu); 100void __kvm_migrate_timers(struct kvm_vcpu *vcpu);
102 101
103int pit_has_pending_timer(struct kvm_vcpu *vcpu);
104int apic_has_pending_timer(struct kvm_vcpu *vcpu); 102int apic_has_pending_timer(struct kvm_vcpu *vcpu);
105 103
106#endif 104#endif
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 6491ac8e755b..3377d53fcd36 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -42,7 +42,14 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
42 (unsigned long *)&vcpu->arch.regs_avail)) 42 (unsigned long *)&vcpu->arch.regs_avail))
43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR); 43 kvm_x86_ops->cache_reg(vcpu, VCPU_EXREG_PDPTR);
44 44
45 return vcpu->arch.pdptrs[index]; 45 return vcpu->arch.walk_mmu->pdptrs[index];
46}
47
48static inline u64 kvm_pdptr_read_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, int index)
49{
50 load_pdptrs(vcpu, mmu, mmu->get_cr3(vcpu));
51
52 return mmu->pdptrs[index];
46} 53}
47 54
48static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask) 55static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
@@ -66,6 +73,13 @@ static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
66 return vcpu->arch.cr4 & mask; 73 return vcpu->arch.cr4 & mask;
67} 74}
68 75
76static inline ulong kvm_read_cr3(struct kvm_vcpu *vcpu)
77{
78 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
79 kvm_x86_ops->decache_cr3(vcpu);
80 return vcpu->arch.cr3;
81}
82
69static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu) 83static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
70{ 84{
71 return kvm_read_cr4_bits(vcpu, ~0UL); 85 return kvm_read_cr4_bits(vcpu, ~0UL);
@@ -77,4 +91,19 @@ static inline u64 kvm_read_edx_eax(struct kvm_vcpu *vcpu)
77 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32); 91 | ((u64)(kvm_register_read(vcpu, VCPU_REGS_RDX) & -1u) << 32);
78} 92}
79 93
94static inline void enter_guest_mode(struct kvm_vcpu *vcpu)
95{
96 vcpu->arch.hflags |= HF_GUEST_MASK;
97}
98
99static inline void leave_guest_mode(struct kvm_vcpu *vcpu)
100{
101 vcpu->arch.hflags &= ~HF_GUEST_MASK;
102}
103
104static inline bool is_guest_mode(struct kvm_vcpu *vcpu)
105{
106 return vcpu->arch.hflags & HF_GUEST_MASK;
107}
108
80#endif 109#endif
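
The kvm_cache_regs.h hunk above adds kvm_read_cr3() in the same lazy style as the existing register helpers: the cached value is returned unless its availability bit is clear, in which case the backend is asked to refill it first. A small self-contained sketch of that valid-bit caching idea (decache() and REG_CR3 are invented stand-ins for illustration; regs_avail only mirrors the field used in the hunk):

/* Illustrative sketch, not kernel code: a read returns the cached value
 * unless the bit in regs_avail says the cache is stale, in which case a
 * backend refill (decache_cr3() in the real code) runs first. */
#include <stdio.h>

enum { REG_CR3 = 0, NR_REGS = 8 };

struct vcpu {
	unsigned long regs[NR_REGS];
	unsigned long regs_avail;	/* bit set => cached value is valid */
};

/* Stand-in for the hardware-specific backend read. */
static void decache(struct vcpu *v, int reg)
{
	v->regs[reg] = 0x1000;		/* pretend we read the real register */
	v->regs_avail |= 1UL << reg;
}

static unsigned long read_reg(struct vcpu *v, int reg)
{
	if (!(v->regs_avail & (1UL << reg)))
		decache(v, reg);	/* refill only on a cache miss */
	return v->regs[reg];
}

int main(void)
{
	struct vcpu v = { .regs_avail = 0 };

	printf("cr3 = %#lx\n", read_reg(&v, REG_CR3));	/* first read refills */
	printf("cr3 = %#lx\n", read_reg(&v, REG_CR3));	/* second read hits the cache */
	return 0;
}
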
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 77d8c0f4817d..2b2255b1f04b 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -5,7 +5,7 @@
5 * Copyright (C) 2006 Qumranet, Inc. 5 * Copyright (C) 2006 Qumranet, Inc.
6 * Copyright (C) 2007 Novell 6 * Copyright (C) 2007 Novell
7 * Copyright (C) 2007 Intel 7 * Copyright (C) 2007 Intel
8 * Copyright 2009 Red Hat, Inc. and/or its affilates. 8 * Copyright 2009 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Dor Laor <dor.laor@qumranet.com> 11 * Dor Laor <dor.laor@qumranet.com>
@@ -259,9 +259,10 @@ static inline int apic_find_highest_isr(struct kvm_lapic *apic)
259 259
260static void apic_update_ppr(struct kvm_lapic *apic) 260static void apic_update_ppr(struct kvm_lapic *apic)
261{ 261{
262 u32 tpr, isrv, ppr; 262 u32 tpr, isrv, ppr, old_ppr;
263 int isr; 263 int isr;
264 264
265 old_ppr = apic_get_reg(apic, APIC_PROCPRI);
265 tpr = apic_get_reg(apic, APIC_TASKPRI); 266 tpr = apic_get_reg(apic, APIC_TASKPRI);
266 isr = apic_find_highest_isr(apic); 267 isr = apic_find_highest_isr(apic);
267 isrv = (isr != -1) ? isr : 0; 268 isrv = (isr != -1) ? isr : 0;
@@ -274,7 +275,11 @@ static void apic_update_ppr(struct kvm_lapic *apic)
274 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x", 275 apic_debug("vlapic %p, ppr 0x%x, isr 0x%x, isrv 0x%x",
275 apic, ppr, isr, isrv); 276 apic, ppr, isr, isrv);
276 277
277 apic_set_reg(apic, APIC_PROCPRI, ppr); 278 if (old_ppr != ppr) {
279 apic_set_reg(apic, APIC_PROCPRI, ppr);
280 if (ppr < old_ppr)
281 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
282 }
278} 283}
279 284
280static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr) 285static void apic_set_tpr(struct kvm_lapic *apic, u32 tpr)
@@ -391,6 +396,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
391 break; 396 break;
392 } 397 }
393 398
399 kvm_make_request(KVM_REQ_EVENT, vcpu);
394 kvm_vcpu_kick(vcpu); 400 kvm_vcpu_kick(vcpu);
395 break; 401 break;
396 402
@@ -411,11 +417,8 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
411 case APIC_DM_INIT: 417 case APIC_DM_INIT:
412 if (level) { 418 if (level) {
413 result = 1; 419 result = 1;
414 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
415 printk(KERN_DEBUG
416 "INIT on a runnable vcpu %d\n",
417 vcpu->vcpu_id);
418 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; 420 vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
421 kvm_make_request(KVM_REQ_EVENT, vcpu);
419 kvm_vcpu_kick(vcpu); 422 kvm_vcpu_kick(vcpu);
420 } else { 423 } else {
421 apic_debug("Ignoring de-assert INIT to vcpu %d\n", 424 apic_debug("Ignoring de-assert INIT to vcpu %d\n",
@@ -430,6 +433,7 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
430 result = 1; 433 result = 1;
431 vcpu->arch.sipi_vector = vector; 434 vcpu->arch.sipi_vector = vector;
432 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED; 435 vcpu->arch.mp_state = KVM_MP_STATE_SIPI_RECEIVED;
436 kvm_make_request(KVM_REQ_EVENT, vcpu);
433 kvm_vcpu_kick(vcpu); 437 kvm_vcpu_kick(vcpu);
434 } 438 }
435 break; 439 break;
@@ -475,6 +479,7 @@ static void apic_set_eoi(struct kvm_lapic *apic)
475 trigger_mode = IOAPIC_EDGE_TRIG; 479 trigger_mode = IOAPIC_EDGE_TRIG;
476 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) 480 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
477 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 481 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
482 kvm_make_request(KVM_REQ_EVENT, apic->vcpu);
478} 483}
479 484
480static void apic_send_ipi(struct kvm_lapic *apic) 485static void apic_send_ipi(struct kvm_lapic *apic)
@@ -866,8 +871,8 @@ void kvm_free_lapic(struct kvm_vcpu *vcpu)
866 871
867 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer); 872 hrtimer_cancel(&vcpu->arch.apic->lapic_timer.timer);
868 873
869 if (vcpu->arch.apic->regs_page) 874 if (vcpu->arch.apic->regs)
870 __free_page(vcpu->arch.apic->regs_page); 875 free_page((unsigned long)vcpu->arch.apic->regs);
871 876
872 kfree(vcpu->arch.apic); 877 kfree(vcpu->arch.apic);
873} 878}
@@ -1056,14 +1061,12 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu)
1056 1061
1057 vcpu->arch.apic = apic; 1062 vcpu->arch.apic = apic;
1058 1063
1059 apic->regs_page = alloc_page(GFP_KERNEL); 1064 apic->regs = (void *)get_zeroed_page(GFP_KERNEL);
1060 if (apic->regs_page == NULL) { 1065 if (!apic->regs) {
1061 printk(KERN_ERR "malloc apic regs error for vcpu %x\n", 1066 printk(KERN_ERR "malloc apic regs error for vcpu %x\n",
1062 vcpu->vcpu_id); 1067 vcpu->vcpu_id);
1063 goto nomem_free_apic; 1068 goto nomem_free_apic;
1064 } 1069 }
1065 apic->regs = page_address(apic->regs_page);
1066 memset(apic->regs, 0, PAGE_SIZE);
1067 apic->vcpu = vcpu; 1070 apic->vcpu = vcpu;
1068 1071
1069 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC, 1072 hrtimer_init(&apic->lapic_timer.timer, CLOCK_MONOTONIC,
@@ -1152,6 +1155,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1152 update_divide_count(apic); 1155 update_divide_count(apic);
1153 start_apic_timer(apic); 1156 start_apic_timer(apic);
1154 apic->irr_pending = true; 1157 apic->irr_pending = true;
1158 kvm_make_request(KVM_REQ_EVENT, vcpu);
1155} 1159}
1156 1160
1157void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
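
A pattern repeated through the lapic.c hunks above is that every site which can newly make an interrupt deliverable (IPI delivery, INIT/SIPI, EOI, a PPR drop, state restore) now calls kvm_make_request(KVM_REQ_EVENT, vcpu) before kicking the vcpu, so the run loop knows to re-evaluate pending events. A self-contained sketch of that request-bit handshake (make_request, check_request and REQ_EVENT are invented names for the example, not the kernel's API):

/* Illustrative sketch, not kernel code: a producer sets a request bit and
 * kicks the consumer, which later tests and clears the bit.  This mirrors
 * the kvm_make_request()/KVM_REQ_EVENT usage added in the hunks above. */
#include <stdatomic.h>
#include <stdio.h>

enum { REQ_EVENT = 0 };

struct vcpu {
	atomic_ulong requests;
};

static void make_request(struct vcpu *v, int req)
{
	atomic_fetch_or(&v->requests, 1UL << req);
	/* a real implementation would also kick/wake the vcpu thread here */
}

static int check_request(struct vcpu *v, int req)
{
	unsigned long bit = 1UL << req;

	if (!(atomic_load(&v->requests) & bit))
		return 0;
	atomic_fetch_and(&v->requests, ~bit);	/* consume the request */
	return 1;
}

int main(void)
{
	struct vcpu v;

	atomic_init(&v.requests, 0);
	make_request(&v, REQ_EVENT);		/* e.g. after an EOI or an IPI */
	printf("pending: %d\n", check_request(&v, REQ_EVENT));	/* 1 */
	printf("pending: %d\n", check_request(&v, REQ_EVENT));	/* 0 */
	return 0;
}
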
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index f5fe32c5edad..52c9e6b9e725 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -13,7 +13,6 @@ struct kvm_lapic {
13 u32 divide_count; 13 u32 divide_count;
14 struct kvm_vcpu *vcpu; 14 struct kvm_vcpu *vcpu;
15 bool irr_pending; 15 bool irr_pending;
16 struct page *regs_page;
17 void *regs; 16 void *regs;
18 gpa_t vapic_addr; 17 gpa_t vapic_addr;
19 struct page *vapic_page; 18 struct page *vapic_page;
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 311f6dad8951..aee38623b768 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -18,9 +18,11 @@
18 * 18 *
19 */ 19 */
20 20
21#include "irq.h"
21#include "mmu.h" 22#include "mmu.h"
22#include "x86.h" 23#include "x86.h"
23#include "kvm_cache_regs.h" 24#include "kvm_cache_regs.h"
25#include "x86.h"
24 26
25#include <linux/kvm_host.h> 27#include <linux/kvm_host.h>
26#include <linux/types.h> 28#include <linux/types.h>
@@ -49,15 +51,25 @@
49 */ 51 */
50bool tdp_enabled = false; 52bool tdp_enabled = false;
51 53
52#undef MMU_DEBUG 54enum {
55 AUDIT_PRE_PAGE_FAULT,
56 AUDIT_POST_PAGE_FAULT,
57 AUDIT_PRE_PTE_WRITE,
58 AUDIT_POST_PTE_WRITE,
59 AUDIT_PRE_SYNC,
60 AUDIT_POST_SYNC
61};
53 62
54#undef AUDIT 63char *audit_point_name[] = {
64 "pre page fault",
65 "post page fault",
66 "pre pte write",
67 "post pte write",
68 "pre sync",
69 "post sync"
70};
55 71
56#ifdef AUDIT 72#undef MMU_DEBUG
57static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg);
58#else
59static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
60#endif
61 73
62#ifdef MMU_DEBUG 74#ifdef MMU_DEBUG
63 75
@@ -71,7 +83,7 @@ static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) {}
71 83
72#endif 84#endif
73 85
74#if defined(MMU_DEBUG) || defined(AUDIT) 86#ifdef MMU_DEBUG
75static int dbg = 0; 87static int dbg = 0;
76module_param(dbg, bool, 0644); 88module_param(dbg, bool, 0644);
77#endif 89#endif
@@ -89,6 +101,8 @@ module_param(oos_shadow, bool, 0644);
89 } 101 }
90#endif 102#endif
91 103
104#define PTE_PREFETCH_NUM 8
105
92#define PT_FIRST_AVAIL_BITS_SHIFT 9 106#define PT_FIRST_AVAIL_BITS_SHIFT 9
93#define PT64_SECOND_AVAIL_BITS_SHIFT 52 107#define PT64_SECOND_AVAIL_BITS_SHIFT 52
94 108
@@ -97,9 +111,6 @@ module_param(oos_shadow, bool, 0644);
97#define PT64_LEVEL_SHIFT(level) \ 111#define PT64_LEVEL_SHIFT(level) \
98 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS) 112 (PAGE_SHIFT + (level - 1) * PT64_LEVEL_BITS)
99 113
100#define PT64_LEVEL_MASK(level) \
101 (((1ULL << PT64_LEVEL_BITS) - 1) << PT64_LEVEL_SHIFT(level))
102
103#define PT64_INDEX(address, level)\ 114#define PT64_INDEX(address, level)\
104 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1)) 115 (((address) >> PT64_LEVEL_SHIFT(level)) & ((1 << PT64_LEVEL_BITS) - 1))
105 116
@@ -109,8 +120,6 @@ module_param(oos_shadow, bool, 0644);
109#define PT32_LEVEL_SHIFT(level) \ 120#define PT32_LEVEL_SHIFT(level) \
110 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS) 121 (PAGE_SHIFT + (level - 1) * PT32_LEVEL_BITS)
111 122
112#define PT32_LEVEL_MASK(level) \
113 (((1ULL << PT32_LEVEL_BITS) - 1) << PT32_LEVEL_SHIFT(level))
114#define PT32_LVL_OFFSET_MASK(level) \ 123#define PT32_LVL_OFFSET_MASK(level) \
115 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \ 124 (PT32_BASE_ADDR_MASK & ((1ULL << (PAGE_SHIFT + (((level) - 1) \
116 * PT32_LEVEL_BITS))) - 1)) 125 * PT32_LEVEL_BITS))) - 1))
@@ -178,10 +187,10 @@ typedef void (*mmu_parent_walk_fn) (struct kvm_mmu_page *sp, u64 *spte);
178static struct kmem_cache *pte_chain_cache; 187static struct kmem_cache *pte_chain_cache;
179static struct kmem_cache *rmap_desc_cache; 188static struct kmem_cache *rmap_desc_cache;
180static struct kmem_cache *mmu_page_header_cache; 189static struct kmem_cache *mmu_page_header_cache;
190static struct percpu_counter kvm_total_used_mmu_pages;
181 191
182static u64 __read_mostly shadow_trap_nonpresent_pte; 192static u64 __read_mostly shadow_trap_nonpresent_pte;
183static u64 __read_mostly shadow_notrap_nonpresent_pte; 193static u64 __read_mostly shadow_notrap_nonpresent_pte;
184static u64 __read_mostly shadow_base_present_pte;
185static u64 __read_mostly shadow_nx_mask; 194static u64 __read_mostly shadow_nx_mask;
186static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */ 195static u64 __read_mostly shadow_x_mask; /* mutual exclusive with nx_mask */
187static u64 __read_mostly shadow_user_mask; 196static u64 __read_mostly shadow_user_mask;
@@ -200,12 +209,6 @@ void kvm_mmu_set_nonpresent_ptes(u64 trap_pte, u64 notrap_pte)
200} 209}
201EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes); 210EXPORT_SYMBOL_GPL(kvm_mmu_set_nonpresent_ptes);
202 211
203void kvm_mmu_set_base_ptes(u64 base_pte)
204{
205 shadow_base_present_pte = base_pte;
206}
207EXPORT_SYMBOL_GPL(kvm_mmu_set_base_ptes);
208
209void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask, 212void kvm_mmu_set_mask_ptes(u64 user_mask, u64 accessed_mask,
210 u64 dirty_mask, u64 nx_mask, u64 x_mask) 213 u64 dirty_mask, u64 nx_mask, u64 x_mask)
211{ 214{
@@ -299,18 +302,50 @@ static u64 __xchg_spte(u64 *sptep, u64 new_spte)
299#endif 302#endif
300} 303}
301 304
305static bool spte_has_volatile_bits(u64 spte)
306{
307 if (!shadow_accessed_mask)
308 return false;
309
310 if (!is_shadow_present_pte(spte))
311 return false;
312
313 if ((spte & shadow_accessed_mask) &&
314 (!is_writable_pte(spte) || (spte & shadow_dirty_mask)))
315 return false;
316
317 return true;
318}
319
320static bool spte_is_bit_cleared(u64 old_spte, u64 new_spte, u64 bit_mask)
321{
322 return (old_spte & bit_mask) && !(new_spte & bit_mask);
323}
324
302static void update_spte(u64 *sptep, u64 new_spte) 325static void update_spte(u64 *sptep, u64 new_spte)
303{ 326{
304 u64 old_spte; 327 u64 mask, old_spte = *sptep;
328
329 WARN_ON(!is_rmap_spte(new_spte));
330
331 new_spte |= old_spte & shadow_dirty_mask;
332
333 mask = shadow_accessed_mask;
334 if (is_writable_pte(old_spte))
335 mask |= shadow_dirty_mask;
305 336
306 if (!shadow_accessed_mask || (new_spte & shadow_accessed_mask) || 337 if (!spte_has_volatile_bits(old_spte) || (new_spte & mask) == mask)
307 !is_rmap_spte(*sptep))
308 __set_spte(sptep, new_spte); 338 __set_spte(sptep, new_spte);
309 else { 339 else
310 old_spte = __xchg_spte(sptep, new_spte); 340 old_spte = __xchg_spte(sptep, new_spte);
311 if (old_spte & shadow_accessed_mask) 341
312 mark_page_accessed(pfn_to_page(spte_to_pfn(old_spte))); 342 if (!shadow_accessed_mask)
313 } 343 return;
344
345 if (spte_is_bit_cleared(old_spte, new_spte, shadow_accessed_mask))
346 kvm_set_pfn_accessed(spte_to_pfn(old_spte));
347 if (spte_is_bit_cleared(old_spte, new_spte, shadow_dirty_mask))
348 kvm_set_pfn_dirty(spte_to_pfn(old_spte));
314} 349}
315 350
316static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache, 351static int mmu_topup_memory_cache(struct kvm_mmu_memory_cache *cache,
@@ -339,15 +374,15 @@ static void mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc,
339static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache, 374static int mmu_topup_memory_cache_page(struct kvm_mmu_memory_cache *cache,
340 int min) 375 int min)
341{ 376{
342 struct page *page; 377 void *page;
343 378
344 if (cache->nobjs >= min) 379 if (cache->nobjs >= min)
345 return 0; 380 return 0;
346 while (cache->nobjs < ARRAY_SIZE(cache->objects)) { 381 while (cache->nobjs < ARRAY_SIZE(cache->objects)) {
347 page = alloc_page(GFP_KERNEL); 382 page = (void *)__get_free_page(GFP_KERNEL);
348 if (!page) 383 if (!page)
349 return -ENOMEM; 384 return -ENOMEM;
350 cache->objects[cache->nobjs++] = page_address(page); 385 cache->objects[cache->nobjs++] = page;
351 } 386 }
352 return 0; 387 return 0;
353} 388}
@@ -367,7 +402,7 @@ static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu)
367 if (r) 402 if (r)
368 goto out; 403 goto out;
369 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache, 404 r = mmu_topup_memory_cache(&vcpu->arch.mmu_rmap_desc_cache,
370 rmap_desc_cache, 4); 405 rmap_desc_cache, 4 + PTE_PREFETCH_NUM);
371 if (r) 406 if (r)
372 goto out; 407 goto out;
373 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8); 408 r = mmu_topup_memory_cache_page(&vcpu->arch.mmu_page_cache, 8);
@@ -437,46 +472,46 @@ static void kvm_mmu_page_set_gfn(struct kvm_mmu_page *sp, int index, gfn_t gfn)
437} 472}
438 473
439/* 474/*
440 * Return the pointer to the largepage write count for a given 475 * Return the pointer to the large page information for a given gfn,
441 * gfn, handling slots that are not large page aligned. 476 * handling slots that are not large page aligned.
442 */ 477 */
443static int *slot_largepage_idx(gfn_t gfn, 478static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
444 struct kvm_memory_slot *slot, 479 struct kvm_memory_slot *slot,
445 int level) 480 int level)
446{ 481{
447 unsigned long idx; 482 unsigned long idx;
448 483
449 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 484 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) -
450 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); 485 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
451 return &slot->lpage_info[level - 2][idx].write_count; 486 return &slot->lpage_info[level - 2][idx];
452} 487}
453 488
454static void account_shadowed(struct kvm *kvm, gfn_t gfn) 489static void account_shadowed(struct kvm *kvm, gfn_t gfn)
455{ 490{
456 struct kvm_memory_slot *slot; 491 struct kvm_memory_slot *slot;
457 int *write_count; 492 struct kvm_lpage_info *linfo;
458 int i; 493 int i;
459 494
460 slot = gfn_to_memslot(kvm, gfn); 495 slot = gfn_to_memslot(kvm, gfn);
461 for (i = PT_DIRECTORY_LEVEL; 496 for (i = PT_DIRECTORY_LEVEL;
462 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 497 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
463 write_count = slot_largepage_idx(gfn, slot, i); 498 linfo = lpage_info_slot(gfn, slot, i);
464 *write_count += 1; 499 linfo->write_count += 1;
465 } 500 }
466} 501}
467 502
468static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn) 503static void unaccount_shadowed(struct kvm *kvm, gfn_t gfn)
469{ 504{
470 struct kvm_memory_slot *slot; 505 struct kvm_memory_slot *slot;
471 int *write_count; 506 struct kvm_lpage_info *linfo;
472 int i; 507 int i;
473 508
474 slot = gfn_to_memslot(kvm, gfn); 509 slot = gfn_to_memslot(kvm, gfn);
475 for (i = PT_DIRECTORY_LEVEL; 510 for (i = PT_DIRECTORY_LEVEL;
476 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) { 511 i < PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES; ++i) {
477 write_count = slot_largepage_idx(gfn, slot, i); 512 linfo = lpage_info_slot(gfn, slot, i);
478 *write_count -= 1; 513 linfo->write_count -= 1;
479 WARN_ON(*write_count < 0); 514 WARN_ON(linfo->write_count < 0);
480 } 515 }
481} 516}
482 517
@@ -485,12 +520,12 @@ static int has_wrprotected_page(struct kvm *kvm,
485 int level) 520 int level)
486{ 521{
487 struct kvm_memory_slot *slot; 522 struct kvm_memory_slot *slot;
488 int *largepage_idx; 523 struct kvm_lpage_info *linfo;
489 524
490 slot = gfn_to_memslot(kvm, gfn); 525 slot = gfn_to_memslot(kvm, gfn);
491 if (slot) { 526 if (slot) {
492 largepage_idx = slot_largepage_idx(gfn, slot, level); 527 linfo = lpage_info_slot(gfn, slot, level);
493 return *largepage_idx; 528 return linfo->write_count;
494 } 529 }
495 530
496 return 1; 531 return 1;
@@ -514,14 +549,28 @@ static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
514 return ret; 549 return ret;
515} 550}
516 551
517static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 552static struct kvm_memory_slot *
553gfn_to_memslot_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t gfn,
554 bool no_dirty_log)
518{ 555{
519 struct kvm_memory_slot *slot; 556 struct kvm_memory_slot *slot;
520 int host_level, level, max_level;
521 557
522 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 558 slot = gfn_to_memslot(vcpu->kvm, gfn);
523 if (slot && slot->dirty_bitmap) 559 if (!slot || slot->flags & KVM_MEMSLOT_INVALID ||
524 return PT_PAGE_TABLE_LEVEL; 560 (no_dirty_log && slot->dirty_bitmap))
561 slot = NULL;
562
563 return slot;
564}
565
566static bool mapping_level_dirty_bitmap(struct kvm_vcpu *vcpu, gfn_t large_gfn)
567{
568 return !gfn_to_memslot_dirty_bitmap(vcpu, large_gfn, true);
569}
570
571static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
572{
573 int host_level, level, max_level;
525 574
526 host_level = host_mapping_level(vcpu->kvm, large_gfn); 575 host_level = host_mapping_level(vcpu->kvm, large_gfn);
527 576
@@ -545,16 +594,15 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
545static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level) 594static unsigned long *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, int level)
546{ 595{
547 struct kvm_memory_slot *slot; 596 struct kvm_memory_slot *slot;
548 unsigned long idx; 597 struct kvm_lpage_info *linfo;
549 598
550 slot = gfn_to_memslot(kvm, gfn); 599 slot = gfn_to_memslot(kvm, gfn);
551 if (likely(level == PT_PAGE_TABLE_LEVEL)) 600 if (likely(level == PT_PAGE_TABLE_LEVEL))
552 return &slot->rmap[gfn - slot->base_gfn]; 601 return &slot->rmap[gfn - slot->base_gfn];
553 602
554 idx = (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - 603 linfo = lpage_info_slot(gfn, slot, level);
555 (slot->base_gfn >> KVM_HPAGE_GFN_SHIFT(level));
556 604
557 return &slot->lpage_info[level - 2][idx].rmap_pde; 605 return &linfo->rmap_pde;
558} 606}
559 607
560/* 608/*
@@ -591,6 +639,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
591 desc->sptes[0] = (u64 *)*rmapp; 639 desc->sptes[0] = (u64 *)*rmapp;
592 desc->sptes[1] = spte; 640 desc->sptes[1] = spte;
593 *rmapp = (unsigned long)desc | 1; 641 *rmapp = (unsigned long)desc | 1;
642 ++count;
594 } else { 643 } else {
595 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte); 644 rmap_printk("rmap_add: %p %llx many->many\n", spte, *spte);
596 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 645 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
@@ -603,7 +652,7 @@ static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
603 desc = desc->more; 652 desc = desc->more;
604 } 653 }
605 for (i = 0; desc->sptes[i]; ++i) 654 for (i = 0; desc->sptes[i]; ++i)
606 ; 655 ++count;
607 desc->sptes[i] = spte; 656 desc->sptes[i] = spte;
608 } 657 }
609 return count; 658 return count;
@@ -645,18 +694,17 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
645 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); 694 gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt);
646 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level); 695 rmapp = gfn_to_rmap(kvm, gfn, sp->role.level);
647 if (!*rmapp) { 696 if (!*rmapp) {
648 printk(KERN_ERR "rmap_remove: %p %llx 0->BUG\n", spte, *spte); 697 printk(KERN_ERR "rmap_remove: %p 0->BUG\n", spte);
649 BUG(); 698 BUG();
650 } else if (!(*rmapp & 1)) { 699 } else if (!(*rmapp & 1)) {
651 rmap_printk("rmap_remove: %p %llx 1->0\n", spte, *spte); 700 rmap_printk("rmap_remove: %p 1->0\n", spte);
652 if ((u64 *)*rmapp != spte) { 701 if ((u64 *)*rmapp != spte) {
653 printk(KERN_ERR "rmap_remove: %p %llx 1->BUG\n", 702 printk(KERN_ERR "rmap_remove: %p 1->BUG\n", spte);
654 spte, *spte);
655 BUG(); 703 BUG();
656 } 704 }
657 *rmapp = 0; 705 *rmapp = 0;
658 } else { 706 } else {
659 rmap_printk("rmap_remove: %p %llx many->many\n", spte, *spte); 707 rmap_printk("rmap_remove: %p many->many\n", spte);
660 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul); 708 desc = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
661 prev_desc = NULL; 709 prev_desc = NULL;
662 while (desc) { 710 while (desc) {
@@ -670,35 +718,36 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
670 prev_desc = desc; 718 prev_desc = desc;
671 desc = desc->more; 719 desc = desc->more;
672 } 720 }
673 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte); 721 pr_err("rmap_remove: %p many->many\n", spte);
674 BUG(); 722 BUG();
675 } 723 }
676} 724}
677 725
678static void set_spte_track_bits(u64 *sptep, u64 new_spte) 726static int set_spte_track_bits(u64 *sptep, u64 new_spte)
679{ 727{
680 pfn_t pfn; 728 pfn_t pfn;
681 u64 old_spte = *sptep; 729 u64 old_spte = *sptep;
682 730
683 if (!shadow_accessed_mask || !is_shadow_present_pte(old_spte) || 731 if (!spte_has_volatile_bits(old_spte))
684 old_spte & shadow_accessed_mask) {
685 __set_spte(sptep, new_spte); 732 __set_spte(sptep, new_spte);
686 } else 733 else
687 old_spte = __xchg_spte(sptep, new_spte); 734 old_spte = __xchg_spte(sptep, new_spte);
688 735
689 if (!is_rmap_spte(old_spte)) 736 if (!is_rmap_spte(old_spte))
690 return; 737 return 0;
738
691 pfn = spte_to_pfn(old_spte); 739 pfn = spte_to_pfn(old_spte);
692 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask) 740 if (!shadow_accessed_mask || old_spte & shadow_accessed_mask)
693 kvm_set_pfn_accessed(pfn); 741 kvm_set_pfn_accessed(pfn);
694 if (is_writable_pte(old_spte)) 742 if (!shadow_dirty_mask || (old_spte & shadow_dirty_mask))
695 kvm_set_pfn_dirty(pfn); 743 kvm_set_pfn_dirty(pfn);
744 return 1;
696} 745}
697 746
698static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte) 747static void drop_spte(struct kvm *kvm, u64 *sptep, u64 new_spte)
699{ 748{
700 set_spte_track_bits(sptep, new_spte); 749 if (set_spte_track_bits(sptep, new_spte))
701 rmap_remove(kvm, sptep); 750 rmap_remove(kvm, sptep);
702} 751}
703 752
704static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte) 753static u64 *rmap_next(struct kvm *kvm, unsigned long *rmapp, u64 *spte)
@@ -746,13 +795,6 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
746 } 795 }
747 spte = rmap_next(kvm, rmapp, spte); 796 spte = rmap_next(kvm, rmapp, spte);
748 } 797 }
749 if (write_protected) {
750 pfn_t pfn;
751
752 spte = rmap_next(kvm, rmapp, NULL);
753 pfn = spte_to_pfn(*spte);
754 kvm_set_pfn_dirty(pfn);
755 }
756 798
757 /* check for huge page mappings */ 799 /* check for huge page mappings */
758 for (i = PT_DIRECTORY_LEVEL; 800 for (i = PT_DIRECTORY_LEVEL;
@@ -848,19 +890,16 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
848 end = start + (memslot->npages << PAGE_SHIFT); 890 end = start + (memslot->npages << PAGE_SHIFT);
849 if (hva >= start && hva < end) { 891 if (hva >= start && hva < end) {
850 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 892 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
893 gfn_t gfn = memslot->base_gfn + gfn_offset;
851 894
852 ret = handler(kvm, &memslot->rmap[gfn_offset], data); 895 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
853 896
854 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 897 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
855 unsigned long idx; 898 struct kvm_lpage_info *linfo;
856 int sh; 899
857 900 linfo = lpage_info_slot(gfn, memslot,
858 sh = KVM_HPAGE_GFN_SHIFT(PT_DIRECTORY_LEVEL+j); 901 PT_DIRECTORY_LEVEL + j);
859 idx = ((memslot->base_gfn+gfn_offset) >> sh) - 902 ret |= handler(kvm, &linfo->rmap_pde, data);
860 (memslot->base_gfn >> sh);
861 ret |= handler(kvm,
862 &memslot->lpage_info[j][idx].rmap_pde,
863 data);
864 } 903 }
865 trace_kvm_age_page(hva, memslot, ret); 904 trace_kvm_age_page(hva, memslot, ret);
866 retval |= ret; 905 retval |= ret;
@@ -911,6 +950,35 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
911 return young; 950 return young;
912} 951}
913 952
953static int kvm_test_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
954 unsigned long data)
955{
956 u64 *spte;
957 int young = 0;
958
959 /*
960 * If there's no access bit in the secondary pte set by the
961 * hardware it's up to gup-fast/gup to set the access bit in
962 * the primary pte or in the page structure.
963 */
964 if (!shadow_accessed_mask)
965 goto out;
966
967 spte = rmap_next(kvm, rmapp, NULL);
968 while (spte) {
969 u64 _spte = *spte;
970 BUG_ON(!(_spte & PT_PRESENT_MASK));
971 young = _spte & PT_ACCESSED_MASK;
972 if (young) {
973 young = 1;
974 break;
975 }
976 spte = rmap_next(kvm, rmapp, spte);
977 }
978out:
979 return young;
980}
981
914#define RMAP_RECYCLE_THRESHOLD 1000 982#define RMAP_RECYCLE_THRESHOLD 1000
915 983
916static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) 984static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn)
@@ -931,6 +999,11 @@ int kvm_age_hva(struct kvm *kvm, unsigned long hva)
931 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp); 999 return kvm_handle_hva(kvm, hva, 0, kvm_age_rmapp);
932} 1000}
933 1001
1002int kvm_test_age_hva(struct kvm *kvm, unsigned long hva)
1003{
1004 return kvm_handle_hva(kvm, hva, 0, kvm_test_age_rmapp);
1005}
1006
934#ifdef MMU_DEBUG 1007#ifdef MMU_DEBUG
935static int is_empty_shadow_page(u64 *spt) 1008static int is_empty_shadow_page(u64 *spt)
936{ 1009{
@@ -947,16 +1020,28 @@ static int is_empty_shadow_page(u64 *spt)
947} 1020}
948#endif 1021#endif
949 1022
1023/*
1024 * This value is the sum of all of the kvm instances'
1025 * kvm->arch.n_used_mmu_pages values. We need a global,
1026 * aggregate version in order to make the slab shrinker
1027 * faster.
1028 */
1029static inline void kvm_mod_used_mmu_pages(struct kvm *kvm, int nr)
1030{
1031 kvm->arch.n_used_mmu_pages += nr;
1032 percpu_counter_add(&kvm_total_used_mmu_pages, nr);
1033}
1034
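kvm->arch.n_used_mmu_pages is per VM, while kvm_total_used_mmu_pages (declared elsewhere in this file as a percpu_counter) aggregates it across all VMs so the shrinker can read one counter instead of walking every VM. A rough user-space model of the invariant this helper maintains, with invented names and plain longs instead of a percpu_counter:

	#include <assert.h>
	#include <stdio.h>

	#define NR_VMS 3

	static long n_used[NR_VMS];	/* models kvm->arch.n_used_mmu_pages */
	static long total_used;		/* models kvm_total_used_mmu_pages   */

	static void model_mod_used_pages(int vm, int nr)
	{
		n_used[vm] += nr;	/* per-VM count, under that VM's mmu_lock */
		total_used += nr;	/* global aggregate read by the shrinker  */
	}

	int main(void)
	{
		long sum = 0;
		int i;

		model_mod_used_pages(0, +1);
		model_mod_used_pages(1, +4);
		model_mod_used_pages(1, -1);

		for (i = 0; i < NR_VMS; i++)
			sum += n_used[i];
		assert(sum == total_used);	/* the invariant being kept */
		printf("total mmu pages in use: %ld\n", total_used);
		return 0;
	}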
950static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp) 1035static void kvm_mmu_free_page(struct kvm *kvm, struct kvm_mmu_page *sp)
951{ 1036{
952 ASSERT(is_empty_shadow_page(sp->spt)); 1037 ASSERT(is_empty_shadow_page(sp->spt));
953 hlist_del(&sp->hash_link); 1038 hlist_del(&sp->hash_link);
954 list_del(&sp->link); 1039 list_del(&sp->link);
955 __free_page(virt_to_page(sp->spt)); 1040 free_page((unsigned long)sp->spt);
956 if (!sp->role.direct) 1041 if (!sp->role.direct)
957 __free_page(virt_to_page(sp->gfns)); 1042 free_page((unsigned long)sp->gfns);
958 kmem_cache_free(mmu_page_header_cache, sp); 1043 kmem_cache_free(mmu_page_header_cache, sp);
959 ++kvm->arch.n_free_mmu_pages; 1044 kvm_mod_used_mmu_pages(kvm, -1);
960} 1045}
961 1046
962static unsigned kvm_page_table_hashfn(gfn_t gfn) 1047static unsigned kvm_page_table_hashfn(gfn_t gfn)
@@ -979,7 +1064,7 @@ static struct kvm_mmu_page *kvm_mmu_alloc_page(struct kvm_vcpu *vcpu,
979 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS); 1064 bitmap_zero(sp->slot_bitmap, KVM_MEMORY_SLOTS + KVM_PRIVATE_MEM_SLOTS);
980 sp->multimapped = 0; 1065 sp->multimapped = 0;
981 sp->parent_pte = parent_pte; 1066 sp->parent_pte = parent_pte;
982 --vcpu->kvm->arch.n_free_mmu_pages; 1067 kvm_mod_used_mmu_pages(vcpu->kvm, +1);
983 return sp; 1068 return sp;
984} 1069}
985 1070
@@ -1110,7 +1195,7 @@ static void nonpaging_prefetch_page(struct kvm_vcpu *vcpu,
1110} 1195}
1111 1196
1112static int nonpaging_sync_page(struct kvm_vcpu *vcpu, 1197static int nonpaging_sync_page(struct kvm_vcpu *vcpu,
1113 struct kvm_mmu_page *sp, bool clear_unsync) 1198 struct kvm_mmu_page *sp)
1114{ 1199{
1115 return 1; 1200 return 1;
1116} 1201}
@@ -1119,6 +1204,13 @@ static void nonpaging_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
1119{ 1204{
1120} 1205}
1121 1206
1207static void nonpaging_update_pte(struct kvm_vcpu *vcpu,
1208 struct kvm_mmu_page *sp, u64 *spte,
1209 const void *pte)
1210{
1211 WARN_ON(1);
1212}
1213
1122#define KVM_PAGE_ARRAY_NR 16 1214#define KVM_PAGE_ARRAY_NR 16
1123 1215
1124struct kvm_mmu_pages { 1216struct kvm_mmu_pages {
@@ -1240,7 +1332,7 @@ static int __kvm_sync_page(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
1240 if (clear_unsync) 1332 if (clear_unsync)
1241 kvm_unlink_unsync_page(vcpu->kvm, sp); 1333 kvm_unlink_unsync_page(vcpu->kvm, sp);
1242 1334
1243 if (vcpu->arch.mmu.sync_page(vcpu, sp, clear_unsync)) { 1335 if (vcpu->arch.mmu.sync_page(vcpu, sp)) {
1244 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list); 1336 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, invalid_list);
1245 return 1; 1337 return 1;
1246 } 1338 }
@@ -1281,12 +1373,12 @@ static void kvm_sync_pages(struct kvm_vcpu *vcpu, gfn_t gfn)
1281 continue; 1373 continue;
1282 1374
1283 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL); 1375 WARN_ON(s->role.level != PT_PAGE_TABLE_LEVEL);
1376 kvm_unlink_unsync_page(vcpu->kvm, s);
1284 if ((s->role.cr4_pae != !!is_pae(vcpu)) || 1377 if ((s->role.cr4_pae != !!is_pae(vcpu)) ||
1285 (vcpu->arch.mmu.sync_page(vcpu, s, true))) { 1378 (vcpu->arch.mmu.sync_page(vcpu, s))) {
1286 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list); 1379 kvm_mmu_prepare_zap_page(vcpu->kvm, s, &invalid_list);
1287 continue; 1380 continue;
1288 } 1381 }
1289 kvm_unlink_unsync_page(vcpu->kvm, s);
1290 flush = true; 1382 flush = true;
1291 } 1383 }
1292 1384
@@ -1403,7 +1495,8 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
1403 if (role.direct) 1495 if (role.direct)
1404 role.cr4_pae = 0; 1496 role.cr4_pae = 0;
1405 role.access = access; 1497 role.access = access;
1406 if (!tdp_enabled && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) { 1498 if (!vcpu->arch.mmu.direct_map
1499 && vcpu->arch.mmu.root_level <= PT32_ROOT_LEVEL) {
1407 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level)); 1500 quadrant = gaddr >> (PAGE_SHIFT + (PT64_PT_BITS * level));
1408 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1; 1501 quadrant &= (1 << ((PT32_PT_BITS - PT64_PT_BITS) * level)) - 1;
1409 role.quadrant = quadrant; 1502 role.quadrant = quadrant;
@@ -1458,6 +1551,12 @@ static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
1458 iterator->addr = addr; 1551 iterator->addr = addr;
1459 iterator->shadow_addr = vcpu->arch.mmu.root_hpa; 1552 iterator->shadow_addr = vcpu->arch.mmu.root_hpa;
1460 iterator->level = vcpu->arch.mmu.shadow_root_level; 1553 iterator->level = vcpu->arch.mmu.shadow_root_level;
1554
1555 if (iterator->level == PT64_ROOT_LEVEL &&
1556 vcpu->arch.mmu.root_level < PT64_ROOT_LEVEL &&
1557 !vcpu->arch.mmu.direct_map)
1558 --iterator->level;
1559
1461 if (iterator->level == PT32E_ROOT_LEVEL) { 1560 if (iterator->level == PT32E_ROOT_LEVEL) {
1462 iterator->shadow_addr 1561 iterator->shadow_addr
1463 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3]; 1562 = vcpu->arch.mmu.pae_root[(addr >> 30) & 3];
@@ -1665,41 +1764,31 @@ static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1665 1764
1666/* 1765/*
1667 * Changing the number of mmu pages allocated to the vm 1766 * Changing the number of mmu pages allocated to the vm
1668 * Note: if kvm_nr_mmu_pages is too small, you will get dead lock 1767 * Note: if goal_nr_mmu_pages is too small, you will get dead lock
1669 */ 1768 */
1670void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages) 1769void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int goal_nr_mmu_pages)
1671{ 1770{
1672 int used_pages;
1673 LIST_HEAD(invalid_list); 1771 LIST_HEAD(invalid_list);
1674
1675 used_pages = kvm->arch.n_alloc_mmu_pages - kvm->arch.n_free_mmu_pages;
1676 used_pages = max(0, used_pages);
1677
1678 /* 1772 /*
1679 * If we set the number of mmu pages to be smaller than the 1773 * If we set the number of mmu pages to be smaller than the
1680 * number of active pages, we must free some mmu pages before we 1774 * number of active pages, we must free some mmu pages before we
1681 * change the value. 1775 * change the value.
1682 */ 1776 */
1683 1777
1684 if (used_pages > kvm_nr_mmu_pages) { 1778 if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
1685 while (used_pages > kvm_nr_mmu_pages && 1779 while (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages &&
1686 !list_empty(&kvm->arch.active_mmu_pages)) { 1780 !list_empty(&kvm->arch.active_mmu_pages)) {
1687 struct kvm_mmu_page *page; 1781 struct kvm_mmu_page *page;
1688 1782
1689 page = container_of(kvm->arch.active_mmu_pages.prev, 1783 page = container_of(kvm->arch.active_mmu_pages.prev,
1690 struct kvm_mmu_page, link); 1784 struct kvm_mmu_page, link);
1691 used_pages -= kvm_mmu_prepare_zap_page(kvm, page, 1785 kvm_mmu_prepare_zap_page(kvm, page, &invalid_list);
1692 &invalid_list); 1786 kvm_mmu_commit_zap_page(kvm, &invalid_list);
1693 } 1787 }
1694 kvm_mmu_commit_zap_page(kvm, &invalid_list); 1788 goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
1695 kvm_nr_mmu_pages = used_pages;
1696 kvm->arch.n_free_mmu_pages = 0;
1697 } 1789 }
1698 else
1699 kvm->arch.n_free_mmu_pages += kvm_nr_mmu_pages
1700 - kvm->arch.n_alloc_mmu_pages;
1701 1790
1702 kvm->arch.n_alloc_mmu_pages = kvm_nr_mmu_pages; 1791 kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
1703} 1792}
1704 1793
1705static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn) 1794static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
@@ -1709,11 +1798,11 @@ static int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
1709 LIST_HEAD(invalid_list); 1798 LIST_HEAD(invalid_list);
1710 int r; 1799 int r;
1711 1800
1712 pgprintk("%s: looking for gfn %lx\n", __func__, gfn); 1801 pgprintk("%s: looking for gfn %llx\n", __func__, gfn);
1713 r = 0; 1802 r = 0;
1714 1803
1715 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1804 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1716 pgprintk("%s: gfn %lx role %x\n", __func__, gfn, 1805 pgprintk("%s: gfn %llx role %x\n", __func__, gfn,
1717 sp->role.word); 1806 sp->role.word);
1718 r = 1; 1807 r = 1;
1719 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1808 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
@@ -1729,7 +1818,7 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1729 LIST_HEAD(invalid_list); 1818 LIST_HEAD(invalid_list);
1730 1819
1731 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) { 1820 for_each_gfn_indirect_valid_sp(kvm, sp, gfn, node) {
1732 pgprintk("%s: zap %lx %x\n", 1821 pgprintk("%s: zap %llx %x\n",
1733 __func__, gfn, sp->role.word); 1822 __func__, gfn, sp->role.word);
1734 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list); 1823 kvm_mmu_prepare_zap_page(kvm, sp, &invalid_list);
1735 } 1824 }
@@ -1915,9 +2004,9 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1915 unsigned pte_access, int user_fault, 2004 unsigned pte_access, int user_fault,
1916 int write_fault, int dirty, int level, 2005 int write_fault, int dirty, int level,
1917 gfn_t gfn, pfn_t pfn, bool speculative, 2006 gfn_t gfn, pfn_t pfn, bool speculative,
1918 bool can_unsync, bool reset_host_protection) 2007 bool can_unsync, bool host_writable)
1919{ 2008{
1920 u64 spte; 2009 u64 spte, entry = *sptep;
1921 int ret = 0; 2010 int ret = 0;
1922 2011
1923 /* 2012 /*
@@ -1925,7 +2014,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1925 * whether the guest actually used the pte (in order to detect 2014 * whether the guest actually used the pte (in order to detect
1926 * demand paging). 2015 * demand paging).
1927 */ 2016 */
1928 spte = shadow_base_present_pte | shadow_dirty_mask; 2017 spte = PT_PRESENT_MASK;
1929 if (!speculative) 2018 if (!speculative)
1930 spte |= shadow_accessed_mask; 2019 spte |= shadow_accessed_mask;
1931 if (!dirty) 2020 if (!dirty)
@@ -1942,14 +2031,16 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1942 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn, 2031 spte |= kvm_x86_ops->get_mt_mask(vcpu, gfn,
1943 kvm_is_mmio_pfn(pfn)); 2032 kvm_is_mmio_pfn(pfn));
1944 2033
1945 if (reset_host_protection) 2034 if (host_writable)
1946 spte |= SPTE_HOST_WRITEABLE; 2035 spte |= SPTE_HOST_WRITEABLE;
2036 else
2037 pte_access &= ~ACC_WRITE_MASK;
1947 2038
1948 spte |= (u64)pfn << PAGE_SHIFT; 2039 spte |= (u64)pfn << PAGE_SHIFT;
1949 2040
1950 if ((pte_access & ACC_WRITE_MASK) 2041 if ((pte_access & ACC_WRITE_MASK)
1951 || (!tdp_enabled && write_fault && !is_write_protection(vcpu) 2042 || (!vcpu->arch.mmu.direct_map && write_fault
1952 && !user_fault)) { 2043 && !is_write_protection(vcpu) && !user_fault)) {
1953 2044
1954 if (level > PT_PAGE_TABLE_LEVEL && 2045 if (level > PT_PAGE_TABLE_LEVEL &&
1955 has_wrprotected_page(vcpu->kvm, gfn, level)) { 2046 has_wrprotected_page(vcpu->kvm, gfn, level)) {
@@ -1960,7 +2051,8 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1960 2051
1961 spte |= PT_WRITABLE_MASK; 2052 spte |= PT_WRITABLE_MASK;
1962 2053
1963 if (!tdp_enabled && !(pte_access & ACC_WRITE_MASK)) 2054 if (!vcpu->arch.mmu.direct_map
2055 && !(pte_access & ACC_WRITE_MASK))
1964 spte &= ~PT_USER_MASK; 2056 spte &= ~PT_USER_MASK;
1965 2057
1966 /* 2058 /*
@@ -1973,7 +2065,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1973 goto set_pte; 2065 goto set_pte;
1974 2066
1975 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 2067 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
1976 pgprintk("%s: found shadow page for %lx, marking ro\n", 2068 pgprintk("%s: found shadow page for %llx, marking ro\n",
1977 __func__, gfn); 2069 __func__, gfn);
1978 ret = 1; 2070 ret = 1;
1979 pte_access &= ~ACC_WRITE_MASK; 2071 pte_access &= ~ACC_WRITE_MASK;
@@ -1986,9 +2078,15 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1986 mark_page_dirty(vcpu->kvm, gfn); 2078 mark_page_dirty(vcpu->kvm, gfn);
1987 2079
1988set_pte: 2080set_pte:
1989 if (is_writable_pte(*sptep) && !is_writable_pte(spte))
1990 kvm_set_pfn_dirty(pfn);
1991 update_spte(sptep, spte); 2081 update_spte(sptep, spte);
2082 /*
2083 * If we overwrite a writable spte with a read-only one, we
2084 * should flush remote TLBs. Otherwise rmap_write_protect
2085 * will find a read-only spte, even though the writable spte
2086 * might be cached on a CPU's TLB.
2087 */
2088 if (is_writable_pte(entry) && !is_writable_pte(*sptep))
2089 kvm_flush_remote_tlbs(vcpu->kvm);
1992done: 2090done:
1993 return ret; 2091 return ret;
1994} 2092}
@@ -1998,13 +2096,13 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1998 int user_fault, int write_fault, int dirty, 2096 int user_fault, int write_fault, int dirty,
1999 int *ptwrite, int level, gfn_t gfn, 2097 int *ptwrite, int level, gfn_t gfn,
2000 pfn_t pfn, bool speculative, 2098 pfn_t pfn, bool speculative,
2001 bool reset_host_protection) 2099 bool host_writable)
2002{ 2100{
2003 int was_rmapped = 0; 2101 int was_rmapped = 0;
2004 int rmap_count; 2102 int rmap_count;
2005 2103
2006 pgprintk("%s: spte %llx access %x write_fault %d" 2104 pgprintk("%s: spte %llx access %x write_fault %d"
2007 " user_fault %d gfn %lx\n", 2105 " user_fault %d gfn %llx\n",
2008 __func__, *sptep, pt_access, 2106 __func__, *sptep, pt_access,
2009 write_fault, user_fault, gfn); 2107 write_fault, user_fault, gfn);
2010 2108
@@ -2023,7 +2121,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2023 __set_spte(sptep, shadow_trap_nonpresent_pte); 2121 __set_spte(sptep, shadow_trap_nonpresent_pte);
2024 kvm_flush_remote_tlbs(vcpu->kvm); 2122 kvm_flush_remote_tlbs(vcpu->kvm);
2025 } else if (pfn != spte_to_pfn(*sptep)) { 2123 } else if (pfn != spte_to_pfn(*sptep)) {
2026 pgprintk("hfn old %lx new %lx\n", 2124 pgprintk("hfn old %llx new %llx\n",
2027 spte_to_pfn(*sptep), pfn); 2125 spte_to_pfn(*sptep), pfn);
2028 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte); 2126 drop_spte(vcpu->kvm, sptep, shadow_trap_nonpresent_pte);
2029 kvm_flush_remote_tlbs(vcpu->kvm); 2127 kvm_flush_remote_tlbs(vcpu->kvm);
@@ -2033,14 +2131,14 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
2033 2131
2034 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault, 2132 if (set_spte(vcpu, sptep, pte_access, user_fault, write_fault,
2035 dirty, level, gfn, pfn, speculative, true, 2133 dirty, level, gfn, pfn, speculative, true,
2036 reset_host_protection)) { 2134 host_writable)) {
2037 if (write_fault) 2135 if (write_fault)
2038 *ptwrite = 1; 2136 *ptwrite = 1;
2039 kvm_mmu_flush_tlb(vcpu); 2137 kvm_mmu_flush_tlb(vcpu);
2040 } 2138 }
2041 2139
2042 pgprintk("%s: setting spte %llx\n", __func__, *sptep); 2140 pgprintk("%s: setting spte %llx\n", __func__, *sptep);
2043 pgprintk("instantiating %s PTE (%s) at %ld (%llx) addr %p\n", 2141 pgprintk("instantiating %s PTE (%s) at %llx (%llx) addr %p\n",
2044 is_large_pte(*sptep)? "2MB" : "4kB", 2142 is_large_pte(*sptep)? "2MB" : "4kB",
2045 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn, 2143 *sptep & PT_PRESENT_MASK ?"RW":"R", gfn,
2046 *sptep, sptep); 2144 *sptep, sptep);
@@ -2064,8 +2162,95 @@ static void nonpaging_new_cr3(struct kvm_vcpu *vcpu)
2064{ 2162{
2065} 2163}
2066 2164
2165static pfn_t pte_prefetch_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn,
2166 bool no_dirty_log)
2167{
2168 struct kvm_memory_slot *slot;
2169 unsigned long hva;
2170
2171 slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log);
2172 if (!slot) {
2173 get_page(bad_page);
2174 return page_to_pfn(bad_page);
2175 }
2176
2177 hva = gfn_to_hva_memslot(slot, gfn);
2178
2179 return hva_to_pfn_atomic(vcpu->kvm, hva);
2180}
2181
2182static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2183 struct kvm_mmu_page *sp,
2184 u64 *start, u64 *end)
2185{
2186 struct page *pages[PTE_PREFETCH_NUM];
2187 unsigned access = sp->role.access;
2188 int i, ret;
2189 gfn_t gfn;
2190
2191 gfn = kvm_mmu_page_get_gfn(sp, start - sp->spt);
2192 if (!gfn_to_memslot_dirty_bitmap(vcpu, gfn, access & ACC_WRITE_MASK))
2193 return -1;
2194
2195 ret = gfn_to_page_many_atomic(vcpu->kvm, gfn, pages, end - start);
2196 if (ret <= 0)
2197 return -1;
2198
2199 for (i = 0; i < ret; i++, gfn++, start++)
2200 mmu_set_spte(vcpu, start, ACC_ALL,
2201 access, 0, 0, 1, NULL,
2202 sp->role.level, gfn,
2203 page_to_pfn(pages[i]), true, true);
2204
2205 return 0;
2206}
2207
2208static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
2209 struct kvm_mmu_page *sp, u64 *sptep)
2210{
2211 u64 *spte, *start = NULL;
2212 int i;
2213
2214 WARN_ON(!sp->role.direct);
2215
2216 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
2217 spte = sp->spt + i;
2218
2219 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
2220 if (*spte != shadow_trap_nonpresent_pte || spte == sptep) {
2221 if (!start)
2222 continue;
2223 if (direct_pte_prefetch_many(vcpu, sp, start, spte) < 0)
2224 break;
2225 start = NULL;
2226 } else if (!start)
2227 start = spte;
2228 }
2229}
2230
2231static void direct_pte_prefetch(struct kvm_vcpu *vcpu, u64 *sptep)
2232{
2233 struct kvm_mmu_page *sp;
2234
2235 /*
2236 * Since there is no accessed bit on EPT, there is no way to
2237 * distinguish between actually accessed translations
2238 * and prefetched ones, so disable pte prefetch if EPT is
2239 * enabled.
2240 */
2241 if (!shadow_accessed_mask)
2242 return;
2243
2244 sp = page_header(__pa(sptep));
2245 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
2246 return;
2247
2248 __direct_pte_prefetch(vcpu, sp, sptep);
2249}
2250
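__direct_pte_prefetch() above scans the naturally aligned, PTE_PREFETCH_NUM-sized window of sptes around the faulting one; the index arithmetic only works because PTE_PREFETCH_NUM is a power of two. PTE_PREFETCH_NUM is defined earlier in this file; the value 8 below is assumed purely for illustration. A small stand-alone check of that alignment step:

	#include <stdio.h>

	#define PTE_PREFETCH_NUM 8	/* assumed value, for illustration only */

	int main(void)
	{
		unsigned idx;

		for (idx = 0; idx < 20; idx++) {
			/* Same shape as "i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1)":
			 * round the faulting index down to the start of its window. */
			unsigned start = idx & ~(PTE_PREFETCH_NUM - 1);

			printf("spte index %2u -> prefetch window [%2u, %2u)\n",
			       idx, start, start + PTE_PREFETCH_NUM);
		}
		return 0;
	}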
2067static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write, 2251static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2068 int level, gfn_t gfn, pfn_t pfn) 2252 int map_writable, int level, gfn_t gfn, pfn_t pfn,
2253 bool prefault)
2069{ 2254{
2070 struct kvm_shadow_walk_iterator iterator; 2255 struct kvm_shadow_walk_iterator iterator;
2071 struct kvm_mmu_page *sp; 2256 struct kvm_mmu_page *sp;
@@ -2074,9 +2259,12 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2074 2259
2075 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) { 2260 for_each_shadow_entry(vcpu, (u64)gfn << PAGE_SHIFT, iterator) {
2076 if (iterator.level == level) { 2261 if (iterator.level == level) {
2077 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, ACC_ALL, 2262 unsigned pte_access = ACC_ALL;
2263
2264 mmu_set_spte(vcpu, iterator.sptep, ACC_ALL, pte_access,
2078 0, write, 1, &pt_write, 2265 0, write, 1, &pt_write,
2079 level, gfn, pfn, false, true); 2266 level, gfn, pfn, prefault, map_writable);
2267 direct_pte_prefetch(vcpu, iterator.sptep);
2080 ++vcpu->stat.pf_fixed; 2268 ++vcpu->stat.pf_fixed;
2081 break; 2269 break;
2082 } 2270 }
@@ -2098,28 +2286,31 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t v, int write,
2098 __set_spte(iterator.sptep, 2286 __set_spte(iterator.sptep,
2099 __pa(sp->spt) 2287 __pa(sp->spt)
2100 | PT_PRESENT_MASK | PT_WRITABLE_MASK 2288 | PT_PRESENT_MASK | PT_WRITABLE_MASK
2101 | shadow_user_mask | shadow_x_mask); 2289 | shadow_user_mask | shadow_x_mask
2290 | shadow_accessed_mask);
2102 } 2291 }
2103 } 2292 }
2104 return pt_write; 2293 return pt_write;
2105} 2294}
2106 2295
2107static void kvm_send_hwpoison_signal(struct kvm *kvm, gfn_t gfn) 2296static void kvm_send_hwpoison_signal(unsigned long address, struct task_struct *tsk)
2108{ 2297{
2109 char buf[1]; 2298 siginfo_t info;
2110 void __user *hva;
2111 int r;
2112 2299
2113 /* Touch the page, so send SIGBUS */ 2300 info.si_signo = SIGBUS;
2114 hva = (void __user *)gfn_to_hva(kvm, gfn); 2301 info.si_errno = 0;
2115 r = copy_from_user(buf, hva, 1); 2302 info.si_code = BUS_MCEERR_AR;
2303 info.si_addr = (void __user *)address;
2304 info.si_addr_lsb = PAGE_SHIFT;
2305
2306 send_sig_info(SIGBUS, &info, tsk);
2116} 2307}
2117 2308
2118static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn) 2309static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2119{ 2310{
2120 kvm_release_pfn_clean(pfn); 2311 kvm_release_pfn_clean(pfn);
2121 if (is_hwpoison_pfn(pfn)) { 2312 if (is_hwpoison_pfn(pfn)) {
2122 kvm_send_hwpoison_signal(kvm, gfn); 2313 kvm_send_hwpoison_signal(gfn_to_hva(kvm, gfn), current);
2123 return 0; 2314 return 0;
2124 } else if (is_fault_pfn(pfn)) 2315 } else if (is_fault_pfn(pfn))
2125 return -EFAULT; 2316 return -EFAULT;
@@ -2127,27 +2318,81 @@ static int kvm_handle_bad_page(struct kvm *kvm, gfn_t gfn, pfn_t pfn)
2127 return 1; 2318 return 1;
2128} 2319}
2129 2320
2130static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn) 2321static void transparent_hugepage_adjust(struct kvm_vcpu *vcpu,
2322 gfn_t *gfnp, pfn_t *pfnp, int *levelp)
2323{
2324 pfn_t pfn = *pfnp;
2325 gfn_t gfn = *gfnp;
2326 int level = *levelp;
2327
2328 /*
2329 * Check if it's a transparent hugepage. If this were a
2330 * hugetlbfs page, level would not be set to
2331 * PT_PAGE_TABLE_LEVEL and there would be no adjustment done
2332 * here.
2333 */
2334 if (!is_error_pfn(pfn) && !kvm_is_mmio_pfn(pfn) &&
2335 level == PT_PAGE_TABLE_LEVEL &&
2336 PageTransCompound(pfn_to_page(pfn)) &&
2337 !has_wrprotected_page(vcpu->kvm, gfn, PT_DIRECTORY_LEVEL)) {
2338 unsigned long mask;
2339 /*
2340 * mmu_notifier_retry was successful and we hold the
2341 * mmu_lock here, so the pmd can't start splitting
2342 * from under us, and in turn
2343 * __split_huge_page_refcount() can't run from under
2344 * us, so we can safely transfer the refcount from
2345 * PG_tail to PG_head as we switch the pfn from tail
2346 * to head.
2347 */
2348 *levelp = level = PT_DIRECTORY_LEVEL;
2349 mask = KVM_PAGES_PER_HPAGE(level) - 1;
2350 VM_BUG_ON((gfn & mask) != (pfn & mask));
2351 if (pfn & mask) {
2352 gfn &= ~mask;
2353 *gfnp = gfn;
2354 kvm_release_pfn_clean(pfn);
2355 pfn &= ~mask;
2356 if (!get_page_unless_zero(pfn_to_page(pfn)))
2357 BUG();
2358 *pfnp = pfn;
2359 }
2360 }
2361}
2362
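The alignment step in transparent_hugepage_adjust() relies on the host THP and the guest mapping being congruent modulo the huge-page size; for PT_DIRECTORY_LEVEL mappings KVM_PAGES_PER_HPAGE(level) is 512 on x86, so mask is 0x1ff. A worked stand-alone example of the same arithmetic (the gfn/pfn values are made up for illustration):

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		const uint64_t pages_per_hpage = 512;		/* 2MB / 4KB on x86 */
		const uint64_t mask = pages_per_hpage - 1;	/* 0x1ff */
		uint64_t gfn = 0x10123, pfn = 0x40123;		/* same offset inside the huge page */

		/* The VM_BUG_ON in the patch checks exactly this congruence. */
		if ((gfn & mask) != (pfn & mask)) {
			fprintf(stderr, "gfn/pfn not congruent, no adjustment possible\n");
			return 1;
		}

		/* Round both down to the start of the 2MB region, as the patch does. */
		gfn &= ~mask;
		pfn &= ~mask;
		printf("mapping gfn 0x%llx at pfn 0x%llx as one 2MB page\n",
		       (unsigned long long)gfn, (unsigned long long)pfn);
		return 0;
	}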
2363static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2364 gva_t gva, pfn_t *pfn, bool write, bool *writable);
2365
2366static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn,
2367 bool prefault)
2131{ 2368{
2132 int r; 2369 int r;
2133 int level; 2370 int level;
2371 int force_pt_level;
2134 pfn_t pfn; 2372 pfn_t pfn;
2135 unsigned long mmu_seq; 2373 unsigned long mmu_seq;
2374 bool map_writable;
2136 2375
2137 level = mapping_level(vcpu, gfn); 2376 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2138 2377 if (likely(!force_pt_level)) {
2139 /* 2378 level = mapping_level(vcpu, gfn);
2140 * This path builds a PAE pagetable - so we can map 2mb pages at 2379 /*
2141 * maximum. Therefore check if the level is larger than that. 2380 * This path builds a PAE pagetable - so we can map
2142 */ 2381 * 2mb pages at maximum. Therefore check if the level
2143 if (level > PT_DIRECTORY_LEVEL) 2382 * is larger than that.
2144 level = PT_DIRECTORY_LEVEL; 2383 */
2384 if (level > PT_DIRECTORY_LEVEL)
2385 level = PT_DIRECTORY_LEVEL;
2145 2386
2146 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2387 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2388 } else
2389 level = PT_PAGE_TABLE_LEVEL;
2147 2390
2148 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2391 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2149 smp_rmb(); 2392 smp_rmb();
2150 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2393
2394 if (try_async_pf(vcpu, prefault, gfn, v, &pfn, write, &map_writable))
2395 return 0;
2151 2396
2152 /* mmio */ 2397 /* mmio */
2153 if (is_error_pfn(pfn)) 2398 if (is_error_pfn(pfn))
@@ -2157,7 +2402,10 @@ static int nonpaging_map(struct kvm_vcpu *vcpu, gva_t v, int write, gfn_t gfn)
2157 if (mmu_notifier_retry(vcpu, mmu_seq)) 2402 if (mmu_notifier_retry(vcpu, mmu_seq))
2158 goto out_unlock; 2403 goto out_unlock;
2159 kvm_mmu_free_some_pages(vcpu); 2404 kvm_mmu_free_some_pages(vcpu);
2160 r = __direct_map(vcpu, v, write, level, gfn, pfn); 2405 if (likely(!force_pt_level))
2406 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2407 r = __direct_map(vcpu, v, write, map_writable, level, gfn, pfn,
2408 prefault);
2161 spin_unlock(&vcpu->kvm->mmu_lock); 2409 spin_unlock(&vcpu->kvm->mmu_lock);
2162 2410
2163 2411
@@ -2179,7 +2427,9 @@ static void mmu_free_roots(struct kvm_vcpu *vcpu)
2179 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2427 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2180 return; 2428 return;
2181 spin_lock(&vcpu->kvm->mmu_lock); 2429 spin_lock(&vcpu->kvm->mmu_lock);
2182 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2430 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL &&
2431 (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL ||
2432 vcpu->arch.mmu.direct_map)) {
2183 hpa_t root = vcpu->arch.mmu.root_hpa; 2433 hpa_t root = vcpu->arch.mmu.root_hpa;
2184 2434
2185 sp = page_header(root); 2435 sp = page_header(root);
@@ -2222,83 +2472,163 @@ static int mmu_check_root(struct kvm_vcpu *vcpu, gfn_t root_gfn)
2222 return ret; 2472 return ret;
2223} 2473}
2224 2474
2225static int mmu_alloc_roots(struct kvm_vcpu *vcpu) 2475static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
2226{ 2476{
2227 int i;
2228 gfn_t root_gfn;
2229 struct kvm_mmu_page *sp; 2477 struct kvm_mmu_page *sp;
2230 int direct = 0; 2478 unsigned i;
2231 u64 pdptr;
2232
2233 root_gfn = vcpu->arch.cr3 >> PAGE_SHIFT;
2234 2479
2235 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2480 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2481 spin_lock(&vcpu->kvm->mmu_lock);
2482 kvm_mmu_free_some_pages(vcpu);
2483 sp = kvm_mmu_get_page(vcpu, 0, 0, PT64_ROOT_LEVEL,
2484 1, ACC_ALL, NULL);
2485 ++sp->root_count;
2486 spin_unlock(&vcpu->kvm->mmu_lock);
2487 vcpu->arch.mmu.root_hpa = __pa(sp->spt);
2488 } else if (vcpu->arch.mmu.shadow_root_level == PT32E_ROOT_LEVEL) {
2489 for (i = 0; i < 4; ++i) {
2490 hpa_t root = vcpu->arch.mmu.pae_root[i];
2491
2492 ASSERT(!VALID_PAGE(root));
2493 spin_lock(&vcpu->kvm->mmu_lock);
2494 kvm_mmu_free_some_pages(vcpu);
2495 sp = kvm_mmu_get_page(vcpu, i << (30 - PAGE_SHIFT),
2496 i << 30,
2497 PT32_ROOT_LEVEL, 1, ACC_ALL,
2498 NULL);
2499 root = __pa(sp->spt);
2500 ++sp->root_count;
2501 spin_unlock(&vcpu->kvm->mmu_lock);
2502 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK;
2503 }
2504 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2505 } else
2506 BUG();
2507
2508 return 0;
2509}
2510
2511static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
2512{
2513 struct kvm_mmu_page *sp;
2514 u64 pdptr, pm_mask;
2515 gfn_t root_gfn;
2516 int i;
2517
2518 root_gfn = vcpu->arch.mmu.get_cr3(vcpu) >> PAGE_SHIFT;
2519
2520 if (mmu_check_root(vcpu, root_gfn))
2521 return 1;
2522
2523 /*
2524 * Do we shadow a long mode page table? If so we need to
2525 * write-protect the guest's page table root.
2526 */
2527 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2236 hpa_t root = vcpu->arch.mmu.root_hpa; 2528 hpa_t root = vcpu->arch.mmu.root_hpa;
2237 2529
2238 ASSERT(!VALID_PAGE(root)); 2530 ASSERT(!VALID_PAGE(root));
2239 if (mmu_check_root(vcpu, root_gfn)) 2531
2240 return 1;
2241 if (tdp_enabled) {
2242 direct = 1;
2243 root_gfn = 0;
2244 }
2245 spin_lock(&vcpu->kvm->mmu_lock); 2532 spin_lock(&vcpu->kvm->mmu_lock);
2246 kvm_mmu_free_some_pages(vcpu); 2533 kvm_mmu_free_some_pages(vcpu);
2247 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, 2534 sp = kvm_mmu_get_page(vcpu, root_gfn, 0, PT64_ROOT_LEVEL,
2248 PT64_ROOT_LEVEL, direct, 2535 0, ACC_ALL, NULL);
2249 ACC_ALL, NULL);
2250 root = __pa(sp->spt); 2536 root = __pa(sp->spt);
2251 ++sp->root_count; 2537 ++sp->root_count;
2252 spin_unlock(&vcpu->kvm->mmu_lock); 2538 spin_unlock(&vcpu->kvm->mmu_lock);
2253 vcpu->arch.mmu.root_hpa = root; 2539 vcpu->arch.mmu.root_hpa = root;
2254 return 0; 2540 return 0;
2255 } 2541 }
2256 direct = !is_paging(vcpu); 2542
2543 /*
2544 * We shadow a 32 bit page table. This may be a legacy 2-level
2545 * or a PAE 3-level page table. In either case we need to be aware that
2546 * the shadow page table may be a PAE or a long mode page table.
2547 */
2548 pm_mask = PT_PRESENT_MASK;
2549 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL)
2550 pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK;
2551
2257 for (i = 0; i < 4; ++i) { 2552 for (i = 0; i < 4; ++i) {
2258 hpa_t root = vcpu->arch.mmu.pae_root[i]; 2553 hpa_t root = vcpu->arch.mmu.pae_root[i];
2259 2554
2260 ASSERT(!VALID_PAGE(root)); 2555 ASSERT(!VALID_PAGE(root));
2261 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) { 2556 if (vcpu->arch.mmu.root_level == PT32E_ROOT_LEVEL) {
2262 pdptr = kvm_pdptr_read(vcpu, i); 2557 pdptr = kvm_pdptr_read_mmu(vcpu, &vcpu->arch.mmu, i);
2263 if (!is_present_gpte(pdptr)) { 2558 if (!is_present_gpte(pdptr)) {
2264 vcpu->arch.mmu.pae_root[i] = 0; 2559 vcpu->arch.mmu.pae_root[i] = 0;
2265 continue; 2560 continue;
2266 } 2561 }
2267 root_gfn = pdptr >> PAGE_SHIFT; 2562 root_gfn = pdptr >> PAGE_SHIFT;
2268 } else if (vcpu->arch.mmu.root_level == 0) 2563 if (mmu_check_root(vcpu, root_gfn))
2269 root_gfn = 0; 2564 return 1;
2270 if (mmu_check_root(vcpu, root_gfn))
2271 return 1;
2272 if (tdp_enabled) {
2273 direct = 1;
2274 root_gfn = i << 30;
2275 } 2565 }
2276 spin_lock(&vcpu->kvm->mmu_lock); 2566 spin_lock(&vcpu->kvm->mmu_lock);
2277 kvm_mmu_free_some_pages(vcpu); 2567 kvm_mmu_free_some_pages(vcpu);
2278 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30, 2568 sp = kvm_mmu_get_page(vcpu, root_gfn, i << 30,
2279 PT32_ROOT_LEVEL, direct, 2569 PT32_ROOT_LEVEL, 0,
2280 ACC_ALL, NULL); 2570 ACC_ALL, NULL);
2281 root = __pa(sp->spt); 2571 root = __pa(sp->spt);
2282 ++sp->root_count; 2572 ++sp->root_count;
2283 spin_unlock(&vcpu->kvm->mmu_lock); 2573 spin_unlock(&vcpu->kvm->mmu_lock);
2284 2574
2285 vcpu->arch.mmu.pae_root[i] = root | PT_PRESENT_MASK; 2575 vcpu->arch.mmu.pae_root[i] = root | pm_mask;
2286 } 2576 }
2287 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root); 2577 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.pae_root);
2578
2579 /*
2580 * If we shadow a 32 bit page table with a long mode page
2581 * table we enter this path.
2582 */
2583 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
2584 if (vcpu->arch.mmu.lm_root == NULL) {
2585 /*
2586 * The additional page necessary for this is only
2587 * allocated on demand.
2588 */
2589
2590 u64 *lm_root;
2591
2592 lm_root = (void*)get_zeroed_page(GFP_KERNEL);
2593 if (lm_root == NULL)
2594 return 1;
2595
2596 lm_root[0] = __pa(vcpu->arch.mmu.pae_root) | pm_mask;
2597
2598 vcpu->arch.mmu.lm_root = lm_root;
2599 }
2600
2601 vcpu->arch.mmu.root_hpa = __pa(vcpu->arch.mmu.lm_root);
2602 }
2603
2288 return 0; 2604 return 0;
2289} 2605}
2290 2606
2607static int mmu_alloc_roots(struct kvm_vcpu *vcpu)
2608{
2609 if (vcpu->arch.mmu.direct_map)
2610 return mmu_alloc_direct_roots(vcpu);
2611 else
2612 return mmu_alloc_shadow_roots(vcpu);
2613}
2614
2291static void mmu_sync_roots(struct kvm_vcpu *vcpu) 2615static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2292{ 2616{
2293 int i; 2617 int i;
2294 struct kvm_mmu_page *sp; 2618 struct kvm_mmu_page *sp;
2295 2619
2620 if (vcpu->arch.mmu.direct_map)
2621 return;
2622
2296 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa)) 2623 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
2297 return; 2624 return;
2298 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) { 2625
2626 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_SYNC);
2627 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
2299 hpa_t root = vcpu->arch.mmu.root_hpa; 2628 hpa_t root = vcpu->arch.mmu.root_hpa;
2300 sp = page_header(root); 2629 sp = page_header(root);
2301 mmu_sync_children(vcpu, sp); 2630 mmu_sync_children(vcpu, sp);
2631 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2302 return; 2632 return;
2303 } 2633 }
2304 for (i = 0; i < 4; ++i) { 2634 for (i = 0; i < 4; ++i) {
@@ -2310,6 +2640,7 @@ static void mmu_sync_roots(struct kvm_vcpu *vcpu)
2310 mmu_sync_children(vcpu, sp); 2640 mmu_sync_children(vcpu, sp);
2311 } 2641 }
2312 } 2642 }
2643 trace_kvm_mmu_audit(vcpu, AUDIT_POST_SYNC);
2313} 2644}
2314 2645
2315void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) 2646void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
@@ -2320,15 +2651,24 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2320} 2651}
2321 2652
2322static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr, 2653static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2323 u32 access, u32 *error) 2654 u32 access, struct x86_exception *exception)
2324{ 2655{
2325 if (error) 2656 if (exception)
2326 *error = 0; 2657 exception->error_code = 0;
2327 return vaddr; 2658 return vaddr;
2328} 2659}
2329 2660
2661static gpa_t nonpaging_gva_to_gpa_nested(struct kvm_vcpu *vcpu, gva_t vaddr,
2662 u32 access,
2663 struct x86_exception *exception)
2664{
2665 if (exception)
2666 exception->error_code = 0;
2667 return vcpu->arch.nested_mmu.translate_gpa(vcpu, vaddr, access);
2668}
2669
2330static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva, 2670static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2331 u32 error_code) 2671 u32 error_code, bool prefault)
2332{ 2672{
2333 gfn_t gfn; 2673 gfn_t gfn;
2334 int r; 2674 int r;
@@ -2344,17 +2684,68 @@ static int nonpaging_page_fault(struct kvm_vcpu *vcpu, gva_t gva,
2344 gfn = gva >> PAGE_SHIFT; 2684 gfn = gva >> PAGE_SHIFT;
2345 2685
2346 return nonpaging_map(vcpu, gva & PAGE_MASK, 2686 return nonpaging_map(vcpu, gva & PAGE_MASK,
2347 error_code & PFERR_WRITE_MASK, gfn); 2687 error_code & PFERR_WRITE_MASK, gfn, prefault);
2688}
2689
2690static int kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gva_t gva, gfn_t gfn)
2691{
2692 struct kvm_arch_async_pf arch;
2693
2694 arch.token = (vcpu->arch.apf.id++ << 12) | vcpu->vcpu_id;
2695 arch.gfn = gfn;
2696 arch.direct_map = vcpu->arch.mmu.direct_map;
2697 arch.cr3 = vcpu->arch.mmu.get_cr3(vcpu);
2698
2699 return kvm_setup_async_pf(vcpu, gva, gfn, &arch);
2700}
2701
2702static bool can_do_async_pf(struct kvm_vcpu *vcpu)
2703{
2704 if (unlikely(!irqchip_in_kernel(vcpu->kvm) ||
2705 kvm_event_needs_reinjection(vcpu)))
2706 return false;
2707
2708 return kvm_x86_ops->interrupt_allowed(vcpu);
2709}
2710
2711static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn,
2712 gva_t gva, pfn_t *pfn, bool write, bool *writable)
2713{
2714 bool async;
2715
2716 *pfn = gfn_to_pfn_async(vcpu->kvm, gfn, &async, write, writable);
2717
2718 if (!async)
2719 return false; /* *pfn has correct page already */
2720
2721 put_page(pfn_to_page(*pfn));
2722
2723 if (!prefault && can_do_async_pf(vcpu)) {
2724 trace_kvm_try_async_get_page(gva, gfn);
2725 if (kvm_find_async_pf_gfn(vcpu, gfn)) {
2726 trace_kvm_async_pf_doublefault(gva, gfn);
2727 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
2728 return true;
2729 } else if (kvm_arch_setup_async_pf(vcpu, gva, gfn))
2730 return true;
2731 }
2732
2733 *pfn = gfn_to_pfn_prot(vcpu->kvm, gfn, write, writable);
2734
2735 return false;
2348} 2736}
2349 2737
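try_async_pf() above turns a potentially blocking host page-in into either a queued async page fault or a synchronous fallback. A toy decision-tree model in plain user-space C, with invented boolean inputs standing in for what the real code derives from the irqchip state, pending events and the async-pf gfn hash:

	#include <stdbool.h>
	#include <stdio.h>

	enum outcome { PFN_READY, APF_QUEUED, APF_HALT, SYNC_FALLBACK };

	static enum outcome model_try_async_pf(bool pfn_needs_io, bool prefault,
						bool can_do_async_pf,
						bool already_pending)
	{
		if (!pfn_needs_io)
			return PFN_READY;	/* fast path: page already resident */
		if (!prefault && can_do_async_pf)
			return already_pending ? APF_HALT : APF_QUEUED;
		return SYNC_FALLBACK;		/* block in the synchronous lookup */
	}

	int main(void)
	{
		printf("%d %d %d %d\n",
		       model_try_async_pf(false, false, true,  false),	/* PFN_READY     */
		       model_try_async_pf(true,  false, true,  false),	/* APF_QUEUED    */
		       model_try_async_pf(true,  false, true,  true),	/* APF_HALT      */
		       model_try_async_pf(true,  true,  false, false));	/* SYNC_FALLBACK */
		return 0;
	}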
2350static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, 2738static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa, u32 error_code,
2351 u32 error_code) 2739 bool prefault)
2352{ 2740{
2353 pfn_t pfn; 2741 pfn_t pfn;
2354 int r; 2742 int r;
2355 int level; 2743 int level;
2744 int force_pt_level;
2356 gfn_t gfn = gpa >> PAGE_SHIFT; 2745 gfn_t gfn = gpa >> PAGE_SHIFT;
2357 unsigned long mmu_seq; 2746 unsigned long mmu_seq;
2747 int write = error_code & PFERR_WRITE_MASK;
2748 bool map_writable;
2358 2749
2359 ASSERT(vcpu); 2750 ASSERT(vcpu);
2360 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa)); 2751 ASSERT(VALID_PAGE(vcpu->arch.mmu.root_hpa));
@@ -2363,21 +2754,30 @@ static int tdp_page_fault(struct kvm_vcpu *vcpu, gva_t gpa,
2363 if (r) 2754 if (r)
2364 return r; 2755 return r;
2365 2756
2366 level = mapping_level(vcpu, gfn); 2757 force_pt_level = mapping_level_dirty_bitmap(vcpu, gfn);
2367 2758 if (likely(!force_pt_level)) {
2368 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1); 2759 level = mapping_level(vcpu, gfn);
2760 gfn &= ~(KVM_PAGES_PER_HPAGE(level) - 1);
2761 } else
2762 level = PT_PAGE_TABLE_LEVEL;
2369 2763
2370 mmu_seq = vcpu->kvm->mmu_notifier_seq; 2764 mmu_seq = vcpu->kvm->mmu_notifier_seq;
2371 smp_rmb(); 2765 smp_rmb();
2372 pfn = gfn_to_pfn(vcpu->kvm, gfn); 2766
2767 if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, write, &map_writable))
2768 return 0;
2769
2770 /* mmio */
2373 if (is_error_pfn(pfn)) 2771 if (is_error_pfn(pfn))
2374 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn); 2772 return kvm_handle_bad_page(vcpu->kvm, gfn, pfn);
2375 spin_lock(&vcpu->kvm->mmu_lock); 2773 spin_lock(&vcpu->kvm->mmu_lock);
2376 if (mmu_notifier_retry(vcpu, mmu_seq)) 2774 if (mmu_notifier_retry(vcpu, mmu_seq))
2377 goto out_unlock; 2775 goto out_unlock;
2378 kvm_mmu_free_some_pages(vcpu); 2776 kvm_mmu_free_some_pages(vcpu);
2379 r = __direct_map(vcpu, gpa, error_code & PFERR_WRITE_MASK, 2777 if (likely(!force_pt_level))
2380 level, gfn, pfn); 2778 transparent_hugepage_adjust(vcpu, &gfn, &pfn, &level);
2779 r = __direct_map(vcpu, gpa, write, map_writable,
2780 level, gfn, pfn, prefault);
2381 spin_unlock(&vcpu->kvm->mmu_lock); 2781 spin_unlock(&vcpu->kvm->mmu_lock);
2382 2782
2383 return r; 2783 return r;
@@ -2393,10 +2793,9 @@ static void nonpaging_free(struct kvm_vcpu *vcpu)
2393 mmu_free_roots(vcpu); 2793 mmu_free_roots(vcpu);
2394} 2794}
2395 2795
2396static int nonpaging_init_context(struct kvm_vcpu *vcpu) 2796static int nonpaging_init_context(struct kvm_vcpu *vcpu,
2797 struct kvm_mmu *context)
2397{ 2798{
2398 struct kvm_mmu *context = &vcpu->arch.mmu;
2399
2400 context->new_cr3 = nonpaging_new_cr3; 2799 context->new_cr3 = nonpaging_new_cr3;
2401 context->page_fault = nonpaging_page_fault; 2800 context->page_fault = nonpaging_page_fault;
2402 context->gva_to_gpa = nonpaging_gva_to_gpa; 2801 context->gva_to_gpa = nonpaging_gva_to_gpa;
@@ -2404,9 +2803,12 @@ static int nonpaging_init_context(struct kvm_vcpu *vcpu)
2404 context->prefetch_page = nonpaging_prefetch_page; 2803 context->prefetch_page = nonpaging_prefetch_page;
2405 context->sync_page = nonpaging_sync_page; 2804 context->sync_page = nonpaging_sync_page;
2406 context->invlpg = nonpaging_invlpg; 2805 context->invlpg = nonpaging_invlpg;
2806 context->update_pte = nonpaging_update_pte;
2407 context->root_level = 0; 2807 context->root_level = 0;
2408 context->shadow_root_level = PT32E_ROOT_LEVEL; 2808 context->shadow_root_level = PT32E_ROOT_LEVEL;
2409 context->root_hpa = INVALID_PAGE; 2809 context->root_hpa = INVALID_PAGE;
2810 context->direct_map = true;
2811 context->nx = false;
2410 return 0; 2812 return 0;
2411} 2813}
2412 2814
@@ -2418,15 +2820,19 @@ void kvm_mmu_flush_tlb(struct kvm_vcpu *vcpu)
2418 2820
2419static void paging_new_cr3(struct kvm_vcpu *vcpu) 2821static void paging_new_cr3(struct kvm_vcpu *vcpu)
2420{ 2822{
2421 pgprintk("%s: cr3 %lx\n", __func__, vcpu->arch.cr3); 2823 pgprintk("%s: cr3 %lx\n", __func__, kvm_read_cr3(vcpu));
2422 mmu_free_roots(vcpu); 2824 mmu_free_roots(vcpu);
2423} 2825}
2424 2826
2827static unsigned long get_cr3(struct kvm_vcpu *vcpu)
2828{
2829 return kvm_read_cr3(vcpu);
2830}
2831
2425static void inject_page_fault(struct kvm_vcpu *vcpu, 2832static void inject_page_fault(struct kvm_vcpu *vcpu,
2426 u64 addr, 2833 struct x86_exception *fault)
2427 u32 err_code)
2428{ 2834{
2429 kvm_inject_page_fault(vcpu, addr, err_code); 2835 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
2430} 2836}
2431 2837
2432static void paging_free(struct kvm_vcpu *vcpu) 2838static void paging_free(struct kvm_vcpu *vcpu)
@@ -2434,12 +2840,12 @@ static void paging_free(struct kvm_vcpu *vcpu)
2434 nonpaging_free(vcpu); 2840 nonpaging_free(vcpu);
2435} 2841}
2436 2842
2437static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level) 2843static bool is_rsvd_bits_set(struct kvm_mmu *mmu, u64 gpte, int level)
2438{ 2844{
2439 int bit7; 2845 int bit7;
2440 2846
2441 bit7 = (gpte >> 7) & 1; 2847 bit7 = (gpte >> 7) & 1;
2442 return (gpte & vcpu->arch.mmu.rsvd_bits_mask[bit7][level-1]) != 0; 2848 return (gpte & mmu->rsvd_bits_mask[bit7][level-1]) != 0;
2443} 2849}
2444 2850
2445#define PTTYPE 64 2851#define PTTYPE 64
@@ -2450,13 +2856,14 @@ static bool is_rsvd_bits_set(struct kvm_vcpu *vcpu, u64 gpte, int level)
2450#include "paging_tmpl.h" 2856#include "paging_tmpl.h"
2451#undef PTTYPE 2857#undef PTTYPE
2452 2858
2453static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level) 2859static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu,
2860 struct kvm_mmu *context,
2861 int level)
2454{ 2862{
2455 struct kvm_mmu *context = &vcpu->arch.mmu;
2456 int maxphyaddr = cpuid_maxphyaddr(vcpu); 2863 int maxphyaddr = cpuid_maxphyaddr(vcpu);
2457 u64 exb_bit_rsvd = 0; 2864 u64 exb_bit_rsvd = 0;
2458 2865
2459 if (!is_nx(vcpu)) 2866 if (!context->nx)
2460 exb_bit_rsvd = rsvd_bits(63, 63); 2867 exb_bit_rsvd = rsvd_bits(63, 63);
2461 switch (level) { 2868 switch (level) {
2462 case PT32_ROOT_LEVEL: 2869 case PT32_ROOT_LEVEL:
@@ -2511,9 +2918,13 @@ static void reset_rsvds_bits_mask(struct kvm_vcpu *vcpu, int level)
2511 } 2918 }
2512} 2919}
2513 2920
2514static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level) 2921static int paging64_init_context_common(struct kvm_vcpu *vcpu,
2922 struct kvm_mmu *context,
2923 int level)
2515{ 2924{
2516 struct kvm_mmu *context = &vcpu->arch.mmu; 2925 context->nx = is_nx(vcpu);
2926
2927 reset_rsvds_bits_mask(vcpu, context, level);
2517 2928
2518 ASSERT(is_pae(vcpu)); 2929 ASSERT(is_pae(vcpu));
2519 context->new_cr3 = paging_new_cr3; 2930 context->new_cr3 = paging_new_cr3;
@@ -2522,24 +2933,28 @@ static int paging64_init_context_common(struct kvm_vcpu *vcpu, int level)
2522 context->prefetch_page = paging64_prefetch_page; 2933 context->prefetch_page = paging64_prefetch_page;
2523 context->sync_page = paging64_sync_page; 2934 context->sync_page = paging64_sync_page;
2524 context->invlpg = paging64_invlpg; 2935 context->invlpg = paging64_invlpg;
2936 context->update_pte = paging64_update_pte;
2525 context->free = paging_free; 2937 context->free = paging_free;
2526 context->root_level = level; 2938 context->root_level = level;
2527 context->shadow_root_level = level; 2939 context->shadow_root_level = level;
2528 context->root_hpa = INVALID_PAGE; 2940 context->root_hpa = INVALID_PAGE;
2941 context->direct_map = false;
2529 return 0; 2942 return 0;
2530} 2943}
2531 2944
2532static int paging64_init_context(struct kvm_vcpu *vcpu) 2945static int paging64_init_context(struct kvm_vcpu *vcpu,
2946 struct kvm_mmu *context)
2533{ 2947{
2534 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 2948 return paging64_init_context_common(vcpu, context, PT64_ROOT_LEVEL);
2535 return paging64_init_context_common(vcpu, PT64_ROOT_LEVEL);
2536} 2949}
2537 2950
2538static int paging32_init_context(struct kvm_vcpu *vcpu) 2951static int paging32_init_context(struct kvm_vcpu *vcpu,
2952 struct kvm_mmu *context)
2539{ 2953{
2540 struct kvm_mmu *context = &vcpu->arch.mmu; 2954 context->nx = false;
2955
2956 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2541 2957
2542 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL);
2543 context->new_cr3 = paging_new_cr3; 2958 context->new_cr3 = paging_new_cr3;
2544 context->page_fault = paging32_page_fault; 2959 context->page_fault = paging32_page_fault;
2545 context->gva_to_gpa = paging32_gva_to_gpa; 2960 context->gva_to_gpa = paging32_gva_to_gpa;
@@ -2547,44 +2962,57 @@ static int paging32_init_context(struct kvm_vcpu *vcpu)
2547 context->prefetch_page = paging32_prefetch_page; 2962 context->prefetch_page = paging32_prefetch_page;
2548 context->sync_page = paging32_sync_page; 2963 context->sync_page = paging32_sync_page;
2549 context->invlpg = paging32_invlpg; 2964 context->invlpg = paging32_invlpg;
2965 context->update_pte = paging32_update_pte;
2550 context->root_level = PT32_ROOT_LEVEL; 2966 context->root_level = PT32_ROOT_LEVEL;
2551 context->shadow_root_level = PT32E_ROOT_LEVEL; 2967 context->shadow_root_level = PT32E_ROOT_LEVEL;
2552 context->root_hpa = INVALID_PAGE; 2968 context->root_hpa = INVALID_PAGE;
2969 context->direct_map = false;
2553 return 0; 2970 return 0;
2554} 2971}
2555 2972
2556static int paging32E_init_context(struct kvm_vcpu *vcpu) 2973static int paging32E_init_context(struct kvm_vcpu *vcpu,
2974 struct kvm_mmu *context)
2557{ 2975{
2558 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 2976 return paging64_init_context_common(vcpu, context, PT32E_ROOT_LEVEL);
2559 return paging64_init_context_common(vcpu, PT32E_ROOT_LEVEL);
2560} 2977}
2561 2978
2562static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu) 2979static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2563{ 2980{
2564 struct kvm_mmu *context = &vcpu->arch.mmu; 2981 struct kvm_mmu *context = vcpu->arch.walk_mmu;
2565 2982
2983 context->base_role.word = 0;
2566 context->new_cr3 = nonpaging_new_cr3; 2984 context->new_cr3 = nonpaging_new_cr3;
2567 context->page_fault = tdp_page_fault; 2985 context->page_fault = tdp_page_fault;
2568 context->free = nonpaging_free; 2986 context->free = nonpaging_free;
2569 context->prefetch_page = nonpaging_prefetch_page; 2987 context->prefetch_page = nonpaging_prefetch_page;
2570 context->sync_page = nonpaging_sync_page; 2988 context->sync_page = nonpaging_sync_page;
2571 context->invlpg = nonpaging_invlpg; 2989 context->invlpg = nonpaging_invlpg;
2990 context->update_pte = nonpaging_update_pte;
2572 context->shadow_root_level = kvm_x86_ops->get_tdp_level(); 2991 context->shadow_root_level = kvm_x86_ops->get_tdp_level();
2573 context->root_hpa = INVALID_PAGE; 2992 context->root_hpa = INVALID_PAGE;
2993 context->direct_map = true;
2994 context->set_cr3 = kvm_x86_ops->set_tdp_cr3;
2995 context->get_cr3 = get_cr3;
2996 context->inject_page_fault = kvm_inject_page_fault;
2997 context->nx = is_nx(vcpu);
2574 2998
2575 if (!is_paging(vcpu)) { 2999 if (!is_paging(vcpu)) {
3000 context->nx = false;
2576 context->gva_to_gpa = nonpaging_gva_to_gpa; 3001 context->gva_to_gpa = nonpaging_gva_to_gpa;
2577 context->root_level = 0; 3002 context->root_level = 0;
2578 } else if (is_long_mode(vcpu)) { 3003 } else if (is_long_mode(vcpu)) {
2579 reset_rsvds_bits_mask(vcpu, PT64_ROOT_LEVEL); 3004 context->nx = is_nx(vcpu);
3005 reset_rsvds_bits_mask(vcpu, context, PT64_ROOT_LEVEL);
2580 context->gva_to_gpa = paging64_gva_to_gpa; 3006 context->gva_to_gpa = paging64_gva_to_gpa;
2581 context->root_level = PT64_ROOT_LEVEL; 3007 context->root_level = PT64_ROOT_LEVEL;
2582 } else if (is_pae(vcpu)) { 3008 } else if (is_pae(vcpu)) {
2583 reset_rsvds_bits_mask(vcpu, PT32E_ROOT_LEVEL); 3009 context->nx = is_nx(vcpu);
3010 reset_rsvds_bits_mask(vcpu, context, PT32E_ROOT_LEVEL);
2584 context->gva_to_gpa = paging64_gva_to_gpa; 3011 context->gva_to_gpa = paging64_gva_to_gpa;
2585 context->root_level = PT32E_ROOT_LEVEL; 3012 context->root_level = PT32E_ROOT_LEVEL;
2586 } else { 3013 } else {
2587 reset_rsvds_bits_mask(vcpu, PT32_ROOT_LEVEL); 3014 context->nx = false;
3015 reset_rsvds_bits_mask(vcpu, context, PT32_ROOT_LEVEL);
2588 context->gva_to_gpa = paging32_gva_to_gpa; 3016 context->gva_to_gpa = paging32_gva_to_gpa;
2589 context->root_level = PT32_ROOT_LEVEL; 3017 context->root_level = PT32_ROOT_LEVEL;
2590 } 3018 }
@@ -2592,33 +3020,81 @@ static int init_kvm_tdp_mmu(struct kvm_vcpu *vcpu)
2592 return 0; 3020 return 0;
2593} 3021}
2594 3022
2595static int init_kvm_softmmu(struct kvm_vcpu *vcpu) 3023int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context)
2596{ 3024{
2597 int r; 3025 int r;
2598
2599 ASSERT(vcpu); 3026 ASSERT(vcpu);
2600 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa)); 3027 ASSERT(!VALID_PAGE(vcpu->arch.mmu.root_hpa));
2601 3028
2602 if (!is_paging(vcpu)) 3029 if (!is_paging(vcpu))
2603 r = nonpaging_init_context(vcpu); 3030 r = nonpaging_init_context(vcpu, context);
2604 else if (is_long_mode(vcpu)) 3031 else if (is_long_mode(vcpu))
2605 r = paging64_init_context(vcpu); 3032 r = paging64_init_context(vcpu, context);
2606 else if (is_pae(vcpu)) 3033 else if (is_pae(vcpu))
2607 r = paging32E_init_context(vcpu); 3034 r = paging32E_init_context(vcpu, context);
2608 else 3035 else
2609 r = paging32_init_context(vcpu); 3036 r = paging32_init_context(vcpu, context);
2610 3037
2611 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu); 3038 vcpu->arch.mmu.base_role.cr4_pae = !!is_pae(vcpu);
2612 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu); 3039 vcpu->arch.mmu.base_role.cr0_wp = is_write_protection(vcpu);
2613 3040
2614 return r; 3041 return r;
2615} 3042}
3043EXPORT_SYMBOL_GPL(kvm_init_shadow_mmu);
2616 3044
2617static int init_kvm_mmu(struct kvm_vcpu *vcpu) 3045static int init_kvm_softmmu(struct kvm_vcpu *vcpu)
2618{ 3046{
2619 vcpu->arch.update_pte.pfn = bad_pfn; 3047 int r = kvm_init_shadow_mmu(vcpu, vcpu->arch.walk_mmu);
2620 3048
2621 if (tdp_enabled) 3049 vcpu->arch.walk_mmu->set_cr3 = kvm_x86_ops->set_cr3;
3050 vcpu->arch.walk_mmu->get_cr3 = get_cr3;
3051 vcpu->arch.walk_mmu->inject_page_fault = kvm_inject_page_fault;
3052
3053 return r;
3054}
3055
3056static int init_kvm_nested_mmu(struct kvm_vcpu *vcpu)
3057{
3058 struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
3059
3060 g_context->get_cr3 = get_cr3;
3061 g_context->inject_page_fault = kvm_inject_page_fault;
3062
3063 /*
3064 * Note that arch.mmu.gva_to_gpa translates l2_gva to l1_gpa. The
3065 * translation of l2_gpa to l1_gpa addresses is done using the
3066 * arch.nested_mmu.gva_to_gpa function. Basically the gva_to_gpa
3067 * functions of mmu and nested_mmu are swapped.
3068 */
3069 if (!is_paging(vcpu)) {
3070 g_context->nx = false;
3071 g_context->root_level = 0;
3072 g_context->gva_to_gpa = nonpaging_gva_to_gpa_nested;
3073 } else if (is_long_mode(vcpu)) {
3074 g_context->nx = is_nx(vcpu);
3075 reset_rsvds_bits_mask(vcpu, g_context, PT64_ROOT_LEVEL);
3076 g_context->root_level = PT64_ROOT_LEVEL;
3077 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3078 } else if (is_pae(vcpu)) {
3079 g_context->nx = is_nx(vcpu);
3080 reset_rsvds_bits_mask(vcpu, g_context, PT32E_ROOT_LEVEL);
3081 g_context->root_level = PT32E_ROOT_LEVEL;
3082 g_context->gva_to_gpa = paging64_gva_to_gpa_nested;
3083 } else {
3084 g_context->nx = false;
3085 reset_rsvds_bits_mask(vcpu, g_context, PT32_ROOT_LEVEL);
3086 g_context->root_level = PT32_ROOT_LEVEL;
3087 g_context->gva_to_gpa = paging32_gva_to_gpa_nested;
3088 }
3089
3090 return 0;
3091}
3092
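With the nested MMU set up above, a guest virtual address is resolved in two steps: arch.mmu maps the L2 gva to an L2 gpa using the L2 page tables, and nested_mmu.translate_gpa then maps that L2 gpa to an L1 gpa. A toy model of that composition, with made-up lookup functions in place of the real page-table walks:

	#include <stdint.h>
	#include <stdio.h>

	/* Toy translations standing in for the two real walkers. */
	static uint64_t l2_gva_to_l2_gpa(uint64_t gva) { return gva ^ 0x1000; }
	static uint64_t l2_gpa_to_l1_gpa(uint64_t gpa) { return gpa + 0x40000000; }

	static uint64_t nested_gva_to_gpa(uint64_t gva)
	{
		/* Same shape as the swapped gva_to_gpa/translate_gpa pair. */
		return l2_gpa_to_l1_gpa(l2_gva_to_l2_gpa(gva));
	}

	int main(void)
	{
		uint64_t gva = 0x7f0000002000ull;

		printf("L2 gva 0x%llx -> L1 gpa 0x%llx\n",
		       (unsigned long long)gva,
		       (unsigned long long)nested_gva_to_gpa(gva));
		return 0;
	}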
3093static int init_kvm_mmu(struct kvm_vcpu *vcpu)
3094{
3095 if (mmu_is_nested(vcpu))
3096 return init_kvm_nested_mmu(vcpu);
3097 else if (tdp_enabled)
2622 return init_kvm_tdp_mmu(vcpu); 3098 return init_kvm_tdp_mmu(vcpu);
2623 else 3099 else
2624 return init_kvm_softmmu(vcpu); 3100 return init_kvm_softmmu(vcpu);
@@ -2653,7 +3129,7 @@ int kvm_mmu_load(struct kvm_vcpu *vcpu)
2653 if (r) 3129 if (r)
2654 goto out; 3130 goto out;
2655 /* set_cr3() should ensure TLB has been flushed */ 3131 /* set_cr3() should ensure TLB has been flushed */
2656 kvm_x86_ops->set_cr3(vcpu, vcpu->arch.mmu.root_hpa); 3132 vcpu->arch.mmu.set_cr3(vcpu, vcpu->arch.mmu.root_hpa);
2657out: 3133out:
2658 return r; 3134 return r;
2659} 3135}
@@ -2663,6 +3139,7 @@ void kvm_mmu_unload(struct kvm_vcpu *vcpu)
2663{ 3139{
2664 mmu_free_roots(vcpu); 3140 mmu_free_roots(vcpu);
2665} 3141}
3142EXPORT_SYMBOL_GPL(kvm_mmu_unload);
2666 3143
2667static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu, 3144static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2668 struct kvm_mmu_page *sp, 3145 struct kvm_mmu_page *sp,
@@ -2686,8 +3163,7 @@ static void mmu_pte_write_zap_pte(struct kvm_vcpu *vcpu,
2686} 3163}
2687 3164
2688static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu, 3165static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2689 struct kvm_mmu_page *sp, 3166 struct kvm_mmu_page *sp, u64 *spte,
2690 u64 *spte,
2691 const void *new) 3167 const void *new)
2692{ 3168{
2693 if (sp->role.level != PT_PAGE_TABLE_LEVEL) { 3169 if (sp->role.level != PT_PAGE_TABLE_LEVEL) {
@@ -2695,14 +3171,8 @@ static void mmu_pte_write_new_pte(struct kvm_vcpu *vcpu,
2695 return; 3171 return;
2696 } 3172 }
2697 3173
2698 if (is_rsvd_bits_set(vcpu, *(u64 *)new, PT_PAGE_TABLE_LEVEL))
2699 return;
2700
2701 ++vcpu->kvm->stat.mmu_pte_updated; 3174 ++vcpu->kvm->stat.mmu_pte_updated;
2702 if (!sp->role.cr4_pae) 3175 vcpu->arch.mmu.update_pte(vcpu, sp, spte, new);
2703 paging32_update_pte(vcpu, sp, spte, new);
2704 else
2705 paging64_update_pte(vcpu, sp, spte, new);
2706} 3176}
2707 3177
2708static bool need_remote_flush(u64 old, u64 new) 3178static bool need_remote_flush(u64 old, u64 new)
@@ -2737,28 +3207,6 @@ static bool last_updated_pte_accessed(struct kvm_vcpu *vcpu)
2737 return !!(spte && (*spte & shadow_accessed_mask)); 3207 return !!(spte && (*spte & shadow_accessed_mask));
2738} 3208}
2739 3209
2740static void mmu_guess_page_from_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2741 u64 gpte)
2742{
2743 gfn_t gfn;
2744 pfn_t pfn;
2745
2746 if (!is_present_gpte(gpte))
2747 return;
2748 gfn = (gpte & PT64_BASE_ADDR_MASK) >> PAGE_SHIFT;
2749
2750 vcpu->arch.update_pte.mmu_seq = vcpu->kvm->mmu_notifier_seq;
2751 smp_rmb();
2752 pfn = gfn_to_pfn(vcpu->kvm, gfn);
2753
2754 if (is_error_pfn(pfn)) {
2755 kvm_release_pfn_clean(pfn);
2756 return;
2757 }
2758 vcpu->arch.update_pte.gfn = gfn;
2759 vcpu->arch.update_pte.pfn = pfn;
2760}
2761
2762static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn) 3210static void kvm_mmu_access_page(struct kvm_vcpu *vcpu, gfn_t gfn)
2763{ 3211{
2764 u64 *spte = vcpu->arch.last_pte_updated; 3212 u64 *spte = vcpu->arch.last_pte_updated;
@@ -2780,21 +3228,13 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2780 struct kvm_mmu_page *sp; 3228 struct kvm_mmu_page *sp;
2781 struct hlist_node *node; 3229 struct hlist_node *node;
2782 LIST_HEAD(invalid_list); 3230 LIST_HEAD(invalid_list);
2783 u64 entry, gentry; 3231 u64 entry, gentry, *spte;
2784 u64 *spte; 3232 unsigned pte_size, page_offset, misaligned, quadrant, offset;
2785 unsigned offset = offset_in_page(gpa); 3233 int level, npte, invlpg_counter, r, flooded = 0;
2786 unsigned pte_size;
2787 unsigned page_offset;
2788 unsigned misaligned;
2789 unsigned quadrant;
2790 int level;
2791 int flooded = 0;
2792 int npte;
2793 int r;
2794 int invlpg_counter;
2795 bool remote_flush, local_flush, zap_page; 3234 bool remote_flush, local_flush, zap_page;
2796 3235
2797 zap_page = remote_flush = local_flush = false; 3236 zap_page = remote_flush = local_flush = false;
3237 offset = offset_in_page(gpa);
2798 3238
2799 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes); 3239 pgprintk("%s: gpa %llx bytes %d\n", __func__, gpa, bytes);
2800 3240
@@ -2802,9 +3242,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2802 3242
2803 /* 3243 /*
2804 * Assume that the pte write on a page table of the same type 3244 * Assume that the pte write on a page table of the same type
2805 * as the current vcpu paging mode. This is nearly always true 3245 * as the current vcpu paging mode since we update the sptes only
2806 * (might be false while changing modes). Note it is verified later 3246 * when they have the same mode.
2807 * by update_pte().
2808 */ 3247 */
2809 if ((is_pae(vcpu) && bytes == 4) || !new) { 3248 if ((is_pae(vcpu) && bytes == 4) || !new) {
2810 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */ 3249 /* Handle a 32-bit guest writing two halves of a 64-bit gpte */
@@ -2830,15 +3269,14 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2830 break; 3269 break;
2831 } 3270 }
2832 3271
2833 mmu_guess_page_from_pte_write(vcpu, gpa, gentry);
2834 spin_lock(&vcpu->kvm->mmu_lock); 3272 spin_lock(&vcpu->kvm->mmu_lock);
2835 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter) 3273 if (atomic_read(&vcpu->kvm->arch.invlpg_counter) != invlpg_counter)
2836 gentry = 0; 3274 gentry = 0;
2837 kvm_mmu_access_page(vcpu, gfn);
2838 kvm_mmu_free_some_pages(vcpu); 3275 kvm_mmu_free_some_pages(vcpu);
2839 ++vcpu->kvm->stat.mmu_pte_write; 3276 ++vcpu->kvm->stat.mmu_pte_write;
2840 kvm_mmu_audit(vcpu, "pre pte write"); 3277 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PTE_WRITE);
2841 if (guest_initiated) { 3278 if (guest_initiated) {
3279 kvm_mmu_access_page(vcpu, gfn);
2842 if (gfn == vcpu->arch.last_pt_write_gfn 3280 if (gfn == vcpu->arch.last_pt_write_gfn
2843 && !last_updated_pte_accessed(vcpu)) { 3281 && !last_updated_pte_accessed(vcpu)) {
2844 ++vcpu->arch.last_pt_write_count; 3282 ++vcpu->arch.last_pt_write_count;
@@ -2910,12 +3348,8 @@ void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa,
2910 } 3348 }
2911 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush); 3349 mmu_pte_write_flush_tlb(vcpu, zap_page, remote_flush, local_flush);
2912 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list); 3350 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2913 kvm_mmu_audit(vcpu, "post pte write"); 3351 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PTE_WRITE);
2914 spin_unlock(&vcpu->kvm->mmu_lock); 3352 spin_unlock(&vcpu->kvm->mmu_lock);
2915 if (!is_error_pfn(vcpu->arch.update_pte.pfn)) {
2916 kvm_release_pfn_clean(vcpu->arch.update_pte.pfn);
2917 vcpu->arch.update_pte.pfn = bad_pfn;
2918 }
2919} 3353}
2920 3354
2921int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva) 3355int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
@@ -2923,7 +3357,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2923 gpa_t gpa; 3357 gpa_t gpa;
2924 int r; 3358 int r;
2925 3359
2926 if (tdp_enabled) 3360 if (vcpu->arch.mmu.direct_map)
2927 return 0; 3361 return 0;
2928 3362
2929 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL); 3363 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
@@ -2937,29 +3371,27 @@ EXPORT_SYMBOL_GPL(kvm_mmu_unprotect_page_virt);
2937 3371
2938void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 3372void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
2939{ 3373{
2940 int free_pages;
2941 LIST_HEAD(invalid_list); 3374 LIST_HEAD(invalid_list);
2942 3375
2943 free_pages = vcpu->kvm->arch.n_free_mmu_pages; 3376 while (kvm_mmu_available_pages(vcpu->kvm) < KVM_REFILL_PAGES &&
2944 while (free_pages < KVM_REFILL_PAGES &&
2945 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) { 3377 !list_empty(&vcpu->kvm->arch.active_mmu_pages)) {
2946 struct kvm_mmu_page *sp; 3378 struct kvm_mmu_page *sp;
2947 3379
2948 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev, 3380 sp = container_of(vcpu->kvm->arch.active_mmu_pages.prev,
2949 struct kvm_mmu_page, link); 3381 struct kvm_mmu_page, link);
2950 free_pages += kvm_mmu_prepare_zap_page(vcpu->kvm, sp, 3382 kvm_mmu_prepare_zap_page(vcpu->kvm, sp, &invalid_list);
2951 &invalid_list); 3383 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2952 ++vcpu->kvm->stat.mmu_recycled; 3384 ++vcpu->kvm->stat.mmu_recycled;
2953 } 3385 }
2954 kvm_mmu_commit_zap_page(vcpu->kvm, &invalid_list);
2955} 3386}
2956 3387
2957int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code) 3388int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code,
3389 void *insn, int insn_len)
2958{ 3390{
2959 int r; 3391 int r;
2960 enum emulation_result er; 3392 enum emulation_result er;
2961 3393
2962 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code); 3394 r = vcpu->arch.mmu.page_fault(vcpu, cr2, error_code, false);
2963 if (r < 0) 3395 if (r < 0)
2964 goto out; 3396 goto out;
2965 3397
@@ -2972,7 +3404,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2972 if (r) 3404 if (r)
2973 goto out; 3405 goto out;
2974 3406
2975 er = emulate_instruction(vcpu, cr2, error_code, 0); 3407 er = x86_emulate_instruction(vcpu, cr2, 0, insn, insn_len);
2976 3408
2977 switch (er) { 3409 switch (er) {
2978 case EMULATE_DONE: 3410 case EMULATE_DONE:
@@ -3013,6 +3445,8 @@ EXPORT_SYMBOL_GPL(kvm_disable_tdp);
3013static void free_mmu_pages(struct kvm_vcpu *vcpu) 3445static void free_mmu_pages(struct kvm_vcpu *vcpu)
3014{ 3446{
3015 free_page((unsigned long)vcpu->arch.mmu.pae_root); 3447 free_page((unsigned long)vcpu->arch.mmu.pae_root);
3448 if (vcpu->arch.mmu.lm_root != NULL)
3449 free_page((unsigned long)vcpu->arch.mmu.lm_root);
3016} 3450}
3017 3451
3018static int alloc_mmu_pages(struct kvm_vcpu *vcpu) 3452static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
@@ -3054,15 +3488,6 @@ int kvm_mmu_setup(struct kvm_vcpu *vcpu)
3054 return init_kvm_mmu(vcpu); 3488 return init_kvm_mmu(vcpu);
3055} 3489}
3056 3490
3057void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3058{
3059 ASSERT(vcpu);
3060
3061 destroy_kvm_mmu(vcpu);
3062 free_mmu_pages(vcpu);
3063 mmu_free_memory_caches(vcpu);
3064}
3065
3066void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot) 3491void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3067{ 3492{
3068 struct kvm_mmu_page *sp; 3493 struct kvm_mmu_page *sp;
@@ -3075,10 +3500,22 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, int slot)
3075 continue; 3500 continue;
3076 3501
3077 pt = sp->spt; 3502 pt = sp->spt;
3078 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) 3503 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3504 if (!is_shadow_present_pte(pt[i]) ||
3505 !is_last_spte(pt[i], sp->role.level))
3506 continue;
3507
3508 if (is_large_pte(pt[i])) {
3509 drop_spte(kvm, &pt[i],
3510 shadow_trap_nonpresent_pte);
3511 --kvm->stat.lpages;
3512 continue;
3513 }
3514
3079 /* avoid RMW */ 3515 /* avoid RMW */
3080 if (is_writable_pte(pt[i])) 3516 if (is_writable_pte(pt[i]))
3081 pt[i] &= ~PT_WRITABLE_MASK; 3517 update_spte(&pt[i], pt[i] & ~PT_WRITABLE_MASK);
3518 }
3082 } 3519 }
3083 kvm_flush_remote_tlbs(kvm); 3520 kvm_flush_remote_tlbs(kvm);
3084} 3521}
@@ -3108,27 +3545,27 @@ static int kvm_mmu_remove_some_alloc_mmu_pages(struct kvm *kvm,
3108 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list); 3545 return kvm_mmu_prepare_zap_page(kvm, page, invalid_list);
3109} 3546}
3110 3547
3111static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask) 3548static int mmu_shrink(struct shrinker *shrink, struct shrink_control *sc)
3112{ 3549{
3113 struct kvm *kvm; 3550 struct kvm *kvm;
3114 struct kvm *kvm_freed = NULL; 3551 struct kvm *kvm_freed = NULL;
3115 int cache_count = 0; 3552 int nr_to_scan = sc->nr_to_scan;
3553
3554 if (nr_to_scan == 0)
3555 goto out;
3116 3556
3117 spin_lock(&kvm_lock); 3557 raw_spin_lock(&kvm_lock);
3118 3558
3119 list_for_each_entry(kvm, &vm_list, vm_list) { 3559 list_for_each_entry(kvm, &vm_list, vm_list) {
3120 int npages, idx, freed_pages; 3560 int idx, freed_pages;
3121 LIST_HEAD(invalid_list); 3561 LIST_HEAD(invalid_list);
3122 3562
3123 idx = srcu_read_lock(&kvm->srcu); 3563 idx = srcu_read_lock(&kvm->srcu);
3124 spin_lock(&kvm->mmu_lock); 3564 spin_lock(&kvm->mmu_lock);
3125 npages = kvm->arch.n_alloc_mmu_pages - 3565 if (!kvm_freed && nr_to_scan > 0 &&
3126 kvm->arch.n_free_mmu_pages; 3566 kvm->arch.n_used_mmu_pages > 0) {
3127 cache_count += npages;
3128 if (!kvm_freed && nr_to_scan > 0 && npages > 0) {
3129 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm, 3567 freed_pages = kvm_mmu_remove_some_alloc_mmu_pages(kvm,
3130 &invalid_list); 3568 &invalid_list);
3131 cache_count -= freed_pages;
3132 kvm_freed = kvm; 3569 kvm_freed = kvm;
3133 } 3570 }
3134 nr_to_scan--; 3571 nr_to_scan--;
@@ -3140,9 +3577,10 @@ static int mmu_shrink(struct shrinker *shrink, int nr_to_scan, gfp_t gfp_mask)
3140 if (kvm_freed) 3577 if (kvm_freed)
3141 list_move_tail(&kvm_freed->vm_list, &vm_list); 3578 list_move_tail(&kvm_freed->vm_list, &vm_list);
3142 3579
3143 spin_unlock(&kvm_lock); 3580 raw_spin_unlock(&kvm_lock);
3144 3581
3145 return cache_count; 3582out:
3583 return percpu_counter_read_positive(&kvm_total_used_mmu_pages);
3146} 3584}
3147 3585
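The hunk above moves mmu_shrink() to the shrink_control interface: the scan request now arrives in sc->nr_to_scan, a request of 0 is answered with just the object count, and the return value comes from the global kvm_total_used_mmu_pages percpu counter instead of a locally accumulated cache_count. A minimal sketch of the same callback shape for this generation of the shrinker API (demo names only, not code from the patch; modern kernels use count_objects/scan_objects instead):

    #include <linux/kernel.h>
    #include <linux/mm.h>           /* struct shrinker, struct shrink_control */

    static int demo_objects;        /* stand-in for the real per-cache accounting */

    static int demo_shrink(struct shrinker *shrink, struct shrink_control *sc)
    {
            if (sc->nr_to_scan) {
                    int n = min_t(int, sc->nr_to_scan, demo_objects);
                    demo_objects -= n;      /* free n objects here */
            }
            return demo_objects;            /* current count, may be approximate */
    }

    static struct shrinker demo_shrinker = {
            .shrink = demo_shrink,
            .seeks  = DEFAULT_SEEKS,
    };
    /* paired with register_shrinker(&demo_shrinker) / unregister_shrinker(&demo_shrinker) */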
3148static struct shrinker mmu_shrinker = { 3586static struct shrinker mmu_shrinker = {
@@ -3160,12 +3598,6 @@ static void mmu_destroy_caches(void)
3160 kmem_cache_destroy(mmu_page_header_cache); 3598 kmem_cache_destroy(mmu_page_header_cache);
3161} 3599}
3162 3600
3163void kvm_mmu_module_exit(void)
3164{
3165 mmu_destroy_caches();
3166 unregister_shrinker(&mmu_shrinker);
3167}
3168
3169int kvm_mmu_module_init(void) 3601int kvm_mmu_module_init(void)
3170{ 3602{
3171 pte_chain_cache = kmem_cache_create("kvm_pte_chain", 3603 pte_chain_cache = kmem_cache_create("kvm_pte_chain",
@@ -3185,6 +3617,9 @@ int kvm_mmu_module_init(void)
3185 if (!mmu_page_header_cache) 3617 if (!mmu_page_header_cache)
3186 goto nomem; 3618 goto nomem;
3187 3619
3620 if (percpu_counter_init(&kvm_total_used_mmu_pages, 0))
3621 goto nomem;
3622
3188 register_shrinker(&mmu_shrinker); 3623 register_shrinker(&mmu_shrinker);
3189 3624
3190 return 0; 3625 return 0;
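kvm_mmu_module_init() now sets up the kvm_total_used_mmu_pages percpu counter that mmu_shrink() reads, and kvm_mmu_module_exit() later in this diff destroys it. Roughly this pattern, using the two-argument percpu_counter_init() of kernels of this vintage (demo names are not from the patch):

    #include <linux/kernel.h>
    #include <linux/percpu_counter.h>

    static struct percpu_counter demo_used; /* like kvm_total_used_mmu_pages */

    static int demo_init(void)
    {
            if (percpu_counter_init(&demo_used, 0))
                    return -ENOMEM;
            percpu_counter_add(&demo_used, 16);     /* cheap per-cpu update */
            return 0;
    }

    static void demo_exit(void)
    {
            /* approximate, never-negative sum across cpus */
            pr_info("used = %lld\n", percpu_counter_read_positive(&demo_used));
            percpu_counter_destroy(&demo_used);
    }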
@@ -3259,7 +3694,7 @@ static int kvm_pv_mmu_write(struct kvm_vcpu *vcpu,
3259 3694
3260static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu) 3695static int kvm_pv_mmu_flush_tlb(struct kvm_vcpu *vcpu)
3261{ 3696{
3262 (void)kvm_set_cr3(vcpu, vcpu->arch.cr3); 3697 (void)kvm_set_cr3(vcpu, kvm_read_cr3(vcpu));
3263 return 1; 3698 return 1;
3264} 3699}
3265 3700
@@ -3355,271 +3790,25 @@ int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4])
3355} 3790}
3356EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy); 3791EXPORT_SYMBOL_GPL(kvm_mmu_get_spte_hierarchy);
3357 3792
3358#ifdef AUDIT 3793void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
3359
3360static const char *audit_msg;
3361
3362static gva_t canonicalize(gva_t gva)
3363{
3364#ifdef CONFIG_X86_64
3365 gva = (long long)(gva << 16) >> 16;
3366#endif
3367 return gva;
3368}
3369
3370
3371typedef void (*inspect_spte_fn) (struct kvm *kvm, u64 *sptep);
3372
3373static void __mmu_spte_walk(struct kvm *kvm, struct kvm_mmu_page *sp,
3374 inspect_spte_fn fn)
3375{
3376 int i;
3377
3378 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3379 u64 ent = sp->spt[i];
3380
3381 if (is_shadow_present_pte(ent)) {
3382 if (!is_last_spte(ent, sp->role.level)) {
3383 struct kvm_mmu_page *child;
3384 child = page_header(ent & PT64_BASE_ADDR_MASK);
3385 __mmu_spte_walk(kvm, child, fn);
3386 } else
3387 fn(kvm, &sp->spt[i]);
3388 }
3389 }
3390}
3391
3392static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
3393{
3394 int i;
3395 struct kvm_mmu_page *sp;
3396
3397 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
3398 return;
3399 if (vcpu->arch.mmu.shadow_root_level == PT64_ROOT_LEVEL) {
3400 hpa_t root = vcpu->arch.mmu.root_hpa;
3401 sp = page_header(root);
3402 __mmu_spte_walk(vcpu->kvm, sp, fn);
3403 return;
3404 }
3405 for (i = 0; i < 4; ++i) {
3406 hpa_t root = vcpu->arch.mmu.pae_root[i];
3407
3408 if (root && VALID_PAGE(root)) {
3409 root &= PT64_BASE_ADDR_MASK;
3410 sp = page_header(root);
3411 __mmu_spte_walk(vcpu->kvm, sp, fn);
3412 }
3413 }
3414 return;
3415}
3416
3417static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3418 gva_t va, int level)
3419{
3420 u64 *pt = __va(page_pte & PT64_BASE_ADDR_MASK);
3421 int i;
3422 gva_t va_delta = 1ul << (PAGE_SHIFT + 9 * (level - 1));
3423
3424 for (i = 0; i < PT64_ENT_PER_PAGE; ++i, va += va_delta) {
3425 u64 ent = pt[i];
3426
3427 if (ent == shadow_trap_nonpresent_pte)
3428 continue;
3429
3430 va = canonicalize(va);
3431 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3432 audit_mappings_page(vcpu, ent, va, level - 1);
3433 else {
3434 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3435 gfn_t gfn = gpa >> PAGE_SHIFT;
3436 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3437 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
3438
3439 if (is_error_pfn(pfn)) {
3440 kvm_release_pfn_clean(pfn);
3441 continue;
3442 }
3443
3444 if (is_shadow_present_pte(ent)
3445 && (ent & PT64_BASE_ADDR_MASK) != hpa)
3446 printk(KERN_ERR "xx audit error: (%s) levels %d"
3447 " gva %lx gpa %llx hpa %llx ent %llx %d\n",
3448 audit_msg, vcpu->arch.mmu.root_level,
3449 va, gpa, hpa, ent,
3450 is_shadow_present_pte(ent));
3451 else if (ent == shadow_notrap_nonpresent_pte
3452 && !is_error_hpa(hpa))
3453 printk(KERN_ERR "audit: (%s) notrap shadow,"
3454 " valid guest gva %lx\n", audit_msg, va);
3455 kvm_release_pfn_clean(pfn);
3456
3457 }
3458 }
3459}
3460
3461static void audit_mappings(struct kvm_vcpu *vcpu)
3462{
3463 unsigned i;
3464
3465 if (vcpu->arch.mmu.root_level == 4)
3466 audit_mappings_page(vcpu, vcpu->arch.mmu.root_hpa, 0, 4);
3467 else
3468 for (i = 0; i < 4; ++i)
3469 if (vcpu->arch.mmu.pae_root[i] & PT_PRESENT_MASK)
3470 audit_mappings_page(vcpu,
3471 vcpu->arch.mmu.pae_root[i],
3472 i << 30,
3473 2);
3474}
3475
3476static int count_rmaps(struct kvm_vcpu *vcpu)
3477{
3478 struct kvm *kvm = vcpu->kvm;
3479 struct kvm_memslots *slots;
3480 int nmaps = 0;
3481 int i, j, k, idx;
3482
3483 idx = srcu_read_lock(&kvm->srcu);
3484 slots = kvm_memslots(kvm);
3485 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3486 struct kvm_memory_slot *m = &slots->memslots[i];
3487 struct kvm_rmap_desc *d;
3488
3489 for (j = 0; j < m->npages; ++j) {
3490 unsigned long *rmapp = &m->rmap[j];
3491
3492 if (!*rmapp)
3493 continue;
3494 if (!(*rmapp & 1)) {
3495 ++nmaps;
3496 continue;
3497 }
3498 d = (struct kvm_rmap_desc *)(*rmapp & ~1ul);
3499 while (d) {
3500 for (k = 0; k < RMAP_EXT; ++k)
3501 if (d->sptes[k])
3502 ++nmaps;
3503 else
3504 break;
3505 d = d->more;
3506 }
3507 }
3508 }
3509 srcu_read_unlock(&kvm->srcu, idx);
3510 return nmaps;
3511}
3512
3513void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
3514{
3515 unsigned long *rmapp;
3516 struct kvm_mmu_page *rev_sp;
3517 gfn_t gfn;
3518
3519 if (is_writable_pte(*sptep)) {
3520 rev_sp = page_header(__pa(sptep));
3521 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
3522
3523 if (!gfn_to_memslot(kvm, gfn)) {
3524 if (!printk_ratelimit())
3525 return;
3526 printk(KERN_ERR "%s: no memslot for gfn %ld\n",
3527 audit_msg, gfn);
3528 printk(KERN_ERR "%s: index %ld of sp (gfn=%lx)\n",
3529 audit_msg, (long int)(sptep - rev_sp->spt),
3530 rev_sp->gfn);
3531 dump_stack();
3532 return;
3533 }
3534
3535 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
3536 if (!*rmapp) {
3537 if (!printk_ratelimit())
3538 return;
3539 printk(KERN_ERR "%s: no rmap for writable spte %llx\n",
3540 audit_msg, *sptep);
3541 dump_stack();
3542 }
3543 }
3544
3545}
3546
3547void audit_writable_sptes_have_rmaps(struct kvm_vcpu *vcpu)
3548{
3549 mmu_spte_walk(vcpu, inspect_spte_has_rmap);
3550}
3551
3552static void check_writable_mappings_rmap(struct kvm_vcpu *vcpu)
3553{ 3794{
3554 struct kvm_mmu_page *sp; 3795 ASSERT(vcpu);
3555 int i;
3556
3557 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3558 u64 *pt = sp->spt;
3559
3560 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
3561 continue;
3562
3563 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
3564 u64 ent = pt[i];
3565
3566 if (!(ent & PT_PRESENT_MASK))
3567 continue;
3568 if (!is_writable_pte(ent))
3569 continue;
3570 inspect_spte_has_rmap(vcpu->kvm, &pt[i]);
3571 }
3572 }
3573 return;
3574}
3575 3796
3576static void audit_rmap(struct kvm_vcpu *vcpu) 3797 destroy_kvm_mmu(vcpu);
3577{ 3798 free_mmu_pages(vcpu);
3578 check_writable_mappings_rmap(vcpu); 3799 mmu_free_memory_caches(vcpu);
3579 count_rmaps(vcpu);
3580} 3800}
3581 3801
3582static void audit_write_protection(struct kvm_vcpu *vcpu) 3802#ifdef CONFIG_KVM_MMU_AUDIT
3583{ 3803#include "mmu_audit.c"
3584 struct kvm_mmu_page *sp; 3804#else
3585 struct kvm_memory_slot *slot; 3805static void mmu_audit_disable(void) { }
3586 unsigned long *rmapp; 3806#endif
3587 u64 *spte;
3588 gfn_t gfn;
3589
3590 list_for_each_entry(sp, &vcpu->kvm->arch.active_mmu_pages, link) {
3591 if (sp->role.direct)
3592 continue;
3593 if (sp->unsync)
3594 continue;
3595
3596 slot = gfn_to_memslot(vcpu->kvm, sp->gfn);
3597 rmapp = &slot->rmap[gfn - slot->base_gfn];
3598
3599 spte = rmap_next(vcpu->kvm, rmapp, NULL);
3600 while (spte) {
3601 if (is_writable_pte(*spte))
3602 printk(KERN_ERR "%s: (%s) shadow page has "
3603 "writable mappings: gfn %lx role %x\n",
3604 __func__, audit_msg, sp->gfn,
3605 sp->role.word);
3606 spte = rmap_next(vcpu->kvm, rmapp, spte);
3607 }
3608 }
3609}
3610 3807
3611static void kvm_mmu_audit(struct kvm_vcpu *vcpu, const char *msg) 3808void kvm_mmu_module_exit(void)
3612{ 3809{
3613 int olddbg = dbg; 3810 mmu_destroy_caches();
3614 3811 percpu_counter_destroy(&kvm_total_used_mmu_pages);
3615 dbg = 0; 3812 unregister_shrinker(&mmu_shrinker);
3616 audit_msg = msg; 3813 mmu_audit_disable();
3617 audit_rmap(vcpu);
3618 audit_write_protection(vcpu);
3619 if (strcmp("pre pte write", audit_msg) != 0)
3620 audit_mappings(vcpu);
3621 audit_writable_sptes_have_rmaps(vcpu);
3622 dbg = olddbg;
3623} 3814}
3624
3625#endif
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index be66759321a5..7086ca85d3e7 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -49,10 +49,17 @@
49#define PFERR_FETCH_MASK (1U << 4) 49#define PFERR_FETCH_MASK (1U << 4)
50 50
51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
52int kvm_init_shadow_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *context);
53
54static inline unsigned int kvm_mmu_available_pages(struct kvm *kvm)
55{
56 return kvm->arch.n_max_mmu_pages -
57 kvm->arch.n_used_mmu_pages;
58}
52 59
53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 60static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
54{ 61{
55 if (unlikely(vcpu->kvm->arch.n_free_mmu_pages < KVM_MIN_FREE_MMU_PAGES)) 62 if (unlikely(kvm_mmu_available_pages(vcpu->kvm)< KVM_MIN_FREE_MMU_PAGES))
56 __kvm_mmu_free_some_pages(vcpu); 63 __kvm_mmu_free_some_pages(vcpu);
57} 64}
58 65
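With the switch from n_free_mmu_pages to n_used_mmu_pages accounting, "available" is now derived: n_max_mmu_pages minus n_used_mmu_pages. kvm_mmu_free_some_pages() starts reclaiming once that figure drops below KVM_MIN_FREE_MMU_PAGES, and __kvm_mmu_free_some_pages() (earlier in this diff) zaps shadow pages until it climbs back to KVM_REFILL_PAGES. A toy calculation, with the two threshold values assumed for illustration rather than copied from kvm_host.h:

    #include <stdio.h>

    #define KVM_MIN_FREE_MMU_PAGES 5        /* assumed value */
    #define KVM_REFILL_PAGES       25       /* assumed value */

    int main(void)
    {
            unsigned n_max_mmu_pages  = 256;
            unsigned n_used_mmu_pages = 253;
            unsigned avail = n_max_mmu_pages - n_used_mmu_pages;    /* 3 */

            if (avail < KVM_MIN_FREE_MMU_PAGES)
                    printf("reclaim until %u pages are available (zap ~%u shadow pages)\n",
                           KVM_REFILL_PAGES, KVM_REFILL_PAGES - avail);
            return 0;
    }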
diff --git a/arch/x86/kvm/mmu_audit.c b/arch/x86/kvm/mmu_audit.c
new file mode 100644
index 000000000000..5f6223b8bcf7
--- /dev/null
+++ b/arch/x86/kvm/mmu_audit.c
@@ -0,0 +1,304 @@
1/*
2 * mmu_audit.c:
3 *
4 * Audit code for KVM MMU
5 *
6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 *
9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com>
11 * Avi Kivity <avi@qumranet.com>
12 * Marcelo Tosatti <mtosatti@redhat.com>
13 * Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
14 *
15 * This work is licensed under the terms of the GNU GPL, version 2. See
16 * the COPYING file in the top-level directory.
17 *
18 */
19
20#include <linux/ratelimit.h>
21
22#define audit_printk(kvm, fmt, args...) \
23 printk(KERN_ERR "audit: (%s) error: " \
24 fmt, audit_point_name[kvm->arch.audit_point], ##args)
25
26typedef void (*inspect_spte_fn) (struct kvm_vcpu *vcpu, u64 *sptep, int level);
27
28static void __mmu_spte_walk(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
29 inspect_spte_fn fn, int level)
30{
31 int i;
32
33 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
34 u64 *ent = sp->spt;
35
36 fn(vcpu, ent + i, level);
37
38 if (is_shadow_present_pte(ent[i]) &&
39 !is_last_spte(ent[i], level)) {
40 struct kvm_mmu_page *child;
41
42 child = page_header(ent[i] & PT64_BASE_ADDR_MASK);
43 __mmu_spte_walk(vcpu, child, fn, level - 1);
44 }
45 }
46}
47
48static void mmu_spte_walk(struct kvm_vcpu *vcpu, inspect_spte_fn fn)
49{
50 int i;
51 struct kvm_mmu_page *sp;
52
53 if (!VALID_PAGE(vcpu->arch.mmu.root_hpa))
54 return;
55
56 if (vcpu->arch.mmu.root_level == PT64_ROOT_LEVEL) {
57 hpa_t root = vcpu->arch.mmu.root_hpa;
58
59 sp = page_header(root);
60 __mmu_spte_walk(vcpu, sp, fn, PT64_ROOT_LEVEL);
61 return;
62 }
63
64 for (i = 0; i < 4; ++i) {
65 hpa_t root = vcpu->arch.mmu.pae_root[i];
66
67 if (root && VALID_PAGE(root)) {
68 root &= PT64_BASE_ADDR_MASK;
69 sp = page_header(root);
70 __mmu_spte_walk(vcpu, sp, fn, 2);
71 }
72 }
73
74 return;
75}
76
77typedef void (*sp_handler) (struct kvm *kvm, struct kvm_mmu_page *sp);
78
79static void walk_all_active_sps(struct kvm *kvm, sp_handler fn)
80{
81 struct kvm_mmu_page *sp;
82
83 list_for_each_entry(sp, &kvm->arch.active_mmu_pages, link)
84 fn(kvm, sp);
85}
86
87static void audit_mappings(struct kvm_vcpu *vcpu, u64 *sptep, int level)
88{
89 struct kvm_mmu_page *sp;
90 gfn_t gfn;
91 pfn_t pfn;
92 hpa_t hpa;
93
94 sp = page_header(__pa(sptep));
95
96 if (sp->unsync) {
97 if (level != PT_PAGE_TABLE_LEVEL) {
98 audit_printk(vcpu->kvm, "unsync sp: %p "
99 "level = %d\n", sp, level);
100 return;
101 }
102
103 if (*sptep == shadow_notrap_nonpresent_pte) {
104 audit_printk(vcpu->kvm, "notrap spte in unsync "
105 "sp: %p\n", sp);
106 return;
107 }
108 }
109
110 if (sp->role.direct && *sptep == shadow_notrap_nonpresent_pte) {
111 audit_printk(vcpu->kvm, "notrap spte in direct sp: %p\n",
112 sp);
113 return;
114 }
115
116 if (!is_shadow_present_pte(*sptep) || !is_last_spte(*sptep, level))
117 return;
118
119 gfn = kvm_mmu_page_get_gfn(sp, sptep - sp->spt);
120 pfn = gfn_to_pfn_atomic(vcpu->kvm, gfn);
121
122 if (is_error_pfn(pfn)) {
123 kvm_release_pfn_clean(pfn);
124 return;
125 }
126
127 hpa = pfn << PAGE_SHIFT;
128 if ((*sptep & PT64_BASE_ADDR_MASK) != hpa)
129 audit_printk(vcpu->kvm, "levels %d pfn %llx hpa %llx "
130 "ent %llxn", vcpu->arch.mmu.root_level, pfn,
131 hpa, *sptep);
132}
133
134static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep)
135{
136 unsigned long *rmapp;
137 struct kvm_mmu_page *rev_sp;
138 gfn_t gfn;
139
140
141 rev_sp = page_header(__pa(sptep));
142 gfn = kvm_mmu_page_get_gfn(rev_sp, sptep - rev_sp->spt);
143
144 if (!gfn_to_memslot(kvm, gfn)) {
145 if (!printk_ratelimit())
146 return;
147 audit_printk(kvm, "no memslot for gfn %llx\n", gfn);
148 audit_printk(kvm, "index %ld of sp (gfn=%llx)\n",
149 (long int)(sptep - rev_sp->spt), rev_sp->gfn);
150 dump_stack();
151 return;
152 }
153
154 rmapp = gfn_to_rmap(kvm, gfn, rev_sp->role.level);
155 if (!*rmapp) {
156 if (!printk_ratelimit())
157 return;
158 audit_printk(kvm, "no rmap for writable spte %llx\n",
159 *sptep);
160 dump_stack();
161 }
162}
163
164static void audit_sptes_have_rmaps(struct kvm_vcpu *vcpu, u64 *sptep, int level)
165{
166 if (is_shadow_present_pte(*sptep) && is_last_spte(*sptep, level))
167 inspect_spte_has_rmap(vcpu->kvm, sptep);
168}
169
170static void audit_spte_after_sync(struct kvm_vcpu *vcpu, u64 *sptep, int level)
171{
172 struct kvm_mmu_page *sp = page_header(__pa(sptep));
173
174 if (vcpu->kvm->arch.audit_point == AUDIT_POST_SYNC && sp->unsync)
175 audit_printk(vcpu->kvm, "meet unsync sp(%p) after sync "
176 "root.\n", sp);
177}
178
179static void check_mappings_rmap(struct kvm *kvm, struct kvm_mmu_page *sp)
180{
181 int i;
182
183 if (sp->role.level != PT_PAGE_TABLE_LEVEL)
184 return;
185
186 for (i = 0; i < PT64_ENT_PER_PAGE; ++i) {
187 if (!is_rmap_spte(sp->spt[i]))
188 continue;
189
190 inspect_spte_has_rmap(kvm, sp->spt + i);
191 }
192}
193
194static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp)
195{
196 struct kvm_memory_slot *slot;
197 unsigned long *rmapp;
198 u64 *spte;
199
200 if (sp->role.direct || sp->unsync || sp->role.invalid)
201 return;
202
203 slot = gfn_to_memslot(kvm, sp->gfn);
204 rmapp = &slot->rmap[sp->gfn - slot->base_gfn];
205
206 spte = rmap_next(kvm, rmapp, NULL);
207 while (spte) {
208 if (is_writable_pte(*spte))
209 audit_printk(kvm, "shadow page has writable "
210 "mappings: gfn %llx role %x\n",
211 sp->gfn, sp->role.word);
212 spte = rmap_next(kvm, rmapp, spte);
213 }
214}
215
216static void audit_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
217{
218 check_mappings_rmap(kvm, sp);
219 audit_write_protection(kvm, sp);
220}
221
222static void audit_all_active_sps(struct kvm *kvm)
223{
224 walk_all_active_sps(kvm, audit_sp);
225}
226
227static void audit_spte(struct kvm_vcpu *vcpu, u64 *sptep, int level)
228{
229 audit_sptes_have_rmaps(vcpu, sptep, level);
230 audit_mappings(vcpu, sptep, level);
231 audit_spte_after_sync(vcpu, sptep, level);
232}
233
234static void audit_vcpu_spte(struct kvm_vcpu *vcpu)
235{
236 mmu_spte_walk(vcpu, audit_spte);
237}
238
239static void kvm_mmu_audit(void *ignore, struct kvm_vcpu *vcpu, int point)
240{
241 static DEFINE_RATELIMIT_STATE(ratelimit_state, 5 * HZ, 10);
242
243 if (!__ratelimit(&ratelimit_state))
244 return;
245
246 vcpu->kvm->arch.audit_point = point;
247 audit_all_active_sps(vcpu->kvm);
248 audit_vcpu_spte(vcpu);
249}
250
251static bool mmu_audit;
252
253static void mmu_audit_enable(void)
254{
255 int ret;
256
257 if (mmu_audit)
258 return;
259
260 ret = register_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
261 WARN_ON(ret);
262
263 mmu_audit = true;
264}
265
266static void mmu_audit_disable(void)
267{
268 if (!mmu_audit)
269 return;
270
271 unregister_trace_kvm_mmu_audit(kvm_mmu_audit, NULL);
272 tracepoint_synchronize_unregister();
273 mmu_audit = false;
274}
275
276static int mmu_audit_set(const char *val, const struct kernel_param *kp)
277{
278 int ret;
279 unsigned long enable;
280
281 ret = strict_strtoul(val, 10, &enable);
282 if (ret < 0)
283 return -EINVAL;
284
285 switch (enable) {
286 case 0:
287 mmu_audit_disable();
288 break;
289 case 1:
290 mmu_audit_enable();
291 break;
292 default:
293 return -EINVAL;
294 }
295
296 return 0;
297}
298
299static struct kernel_param_ops audit_param_ops = {
300 .set = mmu_audit_set,
301 .get = param_get_bool,
302};
303
304module_param_cb(mmu_audit, &audit_param_ops, &mmu_audit, 0644);
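The new mmu_audit module parameter is the on/off switch for everything above: writing 1 registers kvm_mmu_audit() on the kvm_mmu_audit tracepoint, writing 0 unregisters it and waits out in-flight probes with tracepoint_synchronize_unregister(). Since mmu_audit.c is #included from mmu.c, the knob should normally show up as /sys/module/kvm/parameters/mmu_audit. A stripped-down module_param_cb() sketch with the same shape (demo names only, not the code above):

    #include <linux/module.h>
    #include <linux/moduleparam.h>

    static bool demo_flag;

    static int demo_flag_set(const char *val, const struct kernel_param *kp)
    {
            int ret = param_set_bool(val, kp);      /* accepts 0/1, y/n */

            if (!ret)
                    pr_info("demo_flag is now %d\n", demo_flag);
            return ret;
    }

    static struct kernel_param_ops demo_flag_ops = {
            .set = demo_flag_set,
            .get = param_get_bool,
    };

    module_param_cb(demo_flag, &demo_flag_ops, &demo_flag, 0644);
    MODULE_LICENSE("GPL");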
diff --git a/arch/x86/kvm/mmutrace.h b/arch/x86/kvm/mmutrace.h
index 3aab0f0930ef..b60b4fdb3eda 100644
--- a/arch/x86/kvm/mmutrace.h
+++ b/arch/x86/kvm/mmutrace.h
@@ -195,6 +195,25 @@ DEFINE_EVENT(kvm_mmu_page_class, kvm_mmu_prepare_zap_page,
195 195
196 TP_ARGS(sp) 196 TP_ARGS(sp)
197); 197);
198
199TRACE_EVENT(
200 kvm_mmu_audit,
201 TP_PROTO(struct kvm_vcpu *vcpu, int audit_point),
202 TP_ARGS(vcpu, audit_point),
203
204 TP_STRUCT__entry(
205 __field(struct kvm_vcpu *, vcpu)
206 __field(int, audit_point)
207 ),
208
209 TP_fast_assign(
210 __entry->vcpu = vcpu;
211 __entry->audit_point = audit_point;
212 ),
213
214 TP_printk("vcpu:%d %s", __entry->vcpu->cpu,
215 audit_point_name[__entry->audit_point])
216);
198#endif /* _TRACE_KVMMMU_H */ 217#endif /* _TRACE_KVMMMU_H */
199 218
200#undef TRACE_INCLUDE_PATH 219#undef TRACE_INCLUDE_PATH
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 51ef9097960d..9d03ad4dd5ec 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -7,7 +7,7 @@
7 * MMU support 7 * MMU support
8 * 8 *
9 * Copyright (C) 2006 Qumranet, Inc. 9 * Copyright (C) 2006 Qumranet, Inc.
10 * Copyright 2010 Red Hat, Inc. and/or its affilates. 10 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
11 * 11 *
12 * Authors: 12 * Authors:
13 * Yaniv Kamay <yaniv@qumranet.com> 13 * Yaniv Kamay <yaniv@qumranet.com>
@@ -31,7 +31,6 @@
31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl) 31 #define PT_LVL_ADDR_MASK(lvl) PT64_LVL_ADDR_MASK(lvl)
32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl) 32 #define PT_LVL_OFFSET_MASK(lvl) PT64_LVL_OFFSET_MASK(lvl)
33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level) 33 #define PT_INDEX(addr, level) PT64_INDEX(addr, level)
34 #define PT_LEVEL_MASK(level) PT64_LEVEL_MASK(level)
35 #define PT_LEVEL_BITS PT64_LEVEL_BITS 34 #define PT_LEVEL_BITS PT64_LEVEL_BITS
36 #ifdef CONFIG_X86_64 35 #ifdef CONFIG_X86_64
37 #define PT_MAX_FULL_LEVELS 4 36 #define PT_MAX_FULL_LEVELS 4
@@ -48,7 +47,6 @@
48 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl) 47 #define PT_LVL_ADDR_MASK(lvl) PT32_LVL_ADDR_MASK(lvl)
49 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl) 48 #define PT_LVL_OFFSET_MASK(lvl) PT32_LVL_OFFSET_MASK(lvl)
50 #define PT_INDEX(addr, level) PT32_INDEX(addr, level) 49 #define PT_INDEX(addr, level) PT32_INDEX(addr, level)
51 #define PT_LEVEL_MASK(level) PT32_LEVEL_MASK(level)
52 #define PT_LEVEL_BITS PT32_LEVEL_BITS 50 #define PT_LEVEL_BITS PT32_LEVEL_BITS
53 #define PT_MAX_FULL_LEVELS 2 51 #define PT_MAX_FULL_LEVELS 2
54 #define CMPXCHG cmpxchg 52 #define CMPXCHG cmpxchg
@@ -67,11 +65,12 @@ struct guest_walker {
67 int level; 65 int level;
68 gfn_t table_gfn[PT_MAX_FULL_LEVELS]; 66 gfn_t table_gfn[PT_MAX_FULL_LEVELS];
69 pt_element_t ptes[PT_MAX_FULL_LEVELS]; 67 pt_element_t ptes[PT_MAX_FULL_LEVELS];
68 pt_element_t prefetch_ptes[PTE_PREFETCH_NUM];
70 gpa_t pte_gpa[PT_MAX_FULL_LEVELS]; 69 gpa_t pte_gpa[PT_MAX_FULL_LEVELS];
71 unsigned pt_access; 70 unsigned pt_access;
72 unsigned pte_access; 71 unsigned pte_access;
73 gfn_t gfn; 72 gfn_t gfn;
74 u32 error_code; 73 struct x86_exception fault;
75}; 74};
76 75
77static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl) 76static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
@@ -79,15 +78,19 @@ static gfn_t gpte_to_gfn_lvl(pt_element_t gpte, int lvl)
79 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT; 78 return (gpte & PT_LVL_ADDR_MASK(lvl)) >> PAGE_SHIFT;
80} 79}
81 80
82static bool FNAME(cmpxchg_gpte)(struct kvm *kvm, 81static int FNAME(cmpxchg_gpte)(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
83 gfn_t table_gfn, unsigned index, 82 pt_element_t __user *ptep_user, unsigned index,
84 pt_element_t orig_pte, pt_element_t new_pte) 83 pt_element_t orig_pte, pt_element_t new_pte)
85{ 84{
85 int npages;
86 pt_element_t ret; 86 pt_element_t ret;
87 pt_element_t *table; 87 pt_element_t *table;
88 struct page *page; 88 struct page *page;
89 89
90 page = gfn_to_page(kvm, table_gfn); 90 npages = get_user_pages_fast((unsigned long)ptep_user, 1, 1, &page);
91 /* Check if the user is doing something meaningless. */
92 if (unlikely(npages != 1))
93 return -EFAULT;
91 94
92 table = kmap_atomic(page, KM_USER0); 95 table = kmap_atomic(page, KM_USER0);
93 ret = CMPXCHG(&table[index], orig_pte, new_pte); 96 ret = CMPXCHG(&table[index], orig_pte, new_pte);
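The rewritten FNAME(cmpxchg_gpte) pins the guest pte through its user-space mapping (get_user_pages_fast() on ptep_user) instead of going through gfn_to_page(), and now returns -EFAULT when that address is bogus, but the central idea is unchanged: flip the accessed or dirty bit with a compare-and-exchange so a concurrent guest update is never silently overwritten. A user-space analogy of that pattern (GCC atomics, not kernel code; the pte value is made up):

    #include <stdio.h>
    #include <stdint.h>

    #define PT_ACCESSED_MASK (1ULL << 5)    /* x86 A bit in a pte */

    int main(void)
    {
            uint64_t pte = 0x1000 | 1;      /* pretend guest pte: present, A clear */
            uint64_t old = pte;

            /* Only install pte|A if nobody changed the entry under us;
             * otherwise the caller retries the whole walk. */
            if (__atomic_compare_exchange_n(&pte, &old, old | PT_ACCESSED_MASK,
                                            0, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST))
                    printf("accessed bit set, pte = %#llx\n",
                           (unsigned long long)pte);
            else
                    printf("lost the race, re-walk; current pte = %#llx\n",
                           (unsigned long long)old);
            return 0;
    }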
@@ -104,7 +107,7 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
104 107
105 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK; 108 access = (gpte & (PT_WRITABLE_MASK | PT_USER_MASK)) | ACC_EXEC_MASK;
106#if PTTYPE == 64 109#if PTTYPE == 64
107 if (is_nx(vcpu)) 110 if (vcpu->arch.mmu.nx)
108 access &= ~(gpte >> PT64_NX_SHIFT); 111 access &= ~(gpte >> PT64_NX_SHIFT);
109#endif 112#endif
110 return access; 113 return access;
@@ -113,26 +116,33 @@ static unsigned FNAME(gpte_access)(struct kvm_vcpu *vcpu, pt_element_t gpte)
113/* 116/*
114 * Fetch a guest pte for a guest virtual address 117 * Fetch a guest pte for a guest virtual address
115 */ 118 */
116static int FNAME(walk_addr)(struct guest_walker *walker, 119static int FNAME(walk_addr_generic)(struct guest_walker *walker,
117 struct kvm_vcpu *vcpu, gva_t addr, 120 struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
118 int write_fault, int user_fault, int fetch_fault) 121 gva_t addr, u32 access)
119{ 122{
120 pt_element_t pte; 123 pt_element_t pte;
124 pt_element_t __user *uninitialized_var(ptep_user);
121 gfn_t table_gfn; 125 gfn_t table_gfn;
122 unsigned index, pt_access, uninitialized_var(pte_access); 126 unsigned index, pt_access, uninitialized_var(pte_access);
123 gpa_t pte_gpa; 127 gpa_t pte_gpa;
124 bool eperm, present, rsvd_fault; 128 bool eperm, present, rsvd_fault;
129 int offset, write_fault, user_fault, fetch_fault;
130
131 write_fault = access & PFERR_WRITE_MASK;
132 user_fault = access & PFERR_USER_MASK;
133 fetch_fault = access & PFERR_FETCH_MASK;
125 134
126 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault, 135 trace_kvm_mmu_pagetable_walk(addr, write_fault, user_fault,
127 fetch_fault); 136 fetch_fault);
128walk: 137walk:
129 present = true; 138 present = true;
130 eperm = rsvd_fault = false; 139 eperm = rsvd_fault = false;
131 walker->level = vcpu->arch.mmu.root_level; 140 walker->level = mmu->root_level;
132 pte = vcpu->arch.cr3; 141 pte = mmu->get_cr3(vcpu);
142
133#if PTTYPE == 64 143#if PTTYPE == 64
134 if (!is_long_mode(vcpu)) { 144 if (walker->level == PT32E_ROOT_LEVEL) {
135 pte = kvm_pdptr_read(vcpu, (addr >> 30) & 3); 145 pte = kvm_pdptr_read_mmu(vcpu, mmu, (addr >> 30) & 3);
136 trace_kvm_mmu_paging_element(pte, walker->level); 146 trace_kvm_mmu_paging_element(pte, walker->level);
137 if (!is_present_gpte(pte)) { 147 if (!is_present_gpte(pte)) {
138 present = false; 148 present = false;
@@ -142,54 +152,80 @@ walk:
142 } 152 }
143#endif 153#endif
144 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) || 154 ASSERT((!is_long_mode(vcpu) && is_pae(vcpu)) ||
145 (vcpu->arch.cr3 & CR3_NONPAE_RESERVED_BITS) == 0); 155 (mmu->get_cr3(vcpu) & CR3_NONPAE_RESERVED_BITS) == 0);
146 156
147 pt_access = ACC_ALL; 157 pt_access = ACC_ALL;
148 158
149 for (;;) { 159 for (;;) {
160 gfn_t real_gfn;
161 unsigned long host_addr;
162
150 index = PT_INDEX(addr, walker->level); 163 index = PT_INDEX(addr, walker->level);
151 164
152 table_gfn = gpte_to_gfn(pte); 165 table_gfn = gpte_to_gfn(pte);
153 pte_gpa = gfn_to_gpa(table_gfn); 166 offset = index * sizeof(pt_element_t);
154 pte_gpa += index * sizeof(pt_element_t); 167 pte_gpa = gfn_to_gpa(table_gfn) + offset;
155 walker->table_gfn[walker->level - 1] = table_gfn; 168 walker->table_gfn[walker->level - 1] = table_gfn;
156 walker->pte_gpa[walker->level - 1] = pte_gpa; 169 walker->pte_gpa[walker->level - 1] = pte_gpa;
157 170
158 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte))) { 171 real_gfn = mmu->translate_gpa(vcpu, gfn_to_gpa(table_gfn),
172 PFERR_USER_MASK|PFERR_WRITE_MASK);
173 if (unlikely(real_gfn == UNMAPPED_GVA)) {
174 present = false;
175 break;
176 }
177 real_gfn = gpa_to_gfn(real_gfn);
178
179 host_addr = gfn_to_hva(vcpu->kvm, real_gfn);
180 if (unlikely(kvm_is_error_hva(host_addr))) {
181 present = false;
182 break;
183 }
184
185 ptep_user = (pt_element_t __user *)((void *)host_addr + offset);
186 if (unlikely(__copy_from_user(&pte, ptep_user, sizeof(pte)))) {
159 present = false; 187 present = false;
160 break; 188 break;
161 } 189 }
162 190
163 trace_kvm_mmu_paging_element(pte, walker->level); 191 trace_kvm_mmu_paging_element(pte, walker->level);
164 192
165 if (!is_present_gpte(pte)) { 193 if (unlikely(!is_present_gpte(pte))) {
166 present = false; 194 present = false;
167 break; 195 break;
168 } 196 }
169 197
170 if (is_rsvd_bits_set(vcpu, pte, walker->level)) { 198 if (unlikely(is_rsvd_bits_set(&vcpu->arch.mmu, pte,
199 walker->level))) {
171 rsvd_fault = true; 200 rsvd_fault = true;
172 break; 201 break;
173 } 202 }
174 203
175 if (write_fault && !is_writable_pte(pte)) 204 if (unlikely(write_fault && !is_writable_pte(pte)
176 if (user_fault || is_write_protection(vcpu)) 205 && (user_fault || is_write_protection(vcpu))))
177 eperm = true; 206 eperm = true;
178 207
179 if (user_fault && !(pte & PT_USER_MASK)) 208 if (unlikely(user_fault && !(pte & PT_USER_MASK)))
180 eperm = true; 209 eperm = true;
181 210
182#if PTTYPE == 64 211#if PTTYPE == 64
183 if (fetch_fault && (pte & PT64_NX_MASK)) 212 if (unlikely(fetch_fault && (pte & PT64_NX_MASK)))
184 eperm = true; 213 eperm = true;
185#endif 214#endif
186 215
187 if (!eperm && !rsvd_fault && !(pte & PT_ACCESSED_MASK)) { 216 if (!eperm && !rsvd_fault
217 && unlikely(!(pte & PT_ACCESSED_MASK))) {
218 int ret;
188 trace_kvm_mmu_set_accessed_bit(table_gfn, index, 219 trace_kvm_mmu_set_accessed_bit(table_gfn, index,
189 sizeof(pte)); 220 sizeof(pte));
190 if (FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, 221 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
191 index, pte, pte|PT_ACCESSED_MASK)) 222 pte, pte|PT_ACCESSED_MASK);
223 if (unlikely(ret < 0)) {
224 present = false;
225 break;
226 } else if (ret)
192 goto walk; 227 goto walk;
228
193 mark_page_dirty(vcpu->kvm, table_gfn); 229 mark_page_dirty(vcpu->kvm, table_gfn);
194 pte |= PT_ACCESSED_MASK; 230 pte |= PT_ACCESSED_MASK;
195 } 231 }
@@ -204,17 +240,28 @@ walk:
204 (PTTYPE == 64 || is_pse(vcpu))) || 240 (PTTYPE == 64 || is_pse(vcpu))) ||
205 ((walker->level == PT_PDPE_LEVEL) && 241 ((walker->level == PT_PDPE_LEVEL) &&
206 is_large_pte(pte) && 242 is_large_pte(pte) &&
207 is_long_mode(vcpu))) { 243 mmu->root_level == PT64_ROOT_LEVEL)) {
208 int lvl = walker->level; 244 int lvl = walker->level;
245 gpa_t real_gpa;
246 gfn_t gfn;
247 u32 ac;
209 248
210 walker->gfn = gpte_to_gfn_lvl(pte, lvl); 249 gfn = gpte_to_gfn_lvl(pte, lvl);
211 walker->gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) 250 gfn += (addr & PT_LVL_OFFSET_MASK(lvl)) >> PAGE_SHIFT;
212 >> PAGE_SHIFT;
213 251
214 if (PTTYPE == 32 && 252 if (PTTYPE == 32 &&
215 walker->level == PT_DIRECTORY_LEVEL && 253 walker->level == PT_DIRECTORY_LEVEL &&
216 is_cpuid_PSE36()) 254 is_cpuid_PSE36())
217 walker->gfn += pse36_gfn_delta(pte); 255 gfn += pse36_gfn_delta(pte);
256
257 ac = write_fault | fetch_fault | user_fault;
258
259 real_gpa = mmu->translate_gpa(vcpu, gfn_to_gpa(gfn),
260 ac);
261 if (real_gpa == UNMAPPED_GVA)
262 return 0;
263
264 walker->gfn = real_gpa >> PAGE_SHIFT;
218 265
219 break; 266 break;
220 } 267 }
@@ -223,17 +270,21 @@ walk:
223 --walker->level; 270 --walker->level;
224 } 271 }
225 272
226 if (!present || eperm || rsvd_fault) 273 if (unlikely(!present || eperm || rsvd_fault))
227 goto error; 274 goto error;
228 275
229 if (write_fault && !is_dirty_gpte(pte)) { 276 if (write_fault && unlikely(!is_dirty_gpte(pte))) {
230 bool ret; 277 int ret;
231 278
232 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte)); 279 trace_kvm_mmu_set_dirty_bit(table_gfn, index, sizeof(pte));
233 ret = FNAME(cmpxchg_gpte)(vcpu->kvm, table_gfn, index, pte, 280 ret = FNAME(cmpxchg_gpte)(vcpu, mmu, ptep_user, index,
234 pte|PT_DIRTY_MASK); 281 pte, pte|PT_DIRTY_MASK);
235 if (ret) 282 if (unlikely(ret < 0)) {
283 present = false;
284 goto error;
285 } else if (ret)
236 goto walk; 286 goto walk;
287
237 mark_page_dirty(vcpu->kvm, table_gfn); 288 mark_page_dirty(vcpu->kvm, table_gfn);
238 pte |= PT_DIRTY_MASK; 289 pte |= PT_DIRTY_MASK;
239 walker->ptes[walker->level - 1] = pte; 290 walker->ptes[walker->level - 1] = pte;
@@ -246,52 +297,87 @@ walk:
246 return 1; 297 return 1;
247 298
248error: 299error:
249 walker->error_code = 0; 300 walker->fault.vector = PF_VECTOR;
301 walker->fault.error_code_valid = true;
302 walker->fault.error_code = 0;
250 if (present) 303 if (present)
251 walker->error_code |= PFERR_PRESENT_MASK; 304 walker->fault.error_code |= PFERR_PRESENT_MASK;
252 if (write_fault) 305
253 walker->error_code |= PFERR_WRITE_MASK; 306 walker->fault.error_code |= write_fault | user_fault;
254 if (user_fault) 307
255 walker->error_code |= PFERR_USER_MASK; 308 if (fetch_fault && mmu->nx)
256 if (fetch_fault && is_nx(vcpu)) 309 walker->fault.error_code |= PFERR_FETCH_MASK;
257 walker->error_code |= PFERR_FETCH_MASK;
258 if (rsvd_fault) 310 if (rsvd_fault)
259 walker->error_code |= PFERR_RSVD_MASK; 311 walker->fault.error_code |= PFERR_RSVD_MASK;
260 trace_kvm_mmu_walker_error(walker->error_code); 312
313 walker->fault.address = addr;
314 walker->fault.nested_page_fault = mmu != vcpu->arch.walk_mmu;
315
316 trace_kvm_mmu_walker_error(walker->fault.error_code);
261 return 0; 317 return 0;
262} 318}
263 319
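walk_addr_generic() now takes the raw PFERR_* access mask and, on failure, fills in a struct x86_exception instead of a bare error_code. The line "walker->fault.error_code |= write_fault | user_fault;" works because write_fault and user_fault are the already-positioned PFERR bits, not booleans. A small stand-alone check; only PFERR_FETCH_MASK is visible in the mmu.h hunk above, the other values follow the standard x86 #PF bit layout:

    #include <stdio.h>

    #define PFERR_PRESENT_MASK (1U << 0)
    #define PFERR_WRITE_MASK   (1U << 1)
    #define PFERR_USER_MASK    (1U << 2)
    #define PFERR_RSVD_MASK    (1U << 3)
    #define PFERR_FETCH_MASK   (1U << 4)

    int main(void)
    {
            unsigned access = PFERR_WRITE_MASK | PFERR_USER_MASK;   /* user-mode write */
            unsigned ec = 0;

            /* access already carries the PFERR bit values, so OR-ing it in
             * is all the "conversion" the error code needs */
            ec |= access & (PFERR_WRITE_MASK | PFERR_USER_MASK);
            ec |= PFERR_PRESENT_MASK;       /* e.g. a permission fault on a present pte */

            printf("error code = %#x\n", ec);       /* prints 0x7 */
            return 0;
    }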
320static int FNAME(walk_addr)(struct guest_walker *walker,
321 struct kvm_vcpu *vcpu, gva_t addr, u32 access)
322{
323 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.mmu, addr,
324 access);
325}
326
327static int FNAME(walk_addr_nested)(struct guest_walker *walker,
328 struct kvm_vcpu *vcpu, gva_t addr,
329 u32 access)
330{
331 return FNAME(walk_addr_generic)(walker, vcpu, &vcpu->arch.nested_mmu,
332 addr, access);
333}
334
335static bool FNAME(prefetch_invalid_gpte)(struct kvm_vcpu *vcpu,
336 struct kvm_mmu_page *sp, u64 *spte,
337 pt_element_t gpte)
338{
339 u64 nonpresent = shadow_trap_nonpresent_pte;
340
341 if (is_rsvd_bits_set(&vcpu->arch.mmu, gpte, PT_PAGE_TABLE_LEVEL))
342 goto no_present;
343
344 if (!is_present_gpte(gpte)) {
345 if (!sp->unsync)
346 nonpresent = shadow_notrap_nonpresent_pte;
347 goto no_present;
348 }
349
350 if (!(gpte & PT_ACCESSED_MASK))
351 goto no_present;
352
353 return false;
354
355no_present:
356 drop_spte(vcpu->kvm, spte, nonpresent);
357 return true;
358}
359
264static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 360static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
265 u64 *spte, const void *pte) 361 u64 *spte, const void *pte)
266{ 362{
267 pt_element_t gpte; 363 pt_element_t gpte;
268 unsigned pte_access; 364 unsigned pte_access;
269 pfn_t pfn; 365 pfn_t pfn;
270 u64 new_spte;
271 366
272 gpte = *(const pt_element_t *)pte; 367 gpte = *(const pt_element_t *)pte;
273 if (~gpte & (PT_PRESENT_MASK | PT_ACCESSED_MASK)) { 368 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
274 if (!is_present_gpte(gpte)) {
275 if (sp->unsync)
276 new_spte = shadow_trap_nonpresent_pte;
277 else
278 new_spte = shadow_notrap_nonpresent_pte;
279 __set_spte(spte, new_spte);
280 }
281 return; 369 return;
282 } 370
283 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte); 371 pgprintk("%s: gpte %llx spte %p\n", __func__, (u64)gpte, spte);
284 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 372 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
285 if (gpte_to_gfn(gpte) != vcpu->arch.update_pte.gfn) 373 pfn = gfn_to_pfn_atomic(vcpu->kvm, gpte_to_gfn(gpte));
374 if (is_error_pfn(pfn)) {
375 kvm_release_pfn_clean(pfn);
286 return; 376 return;
287 pfn = vcpu->arch.update_pte.pfn; 377 }
288 if (is_error_pfn(pfn)) 378
289 return;
290 if (mmu_notifier_retry(vcpu, vcpu->arch.update_pte.mmu_seq))
291 return;
292 kvm_get_pfn(pfn);
293 /* 379 /*
294 * we call mmu_set_spte() with reset_host_protection = true beacuse that 380 * we call mmu_set_spte() with host_writable = true because that
295 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1). 381 * vcpu->arch.update_pte.pfn was fetched from get_user_pages(write = 1).
296 */ 382 */
297 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0, 383 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
@@ -302,21 +388,87 @@ static void FNAME(update_pte)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
302static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu, 388static bool FNAME(gpte_changed)(struct kvm_vcpu *vcpu,
303 struct guest_walker *gw, int level) 389 struct guest_walker *gw, int level)
304{ 390{
305 int r;
306 pt_element_t curr_pte; 391 pt_element_t curr_pte;
307 392 gpa_t base_gpa, pte_gpa = gw->pte_gpa[level - 1];
308 r = kvm_read_guest_atomic(vcpu->kvm, gw->pte_gpa[level - 1], 393 u64 mask;
394 int r, index;
395
396 if (level == PT_PAGE_TABLE_LEVEL) {
397 mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;
398 base_gpa = pte_gpa & ~mask;
399 index = (pte_gpa - base_gpa) / sizeof(pt_element_t);
400
401 r = kvm_read_guest_atomic(vcpu->kvm, base_gpa,
402 gw->prefetch_ptes, sizeof(gw->prefetch_ptes));
403 curr_pte = gw->prefetch_ptes[index];
404 } else
405 r = kvm_read_guest_atomic(vcpu->kvm, pte_gpa,
309 &curr_pte, sizeof(curr_pte)); 406 &curr_pte, sizeof(curr_pte));
407
310 return r || curr_pte != gw->ptes[level - 1]; 408 return r || curr_pte != gw->ptes[level - 1];
311} 409}
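For last-level ptes, FNAME(gpte_changed) now rereads a whole naturally aligned block of PTE_PREFETCH_NUM guest ptes into gw->prefetch_ptes, so the prefetch path below can reuse them instead of issuing one guest read per entry. The index math, with PTE_PREFETCH_NUM assumed to be 8 here (it is defined elsewhere in this patch) and 8-byte ptes:

    #include <stdio.h>
    #include <stdint.h>

    #define PTE_PREFETCH_NUM 8              /* assumed value */

    typedef uint64_t pt_element_t;

    int main(void)
    {
            uint64_t pte_gpa = 0x1038;      /* gpa of the guest pte of interest */
            uint64_t mask = PTE_PREFETCH_NUM * sizeof(pt_element_t) - 1;    /* 0x3f */
            uint64_t base = pte_gpa & ~mask;                                /* 0x1000 */
            unsigned index = (pte_gpa - base) / sizeof(pt_element_t);       /* 7 */

            printf("read %zu bytes at %#llx, our pte is slot %u\n",
                   PTE_PREFETCH_NUM * sizeof(pt_element_t),
                   (unsigned long long)base, index);
            return 0;
    }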
312 410
411static void FNAME(pte_prefetch)(struct kvm_vcpu *vcpu, struct guest_walker *gw,
412 u64 *sptep)
413{
414 struct kvm_mmu_page *sp;
415 pt_element_t *gptep = gw->prefetch_ptes;
416 u64 *spte;
417 int i;
418
419 sp = page_header(__pa(sptep));
420
421 if (sp->role.level > PT_PAGE_TABLE_LEVEL)
422 return;
423
424 if (sp->role.direct)
425 return __direct_pte_prefetch(vcpu, sp, sptep);
426
427 i = (sptep - sp->spt) & ~(PTE_PREFETCH_NUM - 1);
428 spte = sp->spt + i;
429
430 for (i = 0; i < PTE_PREFETCH_NUM; i++, spte++) {
431 pt_element_t gpte;
432 unsigned pte_access;
433 gfn_t gfn;
434 pfn_t pfn;
435 bool dirty;
436
437 if (spte == sptep)
438 continue;
439
440 if (*spte != shadow_trap_nonpresent_pte)
441 continue;
442
443 gpte = gptep[i];
444
445 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, spte, gpte))
446 continue;
447
448 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
449 gfn = gpte_to_gfn(gpte);
450 dirty = is_dirty_gpte(gpte);
451 pfn = pte_prefetch_gfn_to_pfn(vcpu, gfn,
452 (pte_access & ACC_WRITE_MASK) && dirty);
453 if (is_error_pfn(pfn)) {
454 kvm_release_pfn_clean(pfn);
455 break;
456 }
457
458 mmu_set_spte(vcpu, spte, sp->role.access, pte_access, 0, 0,
459 dirty, NULL, PT_PAGE_TABLE_LEVEL, gfn,
460 pfn, true, true);
461 }
462}
463
313/* 464/*
314 * Fetch a shadow pte for a specific level in the paging hierarchy. 465 * Fetch a shadow pte for a specific level in the paging hierarchy.
315 */ 466 */
316static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr, 467static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
317 struct guest_walker *gw, 468 struct guest_walker *gw,
318 int user_fault, int write_fault, int hlevel, 469 int user_fault, int write_fault, int hlevel,
319 int *ptwrite, pfn_t pfn) 470 int *ptwrite, pfn_t pfn, bool map_writable,
471 bool prefault)
320{ 472{
321 unsigned access = gw->pt_access; 473 unsigned access = gw->pt_access;
322 struct kvm_mmu_page *sp = NULL; 474 struct kvm_mmu_page *sp = NULL;
@@ -390,7 +542,8 @@ static u64 *FNAME(fetch)(struct kvm_vcpu *vcpu, gva_t addr,
390 542
391 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access, 543 mmu_set_spte(vcpu, it.sptep, access, gw->pte_access & access,
392 user_fault, write_fault, dirty, ptwrite, it.level, 544 user_fault, write_fault, dirty, ptwrite, it.level,
393 gw->gfn, pfn, false, true); 545 gw->gfn, pfn, prefault, map_writable);
546 FNAME(pte_prefetch)(vcpu, gw, it.sptep);
394 547
395 return it.sptep; 548 return it.sptep;
396 549
@@ -415,22 +568,22 @@ out_gpte_changed:
415 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or 568 * Returns: 1 if we need to emulate the instruction, 0 otherwise, or
416 * a negative value on error. 569 * a negative value on error.
417 */ 570 */
418static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, 571static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr, u32 error_code,
419 u32 error_code) 572 bool prefault)
420{ 573{
421 int write_fault = error_code & PFERR_WRITE_MASK; 574 int write_fault = error_code & PFERR_WRITE_MASK;
422 int user_fault = error_code & PFERR_USER_MASK; 575 int user_fault = error_code & PFERR_USER_MASK;
423 int fetch_fault = error_code & PFERR_FETCH_MASK;
424 struct guest_walker walker; 576 struct guest_walker walker;
425 u64 *sptep; 577 u64 *sptep;
426 int write_pt = 0; 578 int write_pt = 0;
427 int r; 579 int r;
428 pfn_t pfn; 580 pfn_t pfn;
429 int level = PT_PAGE_TABLE_LEVEL; 581 int level = PT_PAGE_TABLE_LEVEL;
582 int force_pt_level;
430 unsigned long mmu_seq; 583 unsigned long mmu_seq;
584 bool map_writable;
431 585
432 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code); 586 pgprintk("%s: addr %lx err %x\n", __func__, addr, error_code);
433 kvm_mmu_audit(vcpu, "pre page fault");
434 587
435 r = mmu_topup_memory_caches(vcpu); 588 r = mmu_topup_memory_caches(vcpu);
436 if (r) 589 if (r)
@@ -439,27 +592,36 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
439 /* 592 /*
440 * Look up the guest pte for the faulting address. 593 * Look up the guest pte for the faulting address.
441 */ 594 */
442 r = FNAME(walk_addr)(&walker, vcpu, addr, write_fault, user_fault, 595 r = FNAME(walk_addr)(&walker, vcpu, addr, error_code);
443 fetch_fault);
444 596
445 /* 597 /*
446 * The page is not mapped by the guest. Let the guest handle it. 598 * The page is not mapped by the guest. Let the guest handle it.
447 */ 599 */
448 if (!r) { 600 if (!r) {
449 pgprintk("%s: guest page fault\n", __func__); 601 pgprintk("%s: guest page fault\n", __func__);
450 inject_page_fault(vcpu, addr, walker.error_code); 602 if (!prefault) {
451 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 603 inject_page_fault(vcpu, &walker.fault);
604 /* reset fork detector */
605 vcpu->arch.last_pt_write_count = 0;
606 }
452 return 0; 607 return 0;
453 } 608 }
454 609
455 if (walker.level >= PT_DIRECTORY_LEVEL) { 610 if (walker.level >= PT_DIRECTORY_LEVEL)
611 force_pt_level = mapping_level_dirty_bitmap(vcpu, walker.gfn);
612 else
613 force_pt_level = 1;
614 if (!force_pt_level) {
456 level = min(walker.level, mapping_level(vcpu, walker.gfn)); 615 level = min(walker.level, mapping_level(vcpu, walker.gfn));
457 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1); 616 walker.gfn = walker.gfn & ~(KVM_PAGES_PER_HPAGE(level) - 1);
458 } 617 }
459 618
460 mmu_seq = vcpu->kvm->mmu_notifier_seq; 619 mmu_seq = vcpu->kvm->mmu_notifier_seq;
461 smp_rmb(); 620 smp_rmb();
462 pfn = gfn_to_pfn(vcpu->kvm, walker.gfn); 621
622 if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, write_fault,
623 &map_writable))
624 return 0;
463 625
464 /* mmio */ 626 /* mmio */
465 if (is_error_pfn(pfn)) 627 if (is_error_pfn(pfn))
@@ -468,9 +630,13 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
468 spin_lock(&vcpu->kvm->mmu_lock); 630 spin_lock(&vcpu->kvm->mmu_lock);
469 if (mmu_notifier_retry(vcpu, mmu_seq)) 631 if (mmu_notifier_retry(vcpu, mmu_seq))
470 goto out_unlock; 632 goto out_unlock;
633
634 trace_kvm_mmu_audit(vcpu, AUDIT_PRE_PAGE_FAULT);
471 kvm_mmu_free_some_pages(vcpu); 635 kvm_mmu_free_some_pages(vcpu);
636 if (!force_pt_level)
637 transparent_hugepage_adjust(vcpu, &walker.gfn, &pfn, &level);
472 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault, 638 sptep = FNAME(fetch)(vcpu, addr, &walker, user_fault, write_fault,
473 level, &write_pt, pfn); 639 level, &write_pt, pfn, map_writable, prefault);
474 (void)sptep; 640 (void)sptep;
475 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__, 641 pgprintk("%s: shadow pte %p %llx ptwrite %d\n", __func__,
476 sptep, *sptep, write_pt); 642 sptep, *sptep, write_pt);
@@ -479,7 +645,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gva_t addr,
479 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */ 645 vcpu->arch.last_pt_write_count = 0; /* reset fork detector */
480 646
481 ++vcpu->stat.pf_fixed; 647 ++vcpu->stat.pf_fixed;
482 kvm_mmu_audit(vcpu, "post page fault (fixed)"); 648 trace_kvm_mmu_audit(vcpu, AUDIT_POST_PAGE_FAULT);
483 spin_unlock(&vcpu->kvm->mmu_lock); 649 spin_unlock(&vcpu->kvm->mmu_lock);
484 650
485 return write_pt; 651 return write_pt;
@@ -550,22 +716,38 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
550} 716}
551 717
552static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access, 718static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
553 u32 *error) 719 struct x86_exception *exception)
720{
721 struct guest_walker walker;
722 gpa_t gpa = UNMAPPED_GVA;
723 int r;
724
725 r = FNAME(walk_addr)(&walker, vcpu, vaddr, access);
726
727 if (r) {
728 gpa = gfn_to_gpa(walker.gfn);
729 gpa |= vaddr & ~PAGE_MASK;
730 } else if (exception)
731 *exception = walker.fault;
732
733 return gpa;
734}
735
736static gpa_t FNAME(gva_to_gpa_nested)(struct kvm_vcpu *vcpu, gva_t vaddr,
737 u32 access,
738 struct x86_exception *exception)
554{ 739{
555 struct guest_walker walker; 740 struct guest_walker walker;
556 gpa_t gpa = UNMAPPED_GVA; 741 gpa_t gpa = UNMAPPED_GVA;
557 int r; 742 int r;
558 743
559 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 744 r = FNAME(walk_addr_nested)(&walker, vcpu, vaddr, access);
560 !!(access & PFERR_WRITE_MASK),
561 !!(access & PFERR_USER_MASK),
562 !!(access & PFERR_FETCH_MASK));
563 745
564 if (r) { 746 if (r) {
565 gpa = gfn_to_gpa(walker.gfn); 747 gpa = gfn_to_gpa(walker.gfn);
566 gpa |= vaddr & ~PAGE_MASK; 748 gpa |= vaddr & ~PAGE_MASK;
567 } else if (error) 749 } else if (exception)
568 *error = walker.error_code; 750 *exception = walker.fault;
569 751
570 return gpa; 752 return gpa;
571} 753}
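Both gva_to_gpa variants end the same way: the walker hands back a frame number, and the low 12 bits of the original virtual address are spliced back in to form the guest physical address. Worked through with concrete (made-up) numbers:

    #include <stdio.h>
    #include <stdint.h>

    #define PAGE_SHIFT 12
    #define PAGE_MASK  (~((uint64_t)(1 << PAGE_SHIFT) - 1))

    static uint64_t gfn_to_gpa(uint64_t gfn) { return gfn << PAGE_SHIFT; }

    int main(void)
    {
            uint64_t gfn   = 0x1234;        /* frame number from the walker */
            uint64_t vaddr = 0x7f00abc;     /* hypothetical guest virtual address */

            /* gpa = frame base | offset within the page, as in gva_to_gpa() */
            uint64_t gpa = gfn_to_gpa(gfn) | (vaddr & ~PAGE_MASK);

            printf("gpa = %#llx\n", (unsigned long long)gpa);       /* 0x1234abc */
            return 0;
    }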
@@ -604,12 +786,19 @@ static void FNAME(prefetch_page)(struct kvm_vcpu *vcpu,
604 * Using the cached information from sp->gfns is safe because: 786 * Using the cached information from sp->gfns is safe because:
605 * - The spte has a reference to the struct page, so the pfn for a given gfn 787 * - The spte has a reference to the struct page, so the pfn for a given gfn
606 * can't change unless all sptes pointing to it are nuked first. 788 * can't change unless all sptes pointing to it are nuked first.
789 *
790 * Note:
791 * We should flush all tlbs if spte is dropped even though guest is
792 * responsible for it. Since if we don't, kvm_mmu_notifier_invalidate_page
793 * and kvm_mmu_notifier_invalidate_range_start detect the mapping page isn't
794 * used by guest then tlbs are not flushed, so guest is allowed to access the
795 * freed pages.
796 * And we increase kvm->tlbs_dirty to delay tlbs flush in this case.
607 */ 797 */
608static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp, 798static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp)
609 bool clear_unsync)
610{ 799{
611 int i, offset, nr_present; 800 int i, offset, nr_present;
612 bool reset_host_protection; 801 bool host_writable;
613 gpa_t first_pte_gpa; 802 gpa_t first_pte_gpa;
614 803
615 offset = nr_present = 0; 804 offset = nr_present = 0;
@@ -638,31 +827,27 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
638 return -EINVAL; 827 return -EINVAL;
639 828
640 gfn = gpte_to_gfn(gpte); 829 gfn = gpte_to_gfn(gpte);
641 if (is_rsvd_bits_set(vcpu, gpte, PT_PAGE_TABLE_LEVEL)
642 || gfn != sp->gfns[i] || !is_present_gpte(gpte)
643 || !(gpte & PT_ACCESSED_MASK)) {
644 u64 nonpresent;
645 830
646 if (is_present_gpte(gpte) || !clear_unsync) 831 if (FNAME(prefetch_invalid_gpte)(vcpu, sp, &sp->spt[i], gpte)) {
647 nonpresent = shadow_trap_nonpresent_pte; 832 vcpu->kvm->tlbs_dirty++;
648 else 833 continue;
649 nonpresent = shadow_notrap_nonpresent_pte; 834 }
650 drop_spte(vcpu->kvm, &sp->spt[i], nonpresent); 835
836 if (gfn != sp->gfns[i]) {
837 drop_spte(vcpu->kvm, &sp->spt[i],
838 shadow_trap_nonpresent_pte);
839 vcpu->kvm->tlbs_dirty++;
651 continue; 840 continue;
652 } 841 }
653 842
654 nr_present++; 843 nr_present++;
655 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte); 844 pte_access = sp->role.access & FNAME(gpte_access)(vcpu, gpte);
656 if (!(sp->spt[i] & SPTE_HOST_WRITEABLE)) { 845 host_writable = sp->spt[i] & SPTE_HOST_WRITEABLE;
657 pte_access &= ~ACC_WRITE_MASK; 846
658 reset_host_protection = 0;
659 } else {
660 reset_host_protection = 1;
661 }
662 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0, 847 set_spte(vcpu, &sp->spt[i], pte_access, 0, 0,
663 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn, 848 is_dirty_gpte(gpte), PT_PAGE_TABLE_LEVEL, gfn,
664 spte_to_pfn(sp->spt[i]), true, false, 849 spte_to_pfn(sp->spt[i]), true, false,
665 reset_host_protection); 850 host_writable);
666 } 851 }
667 852
668 return !nr_present; 853 return !nr_present;
@@ -673,7 +858,6 @@ static int FNAME(sync_page)(struct kvm_vcpu *vcpu, struct kvm_mmu_page *sp,
673#undef FNAME 858#undef FNAME
674#undef PT_BASE_ADDR_MASK 859#undef PT_BASE_ADDR_MASK
675#undef PT_INDEX 860#undef PT_INDEX
676#undef PT_LEVEL_MASK
677#undef PT_LVL_ADDR_MASK 861#undef PT_LVL_ADDR_MASK
678#undef PT_LVL_OFFSET_MASK 862#undef PT_LVL_OFFSET_MASK
679#undef PT_LEVEL_BITS 863#undef PT_LEVEL_BITS
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 8a3f9f64f86f..506e4fe23adc 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -4,7 +4,7 @@
4 * AMD SVM support 4 * AMD SVM support
5 * 5 *
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright 2010 Red Hat, Inc. and/or its affilates. 7 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
8 * 8 *
9 * Authors: 9 * Authors:
10 * Yaniv Kamay <yaniv@qumranet.com> 10 * Yaniv Kamay <yaniv@qumranet.com>
@@ -31,6 +31,7 @@
31 31
32#include <asm/tlbflush.h> 32#include <asm/tlbflush.h>
33#include <asm/desc.h> 33#include <asm/desc.h>
34#include <asm/kvm_para.h>
34 35
35#include <asm/virtext.h> 36#include <asm/virtext.h>
36#include "trace.h" 37#include "trace.h"
@@ -50,6 +51,10 @@ MODULE_LICENSE("GPL");
50#define SVM_FEATURE_LBRV (1 << 1) 51#define SVM_FEATURE_LBRV (1 << 1)
51#define SVM_FEATURE_SVML (1 << 2) 52#define SVM_FEATURE_SVML (1 << 2)
52#define SVM_FEATURE_NRIP (1 << 3) 53#define SVM_FEATURE_NRIP (1 << 3)
54#define SVM_FEATURE_TSC_RATE (1 << 4)
55#define SVM_FEATURE_VMCB_CLEAN (1 << 5)
56#define SVM_FEATURE_FLUSH_ASID (1 << 6)
57#define SVM_FEATURE_DECODE_ASSIST (1 << 7)
53#define SVM_FEATURE_PAUSE_FILTER (1 << 10) 58#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
54 59
55#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 60#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
@@ -58,6 +63,10 @@ MODULE_LICENSE("GPL");
58 63
59#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 64#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
60 65
66#define TSC_RATIO_RSVD 0xffffff0000000000ULL
67#define TSC_RATIO_MIN 0x0000000000000001ULL
68#define TSC_RATIO_MAX 0x000000ffffffffffULL
69
61static bool erratum_383_found __read_mostly; 70static bool erratum_383_found __read_mostly;
62 71
63static const u32 host_save_user_msrs[] = { 72static const u32 host_save_user_msrs[] = {
@@ -89,13 +98,13 @@ struct nested_state {
89 bool exit_required; 98 bool exit_required;
90 99
91 /* cache for intercepts of the guest */ 100 /* cache for intercepts of the guest */
92 u16 intercept_cr_read; 101 u32 intercept_cr;
93 u16 intercept_cr_write; 102 u32 intercept_dr;
94 u16 intercept_dr_read;
95 u16 intercept_dr_write;
96 u32 intercept_exceptions; 103 u32 intercept_exceptions;
97 u64 intercept; 104 u64 intercept;
98 105
106 /* Nested Paging related state */
107 u64 nested_cr3;
99}; 108};
100 109
101#define MSRPM_OFFSETS 16 110#define MSRPM_OFFSETS 16
@@ -113,18 +122,31 @@ struct vcpu_svm {
113 u64 next_rip; 122 u64 next_rip;
114 123
115 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS]; 124 u64 host_user_msrs[NR_HOST_SAVE_USER_MSRS];
116 u64 host_gs_base; 125 struct {
126 u16 fs;
127 u16 gs;
128 u16 ldt;
129 u64 gs_base;
130 } host;
117 131
118 u32 *msrpm; 132 u32 *msrpm;
119 133
134 ulong nmi_iret_rip;
135
120 struct nested_state nested; 136 struct nested_state nested;
121 137
122 bool nmi_singlestep; 138 bool nmi_singlestep;
123 139
124 unsigned int3_injected; 140 unsigned int3_injected;
125 unsigned long int3_rip; 141 unsigned long int3_rip;
142 u32 apf_reason;
143
144 u64 tsc_ratio;
126}; 145};
127 146
147static DEFINE_PER_CPU(u64, current_tsc_ratio);
148#define TSC_RATIO_DEFAULT 0x0100000000ULL
149
128#define MSR_INVALID 0xffffffffU 150#define MSR_INVALID 0xffffffffU
129 151
130static struct svm_direct_access_msrs { 152static struct svm_direct_access_msrs {
@@ -169,15 +191,153 @@ static int nested_svm_intercept(struct vcpu_svm *svm);
169static int nested_svm_vmexit(struct vcpu_svm *svm); 191static int nested_svm_vmexit(struct vcpu_svm *svm);
170static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr, 192static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
171 bool has_error_code, u32 error_code); 193 bool has_error_code, u32 error_code);
194static u64 __scale_tsc(u64 ratio, u64 tsc);
195
196enum {
197 VMCB_INTERCEPTS, /* Intercept vectors, TSC offset,
198 pause filter count */
199 VMCB_PERM_MAP, /* IOPM Base and MSRPM Base */
200 VMCB_ASID, /* ASID */
201 VMCB_INTR, /* int_ctl, int_vector */
202 VMCB_NPT, /* npt_en, nCR3, gPAT */
203 VMCB_CR, /* CR0, CR3, CR4, EFER */
204 VMCB_DR, /* DR6, DR7 */
205 VMCB_DT, /* GDT, IDT */
206 VMCB_SEG, /* CS, DS, SS, ES, CPL */
207 VMCB_CR2, /* CR2 only */
208 VMCB_LBR, /* DBGCTL, BR_FROM, BR_TO, LAST_EX_FROM, LAST_EX_TO */
209 VMCB_DIRTY_MAX,
210};
211
212/* TPR and CR2 are always written before VMRUN */
213#define VMCB_ALWAYS_DIRTY_MASK ((1U << VMCB_INTR) | (1U << VMCB_CR2))
214
215static inline void mark_all_dirty(struct vmcb *vmcb)
216{
217 vmcb->control.clean = 0;
218}
219
220static inline void mark_all_clean(struct vmcb *vmcb)
221{
222 vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
223 & ~VMCB_ALWAYS_DIRTY_MASK;
224}
225
226static inline void mark_dirty(struct vmcb *vmcb, int bit)
227{
228 vmcb->control.clean &= ~(1 << bit);
229}
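
The clean field manipulated here implements the VMCB clean bits introduced by this patch: each bit asserts that the corresponding group of guest state has not changed since the last VMRUN, so hardware may skip reloading it. mark_all_clean() is meant to run after a successful VMRUN, and mark_dirty() whenever software touches a tracked field. A minimal user-space sketch of that bookkeeping (not kernel code; the enum values and the always-dirty mask are illustrative only):

#include <stdint.h>
#include <stdio.h>

/* Mirrors the dirty-bit groups from the patch (values illustrative). */
enum { VMCB_INTERCEPTS, VMCB_ASID, VMCB_CR, VMCB_DIRTY_MAX };

#define VMCB_ALWAYS_DIRTY_MASK 0u      /* none in this toy model */

struct vmcb { uint32_t clean; uint64_t cr0; };

static void mark_all_clean(struct vmcb *v)
{
        v->clean = ((1u << VMCB_DIRTY_MAX) - 1) & ~VMCB_ALWAYS_DIRTY_MASK;
}

static void mark_dirty(struct vmcb *v, int bit)
{
        v->clean &= ~(1u << bit);      /* force the CPU to reload this group */
}

int main(void)
{
        struct vmcb v = { 0 };

        mark_all_clean(&v);            /* done after a successful VMRUN    */
        v.cr0 |= 1;                    /* software touches a CR field ...  */
        mark_dirty(&v, VMCB_CR);       /* ... so its clean bit is dropped  */

        printf("clean = %#x (VMCB_CR bit cleared)\n", v.clean);
        return 0;
}
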
172 230
173static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu) 231static inline struct vcpu_svm *to_svm(struct kvm_vcpu *vcpu)
174{ 232{
175 return container_of(vcpu, struct vcpu_svm, vcpu); 233 return container_of(vcpu, struct vcpu_svm, vcpu);
176} 234}
177 235
178static inline bool is_nested(struct vcpu_svm *svm) 236static void recalc_intercepts(struct vcpu_svm *svm)
237{
238 struct vmcb_control_area *c, *h;
239 struct nested_state *g;
240
241 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
242
243 if (!is_guest_mode(&svm->vcpu))
244 return;
245
246 c = &svm->vmcb->control;
247 h = &svm->nested.hsave->control;
248 g = &svm->nested;
249
250 c->intercept_cr = h->intercept_cr | g->intercept_cr;
251 c->intercept_dr = h->intercept_dr | g->intercept_dr;
252 c->intercept_exceptions = h->intercept_exceptions | g->intercept_exceptions;
253 c->intercept = h->intercept | g->intercept;
254}
255
256static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
257{
258 if (is_guest_mode(&svm->vcpu))
259 return svm->nested.hsave;
260 else
261 return svm->vmcb;
262}
263
264static inline void set_cr_intercept(struct vcpu_svm *svm, int bit)
265{
266 struct vmcb *vmcb = get_host_vmcb(svm);
267
268 vmcb->control.intercept_cr |= (1U << bit);
269
270 recalc_intercepts(svm);
271}
272
273static inline void clr_cr_intercept(struct vcpu_svm *svm, int bit)
274{
275 struct vmcb *vmcb = get_host_vmcb(svm);
276
277 vmcb->control.intercept_cr &= ~(1U << bit);
278
279 recalc_intercepts(svm);
280}
281
282static inline bool is_cr_intercept(struct vcpu_svm *svm, int bit)
179{ 283{
180 return svm->nested.vmcb; 284 struct vmcb *vmcb = get_host_vmcb(svm);
285
286 return vmcb->control.intercept_cr & (1U << bit);
287}
288
289static inline void set_dr_intercept(struct vcpu_svm *svm, int bit)
290{
291 struct vmcb *vmcb = get_host_vmcb(svm);
292
293 vmcb->control.intercept_dr |= (1U << bit);
294
295 recalc_intercepts(svm);
296}
297
298static inline void clr_dr_intercept(struct vcpu_svm *svm, int bit)
299{
300 struct vmcb *vmcb = get_host_vmcb(svm);
301
302 vmcb->control.intercept_dr &= ~(1U << bit);
303
304 recalc_intercepts(svm);
305}
306
307static inline void set_exception_intercept(struct vcpu_svm *svm, int bit)
308{
309 struct vmcb *vmcb = get_host_vmcb(svm);
310
311 vmcb->control.intercept_exceptions |= (1U << bit);
312
313 recalc_intercepts(svm);
314}
315
316static inline void clr_exception_intercept(struct vcpu_svm *svm, int bit)
317{
318 struct vmcb *vmcb = get_host_vmcb(svm);
319
320 vmcb->control.intercept_exceptions &= ~(1U << bit);
321
322 recalc_intercepts(svm);
323}
324
325static inline void set_intercept(struct vcpu_svm *svm, int bit)
326{
327 struct vmcb *vmcb = get_host_vmcb(svm);
328
329 vmcb->control.intercept |= (1ULL << bit);
330
331 recalc_intercepts(svm);
332}
333
334static inline void clr_intercept(struct vcpu_svm *svm, int bit)
335{
336 struct vmcb *vmcb = get_host_vmcb(svm);
337
338 vmcb->control.intercept &= ~(1ULL << bit);
339
340 recalc_intercepts(svm);
181} 341}
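
Taken together, the helpers above route every intercept update to the "host" copy: get_host_vmcb() picks the real VMCB outside guest mode and the hsave copy while a nested guest runs, and recalc_intercepts() then ORs the cached L1 intercepts back on top, so the effective set is always a superset of what the host needs. A rough stand-alone model of that union (struct and variable names are mine, not KVM's):

#include <stdint.h>
#include <stdio.h>

struct control { uint32_t intercept_cr; uint64_t intercept; };

/* Host-requested intercepts, L1-requested intercepts, effective set. */
static struct control host, guest, effective;

static void recalc(void)
{
        /* The effective intercepts are the union of both levels, so a
         * nested guest can never mask an exit the host relies on. */
        effective.intercept_cr = host.intercept_cr | guest.intercept_cr;
        effective.intercept    = host.intercept    | guest.intercept;
}

int main(void)
{
        host.intercept_cr  = 1u << 0;  /* host wants CR0 read exits */
        guest.intercept_cr = 1u << 3;  /* L1 wants CR3 read exits   */
        recalc();
        printf("effective CR intercepts: %#x\n", effective.intercept_cr);
        return 0;
}
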
182 342
183static inline void enable_gif(struct vcpu_svm *svm) 343static inline void enable_gif(struct vcpu_svm *svm)
@@ -218,7 +378,6 @@ struct svm_cpu_data {
218}; 378};
219 379
220static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data); 380static DEFINE_PER_CPU(struct svm_cpu_data *, svm_data);
221static uint32_t svm_features;
222 381
223struct svm_init_data { 382struct svm_init_data {
224 int cpu; 383 int cpu;
@@ -254,11 +413,6 @@ static u32 svm_msrpm_offset(u32 msr)
254 413
255#define MAX_INST_SIZE 15 414#define MAX_INST_SIZE 15
256 415
257static inline u32 svm_has(u32 feat)
258{
259 return svm_features & feat;
260}
261
262static inline void clgi(void) 416static inline void clgi(void)
263{ 417{
264 asm volatile (__ex(SVM_CLGI)); 418 asm volatile (__ex(SVM_CLGI));
@@ -274,14 +428,13 @@ static inline void invlpga(unsigned long addr, u32 asid)
274 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid)); 428 asm volatile (__ex(SVM_INVLPGA) : : "a"(addr), "c"(asid));
275} 429}
276 430
277static inline void force_new_asid(struct kvm_vcpu *vcpu) 431static int get_npt_level(void)
278{
279 to_svm(vcpu)->asid_generation--;
280}
281
282static inline void flush_guest_tlb(struct kvm_vcpu *vcpu)
283{ 432{
284 force_new_asid(vcpu); 433#ifdef CONFIG_X86_64
434 return PT64_ROOT_LEVEL;
435#else
436 return PT32E_ROOT_LEVEL;
437#endif
285} 438}
286 439
287static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer) 440static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -291,6 +444,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
291 efer &= ~EFER_LME; 444 efer &= ~EFER_LME;
292 445
293 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 446 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
447 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
294} 448}
295 449
296static int is_external_interrupt(u32 info) 450static int is_external_interrupt(u32 info)
@@ -328,7 +482,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
328 svm->next_rip = svm->vmcb->control.next_rip; 482 svm->next_rip = svm->vmcb->control.next_rip;
329 483
330 if (!svm->next_rip) { 484 if (!svm->next_rip) {
331 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) != 485 if (emulate_instruction(vcpu, EMULTYPE_SKIP) !=
332 EMULATE_DONE) 486 EMULATE_DONE)
333 printk(KERN_DEBUG "%s: NOP\n", __func__); 487 printk(KERN_DEBUG "%s: NOP\n", __func__);
334 return; 488 return;
@@ -355,7 +509,7 @@ static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
355 nested_svm_check_exception(svm, nr, has_error_code, error_code)) 509 nested_svm_check_exception(svm, nr, has_error_code, error_code))
356 return; 510 return;
357 511
358 if (nr == BP_VECTOR && !svm_has(SVM_FEATURE_NRIP)) { 512 if (nr == BP_VECTOR && !static_cpu_has(X86_FEATURE_NRIPS)) {
359 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu); 513 unsigned long rip, old_rip = kvm_rip_read(&svm->vcpu);
360 514
361 /* 515 /*
@@ -416,6 +570,10 @@ static int has_svm(void)
416 570
417static void svm_hardware_disable(void *garbage) 571static void svm_hardware_disable(void *garbage)
418{ 572{
573 /* Make sure we clean up behind us */
574 if (static_cpu_has(X86_FEATURE_TSCRATEMSR))
575 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
576
419 cpu_svm_disable(); 577 cpu_svm_disable();
420} 578}
421 579
@@ -457,6 +615,11 @@ static int svm_hardware_enable(void *garbage)
457 615
458 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT); 616 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
459 617
618 if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
619 wrmsrl(MSR_AMD64_TSC_RATIO, TSC_RATIO_DEFAULT);
620 __get_cpu_var(current_tsc_ratio) = TSC_RATIO_DEFAULT;
621 }
622
460 svm_init_erratum_383(); 623 svm_init_erratum_383();
461 624
462 return 0; 625 return 0;
@@ -638,6 +801,23 @@ static __init int svm_hardware_setup(void)
638 if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) 801 if (boot_cpu_has(X86_FEATURE_FXSR_OPT))
639 kvm_enable_efer_bits(EFER_FFXSR); 802 kvm_enable_efer_bits(EFER_FFXSR);
640 803
804 if (boot_cpu_has(X86_FEATURE_TSCRATEMSR)) {
805 u64 max;
806
807 kvm_has_tsc_control = true;
808
809 /*
810 * Make sure the user can only configure tsc_khz values that
811 * fit into a signed integer.
 812 * A min value is not calculated because it will always
813 * be 1 on all machines and a value of 0 is used to disable
814 * tsc-scaling for the vcpu.
815 */
816 max = min(0x7fffffffULL, __scale_tsc(tsc_khz, TSC_RATIO_MAX));
817
818 kvm_max_guest_tsc_khz = max;
819 }
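
(The max above is simply "the largest guest frequency whose scaled host TSC still fits a signed 32-bit tsc_khz"; the __scale_tsc() sketch further down shows the arithmetic it relies on.)
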
820
641 if (nested) { 821 if (nested) {
642 printk(KERN_INFO "kvm: Nested Virtualization enabled\n"); 822 printk(KERN_INFO "kvm: Nested Virtualization enabled\n");
643 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); 823 kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE);
@@ -649,9 +829,7 @@ static __init int svm_hardware_setup(void)
649 goto err; 829 goto err;
650 } 830 }
651 831
652 svm_features = cpuid_edx(SVM_CPUID_FUNC); 832 if (!boot_cpu_has(X86_FEATURE_NPT))
653
654 if (!svm_has(SVM_FEATURE_NPT))
655 npt_enabled = false; 833 npt_enabled = false;
656 834
657 if (npt_enabled && !npt) { 835 if (npt_enabled && !npt) {
@@ -701,68 +879,161 @@ static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
701 seg->base = 0; 879 seg->base = 0;
702} 880}
703 881
882static u64 __scale_tsc(u64 ratio, u64 tsc)
883{
884 u64 mult, frac, _tsc;
885
886 mult = ratio >> 32;
887 frac = ratio & ((1ULL << 32) - 1);
888
889 _tsc = tsc;
890 _tsc *= mult;
891 _tsc += (tsc >> 32) * frac;
892 _tsc += ((tsc & ((1ULL << 32) - 1)) * frac) >> 32;
893
894 return _tsc;
895}
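
__scale_tsc() computes tsc * ratio / 2^32 with 64-bit arithmetic only, by splitting the 32.32 fixed-point ratio into its integer and fractional halves and summing the partial products. The stand-alone check below (my own code, using the compiler's __int128 extension purely as a reference) shows the decomposition matches the full-width product:

#include <stdint.h>
#include <stdio.h>

/* tsc * ratio >> 32, using 64-bit operations only (as in the patch) */
static uint64_t scale_tsc(uint64_t ratio, uint64_t tsc)
{
        uint64_t mult = ratio >> 32;            /* integer part    */
        uint64_t frac = ratio & 0xffffffffull;  /* fractional part */
        uint64_t res;

        res  = tsc * mult;
        res += (tsc >> 32) * frac;
        res += ((tsc & 0xffffffffull) * frac) >> 32;
        return res;
}

int main(void)
{
        uint64_t ratio = 0x0080000000ull;       /* 0.5 in 32.32 fixed point */
        uint64_t tsc   = 123456789012345ull;

        unsigned __int128 ref = ((unsigned __int128)tsc * ratio) >> 32;

        printf("scaled = %llu, reference = %llu\n",
               (unsigned long long)scale_tsc(ratio, tsc),
               (unsigned long long)ref);
        return 0;
}
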
896
897static u64 svm_scale_tsc(struct kvm_vcpu *vcpu, u64 tsc)
898{
899 struct vcpu_svm *svm = to_svm(vcpu);
900 u64 _tsc = tsc;
901
902 if (svm->tsc_ratio != TSC_RATIO_DEFAULT)
903 _tsc = __scale_tsc(svm->tsc_ratio, tsc);
904
905 return _tsc;
906}
907
908static void svm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
909{
910 struct vcpu_svm *svm = to_svm(vcpu);
911 u64 ratio;
912 u64 khz;
913
914 /* TSC scaling supported? */
915 if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR))
916 return;
917
918 /* TSC-Scaling disabled or guest TSC same frequency as host TSC? */
919 if (user_tsc_khz == 0) {
920 vcpu->arch.virtual_tsc_khz = 0;
921 svm->tsc_ratio = TSC_RATIO_DEFAULT;
922 return;
923 }
924
925 khz = user_tsc_khz;
926
927 /* TSC scaling required - calculate ratio */
928 ratio = khz << 32;
929 do_div(ratio, tsc_khz);
930
931 if (ratio == 0 || ratio & TSC_RATIO_RSVD) {
932 WARN_ONCE(1, "Invalid TSC ratio - virtual-tsc-khz=%u\n",
933 user_tsc_khz);
934 return;
935 }
936 vcpu->arch.virtual_tsc_khz = user_tsc_khz;
937 svm->tsc_ratio = ratio;
938}
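
svm_set_tsc_khz() derives that ratio as guest_khz * 2^32 / host_khz and refuses values whose integer part would reach the reserved bits of MSR_AMD64_TSC_RATIO (TSC_RATIO_RSVD above). A worked example with made-up frequencies: a 3 GHz host and a 1.5 GHz guest give a ratio of 0x80000000, i.e. 0.5.

#include <stdint.h>
#include <stdio.h>

#define TSC_RATIO_RSVD 0xffffff0000000000ull    /* bits 63:40 must be zero */

int main(void)
{
        uint64_t host_khz  = 3000000;           /* 3.0 GHz host TSC (example)  */
        uint64_t guest_khz = 1500000;           /* 1.5 GHz guest TSC (example) */

        /* 32.32 fixed point: guest_khz / host_khz scaled by 2^32 */
        uint64_t ratio = (guest_khz << 32) / host_khz;

        if (ratio == 0 || (ratio & TSC_RATIO_RSVD))
                printf("ratio rejected\n");
        else
                printf("ratio = %#llx (integer part %llu, fraction %#llx)\n",
                       (unsigned long long)ratio,
                       (unsigned long long)(ratio >> 32),
                       (unsigned long long)(ratio & 0xffffffffull));
        return 0;
}
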
939
940static void svm_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
941{
942 struct vcpu_svm *svm = to_svm(vcpu);
943 u64 g_tsc_offset = 0;
944
945 if (is_guest_mode(vcpu)) {
946 g_tsc_offset = svm->vmcb->control.tsc_offset -
947 svm->nested.hsave->control.tsc_offset;
948 svm->nested.hsave->control.tsc_offset = offset;
949 }
950
951 svm->vmcb->control.tsc_offset = offset + g_tsc_offset;
952
953 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
954}
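
svm_write_tsc_offset() has to preserve whatever extra offset an L1 hypervisor stacked on top of the host's: while an L2 guest runs, the active VMCB carries host offset plus L1 offset, and the pure host part lives in hsave. A small stand-alone model of that bookkeeping (variable names are mine, not KVM's):

#include <stdint.h>
#include <stdio.h>
#include <stdbool.h>

static int64_t vmcb_offset;    /* offset currently programmed for the CPU */
static int64_t hsave_offset;   /* host (L0) part, saved while L2 runs     */
static bool guest_mode;        /* an L1 guest's guest (L2) is running     */

static void write_tsc_offset(int64_t new_host_offset)
{
        int64_t l1_part = 0;

        if (guest_mode) {
                /* keep whatever extra offset L1 added on top of L0 */
                l1_part = vmcb_offset - hsave_offset;
                hsave_offset = new_host_offset;
        }
        vmcb_offset = new_host_offset + l1_part;
}

int main(void)
{
        /* L0 programs an offset, then L1 adds its own while entering L2 */
        vmcb_offset = hsave_offset = -1000;
        guest_mode = true;
        vmcb_offset += 250;            /* L1's contribution */

        write_tsc_offset(-2000);       /* L0 moves its offset */
        printf("vmcb=%lld hsave=%lld (L1 delta preserved: %lld)\n",
               (long long)vmcb_offset, (long long)hsave_offset,
               (long long)(vmcb_offset - hsave_offset));
        return 0;
}
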
955
956static void svm_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
957{
958 struct vcpu_svm *svm = to_svm(vcpu);
959
960 svm->vmcb->control.tsc_offset += adjustment;
961 if (is_guest_mode(vcpu))
962 svm->nested.hsave->control.tsc_offset += adjustment;
963 mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
964}
965
966static u64 svm_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
967{
968 u64 tsc;
969
970 tsc = svm_scale_tsc(vcpu, native_read_tsc());
971
972 return target_tsc - tsc;
973}
974
704static void init_vmcb(struct vcpu_svm *svm) 975static void init_vmcb(struct vcpu_svm *svm)
705{ 976{
706 struct vmcb_control_area *control = &svm->vmcb->control; 977 struct vmcb_control_area *control = &svm->vmcb->control;
707 struct vmcb_save_area *save = &svm->vmcb->save; 978 struct vmcb_save_area *save = &svm->vmcb->save;
708 979
709 svm->vcpu.fpu_active = 1; 980 svm->vcpu.fpu_active = 1;
981 svm->vcpu.arch.hflags = 0;
710 982
711 control->intercept_cr_read = INTERCEPT_CR0_MASK | 983 set_cr_intercept(svm, INTERCEPT_CR0_READ);
712 INTERCEPT_CR3_MASK | 984 set_cr_intercept(svm, INTERCEPT_CR3_READ);
713 INTERCEPT_CR4_MASK; 985 set_cr_intercept(svm, INTERCEPT_CR4_READ);
714 986 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
715 control->intercept_cr_write = INTERCEPT_CR0_MASK | 987 set_cr_intercept(svm, INTERCEPT_CR3_WRITE);
716 INTERCEPT_CR3_MASK | 988 set_cr_intercept(svm, INTERCEPT_CR4_WRITE);
717 INTERCEPT_CR4_MASK | 989 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
718 INTERCEPT_CR8_MASK; 990
719 991 set_dr_intercept(svm, INTERCEPT_DR0_READ);
720 control->intercept_dr_read = INTERCEPT_DR0_MASK | 992 set_dr_intercept(svm, INTERCEPT_DR1_READ);
721 INTERCEPT_DR1_MASK | 993 set_dr_intercept(svm, INTERCEPT_DR2_READ);
722 INTERCEPT_DR2_MASK | 994 set_dr_intercept(svm, INTERCEPT_DR3_READ);
723 INTERCEPT_DR3_MASK | 995 set_dr_intercept(svm, INTERCEPT_DR4_READ);
724 INTERCEPT_DR4_MASK | 996 set_dr_intercept(svm, INTERCEPT_DR5_READ);
725 INTERCEPT_DR5_MASK | 997 set_dr_intercept(svm, INTERCEPT_DR6_READ);
726 INTERCEPT_DR6_MASK | 998 set_dr_intercept(svm, INTERCEPT_DR7_READ);
727 INTERCEPT_DR7_MASK; 999
728 1000 set_dr_intercept(svm, INTERCEPT_DR0_WRITE);
729 control->intercept_dr_write = INTERCEPT_DR0_MASK | 1001 set_dr_intercept(svm, INTERCEPT_DR1_WRITE);
730 INTERCEPT_DR1_MASK | 1002 set_dr_intercept(svm, INTERCEPT_DR2_WRITE);
731 INTERCEPT_DR2_MASK | 1003 set_dr_intercept(svm, INTERCEPT_DR3_WRITE);
732 INTERCEPT_DR3_MASK | 1004 set_dr_intercept(svm, INTERCEPT_DR4_WRITE);
733 INTERCEPT_DR4_MASK | 1005 set_dr_intercept(svm, INTERCEPT_DR5_WRITE);
734 INTERCEPT_DR5_MASK | 1006 set_dr_intercept(svm, INTERCEPT_DR6_WRITE);
735 INTERCEPT_DR6_MASK | 1007 set_dr_intercept(svm, INTERCEPT_DR7_WRITE);
736 INTERCEPT_DR7_MASK; 1008
737 1009 set_exception_intercept(svm, PF_VECTOR);
738 control->intercept_exceptions = (1 << PF_VECTOR) | 1010 set_exception_intercept(svm, UD_VECTOR);
739 (1 << UD_VECTOR) | 1011 set_exception_intercept(svm, MC_VECTOR);
740 (1 << MC_VECTOR); 1012
741 1013 set_intercept(svm, INTERCEPT_INTR);
742 1014 set_intercept(svm, INTERCEPT_NMI);
743 control->intercept = (1ULL << INTERCEPT_INTR) | 1015 set_intercept(svm, INTERCEPT_SMI);
744 (1ULL << INTERCEPT_NMI) | 1016 set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
745 (1ULL << INTERCEPT_SMI) | 1017 set_intercept(svm, INTERCEPT_CPUID);
746 (1ULL << INTERCEPT_SELECTIVE_CR0) | 1018 set_intercept(svm, INTERCEPT_INVD);
747 (1ULL << INTERCEPT_CPUID) | 1019 set_intercept(svm, INTERCEPT_HLT);
748 (1ULL << INTERCEPT_INVD) | 1020 set_intercept(svm, INTERCEPT_INVLPG);
749 (1ULL << INTERCEPT_HLT) | 1021 set_intercept(svm, INTERCEPT_INVLPGA);
750 (1ULL << INTERCEPT_INVLPG) | 1022 set_intercept(svm, INTERCEPT_IOIO_PROT);
751 (1ULL << INTERCEPT_INVLPGA) | 1023 set_intercept(svm, INTERCEPT_MSR_PROT);
752 (1ULL << INTERCEPT_IOIO_PROT) | 1024 set_intercept(svm, INTERCEPT_TASK_SWITCH);
753 (1ULL << INTERCEPT_MSR_PROT) | 1025 set_intercept(svm, INTERCEPT_SHUTDOWN);
754 (1ULL << INTERCEPT_TASK_SWITCH) | 1026 set_intercept(svm, INTERCEPT_VMRUN);
755 (1ULL << INTERCEPT_SHUTDOWN) | 1027 set_intercept(svm, INTERCEPT_VMMCALL);
756 (1ULL << INTERCEPT_VMRUN) | 1028 set_intercept(svm, INTERCEPT_VMLOAD);
757 (1ULL << INTERCEPT_VMMCALL) | 1029 set_intercept(svm, INTERCEPT_VMSAVE);
758 (1ULL << INTERCEPT_VMLOAD) | 1030 set_intercept(svm, INTERCEPT_STGI);
759 (1ULL << INTERCEPT_VMSAVE) | 1031 set_intercept(svm, INTERCEPT_CLGI);
760 (1ULL << INTERCEPT_STGI) | 1032 set_intercept(svm, INTERCEPT_SKINIT);
761 (1ULL << INTERCEPT_CLGI) | 1033 set_intercept(svm, INTERCEPT_WBINVD);
762 (1ULL << INTERCEPT_SKINIT) | 1034 set_intercept(svm, INTERCEPT_MONITOR);
763 (1ULL << INTERCEPT_WBINVD) | 1035 set_intercept(svm, INTERCEPT_MWAIT);
764 (1ULL << INTERCEPT_MONITOR) | 1036 set_intercept(svm, INTERCEPT_XSETBV);
765 (1ULL << INTERCEPT_MWAIT);
766 1037
767 control->iopm_base_pa = iopm_base; 1038 control->iopm_base_pa = iopm_base;
768 control->msrpm_base_pa = __pa(svm->msrpm); 1039 control->msrpm_base_pa = __pa(svm->msrpm);
@@ -793,10 +1064,10 @@ static void init_vmcb(struct vcpu_svm *svm)
793 init_sys_seg(&save->ldtr, SEG_TYPE_LDT); 1064 init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
794 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); 1065 init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);
795 1066
796 save->efer = EFER_SVME; 1067 svm_set_efer(&svm->vcpu, 0);
797 save->dr6 = 0xffff0ff0; 1068 save->dr6 = 0xffff0ff0;
798 save->dr7 = 0x400; 1069 save->dr7 = 0x400;
799 save->rflags = 2; 1070 kvm_set_rflags(&svm->vcpu, 2);
800 save->rip = 0x0000fff0; 1071 save->rip = 0x0000fff0;
801 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 1072 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
802 1073
@@ -804,8 +1075,8 @@ static void init_vmcb(struct vcpu_svm *svm)
804 * This is the guest-visible cr0 value. 1075 * This is the guest-visible cr0 value.
805 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. 1076 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
806 */ 1077 */
807 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; 1078 svm->vcpu.arch.cr0 = 0;
808 (void)kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0); 1079 (void)kvm_set_cr0(&svm->vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET);
809 1080
810 save->cr4 = X86_CR4_PAE; 1081 save->cr4 = X86_CR4_PAE;
811 /* rdx = ?? */ 1082 /* rdx = ?? */
@@ -813,25 +1084,27 @@ static void init_vmcb(struct vcpu_svm *svm)
813 if (npt_enabled) { 1084 if (npt_enabled) {
814 /* Setup VMCB for Nested Paging */ 1085 /* Setup VMCB for Nested Paging */
815 control->nested_ctl = 1; 1086 control->nested_ctl = 1;
816 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 1087 clr_intercept(svm, INTERCEPT_TASK_SWITCH);
817 (1ULL << INTERCEPT_INVLPG)); 1088 clr_intercept(svm, INTERCEPT_INVLPG);
818 control->intercept_exceptions &= ~(1 << PF_VECTOR); 1089 clr_exception_intercept(svm, PF_VECTOR);
819 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK; 1090 clr_cr_intercept(svm, INTERCEPT_CR3_READ);
820 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK; 1091 clr_cr_intercept(svm, INTERCEPT_CR3_WRITE);
821 save->g_pat = 0x0007040600070406ULL; 1092 save->g_pat = 0x0007040600070406ULL;
822 save->cr3 = 0; 1093 save->cr3 = 0;
823 save->cr4 = 0; 1094 save->cr4 = 0;
824 } 1095 }
825 force_new_asid(&svm->vcpu); 1096 svm->asid_generation = 0;
826 1097
827 svm->nested.vmcb = 0; 1098 svm->nested.vmcb = 0;
828 svm->vcpu.arch.hflags = 0; 1099 svm->vcpu.arch.hflags = 0;
829 1100
830 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) { 1101 if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) {
831 control->pause_filter_count = 3000; 1102 control->pause_filter_count = 3000;
832 control->intercept |= (1ULL << INTERCEPT_PAUSE); 1103 set_intercept(svm, INTERCEPT_PAUSE);
833 } 1104 }
834 1105
1106 mark_all_dirty(svm->vmcb);
1107
835 enable_gif(svm); 1108 enable_gif(svm);
836} 1109}
837 1110
@@ -867,6 +1140,8 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
867 goto out; 1140 goto out;
868 } 1141 }
869 1142
1143 svm->tsc_ratio = TSC_RATIO_DEFAULT;
1144
870 err = kvm_vcpu_init(&svm->vcpu, kvm, id); 1145 err = kvm_vcpu_init(&svm->vcpu, kvm, id);
871 if (err) 1146 if (err)
872 goto free_svm; 1147 goto free_svm;
@@ -901,7 +1176,7 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
901 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT; 1176 svm->vmcb_pa = page_to_pfn(page) << PAGE_SHIFT;
902 svm->asid_generation = 0; 1177 svm->asid_generation = 0;
903 init_vmcb(svm); 1178 init_vmcb(svm);
904 svm->vmcb->control.tsc_offset = 0-native_read_tsc(); 1179 kvm_write_tsc(&svm->vcpu, 0);
905 1180
906 err = fx_init(&svm->vcpu); 1181 err = fx_init(&svm->vcpu);
907 if (err) 1182 if (err)
@@ -947,25 +1222,25 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
947 int i; 1222 int i;
948 1223
949 if (unlikely(cpu != vcpu->cpu)) { 1224 if (unlikely(cpu != vcpu->cpu)) {
950 u64 delta;
951
952 if (check_tsc_unstable()) {
953 /*
954 * Make sure that the guest sees a monotonically
955 * increasing TSC.
956 */
957 delta = vcpu->arch.host_tsc - native_read_tsc();
958 svm->vmcb->control.tsc_offset += delta;
959 if (is_nested(svm))
960 svm->nested.hsave->control.tsc_offset += delta;
961 }
962 vcpu->cpu = cpu;
963 kvm_migrate_timers(vcpu);
964 svm->asid_generation = 0; 1225 svm->asid_generation = 0;
1226 mark_all_dirty(svm->vmcb);
965 } 1227 }
966 1228
1229#ifdef CONFIG_X86_64
1230 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host.gs_base);
1231#endif
1232 savesegment(fs, svm->host.fs);
1233 savesegment(gs, svm->host.gs);
1234 svm->host.ldt = kvm_read_ldt();
1235
967 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1236 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
968 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1237 rdmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
1238
1239 if (static_cpu_has(X86_FEATURE_TSCRATEMSR) &&
1240 svm->tsc_ratio != __get_cpu_var(current_tsc_ratio)) {
1241 __get_cpu_var(current_tsc_ratio) = svm->tsc_ratio;
1242 wrmsrl(MSR_AMD64_TSC_RATIO, svm->tsc_ratio);
1243 }
969} 1244}
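
The new tail of svm_vcpu_load() avoids redundant WRMSRs: the ratio currently programmed on each physical CPU is cached in a per-CPU variable, and MSR_AMD64_TSC_RATIO is rewritten only when the incoming vCPU wants something different. A toy model of that write-avoidance (an array stands in for the per-CPU variable; the MSR write is a stub):

#include <stdio.h>
#include <stdint.h>

#define NR_CPUS 4
#define TSC_RATIO_DEFAULT 0x0100000000ull       /* 1.0 in 32.32 fixed point */

static uint64_t current_tsc_ratio[NR_CPUS];     /* per-CPU cache of the MSR */

static void wrmsr_tsc_ratio(uint64_t v)
{
        printf("WRMSR TSC_RATIO <- %#llx\n", (unsigned long long)v);
}

static void vcpu_load(int cpu, uint64_t vcpu_ratio)
{
        if (vcpu_ratio != current_tsc_ratio[cpu]) {  /* only touch the MSR on change */
                current_tsc_ratio[cpu] = vcpu_ratio;
                wrmsr_tsc_ratio(vcpu_ratio);
        }
}

int main(void)
{
        current_tsc_ratio[0] = TSC_RATIO_DEFAULT;

        vcpu_load(0, TSC_RATIO_DEFAULT);    /* same ratio: no WRMSR     */
        vcpu_load(0, 0x0080000000ull);      /* scaled guest: one WRMSR  */
        vcpu_load(0, 0x0080000000ull);      /* loaded again: no WRMSR   */
        return 0;
}
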
970 1245
971static void svm_vcpu_put(struct kvm_vcpu *vcpu) 1246static void svm_vcpu_put(struct kvm_vcpu *vcpu)
@@ -974,10 +1249,18 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
974 int i; 1249 int i;
975 1250
976 ++vcpu->stat.host_state_reload; 1251 ++vcpu->stat.host_state_reload;
1252 kvm_load_ldt(svm->host.ldt);
1253#ifdef CONFIG_X86_64
1254 loadsegment(fs, svm->host.fs);
1255 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
1256 load_gs_index(svm->host.gs);
1257#else
1258#ifdef CONFIG_X86_32_LAZY_GS
1259 loadsegment(gs, svm->host.gs);
1260#endif
1261#endif
977 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 1262 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
978 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 1263 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
979
980 vcpu->arch.host_tsc = native_read_tsc();
981} 1264}
982 1265
983static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 1266static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -995,7 +1278,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
995 switch (reg) { 1278 switch (reg) {
996 case VCPU_EXREG_PDPTR: 1279 case VCPU_EXREG_PDPTR:
997 BUG_ON(!npt_enabled); 1280 BUG_ON(!npt_enabled);
998 load_pdptrs(vcpu, vcpu->arch.cr3); 1281 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
999 break; 1282 break;
1000 default: 1283 default:
1001 BUG(); 1284 BUG();
@@ -1004,12 +1287,12 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1004 1287
1005static void svm_set_vintr(struct vcpu_svm *svm) 1288static void svm_set_vintr(struct vcpu_svm *svm)
1006{ 1289{
1007 svm->vmcb->control.intercept |= 1ULL << INTERCEPT_VINTR; 1290 set_intercept(svm, INTERCEPT_VINTR);
1008} 1291}
1009 1292
1010static void svm_clear_vintr(struct vcpu_svm *svm) 1293static void svm_clear_vintr(struct vcpu_svm *svm)
1011{ 1294{
1012 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VINTR); 1295 clr_intercept(svm, INTERCEPT_VINTR);
1013} 1296}
1014 1297
1015static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) 1298static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg)
@@ -1124,6 +1407,7 @@ static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1124 1407
1125 svm->vmcb->save.idtr.limit = dt->size; 1408 svm->vmcb->save.idtr.limit = dt->size;
1126 svm->vmcb->save.idtr.base = dt->address ; 1409 svm->vmcb->save.idtr.base = dt->address ;
1410 mark_dirty(svm->vmcb, VMCB_DT);
1127} 1411}
1128 1412
1129static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) 1413static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
@@ -1140,19 +1424,23 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt)
1140 1424
1141 svm->vmcb->save.gdtr.limit = dt->size; 1425 svm->vmcb->save.gdtr.limit = dt->size;
1142 svm->vmcb->save.gdtr.base = dt->address ; 1426 svm->vmcb->save.gdtr.base = dt->address ;
1427 mark_dirty(svm->vmcb, VMCB_DT);
1143} 1428}
1144 1429
1145static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu) 1430static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1146{ 1431{
1147} 1432}
1148 1433
1434static void svm_decache_cr3(struct kvm_vcpu *vcpu)
1435{
1436}
1437
1149static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1438static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1150{ 1439{
1151} 1440}
1152 1441
1153static void update_cr0_intercept(struct vcpu_svm *svm) 1442static void update_cr0_intercept(struct vcpu_svm *svm)
1154{ 1443{
1155 struct vmcb *vmcb = svm->vmcb;
1156 ulong gcr0 = svm->vcpu.arch.cr0; 1444 ulong gcr0 = svm->vcpu.arch.cr0;
1157 u64 *hcr0 = &svm->vmcb->save.cr0; 1445 u64 *hcr0 = &svm->vmcb->save.cr0;
1158 1446
@@ -1162,27 +1450,14 @@ static void update_cr0_intercept(struct vcpu_svm *svm)
1162 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK) 1450 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
1163 | (gcr0 & SVM_CR0_SELECTIVE_MASK); 1451 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
1164 1452
1453 mark_dirty(svm->vmcb, VMCB_CR);
1165 1454
1166 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) { 1455 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
1167 vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK; 1456 clr_cr_intercept(svm, INTERCEPT_CR0_READ);
1168 vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK; 1457 clr_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1169 if (is_nested(svm)) {
1170 struct vmcb *hsave = svm->nested.hsave;
1171
1172 hsave->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
1173 hsave->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
1174 vmcb->control.intercept_cr_read |= svm->nested.intercept_cr_read;
1175 vmcb->control.intercept_cr_write |= svm->nested.intercept_cr_write;
1176 }
1177 } else { 1458 } else {
1178 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK; 1459 set_cr_intercept(svm, INTERCEPT_CR0_READ);
1179 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK; 1460 set_cr_intercept(svm, INTERCEPT_CR0_WRITE);
1180 if (is_nested(svm)) {
1181 struct vmcb *hsave = svm->nested.hsave;
1182
1183 hsave->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
1184 hsave->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
1185 }
1186 } 1461 }
1187} 1462}
1188 1463
@@ -1190,27 +1465,6 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1190{ 1465{
1191 struct vcpu_svm *svm = to_svm(vcpu); 1466 struct vcpu_svm *svm = to_svm(vcpu);
1192 1467
1193 if (is_nested(svm)) {
1194 /*
1195 * We are here because we run in nested mode, the host kvm
1196 * intercepts cr0 writes but the l1 hypervisor does not.
1197 * But the L1 hypervisor may intercept selective cr0 writes.
1198 * This needs to be checked here.
1199 */
1200 unsigned long old, new;
1201
1202 /* Remove bits that would trigger a real cr0 write intercept */
1203 old = vcpu->arch.cr0 & SVM_CR0_SELECTIVE_MASK;
1204 new = cr0 & SVM_CR0_SELECTIVE_MASK;
1205
1206 if (old == new) {
1207 /* cr0 write with ts and mp unchanged */
1208 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
1209 if (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE)
1210 return;
1211 }
1212 }
1213
1214#ifdef CONFIG_X86_64 1468#ifdef CONFIG_X86_64
1215 if (vcpu->arch.efer & EFER_LME) { 1469 if (vcpu->arch.efer & EFER_LME) {
1216 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1470 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
@@ -1238,6 +1492,7 @@ static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1238 */ 1492 */
1239 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1493 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
1240 svm->vmcb->save.cr0 = cr0; 1494 svm->vmcb->save.cr0 = cr0;
1495 mark_dirty(svm->vmcb, VMCB_CR);
1241 update_cr0_intercept(svm); 1496 update_cr0_intercept(svm);
1242} 1497}
1243 1498
@@ -1247,13 +1502,14 @@ static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1247 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4; 1502 unsigned long old_cr4 = to_svm(vcpu)->vmcb->save.cr4;
1248 1503
1249 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) 1504 if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE))
1250 force_new_asid(vcpu); 1505 svm_flush_tlb(vcpu);
1251 1506
1252 vcpu->arch.cr4 = cr4; 1507 vcpu->arch.cr4 = cr4;
1253 if (!npt_enabled) 1508 if (!npt_enabled)
1254 cr4 |= X86_CR4_PAE; 1509 cr4 |= X86_CR4_PAE;
1255 cr4 |= host_cr4_mce; 1510 cr4 |= host_cr4_mce;
1256 to_svm(vcpu)->vmcb->save.cr4 = cr4; 1511 to_svm(vcpu)->vmcb->save.cr4 = cr4;
1512 mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1257} 1513}
1258 1514
1259static void svm_set_segment(struct kvm_vcpu *vcpu, 1515static void svm_set_segment(struct kvm_vcpu *vcpu,
@@ -1282,26 +1538,25 @@ static void svm_set_segment(struct kvm_vcpu *vcpu,
1282 = (svm->vmcb->save.cs.attrib 1538 = (svm->vmcb->save.cs.attrib
1283 >> SVM_SELECTOR_DPL_SHIFT) & 3; 1539 >> SVM_SELECTOR_DPL_SHIFT) & 3;
1284 1540
1541 mark_dirty(svm->vmcb, VMCB_SEG);
1285} 1542}
1286 1543
1287static void update_db_intercept(struct kvm_vcpu *vcpu) 1544static void update_db_intercept(struct kvm_vcpu *vcpu)
1288{ 1545{
1289 struct vcpu_svm *svm = to_svm(vcpu); 1546 struct vcpu_svm *svm = to_svm(vcpu);
1290 1547
1291 svm->vmcb->control.intercept_exceptions &= 1548 clr_exception_intercept(svm, DB_VECTOR);
1292 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1549 clr_exception_intercept(svm, BP_VECTOR);
1293 1550
1294 if (svm->nmi_singlestep) 1551 if (svm->nmi_singlestep)
1295 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1552 set_exception_intercept(svm, DB_VECTOR);
1296 1553
1297 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1554 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
1298 if (vcpu->guest_debug & 1555 if (vcpu->guest_debug &
1299 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) 1556 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP))
1300 svm->vmcb->control.intercept_exceptions |= 1557 set_exception_intercept(svm, DB_VECTOR);
1301 1 << DB_VECTOR;
1302 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 1558 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
1303 svm->vmcb->control.intercept_exceptions |= 1559 set_exception_intercept(svm, BP_VECTOR);
1304 1 << BP_VECTOR;
1305 } else 1560 } else
1306 vcpu->guest_debug = 0; 1561 vcpu->guest_debug = 0;
1307} 1562}
@@ -1315,21 +1570,9 @@ static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1315 else 1570 else
1316 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1571 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1317 1572
1318 update_db_intercept(vcpu); 1573 mark_dirty(svm->vmcb, VMCB_DR);
1319}
1320
1321static void load_host_msrs(struct kvm_vcpu *vcpu)
1322{
1323#ifdef CONFIG_X86_64
1324 wrmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1325#endif
1326}
1327 1574
1328static void save_host_msrs(struct kvm_vcpu *vcpu) 1575 update_db_intercept(vcpu);
1329{
1330#ifdef CONFIG_X86_64
1331 rdmsrl(MSR_GS_BASE, to_svm(vcpu)->host_gs_base);
1332#endif
1333} 1576}
1334 1577
1335static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) 1578static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
@@ -1342,6 +1585,8 @@ static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1342 1585
1343 svm->asid_generation = sd->asid_generation; 1586 svm->asid_generation = sd->asid_generation;
1344 svm->vmcb->control.asid = sd->next_asid++; 1587 svm->vmcb->control.asid = sd->next_asid++;
1588
1589 mark_dirty(svm->vmcb, VMCB_ASID);
1345} 1590}
1346 1591
1347static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) 1592static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
@@ -1349,20 +1594,40 @@ static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value)
1349 struct vcpu_svm *svm = to_svm(vcpu); 1594 struct vcpu_svm *svm = to_svm(vcpu);
1350 1595
1351 svm->vmcb->save.dr7 = value; 1596 svm->vmcb->save.dr7 = value;
1597 mark_dirty(svm->vmcb, VMCB_DR);
1352} 1598}
1353 1599
1354static int pf_interception(struct vcpu_svm *svm) 1600static int pf_interception(struct vcpu_svm *svm)
1355{ 1601{
1356 u64 fault_address; 1602 u64 fault_address = svm->vmcb->control.exit_info_2;
1357 u32 error_code; 1603 u32 error_code;
1604 int r = 1;
1358 1605
1359 fault_address = svm->vmcb->control.exit_info_2; 1606 switch (svm->apf_reason) {
1360 error_code = svm->vmcb->control.exit_info_1; 1607 default:
1361 1608 error_code = svm->vmcb->control.exit_info_1;
1362 trace_kvm_page_fault(fault_address, error_code); 1609
1363 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu)) 1610 trace_kvm_page_fault(fault_address, error_code);
1364 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address); 1611 if (!npt_enabled && kvm_event_needs_reinjection(&svm->vcpu))
1365 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1612 kvm_mmu_unprotect_page_virt(&svm->vcpu, fault_address);
1613 r = kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code,
1614 svm->vmcb->control.insn_bytes,
1615 svm->vmcb->control.insn_len);
1616 break;
1617 case KVM_PV_REASON_PAGE_NOT_PRESENT:
1618 svm->apf_reason = 0;
1619 local_irq_disable();
1620 kvm_async_pf_task_wait(fault_address);
1621 local_irq_enable();
1622 break;
1623 case KVM_PV_REASON_PAGE_READY:
1624 svm->apf_reason = 0;
1625 local_irq_disable();
1626 kvm_async_pf_task_wake(fault_address);
1627 local_irq_enable();
1628 break;
1629 }
1630 return r;
1366} 1631}
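
The reworked pf_interception() demultiplexes on svm->apf_reason: zero means an ordinary guest page fault handled by the MMU, while the two paravirtual async-page-fault reasons merely park or wake the affected task. A minimal user-space sketch of that dispatch (the reason constants and helper names below are stand-ins, not the KVM API):

#include <stdio.h>
#include <stdint.h>

/* Stand-ins for the paravirtual async-PF reason codes. */
enum apf_reason { APF_NONE = 0, APF_PAGE_NOT_PRESENT = 1, APF_PAGE_READY = 2 };

static void mmu_page_fault(uint64_t addr) { printf("MMU fault at %#llx\n", (unsigned long long)addr); }
static void apf_task_wait(uint64_t token) { printf("park task, token %#llx\n", (unsigned long long)token); }
static void apf_task_wake(uint64_t token) { printf("wake task, token %#llx\n", (unsigned long long)token); }

static void pf_intercept(enum apf_reason *reason, uint64_t fault_address)
{
        switch (*reason) {
        default:                       /* ordinary #PF: let the MMU handle it */
                mmu_page_fault(fault_address);
                break;
        case APF_PAGE_NOT_PRESENT:     /* host is still paging the page in */
                *reason = APF_NONE;
                apf_task_wait(fault_address);
                break;
        case APF_PAGE_READY:           /* host finished, unblock the waiter */
                *reason = APF_NONE;
                apf_task_wake(fault_address);
                break;
        }
}

int main(void)
{
        enum apf_reason r = APF_PAGE_NOT_PRESENT;
        pf_intercept(&r, 0x1000);
        r = APF_PAGE_READY;
        pf_intercept(&r, 0x1000);
        r = APF_NONE;
        pf_intercept(&r, 0xdead000);
        return 0;
}
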
1367 1632
1368static int db_interception(struct vcpu_svm *svm) 1633static int db_interception(struct vcpu_svm *svm)
@@ -1410,7 +1675,7 @@ static int ud_interception(struct vcpu_svm *svm)
1410{ 1675{
1411 int er; 1676 int er;
1412 1677
1413 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD); 1678 er = emulate_instruction(&svm->vcpu, EMULTYPE_TRAP_UD);
1414 if (er != EMULATE_DONE) 1679 if (er != EMULATE_DONE)
1415 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1680 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1416 return 1; 1681 return 1;
@@ -1419,21 +1684,8 @@ static int ud_interception(struct vcpu_svm *svm)
1419static void svm_fpu_activate(struct kvm_vcpu *vcpu) 1684static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1420{ 1685{
1421 struct vcpu_svm *svm = to_svm(vcpu); 1686 struct vcpu_svm *svm = to_svm(vcpu);
1422 u32 excp;
1423
1424 if (is_nested(svm)) {
1425 u32 h_excp, n_excp;
1426
1427 h_excp = svm->nested.hsave->control.intercept_exceptions;
1428 n_excp = svm->nested.intercept_exceptions;
1429 h_excp &= ~(1 << NM_VECTOR);
1430 excp = h_excp | n_excp;
1431 } else {
1432 excp = svm->vmcb->control.intercept_exceptions;
1433 excp &= ~(1 << NM_VECTOR);
1434 }
1435 1687
1436 svm->vmcb->control.intercept_exceptions = excp; 1688 clr_exception_intercept(svm, NM_VECTOR);
1437 1689
1438 svm->vcpu.fpu_active = 1; 1690 svm->vcpu.fpu_active = 1;
1439 update_cr0_intercept(svm); 1691 update_cr0_intercept(svm);
@@ -1540,7 +1792,7 @@ static int io_interception(struct vcpu_svm *svm)
1540 string = (io_info & SVM_IOIO_STR_MASK) != 0; 1792 string = (io_info & SVM_IOIO_STR_MASK) != 0;
1541 in = (io_info & SVM_IOIO_TYPE_MASK) != 0; 1793 in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
1542 if (string || in) 1794 if (string || in)
1543 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 1795 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
1544 1796
1545 port = io_info >> 16; 1797 port = io_info >> 16;
1546 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1798 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
@@ -1581,6 +1833,56 @@ static int vmmcall_interception(struct vcpu_svm *svm)
1581 return 1; 1833 return 1;
1582} 1834}
1583 1835
1836static unsigned long nested_svm_get_tdp_cr3(struct kvm_vcpu *vcpu)
1837{
1838 struct vcpu_svm *svm = to_svm(vcpu);
1839
1840 return svm->nested.nested_cr3;
1841}
1842
1843static void nested_svm_set_tdp_cr3(struct kvm_vcpu *vcpu,
1844 unsigned long root)
1845{
1846 struct vcpu_svm *svm = to_svm(vcpu);
1847
1848 svm->vmcb->control.nested_cr3 = root;
1849 mark_dirty(svm->vmcb, VMCB_NPT);
1850 svm_flush_tlb(vcpu);
1851}
1852
1853static void nested_svm_inject_npf_exit(struct kvm_vcpu *vcpu,
1854 struct x86_exception *fault)
1855{
1856 struct vcpu_svm *svm = to_svm(vcpu);
1857
1858 svm->vmcb->control.exit_code = SVM_EXIT_NPF;
1859 svm->vmcb->control.exit_code_hi = 0;
1860 svm->vmcb->control.exit_info_1 = fault->error_code;
1861 svm->vmcb->control.exit_info_2 = fault->address;
1862
1863 nested_svm_vmexit(svm);
1864}
1865
1866static int nested_svm_init_mmu_context(struct kvm_vcpu *vcpu)
1867{
1868 int r;
1869
1870 r = kvm_init_shadow_mmu(vcpu, &vcpu->arch.mmu);
1871
1872 vcpu->arch.mmu.set_cr3 = nested_svm_set_tdp_cr3;
1873 vcpu->arch.mmu.get_cr3 = nested_svm_get_tdp_cr3;
1874 vcpu->arch.mmu.inject_page_fault = nested_svm_inject_npf_exit;
1875 vcpu->arch.mmu.shadow_root_level = get_npt_level();
1876 vcpu->arch.walk_mmu = &vcpu->arch.nested_mmu;
1877
1878 return r;
1879}
1880
1881static void nested_svm_uninit_mmu_context(struct kvm_vcpu *vcpu)
1882{
1883 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
1884}
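
nested_svm_init_mmu_context() repoints the vCPU's MMU callbacks at the L1 hypervisor's nested page table root (nested_cr3), so page-table roots are fetched from and loaded into L1's NPT while the walker uses the nested MMU. A rough function-pointer sketch of that redirection (user-space, all names are mine):

#include <stdio.h>
#include <stdint.h>

struct mmu {
        uint64_t (*get_cr3)(void);
        void     (*set_cr3)(uint64_t root);
};

static uint64_t nested_cr3 = 0xabc000;          /* root chosen by the L1 hypervisor */

static uint64_t nested_get_cr3(void)       { return nested_cr3; }
static void     nested_set_cr3(uint64_t r) { printf("load nCR3 root %#llx\n", (unsigned long long)r); }

static uint64_t host_get_cr3(void)         { return 0x100000; }
static void     host_set_cr3(uint64_t r)   { printf("load host root %#llx\n", (unsigned long long)r); }

int main(void)
{
        struct mmu mmu = { host_get_cr3, host_set_cr3 };

        /* nested_svm_init_mmu_context(): redirect the callbacks */
        mmu.get_cr3 = nested_get_cr3;
        mmu.set_cr3 = nested_set_cr3;

        mmu.set_cr3(mmu.get_cr3());     /* now walks/loads L1's NPT root */
        return 0;
}
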
1885
1584static int nested_svm_check_permissions(struct vcpu_svm *svm) 1886static int nested_svm_check_permissions(struct vcpu_svm *svm)
1585{ 1887{
1586 if (!(svm->vcpu.arch.efer & EFER_SVME) 1888 if (!(svm->vcpu.arch.efer & EFER_SVME)
@@ -1602,7 +1904,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1602{ 1904{
1603 int vmexit; 1905 int vmexit;
1604 1906
1605 if (!is_nested(svm)) 1907 if (!is_guest_mode(&svm->vcpu))
1606 return 0; 1908 return 0;
1607 1909
1608 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr; 1910 svm->vmcb->control.exit_code = SVM_EXIT_EXCP_BASE + nr;
@@ -1620,7 +1922,7 @@ static int nested_svm_check_exception(struct vcpu_svm *svm, unsigned nr,
1620/* This function returns true if it is safe to enable the irq window */ 1922
1621static inline bool nested_svm_intr(struct vcpu_svm *svm) 1923static inline bool nested_svm_intr(struct vcpu_svm *svm)
1622{ 1924{
1623 if (!is_nested(svm)) 1925 if (!is_guest_mode(&svm->vcpu))
1624 return true; 1926 return true;
1625 1927
1626 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK)) 1928 if (!(svm->vcpu.arch.hflags & HF_VINTR_MASK))
@@ -1629,6 +1931,14 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1629 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK)) 1931 if (!(svm->vcpu.arch.hflags & HF_HIF_MASK))
1630 return false; 1932 return false;
1631 1933
1934 /*
 1935	 * If a vmexit was already requested (by an intercepted
 1936	 * exception, for instance), do not overwrite it with an
 1937	 * "external interrupt" vmexit.
1938 */
1939 if (svm->nested.exit_required)
1940 return false;
1941
1632 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1942 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1633 svm->vmcb->control.exit_info_1 = 0; 1943 svm->vmcb->control.exit_info_1 = 0;
1634 svm->vmcb->control.exit_info_2 = 0; 1944 svm->vmcb->control.exit_info_2 = 0;
@@ -1651,7 +1961,7 @@ static inline bool nested_svm_intr(struct vcpu_svm *svm)
1651/* This function returns true if it is safe to enable the nmi window */ 1961
1652static inline bool nested_svm_nmi(struct vcpu_svm *svm) 1962static inline bool nested_svm_nmi(struct vcpu_svm *svm)
1653{ 1963{
1654 if (!is_nested(svm)) 1964 if (!is_guest_mode(&svm->vcpu))
1655 return true; 1965 return true;
1656 1966
1657 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI))) 1967 if (!(svm->nested.intercept & (1ULL << INTERCEPT_NMI)))
@@ -1750,8 +2060,8 @@ static int nested_svm_exit_special(struct vcpu_svm *svm)
1750 return NESTED_EXIT_HOST; 2060 return NESTED_EXIT_HOST;
1751 break; 2061 break;
1752 case SVM_EXIT_EXCP_BASE + PF_VECTOR: 2062 case SVM_EXIT_EXCP_BASE + PF_VECTOR:
1753 /* When we're shadowing, trap PFs */ 2063 /* When we're shadowing, trap PFs, but not async PF */
1754 if (!npt_enabled) 2064 if (!npt_enabled && svm->apf_reason == 0)
1755 return NESTED_EXIT_HOST; 2065 return NESTED_EXIT_HOST;
1756 break; 2066 break;
1757 case SVM_EXIT_EXCP_BASE + NM_VECTOR: 2067 case SVM_EXIT_EXCP_BASE + NM_VECTOR:
@@ -1779,27 +2089,15 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1779 case SVM_EXIT_IOIO: 2089 case SVM_EXIT_IOIO:
1780 vmexit = nested_svm_intercept_ioio(svm); 2090 vmexit = nested_svm_intercept_ioio(svm);
1781 break; 2091 break;
1782 case SVM_EXIT_READ_CR0 ... SVM_EXIT_READ_CR8: { 2092 case SVM_EXIT_READ_CR0 ... SVM_EXIT_WRITE_CR8: {
1783 u32 cr_bits = 1 << (exit_code - SVM_EXIT_READ_CR0); 2093 u32 bit = 1U << (exit_code - SVM_EXIT_READ_CR0);
1784 if (svm->nested.intercept_cr_read & cr_bits) 2094 if (svm->nested.intercept_cr & bit)
1785 vmexit = NESTED_EXIT_DONE;
1786 break;
1787 }
1788 case SVM_EXIT_WRITE_CR0 ... SVM_EXIT_WRITE_CR8: {
1789 u32 cr_bits = 1 << (exit_code - SVM_EXIT_WRITE_CR0);
1790 if (svm->nested.intercept_cr_write & cr_bits)
1791 vmexit = NESTED_EXIT_DONE;
1792 break;
1793 }
1794 case SVM_EXIT_READ_DR0 ... SVM_EXIT_READ_DR7: {
1795 u32 dr_bits = 1 << (exit_code - SVM_EXIT_READ_DR0);
1796 if (svm->nested.intercept_dr_read & dr_bits)
1797 vmexit = NESTED_EXIT_DONE; 2095 vmexit = NESTED_EXIT_DONE;
1798 break; 2096 break;
1799 } 2097 }
1800 case SVM_EXIT_WRITE_DR0 ... SVM_EXIT_WRITE_DR7: { 2098 case SVM_EXIT_READ_DR0 ... SVM_EXIT_WRITE_DR7: {
1801 u32 dr_bits = 1 << (exit_code - SVM_EXIT_WRITE_DR0); 2099 u32 bit = 1U << (exit_code - SVM_EXIT_READ_DR0);
1802 if (svm->nested.intercept_dr_write & dr_bits) 2100 if (svm->nested.intercept_dr & bit)
1803 vmexit = NESTED_EXIT_DONE; 2101 vmexit = NESTED_EXIT_DONE;
1804 break; 2102 break;
1805 } 2103 }
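
The consolidation above relies on the SVM exit-code layout: write exits sit exactly 16 above the matching read exits (read CRn at 0x00 + n, write CRn at 0x10 + n, and similarly for DR at 0x20/0x30; values quoted from memory). One u32 can therefore hold read intercepts in bits 0-15 and write intercepts in bits 16-31, indexed by 1 << (exit_code - SVM_EXIT_READ_CR0); the trace call later in this patch splits the word back with & 0xffff and >> 16. A quick stand-alone illustration:

#include <stdio.h>
#include <stdint.h>

/* SVM exit code bases (assumed standard values: 0x00 read CR, 0x10 write CR). */
#define SVM_EXIT_READ_CR0   0x000
#define SVM_EXIT_WRITE_CR0  0x010

int main(void)
{
        uint32_t intercept_cr = 0;

        intercept_cr |= 1u << (3 + 0);   /* intercept CR3 reads  -> bit 3  */
        intercept_cr |= 1u << (3 + 16);  /* intercept CR3 writes -> bit 19 */

        uint32_t exit_code = SVM_EXIT_WRITE_CR0 + 3;           /* write to CR3 */
        uint32_t bit = 1u << (exit_code - SVM_EXIT_READ_CR0);  /* == 1 << 19   */

        printf("read half %#x, write half %#x, hit=%d\n",
               intercept_cr & 0xffff, intercept_cr >> 16,
               !!(intercept_cr & bit));
        return 0;
}
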
@@ -1807,6 +2105,10 @@ static int nested_svm_intercept(struct vcpu_svm *svm)
1807 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE); 2105 u32 excp_bits = 1 << (exit_code - SVM_EXIT_EXCP_BASE);
1808 if (svm->nested.intercept_exceptions & excp_bits) 2106 if (svm->nested.intercept_exceptions & excp_bits)
1809 vmexit = NESTED_EXIT_DONE; 2107 vmexit = NESTED_EXIT_DONE;
 2108 /* an async page fault always causes a vmexit */
2109 else if ((exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) &&
2110 svm->apf_reason != 0)
2111 vmexit = NESTED_EXIT_DONE;
1810 break; 2112 break;
1811 } 2113 }
1812 case SVM_EXIT_ERR: { 2114 case SVM_EXIT_ERR: {
@@ -1840,10 +2142,8 @@ static inline void copy_vmcb_control_area(struct vmcb *dst_vmcb, struct vmcb *fr
1840 struct vmcb_control_area *dst = &dst_vmcb->control; 2142 struct vmcb_control_area *dst = &dst_vmcb->control;
1841 struct vmcb_control_area *from = &from_vmcb->control; 2143 struct vmcb_control_area *from = &from_vmcb->control;
1842 2144
1843 dst->intercept_cr_read = from->intercept_cr_read; 2145 dst->intercept_cr = from->intercept_cr;
1844 dst->intercept_cr_write = from->intercept_cr_write; 2146 dst->intercept_dr = from->intercept_dr;
1845 dst->intercept_dr_read = from->intercept_dr_read;
1846 dst->intercept_dr_write = from->intercept_dr_write;
1847 dst->intercept_exceptions = from->intercept_exceptions; 2147 dst->intercept_exceptions = from->intercept_exceptions;
1848 dst->intercept = from->intercept; 2148 dst->intercept = from->intercept;
1849 dst->iopm_base_pa = from->iopm_base_pa; 2149 dst->iopm_base_pa = from->iopm_base_pa;
@@ -1884,7 +2184,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1884 if (!nested_vmcb) 2184 if (!nested_vmcb)
1885 return 1; 2185 return 1;
1886 2186
1887 /* Exit nested SVM mode */ 2187 /* Exit Guest-Mode */
2188 leave_guest_mode(&svm->vcpu);
1888 svm->nested.vmcb = 0; 2189 svm->nested.vmcb = 0;
1889 2190
1890 /* Give the current vmcb to the guest */ 2191 /* Give the current vmcb to the guest */
@@ -1896,11 +2197,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1896 nested_vmcb->save.ds = vmcb->save.ds; 2197 nested_vmcb->save.ds = vmcb->save.ds;
1897 nested_vmcb->save.gdtr = vmcb->save.gdtr; 2198 nested_vmcb->save.gdtr = vmcb->save.gdtr;
1898 nested_vmcb->save.idtr = vmcb->save.idtr; 2199 nested_vmcb->save.idtr = vmcb->save.idtr;
2200 nested_vmcb->save.efer = svm->vcpu.arch.efer;
1899 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu); 2201 nested_vmcb->save.cr0 = kvm_read_cr0(&svm->vcpu);
1900 nested_vmcb->save.cr3 = svm->vcpu.arch.cr3; 2202 nested_vmcb->save.cr3 = kvm_read_cr3(&svm->vcpu);
1901 nested_vmcb->save.cr2 = vmcb->save.cr2; 2203 nested_vmcb->save.cr2 = vmcb->save.cr2;
1902 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4; 2204 nested_vmcb->save.cr4 = svm->vcpu.arch.cr4;
1903 nested_vmcb->save.rflags = vmcb->save.rflags; 2205 nested_vmcb->save.rflags = kvm_get_rflags(&svm->vcpu);
1904 nested_vmcb->save.rip = vmcb->save.rip; 2206 nested_vmcb->save.rip = vmcb->save.rip;
1905 nested_vmcb->save.rsp = vmcb->save.rsp; 2207 nested_vmcb->save.rsp = vmcb->save.rsp;
1906 nested_vmcb->save.rax = vmcb->save.rax; 2208 nested_vmcb->save.rax = vmcb->save.rax;
@@ -1917,6 +2219,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1917 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 2219 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1918 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 2220 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1919 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 2221 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
2222 nested_vmcb->control.next_rip = vmcb->control.next_rip;
1920 2223
1921 /* 2224 /*
1922 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have 2225 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
@@ -1947,6 +2250,8 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1947 kvm_clear_exception_queue(&svm->vcpu); 2250 kvm_clear_exception_queue(&svm->vcpu);
1948 kvm_clear_interrupt_queue(&svm->vcpu); 2251 kvm_clear_interrupt_queue(&svm->vcpu);
1949 2252
2253 svm->nested.nested_cr3 = 0;
2254
1950 /* Restore selected save entries */ 2255 /* Restore selected save entries */
1951 svm->vmcb->save.es = hsave->save.es; 2256 svm->vmcb->save.es = hsave->save.es;
1952 svm->vmcb->save.cs = hsave->save.cs; 2257 svm->vmcb->save.cs = hsave->save.cs;
@@ -1954,7 +2259,7 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1954 svm->vmcb->save.ds = hsave->save.ds; 2259 svm->vmcb->save.ds = hsave->save.ds;
1955 svm->vmcb->save.gdtr = hsave->save.gdtr; 2260 svm->vmcb->save.gdtr = hsave->save.gdtr;
1956 svm->vmcb->save.idtr = hsave->save.idtr; 2261 svm->vmcb->save.idtr = hsave->save.idtr;
1957 svm->vmcb->save.rflags = hsave->save.rflags; 2262 kvm_set_rflags(&svm->vcpu, hsave->save.rflags);
1958 svm_set_efer(&svm->vcpu, hsave->save.efer); 2263 svm_set_efer(&svm->vcpu, hsave->save.efer);
1959 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE); 2264 svm_set_cr0(&svm->vcpu, hsave->save.cr0 | X86_CR0_PE);
1960 svm_set_cr4(&svm->vcpu, hsave->save.cr4); 2265 svm_set_cr4(&svm->vcpu, hsave->save.cr4);
@@ -1971,8 +2276,11 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1971 svm->vmcb->save.cpl = 0; 2276 svm->vmcb->save.cpl = 0;
1972 svm->vmcb->control.exit_int_info = 0; 2277 svm->vmcb->control.exit_int_info = 0;
1973 2278
2279 mark_all_dirty(svm->vmcb);
2280
1974 nested_svm_unmap(page); 2281 nested_svm_unmap(page);
1975 2282
2283 nested_svm_uninit_mmu_context(&svm->vcpu);
1976 kvm_mmu_reset_context(&svm->vcpu); 2284 kvm_mmu_reset_context(&svm->vcpu);
1977 kvm_mmu_load(&svm->vcpu); 2285 kvm_mmu_load(&svm->vcpu);
1978 2286
@@ -2012,6 +2320,20 @@ static bool nested_svm_vmrun_msrpm(struct vcpu_svm *svm)
2012 return true; 2320 return true;
2013} 2321}
2014 2322
2323static bool nested_vmcb_checks(struct vmcb *vmcb)
2324{
2325 if ((vmcb->control.intercept & (1ULL << INTERCEPT_VMRUN)) == 0)
2326 return false;
2327
2328 if (vmcb->control.asid == 0)
2329 return false;
2330
2331 if (vmcb->control.nested_ctl && !npt_enabled)
2332 return false;
2333
2334 return true;
2335}
2336
2015static bool nested_svm_vmrun(struct vcpu_svm *svm) 2337static bool nested_svm_vmrun(struct vcpu_svm *svm)
2016{ 2338{
2017 struct vmcb *nested_vmcb; 2339 struct vmcb *nested_vmcb;
@@ -2026,14 +2348,25 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2026 if (!nested_vmcb) 2348 if (!nested_vmcb)
2027 return false; 2349 return false;
2028 2350
2029 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, vmcb_gpa, 2351 if (!nested_vmcb_checks(nested_vmcb)) {
2352 nested_vmcb->control.exit_code = SVM_EXIT_ERR;
2353 nested_vmcb->control.exit_code_hi = 0;
2354 nested_vmcb->control.exit_info_1 = 0;
2355 nested_vmcb->control.exit_info_2 = 0;
2356
2357 nested_svm_unmap(page);
2358
2359 return false;
2360 }
2361
2362 trace_kvm_nested_vmrun(svm->vmcb->save.rip, vmcb_gpa,
2030 nested_vmcb->save.rip, 2363 nested_vmcb->save.rip,
2031 nested_vmcb->control.int_ctl, 2364 nested_vmcb->control.int_ctl,
2032 nested_vmcb->control.event_inj, 2365 nested_vmcb->control.event_inj,
2033 nested_vmcb->control.nested_ctl); 2366 nested_vmcb->control.nested_ctl);
2034 2367
2035 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr_read, 2368 trace_kvm_nested_intercepts(nested_vmcb->control.intercept_cr & 0xffff,
2036 nested_vmcb->control.intercept_cr_write, 2369 nested_vmcb->control.intercept_cr >> 16,
2037 nested_vmcb->control.intercept_exceptions, 2370 nested_vmcb->control.intercept_exceptions,
2038 nested_vmcb->control.intercept); 2371 nested_vmcb->control.intercept);
2039 2372
@@ -2054,22 +2387,28 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2054 hsave->save.efer = svm->vcpu.arch.efer; 2387 hsave->save.efer = svm->vcpu.arch.efer;
2055 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu); 2388 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
2056 hsave->save.cr4 = svm->vcpu.arch.cr4; 2389 hsave->save.cr4 = svm->vcpu.arch.cr4;
2057 hsave->save.rflags = vmcb->save.rflags; 2390 hsave->save.rflags = kvm_get_rflags(&svm->vcpu);
2058 hsave->save.rip = svm->next_rip; 2391 hsave->save.rip = kvm_rip_read(&svm->vcpu);
2059 hsave->save.rsp = vmcb->save.rsp; 2392 hsave->save.rsp = vmcb->save.rsp;
2060 hsave->save.rax = vmcb->save.rax; 2393 hsave->save.rax = vmcb->save.rax;
2061 if (npt_enabled) 2394 if (npt_enabled)
2062 hsave->save.cr3 = vmcb->save.cr3; 2395 hsave->save.cr3 = vmcb->save.cr3;
2063 else 2396 else
2064 hsave->save.cr3 = svm->vcpu.arch.cr3; 2397 hsave->save.cr3 = kvm_read_cr3(&svm->vcpu);
2065 2398
2066 copy_vmcb_control_area(hsave, vmcb); 2399 copy_vmcb_control_area(hsave, vmcb);
2067 2400
2068 if (svm->vmcb->save.rflags & X86_EFLAGS_IF) 2401 if (kvm_get_rflags(&svm->vcpu) & X86_EFLAGS_IF)
2069 svm->vcpu.arch.hflags |= HF_HIF_MASK; 2402 svm->vcpu.arch.hflags |= HF_HIF_MASK;
2070 else 2403 else
2071 svm->vcpu.arch.hflags &= ~HF_HIF_MASK; 2404 svm->vcpu.arch.hflags &= ~HF_HIF_MASK;
2072 2405
2406 if (nested_vmcb->control.nested_ctl) {
2407 kvm_mmu_unload(&svm->vcpu);
2408 svm->nested.nested_cr3 = nested_vmcb->control.nested_cr3;
2409 nested_svm_init_mmu_context(&svm->vcpu);
2410 }
2411
2073 /* Load the nested guest state */ 2412 /* Load the nested guest state */
2074 svm->vmcb->save.es = nested_vmcb->save.es; 2413 svm->vmcb->save.es = nested_vmcb->save.es;
2075 svm->vmcb->save.cs = nested_vmcb->save.cs; 2414 svm->vmcb->save.cs = nested_vmcb->save.cs;
@@ -2077,7 +2416,7 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2077 svm->vmcb->save.ds = nested_vmcb->save.ds; 2416 svm->vmcb->save.ds = nested_vmcb->save.ds;
2078 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr; 2417 svm->vmcb->save.gdtr = nested_vmcb->save.gdtr;
2079 svm->vmcb->save.idtr = nested_vmcb->save.idtr; 2418 svm->vmcb->save.idtr = nested_vmcb->save.idtr;
2080 svm->vmcb->save.rflags = nested_vmcb->save.rflags; 2419 kvm_set_rflags(&svm->vcpu, nested_vmcb->save.rflags);
2081 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer); 2420 svm_set_efer(&svm->vcpu, nested_vmcb->save.efer);
2082 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0); 2421 svm_set_cr0(&svm->vcpu, nested_vmcb->save.cr0);
2083 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4); 2422 svm_set_cr4(&svm->vcpu, nested_vmcb->save.cr4);
@@ -2107,14 +2446,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2107 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL; 2446 svm->nested.vmcb_iopm = nested_vmcb->control.iopm_base_pa & ~0x0fffULL;
2108 2447
2109 /* cache intercepts */ 2448 /* cache intercepts */
2110 svm->nested.intercept_cr_read = nested_vmcb->control.intercept_cr_read; 2449 svm->nested.intercept_cr = nested_vmcb->control.intercept_cr;
2111 svm->nested.intercept_cr_write = nested_vmcb->control.intercept_cr_write; 2450 svm->nested.intercept_dr = nested_vmcb->control.intercept_dr;
2112 svm->nested.intercept_dr_read = nested_vmcb->control.intercept_dr_read;
2113 svm->nested.intercept_dr_write = nested_vmcb->control.intercept_dr_write;
2114 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions; 2451 svm->nested.intercept_exceptions = nested_vmcb->control.intercept_exceptions;
2115 svm->nested.intercept = nested_vmcb->control.intercept; 2452 svm->nested.intercept = nested_vmcb->control.intercept;
2116 2453
2117 force_new_asid(&svm->vcpu); 2454 svm_flush_tlb(&svm->vcpu);
2118 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 2455 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
2119 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 2456 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
2120 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 2457 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
@@ -2123,29 +2460,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2123 2460
2124 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) { 2461 if (svm->vcpu.arch.hflags & HF_VINTR_MASK) {
2125 /* We only want the cr8 intercept bits of the guest */ 2462 /* We only want the cr8 intercept bits of the guest */
2126 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR8_MASK; 2463 clr_cr_intercept(svm, INTERCEPT_CR8_READ);
2127 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2464 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2128 } 2465 }
2129 2466
2130 /* We don't want to see VMMCALLs from a nested guest */ 2467 /* We don't want to see VMMCALLs from a nested guest */
2131 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_VMMCALL); 2468 clr_intercept(svm, INTERCEPT_VMMCALL);
2132
2133 /*
2134 * We don't want a nested guest to be more powerful than the guest, so
2135 * all intercepts are ORed
2136 */
2137 svm->vmcb->control.intercept_cr_read |=
2138 nested_vmcb->control.intercept_cr_read;
2139 svm->vmcb->control.intercept_cr_write |=
2140 nested_vmcb->control.intercept_cr_write;
2141 svm->vmcb->control.intercept_dr_read |=
2142 nested_vmcb->control.intercept_dr_read;
2143 svm->vmcb->control.intercept_dr_write |=
2144 nested_vmcb->control.intercept_dr_write;
2145 svm->vmcb->control.intercept_exceptions |=
2146 nested_vmcb->control.intercept_exceptions;
2147
2148 svm->vmcb->control.intercept |= nested_vmcb->control.intercept;
2149 2469
2150 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl; 2470 svm->vmcb->control.lbr_ctl = nested_vmcb->control.lbr_ctl;
2151 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 2471 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
@@ -2156,11 +2476,21 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
2156 2476
2157 nested_svm_unmap(page); 2477 nested_svm_unmap(page);
2158 2478
2159 /* nested_vmcb is our indicator if nested SVM is activated */ 2479 /* Enter Guest-Mode */
2480 enter_guest_mode(&svm->vcpu);
2481
2482 /*
2483 * Merge guest and host intercepts - must be called with vcpu in
 2484 * guest-mode to take effect here
2485 */
2486 recalc_intercepts(svm);
2487
2160 svm->nested.vmcb = vmcb_gpa; 2488 svm->nested.vmcb = vmcb_gpa;
2161 2489
2162 enable_gif(svm); 2490 enable_gif(svm);
2163 2491
2492 mark_all_dirty(svm->vmcb);
2493
2164 return true; 2494 return true;
2165} 2495}
2166 2496
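The hunk above replaces the old open-coded ORing of the guest's and host's intercept masks with a single recalc_intercepts() call, made after enter_guest_mode() so the merge runs with the vcpu already in guest mode. The helper itself is introduced elsewhere in this series; a minimal sketch of the merge it performs, reusing the cached fields shown above (type and field names are assumptions, not taken from this hunk), might look like:

	static void recalc_intercepts(struct vcpu_svm *svm)
	{
		struct vmcb_control_area *c = &svm->vmcb->control;
		struct vmcb_control_area *h = &svm->nested.hsave->control;
		struct nested_state *g = &svm->nested;

		if (!is_guest_mode(&svm->vcpu))
			return;

		/* A nested guest must never be more powerful than the guest
		 * that runs it, so every intercept mask is the OR of the two. */
		c->intercept_cr         = h->intercept_cr | g->intercept_cr;
		c->intercept_dr         = h->intercept_dr | g->intercept_dr;
		c->intercept_exceptions = h->intercept_exceptions |
					  g->intercept_exceptions;
		c->intercept            = h->intercept | g->intercept;
	}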
@@ -2188,13 +2518,13 @@ static int vmload_interception(struct vcpu_svm *svm)
2188 if (nested_svm_check_permissions(svm)) 2518 if (nested_svm_check_permissions(svm))
2189 return 1; 2519 return 1;
2190 2520
2191 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2192 skip_emulated_instruction(&svm->vcpu);
2193
2194 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2521 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2195 if (!nested_vmcb) 2522 if (!nested_vmcb)
2196 return 1; 2523 return 1;
2197 2524
2525 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2526 skip_emulated_instruction(&svm->vcpu);
2527
2198 nested_svm_vmloadsave(nested_vmcb, svm->vmcb); 2528 nested_svm_vmloadsave(nested_vmcb, svm->vmcb);
2199 nested_svm_unmap(page); 2529 nested_svm_unmap(page);
2200 2530
@@ -2209,13 +2539,13 @@ static int vmsave_interception(struct vcpu_svm *svm)
2209 if (nested_svm_check_permissions(svm)) 2539 if (nested_svm_check_permissions(svm))
2210 return 1; 2540 return 1;
2211 2541
2212 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2213 skip_emulated_instruction(&svm->vcpu);
2214
2215 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page); 2542 nested_vmcb = nested_svm_map(svm, svm->vmcb->save.rax, &page);
2216 if (!nested_vmcb) 2543 if (!nested_vmcb)
2217 return 1; 2544 return 1;
2218 2545
2546 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2547 skip_emulated_instruction(&svm->vcpu);
2548
2219 nested_svm_vmloadsave(svm->vmcb, nested_vmcb); 2549 nested_svm_vmloadsave(svm->vmcb, nested_vmcb);
2220 nested_svm_unmap(page); 2550 nested_svm_unmap(page);
2221 2551
@@ -2227,8 +2557,8 @@ static int vmrun_interception(struct vcpu_svm *svm)
2227 if (nested_svm_check_permissions(svm)) 2557 if (nested_svm_check_permissions(svm))
2228 return 1; 2558 return 1;
2229 2559
2230 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2560 /* Save rip after vmrun instruction */
2231 skip_emulated_instruction(&svm->vcpu); 2561 kvm_rip_write(&svm->vcpu, kvm_rip_read(&svm->vcpu) + 3);
2232 2562
2233 if (!nested_svm_vmrun(svm)) 2563 if (!nested_svm_vmrun(svm))
2234 return 1; 2564 return 1;
@@ -2257,6 +2587,7 @@ static int stgi_interception(struct vcpu_svm *svm)
2257 2587
2258 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 2588 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2259 skip_emulated_instruction(&svm->vcpu); 2589 skip_emulated_instruction(&svm->vcpu);
2590 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2260 2591
2261 enable_gif(svm); 2592 enable_gif(svm);
2262 2593
@@ -2277,6 +2608,8 @@ static int clgi_interception(struct vcpu_svm *svm)
2277 svm_clear_vintr(svm); 2608 svm_clear_vintr(svm);
2278 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2609 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2279 2610
2611 mark_dirty(svm->vmcb, VMCB_INTR);
2612
2280 return 1; 2613 return 1;
2281} 2614}
2282 2615
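Converted call sites such as the ones above (set_intercept(), clr_intercept(), set_cr_intercept(), clr_cr_intercept()) replace open-coded bit fiddling on svm->vmcb with small accessors added by this series. A sketch of one such pair, under the assumption that the accessors edit the host VMCB returned by get_host_vmcb() and then re-run the intercept merge:

	static inline void set_intercept(struct vcpu_svm *svm, int bit)
	{
		struct vmcb *vmcb = get_host_vmcb(svm);

		vmcb->control.intercept |= (1ULL << bit);
		recalc_intercepts(svm);	/* propagate into the active VMCB */
	}

	static inline void clr_intercept(struct vcpu_svm *svm, int bit)
	{
		struct vmcb *vmcb = get_host_vmcb(svm);

		vmcb->control.intercept &= ~(1ULL << bit);
		recalc_intercepts(svm);
	}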
@@ -2303,6 +2636,19 @@ static int skinit_interception(struct vcpu_svm *svm)
2303 return 1; 2636 return 1;
2304} 2637}
2305 2638
2639static int xsetbv_interception(struct vcpu_svm *svm)
2640{
2641 u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
2642 u32 index = kvm_register_read(&svm->vcpu, VCPU_REGS_RCX);
2643
2644 if (kvm_set_xcr(&svm->vcpu, index, new_bv) == 0) {
2645 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
2646 skip_emulated_instruction(&svm->vcpu);
2647 }
2648
2649 return 1;
2650}
2651
2306static int invalid_op_interception(struct vcpu_svm *svm) 2652static int invalid_op_interception(struct vcpu_svm *svm)
2307{ 2653{
2308 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2654 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
@@ -2384,34 +2730,162 @@ static int cpuid_interception(struct vcpu_svm *svm)
2384static int iret_interception(struct vcpu_svm *svm) 2730static int iret_interception(struct vcpu_svm *svm)
2385{ 2731{
2386 ++svm->vcpu.stat.nmi_window_exits; 2732 ++svm->vcpu.stat.nmi_window_exits;
2387 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 2733 clr_intercept(svm, INTERCEPT_IRET);
2388 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2734 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2735 svm->nmi_iret_rip = kvm_rip_read(&svm->vcpu);
2389 return 1; 2736 return 1;
2390} 2737}
2391 2738
2392static int invlpg_interception(struct vcpu_svm *svm) 2739static int invlpg_interception(struct vcpu_svm *svm)
2393{ 2740{
2394 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2741 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2742 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2743
2744 kvm_mmu_invlpg(&svm->vcpu, svm->vmcb->control.exit_info_1);
2745 skip_emulated_instruction(&svm->vcpu);
2746 return 1;
2395} 2747}
2396 2748
2397static int emulate_on_interception(struct vcpu_svm *svm) 2749static int emulate_on_interception(struct vcpu_svm *svm)
2398{ 2750{
2399 return emulate_instruction(&svm->vcpu, 0, 0, 0) == EMULATE_DONE; 2751 return emulate_instruction(&svm->vcpu, 0) == EMULATE_DONE;
2752}
2753
2754bool check_selective_cr0_intercepted(struct vcpu_svm *svm, unsigned long val)
2755{
2756 unsigned long cr0 = svm->vcpu.arch.cr0;
2757 bool ret = false;
2758 u64 intercept;
2759
2760 intercept = svm->nested.intercept;
2761
2762 if (!is_guest_mode(&svm->vcpu) ||
2763 (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0))))
2764 return false;
2765
2766 cr0 &= ~SVM_CR0_SELECTIVE_MASK;
2767 val &= ~SVM_CR0_SELECTIVE_MASK;
2768
2769 if (cr0 ^ val) {
2770 svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE;
2771 ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE);
2772 }
2773
2774 return ret;
2775}
2776
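check_selective_cr0_intercepted() above only reflects the write back to L1 when bits outside SVM_CR0_SELECTIVE_MASK change. The mask is defined near the top of svm.c rather than in this hunk; assuming it is unchanged by this series, it covers exactly the bits the selective intercept ignores:

	/* For reference; not part of this hunk. */
	#define SVM_CR0_SELECTIVE_MASK	(X86_CR0_TS | X86_CR0_MP)

	/*
	 * Example: an L2 write that only toggles CR0.TS or CR0.MP does not
	 * trigger the selective intercept, while one that changes, say,
	 * CR0.PE or CR0.CD is reflected to L1 as SVM_EXIT_CR0_SEL_WRITE.
	 */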
2777#define CR_VALID (1ULL << 63)
2778
2779static int cr_interception(struct vcpu_svm *svm)
2780{
2781 int reg, cr;
2782 unsigned long val;
2783 int err;
2784
2785 if (!static_cpu_has(X86_FEATURE_DECODEASSISTS))
2786 return emulate_on_interception(svm);
2787
2788 if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0))
2789 return emulate_on_interception(svm);
2790
2791 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2792 cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0;
2793
2794 err = 0;
2795 if (cr >= 16) { /* mov to cr */
2796 cr -= 16;
2797 val = kvm_register_read(&svm->vcpu, reg);
2798 switch (cr) {
2799 case 0:
2800 if (!check_selective_cr0_intercepted(svm, val))
2801 err = kvm_set_cr0(&svm->vcpu, val);
2802 else
2803 return 1;
2804
2805 break;
2806 case 3:
2807 err = kvm_set_cr3(&svm->vcpu, val);
2808 break;
2809 case 4:
2810 err = kvm_set_cr4(&svm->vcpu, val);
2811 break;
2812 case 8:
2813 err = kvm_set_cr8(&svm->vcpu, val);
2814 break;
2815 default:
2816 WARN(1, "unhandled write to CR%d", cr);
2817 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2818 return 1;
2819 }
2820 } else { /* mov from cr */
2821 switch (cr) {
2822 case 0:
2823 val = kvm_read_cr0(&svm->vcpu);
2824 break;
2825 case 2:
2826 val = svm->vcpu.arch.cr2;
2827 break;
2828 case 3:
2829 val = kvm_read_cr3(&svm->vcpu);
2830 break;
2831 case 4:
2832 val = kvm_read_cr4(&svm->vcpu);
2833 break;
2834 case 8:
2835 val = kvm_get_cr8(&svm->vcpu);
2836 break;
2837 default:
2838 WARN(1, "unhandled read from CR%d", cr);
2839 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2840 return 1;
2841 }
2842 kvm_register_write(&svm->vcpu, reg, val);
2843 }
2844 kvm_complete_insn_gp(&svm->vcpu, err);
2845
2846 return 1;
2847}
2848
2849static int dr_interception(struct vcpu_svm *svm)
2850{
2851 int reg, dr;
2852 unsigned long val;
2853 int err;
2854
2855 if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS))
2856 return emulate_on_interception(svm);
2857
2858 reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK;
2859 dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0;
2860
2861 if (dr >= 16) { /* mov to DRn */
2862 val = kvm_register_read(&svm->vcpu, reg);
2863 kvm_set_dr(&svm->vcpu, dr - 16, val);
2864 } else {
2865 err = kvm_get_dr(&svm->vcpu, dr, &val);
2866 if (!err)
2867 kvm_register_write(&svm->vcpu, reg, val);
2868 }
2869
2870 skip_emulated_instruction(&svm->vcpu);
2871
2872 return 1;
2400} 2873}
2401 2874
2402static int cr8_write_interception(struct vcpu_svm *svm) 2875static int cr8_write_interception(struct vcpu_svm *svm)
2403{ 2876{
2404 struct kvm_run *kvm_run = svm->vcpu.run; 2877 struct kvm_run *kvm_run = svm->vcpu.run;
2878 int r;
2405 2879
2406 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2880 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2407 /* instruction emulation calls kvm_set_cr8() */ 2881 /* instruction emulation calls kvm_set_cr8() */
2408 emulate_instruction(&svm->vcpu, 0, 0, 0); 2882 r = cr_interception(svm);
2409 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2883 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2410 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2884 clr_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2411 return 1; 2885 return r;
2412 } 2886 }
2413 if (cr8_prev <= kvm_get_cr8(&svm->vcpu)) 2887 if (cr8_prev <= kvm_get_cr8(&svm->vcpu))
2414 return 1; 2888 return r;
2415 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 2889 kvm_run->exit_reason = KVM_EXIT_SET_TPR;
2416 return 0; 2890 return 0;
2417} 2891}
@@ -2422,14 +2896,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2422 2896
2423 switch (ecx) { 2897 switch (ecx) {
2424 case MSR_IA32_TSC: { 2898 case MSR_IA32_TSC: {
2425 u64 tsc_offset; 2899 struct vmcb *vmcb = get_host_vmcb(svm);
2426 2900
2427 if (is_nested(svm)) 2901 *data = vmcb->control.tsc_offset +
2428 tsc_offset = svm->nested.hsave->control.tsc_offset; 2902 svm_scale_tsc(vcpu, native_read_tsc());
2429 else
2430 tsc_offset = svm->vmcb->control.tsc_offset;
2431 2903
2432 *data = tsc_offset + native_read_tsc();
2433 break; 2904 break;
2434 } 2905 }
2435 case MSR_STAR: 2906 case MSR_STAR:
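In the MSR_IA32_TSC read above, the old is_nested() branch is folded into get_host_vmcb(), and the raw TSC value is additionally passed through svm_scale_tsc() for TSC scaling; both helpers live outside this hunk. A plausible shape for the VMCB selector, with names assumed from the fields already used in this file:

	static inline struct vmcb *get_host_vmcb(struct vcpu_svm *svm)
	{
		/* While L2 runs, L1's control area lives in the hsave copy. */
		if (is_guest_mode(&svm->vcpu))
			return svm->nested.hsave;

		return svm->vmcb;
	}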
@@ -2542,20 +3013,9 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2542 struct vcpu_svm *svm = to_svm(vcpu); 3013 struct vcpu_svm *svm = to_svm(vcpu);
2543 3014
2544 switch (ecx) { 3015 switch (ecx) {
2545 case MSR_IA32_TSC: { 3016 case MSR_IA32_TSC:
2546 u64 tsc_offset = data - native_read_tsc(); 3017 kvm_write_tsc(vcpu, data);
2547 u64 g_tsc_offset = 0;
2548
2549 if (is_nested(svm)) {
2550 g_tsc_offset = svm->vmcb->control.tsc_offset -
2551 svm->nested.hsave->control.tsc_offset;
2552 svm->nested.hsave->control.tsc_offset = tsc_offset;
2553 }
2554
2555 svm->vmcb->control.tsc_offset = tsc_offset + g_tsc_offset;
2556
2557 break; 3018 break;
2558 }
2559 case MSR_STAR: 3019 case MSR_STAR:
2560 svm->vmcb->save.star = data; 3020 svm->vmcb->save.star = data;
2561 break; 3021 break;
@@ -2585,7 +3045,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2585 svm->vmcb->save.sysenter_esp = data; 3045 svm->vmcb->save.sysenter_esp = data;
2586 break; 3046 break;
2587 case MSR_IA32_DEBUGCTLMSR: 3047 case MSR_IA32_DEBUGCTLMSR:
2588 if (!svm_has(SVM_FEATURE_LBRV)) { 3048 if (!boot_cpu_has(X86_FEATURE_LBRV)) {
2589 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n", 3049 pr_unimpl(vcpu, "%s: MSR_IA32_DEBUGCTL 0x%llx, nop\n",
2590 __func__, data); 3050 __func__, data);
2591 break; 3051 break;
@@ -2594,6 +3054,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2594 return 1; 3054 return 1;
2595 3055
2596 svm->vmcb->save.dbgctl = data; 3056 svm->vmcb->save.dbgctl = data;
3057 mark_dirty(svm->vmcb, VMCB_LBR);
2597 if (data & (1ULL<<0)) 3058 if (data & (1ULL<<0))
2598 svm_enable_lbrv(svm); 3059 svm_enable_lbrv(svm);
2599 else 3060 else
@@ -2643,8 +3104,10 @@ static int interrupt_window_interception(struct vcpu_svm *svm)
2643{ 3104{
2644 struct kvm_run *kvm_run = svm->vcpu.run; 3105 struct kvm_run *kvm_run = svm->vcpu.run;
2645 3106
3107 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2646 svm_clear_vintr(svm); 3108 svm_clear_vintr(svm);
2647 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 3109 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
3110 mark_dirty(svm->vmcb, VMCB_INTR);
2648 /* 3111 /*
2649 * If the user space waits to inject interrupts, exit as soon as 3112 * If the user space waits to inject interrupts, exit as soon as
2650 * possible 3113 * possible
@@ -2667,31 +3130,31 @@ static int pause_interception(struct vcpu_svm *svm)
2667} 3130}
2668 3131
2669static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = { 3132static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2670 [SVM_EXIT_READ_CR0] = emulate_on_interception, 3133 [SVM_EXIT_READ_CR0] = cr_interception,
2671 [SVM_EXIT_READ_CR3] = emulate_on_interception, 3134 [SVM_EXIT_READ_CR3] = cr_interception,
2672 [SVM_EXIT_READ_CR4] = emulate_on_interception, 3135 [SVM_EXIT_READ_CR4] = cr_interception,
2673 [SVM_EXIT_READ_CR8] = emulate_on_interception, 3136 [SVM_EXIT_READ_CR8] = cr_interception,
2674 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception, 3137 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2675 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 3138 [SVM_EXIT_WRITE_CR0] = cr_interception,
2676 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 3139 [SVM_EXIT_WRITE_CR3] = cr_interception,
2677 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 3140 [SVM_EXIT_WRITE_CR4] = cr_interception,
2678 [SVM_EXIT_WRITE_CR8] = cr8_write_interception, 3141 [SVM_EXIT_WRITE_CR8] = cr8_write_interception,
2679 [SVM_EXIT_READ_DR0] = emulate_on_interception, 3142 [SVM_EXIT_READ_DR0] = dr_interception,
2680 [SVM_EXIT_READ_DR1] = emulate_on_interception, 3143 [SVM_EXIT_READ_DR1] = dr_interception,
2681 [SVM_EXIT_READ_DR2] = emulate_on_interception, 3144 [SVM_EXIT_READ_DR2] = dr_interception,
2682 [SVM_EXIT_READ_DR3] = emulate_on_interception, 3145 [SVM_EXIT_READ_DR3] = dr_interception,
2683 [SVM_EXIT_READ_DR4] = emulate_on_interception, 3146 [SVM_EXIT_READ_DR4] = dr_interception,
2684 [SVM_EXIT_READ_DR5] = emulate_on_interception, 3147 [SVM_EXIT_READ_DR5] = dr_interception,
2685 [SVM_EXIT_READ_DR6] = emulate_on_interception, 3148 [SVM_EXIT_READ_DR6] = dr_interception,
2686 [SVM_EXIT_READ_DR7] = emulate_on_interception, 3149 [SVM_EXIT_READ_DR7] = dr_interception,
2687 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 3150 [SVM_EXIT_WRITE_DR0] = dr_interception,
2688 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 3151 [SVM_EXIT_WRITE_DR1] = dr_interception,
2689 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 3152 [SVM_EXIT_WRITE_DR2] = dr_interception,
2690 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 3153 [SVM_EXIT_WRITE_DR3] = dr_interception,
2691 [SVM_EXIT_WRITE_DR4] = emulate_on_interception, 3154 [SVM_EXIT_WRITE_DR4] = dr_interception,
2692 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 3155 [SVM_EXIT_WRITE_DR5] = dr_interception,
2693 [SVM_EXIT_WRITE_DR6] = emulate_on_interception, 3156 [SVM_EXIT_WRITE_DR6] = dr_interception,
2694 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 3157 [SVM_EXIT_WRITE_DR7] = dr_interception,
2695 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 3158 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2696 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 3159 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
2697 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, 3160 [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception,
@@ -2724,100 +3187,121 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2724 [SVM_EXIT_WBINVD] = emulate_on_interception, 3187 [SVM_EXIT_WBINVD] = emulate_on_interception,
2725 [SVM_EXIT_MONITOR] = invalid_op_interception, 3188 [SVM_EXIT_MONITOR] = invalid_op_interception,
2726 [SVM_EXIT_MWAIT] = invalid_op_interception, 3189 [SVM_EXIT_MWAIT] = invalid_op_interception,
3190 [SVM_EXIT_XSETBV] = xsetbv_interception,
2727 [SVM_EXIT_NPF] = pf_interception, 3191 [SVM_EXIT_NPF] = pf_interception,
2728}; 3192};
2729 3193
2730void dump_vmcb(struct kvm_vcpu *vcpu) 3194static void dump_vmcb(struct kvm_vcpu *vcpu)
2731{ 3195{
2732 struct vcpu_svm *svm = to_svm(vcpu); 3196 struct vcpu_svm *svm = to_svm(vcpu);
2733 struct vmcb_control_area *control = &svm->vmcb->control; 3197 struct vmcb_control_area *control = &svm->vmcb->control;
2734 struct vmcb_save_area *save = &svm->vmcb->save; 3198 struct vmcb_save_area *save = &svm->vmcb->save;
2735 3199
2736 pr_err("VMCB Control Area:\n"); 3200 pr_err("VMCB Control Area:\n");
2737 pr_err("cr_read: %04x\n", control->intercept_cr_read); 3201 pr_err("%-20s%04x\n", "cr_read:", control->intercept_cr & 0xffff);
2738 pr_err("cr_write: %04x\n", control->intercept_cr_write); 3202 pr_err("%-20s%04x\n", "cr_write:", control->intercept_cr >> 16);
2739 pr_err("dr_read: %04x\n", control->intercept_dr_read); 3203 pr_err("%-20s%04x\n", "dr_read:", control->intercept_dr & 0xffff);
2740 pr_err("dr_write: %04x\n", control->intercept_dr_write); 3204 pr_err("%-20s%04x\n", "dr_write:", control->intercept_dr >> 16);
2741 pr_err("exceptions: %08x\n", control->intercept_exceptions); 3205 pr_err("%-20s%08x\n", "exceptions:", control->intercept_exceptions);
2742 pr_err("intercepts: %016llx\n", control->intercept); 3206 pr_err("%-20s%016llx\n", "intercepts:", control->intercept);
2743 pr_err("pause filter count: %d\n", control->pause_filter_count); 3207 pr_err("%-20s%d\n", "pause filter count:", control->pause_filter_count);
2744 pr_err("iopm_base_pa: %016llx\n", control->iopm_base_pa); 3208 pr_err("%-20s%016llx\n", "iopm_base_pa:", control->iopm_base_pa);
2745 pr_err("msrpm_base_pa: %016llx\n", control->msrpm_base_pa); 3209 pr_err("%-20s%016llx\n", "msrpm_base_pa:", control->msrpm_base_pa);
2746 pr_err("tsc_offset: %016llx\n", control->tsc_offset); 3210 pr_err("%-20s%016llx\n", "tsc_offset:", control->tsc_offset);
2747 pr_err("asid: %d\n", control->asid); 3211 pr_err("%-20s%d\n", "asid:", control->asid);
2748 pr_err("tlb_ctl: %d\n", control->tlb_ctl); 3212 pr_err("%-20s%d\n", "tlb_ctl:", control->tlb_ctl);
2749 pr_err("int_ctl: %08x\n", control->int_ctl); 3213 pr_err("%-20s%08x\n", "int_ctl:", control->int_ctl);
2750 pr_err("int_vector: %08x\n", control->int_vector); 3214 pr_err("%-20s%08x\n", "int_vector:", control->int_vector);
2751 pr_err("int_state: %08x\n", control->int_state); 3215 pr_err("%-20s%08x\n", "int_state:", control->int_state);
2752 pr_err("exit_code: %08x\n", control->exit_code); 3216 pr_err("%-20s%08x\n", "exit_code:", control->exit_code);
2753 pr_err("exit_info1: %016llx\n", control->exit_info_1); 3217 pr_err("%-20s%016llx\n", "exit_info1:", control->exit_info_1);
2754 pr_err("exit_info2: %016llx\n", control->exit_info_2); 3218 pr_err("%-20s%016llx\n", "exit_info2:", control->exit_info_2);
2755 pr_err("exit_int_info: %08x\n", control->exit_int_info); 3219 pr_err("%-20s%08x\n", "exit_int_info:", control->exit_int_info);
2756 pr_err("exit_int_info_err: %08x\n", control->exit_int_info_err); 3220 pr_err("%-20s%08x\n", "exit_int_info_err:", control->exit_int_info_err);
2757 pr_err("nested_ctl: %lld\n", control->nested_ctl); 3221 pr_err("%-20s%lld\n", "nested_ctl:", control->nested_ctl);
2758 pr_err("nested_cr3: %016llx\n", control->nested_cr3); 3222 pr_err("%-20s%016llx\n", "nested_cr3:", control->nested_cr3);
2759 pr_err("event_inj: %08x\n", control->event_inj); 3223 pr_err("%-20s%08x\n", "event_inj:", control->event_inj);
2760 pr_err("event_inj_err: %08x\n", control->event_inj_err); 3224 pr_err("%-20s%08x\n", "event_inj_err:", control->event_inj_err);
2761 pr_err("lbr_ctl: %lld\n", control->lbr_ctl); 3225 pr_err("%-20s%lld\n", "lbr_ctl:", control->lbr_ctl);
2762 pr_err("next_rip: %016llx\n", control->next_rip); 3226 pr_err("%-20s%016llx\n", "next_rip:", control->next_rip);
2763 pr_err("VMCB State Save Area:\n"); 3227 pr_err("VMCB State Save Area:\n");
2764 pr_err("es: s: %04x a: %04x l: %08x b: %016llx\n", 3228 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2765 save->es.selector, save->es.attrib, 3229 "es:",
2766 save->es.limit, save->es.base); 3230 save->es.selector, save->es.attrib,
2767 pr_err("cs: s: %04x a: %04x l: %08x b: %016llx\n", 3231 save->es.limit, save->es.base);
2768 save->cs.selector, save->cs.attrib, 3232 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2769 save->cs.limit, save->cs.base); 3233 "cs:",
2770 pr_err("ss: s: %04x a: %04x l: %08x b: %016llx\n", 3234 save->cs.selector, save->cs.attrib,
2771 save->ss.selector, save->ss.attrib, 3235 save->cs.limit, save->cs.base);
2772 save->ss.limit, save->ss.base); 3236 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2773 pr_err("ds: s: %04x a: %04x l: %08x b: %016llx\n", 3237 "ss:",
2774 save->ds.selector, save->ds.attrib, 3238 save->ss.selector, save->ss.attrib,
2775 save->ds.limit, save->ds.base); 3239 save->ss.limit, save->ss.base);
2776 pr_err("fs: s: %04x a: %04x l: %08x b: %016llx\n", 3240 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2777 save->fs.selector, save->fs.attrib, 3241 "ds:",
2778 save->fs.limit, save->fs.base); 3242 save->ds.selector, save->ds.attrib,
2779 pr_err("gs: s: %04x a: %04x l: %08x b: %016llx\n", 3243 save->ds.limit, save->ds.base);
2780 save->gs.selector, save->gs.attrib, 3244 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2781 save->gs.limit, save->gs.base); 3245 "fs:",
2782 pr_err("gdtr: s: %04x a: %04x l: %08x b: %016llx\n", 3246 save->fs.selector, save->fs.attrib,
2783 save->gdtr.selector, save->gdtr.attrib, 3247 save->fs.limit, save->fs.base);
2784 save->gdtr.limit, save->gdtr.base); 3248 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2785 pr_err("ldtr: s: %04x a: %04x l: %08x b: %016llx\n", 3249 "gs:",
2786 save->ldtr.selector, save->ldtr.attrib, 3250 save->gs.selector, save->gs.attrib,
2787 save->ldtr.limit, save->ldtr.base); 3251 save->gs.limit, save->gs.base);
2788 pr_err("idtr: s: %04x a: %04x l: %08x b: %016llx\n", 3252 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2789 save->idtr.selector, save->idtr.attrib, 3253 "gdtr:",
2790 save->idtr.limit, save->idtr.base); 3254 save->gdtr.selector, save->gdtr.attrib,
2791 pr_err("tr: s: %04x a: %04x l: %08x b: %016llx\n", 3255 save->gdtr.limit, save->gdtr.base);
2792 save->tr.selector, save->tr.attrib, 3256 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
2793 save->tr.limit, save->tr.base); 3257 "ldtr:",
3258 save->ldtr.selector, save->ldtr.attrib,
3259 save->ldtr.limit, save->ldtr.base);
3260 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3261 "idtr:",
3262 save->idtr.selector, save->idtr.attrib,
3263 save->idtr.limit, save->idtr.base);
3264 pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n",
3265 "tr:",
3266 save->tr.selector, save->tr.attrib,
3267 save->tr.limit, save->tr.base);
2794 pr_err("cpl: %d efer: %016llx\n", 3268 pr_err("cpl: %d efer: %016llx\n",
2795 save->cpl, save->efer); 3269 save->cpl, save->efer);
2796 pr_err("cr0: %016llx cr2: %016llx\n", 3270 pr_err("%-15s %016llx %-13s %016llx\n",
2797 save->cr0, save->cr2); 3271 "cr0:", save->cr0, "cr2:", save->cr2);
2798 pr_err("cr3: %016llx cr4: %016llx\n", 3272 pr_err("%-15s %016llx %-13s %016llx\n",
2799 save->cr3, save->cr4); 3273 "cr3:", save->cr3, "cr4:", save->cr4);
2800 pr_err("dr6: %016llx dr7: %016llx\n", 3274 pr_err("%-15s %016llx %-13s %016llx\n",
2801 save->dr6, save->dr7); 3275 "dr6:", save->dr6, "dr7:", save->dr7);
2802 pr_err("rip: %016llx rflags: %016llx\n", 3276 pr_err("%-15s %016llx %-13s %016llx\n",
2803 save->rip, save->rflags); 3277 "rip:", save->rip, "rflags:", save->rflags);
2804 pr_err("rsp: %016llx rax: %016llx\n", 3278 pr_err("%-15s %016llx %-13s %016llx\n",
2805 save->rsp, save->rax); 3279 "rsp:", save->rsp, "rax:", save->rax);
2806 pr_err("star: %016llx lstar: %016llx\n", 3280 pr_err("%-15s %016llx %-13s %016llx\n",
2807 save->star, save->lstar); 3281 "star:", save->star, "lstar:", save->lstar);
2808 pr_err("cstar: %016llx sfmask: %016llx\n", 3282 pr_err("%-15s %016llx %-13s %016llx\n",
2809 save->cstar, save->sfmask); 3283 "cstar:", save->cstar, "sfmask:", save->sfmask);
2810 pr_err("kernel_gs_base: %016llx sysenter_cs: %016llx\n", 3284 pr_err("%-15s %016llx %-13s %016llx\n",
2811 save->kernel_gs_base, save->sysenter_cs); 3285 "kernel_gs_base:", save->kernel_gs_base,
2812 pr_err("sysenter_esp: %016llx sysenter_eip: %016llx\n", 3286 "sysenter_cs:", save->sysenter_cs);
2813 save->sysenter_esp, save->sysenter_eip); 3287 pr_err("%-15s %016llx %-13s %016llx\n",
2814 pr_err("gpat: %016llx dbgctl: %016llx\n", 3288 "sysenter_esp:", save->sysenter_esp,
2815 save->g_pat, save->dbgctl); 3289 "sysenter_eip:", save->sysenter_eip);
2816 pr_err("br_from: %016llx br_to: %016llx\n", 3290 pr_err("%-15s %016llx %-13s %016llx\n",
2817 save->br_from, save->br_to); 3291 "gpat:", save->g_pat, "dbgctl:", save->dbgctl);
2818 pr_err("excp_from: %016llx excp_to: %016llx\n", 3292 pr_err("%-15s %016llx %-13s %016llx\n",
2819 save->last_excp_from, save->last_excp_to); 3293 "br_from:", save->br_from, "br_to:", save->br_to);
2820 3294 pr_err("%-15s %016llx %-13s %016llx\n",
3295 "excp_from:", save->last_excp_from,
3296 "excp_to:", save->last_excp_to);
3297}
3298
3299static void svm_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3300{
3301 struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control;
3302
3303 *info1 = control->exit_info_1;
3304 *info2 = control->exit_info_2;
2821} 3305}
2822 3306
2823static int handle_exit(struct kvm_vcpu *vcpu) 3307static int handle_exit(struct kvm_vcpu *vcpu)
@@ -2826,9 +3310,9 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2826 struct kvm_run *kvm_run = vcpu->run; 3310 struct kvm_run *kvm_run = vcpu->run;
2827 u32 exit_code = svm->vmcb->control.exit_code; 3311 u32 exit_code = svm->vmcb->control.exit_code;
2828 3312
2829 trace_kvm_exit(exit_code, vcpu); 3313 trace_kvm_exit(exit_code, vcpu, KVM_ISA_SVM);
2830 3314
2831 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK)) 3315 if (!is_cr_intercept(svm, INTERCEPT_CR0_WRITE))
2832 vcpu->arch.cr0 = svm->vmcb->save.cr0; 3316 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2833 if (npt_enabled) 3317 if (npt_enabled)
2834 vcpu->arch.cr3 = svm->vmcb->save.cr3; 3318 vcpu->arch.cr3 = svm->vmcb->save.cr3;
@@ -2840,7 +3324,7 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2840 return 1; 3324 return 1;
2841 } 3325 }
2842 3326
2843 if (is_nested(svm)) { 3327 if (is_guest_mode(vcpu)) {
2844 int vmexit; 3328 int vmexit;
2845 3329
2846 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code, 3330 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
@@ -2871,7 +3355,8 @@ static int handle_exit(struct kvm_vcpu *vcpu)
2871 3355
2872 if (is_external_interrupt(svm->vmcb->control.exit_int_info) && 3356 if (is_external_interrupt(svm->vmcb->control.exit_int_info) &&
2873 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR && 3357 exit_code != SVM_EXIT_EXCP_BASE + PF_VECTOR &&
2874 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH) 3358 exit_code != SVM_EXIT_NPF && exit_code != SVM_EXIT_TASK_SWITCH &&
3359 exit_code != SVM_EXIT_INTR && exit_code != SVM_EXIT_NMI)
 2875 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x " 3360 printk(KERN_ERR "%s: unexpected exit_int_info 0x%x "
2876 "exit_code 0x%x\n", 3361 "exit_code 0x%x\n",
2877 __func__, svm->vmcb->control.exit_int_info, 3362 __func__, svm->vmcb->control.exit_int_info,
@@ -2902,7 +3387,6 @@ static void pre_svm_run(struct vcpu_svm *svm)
2902 3387
2903 struct svm_cpu_data *sd = per_cpu(svm_data, cpu); 3388 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2904 3389
2905 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2906 /* FIXME: handle wraparound of asid_generation */ 3390 /* FIXME: handle wraparound of asid_generation */
2907 if (svm->asid_generation != sd->asid_generation) 3391 if (svm->asid_generation != sd->asid_generation)
2908 new_asid(svm, sd); 3392 new_asid(svm, sd);
@@ -2914,7 +3398,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2914 3398
2915 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 3399 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2916 vcpu->arch.hflags |= HF_NMI_MASK; 3400 vcpu->arch.hflags |= HF_NMI_MASK;
2917 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3401 set_intercept(svm, INTERCEPT_IRET);
2918 ++vcpu->stat.nmi_injections; 3402 ++vcpu->stat.nmi_injections;
2919} 3403}
2920 3404
@@ -2927,6 +3411,7 @@ static inline void svm_inject_irq(struct vcpu_svm *svm, int irq)
2927 control->int_ctl &= ~V_INTR_PRIO_MASK; 3411 control->int_ctl &= ~V_INTR_PRIO_MASK;
2928 control->int_ctl |= V_IRQ_MASK | 3412 control->int_ctl |= V_IRQ_MASK |
2929 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); 3413 ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT);
3414 mark_dirty(svm->vmcb, VMCB_INTR);
2930} 3415}
2931 3416
2932static void svm_set_irq(struct kvm_vcpu *vcpu) 3417static void svm_set_irq(struct kvm_vcpu *vcpu)
@@ -2946,14 +3431,14 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
2946{ 3431{
2947 struct vcpu_svm *svm = to_svm(vcpu); 3432 struct vcpu_svm *svm = to_svm(vcpu);
2948 3433
2949 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3434 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
2950 return; 3435 return;
2951 3436
2952 if (irr == -1) 3437 if (irr == -1)
2953 return; 3438 return;
2954 3439
2955 if (tpr >= irr) 3440 if (tpr >= irr)
2956 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR8_MASK; 3441 set_cr_intercept(svm, INTERCEPT_CR8_WRITE);
2957} 3442}
2958 3443
2959static int svm_nmi_allowed(struct kvm_vcpu *vcpu) 3444static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2981,10 +3466,10 @@ static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2981 3466
2982 if (masked) { 3467 if (masked) {
2983 svm->vcpu.arch.hflags |= HF_NMI_MASK; 3468 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2984 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET); 3469 set_intercept(svm, INTERCEPT_IRET);
2985 } else { 3470 } else {
2986 svm->vcpu.arch.hflags &= ~HF_NMI_MASK; 3471 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2987 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET); 3472 clr_intercept(svm, INTERCEPT_IRET);
2988 } 3473 }
2989} 3474}
2990 3475
@@ -2998,9 +3483,9 @@ static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2998 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)) 3483 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2999 return 0; 3484 return 0;
3000 3485
3001 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF); 3486 ret = !!(kvm_get_rflags(vcpu) & X86_EFLAGS_IF);
3002 3487
3003 if (is_nested(svm)) 3488 if (is_guest_mode(vcpu))
3004 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK); 3489 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
3005 3490
3006 return ret; 3491 return ret;
@@ -3046,7 +3531,12 @@ static int svm_set_tss_addr(struct kvm *kvm, unsigned int addr)
3046 3531
3047static void svm_flush_tlb(struct kvm_vcpu *vcpu) 3532static void svm_flush_tlb(struct kvm_vcpu *vcpu)
3048{ 3533{
3049 force_new_asid(vcpu); 3534 struct vcpu_svm *svm = to_svm(vcpu);
3535
3536 if (static_cpu_has(X86_FEATURE_FLUSHBYASID))
3537 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID;
3538 else
3539 svm->asid_generation--;
3050} 3540}
3051 3541
3052static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu) 3542static void svm_prepare_guest_switch(struct kvm_vcpu *vcpu)
@@ -3057,10 +3547,10 @@ static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu)
3057{ 3547{
3058 struct vcpu_svm *svm = to_svm(vcpu); 3548 struct vcpu_svm *svm = to_svm(vcpu);
3059 3549
3060 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3550 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3061 return; 3551 return;
3062 3552
3063 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR8_MASK)) { 3553 if (!is_cr_intercept(svm, INTERCEPT_CR8_WRITE)) {
3064 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; 3554 int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK;
3065 kvm_set_cr8(vcpu, cr8); 3555 kvm_set_cr8(vcpu, cr8);
3066 } 3556 }
@@ -3071,7 +3561,7 @@ static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu)
3071 struct vcpu_svm *svm = to_svm(vcpu); 3561 struct vcpu_svm *svm = to_svm(vcpu);
3072 u64 cr8; 3562 u64 cr8;
3073 3563
3074 if (is_nested(svm) && (vcpu->arch.hflags & HF_VINTR_MASK)) 3564 if (is_guest_mode(vcpu) && (vcpu->arch.hflags & HF_VINTR_MASK))
3075 return; 3565 return;
3076 3566
3077 cr8 = kvm_get_cr8(vcpu); 3567 cr8 = kvm_get_cr8(vcpu);
@@ -3088,8 +3578,15 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3088 3578
3089 svm->int3_injected = 0; 3579 svm->int3_injected = 0;
3090 3580
3091 if (svm->vcpu.arch.hflags & HF_IRET_MASK) 3581 /*
3582 * If we've made progress since setting HF_IRET_MASK, we've
3583 * executed an IRET and can allow NMI injection.
3584 */
3585 if ((svm->vcpu.arch.hflags & HF_IRET_MASK)
3586 && kvm_rip_read(&svm->vcpu) != svm->nmi_iret_rip) {
3092 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK); 3587 svm->vcpu.arch.hflags &= ~(HF_NMI_MASK | HF_IRET_MASK);
3588 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3589 }
3093 3590
3094 svm->vcpu.arch.nmi_injected = false; 3591 svm->vcpu.arch.nmi_injected = false;
3095 kvm_clear_exception_queue(&svm->vcpu); 3592 kvm_clear_exception_queue(&svm->vcpu);
@@ -3098,6 +3595,8 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3098 if (!(exitintinfo & SVM_EXITINTINFO_VALID)) 3595 if (!(exitintinfo & SVM_EXITINTINFO_VALID))
3099 return; 3596 return;
3100 3597
3598 kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
3599
3101 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; 3600 vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK;
3102 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; 3601 type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK;
3103 3602
@@ -3134,6 +3633,17 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3134 } 3633 }
3135} 3634}
3136 3635
3636static void svm_cancel_injection(struct kvm_vcpu *vcpu)
3637{
3638 struct vcpu_svm *svm = to_svm(vcpu);
3639 struct vmcb_control_area *control = &svm->vmcb->control;
3640
3641 control->exit_int_info = control->event_inj;
3642 control->exit_int_info_err = control->event_inj_err;
3643 control->event_inj = 0;
3644 svm_complete_interrupts(svm);
3645}
3646
3137#ifdef CONFIG_X86_64 3647#ifdef CONFIG_X86_64
3138#define R "r" 3648#define R "r"
3139#else 3649#else
@@ -3143,9 +3653,6 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
3143static void svm_vcpu_run(struct kvm_vcpu *vcpu) 3653static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3144{ 3654{
3145 struct vcpu_svm *svm = to_svm(vcpu); 3655 struct vcpu_svm *svm = to_svm(vcpu);
3146 u16 fs_selector;
3147 u16 gs_selector;
3148 u16 ldt_selector;
3149 3656
3150 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 3657 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
3151 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 3658 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
@@ -3162,14 +3669,7 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3162 3669
3163 sync_lapic_to_cr8(vcpu); 3670 sync_lapic_to_cr8(vcpu);
3164 3671
3165 save_host_msrs(vcpu);
3166 savesegment(fs, fs_selector);
3167 savesegment(gs, gs_selector);
3168 ldt_selector = kvm_read_ldt();
3169 svm->vmcb->save.cr2 = vcpu->arch.cr2; 3672 svm->vmcb->save.cr2 = vcpu->arch.cr2;
3170 /* required for live migration with NPT */
3171 if (npt_enabled)
3172 svm->vmcb->save.cr3 = vcpu->arch.cr3;
3173 3673
3174 clgi(); 3674 clgi();
3175 3675
@@ -3246,31 +3746,44 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3246#endif 3746#endif
3247 ); 3747 );
3248 3748
3249 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3250 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3251 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3252 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3253
3254 load_host_msrs(vcpu);
3255 loadsegment(fs, fs_selector);
3256#ifdef CONFIG_X86_64 3749#ifdef CONFIG_X86_64
3257 load_gs_index(gs_selector); 3750 wrmsrl(MSR_GS_BASE, svm->host.gs_base);
3258 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
3259#else 3751#else
3260 loadsegment(gs, gs_selector); 3752 loadsegment(fs, svm->host.fs);
3753#ifndef CONFIG_X86_32_LAZY_GS
3754 loadsegment(gs, svm->host.gs);
3755#endif
3261#endif 3756#endif
3262 kvm_load_ldt(ldt_selector);
3263 3757
3264 reload_tss(vcpu); 3758 reload_tss(vcpu);
3265 3759
3266 local_irq_disable(); 3760 local_irq_disable();
3267 3761
3762 vcpu->arch.cr2 = svm->vmcb->save.cr2;
3763 vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax;
3764 vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp;
3765 vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip;
3766
3767 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3768 kvm_before_handle_nmi(&svm->vcpu);
3769
3268 stgi(); 3770 stgi();
3269 3771
3772 /* Any pending NMI will happen here */
3773
3774 if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI))
3775 kvm_after_handle_nmi(&svm->vcpu);
3776
3270 sync_cr8_to_lapic(vcpu); 3777 sync_cr8_to_lapic(vcpu);
3271 3778
3272 svm->next_rip = 0; 3779 svm->next_rip = 0;
3273 3780
3781 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
3782
3783 /* if exit due to PF check for async PF */
3784 if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR)
3785 svm->apf_reason = kvm_read_and_reset_pf_reason();
3786
3274 if (npt_enabled) { 3787 if (npt_enabled) {
3275 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR); 3788 vcpu->arch.regs_avail &= ~(1 << VCPU_EXREG_PDPTR);
3276 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR); 3789 vcpu->arch.regs_dirty &= ~(1 << VCPU_EXREG_PDPTR);
@@ -3283,6 +3796,8 @@ static void svm_vcpu_run(struct kvm_vcpu *vcpu)
3283 if (unlikely(svm->vmcb->control.exit_code == 3796 if (unlikely(svm->vmcb->control.exit_code ==
3284 SVM_EXIT_EXCP_BASE + MC_VECTOR)) 3797 SVM_EXIT_EXCP_BASE + MC_VECTOR))
3285 svm_handle_mce(svm); 3798 svm_handle_mce(svm);
3799
3800 mark_all_clean(svm->vmcb);
3286} 3801}
3287 3802
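svm_vcpu_run() now ends with mark_all_clean(), and the mark_dirty(svm->vmcb, VMCB_*) and mark_all_dirty() calls scattered through this patch track which VMCB areas actually changed, so the CPU's VMCB-clean-bits optimization can skip reloading the rest. The helpers are not in this hunk; a sketch, assuming the series adds a clean bitmask to struct vmcb_control_area and an enum of area bits ending in VMCB_DIRTY_MAX:

	static inline void mark_all_dirty(struct vmcb *vmcb)
	{
		vmcb->control.clean = 0;		/* reload everything */
	}

	static inline void mark_all_clean(struct vmcb *vmcb)
	{
		/* Some areas are rewritten before every VMRUN and are kept
		 * permanently dirty; the exact mask is an assumption here. */
		vmcb->control.clean = ((1 << VMCB_DIRTY_MAX) - 1)
				       & ~VMCB_ALWAYS_DIRTY_MASK;
	}

	static inline void mark_dirty(struct vmcb *vmcb, int bit)
	{
		vmcb->control.clean &= ~(1 << bit);
	}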
3288#undef R 3803#undef R
@@ -3291,14 +3806,23 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3291{ 3806{
3292 struct vcpu_svm *svm = to_svm(vcpu); 3807 struct vcpu_svm *svm = to_svm(vcpu);
3293 3808
3294 if (npt_enabled) {
3295 svm->vmcb->control.nested_cr3 = root;
3296 force_new_asid(vcpu);
3297 return;
3298 }
3299
3300 svm->vmcb->save.cr3 = root; 3809 svm->vmcb->save.cr3 = root;
3301 force_new_asid(vcpu); 3810 mark_dirty(svm->vmcb, VMCB_CR);
3811 svm_flush_tlb(vcpu);
3812}
3813
3814static void set_tdp_cr3(struct kvm_vcpu *vcpu, unsigned long root)
3815{
3816 struct vcpu_svm *svm = to_svm(vcpu);
3817
3818 svm->vmcb->control.nested_cr3 = root;
3819 mark_dirty(svm->vmcb, VMCB_NPT);
3820
3821 /* Also sync guest cr3 here in case we live migrate */
3822 svm->vmcb->save.cr3 = kvm_read_cr3(vcpu);
3823 mark_dirty(svm->vmcb, VMCB_CR);
3824
3825 svm_flush_tlb(vcpu);
3302} 3826}
3303 3827
3304static int is_disabled(void) 3828static int is_disabled(void)
@@ -3333,15 +3857,6 @@ static bool svm_cpu_has_accelerated_tpr(void)
3333 return false; 3857 return false;
3334} 3858}
3335 3859
3336static int get_npt_level(void)
3337{
3338#ifdef CONFIG_X86_64
3339 return PT64_ROOT_LEVEL;
3340#else
3341 return PT32E_ROOT_LEVEL;
3342#endif
3343}
3344
3345static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio) 3860static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3346{ 3861{
3347 return 0; 3862 return 0;
@@ -3354,12 +3869,25 @@ static void svm_cpuid_update(struct kvm_vcpu *vcpu)
3354static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry) 3869static void svm_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
3355{ 3870{
3356 switch (func) { 3871 switch (func) {
3872 case 0x80000001:
3873 if (nested)
3874 entry->ecx |= (1 << 2); /* Set SVM bit */
3875 break;
3357 case 0x8000000A: 3876 case 0x8000000A:
3358 entry->eax = 1; /* SVM revision 1 */ 3877 entry->eax = 1; /* SVM revision 1 */
 3359 entry->ebx = 8; /* Let's support 8 ASIDs in case we add proper 3878
3360 ASID emulation to nested SVM */ 3879 ASID emulation to nested SVM */
3361 entry->ecx = 0; /* Reserved */ 3880 entry->ecx = 0; /* Reserved */
 3362 entry->edx = 0; /* Do not support any additional features */ 3881 entry->edx = 0; /* By default, do not support any
 3882 additional features */
3883
3884 /* Support next_rip if host supports it */
3885 if (boot_cpu_has(X86_FEATURE_NRIPS))
3886 entry->edx |= SVM_FEATURE_NRIP;
3887
3888 /* Support NPT for the guest if enabled */
3889 if (npt_enabled)
3890 entry->edx |= SVM_FEATURE_NPT;
3363 3891
3364 break; 3892 break;
3365 } 3893 }
@@ -3414,6 +3942,7 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
3414 { SVM_EXIT_WBINVD, "wbinvd" }, 3942 { SVM_EXIT_WBINVD, "wbinvd" },
3415 { SVM_EXIT_MONITOR, "monitor" }, 3943 { SVM_EXIT_MONITOR, "monitor" },
3416 { SVM_EXIT_MWAIT, "mwait" }, 3944 { SVM_EXIT_MWAIT, "mwait" },
3945 { SVM_EXIT_XSETBV, "xsetbv" },
3417 { SVM_EXIT_NPF, "npf" }, 3946 { SVM_EXIT_NPF, "npf" },
3418 { -1, NULL } 3947 { -1, NULL }
3419}; 3948};
@@ -3437,12 +3966,190 @@ static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
3437{ 3966{
3438 struct vcpu_svm *svm = to_svm(vcpu); 3967 struct vcpu_svm *svm = to_svm(vcpu);
3439 3968
3440 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR; 3969 set_exception_intercept(svm, NM_VECTOR);
3441 if (is_nested(svm))
3442 svm->nested.hsave->control.intercept_exceptions |= 1 << NM_VECTOR;
3443 update_cr0_intercept(svm); 3970 update_cr0_intercept(svm);
3444} 3971}
3445 3972
3973#define PRE_EX(exit) { .exit_code = (exit), \
3974 .stage = X86_ICPT_PRE_EXCEPT, }
3975#define POST_EX(exit) { .exit_code = (exit), \
3976 .stage = X86_ICPT_POST_EXCEPT, }
3977#define POST_MEM(exit) { .exit_code = (exit), \
3978 .stage = X86_ICPT_POST_MEMACCESS, }
3979
3980static struct __x86_intercept {
3981 u32 exit_code;
3982 enum x86_intercept_stage stage;
3983} x86_intercept_map[] = {
3984 [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0),
3985 [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0),
3986 [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0),
3987 [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0),
3988 [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0),
3989 [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0),
3990 [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0),
3991 [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ),
3992 [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ),
3993 [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE),
3994 [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE),
3995 [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ),
3996 [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ),
3997 [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE),
3998 [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE),
3999 [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN),
4000 [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL),
4001 [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD),
4002 [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE),
4003 [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI),
4004 [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI),
4005 [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT),
4006 [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA),
4007 [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP),
4008 [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR),
4009 [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT),
4010 [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG),
4011 [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD),
4012 [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD),
4013 [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR),
4014 [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC),
4015 [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR),
4016 [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC),
4017 [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID),
4018 [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM),
4019 [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE),
4020 [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF),
4021 [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF),
4022 [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT),
4023 [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET),
4024 [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP),
4025 [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT),
4026 [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO),
4027 [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO),
4028 [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO),
4029 [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO),
4030};
4031
4032#undef PRE_EX
4033#undef POST_EX
4034#undef POST_MEM
4035
4036static int svm_check_intercept(struct kvm_vcpu *vcpu,
4037 struct x86_instruction_info *info,
4038 enum x86_intercept_stage stage)
4039{
4040 struct vcpu_svm *svm = to_svm(vcpu);
4041 int vmexit, ret = X86EMUL_CONTINUE;
4042 struct __x86_intercept icpt_info;
4043 struct vmcb *vmcb = svm->vmcb;
4044
4045 if (info->intercept >= ARRAY_SIZE(x86_intercept_map))
4046 goto out;
4047
4048 icpt_info = x86_intercept_map[info->intercept];
4049
4050 if (stage != icpt_info.stage)
4051 goto out;
4052
4053 switch (icpt_info.exit_code) {
4054 case SVM_EXIT_READ_CR0:
4055 if (info->intercept == x86_intercept_cr_read)
4056 icpt_info.exit_code += info->modrm_reg;
4057 break;
4058 case SVM_EXIT_WRITE_CR0: {
4059 unsigned long cr0, val;
4060 u64 intercept;
4061
4062 if (info->intercept == x86_intercept_cr_write)
4063 icpt_info.exit_code += info->modrm_reg;
4064
4065 if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0)
4066 break;
4067
4068 intercept = svm->nested.intercept;
4069
4070 if (!(intercept & (1ULL << INTERCEPT_SELECTIVE_CR0)))
4071 break;
4072
4073 cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK;
4074 val = info->src_val & ~SVM_CR0_SELECTIVE_MASK;
4075
4076 if (info->intercept == x86_intercept_lmsw) {
4077 cr0 &= 0xfUL;
4078 val &= 0xfUL;
4079 /* lmsw can't clear PE - catch this here */
4080 if (cr0 & X86_CR0_PE)
4081 val |= X86_CR0_PE;
4082 }
4083
4084 if (cr0 ^ val)
4085 icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE;
4086
4087 break;
4088 }
4089 case SVM_EXIT_READ_DR0:
4090 case SVM_EXIT_WRITE_DR0:
4091 icpt_info.exit_code += info->modrm_reg;
4092 break;
4093 case SVM_EXIT_MSR:
4094 if (info->intercept == x86_intercept_wrmsr)
4095 vmcb->control.exit_info_1 = 1;
4096 else
4097 vmcb->control.exit_info_1 = 0;
4098 break;
4099 case SVM_EXIT_PAUSE:
4100 /*
 4101 * We get this intercept for the NOP opcode, but PAUSE
 4102 * is REP NOP, so check the REP prefix here
4103 */
4104 if (info->rep_prefix != REPE_PREFIX)
4105 goto out;
4106 case SVM_EXIT_IOIO: {
4107 u64 exit_info;
4108 u32 bytes;
4109
4110 exit_info = (vcpu->arch.regs[VCPU_REGS_RDX] & 0xffff) << 16;
4111
4112 if (info->intercept == x86_intercept_in ||
4113 info->intercept == x86_intercept_ins) {
4114 exit_info |= SVM_IOIO_TYPE_MASK;
4115 bytes = info->src_bytes;
4116 } else {
4117 bytes = info->dst_bytes;
4118 }
4119
4120 if (info->intercept == x86_intercept_outs ||
4121 info->intercept == x86_intercept_ins)
4122 exit_info |= SVM_IOIO_STR_MASK;
4123
4124 if (info->rep_prefix)
4125 exit_info |= SVM_IOIO_REP_MASK;
4126
4127 bytes = min(bytes, 4u);
4128
4129 exit_info |= bytes << SVM_IOIO_SIZE_SHIFT;
4130
4131 exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1);
4132
4133 vmcb->control.exit_info_1 = exit_info;
4134 vmcb->control.exit_info_2 = info->next_rip;
4135
4136 break;
4137 }
4138 default:
4139 break;
4140 }
4141
4142 vmcb->control.next_rip = info->next_rip;
4143 vmcb->control.exit_code = icpt_info.exit_code;
4144 vmexit = nested_svm_exit_handled(svm);
4145
4146 ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED
4147 : X86EMUL_CONTINUE;
4148
4149out:
4150 return ret;
4151}
4152
3446static struct kvm_x86_ops svm_x86_ops = { 4153static struct kvm_x86_ops svm_x86_ops = {
3447 .cpu_has_kvm_support = has_svm, 4154 .cpu_has_kvm_support = has_svm,
3448 .disabled_by_bios = is_disabled, 4155 .disabled_by_bios = is_disabled,
@@ -3470,6 +4177,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3470 .get_cpl = svm_get_cpl, 4177 .get_cpl = svm_get_cpl,
3471 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 4178 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
3472 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits, 4179 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
4180 .decache_cr3 = svm_decache_cr3,
3473 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 4181 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
3474 .set_cr0 = svm_set_cr0, 4182 .set_cr0 = svm_set_cr0,
3475 .set_cr3 = svm_set_cr3, 4183 .set_cr3 = svm_set_cr3,
@@ -3497,6 +4205,7 @@ static struct kvm_x86_ops svm_x86_ops = {
3497 .set_irq = svm_set_irq, 4205 .set_irq = svm_set_irq,
3498 .set_nmi = svm_inject_nmi, 4206 .set_nmi = svm_inject_nmi,
3499 .queue_exception = svm_queue_exception, 4207 .queue_exception = svm_queue_exception,
4208 .cancel_injection = svm_cancel_injection,
3500 .interrupt_allowed = svm_interrupt_allowed, 4209 .interrupt_allowed = svm_interrupt_allowed,
3501 .nmi_allowed = svm_nmi_allowed, 4210 .nmi_allowed = svm_nmi_allowed,
3502 .get_nmi_mask = svm_get_nmi_mask, 4211 .get_nmi_mask = svm_get_nmi_mask,
@@ -3509,7 +4218,9 @@ static struct kvm_x86_ops svm_x86_ops = {
3509 .get_tdp_level = get_npt_level, 4218 .get_tdp_level = get_npt_level,
3510 .get_mt_mask = svm_get_mt_mask, 4219 .get_mt_mask = svm_get_mt_mask,
3511 4220
4221 .get_exit_info = svm_get_exit_info,
3512 .exit_reasons_str = svm_exit_reasons_str, 4222 .exit_reasons_str = svm_exit_reasons_str,
4223
3513 .get_lpage_level = svm_get_lpage_level, 4224 .get_lpage_level = svm_get_lpage_level,
3514 4225
3515 .cpuid_update = svm_cpuid_update, 4226 .cpuid_update = svm_cpuid_update,
@@ -3519,6 +4230,15 @@ static struct kvm_x86_ops svm_x86_ops = {
3519 .set_supported_cpuid = svm_set_supported_cpuid, 4230 .set_supported_cpuid = svm_set_supported_cpuid,
3520 4231
3521 .has_wbinvd_exit = svm_has_wbinvd_exit, 4232 .has_wbinvd_exit = svm_has_wbinvd_exit,
4233
4234 .set_tsc_khz = svm_set_tsc_khz,
4235 .write_tsc_offset = svm_write_tsc_offset,
4236 .adjust_tsc_offset = svm_adjust_tsc_offset,
4237 .compute_tsc_offset = svm_compute_tsc_offset,
4238
4239 .set_tdp_cr3 = set_tdp_cr3,
4240
4241 .check_intercept = svm_check_intercept,
3522}; 4242};
3523 4243
3524static int __init svm_init(void) 4244static int __init svm_init(void)
diff --git a/arch/x86/kvm/timer.c b/arch/x86/kvm/timer.c
index e16a0dbe74d8..abd86e865be3 100644
--- a/arch/x86/kvm/timer.c
+++ b/arch/x86/kvm/timer.c
@@ -6,7 +6,7 @@
6 * 6 *
7 * timer support 7 * timer support
8 * 8 *
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * This work is licensed under the terms of the GNU GPL, version 2. See 11 * This work is licensed under the terms of the GNU GPL, version 2. See
12 * the COPYING file in the top-level directory. 12 * the COPYING file in the top-level directory.
@@ -25,7 +25,7 @@ static int __kvm_timer_fn(struct kvm_vcpu *vcpu, struct kvm_timer *ktimer)
25 25
26 /* 26 /*
27 * There is a race window between reading and incrementing, but we do 27 * There is a race window between reading and incrementing, but we do
28 * not care about potentially loosing timer events in the !reinject 28 * not care about potentially losing timer events in the !reinject
29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked 29 * case anyway. Note: KVM_REQ_PENDING_TIMER is implicitly checked
30 * in vcpu_enter_guest. 30 * in vcpu_enter_guest.
31 */ 31 */
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index a6544b8e7c0f..db932760ea82 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -62,21 +62,21 @@ TRACE_EVENT(kvm_hv_hypercall,
62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa), 62 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
63 63
64 TP_STRUCT__entry( 64 TP_STRUCT__entry(
65 __field( __u16, code )
66 __field( bool, fast )
67 __field( __u16, rep_cnt ) 65 __field( __u16, rep_cnt )
68 __field( __u16, rep_idx ) 66 __field( __u16, rep_idx )
69 __field( __u64, ingpa ) 67 __field( __u64, ingpa )
70 __field( __u64, outgpa ) 68 __field( __u64, outgpa )
69 __field( __u16, code )
70 __field( bool, fast )
71 ), 71 ),
72 72
73 TP_fast_assign( 73 TP_fast_assign(
74 __entry->code = code;
75 __entry->fast = fast;
76 __entry->rep_cnt = rep_cnt; 74 __entry->rep_cnt = rep_cnt;
77 __entry->rep_idx = rep_idx; 75 __entry->rep_idx = rep_idx;
78 __entry->ingpa = ingpa; 76 __entry->ingpa = ingpa;
79 __entry->outgpa = outgpa; 77 __entry->outgpa = outgpa;
78 __entry->code = code;
79 __entry->fast = fast;
80 ), 80 ),
81 81
82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx", 82 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
@@ -178,27 +178,36 @@ TRACE_EVENT(kvm_apic,
178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val) 178#define trace_kvm_apic_read(reg, val) trace_kvm_apic(0, reg, val)
179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val) 179#define trace_kvm_apic_write(reg, val) trace_kvm_apic(1, reg, val)
180 180
181#define KVM_ISA_VMX 1
182#define KVM_ISA_SVM 2
183
181/* 184/*
182 * Tracepoint for kvm guest exit: 185 * Tracepoint for kvm guest exit:
183 */ 186 */
184TRACE_EVENT(kvm_exit, 187TRACE_EVENT(kvm_exit,
185 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu), 188 TP_PROTO(unsigned int exit_reason, struct kvm_vcpu *vcpu, u32 isa),
186 TP_ARGS(exit_reason, vcpu), 189 TP_ARGS(exit_reason, vcpu, isa),
187 190
188 TP_STRUCT__entry( 191 TP_STRUCT__entry(
189 __field( unsigned int, exit_reason ) 192 __field( unsigned int, exit_reason )
190 __field( unsigned long, guest_rip ) 193 __field( unsigned long, guest_rip )
194 __field( u32, isa )
195 __field( u64, info1 )
196 __field( u64, info2 )
191 ), 197 ),
192 198
193 TP_fast_assign( 199 TP_fast_assign(
194 __entry->exit_reason = exit_reason; 200 __entry->exit_reason = exit_reason;
195 __entry->guest_rip = kvm_rip_read(vcpu); 201 __entry->guest_rip = kvm_rip_read(vcpu);
202 __entry->isa = isa;
203 kvm_x86_ops->get_exit_info(vcpu, &__entry->info1,
204 &__entry->info2);
196 ), 205 ),
197 206
198 TP_printk("reason %s rip 0x%lx", 207 TP_printk("reason %s rip 0x%lx info %llx %llx",
199 ftrace_print_symbols_seq(p, __entry->exit_reason, 208 ftrace_print_symbols_seq(p, __entry->exit_reason,
200 kvm_x86_ops->exit_reasons_str), 209 kvm_x86_ops->exit_reasons_str),
201 __entry->guest_rip) 210 __entry->guest_rip, __entry->info1, __entry->info2)
202); 211);
203 212
204/* 213/*
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 7bddfab12013..d48ec60ea421 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -5,7 +5,7 @@
5 * machines without emulation or binary translation. 5 * machines without emulation or binary translation.
6 * 6 *
7 * Copyright (C) 2006 Qumranet, Inc. 7 * Copyright (C) 2006 Qumranet, Inc.
8 * Copyright 2010 Red Hat, Inc. and/or its affilates. 8 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
9 * 9 *
10 * Authors: 10 * Authors:
11 * Avi Kivity <avi@qumranet.com> 11 * Avi Kivity <avi@qumranet.com>
@@ -69,6 +69,9 @@ module_param(emulate_invalid_guest_state, bool, S_IRUGO);
69static int __read_mostly vmm_exclusive = 1; 69static int __read_mostly vmm_exclusive = 1;
70module_param(vmm_exclusive, bool, S_IRUGO); 70module_param(vmm_exclusive, bool, S_IRUGO);
71 71
72static int __read_mostly yield_on_hlt = 1;
73module_param(yield_on_hlt, bool, S_IRUGO);
74
72#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \ 75#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
73 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD) 76 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
74#define KVM_GUEST_CR0_MASK \ 77#define KVM_GUEST_CR0_MASK \
@@ -90,14 +93,14 @@ module_param(vmm_exclusive, bool, S_IRUGO);
90 * These 2 parameters are used to config the controls for Pause-Loop Exiting: 93 * These 2 parameters are used to config the controls for Pause-Loop Exiting:
91 * ple_gap: upper bound on the amount of time between two successive 94 * ple_gap: upper bound on the amount of time between two successive
92 * executions of PAUSE in a loop. Also indicate if ple enabled. 95 * executions of PAUSE in a loop. Also indicate if ple enabled.
93 * According to test, this time is usually small than 41 cycles. 96 * According to test, this time is usually smaller than 128 cycles.
94 * ple_window: upper bound on the amount of time a guest is allowed to execute 97 * ple_window: upper bound on the amount of time a guest is allowed to execute
95 * in a PAUSE loop. Tests indicate that most spinlocks are held for 98 * in a PAUSE loop. Tests indicate that most spinlocks are held for
96 * less than 2^12 cycles 99 * less than 2^12 cycles
97 * Time is measured based on a counter that runs at the same rate as the TSC, 100 * Time is measured based on a counter that runs at the same rate as the TSC,
98 * refer SDM volume 3b section 21.6.13 & 22.1.3. 101 * refer SDM volume 3b section 21.6.13 & 22.1.3.
99 */ 102 */
100#define KVM_VMX_DEFAULT_PLE_GAP 41 103#define KVM_VMX_DEFAULT_PLE_GAP 128
101#define KVM_VMX_DEFAULT_PLE_WINDOW 4096 104#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
102static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP; 105static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
103module_param(ple_gap, int, S_IRUGO); 106module_param(ple_gap, int, S_IRUGO);
@@ -125,7 +128,11 @@ struct vcpu_vmx {
125 unsigned long host_rsp; 128 unsigned long host_rsp;
126 int launched; 129 int launched;
127 u8 fail; 130 u8 fail;
131 u8 cpl;
132 bool nmi_known_unmasked;
133 u32 exit_intr_info;
128 u32 idt_vectoring_info; 134 u32 idt_vectoring_info;
135 ulong rflags;
129 struct shared_msr_entry *guest_msrs; 136 struct shared_msr_entry *guest_msrs;
130 int nmsrs; 137 int nmsrs;
131 int save_nmsrs; 138 int save_nmsrs;
@@ -154,12 +161,11 @@ struct vcpu_vmx {
154 u32 limit; 161 u32 limit;
155 u32 ar; 162 u32 ar;
156 } tr, es, ds, fs, gs; 163 } tr, es, ds, fs, gs;
157 struct {
158 bool pending;
159 u8 vector;
160 unsigned rip;
161 } irq;
162 } rmode; 164 } rmode;
165 struct {
166 u32 bitmask; /* 4 bits per segment (1 bit per field) */
167 struct kvm_save_segment seg[8];
168 } segment_cache;
163 int vpid; 169 int vpid;
164 bool emulation_required; 170 bool emulation_required;
165 171
@@ -172,15 +178,25 @@ struct vcpu_vmx {
172 bool rdtscp_enabled; 178 bool rdtscp_enabled;
173}; 179};
174 180
181enum segment_cache_field {
182 SEG_FIELD_SEL = 0,
183 SEG_FIELD_BASE = 1,
184 SEG_FIELD_LIMIT = 2,
185 SEG_FIELD_AR = 3,
186
187 SEG_FIELD_NR = 4
188};
189
175static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 190static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
176{ 191{
177 return container_of(vcpu, struct vcpu_vmx, vcpu); 192 return container_of(vcpu, struct vcpu_vmx, vcpu);
178} 193}
179 194
180static int init_rmode(struct kvm *kvm);
181static u64 construct_eptp(unsigned long root_hpa); 195static u64 construct_eptp(unsigned long root_hpa);
182static void kvm_cpu_vmxon(u64 addr); 196static void kvm_cpu_vmxon(u64 addr);
183static void kvm_cpu_vmxoff(void); 197static void kvm_cpu_vmxoff(void);
198static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3);
199static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr);
184 200
185static DEFINE_PER_CPU(struct vmcs *, vmxarea); 201static DEFINE_PER_CPU(struct vmcs *, vmxarea);
186static DEFINE_PER_CPU(struct vmcs *, current_vmcs); 202static DEFINE_PER_CPU(struct vmcs *, current_vmcs);
@@ -192,6 +208,8 @@ static unsigned long *vmx_io_bitmap_b;
192static unsigned long *vmx_msr_bitmap_legacy; 208static unsigned long *vmx_msr_bitmap_legacy;
193static unsigned long *vmx_msr_bitmap_longmode; 209static unsigned long *vmx_msr_bitmap_longmode;
194 210
211static bool cpu_has_load_ia32_efer;
212
195static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS); 213static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
196static DEFINE_SPINLOCK(vmx_vpid_lock); 214static DEFINE_SPINLOCK(vmx_vpid_lock);
197 215
@@ -476,7 +494,7 @@ static void vmcs_clear(struct vmcs *vmcs)
476 u8 error; 494 u8 error;
477 495
478 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0" 496 asm volatile (__ex(ASM_VMX_VMCLEAR_RAX) "; setna %0"
479 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 497 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
480 : "cc", "memory"); 498 : "cc", "memory");
481 if (error) 499 if (error)
482 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n", 500 printk(KERN_ERR "kvm: vmclear fail: %p/%llx\n",
@@ -489,7 +507,7 @@ static void vmcs_load(struct vmcs *vmcs)
489 u8 error; 507 u8 error;
490 508
491 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0" 509 asm volatile (__ex(ASM_VMX_VMPTRLD_RAX) "; setna %0"
492 : "=g"(error) : "a"(&phys_addr), "m"(phys_addr) 510 : "=qm"(error) : "a"(&phys_addr), "m"(phys_addr)
493 : "cc", "memory"); 511 : "cc", "memory");
494 if (error) 512 if (error)
495 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n", 513 printk(KERN_ERR "kvm: vmptrld %p/%llx fail\n",
@@ -505,7 +523,6 @@ static void __vcpu_clear(void *arg)
505 vmcs_clear(vmx->vmcs); 523 vmcs_clear(vmx->vmcs);
506 if (per_cpu(current_vmcs, cpu) == vmx->vmcs) 524 if (per_cpu(current_vmcs, cpu) == vmx->vmcs)
507 per_cpu(current_vmcs, cpu) = NULL; 525 per_cpu(current_vmcs, cpu) = NULL;
508 rdtscll(vmx->vcpu.arch.host_tsc);
509 list_del(&vmx->local_vcpus_link); 526 list_del(&vmx->local_vcpus_link);
510 vmx->vcpu.cpu = -1; 527 vmx->vcpu.cpu = -1;
511 vmx->launched = 0; 528 vmx->launched = 0;
@@ -570,10 +587,10 @@ static inline void ept_sync_individual_addr(u64 eptp, gpa_t gpa)
570 587
571static unsigned long vmcs_readl(unsigned long field) 588static unsigned long vmcs_readl(unsigned long field)
572{ 589{
573 unsigned long value; 590 unsigned long value = 0;
574 591
575 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX) 592 asm volatile (__ex(ASM_VMX_VMREAD_RDX_RAX)
576 : "=a"(value) : "d"(field) : "cc"); 593 : "+a"(value) : "d"(field) : "cc");
577 return value; 594 return value;
578} 595}
579 596
@@ -642,6 +659,62 @@ static void vmcs_set_bits(unsigned long field, u32 mask)
642 vmcs_writel(field, vmcs_readl(field) | mask); 659 vmcs_writel(field, vmcs_readl(field) | mask);
643} 660}
644 661
662static void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
663{
664 vmx->segment_cache.bitmask = 0;
665}
666
667static bool vmx_segment_cache_test_set(struct vcpu_vmx *vmx, unsigned seg,
668 unsigned field)
669{
670 bool ret;
671 u32 mask = 1 << (seg * SEG_FIELD_NR + field);
672
673 if (!(vmx->vcpu.arch.regs_avail & (1 << VCPU_EXREG_SEGMENTS))) {
674 vmx->vcpu.arch.regs_avail |= (1 << VCPU_EXREG_SEGMENTS);
675 vmx->segment_cache.bitmask = 0;
676 }
677 ret = vmx->segment_cache.bitmask & mask;
678 vmx->segment_cache.bitmask |= mask;
679 return ret;
680}
681
682static u16 vmx_read_guest_seg_selector(struct vcpu_vmx *vmx, unsigned seg)
683{
684 u16 *p = &vmx->segment_cache.seg[seg].selector;
685
686 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_SEL))
687 *p = vmcs_read16(kvm_vmx_segment_fields[seg].selector);
688 return *p;
689}
690
691static ulong vmx_read_guest_seg_base(struct vcpu_vmx *vmx, unsigned seg)
692{
693 ulong *p = &vmx->segment_cache.seg[seg].base;
694
695 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_BASE))
696 *p = vmcs_readl(kvm_vmx_segment_fields[seg].base);
697 return *p;
698}
699
700static u32 vmx_read_guest_seg_limit(struct vcpu_vmx *vmx, unsigned seg)
701{
702 u32 *p = &vmx->segment_cache.seg[seg].limit;
703
704 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_LIMIT))
705 *p = vmcs_read32(kvm_vmx_segment_fields[seg].limit);
706 return *p;
707}
708
709static u32 vmx_read_guest_seg_ar(struct vcpu_vmx *vmx, unsigned seg)
710{
711 u32 *p = &vmx->segment_cache.seg[seg].ar;
712
713 if (!vmx_segment_cache_test_set(vmx, seg, SEG_FIELD_AR))
714 *p = vmcs_read32(kvm_vmx_segment_fields[seg].ar_bytes);
715 return *p;
716}
717
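The accessors above form a small read-through cache for guest segment state: vmx_segment_cache_test_set() keeps one valid bit per segment field in segment_cache.bitmask, and the VCPU_EXREG_SEGMENTS bit in regs_avail invalidates the whole cache after a VM exit. The sketch below uses a hypothetical helper name purely to illustrate the contract the rest of the patch follows: any VMWRITE to a guest segment field is preceded by vmx_segment_cache_clear(), exactly as vmx_set_segment(), enter_pmode() and enter_rmode() do later in this diff.

/* Hypothetical helper, for illustration only. Writing a segment field
 * without clearing the cache would let a later cached read return the
 * stale pre-write value. */
static void example_set_guest_cs_base(struct vcpu_vmx *vmx, unsigned long base)
{
	vmx_segment_cache_clear(vmx);		/* drop all cached fields */
	vmcs_writel(GUEST_CS_BASE, base);	/* update the VMCS */
	/* the next vmx_read_guest_seg_base(vmx, VCPU_SREG_CS) re-reads the VMCS */
}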
645static void update_exception_bitmap(struct kvm_vcpu *vcpu) 718static void update_exception_bitmap(struct kvm_vcpu *vcpu)
646{ 719{
647 u32 eb; 720 u32 eb;
@@ -666,6 +739,12 @@ static void clear_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr)
666 unsigned i; 739 unsigned i;
667 struct msr_autoload *m = &vmx->msr_autoload; 740 struct msr_autoload *m = &vmx->msr_autoload;
668 741
742 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
743 vmcs_clear_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
744 vmcs_clear_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
745 return;
746 }
747
669 for (i = 0; i < m->nr; ++i) 748 for (i = 0; i < m->nr; ++i)
670 if (m->guest[i].index == msr) 749 if (m->guest[i].index == msr)
671 break; 750 break;
@@ -685,6 +764,14 @@ static void add_atomic_switch_msr(struct vcpu_vmx *vmx, unsigned msr,
685 unsigned i; 764 unsigned i;
686 struct msr_autoload *m = &vmx->msr_autoload; 765 struct msr_autoload *m = &vmx->msr_autoload;
687 766
767 if (msr == MSR_EFER && cpu_has_load_ia32_efer) {
768 vmcs_write64(GUEST_IA32_EFER, guest_val);
769 vmcs_write64(HOST_IA32_EFER, host_val);
770 vmcs_set_bits(VM_ENTRY_CONTROLS, VM_ENTRY_LOAD_IA32_EFER);
771 vmcs_set_bits(VM_EXIT_CONTROLS, VM_EXIT_LOAD_IA32_EFER);
772 return;
773 }
774
688 for (i = 0; i < m->nr; ++i) 775 for (i = 0; i < m->nr; ++i)
689 if (m->guest[i].index == msr) 776 if (m->guest[i].index == msr)
690 break; 777 break;
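Both autoload helpers now special-case EFER: when the capability check added further down (cpu_has_load_ia32_efer, computed in setup_vmcs_config()) shows that the VM-entry/VM-exit "load IA32_EFER" controls are available, EFER is switched through the dedicated GUEST_IA32_EFER/HOST_IA32_EFER fields instead of the slower MSR autoload arrays. A hedged sketch of a caller follows; the helper name is illustrative, and the actual user in this file is update_transition_efer().

/* Sketch only: switch EFER across VM entry/exit. With
 * cpu_has_load_ia32_efer the values land in dedicated VMCS fields,
 * otherwise they go through the msr_autoload arrays. */
static void example_switch_efer(struct vcpu_vmx *vmx, u64 guest_efer, u64 host_efer)
{
	if (guest_efer != host_efer)
		add_atomic_switch_msr(vmx, MSR_EFER, guest_efer, host_efer);
	else
		clear_atomic_switch_msr(vmx, MSR_EFER);
}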
@@ -706,11 +793,10 @@ static void reload_tss(void)
706 /* 793 /*
707 * VT restores TR but not its size. Useless. 794 * VT restores TR but not its size. Useless.
708 */ 795 */
709 struct desc_ptr gdt; 796 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
710 struct desc_struct *descs; 797 struct desc_struct *descs;
711 798
712 native_store_gdt(&gdt); 799 descs = (void *)gdt->address;
713 descs = (void *)gdt.address;
714 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */ 800 descs[GDT_ENTRY_TSS].type = 9; /* available TSS */
715 load_TR_desc(); 801 load_TR_desc();
716} 802}
@@ -753,7 +839,7 @@ static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
753 839
754static unsigned long segment_base(u16 selector) 840static unsigned long segment_base(u16 selector)
755{ 841{
756 struct desc_ptr gdt; 842 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
757 struct desc_struct *d; 843 struct desc_struct *d;
758 unsigned long table_base; 844 unsigned long table_base;
759 unsigned long v; 845 unsigned long v;
@@ -761,8 +847,7 @@ static unsigned long segment_base(u16 selector)
761 if (!(selector & ~3)) 847 if (!(selector & ~3))
762 return 0; 848 return 0;
763 849
764 native_store_gdt(&gdt); 850 table_base = gdt->address;
765 table_base = gdt.address;
766 851
767 if (selector & 4) { /* from ldt */ 852 if (selector & 4) { /* from ldt */
768 u16 ldt_selector = kvm_read_ldt(); 853 u16 ldt_selector = kvm_read_ldt();
@@ -828,10 +913,9 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
828#endif 913#endif
829 914
830#ifdef CONFIG_X86_64 915#ifdef CONFIG_X86_64
831 if (is_long_mode(&vmx->vcpu)) { 916 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
832 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base); 917 if (is_long_mode(&vmx->vcpu))
833 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base); 918 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
834 }
835#endif 919#endif
836 for (i = 0; i < vmx->save_nmsrs; ++i) 920 for (i = 0; i < vmx->save_nmsrs; ++i)
837 kvm_set_shared_msr(vmx->guest_msrs[i].index, 921 kvm_set_shared_msr(vmx->guest_msrs[i].index,
@@ -846,23 +930,23 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
846 930
847 ++vmx->vcpu.stat.host_state_reload; 931 ++vmx->vcpu.stat.host_state_reload;
848 vmx->host_state.loaded = 0; 932 vmx->host_state.loaded = 0;
849 if (vmx->host_state.fs_reload_needed) 933#ifdef CONFIG_X86_64
850 loadsegment(fs, vmx->host_state.fs_sel); 934 if (is_long_mode(&vmx->vcpu))
935 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
936#endif
851 if (vmx->host_state.gs_ldt_reload_needed) { 937 if (vmx->host_state.gs_ldt_reload_needed) {
852 kvm_load_ldt(vmx->host_state.ldt_sel); 938 kvm_load_ldt(vmx->host_state.ldt_sel);
853#ifdef CONFIG_X86_64 939#ifdef CONFIG_X86_64
854 load_gs_index(vmx->host_state.gs_sel); 940 load_gs_index(vmx->host_state.gs_sel);
855 wrmsrl(MSR_KERNEL_GS_BASE, current->thread.gs);
856#else 941#else
857 loadsegment(gs, vmx->host_state.gs_sel); 942 loadsegment(gs, vmx->host_state.gs_sel);
858#endif 943#endif
859 } 944 }
945 if (vmx->host_state.fs_reload_needed)
946 loadsegment(fs, vmx->host_state.fs_sel);
860 reload_tss(); 947 reload_tss();
861#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
862 if (is_long_mode(&vmx->vcpu)) { 949 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
863 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
864 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
865 }
866#endif 950#endif
867 if (current_thread_info()->status & TS_USEDFPU) 951 if (current_thread_info()->status & TS_USEDFPU)
868 clts(); 952 clts();
@@ -883,7 +967,6 @@ static void vmx_load_host_state(struct vcpu_vmx *vmx)
883static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 967static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
884{ 968{
885 struct vcpu_vmx *vmx = to_vmx(vcpu); 969 struct vcpu_vmx *vmx = to_vmx(vcpu);
886 u64 tsc_this, delta, new_offset;
887 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 970 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
888 971
889 if (!vmm_exclusive) 972 if (!vmm_exclusive)
@@ -897,37 +980,24 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
897 } 980 }
898 981
899 if (vcpu->cpu != cpu) { 982 if (vcpu->cpu != cpu) {
900 struct desc_ptr dt; 983 struct desc_ptr *gdt = &__get_cpu_var(host_gdt);
901 unsigned long sysenter_esp; 984 unsigned long sysenter_esp;
902 985
903 kvm_migrate_timers(vcpu);
904 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu); 986 kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
905 local_irq_disable(); 987 local_irq_disable();
906 list_add(&vmx->local_vcpus_link, 988 list_add(&vmx->local_vcpus_link,
907 &per_cpu(vcpus_on_cpu, cpu)); 989 &per_cpu(vcpus_on_cpu, cpu));
908 local_irq_enable(); 990 local_irq_enable();
909 991
910 vcpu->cpu = cpu;
911 /* 992 /*
912 * Linux uses per-cpu TSS and GDT, so set these when switching 993 * Linux uses per-cpu TSS and GDT, so set these when switching
913 * processors. 994 * processors.
914 */ 995 */
915 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */ 996 vmcs_writel(HOST_TR_BASE, kvm_read_tr_base()); /* 22.2.4 */
916 native_store_gdt(&dt); 997 vmcs_writel(HOST_GDTR_BASE, gdt->address); /* 22.2.4 */
917 vmcs_writel(HOST_GDTR_BASE, dt.address); /* 22.2.4 */
918 998
919 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp); 999 rdmsrl(MSR_IA32_SYSENTER_ESP, sysenter_esp);
920 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */ 1000 vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
921
922 /*
923 * Make sure the time stamp counter is monotonous.
924 */
925 rdtscll(tsc_this);
926 if (tsc_this < vcpu->arch.host_tsc) {
927 delta = vcpu->arch.host_tsc - tsc_this;
928 new_offset = vmcs_read64(TSC_OFFSET) + delta;
929 vmcs_write64(TSC_OFFSET, new_offset);
930 }
931 } 1001 }
932} 1002}
933 1003
@@ -972,17 +1042,24 @@ static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
972{ 1042{
973 unsigned long rflags, save_rflags; 1043 unsigned long rflags, save_rflags;
974 1044
975 rflags = vmcs_readl(GUEST_RFLAGS); 1045 if (!test_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail)) {
976 if (to_vmx(vcpu)->rmode.vm86_active) { 1046 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
977 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS; 1047 rflags = vmcs_readl(GUEST_RFLAGS);
978 save_rflags = to_vmx(vcpu)->rmode.save_rflags; 1048 if (to_vmx(vcpu)->rmode.vm86_active) {
979 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS; 1049 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1050 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
1051 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1052 }
1053 to_vmx(vcpu)->rflags = rflags;
980 } 1054 }
981 return rflags; 1055 return to_vmx(vcpu)->rflags;
982} 1056}
983 1057
984static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 1058static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
985{ 1059{
1060 __set_bit(VCPU_EXREG_RFLAGS, (ulong *)&vcpu->arch.regs_avail);
1061 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1062 to_vmx(vcpu)->rflags = rflags;
986 if (to_vmx(vcpu)->rmode.vm86_active) { 1063 if (to_vmx(vcpu)->rmode.vm86_active) {
987 to_vmx(vcpu)->rmode.save_rflags = rflags; 1064 to_vmx(vcpu)->rmode.save_rflags = rflags;
988 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1065 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
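vmx_get_rflags() now caches GUEST_RFLAGS behind the VCPU_EXREG_RFLAGS availability bit, so only the first read after the bit is cleared pays for a VMREAD, and vmx_set_rflags() updates the cached copy before touching the VMCS. A minimal, hypothetical consumer of that behaviour:

/* Hypothetical caller: repeated RFLAGS queries within one exit hit the
 * cached value in vmx->rflags instead of issuing more VMREADs. */
static bool example_guest_if_enabled(struct kvm_vcpu *vcpu)
{
	return (vmx_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
}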
@@ -1031,6 +1108,17 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
1031 vmx_set_interrupt_shadow(vcpu, 0); 1108 vmx_set_interrupt_shadow(vcpu, 0);
1032} 1109}
1033 1110
1111static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1112{
1113 /* Ensure that we clear the HLT state in the VMCS. We don't need to
1114 * explicitly skip the instruction because if the HLT state is set, then
1115 * the instruction is already executing and RIP has already been
1116 * advanced. */
1117 if (!yield_on_hlt &&
1118 vmcs_read32(GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1119 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1120}
1121
1034static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 1122static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1035 bool has_error_code, u32 error_code, 1123 bool has_error_code, u32 error_code,
1036 bool reinject) 1124 bool reinject)
@@ -1044,16 +1132,11 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1044 } 1132 }
1045 1133
1046 if (vmx->rmode.vm86_active) { 1134 if (vmx->rmode.vm86_active) {
1047 vmx->rmode.irq.pending = true; 1135 int inc_eip = 0;
1048 vmx->rmode.irq.vector = nr;
1049 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
1050 if (kvm_exception_is_soft(nr)) 1136 if (kvm_exception_is_soft(nr))
1051 vmx->rmode.irq.rip += 1137 inc_eip = vcpu->arch.event_exit_inst_len;
1052 vmx->vcpu.arch.event_exit_inst_len; 1138 if (kvm_inject_realmode_interrupt(vcpu, nr, inc_eip) != EMULATE_DONE)
1053 intr_info |= INTR_TYPE_SOFT_INTR; 1139 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
1054 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1055 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
1056 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
1057 return; 1140 return;
1058 } 1141 }
1059 1142
@@ -1065,6 +1148,7 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
1065 intr_info |= INTR_TYPE_HARD_EXCEPTION; 1148 intr_info |= INTR_TYPE_HARD_EXCEPTION;
1066 1149
1067 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 1150 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
1151 vmx_clear_hlt(vcpu);
1068} 1152}
1069 1153
1070static bool vmx_rdtscp_supported(void) 1154static bool vmx_rdtscp_supported(void)
@@ -1149,12 +1233,32 @@ static u64 guest_read_tsc(void)
1149} 1233}
1150 1234
1151/* 1235/*
1152 * writes 'guest_tsc' into guest's timestamp counter "register" 1236 * Empty call-back. Needs to be implemented when VMX enables the SET_TSC_KHZ
1153 * guest_tsc = host_tsc + tsc_offset ==> tsc_offset = guest_tsc - host_tsc 1237 * ioctl. In this case the call-back should update internal vmx state to make
1238 * the changes effective.
1239 */
1240static void vmx_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz)
1241{
1242 /* Nothing to do here */
1243}
1244
1245/*
1246 * writes 'offset' into guest's timestamp counter offset register
1154 */ 1247 */
1155static void guest_write_tsc(u64 guest_tsc, u64 host_tsc) 1248static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu, u64 offset)
1156{ 1249{
1157 vmcs_write64(TSC_OFFSET, guest_tsc - host_tsc); 1250 vmcs_write64(TSC_OFFSET, offset);
1251}
1252
1253static void vmx_adjust_tsc_offset(struct kvm_vcpu *vcpu, s64 adjustment)
1254{
1255 u64 offset = vmcs_read64(TSC_OFFSET);
1256 vmcs_write64(TSC_OFFSET, offset + adjustment);
1257}
1258
1259static u64 vmx_compute_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc)
1260{
1261 return target_tsc - native_read_tsc();
1158} 1262}
1159 1263
1160/* 1264/*
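The removed guest_write_tsc() is replaced by a set of callbacks that the generic x86 code drives. The sketch below shows how they compose, under the assumption that kvm_write_tsc() in arch/x86/kvm/x86.c is what ties compute_tsc_offset and write_tsc_offset together; the real generic code also handles cross-vcpu TSC matching, which is omitted here.

/* Illustrative composition of the new callbacks; not the real
 * kvm_write_tsc() body. */
static void example_write_guest_tsc(struct kvm_vcpu *vcpu, u64 guest_tsc)
{
	/* offset = desired guest TSC value minus the current host TSC */
	u64 offset = vmx_compute_tsc_offset(vcpu, guest_tsc);

	vmx_write_tsc_offset(vcpu, offset);	/* programs the TSC_OFFSET field */
}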
@@ -1227,7 +1331,6 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1227{ 1331{
1228 struct vcpu_vmx *vmx = to_vmx(vcpu); 1332 struct vcpu_vmx *vmx = to_vmx(vcpu);
1229 struct shared_msr_entry *msr; 1333 struct shared_msr_entry *msr;
1230 u64 host_tsc;
1231 int ret = 0; 1334 int ret = 0;
1232 1335
1233 switch (msr_index) { 1336 switch (msr_index) {
@@ -1237,9 +1340,11 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1237 break; 1340 break;
1238#ifdef CONFIG_X86_64 1341#ifdef CONFIG_X86_64
1239 case MSR_FS_BASE: 1342 case MSR_FS_BASE:
1343 vmx_segment_cache_clear(vmx);
1240 vmcs_writel(GUEST_FS_BASE, data); 1344 vmcs_writel(GUEST_FS_BASE, data);
1241 break; 1345 break;
1242 case MSR_GS_BASE: 1346 case MSR_GS_BASE:
1347 vmx_segment_cache_clear(vmx);
1243 vmcs_writel(GUEST_GS_BASE, data); 1348 vmcs_writel(GUEST_GS_BASE, data);
1244 break; 1349 break;
1245 case MSR_KERNEL_GS_BASE: 1350 case MSR_KERNEL_GS_BASE:
@@ -1257,8 +1362,7 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1257 vmcs_writel(GUEST_SYSENTER_ESP, data); 1362 vmcs_writel(GUEST_SYSENTER_ESP, data);
1258 break; 1363 break;
1259 case MSR_IA32_TSC: 1364 case MSR_IA32_TSC:
1260 rdtscll(host_tsc); 1365 kvm_write_tsc(vcpu, data);
1261 guest_write_tsc(data, host_tsc);
1262 break; 1366 break;
1263 case MSR_IA32_CR_PAT: 1367 case MSR_IA32_CR_PAT:
1264 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) { 1368 if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT) {
@@ -1328,16 +1432,25 @@ static __init int vmx_disabled_by_bios(void)
1328 1432
1329 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); 1433 rdmsrl(MSR_IA32_FEATURE_CONTROL, msr);
1330 if (msr & FEATURE_CONTROL_LOCKED) { 1434 if (msr & FEATURE_CONTROL_LOCKED) {
1435 /* launched w/ TXT and VMX disabled */
1331 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX) 1436 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1332 && tboot_enabled()) 1437 && tboot_enabled())
1333 return 1; 1438 return 1;
1439 /* launched w/o TXT and VMX only enabled w/ TXT */
1440 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1441 && (msr & FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX)
1442 && !tboot_enabled()) {
1443 printk(KERN_WARNING "kvm: disable TXT in the BIOS or "
1444 "activate TXT before enabling KVM\n");
1445 return 1;
1446 }
1447 /* launched w/o TXT and VMX disabled */
1334 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX) 1448 if (!(msr & FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX)
1335 && !tboot_enabled()) 1449 && !tboot_enabled())
1336 return 1; 1450 return 1;
1337 } 1451 }
1338 1452
1339 return 0; 1453 return 0;
1340 /* locked but not enabled */
1341} 1454}
1342 1455
1343static void kvm_cpu_vmxon(u64 addr) 1456static void kvm_cpu_vmxon(u64 addr)
@@ -1427,6 +1540,14 @@ static __init int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt,
1427 return 0; 1540 return 0;
1428} 1541}
1429 1542
1543static __init bool allow_1_setting(u32 msr, u32 ctl)
1544{
1545 u32 vmx_msr_low, vmx_msr_high;
1546
1547 rdmsr(msr, vmx_msr_low, vmx_msr_high);
1548 return vmx_msr_high & ctl;
1549}
1550
1430static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf) 1551static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1431{ 1552{
1432 u32 vmx_msr_low, vmx_msr_high; 1553 u32 vmx_msr_low, vmx_msr_high;
@@ -1443,7 +1564,7 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1443 &_pin_based_exec_control) < 0) 1564 &_pin_based_exec_control) < 0)
1444 return -EIO; 1565 return -EIO;
1445 1566
1446 min = CPU_BASED_HLT_EXITING | 1567 min =
1447#ifdef CONFIG_X86_64 1568#ifdef CONFIG_X86_64
1448 CPU_BASED_CR8_LOAD_EXITING | 1569 CPU_BASED_CR8_LOAD_EXITING |
1449 CPU_BASED_CR8_STORE_EXITING | 1570 CPU_BASED_CR8_STORE_EXITING |
@@ -1456,6 +1577,10 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1456 CPU_BASED_MWAIT_EXITING | 1577 CPU_BASED_MWAIT_EXITING |
1457 CPU_BASED_MONITOR_EXITING | 1578 CPU_BASED_MONITOR_EXITING |
1458 CPU_BASED_INVLPG_EXITING; 1579 CPU_BASED_INVLPG_EXITING;
1580
1581 if (yield_on_hlt)
1582 min |= CPU_BASED_HLT_EXITING;
1583
1459 opt = CPU_BASED_TPR_SHADOW | 1584 opt = CPU_BASED_TPR_SHADOW |
1460 CPU_BASED_USE_MSR_BITMAPS | 1585 CPU_BASED_USE_MSR_BITMAPS |
1461 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS; 1586 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
@@ -1537,6 +1662,12 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1537 vmcs_conf->vmexit_ctrl = _vmexit_control; 1662 vmcs_conf->vmexit_ctrl = _vmexit_control;
1538 vmcs_conf->vmentry_ctrl = _vmentry_control; 1663 vmcs_conf->vmentry_ctrl = _vmentry_control;
1539 1664
1665 cpu_has_load_ia32_efer =
1666 allow_1_setting(MSR_IA32_VMX_ENTRY_CTLS,
1667 VM_ENTRY_LOAD_IA32_EFER)
1668 && allow_1_setting(MSR_IA32_VMX_EXIT_CTLS,
1669 VM_EXIT_LOAD_IA32_EFER);
1670
1540 return 0; 1671 return 0;
1541} 1672}
1542 1673
@@ -1657,6 +1788,9 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1657 vmx->emulation_required = 1; 1788 vmx->emulation_required = 1;
1658 vmx->rmode.vm86_active = 0; 1789 vmx->rmode.vm86_active = 0;
1659 1790
1791 vmx_segment_cache_clear(vmx);
1792
1793 vmcs_write16(GUEST_TR_SELECTOR, vmx->rmode.tr.selector);
1660 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base); 1794 vmcs_writel(GUEST_TR_BASE, vmx->rmode.tr.base);
1661 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit); 1795 vmcs_write32(GUEST_TR_LIMIT, vmx->rmode.tr.limit);
1662 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1796 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
@@ -1679,6 +1813,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1679 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs); 1813 fix_pmode_dataseg(VCPU_SREG_GS, &vmx->rmode.gs);
1680 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs); 1814 fix_pmode_dataseg(VCPU_SREG_FS, &vmx->rmode.fs);
1681 1815
1816 vmx_segment_cache_clear(vmx);
1817
1682 vmcs_write16(GUEST_SS_SELECTOR, 0); 1818 vmcs_write16(GUEST_SS_SELECTOR, 0);
1683 vmcs_write32(GUEST_SS_AR_BYTES, 0x93); 1819 vmcs_write32(GUEST_SS_AR_BYTES, 0x93);
1684 1820
@@ -1710,9 +1846,13 @@ static void fix_rmode_seg(int seg, struct kvm_save_segment *save)
1710 save->limit = vmcs_read32(sf->limit); 1846 save->limit = vmcs_read32(sf->limit);
1711 save->ar = vmcs_read32(sf->ar_bytes); 1847 save->ar = vmcs_read32(sf->ar_bytes);
1712 vmcs_write16(sf->selector, save->base >> 4); 1848 vmcs_write16(sf->selector, save->base >> 4);
1713 vmcs_write32(sf->base, save->base & 0xfffff); 1849 vmcs_write32(sf->base, save->base & 0xffff0);
1714 vmcs_write32(sf->limit, 0xffff); 1850 vmcs_write32(sf->limit, 0xffff);
1715 vmcs_write32(sf->ar_bytes, 0xf3); 1851 vmcs_write32(sf->ar_bytes, 0xf3);
1852 if (save->base & 0xf)
1853 printk_once(KERN_WARNING "kvm: segment base is not paragraph"
1854 " aligned when entering protected mode (seg=%d)",
1855 seg);
1716} 1856}
1717 1857
1718static void enter_rmode(struct kvm_vcpu *vcpu) 1858static void enter_rmode(struct kvm_vcpu *vcpu)
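The mask change in fix_rmode_seg() from 0xfffff to 0xffff0, together with the new printk_once(), reflects that a real-mode selector can only encode base >> 4: the low nibble of a non-paragraph-aligned base cannot survive the selector round trip. A small worked example of that arithmetic (illustrative values only):

/* Worked example: a base that is not 16-byte aligned loses its low
 * nibble when squeezed into a real-mode selector. */
static void example_rmode_base_roundtrip(void)
{
	unsigned long base = 0x12345;			/* not paragraph aligned */
	u16 selector = base >> 4;			/* 0x1234 */
	unsigned long reloaded = (unsigned long)selector << 4;

	/* reloaded == 0x12340 == (base & 0xffff0); the lost 0x5 is what
	 * the new warning above reports */
	WARN_ON(reloaded != (base & 0xffff0));
}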
@@ -1726,6 +1866,21 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1726 vmx->emulation_required = 1; 1866 vmx->emulation_required = 1;
1727 vmx->rmode.vm86_active = 1; 1867 vmx->rmode.vm86_active = 1;
1728 1868
1869 /*
1870 * Very old userspace does not call KVM_SET_TSS_ADDR before entering
1871 * vcpu. Call it here with phys address pointing 16M below 4G.
1872 */
1873 if (!vcpu->kvm->arch.tss_addr) {
1874 printk_once(KERN_WARNING "kvm: KVM_SET_TSS_ADDR need to be "
1875 "called before entering vcpu\n");
1876 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
1877 vmx_set_tss_addr(vcpu->kvm, 0xfeffd000);
1878 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
1879 }
1880
1881 vmx_segment_cache_clear(vmx);
1882
1883 vmx->rmode.tr.selector = vmcs_read16(GUEST_TR_SELECTOR);
1729 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE); 1884 vmx->rmode.tr.base = vmcs_readl(GUEST_TR_BASE);
1730 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm)); 1885 vmcs_writel(GUEST_TR_BASE, rmode_tss_base(vcpu->kvm));
1731 1886
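The fallback added to enter_rmode() above only exists for very old userspace; a well-behaved VMM issues KVM_SET_TSS_ADDR itself before running any vcpu, pointing at three otherwise unused guest-physical pages. A hedged userspace sketch (0xfffbd000 is the address QEMU conventionally uses, not something this patch mandates):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Userspace sketch: reserve the real-mode TSS region before the first
 * KVM_RUN. Any otherwise unused guest-physical range works. */
static int example_set_tss(int vm_fd)
{
	return ioctl(vm_fd, KVM_SET_TSS_ADDR, 0xfffbd000UL);
}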
@@ -1764,7 +1919,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1764 1919
1765continue_rmode: 1920continue_rmode:
1766 kvm_mmu_reset_context(vcpu); 1921 kvm_mmu_reset_context(vcpu);
1767 init_rmode(vcpu->kvm);
1768} 1922}
1769 1923
1770static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1924static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
@@ -1802,6 +1956,8 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1802{ 1956{
1803 u32 guest_tr_ar; 1957 u32 guest_tr_ar;
1804 1958
1959 vmx_segment_cache_clear(to_vmx(vcpu));
1960
1805 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES); 1961 guest_tr_ar = vmcs_read32(GUEST_TR_AR_BYTES);
1806 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) { 1962 if ((guest_tr_ar & AR_TYPE_MASK) != AR_TYPE_BUSY_64_TSS) {
1807 printk(KERN_DEBUG "%s: tss fixup for long mode. \n", 1963 printk(KERN_DEBUG "%s: tss fixup for long mode. \n",
@@ -1841,6 +1997,13 @@ static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1841 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits; 1997 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1842} 1998}
1843 1999
2000static void vmx_decache_cr3(struct kvm_vcpu *vcpu)
2001{
2002 if (enable_ept && is_paging(vcpu))
2003 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
2004 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
2005}
2006
1844static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 2007static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1845{ 2008{
1846 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits; 2009 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
@@ -1856,20 +2019,20 @@ static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
1856 return; 2019 return;
1857 2020
1858 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2021 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1859 vmcs_write64(GUEST_PDPTR0, vcpu->arch.pdptrs[0]); 2022 vmcs_write64(GUEST_PDPTR0, vcpu->arch.mmu.pdptrs[0]);
1860 vmcs_write64(GUEST_PDPTR1, vcpu->arch.pdptrs[1]); 2023 vmcs_write64(GUEST_PDPTR1, vcpu->arch.mmu.pdptrs[1]);
1861 vmcs_write64(GUEST_PDPTR2, vcpu->arch.pdptrs[2]); 2024 vmcs_write64(GUEST_PDPTR2, vcpu->arch.mmu.pdptrs[2]);
1862 vmcs_write64(GUEST_PDPTR3, vcpu->arch.pdptrs[3]); 2025 vmcs_write64(GUEST_PDPTR3, vcpu->arch.mmu.pdptrs[3]);
1863 } 2026 }
1864} 2027}
1865 2028
1866static void ept_save_pdptrs(struct kvm_vcpu *vcpu) 2029static void ept_save_pdptrs(struct kvm_vcpu *vcpu)
1867{ 2030{
1868 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) { 2031 if (is_paging(vcpu) && is_pae(vcpu) && !is_long_mode(vcpu)) {
1869 vcpu->arch.pdptrs[0] = vmcs_read64(GUEST_PDPTR0); 2032 vcpu->arch.mmu.pdptrs[0] = vmcs_read64(GUEST_PDPTR0);
1870 vcpu->arch.pdptrs[1] = vmcs_read64(GUEST_PDPTR1); 2033 vcpu->arch.mmu.pdptrs[1] = vmcs_read64(GUEST_PDPTR1);
1871 vcpu->arch.pdptrs[2] = vmcs_read64(GUEST_PDPTR2); 2034 vcpu->arch.mmu.pdptrs[2] = vmcs_read64(GUEST_PDPTR2);
1872 vcpu->arch.pdptrs[3] = vmcs_read64(GUEST_PDPTR3); 2035 vcpu->arch.mmu.pdptrs[3] = vmcs_read64(GUEST_PDPTR3);
1873 } 2036 }
1874 2037
1875 __set_bit(VCPU_EXREG_PDPTR, 2038 __set_bit(VCPU_EXREG_PDPTR,
@@ -1884,6 +2047,8 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1884 unsigned long cr0, 2047 unsigned long cr0,
1885 struct kvm_vcpu *vcpu) 2048 struct kvm_vcpu *vcpu)
1886{ 2049{
2050 if (!test_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail))
2051 vmx_decache_cr3(vcpu);
1887 if (!(cr0 & X86_CR0_PG)) { 2052 if (!(cr0 & X86_CR0_PG)) {
1888 /* From paging/starting to nonpaging */ 2053 /* From paging/starting to nonpaging */
1889 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 2054 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1941,6 +2106,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1941 vmcs_writel(CR0_READ_SHADOW, cr0); 2106 vmcs_writel(CR0_READ_SHADOW, cr0);
1942 vmcs_writel(GUEST_CR0, hw_cr0); 2107 vmcs_writel(GUEST_CR0, hw_cr0);
1943 vcpu->arch.cr0 = cr0; 2108 vcpu->arch.cr0 = cr0;
2109 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
1944} 2110}
1945 2111
1946static u64 construct_eptp(unsigned long root_hpa) 2112static u64 construct_eptp(unsigned long root_hpa)
@@ -1964,7 +2130,7 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1964 if (enable_ept) { 2130 if (enable_ept) {
1965 eptp = construct_eptp(cr3); 2131 eptp = construct_eptp(cr3);
1966 vmcs_write64(EPT_POINTER, eptp); 2132 vmcs_write64(EPT_POINTER, eptp);
1967 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 2133 guest_cr3 = is_paging(vcpu) ? kvm_read_cr3(vcpu) :
1968 vcpu->kvm->arch.ept_identity_map_addr; 2134 vcpu->kvm->arch.ept_identity_map_addr;
1969 ept_load_pdptrs(vcpu); 2135 ept_load_pdptrs(vcpu);
1970 } 2136 }
@@ -1992,23 +2158,39 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1992 vmcs_writel(GUEST_CR4, hw_cr4); 2158 vmcs_writel(GUEST_CR4, hw_cr4);
1993} 2159}
1994 2160
1995static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
1996{
1997 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
1998
1999 return vmcs_readl(sf->base);
2000}
2001
2002static void vmx_get_segment(struct kvm_vcpu *vcpu, 2161static void vmx_get_segment(struct kvm_vcpu *vcpu,
2003 struct kvm_segment *var, int seg) 2162 struct kvm_segment *var, int seg)
2004{ 2163{
2005 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2164 struct vcpu_vmx *vmx = to_vmx(vcpu);
2165 struct kvm_save_segment *save;
2006 u32 ar; 2166 u32 ar;
2007 2167
2008 var->base = vmcs_readl(sf->base); 2168 if (vmx->rmode.vm86_active
2009 var->limit = vmcs_read32(sf->limit); 2169 && (seg == VCPU_SREG_TR || seg == VCPU_SREG_ES
2010 var->selector = vmcs_read16(sf->selector); 2170 || seg == VCPU_SREG_DS || seg == VCPU_SREG_FS
2011 ar = vmcs_read32(sf->ar_bytes); 2171 || seg == VCPU_SREG_GS)
2172 && !emulate_invalid_guest_state) {
2173 switch (seg) {
2174 case VCPU_SREG_TR: save = &vmx->rmode.tr; break;
2175 case VCPU_SREG_ES: save = &vmx->rmode.es; break;
2176 case VCPU_SREG_DS: save = &vmx->rmode.ds; break;
2177 case VCPU_SREG_FS: save = &vmx->rmode.fs; break;
2178 case VCPU_SREG_GS: save = &vmx->rmode.gs; break;
2179 default: BUG();
2180 }
2181 var->selector = save->selector;
2182 var->base = save->base;
2183 var->limit = save->limit;
2184 ar = save->ar;
2185 if (seg == VCPU_SREG_TR
2186 || var->selector == vmx_read_guest_seg_selector(vmx, seg))
2187 goto use_saved_rmode_seg;
2188 }
2189 var->base = vmx_read_guest_seg_base(vmx, seg);
2190 var->limit = vmx_read_guest_seg_limit(vmx, seg);
2191 var->selector = vmx_read_guest_seg_selector(vmx, seg);
2192 ar = vmx_read_guest_seg_ar(vmx, seg);
2193use_saved_rmode_seg:
2012 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state) 2194 if ((ar & AR_UNUSABLE_MASK) && !emulate_invalid_guest_state)
2013 ar = 0; 2195 ar = 0;
2014 var->type = ar & 15; 2196 var->type = ar & 15;
@@ -2022,17 +2204,39 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
2022 var->unusable = (ar >> 16) & 1; 2204 var->unusable = (ar >> 16) & 1;
2023} 2205}
2024 2206
2025static int vmx_get_cpl(struct kvm_vcpu *vcpu) 2207static u64 vmx_get_segment_base(struct kvm_vcpu *vcpu, int seg)
2208{
2209 struct kvm_segment s;
2210
2211 if (to_vmx(vcpu)->rmode.vm86_active) {
2212 vmx_get_segment(vcpu, &s, seg);
2213 return s.base;
2214 }
2215 return vmx_read_guest_seg_base(to_vmx(vcpu), seg);
2216}
2217
2218static int __vmx_get_cpl(struct kvm_vcpu *vcpu)
2026{ 2219{
2027 if (!is_protmode(vcpu)) 2220 if (!is_protmode(vcpu))
2028 return 0; 2221 return 0;
2029 2222
2030 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 2223 if (!is_long_mode(vcpu)
2224 && (kvm_get_rflags(vcpu) & X86_EFLAGS_VM)) /* if virtual 8086 */
2031 return 3; 2225 return 3;
2032 2226
2033 return vmcs_read16(GUEST_CS_SELECTOR) & 3; 2227 return vmx_read_guest_seg_selector(to_vmx(vcpu), VCPU_SREG_CS) & 3;
2034} 2228}
2035 2229
2230static int vmx_get_cpl(struct kvm_vcpu *vcpu)
2231{
2232 if (!test_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail)) {
2233 __set_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2234 to_vmx(vcpu)->cpl = __vmx_get_cpl(vcpu);
2235 }
2236 return to_vmx(vcpu)->cpl;
2237}
2238
2239
2036static u32 vmx_segment_access_rights(struct kvm_segment *var) 2240static u32 vmx_segment_access_rights(struct kvm_segment *var)
2037{ 2241{
2038 u32 ar; 2242 u32 ar;
@@ -2062,7 +2266,10 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2062 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; 2266 struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
2063 u32 ar; 2267 u32 ar;
2064 2268
2269 vmx_segment_cache_clear(vmx);
2270
2065 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) { 2271 if (vmx->rmode.vm86_active && seg == VCPU_SREG_TR) {
2272 vmcs_write16(sf->selector, var->selector);
2066 vmx->rmode.tr.selector = var->selector; 2273 vmx->rmode.tr.selector = var->selector;
2067 vmx->rmode.tr.base = var->base; 2274 vmx->rmode.tr.base = var->base;
2068 vmx->rmode.tr.limit = var->limit; 2275 vmx->rmode.tr.limit = var->limit;
@@ -2097,11 +2304,12 @@ static void vmx_set_segment(struct kvm_vcpu *vcpu,
2097 ar |= 0x1; /* Accessed */ 2304 ar |= 0x1; /* Accessed */
2098 2305
2099 vmcs_write32(sf->ar_bytes, ar); 2306 vmcs_write32(sf->ar_bytes, ar);
2307 __clear_bit(VCPU_EXREG_CPL, (ulong *)&vcpu->arch.regs_avail);
2100} 2308}
2101 2309
2102static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) 2310static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l)
2103{ 2311{
2104 u32 ar = vmcs_read32(GUEST_CS_AR_BYTES); 2312 u32 ar = vmx_read_guest_seg_ar(to_vmx(vcpu), VCPU_SREG_CS);
2105 2313
2106 *db = (ar >> 14) & 1; 2314 *db = (ar >> 14) & 1;
2107 *l = (ar >> 13) & 1; 2315 *l = (ar >> 13) & 1;
@@ -2323,11 +2531,12 @@ static bool guest_state_valid(struct kvm_vcpu *vcpu)
2323 2531
2324static int init_rmode_tss(struct kvm *kvm) 2532static int init_rmode_tss(struct kvm *kvm)
2325{ 2533{
2326 gfn_t fn = rmode_tss_base(kvm) >> PAGE_SHIFT; 2534 gfn_t fn;
2327 u16 data = 0; 2535 u16 data = 0;
2328 int ret = 0; 2536 int r, idx, ret = 0;
2329 int r;
2330 2537
2538 idx = srcu_read_lock(&kvm->srcu);
2539 fn = rmode_tss_base(kvm) >> PAGE_SHIFT;
2331 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE); 2540 r = kvm_clear_guest_page(kvm, fn, 0, PAGE_SIZE);
2332 if (r < 0) 2541 if (r < 0)
2333 goto out; 2542 goto out;
@@ -2351,12 +2560,13 @@ static int init_rmode_tss(struct kvm *kvm)
2351 2560
2352 ret = 1; 2561 ret = 1;
2353out: 2562out:
2563 srcu_read_unlock(&kvm->srcu, idx);
2354 return ret; 2564 return ret;
2355} 2565}
2356 2566
2357static int init_rmode_identity_map(struct kvm *kvm) 2567static int init_rmode_identity_map(struct kvm *kvm)
2358{ 2568{
2359 int i, r, ret; 2569 int i, idx, r, ret;
2360 pfn_t identity_map_pfn; 2570 pfn_t identity_map_pfn;
2361 u32 tmp; 2571 u32 tmp;
2362 2572
@@ -2371,6 +2581,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2371 return 1; 2581 return 1;
2372 ret = 0; 2582 ret = 0;
2373 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT; 2583 identity_map_pfn = kvm->arch.ept_identity_map_addr >> PAGE_SHIFT;
2584 idx = srcu_read_lock(&kvm->srcu);
2374 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE); 2585 r = kvm_clear_guest_page(kvm, identity_map_pfn, 0, PAGE_SIZE);
2375 if (r < 0) 2586 if (r < 0)
2376 goto out; 2587 goto out;
@@ -2386,6 +2597,7 @@ static int init_rmode_identity_map(struct kvm *kvm)
2386 kvm->arch.ept_identity_pagetable_done = true; 2597 kvm->arch.ept_identity_pagetable_done = true;
2387 ret = 1; 2598 ret = 1;
2388out: 2599out:
2600 srcu_read_unlock(&kvm->srcu, idx);
2389 return ret; 2601 return ret;
2390} 2602}
2391 2603
@@ -2515,7 +2727,7 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2515{ 2727{
2516 u32 host_sysenter_cs, msr_low, msr_high; 2728 u32 host_sysenter_cs, msr_low, msr_high;
2517 u32 junk; 2729 u32 junk;
2518 u64 host_pat, tsc_this, tsc_base; 2730 u64 host_pat;
2519 unsigned long a; 2731 unsigned long a;
2520 struct desc_ptr dt; 2732 struct desc_ptr dt;
2521 int i; 2733 int i;
@@ -2656,32 +2868,11 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2656 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE; 2868 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2657 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits); 2869 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2658 2870
2659 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2871 kvm_write_tsc(&vmx->vcpu, 0);
2660 rdtscll(tsc_this);
2661 if (tsc_this < vmx->vcpu.kvm->arch.vm_init_tsc)
2662 tsc_base = tsc_this;
2663
2664 guest_write_tsc(0, tsc_base);
2665 2872
2666 return 0; 2873 return 0;
2667} 2874}
2668 2875
2669static int init_rmode(struct kvm *kvm)
2670{
2671 int idx, ret = 0;
2672
2673 idx = srcu_read_lock(&kvm->srcu);
2674 if (!init_rmode_tss(kvm))
2675 goto exit;
2676 if (!init_rmode_identity_map(kvm))
2677 goto exit;
2678
2679 ret = 1;
2680exit:
2681 srcu_read_unlock(&kvm->srcu, idx);
2682 return ret;
2683}
2684
2685static int vmx_vcpu_reset(struct kvm_vcpu *vcpu) 2876static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2686{ 2877{
2687 struct vcpu_vmx *vmx = to_vmx(vcpu); 2878 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -2689,10 +2880,6 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2689 int ret; 2880 int ret;
2690 2881
2691 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2882 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2692 if (!init_rmode(vmx->vcpu.kvm)) {
2693 ret = -ENOMEM;
2694 goto out;
2695 }
2696 2883
2697 vmx->rmode.vm86_active = 0; 2884 vmx->rmode.vm86_active = 0;
2698 2885
@@ -2709,6 +2896,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2709 if (ret != 0) 2896 if (ret != 0)
2710 goto out; 2897 goto out;
2711 2898
2899 vmx_segment_cache_clear(vmx);
2900
2712 seg_setup(VCPU_SREG_CS); 2901 seg_setup(VCPU_SREG_CS);
2713 /* 2902 /*
2714 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode 2903 * GUEST_CS_BASE should really be 0xffff0000, but VT vm86 mode
@@ -2757,7 +2946,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2757 vmcs_writel(GUEST_IDTR_BASE, 0); 2946 vmcs_writel(GUEST_IDTR_BASE, 0);
2758 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff); 2947 vmcs_write32(GUEST_IDTR_LIMIT, 0xffff);
2759 2948
2760 vmcs_write32(GUEST_ACTIVITY_STATE, 0); 2949 vmcs_write32(GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
2761 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0); 2950 vmcs_write32(GUEST_INTERRUPTIBILITY_INFO, 0);
2762 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0); 2951 vmcs_write32(GUEST_PENDING_DBG_EXCEPTIONS, 0);
2763 2952
@@ -2772,7 +2961,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2772 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); 2961 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0);
2773 if (vm_need_tpr_shadow(vmx->vcpu.kvm)) 2962 if (vm_need_tpr_shadow(vmx->vcpu.kvm))
2774 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 2963 vmcs_write64(VIRTUAL_APIC_PAGE_ADDR,
2775 page_to_phys(vmx->vcpu.arch.apic->regs_page)); 2964 __pa(vmx->vcpu.arch.apic->regs));
2776 vmcs_write32(TPR_THRESHOLD, 0); 2965 vmcs_write32(TPR_THRESHOLD, 0);
2777 } 2966 }
2778 2967
@@ -2819,6 +3008,10 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2819 return; 3008 return;
2820 } 3009 }
2821 3010
3011 if (vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
3012 enable_irq_window(vcpu);
3013 return;
3014 }
2822 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL); 3015 cpu_based_vm_exec_control = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
2823 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING; 3016 cpu_based_vm_exec_control |= CPU_BASED_VIRTUAL_NMI_PENDING;
2824 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3017 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
@@ -2834,16 +3027,11 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2834 3027
2835 ++vcpu->stat.irq_injections; 3028 ++vcpu->stat.irq_injections;
2836 if (vmx->rmode.vm86_active) { 3029 if (vmx->rmode.vm86_active) {
2837 vmx->rmode.irq.pending = true; 3030 int inc_eip = 0;
2838 vmx->rmode.irq.vector = irq;
2839 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2840 if (vcpu->arch.interrupt.soft) 3031 if (vcpu->arch.interrupt.soft)
2841 vmx->rmode.irq.rip += 3032 inc_eip = vcpu->arch.event_exit_inst_len;
2842 vmx->vcpu.arch.event_exit_inst_len; 3033 if (kvm_inject_realmode_interrupt(vcpu, irq, inc_eip) != EMULATE_DONE)
2843 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 3034 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2844 irq | INTR_TYPE_SOFT_INTR | INTR_INFO_VALID_MASK);
2845 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2846 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2847 return; 3035 return;
2848 } 3036 }
2849 intr = irq | INTR_INFO_VALID_MASK; 3037 intr = irq | INTR_INFO_VALID_MASK;
@@ -2854,6 +3042,7 @@ static void vmx_inject_irq(struct kvm_vcpu *vcpu)
2854 } else 3042 } else
2855 intr |= INTR_TYPE_EXT_INTR; 3043 intr |= INTR_TYPE_EXT_INTR;
2856 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr); 3044 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr);
3045 vmx_clear_hlt(vcpu);
2857} 3046}
2858 3047
2859static void vmx_inject_nmi(struct kvm_vcpu *vcpu) 3048static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2874,19 +3063,15 @@ static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
2874 } 3063 }
2875 3064
2876 ++vcpu->stat.nmi_injections; 3065 ++vcpu->stat.nmi_injections;
3066 vmx->nmi_known_unmasked = false;
2877 if (vmx->rmode.vm86_active) { 3067 if (vmx->rmode.vm86_active) {
2878 vmx->rmode.irq.pending = true; 3068 if (kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, 0) != EMULATE_DONE)
2879 vmx->rmode.irq.vector = NMI_VECTOR; 3069 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2880 vmx->rmode.irq.rip = kvm_rip_read(vcpu);
2881 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2882 NMI_VECTOR | INTR_TYPE_SOFT_INTR |
2883 INTR_INFO_VALID_MASK);
2884 vmcs_write32(VM_ENTRY_INSTRUCTION_LEN, 1);
2885 kvm_rip_write(vcpu, vmx->rmode.irq.rip - 1);
2886 return; 3070 return;
2887 } 3071 }
2888 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 3072 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
2889 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR); 3073 INTR_TYPE_NMI_INTR | INTR_INFO_VALID_MASK | NMI_VECTOR);
3074 vmx_clear_hlt(vcpu);
2890} 3075}
2891 3076
2892static int vmx_nmi_allowed(struct kvm_vcpu *vcpu) 3077static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
@@ -2895,13 +3080,16 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2895 return 0; 3080 return 0;
2896 3081
2897 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 3082 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2898 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI)); 3083 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_STI
3084 | GUEST_INTR_STATE_NMI));
2899} 3085}
2900 3086
2901static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu) 3087static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2902{ 3088{
2903 if (!cpu_has_virtual_nmis()) 3089 if (!cpu_has_virtual_nmis())
2904 return to_vmx(vcpu)->soft_vnmi_blocked; 3090 return to_vmx(vcpu)->soft_vnmi_blocked;
3091 if (to_vmx(vcpu)->nmi_known_unmasked)
3092 return false;
2905 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI; 3093 return vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
2906} 3094}
2907 3095
@@ -2915,6 +3103,7 @@ static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2915 vmx->vnmi_blocked_time = 0; 3103 vmx->vnmi_blocked_time = 0;
2916 } 3104 }
2917 } else { 3105 } else {
3106 vmx->nmi_known_unmasked = !masked;
2918 if (masked) 3107 if (masked)
2919 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 3108 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2920 GUEST_INTR_STATE_NMI); 3109 GUEST_INTR_STATE_NMI);
@@ -2945,6 +3134,9 @@ static int vmx_set_tss_addr(struct kvm *kvm, unsigned int addr)
2945 if (ret) 3134 if (ret)
2946 return ret; 3135 return ret;
2947 kvm->arch.tss_addr = addr; 3136 kvm->arch.tss_addr = addr;
3137 if (!init_rmode_tss(kvm))
3138 return -ENOMEM;
3139
2948 return 0; 3140 return 0;
2949} 3141}
2950 3142
@@ -2956,7 +3148,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2956 * Cause the #SS fault with 0 error code in VM86 mode. 3148 * Cause the #SS fault with 0 error code in VM86 mode.
2957 */ 3149 */
2958 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 3150 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2959 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE) 3151 if (emulate_instruction(vcpu, 0) == EMULATE_DONE)
2960 return 1; 3152 return 1;
2961 /* 3153 /*
2962 * Forward all other exceptions that are valid in real mode. 3154 * Forward all other exceptions that are valid in real mode.
@@ -3029,7 +3221,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3029 enum emulation_result er; 3221 enum emulation_result er;
3030 3222
3031 vect_info = vmx->idt_vectoring_info; 3223 vect_info = vmx->idt_vectoring_info;
3032 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 3224 intr_info = vmx->exit_intr_info;
3033 3225
3034 if (is_machine_check(intr_info)) 3226 if (is_machine_check(intr_info))
3035 return handle_machine_check(vcpu); 3227 return handle_machine_check(vcpu);
@@ -3053,14 +3245,13 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3053 } 3245 }
3054 3246
3055 if (is_invalid_opcode(intr_info)) { 3247 if (is_invalid_opcode(intr_info)) {
3056 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD); 3248 er = emulate_instruction(vcpu, EMULTYPE_TRAP_UD);
3057 if (er != EMULATE_DONE) 3249 if (er != EMULATE_DONE)
3058 kvm_queue_exception(vcpu, UD_VECTOR); 3250 kvm_queue_exception(vcpu, UD_VECTOR);
3059 return 1; 3251 return 1;
3060 } 3252 }
3061 3253
3062 error_code = 0; 3254 error_code = 0;
3063 rip = kvm_rip_read(vcpu);
3064 if (intr_info & INTR_INFO_DELIVER_CODE_MASK) 3255 if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
3065 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE); 3256 error_code = vmcs_read32(VM_EXIT_INTR_ERROR_CODE);
3066 if (is_page_fault(intr_info)) { 3257 if (is_page_fault(intr_info)) {
@@ -3072,7 +3263,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3072 3263
3073 if (kvm_event_needs_reinjection(vcpu)) 3264 if (kvm_event_needs_reinjection(vcpu))
3074 kvm_mmu_unprotect_page_virt(vcpu, cr2); 3265 kvm_mmu_unprotect_page_virt(vcpu, cr2);
3075 return kvm_mmu_page_fault(vcpu, cr2, error_code); 3266 return kvm_mmu_page_fault(vcpu, cr2, error_code, NULL, 0);
3076 } 3267 }
3077 3268
3078 if (vmx->rmode.vm86_active && 3269 if (vmx->rmode.vm86_active &&
@@ -3107,6 +3298,7 @@ static int handle_exception(struct kvm_vcpu *vcpu)
3107 vmx->vcpu.arch.event_exit_inst_len = 3298 vmx->vcpu.arch.event_exit_inst_len =
3108 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 3299 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
3109 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3300 kvm_run->exit_reason = KVM_EXIT_DEBUG;
3301 rip = kvm_rip_read(vcpu);
3110 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 3302 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
3111 kvm_run->debug.arch.exception = ex_no; 3303 kvm_run->debug.arch.exception = ex_no;
3112 break; 3304 break;
@@ -3144,7 +3336,7 @@ static int handle_io(struct kvm_vcpu *vcpu)
3144 ++vcpu->stat.io_exits; 3336 ++vcpu->stat.io_exits;
3145 3337
3146 if (string || in) 3338 if (string || in)
3147 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3339 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3148 3340
3149 port = exit_qualification >> 16; 3341 port = exit_qualification >> 16;
3150 size = (exit_qualification & 7) + 1; 3342 size = (exit_qualification & 7) + 1;
@@ -3164,14 +3356,6 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
3164 hypercall[2] = 0xc1; 3356 hypercall[2] = 0xc1;
3165} 3357}
3166 3358
3167static void complete_insn_gp(struct kvm_vcpu *vcpu, int err)
3168{
3169 if (err)
3170 kvm_inject_gp(vcpu, 0);
3171 else
3172 skip_emulated_instruction(vcpu);
3173}
3174
3175static int handle_cr(struct kvm_vcpu *vcpu) 3359static int handle_cr(struct kvm_vcpu *vcpu)
3176{ 3360{
3177 unsigned long exit_qualification, val; 3361 unsigned long exit_qualification, val;
@@ -3189,21 +3373,21 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3189 switch (cr) { 3373 switch (cr) {
3190 case 0: 3374 case 0:
3191 err = kvm_set_cr0(vcpu, val); 3375 err = kvm_set_cr0(vcpu, val);
3192 complete_insn_gp(vcpu, err); 3376 kvm_complete_insn_gp(vcpu, err);
3193 return 1; 3377 return 1;
3194 case 3: 3378 case 3:
3195 err = kvm_set_cr3(vcpu, val); 3379 err = kvm_set_cr3(vcpu, val);
3196 complete_insn_gp(vcpu, err); 3380 kvm_complete_insn_gp(vcpu, err);
3197 return 1; 3381 return 1;
3198 case 4: 3382 case 4:
3199 err = kvm_set_cr4(vcpu, val); 3383 err = kvm_set_cr4(vcpu, val);
3200 complete_insn_gp(vcpu, err); 3384 kvm_complete_insn_gp(vcpu, err);
3201 return 1; 3385 return 1;
3202 case 8: { 3386 case 8: {
3203 u8 cr8_prev = kvm_get_cr8(vcpu); 3387 u8 cr8_prev = kvm_get_cr8(vcpu);
3204 u8 cr8 = kvm_register_read(vcpu, reg); 3388 u8 cr8 = kvm_register_read(vcpu, reg);
3205 kvm_set_cr8(vcpu, cr8); 3389 err = kvm_set_cr8(vcpu, cr8);
3206 skip_emulated_instruction(vcpu); 3390 kvm_complete_insn_gp(vcpu, err);
3207 if (irqchip_in_kernel(vcpu->kvm)) 3391 if (irqchip_in_kernel(vcpu->kvm))
3208 return 1; 3392 return 1;
3209 if (cr8_prev <= cr8) 3393 if (cr8_prev <= cr8)
@@ -3222,8 +3406,9 @@ static int handle_cr(struct kvm_vcpu *vcpu)
3222 case 1: /*mov from cr*/ 3406 case 1: /*mov from cr*/
3223 switch (cr) { 3407 switch (cr) {
3224 case 3: 3408 case 3:
3225 kvm_register_write(vcpu, reg, vcpu->arch.cr3); 3409 val = kvm_read_cr3(vcpu);
3226 trace_kvm_cr_read(cr, vcpu->arch.cr3); 3410 kvm_register_write(vcpu, reg, val);
3411 trace_kvm_cr_read(cr, val);
3227 skip_emulated_instruction(vcpu); 3412 skip_emulated_instruction(vcpu);
3228 return 1; 3413 return 1;
3229 case 8: 3414 case 8:
@@ -3346,6 +3531,7 @@ static int handle_wrmsr(struct kvm_vcpu *vcpu)
3346 3531
3347static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu) 3532static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3348{ 3533{
3534 kvm_make_request(KVM_REQ_EVENT, vcpu);
3349 return 1; 3535 return 1;
3350} 3536}
3351 3537
@@ -3358,6 +3544,8 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3358 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING; 3544 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_INTR_PENDING;
3359 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3545 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3360 3546
3547 kvm_make_request(KVM_REQ_EVENT, vcpu);
3548
3361 ++vcpu->stat.irq_window_exits; 3549 ++vcpu->stat.irq_window_exits;
3362 3550
3363 /* 3551 /*
@@ -3392,6 +3580,11 @@ static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3392 return 1; 3580 return 1;
3393} 3581}
3394 3582
3583static int handle_invd(struct kvm_vcpu *vcpu)
3584{
3585 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3586}
3587
3395static int handle_invlpg(struct kvm_vcpu *vcpu) 3588static int handle_invlpg(struct kvm_vcpu *vcpu)
3396{ 3589{
3397 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3590 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
@@ -3420,7 +3613,7 @@ static int handle_xsetbv(struct kvm_vcpu *vcpu)
3420 3613
3421static int handle_apic_access(struct kvm_vcpu *vcpu) 3614static int handle_apic_access(struct kvm_vcpu *vcpu)
3422{ 3615{
3423 return emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE; 3616 return emulate_instruction(vcpu, 0) == EMULATE_DONE;
3424} 3617}
3425 3618
3426static int handle_task_switch(struct kvm_vcpu *vcpu) 3619static int handle_task_switch(struct kvm_vcpu *vcpu)
@@ -3442,9 +3635,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu)
3442 switch (type) { 3635 switch (type) {
3443 case INTR_TYPE_NMI_INTR: 3636 case INTR_TYPE_NMI_INTR:
3444 vcpu->arch.nmi_injected = false; 3637 vcpu->arch.nmi_injected = false;
3445 if (cpu_has_virtual_nmis()) 3638 vmx_set_nmi_mask(vcpu, true);
3446 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3447 GUEST_INTR_STATE_NMI);
3448 break; 3639 break;
3449 case INTR_TYPE_EXT_INTR: 3640 case INTR_TYPE_EXT_INTR:
3450 case INTR_TYPE_SOFT_INTR: 3641 case INTR_TYPE_SOFT_INTR:
@@ -3519,7 +3710,7 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu)
3519 3710
3520 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS); 3711 gpa = vmcs_read64(GUEST_PHYSICAL_ADDRESS);
3521 trace_kvm_page_fault(gpa, exit_qualification); 3712 trace_kvm_page_fault(gpa, exit_qualification);
3522 return kvm_mmu_page_fault(vcpu, gpa & PAGE_MASK, 0); 3713 return kvm_mmu_page_fault(vcpu, gpa, exit_qualification & 0x3, NULL, 0);
3523} 3714}
3524 3715
3525static u64 ept_rsvd_mask(u64 spte, int level) 3716static u64 ept_rsvd_mask(u64 spte, int level)
@@ -3614,6 +3805,7 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu)
3614 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING; 3805 cpu_based_vm_exec_control &= ~CPU_BASED_VIRTUAL_NMI_PENDING;
3615 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control); 3806 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, cpu_based_vm_exec_control);
3616 ++vcpu->stat.nmi_window_exits; 3807 ++vcpu->stat.nmi_window_exits;
3808 kvm_make_request(KVM_REQ_EVENT, vcpu);
3617 3809
3618 return 1; 3810 return 1;
3619} 3811}
@@ -3623,9 +3815,18 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3623 struct vcpu_vmx *vmx = to_vmx(vcpu); 3815 struct vcpu_vmx *vmx = to_vmx(vcpu);
3624 enum emulation_result err = EMULATE_DONE; 3816 enum emulation_result err = EMULATE_DONE;
3625 int ret = 1; 3817 int ret = 1;
3818 u32 cpu_exec_ctrl;
3819 bool intr_window_requested;
3820
3821 cpu_exec_ctrl = vmcs_read32(CPU_BASED_VM_EXEC_CONTROL);
3822 intr_window_requested = cpu_exec_ctrl & CPU_BASED_VIRTUAL_INTR_PENDING;
3626 3823
3627 while (!guest_state_valid(vcpu)) { 3824 while (!guest_state_valid(vcpu)) {
3628 err = emulate_instruction(vcpu, 0, 0, 0); 3825 if (intr_window_requested
3826 && (kvm_get_rflags(&vmx->vcpu) & X86_EFLAGS_IF))
3827 return handle_interrupt_window(&vmx->vcpu);
3828
3829 err = emulate_instruction(vcpu, 0);
3629 3830
3630 if (err == EMULATE_DO_MMIO) { 3831 if (err == EMULATE_DO_MMIO) {
3631 ret = 0; 3832 ret = 0;
@@ -3682,6 +3883,7 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3682 [EXIT_REASON_MSR_WRITE] = handle_wrmsr, 3883 [EXIT_REASON_MSR_WRITE] = handle_wrmsr,
3683 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window, 3884 [EXIT_REASON_PENDING_INTERRUPT] = handle_interrupt_window,
3684 [EXIT_REASON_HLT] = handle_halt, 3885 [EXIT_REASON_HLT] = handle_halt,
3886 [EXIT_REASON_INVD] = handle_invd,
3685 [EXIT_REASON_INVLPG] = handle_invlpg, 3887 [EXIT_REASON_INVLPG] = handle_invlpg,
3686 [EXIT_REASON_VMCALL] = handle_vmcall, 3888 [EXIT_REASON_VMCALL] = handle_vmcall,
3687 [EXIT_REASON_VMCLEAR] = handle_vmx_insn, 3889 [EXIT_REASON_VMCLEAR] = handle_vmx_insn,
@@ -3709,6 +3911,12 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3709static const int kvm_vmx_max_exit_handlers = 3911static const int kvm_vmx_max_exit_handlers =
3710 ARRAY_SIZE(kvm_vmx_exit_handlers); 3912 ARRAY_SIZE(kvm_vmx_exit_handlers);
3711 3913
3914static void vmx_get_exit_info(struct kvm_vcpu *vcpu, u64 *info1, u64 *info2)
3915{
3916 *info1 = vmcs_readl(EXIT_QUALIFICATION);
3917 *info2 = vmcs_read32(VM_EXIT_INTR_INFO);
3918}
3919
3712/* 3920/*
3713 * The guest has exited. See if we can fix it or if we need userspace 3921 * The guest has exited. See if we can fix it or if we need userspace
3714 * assistance. 3922 * assistance.
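
[Editor's note] The new [EXIT_REASON_INVD] entry above slots into kvm_vmx_exit_handlers, an array of handler pointers indexed by the raw exit reason and bounded with ARRAY_SIZE, so vmx_handle_exit can dispatch without a switch statement. A minimal standalone sketch of that table-dispatch pattern follows (not part of the patch; the demo_* names and exit numbers are invented):

	#include <stdio.h>

	#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

	/* Illustrative exit-reason numbers; the real VMX values live elsewhere. */
	enum { DEMO_EXIT_HLT = 12, DEMO_EXIT_INVD = 13, DEMO_EXIT_INVLPG = 14 };

	struct demo_vcpu { int id; };

	static int demo_handle_hlt(struct demo_vcpu *v)    { printf("vcpu%d: hlt\n", v->id);    return 1; }
	static int demo_handle_invd(struct demo_vcpu *v)   { printf("vcpu%d: invd\n", v->id);   return 1; }
	static int demo_handle_invlpg(struct demo_vcpu *v) { printf("vcpu%d: invlpg\n", v->id); return 1; }

	/* Sparse table: unlisted reasons stay NULL and fall back to a default path. */
	static int (*demo_exit_handlers[])(struct demo_vcpu *) = {
		[DEMO_EXIT_HLT]    = demo_handle_hlt,
		[DEMO_EXIT_INVD]   = demo_handle_invd,
		[DEMO_EXIT_INVLPG] = demo_handle_invlpg,
	};

	static int demo_handle_exit(struct demo_vcpu *v, unsigned int reason)
	{
		if (reason < ARRAY_SIZE(demo_exit_handlers) && demo_exit_handlers[reason])
			return demo_exit_handlers[reason](v);
		printf("vcpu%d: unhandled exit %u\n", v->id, reason);
		return 0;
	}

	int main(void)
	{
		struct demo_vcpu v = { .id = 0 };
		demo_handle_exit(&v, DEMO_EXIT_INVD);   /* dispatched via the table */
		demo_handle_exit(&v, 42);               /* falls through to the default path */
		return 0;
	}

Adding a handler, as this patch does for INVD, is then just one new array entry.
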
@@ -3719,17 +3927,12 @@ static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3719 u32 exit_reason = vmx->exit_reason; 3927 u32 exit_reason = vmx->exit_reason;
3720 u32 vectoring_info = vmx->idt_vectoring_info; 3928 u32 vectoring_info = vmx->idt_vectoring_info;
3721 3929
3722 trace_kvm_exit(exit_reason, vcpu); 3930 trace_kvm_exit(exit_reason, vcpu, KVM_ISA_VMX);
3723 3931
3724 /* If guest state is invalid, start emulating */ 3932 /* If guest state is invalid, start emulating */
3725 if (vmx->emulation_required && emulate_invalid_guest_state) 3933 if (vmx->emulation_required && emulate_invalid_guest_state)
3726 return handle_invalid_guest_state(vcpu); 3934 return handle_invalid_guest_state(vcpu);
3727 3935
3728 /* Access CR3 don't cause VMExit in paging mode, so we need
3729 * to sync with guest real CR3. */
3730 if (enable_ept && is_paging(vcpu))
3731 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3732
3733 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) { 3936 if (exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY) {
3734 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3937 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3735 vcpu->run->fail_entry.hardware_entry_failure_reason 3938 vcpu->run->fail_entry.hardware_entry_failure_reason
@@ -3790,23 +3993,19 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr)
3790 vmcs_write32(TPR_THRESHOLD, irr); 3993 vmcs_write32(TPR_THRESHOLD, irr);
3791} 3994}
3792 3995
3793static void vmx_complete_interrupts(struct vcpu_vmx *vmx) 3996static void vmx_complete_atomic_exit(struct vcpu_vmx *vmx)
3794{ 3997{
3795 u32 exit_intr_info; 3998 u32 exit_intr_info;
3796 u32 idt_vectoring_info = vmx->idt_vectoring_info;
3797 bool unblock_nmi;
3798 u8 vector;
3799 int type;
3800 bool idtv_info_valid;
3801 3999
3802 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 4000 if (!(vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY
4001 || vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI))
4002 return;
3803 4003
3804 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON); 4004 vmx->exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
4005 exit_intr_info = vmx->exit_intr_info;
3805 4006
3806 /* Handle machine checks before interrupts are enabled */ 4007 /* Handle machine checks before interrupts are enabled */
3807 if ((vmx->exit_reason == EXIT_REASON_MCE_DURING_VMENTRY) 4008 if (is_machine_check(exit_intr_info))
3808 || (vmx->exit_reason == EXIT_REASON_EXCEPTION_NMI
3809 && is_machine_check(exit_intr_info)))
3810 kvm_machine_check(); 4009 kvm_machine_check();
3811 4010
3812 /* We need to handle NMIs before interrupts are enabled */ 4011 /* We need to handle NMIs before interrupts are enabled */
@@ -3816,10 +4015,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3816 asm("int $2"); 4015 asm("int $2");
3817 kvm_after_handle_nmi(&vmx->vcpu); 4016 kvm_after_handle_nmi(&vmx->vcpu);
3818 } 4017 }
4018}
3819 4019
3820 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK; 4020static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
4021{
4022 u32 exit_intr_info;
4023 bool unblock_nmi;
4024 u8 vector;
4025 bool idtv_info_valid;
4026
4027 idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3821 4028
3822 if (cpu_has_virtual_nmis()) { 4029 if (cpu_has_virtual_nmis()) {
4030 if (vmx->nmi_known_unmasked)
4031 return;
4032 /*
4033 * Can't use vmx->exit_intr_info since we're not sure what
4034 * the exit reason is.
4035 */
4036 exit_intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
3823 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0; 4037 unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != 0;
3824 vector = exit_intr_info & INTR_INFO_VECTOR_MASK; 4038 vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
3825 /* 4039 /*
@@ -3836,9 +4050,25 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3836 vector != DF_VECTOR && !idtv_info_valid) 4050 vector != DF_VECTOR && !idtv_info_valid)
3837 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO, 4051 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
3838 GUEST_INTR_STATE_NMI); 4052 GUEST_INTR_STATE_NMI);
4053 else
4054 vmx->nmi_known_unmasked =
4055 !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO)
4056 & GUEST_INTR_STATE_NMI);
3839 } else if (unlikely(vmx->soft_vnmi_blocked)) 4057 } else if (unlikely(vmx->soft_vnmi_blocked))
3840 vmx->vnmi_blocked_time += 4058 vmx->vnmi_blocked_time +=
3841 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time)); 4059 ktime_to_ns(ktime_sub(ktime_get(), vmx->entry_time));
4060}
4061
4062static void __vmx_complete_interrupts(struct vcpu_vmx *vmx,
4063 u32 idt_vectoring_info,
4064 int instr_len_field,
4065 int error_code_field)
4066{
4067 u8 vector;
4068 int type;
4069 bool idtv_info_valid;
4070
4071 idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
3842 4072
3843 vmx->vcpu.arch.nmi_injected = false; 4073 vmx->vcpu.arch.nmi_injected = false;
3844 kvm_clear_exception_queue(&vmx->vcpu); 4074 kvm_clear_exception_queue(&vmx->vcpu);
@@ -3847,6 +4077,8 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3847 if (!idtv_info_valid) 4077 if (!idtv_info_valid)
3848 return; 4078 return;
3849 4079
4080 kvm_make_request(KVM_REQ_EVENT, &vmx->vcpu);
4081
3850 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK; 4082 vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
3851 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK; 4083 type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
3852 4084
@@ -3858,23 +4090,22 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3858 * Clear bit "block by NMI" before VM entry if a NMI 4090 * Clear bit "block by NMI" before VM entry if a NMI
3859 * delivery faulted. 4091 * delivery faulted.
3860 */ 4092 */
3861 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO, 4093 vmx_set_nmi_mask(&vmx->vcpu, false);
3862 GUEST_INTR_STATE_NMI);
3863 break; 4094 break;
3864 case INTR_TYPE_SOFT_EXCEPTION: 4095 case INTR_TYPE_SOFT_EXCEPTION:
3865 vmx->vcpu.arch.event_exit_inst_len = 4096 vmx->vcpu.arch.event_exit_inst_len =
3866 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4097 vmcs_read32(instr_len_field);
3867 /* fall through */ 4098 /* fall through */
3868 case INTR_TYPE_HARD_EXCEPTION: 4099 case INTR_TYPE_HARD_EXCEPTION:
3869 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) { 4100 if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
3870 u32 err = vmcs_read32(IDT_VECTORING_ERROR_CODE); 4101 u32 err = vmcs_read32(error_code_field);
3871 kvm_queue_exception_e(&vmx->vcpu, vector, err); 4102 kvm_queue_exception_e(&vmx->vcpu, vector, err);
3872 } else 4103 } else
3873 kvm_queue_exception(&vmx->vcpu, vector); 4104 kvm_queue_exception(&vmx->vcpu, vector);
3874 break; 4105 break;
3875 case INTR_TYPE_SOFT_INTR: 4106 case INTR_TYPE_SOFT_INTR:
3876 vmx->vcpu.arch.event_exit_inst_len = 4107 vmx->vcpu.arch.event_exit_inst_len =
3877 vmcs_read32(VM_EXIT_INSTRUCTION_LEN); 4108 vmcs_read32(instr_len_field);
3878 /* fall through */ 4109 /* fall through */
3879 case INTR_TYPE_EXT_INTR: 4110 case INTR_TYPE_EXT_INTR:
3880 kvm_queue_interrupt(&vmx->vcpu, vector, 4111 kvm_queue_interrupt(&vmx->vcpu, vector,
@@ -3885,27 +4116,21 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3885 } 4116 }
3886} 4117}
3887 4118
3888/* 4119static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
3889 * Failure to inject an interrupt should give us the information
3890 * in IDT_VECTORING_INFO_FIELD. However, if the failure occurs
3891 * when fetching the interrupt redirection bitmap in the real-mode
3892 * tss, this doesn't happen. So we do it ourselves.
3893 */
3894static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3895{ 4120{
3896 vmx->rmode.irq.pending = 0; 4121 __vmx_complete_interrupts(vmx, vmx->idt_vectoring_info,
3897 if (kvm_rip_read(&vmx->vcpu) + 1 != vmx->rmode.irq.rip) 4122 VM_EXIT_INSTRUCTION_LEN,
3898 return; 4123 IDT_VECTORING_ERROR_CODE);
3899 kvm_rip_write(&vmx->vcpu, vmx->rmode.irq.rip); 4124}
3900 if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) { 4125
3901 vmx->idt_vectoring_info &= ~VECTORING_INFO_TYPE_MASK; 4126static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
3902 vmx->idt_vectoring_info |= INTR_TYPE_EXT_INTR; 4127{
3903 return; 4128 __vmx_complete_interrupts(to_vmx(vcpu),
3904 } 4129 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
3905 vmx->idt_vectoring_info = 4130 VM_ENTRY_INSTRUCTION_LEN,
3906 VECTORING_INFO_VALID_MASK 4131 VM_ENTRY_EXCEPTION_ERROR_CODE);
3907 | INTR_TYPE_EXT_INTR 4132
3908 | vmx->rmode.irq.vector; 4133 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
3909} 4134}
3910 4135
3911#ifdef CONFIG_X86_64 4136#ifdef CONFIG_X86_64
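
[Editor's note] The hunk above folds the completion and the new cancellation path into one helper by passing the vectoring word plus two VMCS field identifiers: the exit path reads VM_EXIT_INSTRUCTION_LEN / IDT_VECTORING_ERROR_CODE, while vmx_cancel_injection reads the VM_ENTRY_* mirrors and then clears the entry field. A rough standalone sketch of that "same body, different fields" idea, with a fake field store and an invented bit layout:

	#include <stdio.h>
	#include <stdint.h>

	/* Mock "VMCS": a tiny keyed store standing in for vmcs_read32(). */
	enum demo_field {
		DEMO_EXIT_INSTR_LEN, DEMO_EXIT_ERR_CODE,
		DEMO_ENTRY_INSTR_LEN, DEMO_ENTRY_ERR_CODE,
		DEMO_NR_FIELDS
	};

	static uint32_t demo_vmcs[DEMO_NR_FIELDS];

	static uint32_t demo_read(enum demo_field f) { return demo_vmcs[f]; }

	/* One body, two callers: which fields to read is a parameter. */
	static void demo_complete(uint32_t vectoring_info,
				  enum demo_field instr_len_field,
				  enum demo_field err_code_field)
	{
		if (!(vectoring_info & 0x80000000u))	/* "valid" bit, only a stand-in here */
			return;
		printf("vector %u, len %u, err %u\n",
		       (unsigned)(vectoring_info & 0xff),
		       (unsigned)demo_read(instr_len_field),
		       (unsigned)demo_read(err_code_field));
	}

	int main(void)
	{
		demo_vmcs[DEMO_EXIT_INSTR_LEN]  = 3;
		demo_vmcs[DEMO_EXIT_ERR_CODE]   = 0;
		demo_vmcs[DEMO_ENTRY_INSTR_LEN] = 2;
		demo_vmcs[DEMO_ENTRY_ERR_CODE]  = 11;

		/* exit path: reads the VM_EXIT_* style fields */
		demo_complete(0x80000000u | 14, DEMO_EXIT_INSTR_LEN, DEMO_EXIT_ERR_CODE);
		/* cancel path: same body, VM_ENTRY_* style fields */
		demo_complete(0x80000000u | 14, DEMO_ENTRY_INSTR_LEN, DEMO_ENTRY_ERR_CODE);
		return 0;
	}
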
@@ -3916,7 +4141,7 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3916#define Q "l" 4141#define Q "l"
3917#endif 4142#endif
3918 4143
3919static void vmx_vcpu_run(struct kvm_vcpu *vcpu) 4144static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
3920{ 4145{
3921 struct vcpu_vmx *vmx = to_vmx(vcpu); 4146 struct vcpu_vmx *vmx = to_vmx(vcpu);
3922 4147
@@ -3945,6 +4170,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3945 asm( 4170 asm(
3946 /* Store host registers */ 4171 /* Store host registers */
3947 "push %%"R"dx; push %%"R"bp;" 4172 "push %%"R"dx; push %%"R"bp;"
4173 "push %%"R"cx \n\t" /* placeholder for guest rcx */
3948 "push %%"R"cx \n\t" 4174 "push %%"R"cx \n\t"
3949 "cmp %%"R"sp, %c[host_rsp](%0) \n\t" 4175 "cmp %%"R"sp, %c[host_rsp](%0) \n\t"
3950 "je 1f \n\t" 4176 "je 1f \n\t"
@@ -3986,10 +4212,11 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3986 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t" 4212 ".Llaunched: " __ex(ASM_VMX_VMRESUME) "\n\t"
3987 ".Lkvm_vmx_return: " 4213 ".Lkvm_vmx_return: "
3988 /* Save guest registers, load host registers, keep flags */ 4214 /* Save guest registers, load host registers, keep flags */
3989 "xchg %0, (%%"R"sp) \n\t" 4215 "mov %0, %c[wordsize](%%"R"sp) \n\t"
4216 "pop %0 \n\t"
3990 "mov %%"R"ax, %c[rax](%0) \n\t" 4217 "mov %%"R"ax, %c[rax](%0) \n\t"
3991 "mov %%"R"bx, %c[rbx](%0) \n\t" 4218 "mov %%"R"bx, %c[rbx](%0) \n\t"
3992 "push"Q" (%%"R"sp); pop"Q" %c[rcx](%0) \n\t" 4219 "pop"Q" %c[rcx](%0) \n\t"
3993 "mov %%"R"dx, %c[rdx](%0) \n\t" 4220 "mov %%"R"dx, %c[rdx](%0) \n\t"
3994 "mov %%"R"si, %c[rsi](%0) \n\t" 4221 "mov %%"R"si, %c[rsi](%0) \n\t"
3995 "mov %%"R"di, %c[rdi](%0) \n\t" 4222 "mov %%"R"di, %c[rdi](%0) \n\t"
@@ -4007,7 +4234,7 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4007 "mov %%cr2, %%"R"ax \n\t" 4234 "mov %%cr2, %%"R"ax \n\t"
4008 "mov %%"R"ax, %c[cr2](%0) \n\t" 4235 "mov %%"R"ax, %c[cr2](%0) \n\t"
4009 4236
4010 "pop %%"R"bp; pop %%"R"bp; pop %%"R"dx \n\t" 4237 "pop %%"R"bp; pop %%"R"dx \n\t"
4011 "setbe %c[fail](%0) \n\t" 4238 "setbe %c[fail](%0) \n\t"
4012 : : "c"(vmx), "d"((unsigned long)HOST_RSP), 4239 : : "c"(vmx), "d"((unsigned long)HOST_RSP),
4013 [launched]"i"(offsetof(struct vcpu_vmx, launched)), 4240 [launched]"i"(offsetof(struct vcpu_vmx, launched)),
@@ -4030,25 +4257,32 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
4030 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])), 4257 [r14]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R14])),
4031 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])), 4258 [r15]"i"(offsetof(struct vcpu_vmx, vcpu.arch.regs[VCPU_REGS_R15])),
4032#endif 4259#endif
4033 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)) 4260 [cr2]"i"(offsetof(struct vcpu_vmx, vcpu.arch.cr2)),
4261 [wordsize]"i"(sizeof(ulong))
4034 : "cc", "memory" 4262 : "cc", "memory"
4035 , R"bx", R"di", R"si" 4263 , R"ax", R"bx", R"di", R"si"
4036#ifdef CONFIG_X86_64 4264#ifdef CONFIG_X86_64
4037 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" 4265 , "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15"
4038#endif 4266#endif
4039 ); 4267 );
4040 4268
4041 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP) 4269 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)
4042 | (1 << VCPU_EXREG_PDPTR)); 4270 | (1 << VCPU_EXREG_RFLAGS)
4271 | (1 << VCPU_EXREG_CPL)
4272 | (1 << VCPU_EXREG_PDPTR)
4273 | (1 << VCPU_EXREG_SEGMENTS)
4274 | (1 << VCPU_EXREG_CR3));
4043 vcpu->arch.regs_dirty = 0; 4275 vcpu->arch.regs_dirty = 0;
4044 4276
4045 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 4277 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
4046 if (vmx->rmode.irq.pending)
4047 fixup_rmode_irq(vmx);
4048 4278
4049 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS)); 4279 asm("mov %0, %%ds; mov %0, %%es" : : "r"(__USER_DS));
4050 vmx->launched = 1; 4280 vmx->launched = 1;
4051 4281
4282 vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
4283
4284 vmx_complete_atomic_exit(vmx);
4285 vmx_recover_nmi_blocking(vmx);
4052 vmx_complete_interrupts(vmx); 4286 vmx_complete_interrupts(vmx);
4053} 4287}
4054 4288
@@ -4106,8 +4340,8 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4106 goto free_vcpu; 4340 goto free_vcpu;
4107 4341
4108 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL); 4342 vmx->guest_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
4343 err = -ENOMEM;
4109 if (!vmx->guest_msrs) { 4344 if (!vmx->guest_msrs) {
4110 err = -ENOMEM;
4111 goto uninit_vcpu; 4345 goto uninit_vcpu;
4112 } 4346 }
4113 4347
@@ -4119,21 +4353,26 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
4119 4353
4120 cpu = get_cpu(); 4354 cpu = get_cpu();
4121 vmx_vcpu_load(&vmx->vcpu, cpu); 4355 vmx_vcpu_load(&vmx->vcpu, cpu);
4356 vmx->vcpu.cpu = cpu;
4122 err = vmx_vcpu_setup(vmx); 4357 err = vmx_vcpu_setup(vmx);
4123 vmx_vcpu_put(&vmx->vcpu); 4358 vmx_vcpu_put(&vmx->vcpu);
4124 put_cpu(); 4359 put_cpu();
4125 if (err) 4360 if (err)
4126 goto free_vmcs; 4361 goto free_vmcs;
4127 if (vm_need_virtualize_apic_accesses(kvm)) 4362 if (vm_need_virtualize_apic_accesses(kvm))
4128 if (alloc_apic_access_page(kvm) != 0) 4363 err = alloc_apic_access_page(kvm);
4364 if (err)
4129 goto free_vmcs; 4365 goto free_vmcs;
4130 4366
4131 if (enable_ept) { 4367 if (enable_ept) {
4132 if (!kvm->arch.ept_identity_map_addr) 4368 if (!kvm->arch.ept_identity_map_addr)
4133 kvm->arch.ept_identity_map_addr = 4369 kvm->arch.ept_identity_map_addr =
4134 VMX_EPT_IDENTITY_PAGETABLE_ADDR; 4370 VMX_EPT_IDENTITY_PAGETABLE_ADDR;
4371 err = -ENOMEM;
4135 if (alloc_identity_pagetable(kvm) != 0) 4372 if (alloc_identity_pagetable(kvm) != 0)
4136 goto free_vmcs; 4373 goto free_vmcs;
4374 if (!init_rmode_identity_map(kvm))
4375 goto free_vmcs;
4137 } 4376 }
4138 4377
4139 return &vmx->vcpu; 4378 return &vmx->vcpu;
@@ -4249,11 +4488,6 @@ static int vmx_get_lpage_level(void)
4249 return PT_PDPE_LEVEL; 4488 return PT_PDPE_LEVEL;
4250} 4489}
4251 4490
4252static inline u32 bit(int bitno)
4253{
4254 return 1 << (bitno & 31);
4255}
4256
4257static void vmx_cpuid_update(struct kvm_vcpu *vcpu) 4491static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4258{ 4492{
4259 struct kvm_cpuid_entry2 *best; 4493 struct kvm_cpuid_entry2 *best;
@@ -4280,6 +4514,13 @@ static void vmx_set_supported_cpuid(u32 func, struct kvm_cpuid_entry2 *entry)
4280{ 4514{
4281} 4515}
4282 4516
4517static int vmx_check_intercept(struct kvm_vcpu *vcpu,
4518 struct x86_instruction_info *info,
4519 enum x86_intercept_stage stage)
4520{
4521 return X86EMUL_CONTINUE;
4522}
4523
4283static struct kvm_x86_ops vmx_x86_ops = { 4524static struct kvm_x86_ops vmx_x86_ops = {
4284 .cpu_has_kvm_support = cpu_has_kvm_support, 4525 .cpu_has_kvm_support = cpu_has_kvm_support,
4285 .disabled_by_bios = vmx_disabled_by_bios, 4526 .disabled_by_bios = vmx_disabled_by_bios,
@@ -4307,6 +4548,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4307 .get_cpl = vmx_get_cpl, 4548 .get_cpl = vmx_get_cpl,
4308 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4549 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4309 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits, 4550 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
4551 .decache_cr3 = vmx_decache_cr3,
4310 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4552 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
4311 .set_cr0 = vmx_set_cr0, 4553 .set_cr0 = vmx_set_cr0,
4312 .set_cr3 = vmx_set_cr3, 4554 .set_cr3 = vmx_set_cr3,
@@ -4334,6 +4576,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
4334 .set_irq = vmx_inject_irq, 4576 .set_irq = vmx_inject_irq,
4335 .set_nmi = vmx_inject_nmi, 4577 .set_nmi = vmx_inject_nmi,
4336 .queue_exception = vmx_queue_exception, 4578 .queue_exception = vmx_queue_exception,
4579 .cancel_injection = vmx_cancel_injection,
4337 .interrupt_allowed = vmx_interrupt_allowed, 4580 .interrupt_allowed = vmx_interrupt_allowed,
4338 .nmi_allowed = vmx_nmi_allowed, 4581 .nmi_allowed = vmx_nmi_allowed,
4339 .get_nmi_mask = vmx_get_nmi_mask, 4582 .get_nmi_mask = vmx_get_nmi_mask,
@@ -4346,7 +4589,9 @@ static struct kvm_x86_ops vmx_x86_ops = {
4346 .get_tdp_level = get_ept_level, 4589 .get_tdp_level = get_ept_level,
4347 .get_mt_mask = vmx_get_mt_mask, 4590 .get_mt_mask = vmx_get_mt_mask,
4348 4591
4592 .get_exit_info = vmx_get_exit_info,
4349 .exit_reasons_str = vmx_exit_reasons_str, 4593 .exit_reasons_str = vmx_exit_reasons_str,
4594
4350 .get_lpage_level = vmx_get_lpage_level, 4595 .get_lpage_level = vmx_get_lpage_level,
4351 4596
4352 .cpuid_update = vmx_cpuid_update, 4597 .cpuid_update = vmx_cpuid_update,
@@ -4356,6 +4601,15 @@ static struct kvm_x86_ops vmx_x86_ops = {
4356 .set_supported_cpuid = vmx_set_supported_cpuid, 4601 .set_supported_cpuid = vmx_set_supported_cpuid,
4357 4602
4358 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit, 4603 .has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
4604
4605 .set_tsc_khz = vmx_set_tsc_khz,
4606 .write_tsc_offset = vmx_write_tsc_offset,
4607 .adjust_tsc_offset = vmx_adjust_tsc_offset,
4608 .compute_tsc_offset = vmx_compute_tsc_offset,
4609
4610 .set_tdp_cr3 = vmx_set_cr3,
4611
4612 .check_intercept = vmx_check_intercept,
4359}; 4613};
4360 4614
4361static int __init vmx_init(void) 4615static int __init vmx_init(void)
@@ -4417,8 +4671,6 @@ static int __init vmx_init(void)
4417 4671
4418 if (enable_ept) { 4672 if (enable_ept) {
4419 bypass_guest_pf = 0; 4673 bypass_guest_pf = 0;
4420 kvm_mmu_set_base_ptes(VMX_EPT_READABLE_MASK |
4421 VMX_EPT_WRITABLE_MASK);
4422 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull, 4674 kvm_mmu_set_mask_ptes(0ull, 0ull, 0ull, 0ull,
4423 VMX_EPT_EXECUTABLE_MASK); 4675 VMX_EPT_EXECUTABLE_MASK);
4424 kvm_enable_tdp(); 4676 kvm_enable_tdp();
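
[Editor's note] The new .decache_cr3, .cancel_injection, .get_exit_info, TSC and .check_intercept members are simply extra slots in kvm_x86_ops, the table of function pointers through which the generic x86 code reaches the VMX (or SVM) backend. A minimal standalone sketch of that backend-ops pattern, with invented hook names and values:

	#include <stdio.h>
	#include <stdint.h>

	struct demo_vcpu { uint64_t tsc_offset; };

	/* Backend interface: generic code only ever calls through this table. */
	struct demo_ops {
		const char *name;
		void (*write_tsc_offset)(struct demo_vcpu *v, uint64_t offset);
		void (*get_exit_info)(struct demo_vcpu *v, uint64_t *info1, uint64_t *info2);
	};

	/* One possible backend ("vmx-like"); a second backend would fill the same slots. */
	static void demo_write_tsc_offset(struct demo_vcpu *v, uint64_t offset)
	{
		v->tsc_offset = offset;	/* the real code would write a VMCS/VMCB field */
	}

	static void demo_get_exit_info(struct demo_vcpu *v, uint64_t *info1, uint64_t *info2)
	{
		(void)v;
		*info1 = 0x2a;	/* stand-ins for exit qualification / interrupt info */
		*info2 = 0x0;
	}

	static const struct demo_ops demo_vmx_like_ops = {
		.name             = "vmx-like",
		.write_tsc_offset = demo_write_tsc_offset,
		.get_exit_info    = demo_get_exit_info,
	};

	int main(void)
	{
		const struct demo_ops *ops = &demo_vmx_like_ops;	/* chosen once at init */
		struct demo_vcpu v = { 0 };
		uint64_t q, intr;

		ops->write_tsc_offset(&v, 1234);
		ops->get_exit_info(&v, &q, &intr);
		printf("%s: tsc_offset=%llu exit info=%llx/%llx\n", ops->name,
		       (unsigned long long)v.tsc_offset,
		       (unsigned long long)q, (unsigned long long)intr);
		return 0;
	}
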
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 3a09c625d526..77c9d8673dc4 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -6,7 +6,7 @@
6 * Copyright (C) 2006 Qumranet, Inc. 6 * Copyright (C) 2006 Qumranet, Inc.
7 * Copyright (C) 2008 Qumranet, Inc. 7 * Copyright (C) 2008 Qumranet, Inc.
8 * Copyright IBM Corporation, 2008 8 * Copyright IBM Corporation, 2008
9 * Copyright 2010 Red Hat, Inc. and/or its affilates. 9 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
10 * 10 *
11 * Authors: 11 * Authors:
12 * Avi Kivity <avi@qumranet.com> 12 * Avi Kivity <avi@qumranet.com>
@@ -43,6 +43,7 @@
43#include <linux/slab.h> 43#include <linux/slab.h>
44#include <linux/perf_event.h> 44#include <linux/perf_event.h>
45#include <linux/uaccess.h> 45#include <linux/uaccess.h>
46#include <linux/hash.h>
46#include <trace/events/kvm.h> 47#include <trace/events/kvm.h>
47 48
48#define CREATE_TRACE_POINTS 49#define CREATE_TRACE_POINTS
@@ -55,32 +56,25 @@
55#include <asm/mce.h> 56#include <asm/mce.h>
56#include <asm/i387.h> 57#include <asm/i387.h>
57#include <asm/xcr.h> 58#include <asm/xcr.h>
59#include <asm/pvclock.h>
60#include <asm/div64.h>
58 61
59#define MAX_IO_MSRS 256 62#define MAX_IO_MSRS 256
60#define CR0_RESERVED_BITS \
61 (~(unsigned long)(X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | X86_CR0_TS \
62 | X86_CR0_ET | X86_CR0_NE | X86_CR0_WP | X86_CR0_AM \
63 | X86_CR0_NW | X86_CR0_CD | X86_CR0_PG))
64#define CR4_RESERVED_BITS \
65 (~(unsigned long)(X86_CR4_VME | X86_CR4_PVI | X86_CR4_TSD | X86_CR4_DE\
66 | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_MCE \
67 | X86_CR4_PGE | X86_CR4_PCE | X86_CR4_OSFXSR \
68 | X86_CR4_OSXSAVE \
69 | X86_CR4_OSXMMEXCPT | X86_CR4_VMXE))
70
71#define CR8_RESERVED_BITS (~(unsigned long)X86_CR8_TPR)
72
73#define KVM_MAX_MCE_BANKS 32 63#define KVM_MAX_MCE_BANKS 32
74#define KVM_MCE_CAP_SUPPORTED MCG_CTL_P 64#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)
65
66#define emul_to_vcpu(ctxt) \
67 container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)
75 68
76/* EFER defaults: 69/* EFER defaults:
77 * - enable syscall per default because its emulated by KVM 70 * - enable syscall per default because its emulated by KVM
78 * - enable LME and LMA per default on 64 bit KVM 71 * - enable LME and LMA per default on 64 bit KVM
79 */ 72 */
80#ifdef CONFIG_X86_64 73#ifdef CONFIG_X86_64
81static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffafeULL; 74static
75u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
82#else 76#else
83static u64 __read_mostly efer_reserved_bits = 0xfffffffffffffffeULL; 77static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
84#endif 78#endif
85 79
86#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM 80#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
@@ -96,6 +90,11 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
96int ignore_msrs = 0; 90int ignore_msrs = 0;
97module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 91module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
98 92
93bool kvm_has_tsc_control;
94EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
95u32 kvm_max_guest_tsc_khz;
96EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);
97
99#define KVM_NR_SHARED_MSRS 16 98#define KVM_NR_SHARED_MSRS 16
100 99
101struct kvm_shared_msrs_global { 100struct kvm_shared_msrs_global {
@@ -153,9 +152,13 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
153 152
154u64 __read_mostly host_xcr0; 153u64 __read_mostly host_xcr0;
155 154
156static inline u32 bit(int bitno) 155int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);
156
157static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
157{ 158{
158 return 1 << (bitno & 31); 159 int i;
160 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
161 vcpu->arch.apf.gfns[i] = ~0;
159} 162}
160 163
161static void kvm_on_user_return(struct user_return_notifier *urn) 164static void kvm_on_user_return(struct user_return_notifier *urn)
@@ -282,6 +285,8 @@ static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
282 u32 prev_nr; 285 u32 prev_nr;
283 int class1, class2; 286 int class1, class2;
284 287
288 kvm_make_request(KVM_REQ_EVENT, vcpu);
289
285 if (!vcpu->arch.exception.pending) { 290 if (!vcpu->arch.exception.pending) {
286 queue: 291 queue:
287 vcpu->arch.exception.pending = true; 292 vcpu->arch.exception.pending = true;
@@ -327,16 +332,33 @@ void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
327} 332}
328EXPORT_SYMBOL_GPL(kvm_requeue_exception); 333EXPORT_SYMBOL_GPL(kvm_requeue_exception);
329 334
330void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr, 335void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
331 u32 error_code) 336{
337 if (err)
338 kvm_inject_gp(vcpu, 0);
339 else
340 kvm_x86_ops->skip_emulated_instruction(vcpu);
341}
342EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);
343
344void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
332{ 345{
333 ++vcpu->stat.pf_guest; 346 ++vcpu->stat.pf_guest;
334 vcpu->arch.cr2 = addr; 347 vcpu->arch.cr2 = fault->address;
335 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 348 kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
349}
350
351void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
352{
353 if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
354 vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
355 else
356 vcpu->arch.mmu.inject_page_fault(vcpu, fault);
336} 357}
337 358
338void kvm_inject_nmi(struct kvm_vcpu *vcpu) 359void kvm_inject_nmi(struct kvm_vcpu *vcpu)
339{ 360{
361 kvm_make_request(KVM_REQ_EVENT, vcpu);
340 vcpu->arch.nmi_pending = 1; 362 vcpu->arch.nmi_pending = 1;
341} 363}
342EXPORT_SYMBOL_GPL(kvm_inject_nmi); 364EXPORT_SYMBOL_GPL(kvm_inject_nmi);
@@ -367,18 +389,49 @@ bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
367EXPORT_SYMBOL_GPL(kvm_require_cpl); 389EXPORT_SYMBOL_GPL(kvm_require_cpl);
368 390
369/* 391/*
392 * This function will be used to read from the physical memory of the currently
393 * running guest. The difference to kvm_read_guest_page is that this function
394 * can read from guest physical or from the guest's guest physical memory.
395 */
396int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
397 gfn_t ngfn, void *data, int offset, int len,
398 u32 access)
399{
400 gfn_t real_gfn;
401 gpa_t ngpa;
402
403 ngpa = gfn_to_gpa(ngfn);
404 real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
405 if (real_gfn == UNMAPPED_GVA)
406 return -EFAULT;
407
408 real_gfn = gpa_to_gfn(real_gfn);
409
410 return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
411}
412EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);
413
414int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
415 void *data, int offset, int len, u32 access)
416{
417 return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
418 data, offset, len, access);
419}
420
421/*
 370 * Load the pae pdptrs. Return true if they are all valid. 422 * Load the pae pdptrs. Return true if they are all valid.
371 */ 423 */
372int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3) 424int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
373{ 425{
374 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT; 426 gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
375 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2; 427 unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
376 int i; 428 int i;
377 int ret; 429 int ret;
378 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 430 u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];
379 431
380 ret = kvm_read_guest_page(vcpu->kvm, pdpt_gfn, pdpte, 432 ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
381 offset * sizeof(u64), sizeof(pdpte)); 433 offset * sizeof(u64), sizeof(pdpte),
434 PFERR_USER_MASK|PFERR_WRITE_MASK);
382 if (ret < 0) { 435 if (ret < 0) {
383 ret = 0; 436 ret = 0;
384 goto out; 437 goto out;
@@ -392,7 +445,7 @@ int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
392 } 445 }
393 ret = 1; 446 ret = 1;
394 447
395 memcpy(vcpu->arch.pdptrs, pdpte, sizeof(vcpu->arch.pdptrs)); 448 memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
396 __set_bit(VCPU_EXREG_PDPTR, 449 __set_bit(VCPU_EXREG_PDPTR,
397 (unsigned long *)&vcpu->arch.regs_avail); 450 (unsigned long *)&vcpu->arch.regs_avail);
398 __set_bit(VCPU_EXREG_PDPTR, 451 __set_bit(VCPU_EXREG_PDPTR,
@@ -405,8 +458,10 @@ EXPORT_SYMBOL_GPL(load_pdptrs);
405 458
406static bool pdptrs_changed(struct kvm_vcpu *vcpu) 459static bool pdptrs_changed(struct kvm_vcpu *vcpu)
407{ 460{
408 u64 pdpte[ARRAY_SIZE(vcpu->arch.pdptrs)]; 461 u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
409 bool changed = true; 462 bool changed = true;
463 int offset;
464 gfn_t gfn;
410 int r; 465 int r;
411 466
412 if (is_long_mode(vcpu) || !is_pae(vcpu)) 467 if (is_long_mode(vcpu) || !is_pae(vcpu))
@@ -416,10 +471,13 @@ static bool pdptrs_changed(struct kvm_vcpu *vcpu)
416 (unsigned long *)&vcpu->arch.regs_avail)) 471 (unsigned long *)&vcpu->arch.regs_avail))
417 return true; 472 return true;
418 473
419 r = kvm_read_guest(vcpu->kvm, vcpu->arch.cr3 & ~31u, pdpte, sizeof(pdpte)); 474 gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
475 offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
476 r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
477 PFERR_USER_MASK | PFERR_WRITE_MASK);
420 if (r < 0) 478 if (r < 0)
421 goto out; 479 goto out;
422 changed = memcmp(pdpte, vcpu->arch.pdptrs, sizeof(pdpte)) != 0; 480 changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
423out: 481out:
424 482
425 return changed; 483 return changed;
@@ -458,12 +516,18 @@ int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
458 return 1; 516 return 1;
459 } else 517 } else
460#endif 518#endif
461 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) 519 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
520 kvm_read_cr3(vcpu)))
462 return 1; 521 return 1;
463 } 522 }
464 523
465 kvm_x86_ops->set_cr0(vcpu, cr0); 524 kvm_x86_ops->set_cr0(vcpu, cr0);
466 525
526 if ((cr0 ^ old_cr0) & X86_CR0_PG) {
527 kvm_clear_async_pf_completion_queue(vcpu);
528 kvm_async_pf_hash_reset(vcpu);
529 }
530
467 if ((cr0 ^ old_cr0) & update_bits) 531 if ((cr0 ^ old_cr0) & update_bits)
468 kvm_mmu_reset_context(vcpu); 532 kvm_mmu_reset_context(vcpu);
469 return 0; 533 return 0;
@@ -547,7 +611,8 @@ int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
547 return 1; 611 return 1;
548 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 612 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
549 && ((cr4 ^ old_cr4) & pdptr_bits) 613 && ((cr4 ^ old_cr4) & pdptr_bits)
550 && !load_pdptrs(vcpu, vcpu->arch.cr3)) 614 && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
615 kvm_read_cr3(vcpu)))
551 return 1; 616 return 1;
552 617
553 if (cr4 & X86_CR4_VMXE) 618 if (cr4 & X86_CR4_VMXE)
@@ -567,7 +632,7 @@ EXPORT_SYMBOL_GPL(kvm_set_cr4);
567 632
568int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) 633int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
569{ 634{
570 if (cr3 == vcpu->arch.cr3 && !pdptrs_changed(vcpu)) { 635 if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
571 kvm_mmu_sync_roots(vcpu); 636 kvm_mmu_sync_roots(vcpu);
572 kvm_mmu_flush_tlb(vcpu); 637 kvm_mmu_flush_tlb(vcpu);
573 return 0; 638 return 0;
@@ -580,7 +645,8 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
580 if (is_pae(vcpu)) { 645 if (is_pae(vcpu)) {
581 if (cr3 & CR3_PAE_RESERVED_BITS) 646 if (cr3 & CR3_PAE_RESERVED_BITS)
582 return 1; 647 return 1;
583 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) 648 if (is_paging(vcpu) &&
649 !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
584 return 1; 650 return 1;
585 } 651 }
586 /* 652 /*
@@ -601,12 +667,13 @@ int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
601 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT))) 667 if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
602 return 1; 668 return 1;
603 vcpu->arch.cr3 = cr3; 669 vcpu->arch.cr3 = cr3;
670 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
604 vcpu->arch.mmu.new_cr3(vcpu); 671 vcpu->arch.mmu.new_cr3(vcpu);
605 return 0; 672 return 0;
606} 673}
607EXPORT_SYMBOL_GPL(kvm_set_cr3); 674EXPORT_SYMBOL_GPL(kvm_set_cr3);
608 675
609int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 676int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
610{ 677{
611 if (cr8 & CR8_RESERVED_BITS) 678 if (cr8 & CR8_RESERVED_BITS)
612 return 1; 679 return 1;
@@ -616,12 +683,6 @@ int __kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
616 vcpu->arch.cr8 = cr8; 683 vcpu->arch.cr8 = cr8;
617 return 0; 684 return 0;
618} 685}
619
620void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
621{
622 if (__kvm_set_cr8(vcpu, cr8))
623 kvm_inject_gp(vcpu, 0);
624}
625EXPORT_SYMBOL_GPL(kvm_set_cr8); 686EXPORT_SYMBOL_GPL(kvm_set_cr8);
626 687
627unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu) 688unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
@@ -726,18 +787,18 @@ EXPORT_SYMBOL_GPL(kvm_get_dr);
726 * kvm-specific. Those are put in the beginning of the list. 787 * kvm-specific. Those are put in the beginning of the list.
727 */ 788 */
728 789
729#define KVM_SAVE_MSRS_BEGIN 7 790#define KVM_SAVE_MSRS_BEGIN 8
730static u32 msrs_to_save[] = { 791static u32 msrs_to_save[] = {
731 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 792 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
732 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW, 793 MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
733 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL, 794 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
734 HV_X64_MSR_APIC_ASSIST_PAGE, 795 HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN,
735 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 796 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
736 MSR_STAR, 797 MSR_STAR,
737#ifdef CONFIG_X86_64 798#ifdef CONFIG_X86_64
738 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 799 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
739#endif 800#endif
740 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA 801 MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
741}; 802};
742 803
743static unsigned num_msrs_to_save; 804static unsigned num_msrs_to_save;
@@ -781,7 +842,6 @@ static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
781 kvm_x86_ops->set_efer(vcpu, efer); 842 kvm_x86_ops->set_efer(vcpu, efer);
782 843
783 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 844 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
784 kvm_mmu_reset_context(vcpu);
785 845
786 /* Update reserved bits */ 846 /* Update reserved bits */
787 if ((efer ^ old_efer) & EFER_NX) 847 if ((efer ^ old_efer) & EFER_NX)
@@ -838,7 +898,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
838 898
839 /* 899 /*
840 * The guest calculates current wall clock time by adding 900 * The guest calculates current wall clock time by adding
841 * system time (updated by kvm_write_guest_time below) to the 901 * system time (updated by kvm_guest_time_update below) to the
842 * wall clock specified here. guest system time equals host 902 * wall clock specified here. guest system time equals host
843 * system time for us, thus we must fill in host boot time here. 903 * system time for us, thus we must fill in host boot time here.
844 */ 904 */
@@ -866,65 +926,235 @@ static uint32_t div_frac(uint32_t dividend, uint32_t divisor)
866 return quotient; 926 return quotient;
867} 927}
868 928
869static void kvm_set_time_scale(uint32_t tsc_khz, struct pvclock_vcpu_time_info *hv_clock) 929static void kvm_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
930 s8 *pshift, u32 *pmultiplier)
870{ 931{
871 uint64_t nsecs = 1000000000LL; 932 uint64_t scaled64;
872 int32_t shift = 0; 933 int32_t shift = 0;
873 uint64_t tps64; 934 uint64_t tps64;
874 uint32_t tps32; 935 uint32_t tps32;
875 936
876 tps64 = tsc_khz * 1000LL; 937 tps64 = base_khz * 1000LL;
877 while (tps64 > nsecs*2) { 938 scaled64 = scaled_khz * 1000LL;
939 while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) {
878 tps64 >>= 1; 940 tps64 >>= 1;
879 shift--; 941 shift--;
880 } 942 }
881 943
882 tps32 = (uint32_t)tps64; 944 tps32 = (uint32_t)tps64;
883 while (tps32 <= (uint32_t)nsecs) { 945 while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
884 tps32 <<= 1; 946 if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
947 scaled64 >>= 1;
948 else
949 tps32 <<= 1;
885 shift++; 950 shift++;
886 } 951 }
887 952
888 hv_clock->tsc_shift = shift; 953 *pshift = shift;
889 hv_clock->tsc_to_system_mul = div_frac(nsecs, tps32); 954 *pmultiplier = div_frac(scaled64, tps32);
890 955
891 pr_debug("%s: tsc_khz %u, tsc_shift %d, tsc_mul %u\n", 956 pr_debug("%s: base_khz %u => %u, shift %d, mul %u\n",
892 __func__, tsc_khz, hv_clock->tsc_shift, 957 __func__, base_khz, scaled_khz, shift, *pmultiplier);
893 hv_clock->tsc_to_system_mul); 958}
959
960static inline u64 get_kernel_ns(void)
961{
962 struct timespec ts;
963
964 WARN_ON(preemptible());
965 ktime_get_ts(&ts);
966 monotonic_to_bootbased(&ts);
967 return timespec_to_ns(&ts);
894} 968}
895 969
896static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); 970static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz);
971unsigned long max_tsc_khz;
897 972
898static void kvm_write_guest_time(struct kvm_vcpu *v) 973static inline int kvm_tsc_changes_freq(void)
974{
975 int cpu = get_cpu();
976 int ret = !boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
977 cpufreq_quick_get(cpu) != 0;
978 put_cpu();
979 return ret;
980}
981
982static u64 vcpu_tsc_khz(struct kvm_vcpu *vcpu)
983{
984 if (vcpu->arch.virtual_tsc_khz)
985 return vcpu->arch.virtual_tsc_khz;
986 else
987 return __this_cpu_read(cpu_tsc_khz);
988}
989
990static inline u64 nsec_to_cycles(struct kvm_vcpu *vcpu, u64 nsec)
991{
992 u64 ret;
993
994 WARN_ON(preemptible());
995 if (kvm_tsc_changes_freq())
996 printk_once(KERN_WARNING
997 "kvm: unreliable cycle conversion on adjustable rate TSC\n");
998 ret = nsec * vcpu_tsc_khz(vcpu);
999 do_div(ret, USEC_PER_SEC);
1000 return ret;
1001}
1002
1003static void kvm_init_tsc_catchup(struct kvm_vcpu *vcpu, u32 this_tsc_khz)
1004{
1005 /* Compute a scale to convert nanoseconds in TSC cycles */
1006 kvm_get_time_scale(this_tsc_khz, NSEC_PER_SEC / 1000,
1007 &vcpu->arch.tsc_catchup_shift,
1008 &vcpu->arch.tsc_catchup_mult);
1009}
1010
1011static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns)
1012{
1013 u64 tsc = pvclock_scale_delta(kernel_ns-vcpu->arch.last_tsc_nsec,
1014 vcpu->arch.tsc_catchup_mult,
1015 vcpu->arch.tsc_catchup_shift);
1016 tsc += vcpu->arch.last_tsc_write;
1017 return tsc;
1018}
1019
1020void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data)
1021{
1022 struct kvm *kvm = vcpu->kvm;
1023 u64 offset, ns, elapsed;
1024 unsigned long flags;
1025 s64 sdiff;
1026
1027 raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags);
1028 offset = kvm_x86_ops->compute_tsc_offset(vcpu, data);
1029 ns = get_kernel_ns();
1030 elapsed = ns - kvm->arch.last_tsc_nsec;
1031 sdiff = data - kvm->arch.last_tsc_write;
1032 if (sdiff < 0)
1033 sdiff = -sdiff;
1034
1035 /*
1036 * Special case: close write to TSC within 5 seconds of
1037 * another CPU is interpreted as an attempt to synchronize.
1038 * The 5 seconds is to accommodate host load / swapping as
1039 * well as any reset of TSC during the boot process.
1040 *
1041 * In that case, for a reliable TSC, we can match TSC offsets,
1042 * or make a best guess using the elapsed value.
1043 */
1044 if (sdiff < nsec_to_cycles(vcpu, 5ULL * NSEC_PER_SEC) &&
1045 elapsed < 5ULL * NSEC_PER_SEC) {
1046 if (!check_tsc_unstable()) {
1047 offset = kvm->arch.last_tsc_offset;
1048 pr_debug("kvm: matched tsc offset for %llu\n", data);
1049 } else {
1050 u64 delta = nsec_to_cycles(vcpu, elapsed);
1051 offset += delta;
1052 pr_debug("kvm: adjusted tsc offset by %llu\n", delta);
1053 }
1054 ns = kvm->arch.last_tsc_nsec;
1055 }
1056 kvm->arch.last_tsc_nsec = ns;
1057 kvm->arch.last_tsc_write = data;
1058 kvm->arch.last_tsc_offset = offset;
1059 kvm_x86_ops->write_tsc_offset(vcpu, offset);
1060 raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags);
1061
1062 /* Reset of TSC must disable overshoot protection below */
1063 vcpu->arch.hv_clock.tsc_timestamp = 0;
1064 vcpu->arch.last_tsc_write = data;
1065 vcpu->arch.last_tsc_nsec = ns;
1066}
1067EXPORT_SYMBOL_GPL(kvm_write_tsc);
1068
1069static int kvm_guest_time_update(struct kvm_vcpu *v)
899{ 1070{
900 struct timespec ts;
901 unsigned long flags; 1071 unsigned long flags;
902 struct kvm_vcpu_arch *vcpu = &v->arch; 1072 struct kvm_vcpu_arch *vcpu = &v->arch;
903 void *shared_kaddr; 1073 void *shared_kaddr;
904 unsigned long this_tsc_khz; 1074 unsigned long this_tsc_khz;
1075 s64 kernel_ns, max_kernel_ns;
1076 u64 tsc_timestamp;
905 1077
906 if ((!vcpu->time_page)) 1078 /* Keep irq disabled to prevent changes to the clock */
907 return; 1079 local_irq_save(flags);
1080 kvm_get_msr(v, MSR_IA32_TSC, &tsc_timestamp);
1081 kernel_ns = get_kernel_ns();
1082 this_tsc_khz = vcpu_tsc_khz(v);
1083 if (unlikely(this_tsc_khz == 0)) {
1084 local_irq_restore(flags);
1085 kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
1086 return 1;
1087 }
908 1088
909 this_tsc_khz = get_cpu_var(cpu_tsc_khz); 1089 /*
910 if (unlikely(vcpu->hv_clock_tsc_khz != this_tsc_khz)) { 1090 * We may have to catch up the TSC to match elapsed wall clock
911 kvm_set_time_scale(this_tsc_khz, &vcpu->hv_clock); 1091 * time for two reasons, even if kvmclock is used.
912 vcpu->hv_clock_tsc_khz = this_tsc_khz; 1092 * 1) CPU could have been running below the maximum TSC rate
1093 * 2) Broken TSC compensation resets the base at each VCPU
1094 * entry to avoid unknown leaps of TSC even when running
1095 * again on the same CPU. This may cause apparent elapsed
1096 * time to disappear, and the guest to stand still or run
1097 * very slowly.
1098 */
1099 if (vcpu->tsc_catchup) {
1100 u64 tsc = compute_guest_tsc(v, kernel_ns);
1101 if (tsc > tsc_timestamp) {
1102 kvm_x86_ops->adjust_tsc_offset(v, tsc - tsc_timestamp);
1103 tsc_timestamp = tsc;
1104 }
913 } 1105 }
914 put_cpu_var(cpu_tsc_khz);
915 1106
916 /* Keep irq disabled to prevent changes to the clock */
917 local_irq_save(flags);
918 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
919 ktime_get_ts(&ts);
920 monotonic_to_bootbased(&ts);
921 local_irq_restore(flags); 1107 local_irq_restore(flags);
922 1108
923 /* With all the info we got, fill in the values */ 1109 if (!vcpu->time_page)
1110 return 0;
924 1111
925 vcpu->hv_clock.system_time = ts.tv_nsec + 1112 /*
926 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset; 1113 * Time as measured by the TSC may go backwards when resetting the base
1114 * tsc_timestamp. The reason for this is that the TSC resolution is
1115 * higher than the resolution of the other clock scales. Thus, many
1116 * possible measurements of the TSC correspond to one measurement of any
1117 * other clock, and so a spread of values is possible. This is not a
1118 * problem for the computation of the nanosecond clock; with TSC rates
1119 * around 1GHZ, there can only be a few cycles which correspond to one
1120 * nanosecond value, and any path through this code will inevitably
1121 * take longer than that. However, with the kernel_ns value itself,
1122 * the precision may be much lower, down to HZ granularity. If the
1123 * first sampling of TSC against kernel_ns ends in the low part of the
1124 * range, and the second in the high end of the range, we can get:
1125 *
1126 * (TSC - offset_low) * S + kns_old > (TSC - offset_high) * S + kns_new
1127 *
1128 * As the sampling errors potentially range in the thousands of cycles,
1129 * it is possible such a time value has already been observed by the
1130 * guest. To protect against this, we must compute the system time as
1131 * observed by the guest and ensure the new system time is greater.
1132 */
1133 max_kernel_ns = 0;
1134 if (vcpu->hv_clock.tsc_timestamp && vcpu->last_guest_tsc) {
1135 max_kernel_ns = vcpu->last_guest_tsc -
1136 vcpu->hv_clock.tsc_timestamp;
1137 max_kernel_ns = pvclock_scale_delta(max_kernel_ns,
1138 vcpu->hv_clock.tsc_to_system_mul,
1139 vcpu->hv_clock.tsc_shift);
1140 max_kernel_ns += vcpu->last_kernel_ns;
1141 }
927 1142
1143 if (unlikely(vcpu->hw_tsc_khz != this_tsc_khz)) {
1144 kvm_get_time_scale(NSEC_PER_SEC / 1000, this_tsc_khz,
1145 &vcpu->hv_clock.tsc_shift,
1146 &vcpu->hv_clock.tsc_to_system_mul);
1147 vcpu->hw_tsc_khz = this_tsc_khz;
1148 }
1149
1150 if (max_kernel_ns > kernel_ns)
1151 kernel_ns = max_kernel_ns;
1152
1153 /* With all the info we got, fill in the values */
1154 vcpu->hv_clock.tsc_timestamp = tsc_timestamp;
1155 vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset;
1156 vcpu->last_kernel_ns = kernel_ns;
1157 vcpu->last_guest_tsc = tsc_timestamp;
928 vcpu->hv_clock.flags = 0; 1158 vcpu->hv_clock.flags = 0;
929 1159
930 /* 1160 /*
@@ -942,16 +1172,7 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
942 kunmap_atomic(shared_kaddr, KM_USER0); 1172 kunmap_atomic(shared_kaddr, KM_USER0);
943 1173
944 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT); 1174 mark_page_dirty(v->kvm, vcpu->time >> PAGE_SHIFT);
945} 1175 return 0;
946
947static int kvm_request_guest_time_update(struct kvm_vcpu *v)
948{
949 struct kvm_vcpu_arch *vcpu = &v->arch;
950
951 if (!vcpu->time_page)
952 return 0;
953 kvm_make_request(KVM_REQ_KVMCLOCK_UPDATE, v);
954 return 1;
955} 1176}
956 1177
957static bool msr_mtrr_valid(unsigned msr) 1178static bool msr_mtrr_valid(unsigned msr)
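
[Editor's note] kvm_get_time_scale() above picks a shift and a 32-bit multiplier so that a delta in base units converts with one multiply and a shift: scaled ~= ((delta << shift) * mul) >> 32, a negative shift meaning a right shift, which is exactly the form kvmclock/pvclock consumes. A standalone sketch that mirrors the loop and checks the conversion for an assumed 2.6 GHz TSC (plain 64-bit math here; the kernel keeps a wider intermediate):

	#include <stdio.h>
	#include <stdint.h>

	/* (dividend << 32) / divisor, as the patch's div_frac() computes via divl. */
	static uint32_t demo_div_frac(uint32_t dividend, uint32_t divisor)
	{
		return (uint32_t)(((uint64_t)dividend << 32) / divisor);
	}

	/* Same loop structure as kvm_get_time_scale(). */
	static void demo_get_time_scale(uint32_t scaled_khz, uint32_t base_khz,
					int8_t *pshift, uint32_t *pmultiplier)
	{
		uint64_t scaled64 = scaled_khz * 1000ULL;
		uint64_t tps64 = base_khz * 1000ULL;
		uint32_t tps32;
		int8_t shift = 0;

		while (tps64 > scaled64 * 2 || tps64 & 0xffffffff00000000ULL) {
			tps64 >>= 1;
			shift--;
		}
		tps32 = (uint32_t)tps64;
		while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) {
			if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000)
				scaled64 >>= 1;
			else
				tps32 <<= 1;
			shift++;
		}
		*pshift = shift;
		*pmultiplier = demo_div_frac((uint32_t)scaled64, tps32);
	}

	/* Apply the scale, pvclock style; inputs kept small enough for 64-bit math. */
	static uint64_t demo_scale_delta(uint64_t delta, uint32_t mul, int8_t shift)
	{
		if (shift < 0)
			delta >>= -shift;
		else
			delta <<= shift;
		return (delta * mul) >> 32;
	}

	int main(void)
	{
		int8_t shift;
		uint32_t mul;
		uint64_t one_second_of_tsc = 2600000000ULL;	/* assume a 2.6 GHz TSC */

		/* TSC kHz -> 1 GHz units, i.e. cycles -> nanoseconds. */
		demo_get_time_scale(1000000, 2600000, &shift, &mul);
		printf("shift=%d mul=%u -> %llu ns per 1s of cycles\n",
		       shift, (unsigned)mul,
		       (unsigned long long)demo_scale_delta(one_second_of_tsc, mul, shift));
		return 0;
	}

For these inputs the loops yield shift = -1 and one second's worth of cycles converts to just under 10^9 ns, so the rounding error of the fixed-point representation stays tiny.
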
@@ -1214,6 +1435,38 @@ static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1214 return 0; 1435 return 0;
1215} 1436}
1216 1437
1438static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data)
1439{
1440 gpa_t gpa = data & ~0x3f;
1441
1442 /* Bits 2:5 are reserved, should be zero */
1443 if (data & 0x3c)
1444 return 1;
1445
1446 vcpu->arch.apf.msr_val = data;
1447
1448 if (!(data & KVM_ASYNC_PF_ENABLED)) {
1449 kvm_clear_async_pf_completion_queue(vcpu);
1450 kvm_async_pf_hash_reset(vcpu);
1451 return 0;
1452 }
1453
1454 if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa))
1455 return 1;
1456
1457 vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS);
1458 kvm_async_pf_wakeup_all(vcpu);
1459 return 0;
1460}
1461
1462static void kvmclock_reset(struct kvm_vcpu *vcpu)
1463{
1464 if (vcpu->arch.time_page) {
1465 kvm_release_page_dirty(vcpu->arch.time_page);
1466 vcpu->arch.time_page = NULL;
1467 }
1468}
1469
1217int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1470int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1218{ 1471{
1219 switch (msr) { 1472 switch (msr) {
@@ -1271,12 +1524,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1271 break; 1524 break;
1272 case MSR_KVM_SYSTEM_TIME_NEW: 1525 case MSR_KVM_SYSTEM_TIME_NEW:
1273 case MSR_KVM_SYSTEM_TIME: { 1526 case MSR_KVM_SYSTEM_TIME: {
1274 if (vcpu->arch.time_page) { 1527 kvmclock_reset(vcpu);
1275 kvm_release_page_dirty(vcpu->arch.time_page);
1276 vcpu->arch.time_page = NULL;
1277 }
1278 1528
1279 vcpu->arch.time = data; 1529 vcpu->arch.time = data;
1530 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
1280 1531
1281 /* we verify if the enable bit is set... */ 1532 /* we verify if the enable bit is set... */
1282 if (!(data & 1)) 1533 if (!(data & 1))
@@ -1292,10 +1543,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1292 kvm_release_page_clean(vcpu->arch.time_page); 1543 kvm_release_page_clean(vcpu->arch.time_page);
1293 vcpu->arch.time_page = NULL; 1544 vcpu->arch.time_page = NULL;
1294 } 1545 }
1295
1296 kvm_request_guest_time_update(vcpu);
1297 break; 1546 break;
1298 } 1547 }
1548 case MSR_KVM_ASYNC_PF_EN:
1549 if (kvm_pv_enable_async_pf(vcpu, data))
1550 return 1;
1551 break;
1299 case MSR_IA32_MCG_CTL: 1552 case MSR_IA32_MCG_CTL:
1300 case MSR_IA32_MCG_STATUS: 1553 case MSR_IA32_MCG_STATUS:
1301 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1554 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
@@ -1330,6 +1583,16 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1330 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1583 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
1331 "0x%x data 0x%llx\n", msr, data); 1584 "0x%x data 0x%llx\n", msr, data);
1332 break; 1585 break;
1586 case MSR_K7_CLK_CTL:
1587 /*
1588 * Ignore all writes to this no longer documented MSR.
1589 * Writes are only relevant for old K7 processors,
1590 * all pre-dating SVM, but a recommended workaround from
1591 * AMD for these chips. It is possible to speicify the
1592 * affected processor models on the command line, hence
1593 * the need to ignore the workaround.
1594 */
1595 break;
1333 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1596 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1334 if (kvm_hv_msr_partition_wide(msr)) { 1597 if (kvm_hv_msr_partition_wide(msr)) {
1335 int r; 1598 int r;
@@ -1340,6 +1603,12 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1340 } else 1603 } else
1341 return set_msr_hyperv(vcpu, msr, data); 1604 return set_msr_hyperv(vcpu, msr, data);
1342 break; 1605 break;
1606 case MSR_IA32_BBL_CR_CTL3:
1607 /* Drop writes to this legacy MSR -- see rdmsr
1608 * counterpart for further detail.
1609 */
1610 pr_unimpl(vcpu, "ignored wrmsr: 0x%x data %llx\n", msr, data);
1611 break;
1343 default: 1612 default:
1344 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr)) 1613 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1345 return xen_hvm_config(vcpu, data); 1614 return xen_hvm_config(vcpu, data);
@@ -1522,6 +1791,20 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1522 case 0xcd: /* fsb frequency */ 1791 case 0xcd: /* fsb frequency */
1523 data = 3; 1792 data = 3;
1524 break; 1793 break;
1794 /*
1795 * MSR_EBC_FREQUENCY_ID
1796 * Conservative value valid for even the basic CPU models.
1797 * Models 0,1: 000 in bits 23:21 indicating a bus speed of
1798 * 100MHz, model 2 000 in bits 18:16 indicating 100MHz,
1799 * and 266MHz for model 3, or 4. Set Core Clock
1800 * Frequency to System Bus Frequency Ratio to 1 (bits
1801 * 31:24) even though these are only valid for CPU
1802 * models > 2, however guests may end up dividing or
1803 * multiplying by zero otherwise.
1804 */
1805 case MSR_EBC_FREQUENCY_ID:
1806 data = 1 << 24;
1807 break;
1525 case MSR_IA32_APICBASE: 1808 case MSR_IA32_APICBASE:
1526 data = kvm_get_apic_base(vcpu); 1809 data = kvm_get_apic_base(vcpu);
1527 break; 1810 break;
@@ -1548,6 +1831,9 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1548 case MSR_KVM_SYSTEM_TIME_NEW: 1831 case MSR_KVM_SYSTEM_TIME_NEW:
1549 data = vcpu->arch.time; 1832 data = vcpu->arch.time;
1550 break; 1833 break;
1834 case MSR_KVM_ASYNC_PF_EN:
1835 data = vcpu->arch.apf.msr_val;
1836 break;
1551 case MSR_IA32_P5_MC_ADDR: 1837 case MSR_IA32_P5_MC_ADDR:
1552 case MSR_IA32_P5_MC_TYPE: 1838 case MSR_IA32_P5_MC_TYPE:
1553 case MSR_IA32_MCG_CAP: 1839 case MSR_IA32_MCG_CAP:
@@ -1555,6 +1841,18 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1555 case MSR_IA32_MCG_STATUS: 1841 case MSR_IA32_MCG_STATUS:
1556 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1842 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1557 return get_msr_mce(vcpu, msr, pdata); 1843 return get_msr_mce(vcpu, msr, pdata);
1844 case MSR_K7_CLK_CTL:
1845 /*
1846 * Provide expected ramp-up count for K7. All other
1847 * are set to zero, indicating minimum divisors for
1848 * every field.
1849 *
1850 * This prevents guest kernels on AMD host with CPU
1851 * type 6, model 8 and higher from exploding due to
1852 * the rdmsr failing.
1853 */
1854 data = 0x20000000;
1855 break;
1558 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: 1856 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1559 if (kvm_hv_msr_partition_wide(msr)) { 1857 if (kvm_hv_msr_partition_wide(msr)) {
1560 int r; 1858 int r;
@@ -1565,6 +1863,19 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1565 } else 1863 } else
1566 return get_msr_hyperv(vcpu, msr, pdata); 1864 return get_msr_hyperv(vcpu, msr, pdata);
1567 break; 1865 break;
1866 case MSR_IA32_BBL_CR_CTL3:
1867 /* This legacy MSR exists but isn't fully documented in current
1868 * silicon. It is however accessed by winxp in very narrow
1869 * scenarios where it sets bit #19, itself documented as
1870 * a "reserved" bit. Best effort attempt to source coherent
1871 * read data here should the balance of the register be
1872 * interpreted by the guest:
1873 *
1874 * L2 cache control register 3: 64GB range, 256KB size,
1875 * enabled, latency 0x1, configured
1876 */
1877 data = 0xbe702111;
1878 break;
1568 default: 1879 default:
1569 if (!ignore_msrs) { 1880 if (!ignore_msrs) {
1570 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1881 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1665,6 +1976,7 @@ int kvm_dev_ioctl_check_extension(long ext)
1665 case KVM_CAP_NOP_IO_DELAY: 1976 case KVM_CAP_NOP_IO_DELAY:
1666 case KVM_CAP_MP_STATE: 1977 case KVM_CAP_MP_STATE:
1667 case KVM_CAP_SYNC_MMU: 1978 case KVM_CAP_SYNC_MMU:
1979 case KVM_CAP_USER_NMI:
1668 case KVM_CAP_REINJECT_CONTROL: 1980 case KVM_CAP_REINJECT_CONTROL:
1669 case KVM_CAP_IRQ_INJECT_STATUS: 1981 case KVM_CAP_IRQ_INJECT_STATUS:
1670 case KVM_CAP_ASSIGN_DEV_IRQ: 1982 case KVM_CAP_ASSIGN_DEV_IRQ:
@@ -1683,6 +1995,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1683 case KVM_CAP_DEBUGREGS: 1995 case KVM_CAP_DEBUGREGS:
1684 case KVM_CAP_X86_ROBUST_SINGLESTEP: 1996 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1685 case KVM_CAP_XSAVE: 1997 case KVM_CAP_XSAVE:
1998 case KVM_CAP_ASYNC_PF:
1999 case KVM_CAP_GET_TSC_KHZ:
1686 r = 1; 2000 r = 1;
1687 break; 2001 break;
1688 case KVM_CAP_COALESCED_MMIO: 2002 case KVM_CAP_COALESCED_MMIO:
@@ -1709,6 +2023,9 @@ int kvm_dev_ioctl_check_extension(long ext)
1709 case KVM_CAP_XCRS: 2023 case KVM_CAP_XCRS:
1710 r = cpu_has_xsave; 2024 r = cpu_has_xsave;
1711 break; 2025 break;
2026 case KVM_CAP_TSC_CONTROL:
2027 r = kvm_has_tsc_control;
2028 break;
1712 default: 2029 default:
1713 r = 0; 2030 r = 0;
1714 break; 2031 break;
@@ -1808,19 +2125,33 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1808 } 2125 }
1809 2126
1810 kvm_x86_ops->vcpu_load(vcpu, cpu); 2127 kvm_x86_ops->vcpu_load(vcpu, cpu);
1811 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) { 2128 if (unlikely(vcpu->cpu != cpu) || check_tsc_unstable()) {
1812 unsigned long khz = cpufreq_quick_get(cpu); 2129 /* Make sure TSC doesn't go backwards */
1813 if (!khz) 2130 s64 tsc_delta;
1814 khz = tsc_khz; 2131 u64 tsc;
1815 per_cpu(cpu_tsc_khz, cpu) = khz; 2132
2133 kvm_get_msr(vcpu, MSR_IA32_TSC, &tsc);
2134 tsc_delta = !vcpu->arch.last_guest_tsc ? 0 :
2135 tsc - vcpu->arch.last_guest_tsc;
2136
2137 if (tsc_delta < 0)
2138 mark_tsc_unstable("KVM discovered backwards TSC");
2139 if (check_tsc_unstable()) {
2140 kvm_x86_ops->adjust_tsc_offset(vcpu, -tsc_delta);
2141 vcpu->arch.tsc_catchup = 1;
2142 }
2143 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
2144 if (vcpu->cpu != cpu)
2145 kvm_migrate_timers(vcpu);
2146 vcpu->cpu = cpu;
1816 } 2147 }
1817 kvm_request_guest_time_update(vcpu);
1818} 2148}
1819 2149
1820void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 2150void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1821{ 2151{
1822 kvm_x86_ops->vcpu_put(vcpu); 2152 kvm_x86_ops->vcpu_put(vcpu);
1823 kvm_put_guest_fpu(vcpu); 2153 kvm_put_guest_fpu(vcpu);
2154 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
1824} 2155}
1825 2156
1826static int is_efer_nx(void) 2157static int is_efer_nx(void)
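
[Editor's note] The vcpu_load hunk above guards against the guest-visible TSC jumping backwards when a vCPU migrates to a CPU whose TSC is behind: the last guest TSC saved in kvm_arch_vcpu_put() is compared with the value on the new CPU, and a negative delta is folded into the TSC offset while catch-up mode is armed. A rough sketch with made-up cycle counts (the real code goes through kvm_get_msr() and kvm_x86_ops->adjust_tsc_offset()):

	#include <stdio.h>
	#include <stdint.h>

	struct demo_vcpu {
		int64_t  tsc_offset;	/* added to the host TSC to form the guest TSC */
		uint64_t last_guest_tsc;	/* guest TSC sampled at vcpu_put() time */
		int      tsc_catchup;
	};

	static uint64_t demo_guest_tsc(uint64_t host_tsc, const struct demo_vcpu *v)
	{
		return host_tsc + (uint64_t)v->tsc_offset;
	}

	/* Roughly what kvm_arch_vcpu_load() does when the vcpu lands on a new CPU. */
	static void demo_vcpu_load(struct demo_vcpu *v, uint64_t host_tsc_on_new_cpu)
	{
		uint64_t tsc = demo_guest_tsc(host_tsc_on_new_cpu, v);
		int64_t delta = (int64_t)(tsc - v->last_guest_tsc);

		if (delta < 0) {
			/* Guest TSC would jump backwards: compensate, catch up later. */
			v->tsc_offset += -delta;
			v->tsc_catchup = 1;
			printf("backwards TSC by %lld cycles, offset bumped\n",
			       (long long)-delta);
		}
		printf("guest TSC now %llu (catchup=%d)\n",
		       (unsigned long long)demo_guest_tsc(host_tsc_on_new_cpu, v),
		       v->tsc_catchup);
	}

	int main(void)
	{
		/* Old CPU's TSC had reached 5,000,000; the new CPU's is at 4,200,000. */
		struct demo_vcpu v = { .tsc_offset = 0, .last_guest_tsc = 5000000 };

		demo_vcpu_load(&v, 4200000);
		return 0;
	}
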
@@ -1937,6 +2268,11 @@ out:
1937 return r; 2268 return r;
1938} 2269}
1939 2270
2271static void cpuid_mask(u32 *word, int wordnum)
2272{
2273 *word &= boot_cpu_data.x86_capability[wordnum];
2274}
2275
1940static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function, 2276static void do_cpuid_1_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1941 u32 index) 2277 u32 index)
1942{ 2278{
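
[Editor's note] cpuid_mask() above clips each feature word that KVM is willing to advertise against the matching word of the host's boot_cpu_data.x86_capability, so a guest never sees a flag the host CPU lacks even if the kvm_supported_word* whitelist would allow it. A tiny standalone sketch with invented feature bits and host capability words:

	#include <stdio.h>
	#include <stdint.h>

	#define F(bit)	(1u << (bit))	/* same spirit as the F() macro in x86.c */

	/* Invented feature bits and host capability words, for illustration only. */
	enum { DEMO_FEAT_FOO = 0, DEMO_FEAT_BAR = 5, DEMO_FEAT_BAZ = 9 };

	static const uint32_t demo_host_capability[2] = {
		[0] = F(DEMO_FEAT_FOO) | F(DEMO_FEAT_BAZ),	/* host lacks BAR */
		[1] = 0,
	};

	static void demo_cpuid_mask(uint32_t *word, int wordnum)
	{
		*word &= demo_host_capability[wordnum];
	}

	int main(void)
	{
		/* What KVM is in principle willing to expose for this word... */
		uint32_t entry_ecx = F(DEMO_FEAT_FOO) | F(DEMO_FEAT_BAR) | F(DEMO_FEAT_BAZ);

		/* ...clipped to what the host actually has. */
		demo_cpuid_mask(&entry_ecx, 0);
		printf("guest-visible word: %#x (BAR filtered out)\n", (unsigned)entry_ecx);
		return 0;
	}
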
@@ -1991,13 +2327,20 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1991 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ | 2327 0 /* Reserved */ | F(CX16) | 0 /* xTPR Update, PDCM */ |
1992 0 /* Reserved, DCA */ | F(XMM4_1) | 2328 0 /* Reserved, DCA */ | F(XMM4_1) |
1993 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) | 2329 F(XMM4_2) | F(X2APIC) | F(MOVBE) | F(POPCNT) |
1994 0 /* Reserved, AES */ | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX); 2330 0 /* Reserved*/ | F(AES) | F(XSAVE) | 0 /* OSXSAVE */ | F(AVX) |
2331 F(F16C);
1995 /* cpuid 0x80000001.ecx */ 2332 /* cpuid 0x80000001.ecx */
1996 const u32 kvm_supported_word6_x86_features = 2333 const u32 kvm_supported_word6_x86_features =
1997 F(LAHF_LM) | F(CMP_LEGACY) | F(SVM) | 0 /* ExtApicSpace */ | 2334 F(LAHF_LM) | F(CMP_LEGACY) | 0 /*SVM*/ | 0 /* ExtApicSpace */ |
1998 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) | 2335 F(CR8_LEGACY) | F(ABM) | F(SSE4A) | F(MISALIGNSSE) |
1999 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(SSE5) | 2336 F(3DNOWPREFETCH) | 0 /* OSVW */ | 0 /* IBS */ | F(XOP) |
2000 0 /* SKINIT */ | 0 /* WDT */; 2337 0 /* SKINIT, WDT, LWP */ | F(FMA4) | F(TBM);
2338
2339 /* cpuid 0xC0000001.edx */
2340 const u32 kvm_supported_word5_x86_features =
2341 F(XSTORE) | F(XSTORE_EN) | F(XCRYPT) | F(XCRYPT_EN) |
2342 F(ACE2) | F(ACE2_EN) | F(PHE) | F(PHE_EN) |
2343 F(PMM) | F(PMM_EN);
2001 2344
2002 /* all calls to cpuid_count() should be made on the same cpu */ 2345 /* all calls to cpuid_count() should be made on the same cpu */
2003 get_cpu(); 2346 get_cpu();
@@ -2010,7 +2353,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2010 break; 2353 break;
2011 case 1: 2354 case 1:
2012 entry->edx &= kvm_supported_word0_x86_features; 2355 entry->edx &= kvm_supported_word0_x86_features;
2356 cpuid_mask(&entry->edx, 0);
2013 entry->ecx &= kvm_supported_word4_x86_features; 2357 entry->ecx &= kvm_supported_word4_x86_features;
2358 cpuid_mask(&entry->ecx, 4);
2014 /* we support x2apic emulation even if host does not support 2359 /* we support x2apic emulation even if host does not support
2015 * it since we emulate x2apic in software */ 2360 * it since we emulate x2apic in software */
2016 entry->ecx |= F(X2APIC); 2361 entry->ecx |= F(X2APIC);
@@ -2068,9 +2413,9 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2068 int i; 2413 int i;
2069 2414
2070 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2415 entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
2071 for (i = 1; *nent < maxnent; ++i) { 2416 for (i = 1; *nent < maxnent && i < 64; ++i) {
2072 if (entry[i - 1].eax == 0 && i != 2) 2417 if (entry[i].eax == 0)
2073 break; 2418 continue;
2074 do_cpuid_1_ent(&entry[i], function, i); 2419 do_cpuid_1_ent(&entry[i], function, i);
2075 entry[i].flags |= 2420 entry[i].flags |=
2076 KVM_CPUID_FLAG_SIGNIFCANT_INDEX; 2421 KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
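The loop change above bounds the sub-leaf index at 64 and skips empty sub-leaves (eax == 0) instead of stopping at the first gap. Here is a host-side sketch of the same skip-the-gaps pattern for the XSAVE enumeration leaf 0xD, using the compiler's <cpuid.h> helpers; sub-leaves 0 and 1 are special, so the scan starts at 2.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (__get_cpuid_max(0, NULL) < 0xd)
		return 1;                       /* CPU too old for leaf 0xD */

	for (unsigned int i = 2; i < 64; ++i) {
		__cpuid_count(0xd, i, eax, ebx, ecx, edx);
		if (!eax)
			continue;               /* unused state component, keep scanning */
		printf("xsave component %u: size %u bytes, offset %u\n", i, eax, ebx);
	}
	return 0;
}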
@@ -2091,6 +2436,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2091 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) | 2436 entry->eax = (1 << KVM_FEATURE_CLOCKSOURCE) |
2092 (1 << KVM_FEATURE_NOP_IO_DELAY) | 2437 (1 << KVM_FEATURE_NOP_IO_DELAY) |
2093 (1 << KVM_FEATURE_CLOCKSOURCE2) | 2438 (1 << KVM_FEATURE_CLOCKSOURCE2) |
2439 (1 << KVM_FEATURE_ASYNC_PF) |
2094 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT); 2440 (1 << KVM_FEATURE_CLOCKSOURCE_STABLE_BIT);
2095 entry->ebx = 0; 2441 entry->ebx = 0;
2096 entry->ecx = 0; 2442 entry->ecx = 0;
@@ -2101,7 +2447,23 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
2101 break; 2447 break;
2102 case 0x80000001: 2448 case 0x80000001:
2103 entry->edx &= kvm_supported_word1_x86_features; 2449 entry->edx &= kvm_supported_word1_x86_features;
2450 cpuid_mask(&entry->edx, 1);
2104 entry->ecx &= kvm_supported_word6_x86_features; 2451 entry->ecx &= kvm_supported_word6_x86_features;
2452 cpuid_mask(&entry->ecx, 6);
2453 break;
	2454	/* Add support for Centaur's CPUID instruction */
2455 case 0xC0000000:
	2456	/* Just support up to 0xC0000004 now */
2457 entry->eax = min(entry->eax, 0xC0000004);
2458 break;
2459 case 0xC0000001:
2460 entry->edx &= kvm_supported_word5_x86_features;
2461 cpuid_mask(&entry->edx, 5);
2462 break;
2463 case 0xC0000002:
2464 case 0xC0000003:
2465 case 0xC0000004:
	2466	/* Nothing to do for now; reserved for the future */
2105 break; 2467 break;
2106 } 2468 }
2107 2469
@@ -2149,6 +2511,26 @@ static int kvm_dev_ioctl_get_supported_cpuid(struct kvm_cpuid2 *cpuid,
2149 if (nent >= cpuid->nent) 2511 if (nent >= cpuid->nent)
2150 goto out_free; 2512 goto out_free;
2151 2513
2514 /* Add support for Centaur's CPUID instruction. */
2515 if (boot_cpu_data.x86_vendor == X86_VENDOR_CENTAUR) {
2516 do_cpuid_ent(&cpuid_entries[nent], 0xC0000000, 0,
2517 &nent, cpuid->nent);
2518
2519 r = -E2BIG;
2520 if (nent >= cpuid->nent)
2521 goto out_free;
2522
2523 limit = cpuid_entries[nent - 1].eax;
2524 for (func = 0xC0000001;
2525 func <= limit && nent < cpuid->nent; ++func)
2526 do_cpuid_ent(&cpuid_entries[nent], func, 0,
2527 &nent, cpuid->nent);
2528
2529 r = -E2BIG;
2530 if (nent >= cpuid->nent)
2531 goto out_free;
2532 }
2533
2152 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent, 2534 do_cpuid_ent(&cpuid_entries[nent], KVM_CPUID_SIGNATURE, 0, &nent,
2153 cpuid->nent); 2535 cpuid->nent);
2154 2536
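The Centaur branch added above follows the usual vendor-range pattern: leaf 0xC0000000 reports the highest leaf in that range, the code clamps it to 0xC0000004, and every leaf up to the limit is then filled in. The host-side sketch below walks the same range with the compiler's <cpuid.h> macros; on CPUs without Centaur leaves the limit check simply finds nothing to enumerate.

#include <cpuid.h>
#include <stdio.h>

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	__cpuid(0xC0000000, eax, ebx, ecx, edx);

	unsigned int limit = eax;
	if (limit > 0xC0000004)              /* same clamp as the patch */
		limit = 0xC0000004;

	if (limit < 0xC0000001) {
		printf("no 0xC0000000 leaf range on this CPU\n");
		return 0;
	}

	for (unsigned int func = 0xC0000001; func <= limit; ++func) {
		__cpuid(func, eax, ebx, ecx, edx);
		printf("leaf %#x: edx=%08x\n", func, edx);
	}
	return 0;
}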
@@ -2203,6 +2585,7 @@ static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu,
2203 return -ENXIO; 2585 return -ENXIO;
2204 2586
2205 kvm_queue_interrupt(vcpu, irq->irq, false); 2587 kvm_queue_interrupt(vcpu, irq->irq, false);
2588 kvm_make_request(KVM_REQ_EVENT, vcpu);
2206 2589
2207 return 0; 2590 return 0;
2208} 2591}
@@ -2272,9 +2655,6 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
2272 if (mce->status & MCI_STATUS_UC) { 2655 if (mce->status & MCI_STATUS_UC) {
2273 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2656 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
2274 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) { 2657 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
2275 printk(KERN_DEBUG "kvm: set_mce: "
2276 "injects mce exception while "
2277 "previous one is in progress!\n");
2278 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); 2658 kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
2279 return 0; 2659 return 0;
2280 } 2660 }
@@ -2305,6 +2685,7 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2305 !kvm_exception_is_soft(vcpu->arch.exception.nr); 2685 !kvm_exception_is_soft(vcpu->arch.exception.nr);
2306 events->exception.nr = vcpu->arch.exception.nr; 2686 events->exception.nr = vcpu->arch.exception.nr;
2307 events->exception.has_error_code = vcpu->arch.exception.has_error_code; 2687 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2688 events->exception.pad = 0;
2308 events->exception.error_code = vcpu->arch.exception.error_code; 2689 events->exception.error_code = vcpu->arch.exception.error_code;
2309 2690
2310 events->interrupt.injected = 2691 events->interrupt.injected =
@@ -2318,12 +2699,14 @@ static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2318 events->nmi.injected = vcpu->arch.nmi_injected; 2699 events->nmi.injected = vcpu->arch.nmi_injected;
2319 events->nmi.pending = vcpu->arch.nmi_pending; 2700 events->nmi.pending = vcpu->arch.nmi_pending;
2320 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu); 2701 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2702 events->nmi.pad = 0;
2321 2703
2322 events->sipi_vector = vcpu->arch.sipi_vector; 2704 events->sipi_vector = vcpu->arch.sipi_vector;
2323 2705
2324 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING 2706 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2325 | KVM_VCPUEVENT_VALID_SIPI_VECTOR 2707 | KVM_VCPUEVENT_VALID_SIPI_VECTOR
2326 | KVM_VCPUEVENT_VALID_SHADOW); 2708 | KVM_VCPUEVENT_VALID_SHADOW);
2709 memset(&events->reserved, 0, sizeof(events->reserved));
2327} 2710}
2328 2711
2329static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, 2712static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
@@ -2342,8 +2725,6 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2342 vcpu->arch.interrupt.pending = events->interrupt.injected; 2725 vcpu->arch.interrupt.pending = events->interrupt.injected;
2343 vcpu->arch.interrupt.nr = events->interrupt.nr; 2726 vcpu->arch.interrupt.nr = events->interrupt.nr;
2344 vcpu->arch.interrupt.soft = events->interrupt.soft; 2727 vcpu->arch.interrupt.soft = events->interrupt.soft;
2345 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2346 kvm_pic_clear_isr_ack(vcpu->kvm);
2347 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) 2728 if (events->flags & KVM_VCPUEVENT_VALID_SHADOW)
2348 kvm_x86_ops->set_interrupt_shadow(vcpu, 2729 kvm_x86_ops->set_interrupt_shadow(vcpu,
2349 events->interrupt.shadow); 2730 events->interrupt.shadow);
@@ -2356,6 +2737,8 @@ static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2356 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR) 2737 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2357 vcpu->arch.sipi_vector = events->sipi_vector; 2738 vcpu->arch.sipi_vector = events->sipi_vector;
2358 2739
2740 kvm_make_request(KVM_REQ_EVENT, vcpu);
2741
2359 return 0; 2742 return 0;
2360} 2743}
2361 2744
@@ -2366,6 +2749,7 @@ static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu,
2366 dbgregs->dr6 = vcpu->arch.dr6; 2749 dbgregs->dr6 = vcpu->arch.dr6;
2367 dbgregs->dr7 = vcpu->arch.dr7; 2750 dbgregs->dr7 = vcpu->arch.dr7;
2368 dbgregs->flags = 0; 2751 dbgregs->flags = 0;
2752 memset(&dbgregs->reserved, 0, sizeof(dbgregs->reserved));
2369} 2753}
2370 2754
2371static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, 2755static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu,
@@ -2715,6 +3099,32 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
2715 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs); 3099 r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
2716 break; 3100 break;
2717 } 3101 }
3102 case KVM_SET_TSC_KHZ: {
3103 u32 user_tsc_khz;
3104
3105 r = -EINVAL;
3106 if (!kvm_has_tsc_control)
3107 break;
3108
3109 user_tsc_khz = (u32)arg;
3110
3111 if (user_tsc_khz >= kvm_max_guest_tsc_khz)
3112 goto out;
3113
3114 kvm_x86_ops->set_tsc_khz(vcpu, user_tsc_khz);
3115
3116 r = 0;
3117 goto out;
3118 }
3119 case KVM_GET_TSC_KHZ: {
3120 r = -EIO;
3121 if (check_tsc_unstable())
3122 goto out;
3123
3124 r = vcpu_tsc_khz(vcpu);
3125
3126 goto out;
3127 }
2718 default: 3128 default:
2719 r = -EINVAL; 3129 r = -EINVAL;
2720 } 3130 }
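KVM_SET_TSC_KHZ and KVM_GET_TSC_KHZ, added above, are per-vcpu ioctls: the getter returns the current rate (or fails with EIO when the host TSC is unstable) and the setter is only honoured when the host advertises KVM_CAP_TSC_CONTROL and the requested rate stays below the host maximum. A bare-bones userspace caller, with error handling and the rest of the VM setup omitted:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(void)
{
	int kvm  = open("/dev/kvm", O_RDWR);
	int vm   = ioctl(kvm, KVM_CREATE_VM, 0);
	int vcpu = ioctl(vm, KVM_CREATE_VCPU, 0);

	if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_GET_TSC_KHZ) > 0) {
		int khz = ioctl(vcpu, KVM_GET_TSC_KHZ);  /* -1/EIO if host TSC unstable */
		printf("guest TSC rate: %d kHz\n", khz);
	}

	if (ioctl(kvm, KVM_CHECK_EXTENSION, KVM_CAP_TSC_CONTROL) > 0) {
		/* ask for a 1 GHz guest TSC; must stay below the host's maximum */
		if (ioctl(vcpu, KVM_SET_TSC_KHZ, 1000000) < 0)
			perror("KVM_SET_TSC_KHZ");
	}
	return 0;
}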
@@ -2759,7 +3169,7 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
2759 3169
2760static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm) 3170static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
2761{ 3171{
2762 return kvm->arch.n_alloc_mmu_pages; 3172 return kvm->arch.n_max_mmu_pages;
2763} 3173}
2764 3174
2765static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) 3175static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
@@ -2795,18 +3205,18 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2795 r = 0; 3205 r = 0;
2796 switch (chip->chip_id) { 3206 switch (chip->chip_id) {
2797 case KVM_IRQCHIP_PIC_MASTER: 3207 case KVM_IRQCHIP_PIC_MASTER:
2798 raw_spin_lock(&pic_irqchip(kvm)->lock); 3208 spin_lock(&pic_irqchip(kvm)->lock);
2799 memcpy(&pic_irqchip(kvm)->pics[0], 3209 memcpy(&pic_irqchip(kvm)->pics[0],
2800 &chip->chip.pic, 3210 &chip->chip.pic,
2801 sizeof(struct kvm_pic_state)); 3211 sizeof(struct kvm_pic_state));
2802 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3212 spin_unlock(&pic_irqchip(kvm)->lock);
2803 break; 3213 break;
2804 case KVM_IRQCHIP_PIC_SLAVE: 3214 case KVM_IRQCHIP_PIC_SLAVE:
2805 raw_spin_lock(&pic_irqchip(kvm)->lock); 3215 spin_lock(&pic_irqchip(kvm)->lock);
2806 memcpy(&pic_irqchip(kvm)->pics[1], 3216 memcpy(&pic_irqchip(kvm)->pics[1],
2807 &chip->chip.pic, 3217 &chip->chip.pic,
2808 sizeof(struct kvm_pic_state)); 3218 sizeof(struct kvm_pic_state));
2809 raw_spin_unlock(&pic_irqchip(kvm)->lock); 3219 spin_unlock(&pic_irqchip(kvm)->lock);
2810 break; 3220 break;
2811 case KVM_IRQCHIP_IOAPIC: 3221 case KVM_IRQCHIP_IOAPIC:
2812 r = kvm_set_ioapic(kvm, &chip->chip.ioapic); 3222 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
@@ -2849,6 +3259,7 @@ static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps)
2849 sizeof(ps->channels)); 3259 sizeof(ps->channels));
2850 ps->flags = kvm->arch.vpit->pit_state.flags; 3260 ps->flags = kvm->arch.vpit->pit_state.flags;
2851 mutex_unlock(&kvm->arch.vpit->pit_state.lock); 3261 mutex_unlock(&kvm->arch.vpit->pit_state.lock);
3262 memset(&ps->reserved, 0, sizeof(ps->reserved));
2852 return r; 3263 return r;
2853} 3264}
2854 3265
@@ -2912,24 +3323,18 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2912 struct kvm_memslots *slots, *old_slots; 3323 struct kvm_memslots *slots, *old_slots;
2913 unsigned long *dirty_bitmap; 3324 unsigned long *dirty_bitmap;
2914 3325
2915 spin_lock(&kvm->mmu_lock); 3326 dirty_bitmap = memslot->dirty_bitmap_head;
2916 kvm_mmu_slot_remove_write_access(kvm, log->slot); 3327 if (memslot->dirty_bitmap == dirty_bitmap)
2917 spin_unlock(&kvm->mmu_lock); 3328 dirty_bitmap += n / sizeof(long);
2918
2919 r = -ENOMEM;
2920 dirty_bitmap = vmalloc(n);
2921 if (!dirty_bitmap)
2922 goto out;
2923 memset(dirty_bitmap, 0, n); 3329 memset(dirty_bitmap, 0, n);
2924 3330
2925 r = -ENOMEM; 3331 r = -ENOMEM;
2926 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL); 3332 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2927 if (!slots) { 3333 if (!slots)
2928 vfree(dirty_bitmap);
2929 goto out; 3334 goto out;
2930 }
2931 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots)); 3335 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2932 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap; 3336 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
3337 slots->generation++;
2933 3338
2934 old_slots = kvm->memslots; 3339 old_slots = kvm->memslots;
2935 rcu_assign_pointer(kvm->memslots, slots); 3340 rcu_assign_pointer(kvm->memslots, slots);
@@ -2937,12 +3342,13 @@ int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2937 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap; 3342 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2938 kfree(old_slots); 3343 kfree(old_slots);
2939 3344
3345 spin_lock(&kvm->mmu_lock);
3346 kvm_mmu_slot_remove_write_access(kvm, log->slot);
3347 spin_unlock(&kvm->mmu_lock);
3348
2940 r = -EFAULT; 3349 r = -EFAULT;
2941 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n)) { 3350 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2942 vfree(dirty_bitmap);
2943 goto out; 3351 goto out;
2944 }
2945 vfree(dirty_bitmap);
2946 } else { 3352 } else {
2947 r = -EFAULT; 3353 r = -EFAULT;
2948 if (clear_user(log->dirty_bitmap, n)) 3354 if (clear_user(log->dirty_bitmap, n))
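The dirty-log hunk above stops vmalloc'ing a fresh bitmap on every KVM_GET_DIRTY_LOG call: the slot now keeps one allocation holding two bitmap halves, the inactive half is zeroed and published as the new live bitmap via a fresh memslots copy, and the previously live half is what gets copied to userspace. A plain C model of that ping-pong follows; the field names mirror the kernel ones but the code is illustrative, not KVM API.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define BITMAP_LONGS 4

struct slot_model {
	unsigned long *dirty_bitmap;        /* half the "guest" is currently dirtying */
	unsigned long *dirty_bitmap_head;   /* start of the double-sized allocation */
};

static unsigned long *harvest(struct slot_model *s)
{
	unsigned long *next = s->dirty_bitmap_head;

	if (s->dirty_bitmap == next)        /* pick whichever half is not live */
		next += BITMAP_LONGS;

	memset(next, 0, BITMAP_LONGS * sizeof(unsigned long));
	unsigned long *old = s->dirty_bitmap;
	s->dirty_bitmap = next;             /* in KVM this is the RCU memslots swap */
	return old;                         /* caller copies this half to userspace */
}

int main(void)
{
	struct slot_model s;
	s.dirty_bitmap_head = calloc(2 * BITMAP_LONGS, sizeof(unsigned long));
	s.dirty_bitmap = s.dirty_bitmap_head;

	s.dirty_bitmap[0] = 0x5;            /* pretend pages 0 and 2 were dirtied */
	unsigned long *snap = harvest(&s);
	printf("harvested %#lx, live half now at +%td longs\n",
	       snap[0], s.dirty_bitmap - s.dirty_bitmap_head);
	free(s.dirty_bitmap_head);
	return 0;
}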
@@ -3009,8 +3415,10 @@ long kvm_arch_vm_ioctl(struct file *filp,
3009 if (vpic) { 3415 if (vpic) {
3010 r = kvm_ioapic_init(kvm); 3416 r = kvm_ioapic_init(kvm);
3011 if (r) { 3417 if (r) {
3418 mutex_lock(&kvm->slots_lock);
3012 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, 3419 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
3013 &vpic->dev); 3420 &vpic->dev);
3421 mutex_unlock(&kvm->slots_lock);
3014 kfree(vpic); 3422 kfree(vpic);
3015 goto create_irqchip_unlock; 3423 goto create_irqchip_unlock;
3016 } 3424 }
@@ -3021,10 +3429,12 @@ long kvm_arch_vm_ioctl(struct file *filp,
3021 smp_wmb(); 3429 smp_wmb();
3022 r = kvm_setup_default_irq_routing(kvm); 3430 r = kvm_setup_default_irq_routing(kvm);
3023 if (r) { 3431 if (r) {
3432 mutex_lock(&kvm->slots_lock);
3024 mutex_lock(&kvm->irq_lock); 3433 mutex_lock(&kvm->irq_lock);
3025 kvm_ioapic_destroy(kvm); 3434 kvm_ioapic_destroy(kvm);
3026 kvm_destroy_pic(kvm); 3435 kvm_destroy_pic(kvm);
3027 mutex_unlock(&kvm->irq_lock); 3436 mutex_unlock(&kvm->irq_lock);
3437 mutex_unlock(&kvm->slots_lock);
3028 } 3438 }
3029 create_irqchip_unlock: 3439 create_irqchip_unlock:
3030 mutex_unlock(&kvm->lock); 3440 mutex_unlock(&kvm->lock);
@@ -3200,7 +3610,6 @@ long kvm_arch_vm_ioctl(struct file *filp,
3200 break; 3610 break;
3201 } 3611 }
3202 case KVM_SET_CLOCK: { 3612 case KVM_SET_CLOCK: {
3203 struct timespec now;
3204 struct kvm_clock_data user_ns; 3613 struct kvm_clock_data user_ns;
3205 u64 now_ns; 3614 u64 now_ns;
3206 s64 delta; 3615 s64 delta;
@@ -3214,21 +3623,23 @@ long kvm_arch_vm_ioctl(struct file *filp,
3214 goto out; 3623 goto out;
3215 3624
3216 r = 0; 3625 r = 0;
3217 ktime_get_ts(&now); 3626 local_irq_disable();
3218 now_ns = timespec_to_ns(&now); 3627 now_ns = get_kernel_ns();
3219 delta = user_ns.clock - now_ns; 3628 delta = user_ns.clock - now_ns;
3629 local_irq_enable();
3220 kvm->arch.kvmclock_offset = delta; 3630 kvm->arch.kvmclock_offset = delta;
3221 break; 3631 break;
3222 } 3632 }
3223 case KVM_GET_CLOCK: { 3633 case KVM_GET_CLOCK: {
3224 struct timespec now;
3225 struct kvm_clock_data user_ns; 3634 struct kvm_clock_data user_ns;
3226 u64 now_ns; 3635 u64 now_ns;
3227 3636
3228 ktime_get_ts(&now); 3637 local_irq_disable();
3229 now_ns = timespec_to_ns(&now); 3638 now_ns = get_kernel_ns();
3230 user_ns.clock = kvm->arch.kvmclock_offset + now_ns; 3639 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
3640 local_irq_enable();
3231 user_ns.flags = 0; 3641 user_ns.flags = 0;
3642 memset(&user_ns.pad, 0, sizeof(user_ns.pad));
3232 3643
3233 r = -EFAULT; 3644 r = -EFAULT;
3234 if (copy_to_user(argp, &user_ns, sizeof(user_ns))) 3645 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
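From userspace these are the VM-level KVM_GET_CLOCK/KVM_SET_CLOCK ioctls: the getter returns the current kvmclock value (now sampled with interrupts off via get_kernel_ns()), and the setter re-bases it, which is what a migration target does after restoring state. A minimal caller, with setup and error handling abbreviated:

#include <fcntl.h>
#include <linux/kvm.h>
#include <stdio.h>
#include <sys/ioctl.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR);
	int vm  = ioctl(kvm, KVM_CREATE_VM, 0);
	struct kvm_clock_data clk = { 0 };

	if (ioctl(vm, KVM_GET_CLOCK, &clk) == 0)
		printf("kvmclock: %llu ns\n", (unsigned long long)clk.clock);

	clk.clock += 1000000;                /* pretend 1 ms elapsed while saved */
	if (ioctl(vm, KVM_SET_CLOCK, &clk) < 0)
		perror("KVM_SET_CLOCK");
	return 0;
}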
@@ -3263,20 +3674,43 @@ static void kvm_init_msr_list(void)
3263static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, 3674static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
3264 const void *v) 3675 const void *v)
3265{ 3676{
3266 if (vcpu->arch.apic && 3677 int handled = 0;
3267 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3678 int n;
3268 return 0; 3679
3680 do {
3681 n = min(len, 8);
3682 if (!(vcpu->arch.apic &&
3683 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, n, v))
3684 && kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3685 break;
3686 handled += n;
3687 addr += n;
3688 len -= n;
3689 v += n;
3690 } while (len);
3269 3691
3270 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3692 return handled;
3271} 3693}
3272 3694
3273static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3695static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
3274{ 3696{
3275 if (vcpu->arch.apic && 3697 int handled = 0;
3276 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3698 int n;
3277 return 0; 3699
3700 do {
3701 n = min(len, 8);
3702 if (!(vcpu->arch.apic &&
3703 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, n, v))
3704 && kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, n, v))
3705 break;
3706 trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, *(u64 *)v);
3707 handled += n;
3708 addr += n;
3709 len -= n;
3710 v += n;
3711 } while (len);
3278 3712
3279 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v); 3713 return handled;
3280} 3714}
3281 3715
3282static void kvm_set_segment(struct kvm_vcpu *vcpu, 3716static void kvm_set_segment(struct kvm_vcpu *vcpu,
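The rewritten vcpu_mmio_read()/vcpu_mmio_write() above no longer treat an access as all-or-nothing: the request is cut into chunks of at most 8 bytes, each chunk is offered to the in-kernel handlers, and the return value is the number of leading bytes handled so the caller can forward only the remainder to userspace. A self-contained model of that loop (the device window and function names are made up for the example):

#include <stdio.h>

/* stand-in for the APIC/io-bus handlers: accept only a 16-byte window */
static int device_write(unsigned long addr, int len, const void *v)
{
	(void)v;
	return (addr >= 0x1000 && addr + len <= 0x1010) ? 0 : -1;
}

static int mmio_write(unsigned long addr, int len, const void *v)
{
	int handled = 0;

	while (len) {
		int n = len < 8 ? len : 8;
		if (device_write(addr, n, v))
			break;              /* remainder must go to userspace */
		handled += n;
		addr += n;
		len -= n;
		v = (const char *)v + n;
	}
	return handled;
}

int main(void)
{
	char buf[24] = { 0 };
	/* 24-byte write starting at 0x1000: only the first 16 bytes fit */
	printf("handled %d of 24 bytes\n", mmio_write(0x1000, sizeof(buf), buf));
	return 0;
}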
@@ -3291,49 +3725,71 @@ void kvm_get_segment(struct kvm_vcpu *vcpu,
3291 kvm_x86_ops->get_segment(vcpu, var, seg); 3725 kvm_x86_ops->get_segment(vcpu, var, seg);
3292} 3726}
3293 3727
3294gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3728static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3729{
3730 return gpa;
3731}
3732
3733static gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access)
3734{
3735 gpa_t t_gpa;
3736 struct x86_exception exception;
3737
3738 BUG_ON(!mmu_is_nested(vcpu));
3739
3740 /* NPT walks are always user-walks */
3741 access |= PFERR_USER_MASK;
3742 t_gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gpa, access, &exception);
3743
3744 return t_gpa;
3745}
3746
3747gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva,
3748 struct x86_exception *exception)
3295{ 3749{
3296 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3750 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3297 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3751 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3298} 3752}
3299 3753
3300 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3754 gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
3755 struct x86_exception *exception)
3301{ 3756{
3302 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3757 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3303 access |= PFERR_FETCH_MASK; 3758 access |= PFERR_FETCH_MASK;
3304 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3759 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3305} 3760}
3306 3761
3307gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3762gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva,
3763 struct x86_exception *exception)
3308{ 3764{
3309 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3765 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3310 access |= PFERR_WRITE_MASK; 3766 access |= PFERR_WRITE_MASK;
3311 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error); 3767 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
3312} 3768}
3313 3769
3314/* uses this to access any guest's mapped memory without checking CPL */ 3770/* uses this to access any guest's mapped memory without checking CPL */
3315gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error) 3771gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
3772 struct x86_exception *exception)
3316{ 3773{
3317 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error); 3774 return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, 0, exception);
3318} 3775}
3319 3776
3320static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, 3777static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3321 struct kvm_vcpu *vcpu, u32 access, 3778 struct kvm_vcpu *vcpu, u32 access,
3322 u32 *error) 3779 struct x86_exception *exception)
3323{ 3780{
3324 void *data = val; 3781 void *data = val;
3325 int r = X86EMUL_CONTINUE; 3782 int r = X86EMUL_CONTINUE;
3326 3783
3327 while (bytes) { 3784 while (bytes) {
3328 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error); 3785 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr, access,
3786 exception);
3329 unsigned offset = addr & (PAGE_SIZE-1); 3787 unsigned offset = addr & (PAGE_SIZE-1);
3330 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3788 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
3331 int ret; 3789 int ret;
3332 3790
3333 if (gpa == UNMAPPED_GVA) { 3791 if (gpa == UNMAPPED_GVA)
3334 r = X86EMUL_PROPAGATE_FAULT; 3792 return X86EMUL_PROPAGATE_FAULT;
3335 goto out;
3336 }
3337 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread); 3793 ret = kvm_read_guest(vcpu->kvm, gpa, data, toread);
3338 if (ret < 0) { 3794 if (ret < 0) {
3339 r = X86EMUL_IO_NEEDED; 3795 r = X86EMUL_IO_NEEDED;
@@ -3349,47 +3805,56 @@ out:
3349} 3805}
3350 3806
3351/* used for instruction fetching */ 3807/* used for instruction fetching */
3352static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes, 3808static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt,
3353 struct kvm_vcpu *vcpu, u32 *error) 3809 gva_t addr, void *val, unsigned int bytes,
3810 struct x86_exception *exception)
3354{ 3811{
3812 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3355 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3813 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3814
3356 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 3815 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3357 access | PFERR_FETCH_MASK, error); 3816 access | PFERR_FETCH_MASK,
3817 exception);
3358} 3818}
3359 3819
3360static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3820static int kvm_read_guest_virt(struct x86_emulate_ctxt *ctxt,
3361 struct kvm_vcpu *vcpu, u32 *error) 3821 gva_t addr, void *val, unsigned int bytes,
3822 struct x86_exception *exception)
3362{ 3823{
3824 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3363 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; 3825 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3826
3364 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, 3827 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3365 error); 3828 exception);
3366} 3829}
3367 3830
3368static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes, 3831static int kvm_read_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3369 struct kvm_vcpu *vcpu, u32 *error) 3832 gva_t addr, void *val, unsigned int bytes,
3833 struct x86_exception *exception)
3370{ 3834{
3371 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error); 3835 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3836 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, exception);
3372} 3837}
3373 3838
3374static int kvm_write_guest_virt_system(gva_t addr, void *val, 3839static int kvm_write_guest_virt_system(struct x86_emulate_ctxt *ctxt,
3840 gva_t addr, void *val,
3375 unsigned int bytes, 3841 unsigned int bytes,
3376 struct kvm_vcpu *vcpu, 3842 struct x86_exception *exception)
3377 u32 *error)
3378{ 3843{
3844 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3379 void *data = val; 3845 void *data = val;
3380 int r = X86EMUL_CONTINUE; 3846 int r = X86EMUL_CONTINUE;
3381 3847
3382 while (bytes) { 3848 while (bytes) {
3383 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, 3849 gpa_t gpa = vcpu->arch.walk_mmu->gva_to_gpa(vcpu, addr,
3384 PFERR_WRITE_MASK, error); 3850 PFERR_WRITE_MASK,
3851 exception);
3385 unsigned offset = addr & (PAGE_SIZE-1); 3852 unsigned offset = addr & (PAGE_SIZE-1);
3386 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3853 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
3387 int ret; 3854 int ret;
3388 3855
3389 if (gpa == UNMAPPED_GVA) { 3856 if (gpa == UNMAPPED_GVA)
3390 r = X86EMUL_PROPAGATE_FAULT; 3857 return X86EMUL_PROPAGATE_FAULT;
3391 goto out;
3392 }
3393 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite); 3858 ret = kvm_write_guest(vcpu->kvm, gpa, data, towrite);
3394 if (ret < 0) { 3859 if (ret < 0) {
3395 r = X86EMUL_IO_NEEDED; 3860 r = X86EMUL_IO_NEEDED;
@@ -3404,13 +3869,15 @@ out:
3404 return r; 3869 return r;
3405} 3870}
3406 3871
3407static int emulator_read_emulated(unsigned long addr, 3872static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt,
3873 unsigned long addr,
3408 void *val, 3874 void *val,
3409 unsigned int bytes, 3875 unsigned int bytes,
3410 unsigned int *error_code, 3876 struct x86_exception *exception)
3411 struct kvm_vcpu *vcpu)
3412{ 3877{
3878 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3413 gpa_t gpa; 3879 gpa_t gpa;
3880 int handled;
3414 3881
3415 if (vcpu->mmio_read_completed) { 3882 if (vcpu->mmio_read_completed) {
3416 memcpy(val, vcpu->mmio_data, bytes); 3883 memcpy(val, vcpu->mmio_data, bytes);
@@ -3420,7 +3887,7 @@ static int emulator_read_emulated(unsigned long addr,
3420 return X86EMUL_CONTINUE; 3887 return X86EMUL_CONTINUE;
3421 } 3888 }
3422 3889
3423 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, error_code); 3890 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, exception);
3424 3891
3425 if (gpa == UNMAPPED_GVA) 3892 if (gpa == UNMAPPED_GVA)
3426 return X86EMUL_PROPAGATE_FAULT; 3893 return X86EMUL_PROPAGATE_FAULT;
@@ -3429,32 +3896,38 @@ static int emulator_read_emulated(unsigned long addr,
3429 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3896 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
3430 goto mmio; 3897 goto mmio;
3431 3898
3432 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL) 3899 if (kvm_read_guest_virt(ctxt, addr, val, bytes, exception)
3433 == X86EMUL_CONTINUE) 3900 == X86EMUL_CONTINUE)
3434 return X86EMUL_CONTINUE; 3901 return X86EMUL_CONTINUE;
3435 3902
3436mmio: 3903mmio:
3437 /* 3904 /*
3438 * Is this MMIO handled locally? 3905 * Is this MMIO handled locally?
3439 */ 3906 */
3440 if (!vcpu_mmio_read(vcpu, gpa, bytes, val)) { 3907 handled = vcpu_mmio_read(vcpu, gpa, bytes, val);
3441 trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes, gpa, *(u64 *)val); 3908
3909 if (handled == bytes)
3442 return X86EMUL_CONTINUE; 3910 return X86EMUL_CONTINUE;
3443 } 3911
3912 gpa += handled;
3913 bytes -= handled;
3914 val += handled;
3444 3915
3445 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0); 3916 trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, 0);
3446 3917
3447 vcpu->mmio_needed = 1; 3918 vcpu->mmio_needed = 1;
3448 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3919 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3449 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3920 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3450 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3921 vcpu->mmio_size = bytes;
3922 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3451 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0; 3923 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 0;
3924 vcpu->mmio_index = 0;
3452 3925
3453 return X86EMUL_IO_NEEDED; 3926 return X86EMUL_IO_NEEDED;
3454} 3927}
3455 3928
3456int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, 3929int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3457 const void *val, int bytes) 3930 const void *val, int bytes)
3458{ 3931{
3459 int ret; 3932 int ret;
3460 3933
@@ -3468,12 +3941,13 @@ int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa,
3468static int emulator_write_emulated_onepage(unsigned long addr, 3941static int emulator_write_emulated_onepage(unsigned long addr,
3469 const void *val, 3942 const void *val,
3470 unsigned int bytes, 3943 unsigned int bytes,
3471 unsigned int *error_code, 3944 struct x86_exception *exception,
3472 struct kvm_vcpu *vcpu) 3945 struct kvm_vcpu *vcpu)
3473{ 3946{
3474 gpa_t gpa; 3947 gpa_t gpa;
3948 int handled;
3475 3949
3476 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error_code); 3950 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, exception);
3477 3951
3478 if (gpa == UNMAPPED_GVA) 3952 if (gpa == UNMAPPED_GVA)
3479 return X86EMUL_PROPAGATE_FAULT; 3953 return X86EMUL_PROPAGATE_FAULT;
@@ -3490,31 +3964,41 @@ mmio:
3490 /* 3964 /*
3491 * Is this MMIO handled locally? 3965 * Is this MMIO handled locally?
3492 */ 3966 */
3493 if (!vcpu_mmio_write(vcpu, gpa, bytes, val)) 3967 handled = vcpu_mmio_write(vcpu, gpa, bytes, val);
3968 if (handled == bytes)
3494 return X86EMUL_CONTINUE; 3969 return X86EMUL_CONTINUE;
3495 3970
3971 gpa += handled;
3972 bytes -= handled;
3973 val += handled;
3974
3496 vcpu->mmio_needed = 1; 3975 vcpu->mmio_needed = 1;
3976 memcpy(vcpu->mmio_data, val, bytes);
3497 vcpu->run->exit_reason = KVM_EXIT_MMIO; 3977 vcpu->run->exit_reason = KVM_EXIT_MMIO;
3498 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa; 3978 vcpu->run->mmio.phys_addr = vcpu->mmio_phys_addr = gpa;
3499 vcpu->run->mmio.len = vcpu->mmio_size = bytes; 3979 vcpu->mmio_size = bytes;
3980 vcpu->run->mmio.len = min(vcpu->mmio_size, 8);
3500 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1; 3981 vcpu->run->mmio.is_write = vcpu->mmio_is_write = 1;
3501 memcpy(vcpu->run->mmio.data, val, bytes); 3982 memcpy(vcpu->run->mmio.data, vcpu->mmio_data, 8);
3983 vcpu->mmio_index = 0;
3502 3984
3503 return X86EMUL_CONTINUE; 3985 return X86EMUL_CONTINUE;
3504} 3986}
3505 3987
3506int emulator_write_emulated(unsigned long addr, 3988int emulator_write_emulated(struct x86_emulate_ctxt *ctxt,
3989 unsigned long addr,
3507 const void *val, 3990 const void *val,
3508 unsigned int bytes, 3991 unsigned int bytes,
3509 unsigned int *error_code, 3992 struct x86_exception *exception)
3510 struct kvm_vcpu *vcpu)
3511{ 3993{
3994 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3995
3512 /* Crossing a page boundary? */ 3996 /* Crossing a page boundary? */
3513 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { 3997 if (((addr + bytes - 1) ^ addr) & PAGE_MASK) {
3514 int rc, now; 3998 int rc, now;
3515 3999
3516 now = -addr & ~PAGE_MASK; 4000 now = -addr & ~PAGE_MASK;
3517 rc = emulator_write_emulated_onepage(addr, val, now, error_code, 4001 rc = emulator_write_emulated_onepage(addr, val, now, exception,
3518 vcpu); 4002 vcpu);
3519 if (rc != X86EMUL_CONTINUE) 4003 if (rc != X86EMUL_CONTINUE)
3520 return rc; 4004 return rc;
@@ -3522,7 +4006,7 @@ int emulator_write_emulated(unsigned long addr,
3522 val += now; 4006 val += now;
3523 bytes -= now; 4007 bytes -= now;
3524 } 4008 }
3525 return emulator_write_emulated_onepage(addr, val, bytes, error_code, 4009 return emulator_write_emulated_onepage(addr, val, bytes, exception,
3526 vcpu); 4010 vcpu);
3527} 4011}
3528 4012
@@ -3536,13 +4020,14 @@ int emulator_write_emulated(unsigned long addr,
3536 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old)) 4020 (cmpxchg64((u64 *)(ptr), *(u64 *)(old), *(u64 *)(new)) == *(u64 *)(old))
3537#endif 4021#endif
3538 4022
3539static int emulator_cmpxchg_emulated(unsigned long addr, 4023static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt,
4024 unsigned long addr,
3540 const void *old, 4025 const void *old,
3541 const void *new, 4026 const void *new,
3542 unsigned int bytes, 4027 unsigned int bytes,
3543 unsigned int *error_code, 4028 struct x86_exception *exception)
3544 struct kvm_vcpu *vcpu)
3545{ 4029{
4030 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3546 gpa_t gpa; 4031 gpa_t gpa;
3547 struct page *page; 4032 struct page *page;
3548 char *kaddr; 4033 char *kaddr;
@@ -3598,7 +4083,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
3598emul_write: 4083emul_write:
3599 printk_once(KERN_WARNING "kvm: emulating exchange as write\n"); 4084 printk_once(KERN_WARNING "kvm: emulating exchange as write\n");
3600 4085
3601 return emulator_write_emulated(addr, new, bytes, error_code, vcpu); 4086 return emulator_write_emulated(ctxt, addr, new, bytes, exception);
3602} 4087}
3603 4088
3604static int kernel_pio(struct kvm_vcpu *vcpu, void *pd) 4089static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
@@ -3617,13 +4102,16 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
3617} 4102}
3618 4103
3619 4104
3620static int emulator_pio_in_emulated(int size, unsigned short port, void *val, 4105static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt,
3621 unsigned int count, struct kvm_vcpu *vcpu) 4106 int size, unsigned short port, void *val,
4107 unsigned int count)
3622{ 4108{
4109 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4110
3623 if (vcpu->arch.pio.count) 4111 if (vcpu->arch.pio.count)
3624 goto data_avail; 4112 goto data_avail;
3625 4113
3626 trace_kvm_pio(1, port, size, 1); 4114 trace_kvm_pio(0, port, size, count);
3627 4115
3628 vcpu->arch.pio.port = port; 4116 vcpu->arch.pio.port = port;
3629 vcpu->arch.pio.in = 1; 4117 vcpu->arch.pio.in = 1;
@@ -3647,11 +4135,13 @@ static int emulator_pio_in_emulated(int size, unsigned short port, void *val,
3647 return 0; 4135 return 0;
3648} 4136}
3649 4137
3650static int emulator_pio_out_emulated(int size, unsigned short port, 4138static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt,
3651 const void *val, unsigned int count, 4139 int size, unsigned short port,
3652 struct kvm_vcpu *vcpu) 4140 const void *val, unsigned int count)
3653{ 4141{
3654 trace_kvm_pio(0, port, size, 1); 4142 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4143
4144 trace_kvm_pio(1, port, size, count);
3655 4145
3656 vcpu->arch.pio.port = port; 4146 vcpu->arch.pio.port = port;
3657 vcpu->arch.pio.in = 0; 4147 vcpu->arch.pio.in = 0;
@@ -3680,10 +4170,9 @@ static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg)
3680 return kvm_x86_ops->get_segment_base(vcpu, seg); 4170 return kvm_x86_ops->get_segment_base(vcpu, seg);
3681} 4171}
3682 4172
3683int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address) 4173static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
3684{ 4174{
3685 kvm_mmu_invlpg(vcpu, address); 4175 kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
3686 return X86EMUL_CONTINUE;
3687} 4176}
3688 4177
3689int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) 4178int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
@@ -3692,31 +4181,33 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
3692 return X86EMUL_CONTINUE; 4181 return X86EMUL_CONTINUE;
3693 4182
3694 if (kvm_x86_ops->has_wbinvd_exit()) { 4183 if (kvm_x86_ops->has_wbinvd_exit()) {
4184 int cpu = get_cpu();
4185
4186 cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
3695 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask, 4187 smp_call_function_many(vcpu->arch.wbinvd_dirty_mask,
3696 wbinvd_ipi, NULL, 1); 4188 wbinvd_ipi, NULL, 1);
4189 put_cpu();
3697 cpumask_clear(vcpu->arch.wbinvd_dirty_mask); 4190 cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
3698 } 4191 } else
3699 wbinvd(); 4192 wbinvd();
3700 return X86EMUL_CONTINUE; 4193 return X86EMUL_CONTINUE;
3701} 4194}
3702EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); 4195EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
3703 4196
3704int emulate_clts(struct kvm_vcpu *vcpu) 4197static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
3705{ 4198{
3706 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); 4199 kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
3707 kvm_x86_ops->fpu_activate(vcpu);
3708 return X86EMUL_CONTINUE;
3709} 4200}
3710 4201
3711int emulator_get_dr(int dr, unsigned long *dest, struct kvm_vcpu *vcpu) 4202int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
3712{ 4203{
3713 return _kvm_get_dr(vcpu, dr, dest); 4204 return _kvm_get_dr(emul_to_vcpu(ctxt), dr, dest);
3714} 4205}
3715 4206
3716int emulator_set_dr(int dr, unsigned long value, struct kvm_vcpu *vcpu) 4207int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
3717{ 4208{
3718 4209
3719 return __kvm_set_dr(vcpu, dr, value); 4210 return __kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
3720} 4211}
3721 4212
3722static u64 mk_cr_64(u64 curr_cr, u32 new_val) 4213static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3724,8 +4215,9 @@ static u64 mk_cr_64(u64 curr_cr, u32 new_val)
3724 return (curr_cr & ~((1ULL << 32) - 1)) | new_val; 4215 return (curr_cr & ~((1ULL << 32) - 1)) | new_val;
3725} 4216}
3726 4217
3727static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu) 4218static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr)
3728{ 4219{
4220 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3729 unsigned long value; 4221 unsigned long value;
3730 4222
3731 switch (cr) { 4223 switch (cr) {
@@ -3736,7 +4228,7 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3736 value = vcpu->arch.cr2; 4228 value = vcpu->arch.cr2;
3737 break; 4229 break;
3738 case 3: 4230 case 3:
3739 value = vcpu->arch.cr3; 4231 value = kvm_read_cr3(vcpu);
3740 break; 4232 break;
3741 case 4: 4233 case 4:
3742 value = kvm_read_cr4(vcpu); 4234 value = kvm_read_cr4(vcpu);
@@ -3752,8 +4244,9 @@ static unsigned long emulator_get_cr(int cr, struct kvm_vcpu *vcpu)
3752 return value; 4244 return value;
3753} 4245}
3754 4246
3755static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu) 4247static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val)
3756{ 4248{
4249 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3757 int res = 0; 4250 int res = 0;
3758 4251
3759 switch (cr) { 4252 switch (cr) {
@@ -3770,7 +4263,7 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3770 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val)); 4263 res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3771 break; 4264 break;
3772 case 8: 4265 case 8:
3773 res = __kvm_set_cr8(vcpu, val & 0xfUL); 4266 res = kvm_set_cr8(vcpu, val);
3774 break; 4267 break;
3775 default: 4268 default:
3776 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr); 4269 vcpu_printf(vcpu, "%s: unexpected cr %u\n", __func__, cr);
@@ -3780,28 +4273,45 @@ static int emulator_set_cr(int cr, unsigned long val, struct kvm_vcpu *vcpu)
3780 return res; 4273 return res;
3781} 4274}
3782 4275
3783static int emulator_get_cpl(struct kvm_vcpu *vcpu) 4276static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt)
3784{ 4277{
3785 return kvm_x86_ops->get_cpl(vcpu); 4278 return kvm_x86_ops->get_cpl(emul_to_vcpu(ctxt));
3786} 4279}
3787 4280
3788static void emulator_get_gdt(struct desc_ptr *dt, struct kvm_vcpu *vcpu) 4281static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3789{ 4282{
3790 kvm_x86_ops->get_gdt(vcpu, dt); 4283 kvm_x86_ops->get_gdt(emul_to_vcpu(ctxt), dt);
3791} 4284}
3792 4285
3793static unsigned long emulator_get_cached_segment_base(int seg, 4286static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3794 struct kvm_vcpu *vcpu)
3795{ 4287{
3796 return get_segment_base(vcpu, seg); 4288 kvm_x86_ops->get_idt(emul_to_vcpu(ctxt), dt);
3797} 4289}
3798 4290
3799static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg, 4291static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
3800 struct kvm_vcpu *vcpu) 4292{
4293 kvm_x86_ops->set_gdt(emul_to_vcpu(ctxt), dt);
4294}
4295
4296static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt)
4297{
4298 kvm_x86_ops->set_idt(emul_to_vcpu(ctxt), dt);
4299}
4300
4301static unsigned long emulator_get_cached_segment_base(
4302 struct x86_emulate_ctxt *ctxt, int seg)
4303{
4304 return get_segment_base(emul_to_vcpu(ctxt), seg);
4305}
4306
4307static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector,
4308 struct desc_struct *desc, u32 *base3,
4309 int seg)
3801{ 4310{
3802 struct kvm_segment var; 4311 struct kvm_segment var;
3803 4312
3804 kvm_get_segment(vcpu, &var, seg); 4313 kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
4314 *selector = var.selector;
3805 4315
3806 if (var.unusable) 4316 if (var.unusable)
3807 return false; 4317 return false;
@@ -3810,6 +4320,10 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3810 var.limit >>= 12; 4320 var.limit >>= 12;
3811 set_desc_limit(desc, var.limit); 4321 set_desc_limit(desc, var.limit);
3812 set_desc_base(desc, (unsigned long)var.base); 4322 set_desc_base(desc, (unsigned long)var.base);
4323#ifdef CONFIG_X86_64
4324 if (base3)
4325 *base3 = var.base >> 32;
4326#endif
3813 desc->type = var.type; 4327 desc->type = var.type;
3814 desc->s = var.s; 4328 desc->s = var.s;
3815 desc->dpl = var.dpl; 4329 desc->dpl = var.dpl;
@@ -3822,15 +4336,18 @@ static bool emulator_get_cached_descriptor(struct desc_struct *desc, int seg,
3822 return true; 4336 return true;
3823} 4337}
3824 4338
3825static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg, 4339static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector,
3826 struct kvm_vcpu *vcpu) 4340 struct desc_struct *desc, u32 base3,
4341 int seg)
3827{ 4342{
4343 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
3828 struct kvm_segment var; 4344 struct kvm_segment var;
3829 4345
3830 /* needed to preserve selector */ 4346 var.selector = selector;
3831 kvm_get_segment(vcpu, &var, seg);
3832
3833 var.base = get_desc_base(desc); 4347 var.base = get_desc_base(desc);
4348#ifdef CONFIG_X86_64
4349 var.base |= ((u64)base3) << 32;
4350#endif
3834 var.limit = get_desc_limit(desc); 4351 var.limit = get_desc_limit(desc);
3835 if (desc->g) 4352 if (desc->g)
3836 var.limit = (var.limit << 12) | 0xfff; 4353 var.limit = (var.limit << 12) | 0xfff;
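The new get_segment/set_segment callbacks above carry a 64-bit segment base as the usual 32-bit descriptor base plus a separate base3 word for bits 63:32, because system descriptors (TSS/LDT) are 16 bytes wide in long mode. A tiny round-trip check of that split:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t base = 0x0000123456789abcULL;

	uint32_t desc_base = (uint32_t)base;         /* what set_desc_base() keeps */
	uint32_t base3     = (uint32_t)(base >> 32); /* extra word used in long mode */

	uint64_t rebuilt = ((uint64_t)base3 << 32) | desc_base;
	printf("%s\n", rebuilt == base ? "round-trip ok" : "mismatch");
	return 0;
}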
@@ -3850,22 +4367,44 @@ static void emulator_set_cached_descriptor(struct desc_struct *desc, int seg,
3850 return; 4367 return;
3851} 4368}
3852 4369
3853static u16 emulator_get_segment_selector(int seg, struct kvm_vcpu *vcpu) 4370static int emulator_get_msr(struct x86_emulate_ctxt *ctxt,
4371 u32 msr_index, u64 *pdata)
3854{ 4372{
3855 struct kvm_segment kvm_seg; 4373 return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata);
4374}
3856 4375
3857 kvm_get_segment(vcpu, &kvm_seg, seg); 4376static int emulator_set_msr(struct x86_emulate_ctxt *ctxt,
3858 return kvm_seg.selector; 4377 u32 msr_index, u64 data)
4378{
4379 return kvm_set_msr(emul_to_vcpu(ctxt), msr_index, data);
4380}
4381
4382static void emulator_halt(struct x86_emulate_ctxt *ctxt)
4383{
4384 emul_to_vcpu(ctxt)->arch.halt_request = 1;
4385}
4386
4387static void emulator_get_fpu(struct x86_emulate_ctxt *ctxt)
4388{
4389 preempt_disable();
4390 kvm_load_guest_fpu(emul_to_vcpu(ctxt));
4391 /*
4392 * CR0.TS may reference the host fpu state, not the guest fpu state,
4393 * so it may be clear at this point.
4394 */
4395 clts();
3859} 4396}
3860 4397
3861static void emulator_set_segment_selector(u16 sel, int seg, 4398static void emulator_put_fpu(struct x86_emulate_ctxt *ctxt)
3862 struct kvm_vcpu *vcpu)
3863{ 4399{
3864 struct kvm_segment kvm_seg; 4400 preempt_enable();
4401}
3865 4402
3866 kvm_get_segment(vcpu, &kvm_seg, seg); 4403static int emulator_intercept(struct x86_emulate_ctxt *ctxt,
3867 kvm_seg.selector = sel; 4404 struct x86_instruction_info *info,
3868 kvm_set_segment(vcpu, &kvm_seg, seg); 4405 enum x86_intercept_stage stage)
4406{
4407 return kvm_x86_ops->check_intercept(emul_to_vcpu(ctxt), info, stage);
3869} 4408}
3870 4409
3871static struct x86_emulate_ops emulate_ops = { 4410static struct x86_emulate_ops emulate_ops = {
@@ -3875,21 +4414,29 @@ static struct x86_emulate_ops emulate_ops = {
3875 .read_emulated = emulator_read_emulated, 4414 .read_emulated = emulator_read_emulated,
3876 .write_emulated = emulator_write_emulated, 4415 .write_emulated = emulator_write_emulated,
3877 .cmpxchg_emulated = emulator_cmpxchg_emulated, 4416 .cmpxchg_emulated = emulator_cmpxchg_emulated,
4417 .invlpg = emulator_invlpg,
3878 .pio_in_emulated = emulator_pio_in_emulated, 4418 .pio_in_emulated = emulator_pio_in_emulated,
3879 .pio_out_emulated = emulator_pio_out_emulated, 4419 .pio_out_emulated = emulator_pio_out_emulated,
3880 .get_cached_descriptor = emulator_get_cached_descriptor, 4420 .get_segment = emulator_get_segment,
3881 .set_cached_descriptor = emulator_set_cached_descriptor, 4421 .set_segment = emulator_set_segment,
3882 .get_segment_selector = emulator_get_segment_selector,
3883 .set_segment_selector = emulator_set_segment_selector,
3884 .get_cached_segment_base = emulator_get_cached_segment_base, 4422 .get_cached_segment_base = emulator_get_cached_segment_base,
3885 .get_gdt = emulator_get_gdt, 4423 .get_gdt = emulator_get_gdt,
4424 .get_idt = emulator_get_idt,
4425 .set_gdt = emulator_set_gdt,
4426 .set_idt = emulator_set_idt,
3886 .get_cr = emulator_get_cr, 4427 .get_cr = emulator_get_cr,
3887 .set_cr = emulator_set_cr, 4428 .set_cr = emulator_set_cr,
3888 .cpl = emulator_get_cpl, 4429 .cpl = emulator_get_cpl,
3889 .get_dr = emulator_get_dr, 4430 .get_dr = emulator_get_dr,
3890 .set_dr = emulator_set_dr, 4431 .set_dr = emulator_set_dr,
3891 .set_msr = kvm_set_msr, 4432 .set_msr = emulator_set_msr,
3892 .get_msr = kvm_get_msr, 4433 .get_msr = emulator_get_msr,
4434 .halt = emulator_halt,
4435 .wbinvd = emulator_wbinvd,
4436 .fix_hypercall = emulator_fix_hypercall,
4437 .get_fpu = emulator_get_fpu,
4438 .put_fpu = emulator_put_fpu,
4439 .intercept = emulator_intercept,
3893}; 4440};
3894 4441
3895static void cache_all_regs(struct kvm_vcpu *vcpu) 4442static void cache_all_regs(struct kvm_vcpu *vcpu)
@@ -3917,23 +4464,89 @@ static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask)
3917static void inject_emulated_exception(struct kvm_vcpu *vcpu) 4464static void inject_emulated_exception(struct kvm_vcpu *vcpu)
3918{ 4465{
3919 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt; 4466 struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
3920 if (ctxt->exception == PF_VECTOR) 4467 if (ctxt->exception.vector == PF_VECTOR)
3921 kvm_inject_page_fault(vcpu, ctxt->cr2, ctxt->error_code); 4468 kvm_propagate_fault(vcpu, &ctxt->exception);
3922 else if (ctxt->error_code_valid) 4469 else if (ctxt->exception.error_code_valid)
3923 kvm_queue_exception_e(vcpu, ctxt->exception, ctxt->error_code); 4470 kvm_queue_exception_e(vcpu, ctxt->exception.vector,
4471 ctxt->exception.error_code);
4472 else
4473 kvm_queue_exception(vcpu, ctxt->exception.vector);
4474}
4475
4476static void init_emulate_ctxt(struct kvm_vcpu *vcpu)
4477{
4478 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4479 int cs_db, cs_l;
4480
4481 /*
4482 * TODO: fix emulate.c to use guest_read/write_register
 4483	 * instead of direct ->regs accesses, can save a hundred cycles
 4484	 * on Intel for instructions that don't read/change RSP,
 4485	 * for example.
4486 */
4487 cache_all_regs(vcpu);
4488
4489 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
4490
4491 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
4492 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
4493 vcpu->arch.emulate_ctxt.mode =
4494 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
4495 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
4496 ? X86EMUL_MODE_VM86 : cs_l
4497 ? X86EMUL_MODE_PROT64 : cs_db
4498 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
4499 vcpu->arch.emulate_ctxt.guest_mode = is_guest_mode(vcpu);
4500 memset(c, 0, sizeof(struct decode_cache));
4501 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4502 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4503}
4504
4505int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip)
4506{
4507 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4508 int ret;
4509
4510 init_emulate_ctxt(vcpu);
4511
4512 vcpu->arch.emulate_ctxt.decode.op_bytes = 2;
4513 vcpu->arch.emulate_ctxt.decode.ad_bytes = 2;
4514 vcpu->arch.emulate_ctxt.decode.eip = vcpu->arch.emulate_ctxt.eip +
4515 inc_eip;
4516 ret = emulate_int_real(&vcpu->arch.emulate_ctxt, &emulate_ops, irq);
4517
4518 if (ret != X86EMUL_CONTINUE)
4519 return EMULATE_FAIL;
4520
4521 vcpu->arch.emulate_ctxt.eip = c->eip;
4522 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4523 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4524 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4525
4526 if (irq == NMI_VECTOR)
4527 vcpu->arch.nmi_pending = false;
3924 else 4528 else
3925 kvm_queue_exception(vcpu, ctxt->exception); 4529 vcpu->arch.interrupt.pending = false;
4530
4531 return EMULATE_DONE;
3926} 4532}
4533EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt);
3927 4534
3928static int handle_emulation_failure(struct kvm_vcpu *vcpu) 4535static int handle_emulation_failure(struct kvm_vcpu *vcpu)
3929{ 4536{
4537 int r = EMULATE_DONE;
4538
3930 ++vcpu->stat.insn_emulation_fail; 4539 ++vcpu->stat.insn_emulation_fail;
3931 trace_kvm_emulate_insn_failed(vcpu); 4540 trace_kvm_emulate_insn_failed(vcpu);
3932 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 4541 if (!is_guest_mode(vcpu)) {
3933 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 4542 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3934 vcpu->run->internal.ndata = 0; 4543 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
4544 vcpu->run->internal.ndata = 0;
4545 r = EMULATE_FAIL;
4546 }
3935 kvm_queue_exception(vcpu, UD_VECTOR); 4547 kvm_queue_exception(vcpu, UD_VECTOR);
3936 return EMULATE_FAIL; 4548
4549 return r;
3937} 4550}
3938 4551
3939static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva) 4552static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
@@ -3962,74 +4575,34 @@ static bool reexecute_instruction(struct kvm_vcpu *vcpu, gva_t gva)
3962 return false; 4575 return false;
3963} 4576}
3964 4577
3965int emulate_instruction(struct kvm_vcpu *vcpu, 4578int x86_emulate_instruction(struct kvm_vcpu *vcpu,
3966 unsigned long cr2, 4579 unsigned long cr2,
3967 u16 error_code, 4580 int emulation_type,
3968 int emulation_type) 4581 void *insn,
4582 int insn_len)
3969{ 4583{
3970 int r; 4584 int r;
3971 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 4585 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
4586 bool writeback = true;
3972 4587
3973 kvm_clear_exception_queue(vcpu); 4588 kvm_clear_exception_queue(vcpu);
3974 vcpu->arch.mmio_fault_cr2 = cr2;
3975 /*
3976 * TODO: fix emulate.c to use guest_read/write_register
3977 * instead of direct ->regs accesses, can save hundred cycles
3978 * on Intel for instructions that don't read/change RSP, for
3979 * for example.
3980 */
3981 cache_all_regs(vcpu);
3982 4589
3983 if (!(emulation_type & EMULTYPE_NO_DECODE)) { 4590 if (!(emulation_type & EMULTYPE_NO_DECODE)) {
3984 int cs_db, cs_l; 4591 init_emulate_ctxt(vcpu);
3985 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3986
3987 vcpu->arch.emulate_ctxt.vcpu = vcpu;
3988 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
3989 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
3990 vcpu->arch.emulate_ctxt.mode =
3991 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
3992 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
3993 ? X86EMUL_MODE_VM86 : cs_l
3994 ? X86EMUL_MODE_PROT64 : cs_db
3995 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
3996 memset(c, 0, sizeof(struct decode_cache));
3997 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
3998 vcpu->arch.emulate_ctxt.interruptibility = 0; 4592 vcpu->arch.emulate_ctxt.interruptibility = 0;
3999 vcpu->arch.emulate_ctxt.exception = -1; 4593 vcpu->arch.emulate_ctxt.have_exception = false;
4000 4594 vcpu->arch.emulate_ctxt.perm_ok = false;
4001 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, &emulate_ops);
4002 trace_kvm_emulate_insn_start(vcpu);
4003 4595
4004 /* Only allow emulation of specific instructions on #UD 4596 vcpu->arch.emulate_ctxt.only_vendor_specific_insn
4005 * (namely VMMCALL, sysenter, sysexit, syscall)*/ 4597 = emulation_type & EMULTYPE_TRAP_UD;
4006 if (emulation_type & EMULTYPE_TRAP_UD) {
4007 if (!c->twobyte)
4008 return EMULATE_FAIL;
4009 switch (c->b) {
4010 case 0x01: /* VMMCALL */
4011 if (c->modrm_mod != 3 || c->modrm_rm != 1)
4012 return EMULATE_FAIL;
4013 break;
4014 case 0x34: /* sysenter */
4015 case 0x35: /* sysexit */
4016 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4017 return EMULATE_FAIL;
4018 break;
4019 case 0x05: /* syscall */
4020 if (c->modrm_mod != 0 || c->modrm_rm != 0)
4021 return EMULATE_FAIL;
4022 break;
4023 default:
4024 return EMULATE_FAIL;
4025 }
4026 4598
4027 if (!(c->modrm_reg == 0 || c->modrm_reg == 3)) 4599 r = x86_decode_insn(&vcpu->arch.emulate_ctxt, insn, insn_len);
4028 return EMULATE_FAIL;
4029 }
4030 4600
4601 trace_kvm_emulate_insn_start(vcpu);
4031 ++vcpu->stat.insn_emulation; 4602 ++vcpu->stat.insn_emulation;
4032 if (r) { 4603 if (r) {
4604 if (emulation_type & EMULTYPE_TRAP_UD)
4605 return EMULATE_FAIL;
4033 if (reexecute_instruction(vcpu, cr2)) 4606 if (reexecute_instruction(vcpu, cr2))
4034 return EMULATE_DONE; 4607 return EMULATE_DONE;
4035 if (emulation_type & EMULTYPE_SKIP) 4608 if (emulation_type & EMULTYPE_SKIP)
@@ -4043,62 +4616,87 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
4043 return EMULATE_DONE; 4616 return EMULATE_DONE;
4044 } 4617 }
4045 4618
4046 /* this is needed for vmware backdor interface to work since it 4619 /* this is needed for vmware backdoor interface to work since it
4047 changes registers values during IO operation */ 4620 changes registers values during IO operation */
4048 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs); 4621 if (vcpu->arch.emulate_regs_need_sync_from_vcpu) {
4622 vcpu->arch.emulate_regs_need_sync_from_vcpu = false;
4623 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
4624 }
4049 4625
4050restart: 4626restart:
4051 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt, &emulate_ops); 4627 r = x86_emulate_insn(&vcpu->arch.emulate_ctxt);
4628
4629 if (r == EMULATION_INTERCEPTED)
4630 return EMULATE_DONE;
4052 4631
4053 if (r) { /* emulation failed */ 4632 if (r == EMULATION_FAILED) {
4054 if (reexecute_instruction(vcpu, cr2)) 4633 if (reexecute_instruction(vcpu, cr2))
4055 return EMULATE_DONE; 4634 return EMULATE_DONE;
4056 4635
4057 return handle_emulation_failure(vcpu); 4636 return handle_emulation_failure(vcpu);
4058 } 4637 }
4059 4638
4060 toggle_interruptibility(vcpu, vcpu->arch.emulate_ctxt.interruptibility); 4639 if (vcpu->arch.emulate_ctxt.have_exception) {
4061 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4062 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4063 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4064
4065 if (vcpu->arch.emulate_ctxt.exception >= 0) {
4066 inject_emulated_exception(vcpu); 4640 inject_emulated_exception(vcpu);
4067 return EMULATE_DONE; 4641 r = EMULATE_DONE;
4068 } 4642 } else if (vcpu->arch.pio.count) {
4069
4070 if (vcpu->arch.pio.count) {
4071 if (!vcpu->arch.pio.in) 4643 if (!vcpu->arch.pio.in)
4072 vcpu->arch.pio.count = 0; 4644 vcpu->arch.pio.count = 0;
4073 return EMULATE_DO_MMIO; 4645 else
4074 } 4646 writeback = false;
4075 4647 r = EMULATE_DO_MMIO;
4076 if (vcpu->mmio_needed) { 4648 } else if (vcpu->mmio_needed) {
4077 if (vcpu->mmio_is_write) 4649 if (!vcpu->mmio_is_write)
4078 vcpu->mmio_needed = 0; 4650 writeback = false;
4079 return EMULATE_DO_MMIO; 4651 r = EMULATE_DO_MMIO;
4080 } 4652 } else if (r == EMULATION_RESTART)
4081
4082 if (vcpu->arch.emulate_ctxt.restart)
4083 goto restart; 4653 goto restart;
4654 else
4655 r = EMULATE_DONE;
4656
4657 if (writeback) {
4658 toggle_interruptibility(vcpu,
4659 vcpu->arch.emulate_ctxt.interruptibility);
4660 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
4661 kvm_make_request(KVM_REQ_EVENT, vcpu);
4662 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
4663 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
4664 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
4665 } else
4666 vcpu->arch.emulate_regs_need_sync_to_vcpu = true;
4084 4667
4085 return EMULATE_DONE; 4668 return r;
4086} 4669}
4087EXPORT_SYMBOL_GPL(emulate_instruction); 4670EXPORT_SYMBOL_GPL(x86_emulate_instruction);
4088 4671
4089int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port) 4672int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port)
4090{ 4673{
4091 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX); 4674 unsigned long val = kvm_register_read(vcpu, VCPU_REGS_RAX);
4092 int ret = emulator_pio_out_emulated(size, port, &val, 1, vcpu); 4675 int ret = emulator_pio_out_emulated(&vcpu->arch.emulate_ctxt,
4676 size, port, &val, 1);
4093 /* do not return to emulator after return from userspace */ 4677 /* do not return to emulator after return from userspace */
4094 vcpu->arch.pio.count = 0; 4678 vcpu->arch.pio.count = 0;
4095 return ret; 4679 return ret;
4096} 4680}
4097EXPORT_SYMBOL_GPL(kvm_fast_pio_out); 4681EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
4098 4682
4099static void bounce_off(void *info) 4683static void tsc_bad(void *info)
4100{ 4684{
4101 /* nothing */ 4685 __this_cpu_write(cpu_tsc_khz, 0);
4686}
4687
4688static void tsc_khz_changed(void *data)
4689{
4690 struct cpufreq_freqs *freq = data;
4691 unsigned long khz = 0;
4692
4693 if (data)
4694 khz = freq->new;
4695 else if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4696 khz = cpufreq_quick_get(raw_smp_processor_id());
4697 if (!khz)
4698 khz = tsc_khz;
4699 __this_cpu_write(cpu_tsc_khz, khz);
4102} 4700}
4103 4701
4104static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 4702static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
@@ -4109,24 +4707,63 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4109 struct kvm_vcpu *vcpu; 4707 struct kvm_vcpu *vcpu;
4110 int i, send_ipi = 0; 4708 int i, send_ipi = 0;
4111 4709
4710 /*
4711 * We allow guests to temporarily run on slowing clocks,
4712 * provided we notify them after, or to run on accelerating
4713 * clocks, provided we notify them before. Thus time never
4714 * goes backwards.
4715 *
4716 * However, we have a problem. We can't atomically update
4717 * the frequency of a given CPU from this function; it is
4718 * merely a notifier, which can be called from any CPU.
4719 * Changing the TSC frequency at arbitrary points in time
4720 * requires a recomputation of local variables related to
4721 * the TSC for each VCPU. We must flag these local variables
4722 * to be updated and be sure the update takes place with the
4723 * new frequency before any guests proceed.
4724 *
4725 * Unfortunately, the combination of hotplug CPU and frequency
4726 * change creates an intractable locking scenario; the order
4727 * of when these callouts happen is undefined with respect to
4728 * CPU hotplug, and they can race with each other. As such,
4729 * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is
4730 * undefined; you can actually have a CPU frequency change take
4731 * place in between the computation of X and the setting of the
4732 * variable. To protect against this problem, all updates of
4733 * the per_cpu tsc_khz variable are done in an interrupt
4734 * protected IPI, and all callers wishing to update the value
4735 * must wait for a synchronous IPI to complete (which is trivial
4736 * if the caller is on the CPU already). This establishes the
4737 * necessary total order on variable updates.
4738 *
4739 * Note that because a guest time update may take place
4740 * anytime after the setting of the VCPU's request bit, the
4741 * correct TSC value must be set before the request. However,
4742 * to ensure the update actually makes it to any guest which
4743 * starts running in hardware virtualization between the set
4744 * and the acquisition of the spinlock, we must also ping the
4745 * CPU after setting the request bit.
4746 *
4747 */
4748
4112 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 4749 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
4113 return 0; 4750 return 0;
4114 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 4751 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
4115 return 0; 4752 return 0;
4116 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
4117 4753
4118 spin_lock(&kvm_lock); 4754 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4755
4756 raw_spin_lock(&kvm_lock);
4119 list_for_each_entry(kvm, &vm_list, vm_list) { 4757 list_for_each_entry(kvm, &vm_list, vm_list) {
4120 kvm_for_each_vcpu(i, vcpu, kvm) { 4758 kvm_for_each_vcpu(i, vcpu, kvm) {
4121 if (vcpu->cpu != freq->cpu) 4759 if (vcpu->cpu != freq->cpu)
4122 continue; 4760 continue;
4123 if (!kvm_request_guest_time_update(vcpu)) 4761 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
4124 continue;
4125 if (vcpu->cpu != smp_processor_id()) 4762 if (vcpu->cpu != smp_processor_id())
4126 send_ipi++; 4763 send_ipi = 1;
4127 } 4764 }
4128 } 4765 }
4129 spin_unlock(&kvm_lock); 4766 raw_spin_unlock(&kvm_lock);
4130 4767
4131 if (freq->old < freq->new && send_ipi) { 4768 if (freq->old < freq->new && send_ipi) {
4132 /* 4769 /*
@@ -4141,32 +4778,59 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
4141 * guest context is entered kvmclock will be updated, 4778 * guest context is entered kvmclock will be updated,
4142 * so the guest will not see stale values. 4779 * so the guest will not see stale values.
4143 */ 4780 */
4144 smp_call_function_single(freq->cpu, bounce_off, NULL, 1); 4781 smp_call_function_single(freq->cpu, tsc_khz_changed, freq, 1);
4145 } 4782 }
4146 return 0; 4783 return 0;
4147} 4784}
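/*
 * Illustrative sketch (standalone, not from the patch): a minimal model of
 * the ordering rule the long comment above describes.  The per-CPU khz
 * value is only updated on the "safe" edge of a transition, before an
 * increase and after a decrease, so guest time can never appear to run
 * backwards.  The names and types below are hypothetical.
 */
#include <stdbool.h>

enum freq_phase { FREQ_PRECHANGE, FREQ_POSTCHANGE };

static bool should_update_tsc_khz(enum freq_phase phase,
				  unsigned long old_khz, unsigned long new_khz)
{
	if (phase == FREQ_PRECHANGE && old_khz > new_khz)
		return false;	/* slowing down: defer to the POSTCHANGE call */
	if (phase == FREQ_POSTCHANGE && old_khz < new_khz)
		return false;	/* speeding up: already handled at PRECHANGE */
	return true;
}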
4148 4785
4149static struct notifier_block kvmclock_cpufreq_notifier_block = { 4786static struct notifier_block kvmclock_cpufreq_notifier_block = {
4150 .notifier_call = kvmclock_cpufreq_notifier 4787 .notifier_call = kvmclock_cpufreq_notifier
4788};
4789
4790static int kvmclock_cpu_notifier(struct notifier_block *nfb,
4791 unsigned long action, void *hcpu)
4792{
4793 unsigned int cpu = (unsigned long)hcpu;
4794
4795 switch (action) {
4796 case CPU_ONLINE:
4797 case CPU_DOWN_FAILED:
4798 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4799 break;
4800 case CPU_DOWN_PREPARE:
4801 smp_call_function_single(cpu, tsc_bad, NULL, 1);
4802 break;
4803 }
4804 return NOTIFY_OK;
4805}
4806
4807static struct notifier_block kvmclock_cpu_notifier_block = {
4808 .notifier_call = kvmclock_cpu_notifier,
4809 .priority = -INT_MAX
4151}; 4810};
4152 4811
4153static void kvm_timer_init(void) 4812static void kvm_timer_init(void)
4154{ 4813{
4155 int cpu; 4814 int cpu;
4156 4815
4816 max_tsc_khz = tsc_khz;
4817 register_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4157 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { 4818 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
4819#ifdef CONFIG_CPU_FREQ
4820 struct cpufreq_policy policy;
4821 memset(&policy, 0, sizeof(policy));
4822 cpu = get_cpu();
4823 cpufreq_get_policy(&policy, cpu);
4824 if (policy.cpuinfo.max_freq)
4825 max_tsc_khz = policy.cpuinfo.max_freq;
4826 put_cpu();
4827#endif
4158 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block, 4828 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
4159 CPUFREQ_TRANSITION_NOTIFIER); 4829 CPUFREQ_TRANSITION_NOTIFIER);
4160 for_each_online_cpu(cpu) {
4161 unsigned long khz = cpufreq_get(cpu);
4162 if (!khz)
4163 khz = tsc_khz;
4164 per_cpu(cpu_tsc_khz, cpu) = khz;
4165 }
4166 } else {
4167 for_each_possible_cpu(cpu)
4168 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
4169 } 4830 }
4831 pr_debug("kvm: max_tsc_khz = %ld\n", max_tsc_khz);
4832 for_each_online_cpu(cpu)
4833 smp_call_function_single(cpu, tsc_khz_changed, NULL, 1);
4170} 4834}
4171 4835
4172static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu); 4836static DEFINE_PER_CPU(struct kvm_vcpu *, current_vcpu);
@@ -4244,7 +4908,6 @@ int kvm_arch_init(void *opaque)
4244 4908
4245 kvm_x86_ops = ops; 4909 kvm_x86_ops = ops;
4246 kvm_mmu_set_nonpresent_ptes(0ull, 0ull); 4910 kvm_mmu_set_nonpresent_ptes(0ull, 0ull);
4247 kvm_mmu_set_base_ptes(PT_PRESENT_MASK);
4248 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 4911 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
4249 PT_DIRTY_MASK, PT64_NX_MASK, 0); 4912 PT_DIRTY_MASK, PT64_NX_MASK, 0);
4250 4913
@@ -4268,6 +4931,7 @@ void kvm_arch_exit(void)
4268 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) 4931 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC))
4269 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block, 4932 cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
4270 CPUFREQ_TRANSITION_NOTIFIER); 4933 CPUFREQ_TRANSITION_NOTIFIER);
4934 unregister_hotcpu_notifier(&kvmclock_cpu_notifier_block);
4271 kvm_x86_ops = NULL; 4935 kvm_x86_ops = NULL;
4272 kvm_mmu_module_exit(); 4936 kvm_mmu_module_exit();
4273} 4937}
@@ -4403,8 +5067,9 @@ out:
4403} 5067}
4404EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); 5068EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
4405 5069
4406int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 5070int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt)
4407{ 5071{
5072 struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);
4408 char instruction[3]; 5073 char instruction[3];
4409 unsigned long rip = kvm_rip_read(vcpu); 5074 unsigned long rip = kvm_rip_read(vcpu);
4410 5075
@@ -4417,21 +5082,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
4417 5082
4418 kvm_x86_ops->patch_hypercall(vcpu, instruction); 5083 kvm_x86_ops->patch_hypercall(vcpu, instruction);
4419 5084
4420 return emulator_write_emulated(rip, instruction, 3, NULL, vcpu); 5085 return emulator_write_emulated(&vcpu->arch.emulate_ctxt,
4421} 5086 rip, instruction, 3, NULL);
4422
4423void realmode_lgdt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4424{
4425 struct desc_ptr dt = { limit, base };
4426
4427 kvm_x86_ops->set_gdt(vcpu, &dt);
4428}
4429
4430void realmode_lidt(struct kvm_vcpu *vcpu, u16 limit, unsigned long base)
4431{
4432 struct desc_ptr dt = { limit, base };
4433
4434 kvm_x86_ops->set_idt(vcpu, &dt);
4435} 5087}
4436 5088
4437static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i) 5089static int move_to_next_stateful_cpuid_entry(struct kvm_vcpu *vcpu, int i)
@@ -4482,12 +5134,6 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
4482 best = e; 5134 best = e;
4483 break; 5135 break;
4484 } 5136 }
4485 /*
4486 * Both basic or both extended?
4487 */
4488 if (((e->function ^ function) & 0x80000000) == 0)
4489 if (!best || e->function > best->function)
4490 best = e;
4491 } 5137 }
4492 return best; 5138 return best;
4493} 5139}
@@ -4507,6 +5153,27 @@ not_found:
4507 return 36; 5153 return 36;
4508} 5154}
4509 5155
5156/*
5157 * If no match is found, check whether we exceed the vCPU's limit
5158 * and return the content of the highest valid _standard_ leaf instead.
5159 * This is to satisfy the CPUID specification.
5160 */
5161static struct kvm_cpuid_entry2* check_cpuid_limit(struct kvm_vcpu *vcpu,
5162 u32 function, u32 index)
5163{
5164 struct kvm_cpuid_entry2 *maxlevel;
5165
5166 maxlevel = kvm_find_cpuid_entry(vcpu, function & 0x80000000, 0);
5167 if (!maxlevel || maxlevel->eax >= function)
5168 return NULL;
5169 if (function & 0x80000000) {
5170 maxlevel = kvm_find_cpuid_entry(vcpu, 0, 0);
5171 if (!maxlevel)
5172 return NULL;
5173 }
5174 return kvm_find_cpuid_entry(vcpu, maxlevel->eax, index);
5175}
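/*
 * Illustrative sketch (standalone, hypothetical values): the effect of the
 * limit check above, restricted to standard leaves for simplicity.  If the
 * guest's highest standard leaf is 0xd and it requests a leaf beyond that
 * maximum, the returned data comes from the highest valid standard leaf
 * rather than an all-zero entry, matching real hardware behaviour when the
 * CPUID limit is exceeded.
 */
#include <stdio.h>

#define GUEST_MAX_STD_LEAF 0x0000000du	/* assumed value for the example */

static unsigned int effective_std_leaf(unsigned int function)
{
	/* Standard leaves only: an over-limit request is served from the
	 * highest valid standard leaf. */
	return function > GUEST_MAX_STD_LEAF ? GUEST_MAX_STD_LEAF : function;
}

int main(void)
{
	printf("CPUID 0x1f is served from leaf 0x%x\n",
	       effective_std_leaf(0x1f));	/* prints 0xd */
	return 0;
}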
5176
4510void kvm_emulate_cpuid(struct kvm_vcpu *vcpu) 5177void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4511{ 5178{
4512 u32 function, index; 5179 u32 function, index;
@@ -4519,6 +5186,10 @@ void kvm_emulate_cpuid(struct kvm_vcpu *vcpu)
4519 kvm_register_write(vcpu, VCPU_REGS_RCX, 0); 5186 kvm_register_write(vcpu, VCPU_REGS_RCX, 0);
4520 kvm_register_write(vcpu, VCPU_REGS_RDX, 0); 5187 kvm_register_write(vcpu, VCPU_REGS_RDX, 0);
4521 best = kvm_find_cpuid_entry(vcpu, function, index); 5188 best = kvm_find_cpuid_entry(vcpu, function, index);
5189
5190 if (!best)
5191 best = check_cpuid_limit(vcpu, function, index);
5192
4522 if (best) { 5193 if (best) {
4523 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax); 5194 kvm_register_write(vcpu, VCPU_REGS_RAX, best->eax);
4524 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx); 5195 kvm_register_write(vcpu, VCPU_REGS_RBX, best->ebx);
@@ -4675,6 +5346,7 @@ static void kvm_put_guest_xcr0(struct kvm_vcpu *vcpu)
4675static int vcpu_enter_guest(struct kvm_vcpu *vcpu) 5346static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4676{ 5347{
4677 int r; 5348 int r;
5349 bool nmi_pending;
4678 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 5350 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
4679 vcpu->run->request_interrupt_window; 5351 vcpu->run->request_interrupt_window;
4680 5352
@@ -4683,8 +5355,11 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4683 kvm_mmu_unload(vcpu); 5355 kvm_mmu_unload(vcpu);
4684 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) 5356 if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu))
4685 __kvm_migrate_timers(vcpu); 5357 __kvm_migrate_timers(vcpu);
4686 if (kvm_check_request(KVM_REQ_KVMCLOCK_UPDATE, vcpu)) 5358 if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
4687 kvm_write_guest_time(vcpu); 5359 r = kvm_guest_time_update(vcpu);
5360 if (unlikely(r))
5361 goto out;
5362 }
4688 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) 5363 if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu))
4689 kvm_mmu_sync_roots(vcpu); 5364 kvm_mmu_sync_roots(vcpu);
4690 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) 5365 if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu))
@@ -4703,12 +5378,41 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4703 vcpu->fpu_active = 0; 5378 vcpu->fpu_active = 0;
4704 kvm_x86_ops->fpu_deactivate(vcpu); 5379 kvm_x86_ops->fpu_deactivate(vcpu);
4705 } 5380 }
5381 if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) {
5382 /* Page is swapped out. Do synthetic halt */
5383 vcpu->arch.apf.halted = true;
5384 r = 1;
5385 goto out;
5386 }
4706 } 5387 }
4707 5388
4708 r = kvm_mmu_reload(vcpu); 5389 r = kvm_mmu_reload(vcpu);
4709 if (unlikely(r)) 5390 if (unlikely(r))
4710 goto out; 5391 goto out;
4711 5392
5393 /*
5394 * An NMI can be injected between local nmi_pending read and
5395 * vcpu->arch.nmi_pending read inside inject_pending_event().
5396 * But in that case, KVM_REQ_EVENT will be set, which makes
5397 * the race described above benign.
5398 */
5399 nmi_pending = ACCESS_ONCE(vcpu->arch.nmi_pending);
5400
5401 if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win) {
5402 inject_pending_event(vcpu);
5403
5404 /* enable NMI/IRQ window open exits if needed */
5405 if (nmi_pending)
5406 kvm_x86_ops->enable_nmi_window(vcpu);
5407 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
5408 kvm_x86_ops->enable_irq_window(vcpu);
5409
5410 if (kvm_lapic_enabled(vcpu)) {
5411 update_cr8_intercept(vcpu);
5412 kvm_lapic_sync_to_vapic(vcpu);
5413 }
5414 }
5415
4712 preempt_disable(); 5416 preempt_disable();
4713 5417
4714 kvm_x86_ops->prepare_guest_switch(vcpu); 5418 kvm_x86_ops->prepare_guest_switch(vcpu);
@@ -4716,34 +5420,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4716 kvm_load_guest_fpu(vcpu); 5420 kvm_load_guest_fpu(vcpu);
4717 kvm_load_guest_xcr0(vcpu); 5421 kvm_load_guest_xcr0(vcpu);
4718 5422
4719 atomic_set(&vcpu->guest_mode, 1); 5423 vcpu->mode = IN_GUEST_MODE;
4720 smp_wmb(); 5424
 5425 /* We should set ->mode before checking ->requests,
5426 * see the comment in make_all_cpus_request.
5427 */
5428 smp_mb();
4721 5429
4722 local_irq_disable(); 5430 local_irq_disable();
4723 5431
4724 if (!atomic_read(&vcpu->guest_mode) || vcpu->requests 5432 if (vcpu->mode == EXITING_GUEST_MODE || vcpu->requests
4725 || need_resched() || signal_pending(current)) { 5433 || need_resched() || signal_pending(current)) {
4726 atomic_set(&vcpu->guest_mode, 0); 5434 vcpu->mode = OUTSIDE_GUEST_MODE;
4727 smp_wmb(); 5435 smp_wmb();
4728 local_irq_enable(); 5436 local_irq_enable();
4729 preempt_enable(); 5437 preempt_enable();
5438 kvm_x86_ops->cancel_injection(vcpu);
4730 r = 1; 5439 r = 1;
4731 goto out; 5440 goto out;
4732 } 5441 }
4733 5442
4734 inject_pending_event(vcpu);
4735
4736 /* enable NMI/IRQ window open exits if needed */
4737 if (vcpu->arch.nmi_pending)
4738 kvm_x86_ops->enable_nmi_window(vcpu);
4739 else if (kvm_cpu_has_interrupt(vcpu) || req_int_win)
4740 kvm_x86_ops->enable_irq_window(vcpu);
4741
4742 if (kvm_lapic_enabled(vcpu)) {
4743 update_cr8_intercept(vcpu);
4744 kvm_lapic_sync_to_vapic(vcpu);
4745 }
4746
4747 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); 5443 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4748 5444
4749 kvm_guest_enter(); 5445 kvm_guest_enter();
@@ -4769,7 +5465,9 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
4769 if (hw_breakpoint_active()) 5465 if (hw_breakpoint_active())
4770 hw_breakpoint_restore(); 5466 hw_breakpoint_restore();
4771 5467
4772 atomic_set(&vcpu->guest_mode, 0); 5468 kvm_get_msr(vcpu, MSR_IA32_TSC, &vcpu->arch.last_guest_tsc);
5469
5470 vcpu->mode = OUTSIDE_GUEST_MODE;
4773 smp_wmb(); 5471 smp_wmb();
4774 local_irq_enable(); 5472 local_irq_enable();
4775 5473
@@ -4826,7 +5524,8 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4826 5524
4827 r = 1; 5525 r = 1;
4828 while (r > 0) { 5526 while (r > 0) {
4829 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 5527 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
5528 !vcpu->arch.apf.halted)
4830 r = vcpu_enter_guest(vcpu); 5529 r = vcpu_enter_guest(vcpu);
4831 else { 5530 else {
4832 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx); 5531 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
@@ -4839,6 +5538,7 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4839 vcpu->arch.mp_state = 5538 vcpu->arch.mp_state =
4840 KVM_MP_STATE_RUNNABLE; 5539 KVM_MP_STATE_RUNNABLE;
4841 case KVM_MP_STATE_RUNNABLE: 5540 case KVM_MP_STATE_RUNNABLE:
5541 vcpu->arch.apf.halted = false;
4842 break; 5542 break;
4843 case KVM_MP_STATE_SIPI_RECEIVED: 5543 case KVM_MP_STATE_SIPI_RECEIVED:
4844 default: 5544 default:
@@ -4860,6 +5560,9 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4860 vcpu->run->exit_reason = KVM_EXIT_INTR; 5560 vcpu->run->exit_reason = KVM_EXIT_INTR;
4861 ++vcpu->stat.request_irq_exits; 5561 ++vcpu->stat.request_irq_exits;
4862 } 5562 }
5563
5564 kvm_check_async_pf_completion(vcpu);
5565
4863 if (signal_pending(current)) { 5566 if (signal_pending(current)) {
4864 r = -EINTR; 5567 r = -EINTR;
4865 vcpu->run->exit_reason = KVM_EXIT_INTR; 5568 vcpu->run->exit_reason = KVM_EXIT_INTR;
@@ -4879,11 +5582,49 @@ static int __vcpu_run(struct kvm_vcpu *vcpu)
4879 return r; 5582 return r;
4880} 5583}
4881 5584
5585static int complete_mmio(struct kvm_vcpu *vcpu)
5586{
5587 struct kvm_run *run = vcpu->run;
5588 int r;
5589
5590 if (!(vcpu->arch.pio.count || vcpu->mmio_needed))
5591 return 1;
5592
5593 if (vcpu->mmio_needed) {
5594 vcpu->mmio_needed = 0;
5595 if (!vcpu->mmio_is_write)
5596 memcpy(vcpu->mmio_data + vcpu->mmio_index,
5597 run->mmio.data, 8);
5598 vcpu->mmio_index += 8;
5599 if (vcpu->mmio_index < vcpu->mmio_size) {
5600 run->exit_reason = KVM_EXIT_MMIO;
5601 run->mmio.phys_addr = vcpu->mmio_phys_addr + vcpu->mmio_index;
5602 memcpy(run->mmio.data, vcpu->mmio_data + vcpu->mmio_index, 8);
5603 run->mmio.len = min(vcpu->mmio_size - vcpu->mmio_index, 8);
5604 run->mmio.is_write = vcpu->mmio_is_write;
5605 vcpu->mmio_needed = 1;
5606 return 0;
5607 }
5608 if (vcpu->mmio_is_write)
5609 return 1;
5610 vcpu->mmio_read_completed = 1;
5611 }
5612 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
5613 r = emulate_instruction(vcpu, EMULTYPE_NO_DECODE);
5614 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
5615 if (r != EMULATE_DONE)
5616 return 0;
5617 return 1;
5618}
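/*
 * Illustrative sketch (standalone, not from the patch): how a large MMIO
 * access is drained 8 bytes at a time, one KVM_EXIT_MMIO round trip to
 * userspace per chunk, before instruction emulation resumes.  A 16-byte
 * read therefore costs two exits.  Names below are hypothetical.
 */
#include <string.h>

#define MMIO_CHUNK 8u

static unsigned int model_mmio_read(unsigned char *dst,
				    const unsigned char *dev_data,
				    unsigned int size)
{
	unsigned int off, exits = 0;

	for (off = 0; off < size; off += MMIO_CHUNK) {
		unsigned int len = size - off < MMIO_CHUNK ?
				   size - off : MMIO_CHUNK;
		/* each chunk models one exit/re-entry pair */
		memcpy(dst + off, dev_data + off, len);
		exits++;
	}
	return exits;
}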
5619
4882int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 5620int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4883{ 5621{
4884 int r; 5622 int r;
4885 sigset_t sigsaved; 5623 sigset_t sigsaved;
4886 5624
5625 if (!tsk_used_math(current) && init_fpu(current))
5626 return -ENOMEM;
5627
4887 if (vcpu->sigset_active) 5628 if (vcpu->sigset_active)
4888 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved); 5629 sigprocmask(SIG_SETMASK, &vcpu->sigset, &sigsaved);
4889 5630
@@ -4895,24 +5636,17 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
4895 } 5636 }
4896 5637
4897 /* re-sync apic's tpr */ 5638 /* re-sync apic's tpr */
4898 if (!irqchip_in_kernel(vcpu->kvm)) 5639 if (!irqchip_in_kernel(vcpu->kvm)) {
4899 kvm_set_cr8(vcpu, kvm_run->cr8); 5640 if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) {
4900 5641 r = -EINVAL;
4901 if (vcpu->arch.pio.count || vcpu->mmio_needed ||
4902 vcpu->arch.emulate_ctxt.restart) {
4903 if (vcpu->mmio_needed) {
4904 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
4905 vcpu->mmio_read_completed = 1;
4906 vcpu->mmio_needed = 0;
4907 }
4908 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
4909 r = emulate_instruction(vcpu, 0, 0, EMULTYPE_NO_DECODE);
4910 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
4911 if (r != EMULATE_DONE) {
4912 r = 0;
4913 goto out; 5642 goto out;
4914 } 5643 }
4915 } 5644 }
5645
5646 r = complete_mmio(vcpu);
5647 if (r <= 0)
5648 goto out;
5649
4916 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 5650 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
4917 kvm_register_write(vcpu, VCPU_REGS_RAX, 5651 kvm_register_write(vcpu, VCPU_REGS_RAX,
4918 kvm_run->hypercall.ret); 5652 kvm_run->hypercall.ret);
@@ -4929,6 +5663,18 @@ out:
4929 5663
4930int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5664int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4931{ 5665{
5666 if (vcpu->arch.emulate_regs_need_sync_to_vcpu) {
5667 /*
5668 * We are here if userspace calls get_regs() in the middle of
 5669 * instruction emulation. Register state needs to be copied
 5670 * back from the emulation context to the vcpu. Userspace shouldn't
 5671 * do that usually, but some badly designed PV devices (vmware
 5672 * backdoor interface) need this to work.
5673 */
5674 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5675 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5676 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5677 }
4932 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX); 5678 regs->rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4933 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX); 5679 regs->rbx = kvm_register_read(vcpu, VCPU_REGS_RBX);
4934 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX); 5680 regs->rcx = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -4956,6 +5702,9 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4956 5702
4957int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) 5703int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4958{ 5704{
5705 vcpu->arch.emulate_regs_need_sync_from_vcpu = true;
5706 vcpu->arch.emulate_regs_need_sync_to_vcpu = false;
5707
4959 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax); 5708 kvm_register_write(vcpu, VCPU_REGS_RAX, regs->rax);
4960 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx); 5709 kvm_register_write(vcpu, VCPU_REGS_RBX, regs->rbx);
4961 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx); 5710 kvm_register_write(vcpu, VCPU_REGS_RCX, regs->rcx);
@@ -4980,6 +5729,8 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
4980 5729
4981 vcpu->arch.exception.pending = false; 5730 vcpu->arch.exception.pending = false;
4982 5731
5732 kvm_make_request(KVM_REQ_EVENT, vcpu);
5733
4983 return 0; 5734 return 0;
4984} 5735}
4985 5736
@@ -5017,7 +5768,7 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
5017 5768
5018 sregs->cr0 = kvm_read_cr0(vcpu); 5769 sregs->cr0 = kvm_read_cr0(vcpu);
5019 sregs->cr2 = vcpu->arch.cr2; 5770 sregs->cr2 = vcpu->arch.cr2;
5020 sregs->cr3 = vcpu->arch.cr3; 5771 sregs->cr3 = kvm_read_cr3(vcpu);
5021 sregs->cr4 = kvm_read_cr4(vcpu); 5772 sregs->cr4 = kvm_read_cr4(vcpu);
5022 sregs->cr8 = kvm_get_cr8(vcpu); 5773 sregs->cr8 = kvm_get_cr8(vcpu);
5023 sregs->efer = vcpu->arch.efer; 5774 sregs->efer = vcpu->arch.efer;
@@ -5043,6 +5794,7 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
5043 struct kvm_mp_state *mp_state) 5794 struct kvm_mp_state *mp_state)
5044{ 5795{
5045 vcpu->arch.mp_state = mp_state->mp_state; 5796 vcpu->arch.mp_state = mp_state->mp_state;
5797 kvm_make_request(KVM_REQ_EVENT, vcpu);
5046 return 0; 5798 return 0;
5047} 5799}
5048 5800
@@ -5050,24 +5802,11 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5050 bool has_error_code, u32 error_code) 5802 bool has_error_code, u32 error_code)
5051{ 5803{
5052 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode; 5804 struct decode_cache *c = &vcpu->arch.emulate_ctxt.decode;
5053 int cs_db, cs_l, ret; 5805 int ret;
5054 cache_all_regs(vcpu);
5055
5056 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
5057 5806
5058 vcpu->arch.emulate_ctxt.vcpu = vcpu; 5807 init_emulate_ctxt(vcpu);
5059 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu);
5060 vcpu->arch.emulate_ctxt.eip = kvm_rip_read(vcpu);
5061 vcpu->arch.emulate_ctxt.mode =
5062 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
5063 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
5064 ? X86EMUL_MODE_VM86 : cs_l
5065 ? X86EMUL_MODE_PROT64 : cs_db
5066 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
5067 memset(c, 0, sizeof(struct decode_cache));
5068 memcpy(c->regs, vcpu->arch.regs, sizeof c->regs);
5069 5808
5070 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt, &emulate_ops, 5809 ret = emulator_task_switch(&vcpu->arch.emulate_ctxt,
5071 tss_selector, reason, has_error_code, 5810 tss_selector, reason, has_error_code,
5072 error_code); 5811 error_code);
5073 5812
@@ -5076,7 +5815,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason,
5076 5815
5077 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs); 5816 memcpy(vcpu->arch.regs, c->regs, sizeof c->regs);
5078 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip); 5817 kvm_rip_write(vcpu, vcpu->arch.emulate_ctxt.eip);
5079 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 5818 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
5819 kvm_make_request(KVM_REQ_EVENT, vcpu);
5080 return EMULATE_DONE; 5820 return EMULATE_DONE;
5081} 5821}
5082EXPORT_SYMBOL_GPL(kvm_task_switch); 5822EXPORT_SYMBOL_GPL(kvm_task_switch);
@@ -5085,7 +5825,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5085 struct kvm_sregs *sregs) 5825 struct kvm_sregs *sregs)
5086{ 5826{
5087 int mmu_reset_needed = 0; 5827 int mmu_reset_needed = 0;
5088 int pending_vec, max_bits; 5828 int pending_vec, max_bits, idx;
5089 struct desc_ptr dt; 5829 struct desc_ptr dt;
5090 5830
5091 dt.size = sregs->idt.limit; 5831 dt.size = sregs->idt.limit;
@@ -5096,8 +5836,9 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5096 kvm_x86_ops->set_gdt(vcpu, &dt); 5836 kvm_x86_ops->set_gdt(vcpu, &dt);
5097 5837
5098 vcpu->arch.cr2 = sregs->cr2; 5838 vcpu->arch.cr2 = sregs->cr2;
5099 mmu_reset_needed |= vcpu->arch.cr3 != sregs->cr3; 5839 mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3;
5100 vcpu->arch.cr3 = sregs->cr3; 5840 vcpu->arch.cr3 = sregs->cr3;
5841 __set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
5101 5842
5102 kvm_set_cr8(vcpu, sregs->cr8); 5843 kvm_set_cr8(vcpu, sregs->cr8);
5103 5844
@@ -5111,10 +5852,15 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5111 5852
5112 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; 5853 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
5113 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5854 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
5855 if (sregs->cr4 & X86_CR4_OSXSAVE)
5856 update_cpuid(vcpu);
5857
5858 idx = srcu_read_lock(&vcpu->kvm->srcu);
5114 if (!is_long_mode(vcpu) && is_pae(vcpu)) { 5859 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
5115 load_pdptrs(vcpu, vcpu->arch.cr3); 5860 load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu));
5116 mmu_reset_needed = 1; 5861 mmu_reset_needed = 1;
5117 } 5862 }
5863 srcu_read_unlock(&vcpu->kvm->srcu, idx);
5118 5864
5119 if (mmu_reset_needed) 5865 if (mmu_reset_needed)
5120 kvm_mmu_reset_context(vcpu); 5866 kvm_mmu_reset_context(vcpu);
@@ -5125,8 +5871,6 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5125 if (pending_vec < max_bits) { 5871 if (pending_vec < max_bits) {
5126 kvm_queue_interrupt(vcpu, pending_vec, false); 5872 kvm_queue_interrupt(vcpu, pending_vec, false);
5127 pr_debug("Set back pending irq %d\n", pending_vec); 5873 pr_debug("Set back pending irq %d\n", pending_vec);
5128 if (irqchip_in_kernel(vcpu->kvm))
5129 kvm_pic_clear_isr_ack(vcpu->kvm);
5130 } 5874 }
5131 5875
5132 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS); 5876 kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
@@ -5147,6 +5891,8 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
5147 !is_protmode(vcpu)) 5891 !is_protmode(vcpu))
5148 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5892 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5149 5893
5894 kvm_make_request(KVM_REQ_EVENT, vcpu);
5895
5150 return 0; 5896 return 0;
5151} 5897}
5152 5898
@@ -5320,10 +6066,7 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
5320 6066
5321void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 6067void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5322{ 6068{
5323 if (vcpu->arch.time_page) { 6069 kvmclock_reset(vcpu);
5324 kvm_release_page_dirty(vcpu->arch.time_page);
5325 vcpu->arch.time_page = NULL;
5326 }
5327 6070
5328 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask); 6071 free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
5329 fx_free(vcpu); 6072 fx_free(vcpu);
@@ -5333,6 +6076,10 @@ void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
5333struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm, 6076struct kvm_vcpu *kvm_arch_vcpu_create(struct kvm *kvm,
5334 unsigned int id) 6077 unsigned int id)
5335{ 6078{
6079 if (check_tsc_unstable() && atomic_read(&kvm->online_vcpus) != 0)
6080 printk_once(KERN_WARNING
6081 "kvm: SMP vm created on host with unstable TSC; "
6082 "guest TSC will not be reliable\n");
5336 return kvm_x86_ops->vcpu_create(kvm, id); 6083 return kvm_x86_ops->vcpu_create(kvm, id);
5337} 6084}
5338 6085
@@ -5357,6 +6104,8 @@ free_vcpu:
5357 6104
5358void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) 6105void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
5359{ 6106{
6107 vcpu->arch.apf.msr_val = 0;
6108
5360 vcpu_load(vcpu); 6109 vcpu_load(vcpu);
5361 kvm_mmu_unload(vcpu); 6110 kvm_mmu_unload(vcpu);
5362 vcpu_put(vcpu); 6111 vcpu_put(vcpu);
@@ -5375,22 +6124,29 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
5375 vcpu->arch.dr6 = DR6_FIXED_1; 6124 vcpu->arch.dr6 = DR6_FIXED_1;
5376 vcpu->arch.dr7 = DR7_FIXED_1; 6125 vcpu->arch.dr7 = DR7_FIXED_1;
5377 6126
6127 kvm_make_request(KVM_REQ_EVENT, vcpu);
6128 vcpu->arch.apf.msr_val = 0;
6129
6130 kvmclock_reset(vcpu);
6131
6132 kvm_clear_async_pf_completion_queue(vcpu);
6133 kvm_async_pf_hash_reset(vcpu);
6134 vcpu->arch.apf.halted = false;
6135
5378 return kvm_x86_ops->vcpu_reset(vcpu); 6136 return kvm_x86_ops->vcpu_reset(vcpu);
5379} 6137}
5380 6138
5381int kvm_arch_hardware_enable(void *garbage) 6139int kvm_arch_hardware_enable(void *garbage)
5382{ 6140{
5383 /* 6141 struct kvm *kvm;
5384 * Since this may be called from a hotplug notifcation, 6142 struct kvm_vcpu *vcpu;
5385 * we can't get the CPU frequency directly. 6143 int i;
5386 */
5387 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5388 int cpu = raw_smp_processor_id();
5389 per_cpu(cpu_tsc_khz, cpu) = 0;
5390 }
5391 6144
5392 kvm_shared_msr_cpu_online(); 6145 kvm_shared_msr_cpu_online();
5393 6146 list_for_each_entry(kvm, &vm_list, vm_list)
6147 kvm_for_each_vcpu(i, vcpu, kvm)
6148 if (vcpu->cpu == smp_processor_id())
6149 kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
5394 return kvm_x86_ops->hardware_enable(garbage); 6150 return kvm_x86_ops->hardware_enable(garbage);
5395} 6151}
5396 6152
@@ -5424,7 +6180,11 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5424 BUG_ON(vcpu->kvm == NULL); 6180 BUG_ON(vcpu->kvm == NULL);
5425 kvm = vcpu->kvm; 6181 kvm = vcpu->kvm;
5426 6182
6183 vcpu->arch.emulate_ctxt.ops = &emulate_ops;
6184 vcpu->arch.walk_mmu = &vcpu->arch.mmu;
5427 vcpu->arch.mmu.root_hpa = INVALID_PAGE; 6185 vcpu->arch.mmu.root_hpa = INVALID_PAGE;
6186 vcpu->arch.mmu.translate_gpa = translate_gpa;
6187 vcpu->arch.nested_mmu.translate_gpa = translate_nested_gpa;
5428 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu)) 6188 if (!irqchip_in_kernel(kvm) || kvm_vcpu_is_bsp(vcpu))
5429 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 6189 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
5430 else 6190 else
@@ -5437,6 +6197,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5437 } 6197 }
5438 vcpu->arch.pio_data = page_address(page); 6198 vcpu->arch.pio_data = page_address(page);
5439 6199
6200 kvm_init_tsc_catchup(vcpu, max_tsc_khz);
6201
5440 r = kvm_mmu_create(vcpu); 6202 r = kvm_mmu_create(vcpu);
5441 if (r < 0) 6203 if (r < 0)
5442 goto fail_free_pio_data; 6204 goto fail_free_pio_data;
@@ -5458,6 +6220,8 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
5458 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL)) 6220 if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, GFP_KERNEL))
5459 goto fail_free_mce_banks; 6221 goto fail_free_mce_banks;
5460 6222
6223 kvm_async_pf_hash_reset(vcpu);
6224
5461 return 0; 6225 return 0;
5462fail_free_mce_banks: 6226fail_free_mce_banks:
5463 kfree(vcpu->arch.mce_banks); 6227 kfree(vcpu->arch.mce_banks);
@@ -5483,22 +6247,17 @@ void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
5483 free_page((unsigned long)vcpu->arch.pio_data); 6247 free_page((unsigned long)vcpu->arch.pio_data);
5484} 6248}
5485 6249
5486struct kvm *kvm_arch_create_vm(void) 6250int kvm_arch_init_vm(struct kvm *kvm)
5487{ 6251{
5488 struct kvm *kvm = kzalloc(sizeof(struct kvm), GFP_KERNEL);
5489
5490 if (!kvm)
5491 return ERR_PTR(-ENOMEM);
5492
5493 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 6252 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
5494 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 6253 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
5495 6254
5496 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ 6255 /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */
5497 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap); 6256 set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
5498 6257
5499 rdtscll(kvm->arch.vm_init_tsc); 6258 raw_spin_lock_init(&kvm->arch.tsc_write_lock);
5500 6259
5501 return kvm; 6260 return 0;
5502} 6261}
5503 6262
5504static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) 6263static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu)
@@ -5516,8 +6275,10 @@ static void kvm_free_vcpus(struct kvm *kvm)
5516 /* 6275 /*
5517 * Unpin any mmu pages first. 6276 * Unpin any mmu pages first.
5518 */ 6277 */
5519 kvm_for_each_vcpu(i, vcpu, kvm) 6278 kvm_for_each_vcpu(i, vcpu, kvm) {
6279 kvm_clear_async_pf_completion_queue(vcpu);
5520 kvm_unload_vcpu_mmu(vcpu); 6280 kvm_unload_vcpu_mmu(vcpu);
6281 }
5521 kvm_for_each_vcpu(i, vcpu, kvm) 6282 kvm_for_each_vcpu(i, vcpu, kvm)
5522 kvm_arch_vcpu_free(vcpu); 6283 kvm_arch_vcpu_free(vcpu);
5523 6284
@@ -5541,13 +6302,10 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
5541 kfree(kvm->arch.vpic); 6302 kfree(kvm->arch.vpic);
5542 kfree(kvm->arch.vioapic); 6303 kfree(kvm->arch.vioapic);
5543 kvm_free_vcpus(kvm); 6304 kvm_free_vcpus(kvm);
5544 kvm_free_physmem(kvm);
5545 if (kvm->arch.apic_access_page) 6305 if (kvm->arch.apic_access_page)
5546 put_page(kvm->arch.apic_access_page); 6306 put_page(kvm->arch.apic_access_page);
5547 if (kvm->arch.ept_identity_pagetable) 6307 if (kvm->arch.ept_identity_pagetable)
5548 put_page(kvm->arch.ept_identity_pagetable); 6308 put_page(kvm->arch.ept_identity_pagetable);
5549 cleanup_srcu_struct(&kvm->srcu);
5550 kfree(kvm);
5551} 6309}
5552 6310
5553int kvm_arch_prepare_memory_region(struct kvm *kvm, 6311int kvm_arch_prepare_memory_region(struct kvm *kvm,
@@ -5595,7 +6353,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
5595 int user_alloc) 6353 int user_alloc)
5596{ 6354{
5597 6355
5598 int npages = mem->memory_size >> PAGE_SHIFT; 6356 int nr_mmu_pages = 0, npages = mem->memory_size >> PAGE_SHIFT;
5599 6357
5600 if (!user_alloc && !old.user_alloc && old.rmap && !npages) { 6358 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5601 int ret; 6359 int ret;
@@ -5610,12 +6368,12 @@ void kvm_arch_commit_memory_region(struct kvm *kvm,
5610 "failed to munmap memory\n"); 6368 "failed to munmap memory\n");
5611 } 6369 }
5612 6370
6371 if (!kvm->arch.n_requested_mmu_pages)
6372 nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
6373
5613 spin_lock(&kvm->mmu_lock); 6374 spin_lock(&kvm->mmu_lock);
5614 if (!kvm->arch.n_requested_mmu_pages) { 6375 if (nr_mmu_pages)
5615 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
5616 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages); 6376 kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
5617 }
5618
5619 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 6377 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
5620 spin_unlock(&kvm->mmu_lock); 6378 spin_unlock(&kvm->mmu_lock);
5621} 6379}
@@ -5628,7 +6386,9 @@ void kvm_arch_flush_shadow(struct kvm *kvm)
5628 6386
5629int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu) 6387int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
5630{ 6388{
5631 return vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE 6389 return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE &&
6390 !vcpu->arch.apf.halted)
6391 || !list_empty_careful(&vcpu->async_pf.done)
5632 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED 6392 || vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED
5633 || vcpu->arch.nmi_pending || 6393 || vcpu->arch.nmi_pending ||
5634 (kvm_arch_interrupt_allowed(vcpu) && 6394 (kvm_arch_interrupt_allowed(vcpu) &&
@@ -5647,7 +6407,7 @@ void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
5647 6407
5648 me = get_cpu(); 6408 me = get_cpu();
5649 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) 6409 if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
5650 if (atomic_xchg(&vcpu->guest_mode, 0)) 6410 if (kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE)
5651 smp_send_reschedule(cpu); 6411 smp_send_reschedule(cpu);
5652 put_cpu(); 6412 put_cpu();
5653} 6413}
@@ -5683,9 +6443,151 @@ void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5683 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip)) 6443 kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
5684 rflags |= X86_EFLAGS_TF; 6444 rflags |= X86_EFLAGS_TF;
5685 kvm_x86_ops->set_rflags(vcpu, rflags); 6445 kvm_x86_ops->set_rflags(vcpu, rflags);
6446 kvm_make_request(KVM_REQ_EVENT, vcpu);
5686} 6447}
5687EXPORT_SYMBOL_GPL(kvm_set_rflags); 6448EXPORT_SYMBOL_GPL(kvm_set_rflags);
5688 6449
6450void kvm_arch_async_page_ready(struct kvm_vcpu *vcpu, struct kvm_async_pf *work)
6451{
6452 int r;
6453
6454 if ((vcpu->arch.mmu.direct_map != work->arch.direct_map) ||
6455 is_error_page(work->page))
6456 return;
6457
6458 r = kvm_mmu_reload(vcpu);
6459 if (unlikely(r))
6460 return;
6461
6462 if (!vcpu->arch.mmu.direct_map &&
6463 work->arch.cr3 != vcpu->arch.mmu.get_cr3(vcpu))
6464 return;
6465
6466 vcpu->arch.mmu.page_fault(vcpu, work->gva, 0, true);
6467}
6468
6469static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
6470{
6471 return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
6472}
6473
6474static inline u32 kvm_async_pf_next_probe(u32 key)
6475{
6476 return (key + 1) & (roundup_pow_of_two(ASYNC_PF_PER_VCPU) - 1);
6477}
6478
6479static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6480{
6481 u32 key = kvm_async_pf_hash_fn(gfn);
6482
6483 while (vcpu->arch.apf.gfns[key] != ~0)
6484 key = kvm_async_pf_next_probe(key);
6485
6486 vcpu->arch.apf.gfns[key] = gfn;
6487}
6488
6489static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
6490{
6491 int i;
6492 u32 key = kvm_async_pf_hash_fn(gfn);
6493
6494 for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU) &&
6495 (vcpu->arch.apf.gfns[key] != gfn &&
6496 vcpu->arch.apf.gfns[key] != ~0); i++)
6497 key = kvm_async_pf_next_probe(key);
6498
6499 return key;
6500}
6501
6502bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6503{
6504 return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
6505}
6506
6507static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
6508{
6509 u32 i, j, k;
6510
6511 i = j = kvm_async_pf_gfn_slot(vcpu, gfn);
6512 while (true) {
6513 vcpu->arch.apf.gfns[i] = ~0;
6514 do {
6515 j = kvm_async_pf_next_probe(j);
6516 if (vcpu->arch.apf.gfns[j] == ~0)
6517 return;
6518 k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
6519 /*
6520 * k lies cyclically in ]i,j]
6521 * | i.k.j |
6522 * |....j i.k.| or |.k..j i...|
6523 */
6524 } while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
6525 vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
6526 i = j;
6527 }
6528}
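/*
 * Illustrative sketch (standalone, not from the patch): the gfn table above
 * is an open-addressed hash with linear probing, so deletion must back-shift
 * any later entry whose home slot k does NOT lie cyclically in (i, j];
 * otherwise a lookup for that entry would stop at the freed slot and miss
 * it.  Table size and hash function below are hypothetical stand-ins.
 */
#include <stdint.h>

#define TABLE_SIZE 64u			/* power of two, like the rounded ASYNC_PF_PER_VCPU */
#define EMPTY_SLOT (~(uint64_t)0)

static uint64_t gfn_table[TABLE_SIZE];

static uint32_t slot_hash(uint64_t gfn)
{
	return (uint32_t)(gfn * 2654435761u) & (TABLE_SIZE - 1);
}

static uint32_t next_probe(uint32_t key)
{
	return (key + 1) & (TABLE_SIZE - 1);
}

static void table_del(uint32_t i)
{
	uint32_t j = i, k;

	for (;;) {
		gfn_table[i] = EMPTY_SLOT;
		do {
			j = next_probe(j);
			if (gfn_table[j] == EMPTY_SLOT)
				return;
			k = slot_hash(gfn_table[j]);
			/* entries whose home slot lies in (i, j] may stay put */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		gfn_table[i] = gfn_table[j];	/* back-shift into the hole */
		i = j;
	}
}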
6529
6530static int apf_put_user(struct kvm_vcpu *vcpu, u32 val)
6531{
6532
6533 return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &val,
6534 sizeof(val));
6535}
6536
6537void kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
6538 struct kvm_async_pf *work)
6539{
6540 struct x86_exception fault;
6541
6542 trace_kvm_async_pf_not_present(work->arch.token, work->gva);
6543 kvm_add_async_pf_gfn(vcpu, work->arch.gfn);
6544
6545 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) ||
6546 (vcpu->arch.apf.send_user_only &&
6547 kvm_x86_ops->get_cpl(vcpu) == 0))
6548 kvm_make_request(KVM_REQ_APF_HALT, vcpu);
6549 else if (!apf_put_user(vcpu, KVM_PV_REASON_PAGE_NOT_PRESENT)) {
6550 fault.vector = PF_VECTOR;
6551 fault.error_code_valid = true;
6552 fault.error_code = 0;
6553 fault.nested_page_fault = false;
6554 fault.address = work->arch.token;
6555 kvm_inject_page_fault(vcpu, &fault);
6556 }
6557}
6558
6559void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
6560 struct kvm_async_pf *work)
6561{
6562 struct x86_exception fault;
6563
6564 trace_kvm_async_pf_ready(work->arch.token, work->gva);
6565 if (is_error_page(work->page))
6566 work->arch.token = ~0; /* broadcast wakeup */
6567 else
6568 kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
6569
6570 if ((vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED) &&
6571 !apf_put_user(vcpu, KVM_PV_REASON_PAGE_READY)) {
6572 fault.vector = PF_VECTOR;
6573 fault.error_code_valid = true;
6574 fault.error_code = 0;
6575 fault.nested_page_fault = false;
6576 fault.address = work->arch.token;
6577 kvm_inject_page_fault(vcpu, &fault);
6578 }
6579 vcpu->arch.apf.halted = false;
6580}
6581
6582bool kvm_arch_can_inject_async_page_present(struct kvm_vcpu *vcpu)
6583{
6584 if (!(vcpu->arch.apf.msr_val & KVM_ASYNC_PF_ENABLED))
6585 return true;
6586 else
6587 return !kvm_event_needs_reinjection(vcpu) &&
6588 kvm_x86_ops->interrupt_allowed(vcpu);
6589}
6590
5689EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 6591EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
5690EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 6592EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
5691EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 6593EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index b7a404722d2b..e407ed3df817 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -50,6 +50,11 @@ static inline int is_long_mode(struct kvm_vcpu *vcpu)
50#endif 50#endif
51} 51}
52 52
53static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
54{
55 return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
56}
57
53static inline int is_pae(struct kvm_vcpu *vcpu) 58static inline int is_pae(struct kvm_vcpu *vcpu)
54{ 59{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE); 60 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
@@ -65,7 +70,15 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG); 70 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66} 71}
67 72
73static inline u32 bit(int bitno)
74{
75 return 1 << (bitno & 31);
76}
77
68void kvm_before_handle_nmi(struct kvm_vcpu *vcpu); 78void kvm_before_handle_nmi(struct kvm_vcpu *vcpu);
69void kvm_after_handle_nmi(struct kvm_vcpu *vcpu); 79void kvm_after_handle_nmi(struct kvm_vcpu *vcpu);
80int kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip);
81
82void kvm_write_tsc(struct kvm_vcpu *vcpu, u64 data);
70 83
71#endif 84#endif
diff --git a/arch/x86/lguest/Kconfig b/arch/x86/lguest/Kconfig
index 38718041efc3..6e121a2a49e1 100644
--- a/arch/x86/lguest/Kconfig
+++ b/arch/x86/lguest/Kconfig
@@ -2,6 +2,7 @@ config LGUEST_GUEST
2 bool "Lguest guest support" 2 bool "Lguest guest support"
3 select PARAVIRT 3 select PARAVIRT
4 depends on X86_32 4 depends on X86_32
5 select VIRTUALIZATION
5 select VIRTIO 6 select VIRTIO
6 select VIRTIO_RING 7 select VIRTIO_RING
7 select VIRTIO_CONSOLE 8 select VIRTIO_CONSOLE
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 9d5f55848455..db832fd65ecb 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -7,7 +7,7 @@
7 * kernel and insert a module (lg.ko) which allows us to run other Linux 7 * kernel and insert a module (lg.ko) which allows us to run other Linux
8 * kernels the same way we'd run processes. We call the first kernel the Host, 8 * kernels the same way we'd run processes. We call the first kernel the Host,
9 * and the others the Guests. The program which sets up and configures Guests 9 * and the others the Guests. The program which sets up and configures Guests
10 * (such as the example in Documentation/lguest/lguest.c) is called the 10 * (such as the example in Documentation/virtual/lguest/lguest.c) is called the
11 * Launcher. 11 * Launcher.
12 * 12 *
13 * Secondly, we only run specially modified Guests, not normal kernels: setting 13 * Secondly, we only run specially modified Guests, not normal kernels: setting
@@ -397,7 +397,7 @@ static void lguest_load_tr_desc(void)
397 * instead we just use the real "cpuid" instruction. Then I pretty much turned 397 * instead we just use the real "cpuid" instruction. Then I pretty much turned
398 * off feature bits until the Guest booted. (Don't say that: you'll damage 398 * off feature bits until the Guest booted. (Don't say that: you'll damage
399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is 399 * lguest sales!) Shut up, inner voice! (Hey, just pointing out that this is
400 * hardly future proof.) Noone's listening! They don't like you anyway, 400 * hardly future proof.) No one's listening! They don't like you anyway,
401 * parenthetic weirdo! 401 * parenthetic weirdo!
402 * 402 *
403 * Replacing the cpuid so we can turn features off is great for the kernel, but 403 * Replacing the cpuid so we can turn features off is great for the kernel, but
@@ -531,7 +531,10 @@ static void lguest_write_cr3(unsigned long cr3)
531{ 531{
532 lguest_data.pgdir = cr3; 532 lguest_data.pgdir = cr3;
533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3); 533 lazy_hcall1(LHCALL_NEW_PGTABLE, cr3);
534 cr3_changed = true; 534
535 /* These two page tables are simple, linear, and used during boot */
536 if (cr3 != __pa(swapper_pg_dir) && cr3 != __pa(initial_page_table))
537 cr3_changed = true;
535} 538}
536 539
537static unsigned long lguest_read_cr3(void) 540static unsigned long lguest_read_cr3(void)
@@ -703,9 +706,9 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval)
703 * to forget all of them. Fortunately, this is very rare. 706 * to forget all of them. Fortunately, this is very rare.
704 * 707 *
705 * ... except in early boot when the kernel sets up the initial pagetables, 708 * ... except in early boot when the kernel sets up the initial pagetables,
706 * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell 709 * which makes booting astonishingly slow: 48 seconds! So we don't even tell
707 * the Host anything changed until we've done the first page table switch, 710 * the Host anything changed until we've done the first real page table switch,
708 * which brings boot back to 0.25 seconds. 711 * which brings boot back to 4.3 seconds.
709 */ 712 */
710static void lguest_set_pte(pte_t *ptep, pte_t pteval) 713static void lguest_set_pte(pte_t *ptep, pte_t pteval)
711{ 714{
@@ -791,22 +794,22 @@ static void lguest_flush_tlb_kernel(void)
791 * simple as setting a bit. We don't actually "ack" interrupts as such, we 794 * simple as setting a bit. We don't actually "ack" interrupts as such, we
792 * just mask and unmask them. I wonder if we should be cleverer? 795 * just mask and unmask them. I wonder if we should be cleverer?
793 */ 796 */
794static void disable_lguest_irq(unsigned int irq) 797static void disable_lguest_irq(struct irq_data *data)
795{ 798{
796 set_bit(irq, lguest_data.blocked_interrupts); 799 set_bit(data->irq, lguest_data.blocked_interrupts);
797} 800}
798 801
799static void enable_lguest_irq(unsigned int irq) 802static void enable_lguest_irq(struct irq_data *data)
800{ 803{
801 clear_bit(irq, lguest_data.blocked_interrupts); 804 clear_bit(data->irq, lguest_data.blocked_interrupts);
802} 805}
803 806
804/* This structure describes the lguest IRQ controller. */ 807/* This structure describes the lguest IRQ controller. */
805static struct irq_chip lguest_irq_controller = { 808static struct irq_chip lguest_irq_controller = {
806 .name = "lguest", 809 .name = "lguest",
807 .mask = disable_lguest_irq, 810 .irq_mask = disable_lguest_irq,
808 .mask_ack = disable_lguest_irq, 811 .irq_mask_ack = disable_lguest_irq,
809 .unmask = enable_lguest_irq, 812 .irq_unmask = enable_lguest_irq,
810}; 813};
811 814
812/* 815/*
@@ -821,7 +824,7 @@ static void __init lguest_init_IRQ(void)
821 824
822 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { 825 for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) {
823 /* Some systems map "vectors" to interrupts weirdly. Not us! */ 826 /* Some systems map "vectors" to interrupts weirdly. Not us! */
824 __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; 827 __this_cpu_write(vector_irq[i], i - FIRST_EXTERNAL_VECTOR);
825 if (i != SYSCALL_VECTOR) 828 if (i != SYSCALL_VECTOR)
826 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); 829 set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
827 } 830 }
@@ -838,13 +841,13 @@ static void __init lguest_init_IRQ(void)
838 * rather than set them in lguest_init_IRQ we are called here every time an 841 * rather than set them in lguest_init_IRQ we are called here every time an
839 * lguest device needs an interrupt. 842 * lguest device needs an interrupt.
840 * 843 *
841 * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should 844 * FIXME: irq_alloc_desc_at() can fail due to lack of memory, we should
842 * pass that up! 845 * pass that up!
843 */ 846 */
844void lguest_setup_irq(unsigned int irq) 847void lguest_setup_irq(unsigned int irq)
845{ 848{
846 irq_to_desc_alloc_node(irq, 0); 849 irq_alloc_desc_at(irq, 0);
847 set_irq_chip_and_handler_name(irq, &lguest_irq_controller, 850 irq_set_chip_and_handler_name(irq, &lguest_irq_controller,
848 handle_level_irq, "level"); 851 handle_level_irq, "level");
849} 852}
850 853
@@ -910,8 +913,6 @@ static struct clocksource lguest_clock = {
910 .rating = 200, 913 .rating = 200,
911 .read = lguest_clock_read, 914 .read = lguest_clock_read,
912 .mask = CLOCKSOURCE_MASK(64), 915 .mask = CLOCKSOURCE_MASK(64),
913 .mult = 1 << 22,
914 .shift = 22,
915 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 916 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
916}; 917};
917 918
@@ -992,9 +993,10 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc)
992static void lguest_time_init(void) 993static void lguest_time_init(void)
993{ 994{
994 /* Set up the timer interrupt (0) to go to our simple timer routine */ 995 /* Set up the timer interrupt (0) to go to our simple timer routine */
995 set_irq_handler(0, lguest_time_irq); 996 lguest_setup_irq(0);
997 irq_set_handler(0, lguest_time_irq);
996 998
997 clocksource_register(&lguest_clock); 999 clocksource_register_hz(&lguest_clock, NSEC_PER_SEC);
998 1000
999 /* We can't set cpumask in the initializer: damn C limitations! Set it 1001 /* We can't set cpumask in the initializer: damn C limitations! Set it
1000 * here and register our timer device. */ 1002 * here and register our timer device. */
@@ -1002,7 +1004,7 @@ static void lguest_time_init(void)
1002 clockevents_register_device(&lguest_clockevent); 1004 clockevents_register_device(&lguest_clockevent);
1003 1005
1004 /* Finally, we unblock the timer interrupt. */ 1006 /* Finally, we unblock the timer interrupt. */
1005 enable_lguest_irq(0); 1007 clear_bit(0, lguest_data.blocked_interrupts);
1006} 1008}
1007 1009
1008/* 1010/*
@@ -1349,9 +1351,6 @@ __init void lguest_init(void)
1349 */ 1351 */
1350 switch_to_new_gdt(0); 1352 switch_to_new_gdt(0);
1351 1353
1352 /* We actually boot with all memory mapped, but let's say 128MB. */
1353 max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT;
1354
1355 /* 1354 /*
1356 * The Host<->Guest Switcher lives at the top of our address space, and 1355 * The Host<->Guest Switcher lives at the top of our address space, and
1357 * the Host told us how big it is when we made LGUEST_INIT hypercall: 1356 * the Host told us how big it is when we made LGUEST_INIT hypercall:
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index e10cf070ede0..f2479f19ddde 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -42,4 +42,5 @@ else
42 lib-y += memmove_64.o memset_64.o 42 lib-y += memmove_64.o memset_64.o
43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 43 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o 44 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
45 lib-y += cmpxchg16b_emu.o
45endif 46endif
diff --git a/arch/x86/lib/atomic64_386_32.S b/arch/x86/lib/atomic64_386_32.S
index 2cda60a06e65..e8e7e0d06f42 100644
--- a/arch/x86/lib/atomic64_386_32.S
+++ b/arch/x86/lib/atomic64_386_32.S
@@ -15,14 +15,12 @@
15 15
16/* if you want SMP support, implement these with real spinlocks */ 16/* if you want SMP support, implement these with real spinlocks */
17.macro LOCK reg 17.macro LOCK reg
18 pushfl 18 pushfl_cfi
19 CFI_ADJUST_CFA_OFFSET 4
20 cli 19 cli
21.endm 20.endm
22 21
23.macro UNLOCK reg 22.macro UNLOCK reg
24 popfl 23 popfl_cfi
25 CFI_ADJUST_CFA_OFFSET -4
26.endm 24.endm
27 25
28#define BEGIN(op) \ 26#define BEGIN(op) \
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index 71e080de3352..391a083674b4 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -14,14 +14,12 @@
14#include <asm/dwarf2.h> 14#include <asm/dwarf2.h>
15 15
16.macro SAVE reg 16.macro SAVE reg
17 pushl %\reg 17 pushl_cfi %\reg
18 CFI_ADJUST_CFA_OFFSET 4
19 CFI_REL_OFFSET \reg, 0 18 CFI_REL_OFFSET \reg, 0
20.endm 19.endm
21 20
22.macro RESTORE reg 21.macro RESTORE reg
23 popl %\reg 22 popl_cfi %\reg
24 CFI_ADJUST_CFA_OFFSET -4
25 CFI_RESTORE \reg 23 CFI_RESTORE \reg
26.endm 24.endm
27 25
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index adbccd0bbb78..78d16a554db0 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -50,11 +50,9 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
50 */ 50 */
51ENTRY(csum_partial) 51ENTRY(csum_partial)
52 CFI_STARTPROC 52 CFI_STARTPROC
53 pushl %esi 53 pushl_cfi %esi
54 CFI_ADJUST_CFA_OFFSET 4
55 CFI_REL_OFFSET esi, 0 54 CFI_REL_OFFSET esi, 0
56 pushl %ebx 55 pushl_cfi %ebx
57 CFI_ADJUST_CFA_OFFSET 4
58 CFI_REL_OFFSET ebx, 0 56 CFI_REL_OFFSET ebx, 0
59 movl 20(%esp),%eax # Function arg: unsigned int sum 57 movl 20(%esp),%eax # Function arg: unsigned int sum
60 movl 16(%esp),%ecx # Function arg: int len 58 movl 16(%esp),%ecx # Function arg: int len
@@ -132,11 +130,9 @@ ENTRY(csum_partial)
132 jz 8f 130 jz 8f
133 roll $8, %eax 131 roll $8, %eax
1348: 1328:
135 popl %ebx 133 popl_cfi %ebx
136 CFI_ADJUST_CFA_OFFSET -4
137 CFI_RESTORE ebx 134 CFI_RESTORE ebx
138 popl %esi 135 popl_cfi %esi
139 CFI_ADJUST_CFA_OFFSET -4
140 CFI_RESTORE esi 136 CFI_RESTORE esi
141 ret 137 ret
142 CFI_ENDPROC 138 CFI_ENDPROC
@@ -148,11 +144,9 @@ ENDPROC(csum_partial)
148 144
149ENTRY(csum_partial) 145ENTRY(csum_partial)
150 CFI_STARTPROC 146 CFI_STARTPROC
151 pushl %esi 147 pushl_cfi %esi
152 CFI_ADJUST_CFA_OFFSET 4
153 CFI_REL_OFFSET esi, 0 148 CFI_REL_OFFSET esi, 0
154 pushl %ebx 149 pushl_cfi %ebx
155 CFI_ADJUST_CFA_OFFSET 4
156 CFI_REL_OFFSET ebx, 0 150 CFI_REL_OFFSET ebx, 0
157 movl 20(%esp),%eax # Function arg: unsigned int sum 151 movl 20(%esp),%eax # Function arg: unsigned int sum
158 movl 16(%esp),%ecx # Function arg: int len 152 movl 16(%esp),%ecx # Function arg: int len
@@ -260,11 +254,9 @@ ENTRY(csum_partial)
260 jz 90f 254 jz 90f
261 roll $8, %eax 255 roll $8, %eax
26290: 25690:
263 popl %ebx 257 popl_cfi %ebx
264 CFI_ADJUST_CFA_OFFSET -4
265 CFI_RESTORE ebx 258 CFI_RESTORE ebx
266 popl %esi 259 popl_cfi %esi
267 CFI_ADJUST_CFA_OFFSET -4
268 CFI_RESTORE esi 260 CFI_RESTORE esi
269 ret 261 ret
270 CFI_ENDPROC 262 CFI_ENDPROC
@@ -309,14 +301,11 @@ ENTRY(csum_partial_copy_generic)
309 CFI_STARTPROC 301 CFI_STARTPROC
310 subl $4,%esp 302 subl $4,%esp
311 CFI_ADJUST_CFA_OFFSET 4 303 CFI_ADJUST_CFA_OFFSET 4
312 pushl %edi 304 pushl_cfi %edi
313 CFI_ADJUST_CFA_OFFSET 4
314 CFI_REL_OFFSET edi, 0 305 CFI_REL_OFFSET edi, 0
315 pushl %esi 306 pushl_cfi %esi
316 CFI_ADJUST_CFA_OFFSET 4
317 CFI_REL_OFFSET esi, 0 307 CFI_REL_OFFSET esi, 0
318 pushl %ebx 308 pushl_cfi %ebx
319 CFI_ADJUST_CFA_OFFSET 4
320 CFI_REL_OFFSET ebx, 0 309 CFI_REL_OFFSET ebx, 0
321 movl ARGBASE+16(%esp),%eax # sum 310 movl ARGBASE+16(%esp),%eax # sum
322 movl ARGBASE+12(%esp),%ecx # len 311 movl ARGBASE+12(%esp),%ecx # len
@@ -426,17 +415,13 @@ DST( movb %cl, (%edi) )
426 415
427.previous 416.previous
428 417
429 popl %ebx 418 popl_cfi %ebx
430 CFI_ADJUST_CFA_OFFSET -4
431 CFI_RESTORE ebx 419 CFI_RESTORE ebx
432 popl %esi 420 popl_cfi %esi
433 CFI_ADJUST_CFA_OFFSET -4
434 CFI_RESTORE esi 421 CFI_RESTORE esi
435 popl %edi 422 popl_cfi %edi
436 CFI_ADJUST_CFA_OFFSET -4
437 CFI_RESTORE edi 423 CFI_RESTORE edi
438 popl %ecx # equivalent to addl $4,%esp 424 popl_cfi %ecx # equivalent to addl $4,%esp
439 CFI_ADJUST_CFA_OFFSET -4
440 ret 425 ret
441 CFI_ENDPROC 426 CFI_ENDPROC
442ENDPROC(csum_partial_copy_generic) 427ENDPROC(csum_partial_copy_generic)
@@ -459,14 +444,11 @@ ENDPROC(csum_partial_copy_generic)
459 444
460ENTRY(csum_partial_copy_generic) 445ENTRY(csum_partial_copy_generic)
461 CFI_STARTPROC 446 CFI_STARTPROC
462 pushl %ebx 447 pushl_cfi %ebx
463 CFI_ADJUST_CFA_OFFSET 4
464 CFI_REL_OFFSET ebx, 0 448 CFI_REL_OFFSET ebx, 0
465 pushl %edi 449 pushl_cfi %edi
466 CFI_ADJUST_CFA_OFFSET 4
467 CFI_REL_OFFSET edi, 0 450 CFI_REL_OFFSET edi, 0
468 pushl %esi 451 pushl_cfi %esi
469 CFI_ADJUST_CFA_OFFSET 4
470 CFI_REL_OFFSET esi, 0 452 CFI_REL_OFFSET esi, 0
471 movl ARGBASE+4(%esp),%esi #src 453 movl ARGBASE+4(%esp),%esi #src
472 movl ARGBASE+8(%esp),%edi #dst 454 movl ARGBASE+8(%esp),%edi #dst
@@ -527,14 +509,11 @@ DST( movb %dl, (%edi) )
527 jmp 7b 509 jmp 7b
528.previous 510.previous
529 511
530 popl %esi 512 popl_cfi %esi
531 CFI_ADJUST_CFA_OFFSET -4
532 CFI_RESTORE esi 513 CFI_RESTORE esi
533 popl %edi 514 popl_cfi %edi
534 CFI_ADJUST_CFA_OFFSET -4
535 CFI_RESTORE edi 515 CFI_RESTORE edi
536 popl %ebx 516 popl_cfi %ebx
537 CFI_ADJUST_CFA_OFFSET -4
538 CFI_RESTORE ebx 517 CFI_RESTORE ebx
539 ret 518 ret
540 CFI_ENDPROC 519 CFI_ENDPROC
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index aa4326bfb24a..f2145cfa12a6 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,5 +1,6 @@
1#include <linux/linkage.h> 1#include <linux/linkage.h>
2#include <asm/dwarf2.h> 2#include <asm/dwarf2.h>
3#include <asm/alternative-asm.h>
3 4
4/* 5/*
5 * Zero a page. 6 * Zero a page.
@@ -14,6 +15,15 @@ ENTRY(clear_page_c)
14 CFI_ENDPROC 15 CFI_ENDPROC
15ENDPROC(clear_page_c) 16ENDPROC(clear_page_c)
16 17
18ENTRY(clear_page_c_e)
19 CFI_STARTPROC
20 movl $4096,%ecx
21 xorl %eax,%eax
22 rep stosb
23 ret
24 CFI_ENDPROC
25ENDPROC(clear_page_c_e)
26
17ENTRY(clear_page) 27ENTRY(clear_page)
18 CFI_STARTPROC 28 CFI_STARTPROC
19 xorl %eax,%eax 29 xorl %eax,%eax
@@ -38,21 +48,26 @@ ENTRY(clear_page)
38.Lclear_page_end: 48.Lclear_page_end:
39ENDPROC(clear_page) 49ENDPROC(clear_page)
40 50
41 /* Some CPUs run faster using the string instructions. 51 /*
42 It is also a lot simpler. Use this when possible */ 52 * Some CPUs support enhanced REP MOVSB/STOSB instructions.
53 * It is recommended to use this when possible.
54 * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
55 * Otherwise, use original function.
56 *
57 */
43 58
44#include <asm/cpufeature.h> 59#include <asm/cpufeature.h>
45 60
46 .section .altinstr_replacement,"ax" 61 .section .altinstr_replacement,"ax"
471: .byte 0xeb /* jmp <disp8> */ 621: .byte 0xeb /* jmp <disp8> */
48 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ 63 .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
492: 642: .byte 0xeb /* jmp <disp8> */
65 .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
663:
50 .previous 67 .previous
51 .section .altinstructions,"a" 68 .section .altinstructions,"a"
52 .align 8 69 altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
53 .quad clear_page 70 .Lclear_page_end-clear_page, 2b-1b
54 .quad 1b 71 altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
55 .word X86_FEATURE_REP_GOOD 72 .Lclear_page_end-clear_page,3b-2b
56 .byte .Lclear_page_end - clear_page
57 .byte 2b - 1b
58 .previous 73 .previous
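The two altinstruction_entry records above replace the open-coded .quad/.word table; at boot the alternatives framework patches the short jump at the top of clear_page, and because the ERMS entry is emitted after the REP_GOOD one it wins when both features are set. A minimal user-space sketch of the resulting selection order (feature flags and helper names here are illustrative stand-ins, not kernel API):

#include <stdbool.h>
#include <string.h>

#define PAGE_SIZE 4096

/* Stand-ins for the CPUID-derived feature bits read at boot. */
static bool has_rep_good;       /* X86_FEATURE_REP_GOOD */
static bool has_erms;           /* X86_FEATURE_ERMS     */

/* Placeholder bodies; the real variants differ only in how they zero. */
static void clear_page_unrolled(void *p)  { memset(p, 0, PAGE_SIZE); }
static void clear_page_rep_stosq(void *p) { memset(p, 0, PAGE_SIZE); } /* clear_page_c   */
static void clear_page_rep_stosb(void *p) { memset(p, 0, PAGE_SIZE); } /* clear_page_c_e */

/* The later altinstructions entry is applied last, so ERMS overrides
 * REP_GOOD when both are present. */
static void (*pick_clear_page(void))(void *)
{
        if (has_erms)
                return clear_page_rep_stosb;
        if (has_rep_good)
                return clear_page_rep_stosq;
        return clear_page_unrolled;
}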
diff --git a/arch/x86/lib/cmpxchg16b_emu.S b/arch/x86/lib/cmpxchg16b_emu.S
new file mode 100644
index 000000000000..1e572c507d06
--- /dev/null
+++ b/arch/x86/lib/cmpxchg16b_emu.S
@@ -0,0 +1,65 @@
1/*
2 * This program is free software; you can redistribute it and/or
3 * modify it under the terms of the GNU General Public License
4 * as published by the Free Software Foundation; version 2
5 * of the License.
6 *
7 */
8#include <linux/linkage.h>
9#include <asm/alternative-asm.h>
10#include <asm/frame.h>
11#include <asm/dwarf2.h>
12
13#ifdef CONFIG_SMP
14#define SEG_PREFIX %gs:
15#else
16#define SEG_PREFIX
17#endif
18
19.text
20
21/*
22 * Inputs:
23 * %rsi : memory location to compare
24 * %rax : low 64 bits of old value
25 * %rdx : high 64 bits of old value
26 * %rbx : low 64 bits of new value
27 * %rcx : high 64 bits of new value
28 * %al : Operation successful
29 */
30ENTRY(this_cpu_cmpxchg16b_emu)
31CFI_STARTPROC
32
33#
34# Emulate 'cmpxchg16b %gs:(%rsi)' except we return the result in %al not
35# via the ZF. Caller will access %al to get result.
36#
37# Note that this is only useful for a cpuops operation. Meaning that we
38# do *not* have a fully atomic operation but just an operation that is
39# *atomic* on a single cpu (as provided by the this_cpu_xx class of
40# macros).
41#
42this_cpu_cmpxchg16b_emu:
43 pushf
44 cli
45
46 cmpq SEG_PREFIX(%rsi), %rax
47 jne not_same
48 cmpq SEG_PREFIX 8(%rsi), %rdx
49 jne not_same
50
51 movq %rbx, SEG_PREFIX(%rsi)
52 movq %rcx, SEG_PREFIX 8(%rsi)
53
54 popf
55 mov $1, %al
56 ret
57
58 not_same:
59 popf
60 xor %al,%al
61 ret
62
63CFI_ENDPROC
64
65ENDPROC(this_cpu_cmpxchg16b_emu)
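For context, this_cpu_cmpxchg16b_emu backs the 16-byte this_cpu compare-and-exchange on processors without the cmpxchg16b instruction. Because it only disables interrupts (pushf; cli) rather than taking a lock, it is atomic with respect to the local CPU only, as the comment above notes. A rough C model of the operation being emulated (the struct and helper name are illustrative):

#include <stdbool.h>
#include <stdint.h>

struct u128 {
        uint64_t lo, hi;
};

/* Compare both 64-bit halves against the expected value and store the
 * new value only on a match; report success in a flag rather than ZF.
 * Interrupts are assumed disabled around this, so it is atomic only
 * against code running on the same CPU. */
static bool cmpxchg16b_local(struct u128 *slot, struct u128 old, struct u128 new)
{
        if (slot->lo != old.lo || slot->hi != old.hi)
                return false;           /* not_same: memory left untouched, %al = 0 */
        slot->lo = new.lo;
        slot->hi = new.hi;
        return true;                    /* %al = 1 */
}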
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index a460158b5ac5..024840266ba0 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -15,23 +15,30 @@
15#include <asm/asm-offsets.h> 15#include <asm/asm-offsets.h>
16#include <asm/thread_info.h> 16#include <asm/thread_info.h>
17#include <asm/cpufeature.h> 17#include <asm/cpufeature.h>
18#include <asm/alternative-asm.h>
18 19
19 .macro ALTERNATIVE_JUMP feature,orig,alt 20/*
21 * By placing feature2 after feature1 in altinstructions section, we logically
22 * implement:
23 * If CPU has feature2, jmp to alt2 is used
24 * else if CPU has feature1, jmp to alt1 is used
25 * else jmp to orig is used.
26 */
27 .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
200: 280:
21 .byte 0xe9 /* 32bit jump */ 29 .byte 0xe9 /* 32bit jump */
22 .long \orig-1f /* by default jump to orig */ 30 .long \orig-1f /* by default jump to orig */
231: 311:
24 .section .altinstr_replacement,"ax" 32 .section .altinstr_replacement,"ax"
252: .byte 0xe9 /* near jump with 32bit immediate */ 332: .byte 0xe9 /* near jump with 32bit immediate */
26 .long \alt-1b /* offset */ /* or alternatively to alt */ 34 .long \alt1-1b /* offset */ /* or alternatively to alt1 */
353: .byte 0xe9 /* near jump with 32bit immediate */
36 .long \alt2-1b /* offset */ /* or alternatively to alt2 */
27 .previous 37 .previous
38
28 .section .altinstructions,"a" 39 .section .altinstructions,"a"
29 .align 8 40 altinstruction_entry 0b,2b,\feature1,5,5
30 .quad 0b 41 altinstruction_entry 0b,3b,\feature2,5,5
31 .quad 2b
32 .word \feature /* when feature is set */
33 .byte 5
34 .byte 5
35 .previous 42 .previous
36 .endm 43 .endm
37 44
@@ -72,8 +79,10 @@ ENTRY(_copy_to_user)
72 addq %rdx,%rcx 79 addq %rdx,%rcx
73 jc bad_to_user 80 jc bad_to_user
74 cmpq TI_addr_limit(%rax),%rcx 81 cmpq TI_addr_limit(%rax),%rcx
75 jae bad_to_user 82 ja bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 83 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
84 copy_user_generic_unrolled,copy_user_generic_string, \
85 copy_user_enhanced_fast_string
77 CFI_ENDPROC 86 CFI_ENDPROC
78ENDPROC(_copy_to_user) 87ENDPROC(_copy_to_user)
79 88
@@ -85,8 +94,10 @@ ENTRY(_copy_from_user)
85 addq %rdx,%rcx 94 addq %rdx,%rcx
86 jc bad_from_user 95 jc bad_from_user
87 cmpq TI_addr_limit(%rax),%rcx 96 cmpq TI_addr_limit(%rax),%rcx
88 jae bad_from_user 97 ja bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 98 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
99 copy_user_generic_unrolled,copy_user_generic_string, \
100 copy_user_enhanced_fast_string
90 CFI_ENDPROC 101 CFI_ENDPROC
91ENDPROC(_copy_from_user) 102ENDPROC(_copy_from_user)
92 103
@@ -117,7 +128,7 @@ ENDPROC(bad_from_user)
117 * rdx count 128 * rdx count
118 * 129 *
119 * Output: 130 * Output:
120 * eax uncopied bytes or 0 if successfull. 131 * eax uncopied bytes or 0 if successful.
121 */ 132 */
122ENTRY(copy_user_generic_unrolled) 133ENTRY(copy_user_generic_unrolled)
123 CFI_STARTPROC 134 CFI_STARTPROC
@@ -255,3 +266,37 @@ ENTRY(copy_user_generic_string)
255 .previous 266 .previous
256 CFI_ENDPROC 267 CFI_ENDPROC
257ENDPROC(copy_user_generic_string) 268ENDPROC(copy_user_generic_string)
269
270/*
271 * Some CPUs are adding enhanced REP MOVSB/STOSB instructions.
272 * It's recommended to use enhanced REP MOVSB/STOSB if it's enabled.
273 *
274 * Input:
275 * rdi destination
276 * rsi source
277 * rdx count
278 *
279 * Output:
280 * eax uncopied bytes or 0 if successful.
281 */
282ENTRY(copy_user_enhanced_fast_string)
283 CFI_STARTPROC
284 andl %edx,%edx
285 jz 2f
286 movl %edx,%ecx
2871: rep
288 movsb
2892: xorl %eax,%eax
290 ret
291
292 .section .fixup,"ax"
29312: movl %ecx,%edx /* ecx is zerorest also */
294 jmp copy_user_handle_tail
295 .previous
296
297 .section __ex_table,"a"
298 .align 8
299 .quad 1b,12b
300 .previous
301 CFI_ENDPROC
302ENDPROC(copy_user_enhanced_fast_string)
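All three copy_user variants selected by ALTERNATIVE_JUMP share one contract: the return value in eax is 0 on success, or the number of bytes that could not be copied when a fault is taken, which the .fixup code recovers from the remaining count in ecx. A small model of that contract (fault_after is a hypothetical stand-in for the page fault the exception table would catch):

#include <stddef.h>
#include <string.h>

/* Copy n bytes, pretending a fault occurs after fault_after bytes.
 * Returns 0 on success, otherwise the number of uncopied bytes. */
static size_t copy_with_remainder(void *dst, const void *src, size_t n,
                                  size_t fault_after)
{
        size_t done = (n < fault_after) ? n : fault_after;

        memcpy(dst, src, done);
        return n - done;
}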
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index f0dba36578ea..fb903b758da8 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -1,6 +1,6 @@
1/* 1/*
2 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 2 * Copyright 2002, 2003 Andi Kleen, SuSE Labs.
3 * 3 *
4 * This file is subject to the terms and conditions of the GNU General Public 4 * This file is subject to the terms and conditions of the GNU General Public
5 * License. See the file COPYING in the main directory of this archive 5 * License. See the file COPYING in the main directory of this archive
6 * for more details. No warranty for anything given at all. 6 * for more details. No warranty for anything given at all.
@@ -11,82 +11,82 @@
11 11
12/* 12/*
13 * Checksum copy with exception handling. 13 * Checksum copy with exception handling.
14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the 14 * On exceptions src_err_ptr or dst_err_ptr is set to -EFAULT and the
15 * destination is zeroed. 15 * destination is zeroed.
16 * 16 *
17 * Input 17 * Input
18 * rdi source 18 * rdi source
19 * rsi destination 19 * rsi destination
20 * edx len (32bit) 20 * edx len (32bit)
21 * ecx sum (32bit) 21 * ecx sum (32bit)
22 * r8 src_err_ptr (int) 22 * r8 src_err_ptr (int)
23 * r9 dst_err_ptr (int) 23 * r9 dst_err_ptr (int)
24 * 24 *
25 * Output 25 * Output
26 * eax 64bit sum. undefined in case of exception. 26 * eax 64bit sum. undefined in case of exception.
27 * 27 *
28 * Wrappers need to take care of valid exception sum and zeroing. 28 * Wrappers need to take care of valid exception sum and zeroing.
29 * They also should align source or destination to 8 bytes. 29 * They also should align source or destination to 8 bytes.
30 */ 30 */
31 31
32 .macro source 32 .macro source
3310: 3310:
34 .section __ex_table,"a" 34 .section __ex_table, "a"
35 .align 8 35 .align 8
36 .quad 10b,.Lbad_source 36 .quad 10b, .Lbad_source
37 .previous 37 .previous
38 .endm 38 .endm
39 39
40 .macro dest 40 .macro dest
4120: 4120:
42 .section __ex_table,"a" 42 .section __ex_table, "a"
43 .align 8 43 .align 8
44 .quad 20b,.Lbad_dest 44 .quad 20b, .Lbad_dest
45 .previous 45 .previous
46 .endm 46 .endm
47 47
48 .macro ignore L=.Lignore 48 .macro ignore L=.Lignore
4930: 4930:
50 .section __ex_table,"a" 50 .section __ex_table, "a"
51 .align 8 51 .align 8
52 .quad 30b,\L 52 .quad 30b, \L
53 .previous 53 .previous
54 .endm 54 .endm
55 55
56 56
57ENTRY(csum_partial_copy_generic) 57ENTRY(csum_partial_copy_generic)
58 CFI_STARTPROC 58 CFI_STARTPROC
59 cmpl $3*64,%edx 59 cmpl $3*64, %edx
60 jle .Lignore 60 jle .Lignore
61 61
62.Lignore: 62.Lignore:
63 subq $7*8,%rsp 63 subq $7*8, %rsp
64 CFI_ADJUST_CFA_OFFSET 7*8 64 CFI_ADJUST_CFA_OFFSET 7*8
65 movq %rbx,2*8(%rsp) 65 movq %rbx, 2*8(%rsp)
66 CFI_REL_OFFSET rbx, 2*8 66 CFI_REL_OFFSET rbx, 2*8
67 movq %r12,3*8(%rsp) 67 movq %r12, 3*8(%rsp)
68 CFI_REL_OFFSET r12, 3*8 68 CFI_REL_OFFSET r12, 3*8
69 movq %r14,4*8(%rsp) 69 movq %r14, 4*8(%rsp)
70 CFI_REL_OFFSET r14, 4*8 70 CFI_REL_OFFSET r14, 4*8
71 movq %r13,5*8(%rsp) 71 movq %r13, 5*8(%rsp)
72 CFI_REL_OFFSET r13, 5*8 72 CFI_REL_OFFSET r13, 5*8
73 movq %rbp,6*8(%rsp) 73 movq %rbp, 6*8(%rsp)
74 CFI_REL_OFFSET rbp, 6*8 74 CFI_REL_OFFSET rbp, 6*8
75 75
76 movq %r8,(%rsp) 76 movq %r8, (%rsp)
77 movq %r9,1*8(%rsp) 77 movq %r9, 1*8(%rsp)
78
79 movl %ecx,%eax
80 movl %edx,%ecx
81 78
82 xorl %r9d,%r9d 79 movl %ecx, %eax
83 movq %rcx,%r12 80 movl %edx, %ecx
84 81
85 shrq $6,%r12 82 xorl %r9d, %r9d
86 jz .Lhandle_tail /* < 64 */ 83 movq %rcx, %r12
84
85 shrq $6, %r12
86 jz .Lhandle_tail /* < 64 */
87 87
88 clc 88 clc
89 89
90 /* main loop. clear in 64 byte blocks */ 90 /* main loop. clear in 64 byte blocks */
91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */ 91 /* r9: zero, r8: temp2, rbx: temp1, rax: sum, rcx: saved length */
92 /* r11: temp3, rdx: temp4, r12 loopcnt */ 92 /* r11: temp3, rdx: temp4, r12 loopcnt */
@@ -94,156 +94,156 @@ ENTRY(csum_partial_copy_generic)
94 .p2align 4 94 .p2align 4
95.Lloop: 95.Lloop:
96 source 96 source
97 movq (%rdi),%rbx 97 movq (%rdi), %rbx
98 source 98 source
99 movq 8(%rdi),%r8 99 movq 8(%rdi), %r8
100 source 100 source
101 movq 16(%rdi),%r11 101 movq 16(%rdi), %r11
102 source 102 source
103 movq 24(%rdi),%rdx 103 movq 24(%rdi), %rdx
104 104
105 source 105 source
106 movq 32(%rdi),%r10 106 movq 32(%rdi), %r10
107 source 107 source
108 movq 40(%rdi),%rbp 108 movq 40(%rdi), %rbp
109 source 109 source
110 movq 48(%rdi),%r14 110 movq 48(%rdi), %r14
111 source 111 source
112 movq 56(%rdi),%r13 112 movq 56(%rdi), %r13
113 113
114 ignore 2f 114 ignore 2f
115 prefetcht0 5*64(%rdi) 115 prefetcht0 5*64(%rdi)
1162: 1162:
117 adcq %rbx,%rax 117 adcq %rbx, %rax
118 adcq %r8,%rax 118 adcq %r8, %rax
119 adcq %r11,%rax 119 adcq %r11, %rax
120 adcq %rdx,%rax 120 adcq %rdx, %rax
121 adcq %r10,%rax 121 adcq %r10, %rax
122 adcq %rbp,%rax 122 adcq %rbp, %rax
123 adcq %r14,%rax 123 adcq %r14, %rax
124 adcq %r13,%rax 124 adcq %r13, %rax
125 125
126 decl %r12d 126 decl %r12d
127 127
128 dest 128 dest
129 movq %rbx,(%rsi) 129 movq %rbx, (%rsi)
130 dest 130 dest
131 movq %r8,8(%rsi) 131 movq %r8, 8(%rsi)
132 dest 132 dest
133 movq %r11,16(%rsi) 133 movq %r11, 16(%rsi)
134 dest 134 dest
135 movq %rdx,24(%rsi) 135 movq %rdx, 24(%rsi)
136 136
137 dest 137 dest
138 movq %r10,32(%rsi) 138 movq %r10, 32(%rsi)
139 dest 139 dest
140 movq %rbp,40(%rsi) 140 movq %rbp, 40(%rsi)
141 dest 141 dest
142 movq %r14,48(%rsi) 142 movq %r14, 48(%rsi)
143 dest 143 dest
144 movq %r13,56(%rsi) 144 movq %r13, 56(%rsi)
145 145
1463: 1463:
147
148 leaq 64(%rdi),%rdi
149 leaq 64(%rsi),%rsi
150 147
151 jnz .Lloop 148 leaq 64(%rdi), %rdi
149 leaq 64(%rsi), %rsi
152 150
153 adcq %r9,%rax 151 jnz .Lloop
154 152
155 /* do last upto 56 bytes */ 153 adcq %r9, %rax
154
155 /* do last up to 56 bytes */
156.Lhandle_tail: 156.Lhandle_tail:
157 /* ecx: count */ 157 /* ecx: count */
158 movl %ecx,%r10d 158 movl %ecx, %r10d
159 andl $63,%ecx 159 andl $63, %ecx
160 shrl $3,%ecx 160 shrl $3, %ecx
161 jz .Lfold 161 jz .Lfold
162 clc 162 clc
163 .p2align 4 163 .p2align 4
164.Lloop_8: 164.Lloop_8:
165 source 165 source
166 movq (%rdi),%rbx 166 movq (%rdi), %rbx
167 adcq %rbx,%rax 167 adcq %rbx, %rax
168 decl %ecx 168 decl %ecx
169 dest 169 dest
170 movq %rbx,(%rsi) 170 movq %rbx, (%rsi)
171 leaq 8(%rsi),%rsi /* preserve carry */ 171 leaq 8(%rsi), %rsi /* preserve carry */
172 leaq 8(%rdi),%rdi 172 leaq 8(%rdi), %rdi
173 jnz .Lloop_8 173 jnz .Lloop_8
174 adcq %r9,%rax /* add in carry */ 174 adcq %r9, %rax /* add in carry */
175 175
176.Lfold: 176.Lfold:
177 /* reduce checksum to 32bits */ 177 /* reduce checksum to 32bits */
178 movl %eax,%ebx 178 movl %eax, %ebx
179 shrq $32,%rax 179 shrq $32, %rax
180 addl %ebx,%eax 180 addl %ebx, %eax
181 adcl %r9d,%eax 181 adcl %r9d, %eax
182 182
183 /* do last upto 6 bytes */ 183 /* do last up to 6 bytes */
184.Lhandle_7: 184.Lhandle_7:
185 movl %r10d,%ecx 185 movl %r10d, %ecx
186 andl $7,%ecx 186 andl $7, %ecx
187 shrl $1,%ecx 187 shrl $1, %ecx
188 jz .Lhandle_1 188 jz .Lhandle_1
189 movl $2,%edx 189 movl $2, %edx
190 xorl %ebx,%ebx 190 xorl %ebx, %ebx
191 clc 191 clc
192 .p2align 4 192 .p2align 4
193.Lloop_1: 193.Lloop_1:
194 source 194 source
195 movw (%rdi),%bx 195 movw (%rdi), %bx
196 adcl %ebx,%eax 196 adcl %ebx, %eax
197 decl %ecx 197 decl %ecx
198 dest 198 dest
199 movw %bx,(%rsi) 199 movw %bx, (%rsi)
200 leaq 2(%rdi),%rdi 200 leaq 2(%rdi), %rdi
201 leaq 2(%rsi),%rsi 201 leaq 2(%rsi), %rsi
202 jnz .Lloop_1 202 jnz .Lloop_1
203 adcl %r9d,%eax /* add in carry */ 203 adcl %r9d, %eax /* add in carry */
204 204
205 /* handle last odd byte */ 205 /* handle last odd byte */
206.Lhandle_1: 206.Lhandle_1:
207 testl $1,%r10d 207 testl $1, %r10d
208 jz .Lende 208 jz .Lende
209 xorl %ebx,%ebx 209 xorl %ebx, %ebx
210 source 210 source
211 movb (%rdi),%bl 211 movb (%rdi), %bl
212 dest 212 dest
213 movb %bl,(%rsi) 213 movb %bl, (%rsi)
214 addl %ebx,%eax 214 addl %ebx, %eax
215 adcl %r9d,%eax /* carry */ 215 adcl %r9d, %eax /* carry */
216 216
217 CFI_REMEMBER_STATE 217 CFI_REMEMBER_STATE
218.Lende: 218.Lende:
219 movq 2*8(%rsp),%rbx 219 movq 2*8(%rsp), %rbx
220 CFI_RESTORE rbx 220 CFI_RESTORE rbx
221 movq 3*8(%rsp),%r12 221 movq 3*8(%rsp), %r12
222 CFI_RESTORE r12 222 CFI_RESTORE r12
223 movq 4*8(%rsp),%r14 223 movq 4*8(%rsp), %r14
224 CFI_RESTORE r14 224 CFI_RESTORE r14
225 movq 5*8(%rsp),%r13 225 movq 5*8(%rsp), %r13
226 CFI_RESTORE r13 226 CFI_RESTORE r13
227 movq 6*8(%rsp),%rbp 227 movq 6*8(%rsp), %rbp
228 CFI_RESTORE rbp 228 CFI_RESTORE rbp
229 addq $7*8,%rsp 229 addq $7*8, %rsp
230 CFI_ADJUST_CFA_OFFSET -7*8 230 CFI_ADJUST_CFA_OFFSET -7*8
231 ret 231 ret
232 CFI_RESTORE_STATE 232 CFI_RESTORE_STATE
233 233
234 /* Exception handlers. Very simple, zeroing is done in the wrappers */ 234 /* Exception handlers. Very simple, zeroing is done in the wrappers */
235.Lbad_source: 235.Lbad_source:
236 movq (%rsp),%rax 236 movq (%rsp), %rax
237 testq %rax,%rax 237 testq %rax, %rax
238 jz .Lende 238 jz .Lende
239 movl $-EFAULT,(%rax) 239 movl $-EFAULT, (%rax)
240 jmp .Lende 240 jmp .Lende
241 241
242.Lbad_dest: 242.Lbad_dest:
243 movq 8(%rsp),%rax 243 movq 8(%rsp), %rax
244 testq %rax,%rax 244 testq %rax, %rax
245 jz .Lende 245 jz .Lende
246 movl $-EFAULT,(%rax) 246 movl $-EFAULT, (%rax)
247 jmp .Lende 247 jmp .Lende
248 CFI_ENDPROC 248 CFI_ENDPROC
249ENDPROC(csum_partial_copy_generic) 249ENDPROC(csum_partial_copy_generic)
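The .Lfold block reduces the 64-bit running sum to 32 bits by adding the high and low halves and folding the final carry back in. The same step in plain C, as a reference sketch (not the kernel's csum_fold helper):

#include <stdint.h>

/* Fold a 64-bit checksum accumulator down to 32 bits:
 * result = low32 + high32, plus the carry out of that addition
 * (the adcl %r9d,%eax with %r9d == 0 in the assembly above). */
static uint32_t csum_fold_64_to_32(uint64_t sum)
{
        uint64_t t = (uint32_t)sum + (sum >> 32);

        return (uint32_t)t + (uint32_t)(t >> 32);
}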
diff --git a/arch/x86/lib/csum-partial_64.c b/arch/x86/lib/csum-partial_64.c
index bf51144d97e1..9845371c5c36 100644
--- a/arch/x86/lib/csum-partial_64.c
+++ b/arch/x86/lib/csum-partial_64.c
@@ -84,7 +84,7 @@ static unsigned do_csum(const unsigned char *buff, unsigned len)
84 count64--; 84 count64--;
85 } 85 }
86 86
87 /* last upto 7 8byte blocks */ 87 /* last up to 7 8byte blocks */
88 count %= 8; 88 count %= 8;
89 while (count) { 89 while (count) {
90 asm("addq %1,%0\n\t" 90 asm("addq %1,%0\n\t"
diff --git a/arch/x86/lib/delay.c b/arch/x86/lib/delay.c
index ff485d361182..fc45ba887d05 100644
--- a/arch/x86/lib/delay.c
+++ b/arch/x86/lib/delay.c
@@ -121,7 +121,7 @@ inline void __const_udelay(unsigned long xloops)
121 asm("mull %%edx" 121 asm("mull %%edx"
122 :"=d" (xloops), "=&a" (d0) 122 :"=d" (xloops), "=&a" (d0)
123 :"1" (xloops), "0" 123 :"1" (xloops), "0"
124 (cpu_data(raw_smp_processor_id()).loops_per_jiffy * (HZ/4))); 124 (this_cpu_read(cpu_info.loops_per_jiffy) * (HZ/4)));
125 125
126 __delay(++xloops); 126 __delay(++xloops);
127} 127}
diff --git a/arch/x86/lib/memcpy_32.c b/arch/x86/lib/memcpy_32.c
index 5415a9d06f53..b908a59eccf5 100644
--- a/arch/x86/lib/memcpy_32.c
+++ b/arch/x86/lib/memcpy_32.c
@@ -22,22 +22,187 @@ EXPORT_SYMBOL(memset);
22 22
23void *memmove(void *dest, const void *src, size_t n) 23void *memmove(void *dest, const void *src, size_t n)
24{ 24{
25 int d0, d1, d2; 25 int d0,d1,d2,d3,d4,d5;
26 26 char *ret = dest;
27 if (dest < src) { 27
28 memcpy(dest, src, n); 28 __asm__ __volatile__(
29 } else { 29 /* Handle more 16bytes in loop */
30 __asm__ __volatile__( 30 "cmp $0x10, %0\n\t"
31 "std\n\t" 31 "jb 1f\n\t"
32 "rep\n\t" 32
33 "movsb\n\t" 33 /* Decide forward/backward copy mode */
34 "cld" 34 "cmp %2, %1\n\t"
35 : "=&c" (d0), "=&S" (d1), "=&D" (d2) 35 "jb 2f\n\t"
36 :"0" (n), 36
37 "1" (n-1+src), 37 /*
38 "2" (n-1+dest) 38 * movs instruction have many startup latency
39 :"memory"); 39 * so we handle small size by general register.
40 } 40 */
41 return dest; 41 "cmp $680, %0\n\t"
42 "jb 3f\n\t"
43 /*
44 * the movs instruction is only good for the aligned case.
45 */
46 "mov %1, %3\n\t"
47 "xor %2, %3\n\t"
48 "and $0xff, %3\n\t"
49 "jz 4f\n\t"
50 "3:\n\t"
51 "sub $0x10, %0\n\t"
52
53 /*
54 * We gobble 16 bytes forward in each loop.
55 */
56 "3:\n\t"
57 "sub $0x10, %0\n\t"
58 "mov 0*4(%1), %3\n\t"
59 "mov 1*4(%1), %4\n\t"
60 "mov %3, 0*4(%2)\n\t"
61 "mov %4, 1*4(%2)\n\t"
62 "mov 2*4(%1), %3\n\t"
63 "mov 3*4(%1), %4\n\t"
64 "mov %3, 2*4(%2)\n\t"
65 "mov %4, 3*4(%2)\n\t"
66 "lea 0x10(%1), %1\n\t"
67 "lea 0x10(%2), %2\n\t"
68 "jae 3b\n\t"
69 "add $0x10, %0\n\t"
70 "jmp 1f\n\t"
71
72 /*
73 * Handle data forward by movs.
74 */
75 ".p2align 4\n\t"
76 "4:\n\t"
77 "mov -4(%1, %0), %3\n\t"
78 "lea -4(%2, %0), %4\n\t"
79 "shr $2, %0\n\t"
80 "rep movsl\n\t"
81 "mov %3, (%4)\n\t"
82 "jmp 11f\n\t"
83 /*
84 * Handle data backward by movs.
85 */
86 ".p2align 4\n\t"
87 "6:\n\t"
88 "mov (%1), %3\n\t"
89 "mov %2, %4\n\t"
90 "lea -4(%1, %0), %1\n\t"
91 "lea -4(%2, %0), %2\n\t"
92 "shr $2, %0\n\t"
93 "std\n\t"
94 "rep movsl\n\t"
95 "mov %3,(%4)\n\t"
96 "cld\n\t"
97 "jmp 11f\n\t"
98
99 /*
100 * Start to prepare for backward copy.
101 */
102 ".p2align 4\n\t"
103 "2:\n\t"
104 "cmp $680, %0\n\t"
105 "jb 5f\n\t"
106 "mov %1, %3\n\t"
107 "xor %2, %3\n\t"
108 "and $0xff, %3\n\t"
109 "jz 6b\n\t"
110
111 /*
112 * Calculate copy position to tail.
113 */
114 "5:\n\t"
115 "add %0, %1\n\t"
116 "add %0, %2\n\t"
117 "sub $0x10, %0\n\t"
118
119 /*
120 * We gobble 16 bytes backward in each loop.
121 */
122 "7:\n\t"
123 "sub $0x10, %0\n\t"
124
125 "mov -1*4(%1), %3\n\t"
126 "mov -2*4(%1), %4\n\t"
127 "mov %3, -1*4(%2)\n\t"
128 "mov %4, -2*4(%2)\n\t"
129 "mov -3*4(%1), %3\n\t"
130 "mov -4*4(%1), %4\n\t"
131 "mov %3, -3*4(%2)\n\t"
132 "mov %4, -4*4(%2)\n\t"
133 "lea -0x10(%1), %1\n\t"
134 "lea -0x10(%2), %2\n\t"
135 "jae 7b\n\t"
136 /*
137 * Calculate copy position to head.
138 */
139 "add $0x10, %0\n\t"
140 "sub %0, %1\n\t"
141 "sub %0, %2\n\t"
142
143 /*
144 * Move data from 8 bytes to 15 bytes.
145 */
146 ".p2align 4\n\t"
147 "1:\n\t"
148 "cmp $8, %0\n\t"
149 "jb 8f\n\t"
150 "mov 0*4(%1), %3\n\t"
151 "mov 1*4(%1), %4\n\t"
152 "mov -2*4(%1, %0), %5\n\t"
153 "mov -1*4(%1, %0), %1\n\t"
154
155 "mov %3, 0*4(%2)\n\t"
156 "mov %4, 1*4(%2)\n\t"
157 "mov %5, -2*4(%2, %0)\n\t"
158 "mov %1, -1*4(%2, %0)\n\t"
159 "jmp 11f\n\t"
160
161 /*
162 * Move data from 4 bytes to 7 bytes.
163 */
164 ".p2align 4\n\t"
165 "8:\n\t"
166 "cmp $4, %0\n\t"
167 "jb 9f\n\t"
168 "mov 0*4(%1), %3\n\t"
169 "mov -1*4(%1, %0), %4\n\t"
170 "mov %3, 0*4(%2)\n\t"
171 "mov %4, -1*4(%2, %0)\n\t"
172 "jmp 11f\n\t"
173
174 /*
175 * Move data from 2 bytes to 3 bytes.
176 */
177 ".p2align 4\n\t"
178 "9:\n\t"
179 "cmp $2, %0\n\t"
180 "jb 10f\n\t"
181 "movw 0*2(%1), %%dx\n\t"
182 "movw -1*2(%1, %0), %%bx\n\t"
183 "movw %%dx, 0*2(%2)\n\t"
184 "movw %%bx, -1*2(%2, %0)\n\t"
185 "jmp 11f\n\t"
186
187 /*
188 * Move data for 1 byte.
189 */
190 ".p2align 4\n\t"
191 "10:\n\t"
192 "cmp $1, %0\n\t"
193 "jb 11f\n\t"
194 "movb (%1), %%cl\n\t"
195 "movb %%cl, (%2)\n\t"
196 ".p2align 4\n\t"
197 "11:"
198 : "=&c" (d0), "=&S" (d1), "=&D" (d2),
199 "=r" (d3),"=r" (d4), "=r"(d5)
200 :"0" (n),
201 "1" (src),
202 "2" (dest)
203 :"memory");
204
205 return ret;
206
42} 207}
43EXPORT_SYMBOL(memmove); 208EXPORT_SYMBOL(memmove);
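The rewritten memmove keeps the classic rule for overlapping buffers: copy backward only when the destination starts above the source, so that no byte is overwritten before it has been read; the 16-byte blocks and the rep movsl path are optimizations layered on top. A byte-wise model of the rule, for reference:

#include <stddef.h>
#include <stdint.h>

static void *memmove_model(void *dst, const void *src, size_t n)
{
        unsigned char *d = dst;
        const unsigned char *s = src;

        if ((uintptr_t)d <= (uintptr_t)s) {     /* forward copy is safe */
                while (n--)
                        *d++ = *s++;
        } else {                                /* dst above src: copy backward */
                while (n--)
                        d[n] = s[n];
        }
        return dst;
}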
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index bcbcd1e0f7d5..efbf2a0ecdea 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -4,6 +4,7 @@
4 4
5#include <asm/cpufeature.h> 5#include <asm/cpufeature.h>
6#include <asm/dwarf2.h> 6#include <asm/dwarf2.h>
7#include <asm/alternative-asm.h>
7 8
8/* 9/*
9 * memcpy - Copy a memory block. 10 * memcpy - Copy a memory block.
@@ -37,107 +38,173 @@
37.Lmemcpy_e: 38.Lmemcpy_e:
38 .previous 39 .previous
39 40
41/*
42 * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
43 * memcpy_c. Use memcpy_c_e when possible.
44 *
45 * This gets patched over the unrolled variant (below) via the
46 * alternative instructions framework:
47 */
48 .section .altinstr_replacement, "ax", @progbits
49.Lmemcpy_c_e:
50 movq %rdi, %rax
51
52 movl %edx, %ecx
53 rep movsb
54 ret
55.Lmemcpy_e_e:
56 .previous
57
40ENTRY(__memcpy) 58ENTRY(__memcpy)
41ENTRY(memcpy) 59ENTRY(memcpy)
42 CFI_STARTPROC 60 CFI_STARTPROC
61 movq %rdi, %rax
43 62
44 /* 63 /*
45 * Put the number of full 64-byte blocks into %ecx. 64 * Use 32bit CMP here to avoid long NOP padding.
46 * Tail portion is handled at the end:
47 */ 65 */
48 movq %rdi, %rax 66 cmp $0x20, %edx
49 movl %edx, %ecx 67 jb .Lhandle_tail
50 shrl $6, %ecx
51 jz .Lhandle_tail
52 68
53 .p2align 4
54.Lloop_64:
55 /* 69 /*
56 * We decrement the loop index here - and the zero-flag is 70 * We check whether memory false dependence could occur,
57 * checked at the end of the loop (instructions inbetween do 71 * then jump to corresponding copy mode.
58 * not change the zero flag):
59 */ 72 */
60 decl %ecx 73 cmp %dil, %sil
74 jl .Lcopy_backward
75 subl $0x20, %edx
76.Lcopy_forward_loop:
77 subq $0x20, %rdx
61 78
62 /* 79 /*
63 * Move in blocks of 4x16 bytes: 80 * Move in blocks of 4x8 bytes:
64 */ 81 */
65 movq 0*8(%rsi), %r11 82 movq 0*8(%rsi), %r8
66 movq 1*8(%rsi), %r8 83 movq 1*8(%rsi), %r9
67 movq %r11, 0*8(%rdi) 84 movq 2*8(%rsi), %r10
68 movq %r8, 1*8(%rdi) 85 movq 3*8(%rsi), %r11
69 86 leaq 4*8(%rsi), %rsi
70 movq 2*8(%rsi), %r9 87
71 movq 3*8(%rsi), %r10 88 movq %r8, 0*8(%rdi)
72 movq %r9, 2*8(%rdi) 89 movq %r9, 1*8(%rdi)
73 movq %r10, 3*8(%rdi) 90 movq %r10, 2*8(%rdi)
74 91 movq %r11, 3*8(%rdi)
75 movq 4*8(%rsi), %r11 92 leaq 4*8(%rdi), %rdi
76 movq 5*8(%rsi), %r8 93 jae .Lcopy_forward_loop
77 movq %r11, 4*8(%rdi) 94 addq $0x20, %rdx
78 movq %r8, 5*8(%rdi) 95 jmp .Lhandle_tail
79 96
80 movq 6*8(%rsi), %r9 97.Lcopy_backward:
81 movq 7*8(%rsi), %r10 98 /*
82 movq %r9, 6*8(%rdi) 99 * Calculate copy position to tail.
83 movq %r10, 7*8(%rdi) 100 */
84 101 addq %rdx, %rsi
85 leaq 64(%rsi), %rsi 102 addq %rdx, %rdi
86 leaq 64(%rdi), %rdi 103 subq $0x20, %rdx
87 104 /*
88 jnz .Lloop_64 105 * At most 3 ALU operations in one cycle,
106 * so append NOPs in the same 16-byte chunk.
107 */
108 .p2align 4
109.Lcopy_backward_loop:
110 subq $0x20, %rdx
111 movq -1*8(%rsi), %r8
112 movq -2*8(%rsi), %r9
113 movq -3*8(%rsi), %r10
114 movq -4*8(%rsi), %r11
115 leaq -4*8(%rsi), %rsi
116 movq %r8, -1*8(%rdi)
117 movq %r9, -2*8(%rdi)
118 movq %r10, -3*8(%rdi)
119 movq %r11, -4*8(%rdi)
120 leaq -4*8(%rdi), %rdi
121 jae .Lcopy_backward_loop
89 122
123 /*
124 * Calculate copy position to head.
125 */
126 addq $0x20, %rdx
127 subq %rdx, %rsi
128 subq %rdx, %rdi
90.Lhandle_tail: 129.Lhandle_tail:
91 movl %edx, %ecx 130 cmpq $16, %rdx
92 andl $63, %ecx 131 jb .Lless_16bytes
93 shrl $3, %ecx
94 jz .Lhandle_7
95 132
133 /*
134 * Move data from 16 bytes to 31 bytes.
135 */
136 movq 0*8(%rsi), %r8
137 movq 1*8(%rsi), %r9
138 movq -2*8(%rsi, %rdx), %r10
139 movq -1*8(%rsi, %rdx), %r11
140 movq %r8, 0*8(%rdi)
141 movq %r9, 1*8(%rdi)
142 movq %r10, -2*8(%rdi, %rdx)
143 movq %r11, -1*8(%rdi, %rdx)
144 retq
96 .p2align 4 145 .p2align 4
97.Lloop_8: 146.Lless_16bytes:
98 decl %ecx 147 cmpq $8, %rdx
99 movq (%rsi), %r8 148 jb .Lless_8bytes
100 movq %r8, (%rdi) 149 /*
101 leaq 8(%rdi), %rdi 150 * Move data from 8 bytes to 15 bytes.
102 leaq 8(%rsi), %rsi 151 */
103 jnz .Lloop_8 152 movq 0*8(%rsi), %r8
104 153 movq -1*8(%rsi, %rdx), %r9
105.Lhandle_7: 154 movq %r8, 0*8(%rdi)
106 movl %edx, %ecx 155 movq %r9, -1*8(%rdi, %rdx)
107 andl $7, %ecx 156 retq
108 jz .Lend 157 .p2align 4
158.Lless_8bytes:
159 cmpq $4, %rdx
160 jb .Lless_3bytes
109 161
162 /*
163 * Move data from 4 bytes to 7 bytes.
164 */
165 movl (%rsi), %ecx
166 movl -4(%rsi, %rdx), %r8d
167 movl %ecx, (%rdi)
168 movl %r8d, -4(%rdi, %rdx)
169 retq
110 .p2align 4 170 .p2align 4
171.Lless_3bytes:
172 cmpl $0, %edx
173 je .Lend
174 /*
175 * Move data from 1 bytes to 3 bytes.
176 */
111.Lloop_1: 177.Lloop_1:
112 movb (%rsi), %r8b 178 movb (%rsi), %r8b
113 movb %r8b, (%rdi) 179 movb %r8b, (%rdi)
114 incq %rdi 180 incq %rdi
115 incq %rsi 181 incq %rsi
116 decl %ecx 182 decl %edx
117 jnz .Lloop_1 183 jnz .Lloop_1
118 184
119.Lend: 185.Lend:
120 ret 186 retq
121 CFI_ENDPROC 187 CFI_ENDPROC
122ENDPROC(memcpy) 188ENDPROC(memcpy)
123ENDPROC(__memcpy) 189ENDPROC(__memcpy)
124 190
125 /* 191 /*
126 * Some CPUs run faster using the string copy instructions. 192 * Some CPUs are adding enhanced REP MOVSB/STOSB feature
127 * It is also a lot simpler. Use this when possible: 193 * If the feature is supported, memcpy_c_e() is the first choice.
128 */ 194 * If enhanced rep movsb copy is not available, use fast string copy
129 195 * memcpy_c() when possible. This is faster and code is simpler than
130 .section .altinstructions, "a" 196 * original memcpy().
131 .align 8 197 * Otherwise, original memcpy() is used.
132 .quad memcpy 198 * In .altinstructions section, the ERMS feature is placed after the REP_GOOD
133 .quad .Lmemcpy_c 199 * feature to implement the right patch order.
134 .word X86_FEATURE_REP_GOOD 200 *
135
136 /*
137 * Replace only beginning, memcpy is used to apply alternatives, 201 * Replace only beginning, memcpy is used to apply alternatives,
138 * so it is silly to overwrite itself with nops - reboot is the 202 * so it is silly to overwrite itself with nops - reboot is the
139 * only outcome... 203 * only outcome...
140 */ 204 */
141 .byte .Lmemcpy_e - .Lmemcpy_c 205 .section .altinstructions, "a"
142 .byte .Lmemcpy_e - .Lmemcpy_c 206 altinstruction_entry memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
207 .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
208 altinstruction_entry memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
209 .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
143 .previous 210 .previous
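One detail of the new tail handling worth spelling out: for an 8..15 byte remainder, .Lless_16bytes does two 8-byte loads followed by two 8-byte stores, the second pair anchored at the end of the buffer, so any length in that range is covered without a byte loop (the two chunks simply overlap in the middle). A portable sketch of the same idea, with memcpy of a fixed 8-byte chunk standing in for movq:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy len bytes where 8 <= len <= 15. */
static void copy_8_to_15(void *dst, const void *src, size_t len)
{
        uint64_t head, tail;

        memcpy(&head, src, 8);                            /* movq 0*8(%rsi), %r8        */
        memcpy(&tail, (const char *)src + len - 8, 8);    /* movq -1*8(%rsi,%rdx), %r9  */
        memcpy(dst, &head, 8);                            /* movq %r8, 0*8(%rdi)        */
        memcpy((char *)dst + len - 8, &tail, 8);          /* movq %r9, -1*8(%rdi,%rdx)  */
}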
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
new file mode 100644
index 000000000000..d0ec9c2936d7
--- /dev/null
+++ b/arch/x86/lib/memmove_64.S
@@ -0,0 +1,224 @@
1/*
2 * Normally compiler builtins are used, but sometimes the compiler calls out
3 * of line code. Based on asm-i386/string.h.
4 *
5 * This assembly file is re-written from memmove_64.c file.
6 * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
7 */
8#define _STRING_C
9#include <linux/linkage.h>
10#include <asm/dwarf2.h>
11#include <asm/cpufeature.h>
12
13#undef memmove
14
15/*
16 * Implement memmove(). This can handle overlap between src and dst.
17 *
18 * Input:
19 * rdi: dest
20 * rsi: src
21 * rdx: count
22 *
23 * Output:
24 * rax: dest
25 */
26ENTRY(memmove)
27 CFI_STARTPROC
28
29 /* Handle 32 bytes or more in the loop */
30 mov %rdi, %rax
31 cmp $0x20, %rdx
32 jb 1f
33
34 /* Decide forward/backward copy mode */
35 cmp %rdi, %rsi
36 jge .Lmemmove_begin_forward
37 mov %rsi, %r8
38 add %rdx, %r8
39 cmp %rdi, %r8
40 jg 2f
41
42.Lmemmove_begin_forward:
43 /*
44 * the movsq instruction has a high startup latency,
45 * so we handle small sizes with general registers.
46 */
47 cmp $680, %rdx
48 jb 3f
49 /*
50 * the movsq instruction is only good for the aligned case.
51 */
52
53 cmpb %dil, %sil
54 je 4f
553:
56 sub $0x20, %rdx
57 /*
58 * We gobble 32 bytes forward in each loop.
59 */
605:
61 sub $0x20, %rdx
62 movq 0*8(%rsi), %r11
63 movq 1*8(%rsi), %r10
64 movq 2*8(%rsi), %r9
65 movq 3*8(%rsi), %r8
66 leaq 4*8(%rsi), %rsi
67
68 movq %r11, 0*8(%rdi)
69 movq %r10, 1*8(%rdi)
70 movq %r9, 2*8(%rdi)
71 movq %r8, 3*8(%rdi)
72 leaq 4*8(%rdi), %rdi
73 jae 5b
74 addq $0x20, %rdx
75 jmp 1f
76 /*
77 * Handle data forward by movsq.
78 */
79 .p2align 4
804:
81 movq %rdx, %rcx
82 movq -8(%rsi, %rdx), %r11
83 lea -8(%rdi, %rdx), %r10
84 shrq $3, %rcx
85 rep movsq
86 movq %r11, (%r10)
87 jmp 13f
88.Lmemmove_end_forward:
89
90 /*
91 * Handle data backward by movsq.
92 */
93 .p2align 4
947:
95 movq %rdx, %rcx
96 movq (%rsi), %r11
97 movq %rdi, %r10
98 leaq -8(%rsi, %rdx), %rsi
99 leaq -8(%rdi, %rdx), %rdi
100 shrq $3, %rcx
101 std
102 rep movsq
103 cld
104 movq %r11, (%r10)
105 jmp 13f
106
107 /*
108 * Start to prepare for backward copy.
109 */
110 .p2align 4
1112:
112 cmp $680, %rdx
113 jb 6f
114 cmp %dil, %sil
115 je 7b
1166:
117 /*
118 * Calculate copy position to tail.
119 */
120 addq %rdx, %rsi
121 addq %rdx, %rdi
122 subq $0x20, %rdx
123 /*
124 * We gobble 32 bytes backward in each loop.
125 */
1268:
127 subq $0x20, %rdx
128 movq -1*8(%rsi), %r11
129 movq -2*8(%rsi), %r10
130 movq -3*8(%rsi), %r9
131 movq -4*8(%rsi), %r8
132 leaq -4*8(%rsi), %rsi
133
134 movq %r11, -1*8(%rdi)
135 movq %r10, -2*8(%rdi)
136 movq %r9, -3*8(%rdi)
137 movq %r8, -4*8(%rdi)
138 leaq -4*8(%rdi), %rdi
139 jae 8b
140 /*
141 * Calculate copy position to head.
142 */
143 addq $0x20, %rdx
144 subq %rdx, %rsi
145 subq %rdx, %rdi
1461:
147 cmpq $16, %rdx
148 jb 9f
149 /*
150 * Move data from 16 bytes to 31 bytes.
151 */
152 movq 0*8(%rsi), %r11
153 movq 1*8(%rsi), %r10
154 movq -2*8(%rsi, %rdx), %r9
155 movq -1*8(%rsi, %rdx), %r8
156 movq %r11, 0*8(%rdi)
157 movq %r10, 1*8(%rdi)
158 movq %r9, -2*8(%rdi, %rdx)
159 movq %r8, -1*8(%rdi, %rdx)
160 jmp 13f
161 .p2align 4
1629:
163 cmpq $8, %rdx
164 jb 10f
165 /*
166 * Move data from 8 bytes to 15 bytes.
167 */
168 movq 0*8(%rsi), %r11
169 movq -1*8(%rsi, %rdx), %r10
170 movq %r11, 0*8(%rdi)
171 movq %r10, -1*8(%rdi, %rdx)
172 jmp 13f
17310:
174 cmpq $4, %rdx
175 jb 11f
176 /*
177 * Move data from 4 bytes to 7 bytes.
178 */
179 movl (%rsi), %r11d
180 movl -4(%rsi, %rdx), %r10d
181 movl %r11d, (%rdi)
182 movl %r10d, -4(%rdi, %rdx)
183 jmp 13f
18411:
185 cmp $2, %rdx
186 jb 12f
187 /*
188 * Move data from 2 bytes to 3 bytes.
189 */
190 movw (%rsi), %r11w
191 movw -2(%rsi, %rdx), %r10w
192 movw %r11w, (%rdi)
193 movw %r10w, -2(%rdi, %rdx)
194 jmp 13f
19512:
196 cmp $1, %rdx
197 jb 13f
198 /*
199 * Move data for 1 byte.
200 */
201 movb (%rsi), %r11b
202 movb %r11b, (%rdi)
20313:
204 retq
205 CFI_ENDPROC
206
207 .section .altinstr_replacement,"ax"
208.Lmemmove_begin_forward_efs:
209 /* Forward moving data. */
210 movq %rdx, %rcx
211 rep movsb
212 retq
213.Lmemmove_end_forward_efs:
214 .previous
215
216 .section .altinstructions,"a"
217 .align 8
218 .quad .Lmemmove_begin_forward
219 .quad .Lmemmove_begin_forward_efs
220 .word X86_FEATURE_ERMS
221 .byte .Lmemmove_end_forward-.Lmemmove_begin_forward
222 .byte .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
223 .previous
224ENDPROC(memmove)
diff --git a/arch/x86/lib/memmove_64.c b/arch/x86/lib/memmove_64.c
deleted file mode 100644
index 0a33909bf122..000000000000
--- a/arch/x86/lib/memmove_64.c
+++ /dev/null
@@ -1,21 +0,0 @@
1/* Normally compiler builtins are used, but sometimes the compiler calls out
2 of line code. Based on asm-i386/string.h.
3 */
4#define _STRING_C
5#include <linux/string.h>
6#include <linux/module.h>
7
8#undef memmove
9void *memmove(void *dest, const void *src, size_t count)
10{
11 if (dest < src) {
12 return memcpy(dest, src, count);
13 } else {
14 char *p = dest + count;
15 const char *s = src + count;
16 while (count--)
17 *--p = *--s;
18 }
19 return dest;
20}
21EXPORT_SYMBOL(memmove);
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 09d344269652..79bd454b78a3 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -2,9 +2,13 @@
2 2
3#include <linux/linkage.h> 3#include <linux/linkage.h>
4#include <asm/dwarf2.h> 4#include <asm/dwarf2.h>
5#include <asm/cpufeature.h>
6#include <asm/alternative-asm.h>
5 7
6/* 8/*
7 * ISO C memset - set a memory block to a byte value. 9 * ISO C memset - set a memory block to a byte value. This function uses fast
10 * string to get better performance than the original function. The code is
11 * simpler and shorter than the orignal function as well.
8 * 12 *
9 * rdi destination 13 * rdi destination
10 * rsi value (char) 14 * rsi value (char)
@@ -31,6 +35,28 @@
31.Lmemset_e: 35.Lmemset_e:
32 .previous 36 .previous
33 37
38/*
39 * ISO C memset - set a memory block to a byte value. This function uses
40 * enhanced rep stosb to override the fast string function.
41 * The code is simpler and shorter than the fast string function as well.
42 *
43 * rdi destination
44 * rsi value (char)
45 * rdx count (bytes)
46 *
47 * rax original destination
48 */
49 .section .altinstr_replacement, "ax", @progbits
50.Lmemset_c_e:
51 movq %rdi,%r9
52 movb %sil,%al
53 movl %edx,%ecx
54 rep stosb
55 movq %r9,%rax
56 ret
57.Lmemset_e_e:
58 .previous
59
34ENTRY(memset) 60ENTRY(memset)
35ENTRY(__memset) 61ENTRY(__memset)
36 CFI_STARTPROC 62 CFI_STARTPROC
@@ -112,16 +138,20 @@ ENTRY(__memset)
112ENDPROC(memset) 138ENDPROC(memset)
113ENDPROC(__memset) 139ENDPROC(__memset)
114 140
115 /* Some CPUs run faster using the string instructions. 141 /* Some CPUs support enhanced REP MOVSB/STOSB feature.
116 It is also a lot simpler. Use this when possible */ 142 * It is recommended to use this when possible.
117 143 *
118#include <asm/cpufeature.h> 144 * If enhanced REP MOVSB/STOSB feature is not available, use fast string
119 145 * instructions.
146 *
147 * Otherwise, use original memset function.
148 *
149 * In .altinstructions section, the ERMS feature is placed after the REP_GOOD
150 * feature to implement the right patch order.
151 */
120 .section .altinstructions,"a" 152 .section .altinstructions,"a"
121 .align 8 153 altinstruction_entry memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
122 .quad memset 154 .Lfinal-memset,.Lmemset_e-.Lmemset_c
123 .quad .Lmemset_c 155 altinstruction_entry memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
124 .word X86_FEATURE_REP_GOOD 156 .Lfinal-memset,.Lmemset_e_e-.Lmemset_c_e
125 .byte .Lfinal - memset
126 .byte .Lmemset_e - .Lmemset_c
127 .previous 157 .previous
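The .Lmemset_c_e replacement added above is a bare rep stosb with the original destination preserved for the return value; in C terms it behaves like the sketch below (an illustrative model, not the kernel symbol):

#include <stddef.h>

static void *memset_erms_model(void *dst, int value, size_t count)
{
        unsigned char *p = dst;

        while (count--)
                *p++ = (unsigned char)value;    /* rep stosb with %al = low byte of value */
        return dst;                             /* movq %r9, %rax: original destination   */
}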
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
index 41fcf00e49df..67743977398b 100644
--- a/arch/x86/lib/rwsem_64.S
+++ b/arch/x86/lib/rwsem_64.S
@@ -23,43 +23,50 @@
23#include <asm/dwarf2.h> 23#include <asm/dwarf2.h>
24 24
25#define save_common_regs \ 25#define save_common_regs \
26 pushq %rdi; \ 26 pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
27 pushq %rsi; \ 27 pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
28 pushq %rcx; \ 28 pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
29 pushq %r8; \ 29 pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
30 pushq %r9; \ 30 pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
31 pushq %r10; \ 31 pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
32 pushq %r11 32 pushq_cfi %r11; CFI_REL_OFFSET r11, 0
33 33
34#define restore_common_regs \ 34#define restore_common_regs \
35 popq %r11; \ 35 popq_cfi %r11; CFI_RESTORE r11; \
36 popq %r10; \ 36 popq_cfi %r10; CFI_RESTORE r10; \
37 popq %r9; \ 37 popq_cfi %r9; CFI_RESTORE r9; \
38 popq %r8; \ 38 popq_cfi %r8; CFI_RESTORE r8; \
39 popq %rcx; \ 39 popq_cfi %rcx; CFI_RESTORE rcx; \
40 popq %rsi; \ 40 popq_cfi %rsi; CFI_RESTORE rsi; \
41 popq %rdi 41 popq_cfi %rdi; CFI_RESTORE rdi
42 42
43/* Fix up special calling conventions */ 43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed) 44ENTRY(call_rwsem_down_read_failed)
45 CFI_STARTPROC
45 save_common_regs 46 save_common_regs
46 pushq %rdx 47 pushq_cfi %rdx
48 CFI_REL_OFFSET rdx, 0
47 movq %rax,%rdi 49 movq %rax,%rdi
48 call rwsem_down_read_failed 50 call rwsem_down_read_failed
49 popq %rdx 51 popq_cfi %rdx
52 CFI_RESTORE rdx
50 restore_common_regs 53 restore_common_regs
51 ret 54 ret
52 ENDPROC(call_rwsem_down_read_failed) 55 CFI_ENDPROC
56ENDPROC(call_rwsem_down_read_failed)
53 57
54ENTRY(call_rwsem_down_write_failed) 58ENTRY(call_rwsem_down_write_failed)
59 CFI_STARTPROC
55 save_common_regs 60 save_common_regs
56 movq %rax,%rdi 61 movq %rax,%rdi
57 call rwsem_down_write_failed 62 call rwsem_down_write_failed
58 restore_common_regs 63 restore_common_regs
59 ret 64 ret
60 ENDPROC(call_rwsem_down_write_failed) 65 CFI_ENDPROC
66ENDPROC(call_rwsem_down_write_failed)
61 67
62ENTRY(call_rwsem_wake) 68ENTRY(call_rwsem_wake)
69 CFI_STARTPROC
63 decl %edx /* do nothing if still outstanding active readers */ 70 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f 71 jnz 1f
65 save_common_regs 72 save_common_regs
@@ -67,15 +74,20 @@ ENTRY(call_rwsem_wake)
67 call rwsem_wake 74 call rwsem_wake
68 restore_common_regs 75 restore_common_regs
691: ret 761: ret
70 ENDPROC(call_rwsem_wake) 77 CFI_ENDPROC
78ENDPROC(call_rwsem_wake)
71 79
72/* Fix up special calling conventions */ 80/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake) 81ENTRY(call_rwsem_downgrade_wake)
82 CFI_STARTPROC
74 save_common_regs 83 save_common_regs
75 pushq %rdx 84 pushq_cfi %rdx
85 CFI_REL_OFFSET rdx, 0
76 movq %rax,%rdi 86 movq %rax,%rdi
77 call rwsem_downgrade_wake 87 call rwsem_downgrade_wake
78 popq %rdx 88 popq_cfi %rdx
89 CFI_RESTORE rdx
79 restore_common_regs 90 restore_common_regs
80 ret 91 ret
81 ENDPROC(call_rwsem_downgrade_wake) 92 CFI_ENDPROC
93ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/semaphore_32.S b/arch/x86/lib/semaphore_32.S
index 648fe4741782..06691daa4108 100644
--- a/arch/x86/lib/semaphore_32.S
+++ b/arch/x86/lib/semaphore_32.S
@@ -36,7 +36,7 @@
36 */ 36 */
37#ifdef CONFIG_SMP 37#ifdef CONFIG_SMP
38ENTRY(__write_lock_failed) 38ENTRY(__write_lock_failed)
39 CFI_STARTPROC simple 39 CFI_STARTPROC
40 FRAME 40 FRAME
412: LOCK_PREFIX 412: LOCK_PREFIX
42 addl $ RW_LOCK_BIAS,(%eax) 42 addl $ RW_LOCK_BIAS,(%eax)
@@ -74,29 +74,23 @@ ENTRY(__read_lock_failed)
74/* Fix up special calling conventions */ 74/* Fix up special calling conventions */
75ENTRY(call_rwsem_down_read_failed) 75ENTRY(call_rwsem_down_read_failed)
76 CFI_STARTPROC 76 CFI_STARTPROC
77 push %ecx 77 pushl_cfi %ecx
78 CFI_ADJUST_CFA_OFFSET 4
79 CFI_REL_OFFSET ecx,0 78 CFI_REL_OFFSET ecx,0
80 push %edx 79 pushl_cfi %edx
81 CFI_ADJUST_CFA_OFFSET 4
82 CFI_REL_OFFSET edx,0 80 CFI_REL_OFFSET edx,0
83 call rwsem_down_read_failed 81 call rwsem_down_read_failed
84 pop %edx 82 popl_cfi %edx
85 CFI_ADJUST_CFA_OFFSET -4 83 popl_cfi %ecx
86 pop %ecx
87 CFI_ADJUST_CFA_OFFSET -4
88 ret 84 ret
89 CFI_ENDPROC 85 CFI_ENDPROC
90 ENDPROC(call_rwsem_down_read_failed) 86 ENDPROC(call_rwsem_down_read_failed)
91 87
92ENTRY(call_rwsem_down_write_failed) 88ENTRY(call_rwsem_down_write_failed)
93 CFI_STARTPROC 89 CFI_STARTPROC
94 push %ecx 90 pushl_cfi %ecx
95 CFI_ADJUST_CFA_OFFSET 4
96 CFI_REL_OFFSET ecx,0 91 CFI_REL_OFFSET ecx,0
97 calll rwsem_down_write_failed 92 calll rwsem_down_write_failed
98 pop %ecx 93 popl_cfi %ecx
99 CFI_ADJUST_CFA_OFFSET -4
100 ret 94 ret
101 CFI_ENDPROC 95 CFI_ENDPROC
102 ENDPROC(call_rwsem_down_write_failed) 96 ENDPROC(call_rwsem_down_write_failed)
@@ -105,12 +99,10 @@ ENTRY(call_rwsem_wake)
105 CFI_STARTPROC 99 CFI_STARTPROC
106 decw %dx /* do nothing if still outstanding active readers */ 100 decw %dx /* do nothing if still outstanding active readers */
107 jnz 1f 101 jnz 1f
108 push %ecx 102 pushl_cfi %ecx
109 CFI_ADJUST_CFA_OFFSET 4
110 CFI_REL_OFFSET ecx,0 103 CFI_REL_OFFSET ecx,0
111 call rwsem_wake 104 call rwsem_wake
112 pop %ecx 105 popl_cfi %ecx
113 CFI_ADJUST_CFA_OFFSET -4
1141: ret 1061: ret
115 CFI_ENDPROC 107 CFI_ENDPROC
116 ENDPROC(call_rwsem_wake) 108 ENDPROC(call_rwsem_wake)
@@ -118,17 +110,13 @@ ENTRY(call_rwsem_wake)
118/* Fix up special calling conventions */ 110/* Fix up special calling conventions */
119ENTRY(call_rwsem_downgrade_wake) 111ENTRY(call_rwsem_downgrade_wake)
120 CFI_STARTPROC 112 CFI_STARTPROC
121 push %ecx 113 pushl_cfi %ecx
122 CFI_ADJUST_CFA_OFFSET 4
123 CFI_REL_OFFSET ecx,0 114 CFI_REL_OFFSET ecx,0
124 push %edx 115 pushl_cfi %edx
125 CFI_ADJUST_CFA_OFFSET 4
126 CFI_REL_OFFSET edx,0 116 CFI_REL_OFFSET edx,0
127 call rwsem_downgrade_wake 117 call rwsem_downgrade_wake
128 pop %edx 118 popl_cfi %edx
129 CFI_ADJUST_CFA_OFFSET -4 119 popl_cfi %ecx
130 pop %ecx
131 CFI_ADJUST_CFA_OFFSET -4
132 ret 120 ret
133 CFI_ENDPROC 121 CFI_ENDPROC
134 ENDPROC(call_rwsem_downgrade_wake) 122 ENDPROC(call_rwsem_downgrade_wake)
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index 650b11e00ecc..2930ae05d773 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -7,24 +7,6 @@
7 7
8 #include <linux/linkage.h> 8 #include <linux/linkage.h>
9 9
10#define ARCH_TRACE_IRQS_ON \
11 pushl %eax; \
12 pushl %ecx; \
13 pushl %edx; \
14 call trace_hardirqs_on; \
15 popl %edx; \
16 popl %ecx; \
17 popl %eax;
18
19#define ARCH_TRACE_IRQS_OFF \
20 pushl %eax; \
21 pushl %ecx; \
22 pushl %edx; \
23 call trace_hardirqs_off; \
24 popl %edx; \
25 popl %ecx; \
26 popl %eax;
27
28#ifdef CONFIG_TRACE_IRQFLAGS 10#ifdef CONFIG_TRACE_IRQFLAGS
29 /* put return address in eax (arg1) */ 11 /* put return address in eax (arg1) */
30 .macro thunk_ra name,func 12 .macro thunk_ra name,func
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index bf9a7d5a5428..782b082c9ff7 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -22,26 +22,6 @@
22 CFI_ENDPROC 22 CFI_ENDPROC
23 .endm 23 .endm
24 24
25 /* rdi: arg1 ... normal C conventions. rax is passed from C. */
26 .macro thunk_retrax name,func
27 .globl \name
28\name:
29 CFI_STARTPROC
30 SAVE_ARGS
31 call \func
32 jmp restore_norax
33 CFI_ENDPROC
34 .endm
35
36
37 .section .sched.text, "ax"
38#ifdef CONFIG_RWSEM_XCHGADD_ALGORITHM
39 thunk rwsem_down_read_failed_thunk,rwsem_down_read_failed
40 thunk rwsem_down_write_failed_thunk,rwsem_down_write_failed
41 thunk rwsem_wake_thunk,rwsem_wake
42 thunk rwsem_downgrade_thunk,rwsem_downgrade_wake
43#endif
44
45#ifdef CONFIG_TRACE_IRQFLAGS 25#ifdef CONFIG_TRACE_IRQFLAGS
46 /* put return address in rdi (arg1) */ 26 /* put return address in rdi (arg1) */
47 .macro thunk_ra name,func 27 .macro thunk_ra name,func
@@ -72,10 +52,3 @@ restore:
72 RESTORE_ARGS 52 RESTORE_ARGS
73 ret 53 ret
74 CFI_ENDPROC 54 CFI_ENDPROC
75
76 CFI_STARTPROC
77 SAVE_ARGS
78restore_norax:
79 RESTORE_ARGS 1
80 ret
81 CFI_ENDPROC
diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile
index a4c768397baa..3d11327c9ab4 100644
--- a/arch/x86/mm/Makefile
+++ b/arch/x86/mm/Makefile
@@ -23,7 +23,10 @@ mmiotrace-y := kmmio.o pf_in.o mmio-mod.o
23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o 23obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o
24 24
25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o 25obj-$(CONFIG_NUMA) += numa.o numa_$(BITS).o
26obj-$(CONFIG_K8_NUMA) += k8topology_64.o 26obj-$(CONFIG_AMD_NUMA) += amdtopology.o
27obj-$(CONFIG_ACPI_NUMA) += srat_$(BITS).o 27obj-$(CONFIG_ACPI_NUMA) += srat.o
28obj-$(CONFIG_NUMA_EMU) += numa_emulation.o
29
30obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o
28 31
29obj-$(CONFIG_MEMTEST) += memtest.o 32obj-$(CONFIG_MEMTEST) += memtest.o
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/amdtopology.c
index 970ed579d4e4..5247d01329ca 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/amdtopology.c
@@ -1,8 +1,8 @@
1/* 1/*
2 * AMD K8 NUMA support. 2 * AMD NUMA support.
3 * Discover the memory map and associated nodes. 3 * Discover the memory map and associated nodes.
4 * 4 *
5 * This version reads it directly from the K8 northbridge. 5 * This version reads it directly from the AMD northbridge.
6 * 6 *
7 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 7 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
8 */ 8 */
@@ -11,6 +11,9 @@
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/module.h> 12#include <linux/module.h>
13#include <linux/nodemask.h> 13#include <linux/nodemask.h>
14#include <linux/memblock.h>
15#include <linux/bootmem.h>
16
14#include <asm/io.h> 17#include <asm/io.h>
15#include <linux/pci_ids.h> 18#include <linux/pci_ids.h>
16#include <linux/acpi.h> 19#include <linux/acpi.h>
@@ -22,10 +25,9 @@
22#include <asm/numa.h> 25#include <asm/numa.h>
23#include <asm/mpspec.h> 26#include <asm/mpspec.h>
24#include <asm/apic.h> 27#include <asm/apic.h>
25#include <asm/k8.h> 28#include <asm/amd_nb.h>
26 29
27static struct bootnode __initdata nodes[8]; 30static unsigned char __initdata nodeids[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29 31
30static __init int find_northbridge(void) 32static __init int find_northbridge(void)
31{ 33{
@@ -48,14 +50,14 @@ static __init int find_northbridge(void)
48 return num; 50 return num;
49 } 51 }
50 52
51 return -1; 53 return -ENOENT;
52} 54}
53 55
54static __init void early_get_boot_cpu_id(void) 56static __init void early_get_boot_cpu_id(void)
55{ 57{
56 /* 58 /*
57 * need to get boot_cpu_id so can use that to create apicid_to_node 59 * need to get the APIC ID of the BSP so can use that to
58 * in k8_scan_nodes() 60 * create apicid_to_node in amd_scan_nodes()
59 */ 61 */
60#ifdef CONFIG_X86_MPPARSE 62#ifdef CONFIG_X86_MPPARSE
61 /* 63 /*
@@ -64,33 +66,20 @@ static __init void early_get_boot_cpu_id(void)
64 if (smp_found_config) 66 if (smp_found_config)
65 early_get_smp_config(); 67 early_get_smp_config();
66#endif 68#endif
67 early_init_lapic_mapping();
68}
69
70int __init k8_get_nodes(struct bootnode *physnodes)
71{
72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81} 69}
82 70
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn) 71int __init amd_numa_init(void)
84{ 72{
85 unsigned long start = PFN_PHYS(start_pfn); 73 u64 start = PFN_PHYS(0);
86 unsigned long end = PFN_PHYS(end_pfn); 74 u64 end = PFN_PHYS(max_pfn);
87 unsigned numnodes; 75 unsigned numnodes;
88 unsigned long prevbase; 76 u64 prevbase;
89 int i, nb, found = 0; 77 int i, j, nb;
90 u32 nodeid, reg; 78 u32 nodeid, reg;
79 unsigned int bits, cores, apicid_base;
91 80
92 if (!early_pci_allowed()) 81 if (!early_pci_allowed())
93 return -1; 82 return -EINVAL;
94 83
95 nb = find_northbridge(); 84 nb = find_northbridge();
96 if (nb < 0) 85 if (nb < 0)
@@ -101,40 +90,40 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
101 reg = read_pci_config(0, nb, 0, 0x60); 90 reg = read_pci_config(0, nb, 0, 0x60);
102 numnodes = ((reg >> 4) & 0xF) + 1; 91 numnodes = ((reg >> 4) & 0xF) + 1;
103 if (numnodes <= 1) 92 if (numnodes <= 1)
104 return -1; 93 return -ENOENT;
105 94
106 pr_info("Number of physical nodes %d\n", numnodes); 95 pr_info("Number of physical nodes %d\n", numnodes);
107 96
108 prevbase = 0; 97 prevbase = 0;
109 for (i = 0; i < 8; i++) { 98 for (i = 0; i < 8; i++) {
110 unsigned long base, limit; 99 u64 base, limit;
111 100
112 base = read_pci_config(0, nb, 1, 0x40 + i*8); 101 base = read_pci_config(0, nb, 1, 0x40 + i*8);
113 limit = read_pci_config(0, nb, 1, 0x44 + i*8); 102 limit = read_pci_config(0, nb, 1, 0x44 + i*8);
114 103
115 nodeid = limit & 7; 104 nodeids[i] = nodeid = limit & 7;
116 if ((base & 3) == 0) { 105 if ((base & 3) == 0) {
117 if (i < numnodes) 106 if (i < numnodes)
118 pr_info("Skipping disabled node %d\n", i); 107 pr_info("Skipping disabled node %d\n", i);
119 continue; 108 continue;
120 } 109 }
121 if (nodeid >= numnodes) { 110 if (nodeid >= numnodes) {
122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid, 111 pr_info("Ignoring excess node %d (%Lx:%Lx)\n", nodeid,
123 base, limit); 112 base, limit);
124 continue; 113 continue;
125 } 114 }
126 115
127 if (!limit) { 116 if (!limit) {
128 pr_info("Skipping node entry %d (base %lx)\n", 117 pr_info("Skipping node entry %d (base %Lx)\n",
129 i, base); 118 i, base);
130 continue; 119 continue;
131 } 120 }
132 if ((base >> 8) & 3 || (limit >> 8) & 3) { 121 if ((base >> 8) & 3 || (limit >> 8) & 3) {
133 pr_err("Node %d using interleaving mode %lx/%lx\n", 122 pr_err("Node %d using interleaving mode %Lx/%Lx\n",
134 nodeid, (base >> 8) & 3, (limit >> 8) & 3); 123 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
135 return -1; 124 return -EINVAL;
136 } 125 }
137 if (node_isset(nodeid, nodes_parsed)) { 126 if (node_isset(nodeid, numa_nodes_parsed)) {
138 pr_info("Node %d already present, skipping\n", 127 pr_info("Node %d already present, skipping\n",
139 nodeid); 128 nodeid);
140 continue; 129 continue;
@@ -162,74 +151,47 @@ int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
162 continue; 151 continue;
163 } 152 }
164 if (limit < base) { 153 if (limit < base) {
165 pr_err("Node %d bogus settings %lx-%lx.\n", 154 pr_err("Node %d bogus settings %Lx-%Lx.\n",
166 nodeid, base, limit); 155 nodeid, base, limit);
167 continue; 156 continue;
168 } 157 }
169 158
170 /* Could sort here, but pun for now. Should not happen anyroads. */ 159 /* Could sort here, but pun for now. Should not happen anyroads. */
171 if (prevbase > base) { 160 if (prevbase > base) {
172 pr_err("Node map not sorted %lx,%lx\n", 161 pr_err("Node map not sorted %Lx,%Lx\n",
173 prevbase, base); 162 prevbase, base);
174 return -1; 163 return -EINVAL;
175 } 164 }
176 165
177 pr_info("Node %d MemBase %016lx Limit %016lx\n", 166 pr_info("Node %d MemBase %016Lx Limit %016Lx\n",
178 nodeid, base, limit); 167 nodeid, base, limit);
179 168
180 found++;
181
182 nodes[nodeid].start = base;
183 nodes[nodeid].end = limit;
184
185 prevbase = base; 169 prevbase = base;
186 170 numa_add_memblk(nodeid, base, limit);
187 node_set(nodeid, nodes_parsed); 171 node_set(nodeid, numa_nodes_parsed);
188 } 172 }
189 173
190 if (!found) 174 if (!nodes_weight(numa_nodes_parsed))
191 return -1; 175 return -ENOENT;
192 return 0;
193}
194 176
195int __init k8_scan_nodes(void) 177 /*
196{ 178 * We seem to have valid NUMA configuration. Map apicids to nodes
197 unsigned int bits; 179 * using the coreid bits from early_identify_cpu.
198 unsigned int cores; 180 */
199 unsigned int apicid_base;
200 int i;
201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
205 if (memnode_shift < 0) {
206 pr_err("No NUMA node hash function found. Contact maintainer\n");
207 return -1;
208 }
209 pr_info("Using node hash shift of %d\n", memnode_shift);
210
211 /* use the coreid bits from early_identify_cpu */
212 bits = boot_cpu_data.x86_coreid_bits; 181 bits = boot_cpu_data.x86_coreid_bits;
213 cores = (1<<bits); 182 cores = 1 << bits;
214 apicid_base = 0; 183 apicid_base = 0;
215 /* need to get boot_cpu_id early for system with apicid lifting */ 184
185 /* get the APIC ID of the BSP early for systems with apicid lifting */
216 early_get_boot_cpu_id(); 186 early_get_boot_cpu_id();
217 if (boot_cpu_physical_apicid > 0) { 187 if (boot_cpu_physical_apicid > 0) {
218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid); 188 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
219 apicid_base = boot_cpu_physical_apicid; 189 apicid_base = boot_cpu_physical_apicid;
220 } 190 }
221 191
222 for_each_node_mask(i, node_possible_map) { 192 for_each_node_mask(i, numa_nodes_parsed)
223 int j;
224
225 e820_register_active_regions(i,
226 nodes[i].start >> PAGE_SHIFT,
227 nodes[i].end >> PAGE_SHIFT);
228 for (j = apicid_base; j < cores + apicid_base; j++) 193 for (j = apicid_base; j < cores + apicid_base; j++)
229 apicid_to_node[(i << bits) + j] = i; 194 set_apicid_to_node((i << bits) + j, i);
230 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
231 }
232 195
233 numa_init_array();
234 return 0; 196 return 0;
235} 197}
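As a side note on the NUMA parsing hunks above: the rewritten tail of k8_numa_init() now hands each memory range to numa_add_memblk() and then maps APIC IDs onto nodes with the coreid bits. A rough user-space model of that apicid-to-node mapping follows; apicid_to_node[], map_apicids() and the parsed-node array are stand-ins invented for illustration, not the kernel's interfaces.

    #include <stdio.h>

    #define MAX_APICID 64

    static int apicid_to_node[MAX_APICID];

    /* model: 'bits' coreid bits, so (1 << bits) cores share one node */
    static void map_apicids(unsigned int bits, unsigned int apicid_base,
                            const int *nodes_parsed, int nr_nodes)
    {
        unsigned int cores = 1u << bits;

        for (int i = 0; i < nr_nodes; i++) {
            if (!nodes_parsed[i])
                continue;
            /* node i owns apicids (i << bits) + apicid_base .. + cores - 1 */
            for (unsigned int j = apicid_base; j < cores + apicid_base; j++)
                apicid_to_node[(i << bits) + j] = i;
        }
    }

    int main(void)
    {
        int parsed[4] = { 1, 1, 0, 1 };     /* toy mask: nodes 0, 1 and 3 present */

        for (int a = 0; a < MAX_APICID; a++)
            apicid_to_node[a] = -1;
        map_apicids(2, 0, parsed, 4);       /* 2 coreid bits, BSP apicid 0 */
        for (int a = 0; a < 16; a++)
            printf("apicid %2d -> node %d\n", a, apicid_to_node[a]);
        return 0;
    }

Compiled with a plain cc invocation, it prints which toy node each of the first 16 APIC IDs lands on.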
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4c4508e8a204..2dbf6bf4c7e5 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -11,6 +11,8 @@
11#include <linux/kprobes.h> /* __kprobes, ... */ 11#include <linux/kprobes.h> /* __kprobes, ... */
12#include <linux/mmiotrace.h> /* kmmio_handler, ... */ 12#include <linux/mmiotrace.h> /* kmmio_handler, ... */
13#include <linux/perf_event.h> /* perf_sw_event */ 13#include <linux/perf_event.h> /* perf_sw_event */
14#include <linux/hugetlb.h> /* hstate_index_to_shift */
15#include <linux/prefetch.h> /* prefetchw */
14 16
15#include <asm/traps.h> /* dotraplinkage, ... */ 17#include <asm/traps.h> /* dotraplinkage, ... */
16#include <asm/pgalloc.h> /* pgd_*(), ... */ 18#include <asm/pgalloc.h> /* pgd_*(), ... */
@@ -160,15 +162,20 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
160 162
161static void 163static void
162force_sig_info_fault(int si_signo, int si_code, unsigned long address, 164force_sig_info_fault(int si_signo, int si_code, unsigned long address,
163 struct task_struct *tsk) 165 struct task_struct *tsk, int fault)
164{ 166{
167 unsigned lsb = 0;
165 siginfo_t info; 168 siginfo_t info;
166 169
167 info.si_signo = si_signo; 170 info.si_signo = si_signo;
168 info.si_errno = 0; 171 info.si_errno = 0;
169 info.si_code = si_code; 172 info.si_code = si_code;
170 info.si_addr = (void __user *)address; 173 info.si_addr = (void __user *)address;
171 info.si_addr_lsb = si_code == BUS_MCEERR_AR ? PAGE_SHIFT : 0; 174 if (fault & VM_FAULT_HWPOISON_LARGE)
175 lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
176 if (fault & VM_FAULT_HWPOISON)
177 lsb = PAGE_SHIFT;
178 info.si_addr_lsb = lsb;
172 179
173 force_sig_info(si_signo, &info, tsk); 180 force_sig_info(si_signo, &info, tsk);
174} 181}
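A note on the force_sig_info_fault() hunk above: the siginfo now reports how much of the address is poisoned by encoding a shift in si_addr_lsb. The stand-alone sketch below mirrors only that flag-to-shift decision; PAGE_SHIFT, the VM_FAULT_* bits, VM_FAULT_GET_HINDEX() and the hstate shift table are modeled values here, not the kernel's definitions.

    #include <stdio.h>

    /* stand-in values, not the kernel's definitions */
    #define PAGE_SHIFT              12
    #define VM_FAULT_HWPOISON       0x10
    #define VM_FAULT_HWPOISON_LARGE 0x20
    #define VM_FAULT_GET_HINDEX(x)  (((x) >> 12) & 0xf)

    /* modeled hstate table: index 0 = 2 MB pages, index 1 = 1 GB pages */
    static unsigned int hstate_index_to_shift(unsigned int idx)
    {
        static const unsigned int shift[] = { 21, 30 };
        return idx < 2 ? shift[idx] : 0;
    }

    /* least significant valid bit of the poisoned address (si_addr_lsb) */
    static unsigned int poison_lsb(unsigned int fault)
    {
        unsigned int lsb = 0;

        if (fault & VM_FAULT_HWPOISON_LARGE)
            lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault));
        if (fault & VM_FAULT_HWPOISON)
            lsb = PAGE_SHIFT;
        return lsb;
    }

    int main(void)
    {
        printf("small page poison lsb: %u\n", poison_lsb(VM_FAULT_HWPOISON));
        printf("2MB huge page poison lsb: %u\n",
               poison_lsb(VM_FAULT_HWPOISON_LARGE | (0 << 12)));
        return 0;
    }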
@@ -223,16 +230,24 @@ void vmalloc_sync_all(void)
223 for (address = VMALLOC_START & PMD_MASK; 230 for (address = VMALLOC_START & PMD_MASK;
224 address >= TASK_SIZE && address < FIXADDR_TOP; 231 address >= TASK_SIZE && address < FIXADDR_TOP;
225 address += PMD_SIZE) { 232 address += PMD_SIZE) {
226
227 unsigned long flags;
228 struct page *page; 233 struct page *page;
229 234
230 spin_lock_irqsave(&pgd_lock, flags); 235 spin_lock(&pgd_lock);
231 list_for_each_entry(page, &pgd_list, lru) { 236 list_for_each_entry(page, &pgd_list, lru) {
232 if (!vmalloc_sync_one(page_address(page), address)) 237 spinlock_t *pgt_lock;
238 pmd_t *ret;
239
240 /* the pgt_lock is only needed for Xen */
241 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
242
243 spin_lock(pgt_lock);
244 ret = vmalloc_sync_one(page_address(page), address);
245 spin_unlock(pgt_lock);
246
247 if (!ret)
233 break; 248 break;
234 } 249 }
235 spin_unlock_irqrestore(&pgd_lock, flags); 250 spin_unlock(&pgd_lock);
236 } 251 }
237} 252}
238 253
@@ -251,6 +266,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
251 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 266 if (!(address >= VMALLOC_START && address < VMALLOC_END))
252 return -1; 267 return -1;
253 268
269 WARN_ON_ONCE(in_nmi());
270
254 /* 271 /*
255 * Synchronize this task's top level page-table 272 * Synchronize this task's top level page-table
256 * with the 'reference' page table. 273 * with the 'reference' page table.
@@ -326,29 +343,7 @@ out:
326 343
327void vmalloc_sync_all(void) 344void vmalloc_sync_all(void)
328{ 345{
329 unsigned long address; 346 sync_global_pgds(VMALLOC_START & PGDIR_MASK, VMALLOC_END);
330
331 for (address = VMALLOC_START & PGDIR_MASK; address <= VMALLOC_END;
332 address += PGDIR_SIZE) {
333
334 const pgd_t *pgd_ref = pgd_offset_k(address);
335 unsigned long flags;
336 struct page *page;
337
338 if (pgd_none(*pgd_ref))
339 continue;
340
341 spin_lock_irqsave(&pgd_lock, flags);
342 list_for_each_entry(page, &pgd_list, lru) {
343 pgd_t *pgd;
344 pgd = (pgd_t *)page_address(page) + pgd_index(address);
345 if (pgd_none(*pgd))
346 set_pgd(pgd, *pgd_ref);
347 else
348 BUG_ON(pgd_page_vaddr(*pgd) != pgd_page_vaddr(*pgd_ref));
349 }
350 spin_unlock_irqrestore(&pgd_lock, flags);
351 }
352} 347}
353 348
354/* 349/*
@@ -369,6 +364,8 @@ static noinline __kprobes int vmalloc_fault(unsigned long address)
369 if (!(address >= VMALLOC_START && address < VMALLOC_END)) 364 if (!(address >= VMALLOC_START && address < VMALLOC_END))
370 return -1; 365 return -1;
371 366
367 WARN_ON_ONCE(in_nmi());
368
372 /* 369 /*
373 * Copy kernel mappings over when needed. This can also 370 * Copy kernel mappings over when needed. This can also
374 * happen within a race in page table update. In the later 371 * happen within a race in page table update. In the later
@@ -731,7 +728,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
731 tsk->thread.error_code = error_code | (address >= TASK_SIZE); 728 tsk->thread.error_code = error_code | (address >= TASK_SIZE);
732 tsk->thread.trap_no = 14; 729 tsk->thread.trap_no = 14;
733 730
734 force_sig_info_fault(SIGSEGV, si_code, address, tsk); 731 force_sig_info_fault(SIGSEGV, si_code, address, tsk, 0);
735 732
736 return; 733 return;
737 } 734 }
@@ -816,28 +813,51 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
816 tsk->thread.trap_no = 14; 813 tsk->thread.trap_no = 14;
817 814
818#ifdef CONFIG_MEMORY_FAILURE 815#ifdef CONFIG_MEMORY_FAILURE
819 if (fault & VM_FAULT_HWPOISON) { 816 if (fault & (VM_FAULT_HWPOISON|VM_FAULT_HWPOISON_LARGE)) {
820 printk(KERN_ERR 817 printk(KERN_ERR
821 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n", 818 "MCE: Killing %s:%d due to hardware memory corruption fault at %lx\n",
822 tsk->comm, tsk->pid, address); 819 tsk->comm, tsk->pid, address);
823 code = BUS_MCEERR_AR; 820 code = BUS_MCEERR_AR;
824 } 821 }
825#endif 822#endif
826 force_sig_info_fault(SIGBUS, code, address, tsk); 823 force_sig_info_fault(SIGBUS, code, address, tsk, fault);
827} 824}
828 825
829static noinline void 826static noinline int
830mm_fault_error(struct pt_regs *regs, unsigned long error_code, 827mm_fault_error(struct pt_regs *regs, unsigned long error_code,
831 unsigned long address, unsigned int fault) 828 unsigned long address, unsigned int fault)
832{ 829{
830 /*
831 * The page fault was interrupted by SIGKILL. We have no reason to
832 * continue handling it.
833 */
834 if (fatal_signal_pending(current)) {
835 if (!(fault & VM_FAULT_RETRY))
836 up_read(&current->mm->mmap_sem);
837 if (!(error_code & PF_USER))
838 no_context(regs, error_code, address);
839 return 1;
840 }
841 if (!(fault & VM_FAULT_ERROR))
842 return 0;
843
833 if (fault & VM_FAULT_OOM) { 844 if (fault & VM_FAULT_OOM) {
845 /* Kernel mode? Handle exceptions or die: */
846 if (!(error_code & PF_USER)) {
847 up_read(&current->mm->mmap_sem);
848 no_context(regs, error_code, address);
849 return 1;
850 }
851
834 out_of_memory(regs, error_code, address); 852 out_of_memory(regs, error_code, address);
835 } else { 853 } else {
836 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON)) 854 if (fault & (VM_FAULT_SIGBUS|VM_FAULT_HWPOISON|
855 VM_FAULT_HWPOISON_LARGE))
837 do_sigbus(regs, error_code, address, fault); 856 do_sigbus(regs, error_code, address, fault);
838 else 857 else
839 BUG(); 858 BUG();
840 } 859 }
860 return 1;
841} 861}
842 862
843static int spurious_fault_check(unsigned long error_code, pte_t *pte) 863static int spurious_fault_check(unsigned long error_code, pte_t *pte)
@@ -894,8 +914,14 @@ spurious_fault(unsigned long error_code, unsigned long address)
894 if (pmd_large(*pmd)) 914 if (pmd_large(*pmd))
895 return spurious_fault_check(error_code, (pte_t *) pmd); 915 return spurious_fault_check(error_code, (pte_t *) pmd);
896 916
917 /*
918 * Note: don't use pte_present() here, since it returns true
919 * if the _PAGE_PROTNONE bit is set. However, this aliases the
920 * _PAGE_GLOBAL bit, which for kernel pages gives false positives
921 * when CONFIG_DEBUG_PAGEALLOC is used.
922 */
897 pte = pte_offset_kernel(pmd, address); 923 pte = pte_offset_kernel(pmd, address);
898 if (!pte_present(*pte)) 924 if (!(pte_flags(*pte) & _PAGE_PRESENT))
899 return 0; 925 return 0;
900 926
901 ret = spurious_fault_check(error_code, pte); 927 ret = spurious_fault_check(error_code, pte);
@@ -915,9 +941,9 @@ spurious_fault(unsigned long error_code, unsigned long address)
915int show_unhandled_signals = 1; 941int show_unhandled_signals = 1;
916 942
917static inline int 943static inline int
918access_error(unsigned long error_code, int write, struct vm_area_struct *vma) 944access_error(unsigned long error_code, struct vm_area_struct *vma)
919{ 945{
920 if (write) { 946 if (error_code & PF_WRITE) {
921 /* write, present and write, not present: */ 947 /* write, present and write, not present: */
922 if (unlikely(!(vma->vm_flags & VM_WRITE))) 948 if (unlikely(!(vma->vm_flags & VM_WRITE)))
923 return 1; 949 return 1;
@@ -952,8 +978,10 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
952 struct task_struct *tsk; 978 struct task_struct *tsk;
953 unsigned long address; 979 unsigned long address;
954 struct mm_struct *mm; 980 struct mm_struct *mm;
955 int write;
956 int fault; 981 int fault;
982 int write = error_code & PF_WRITE;
983 unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_KILLABLE |
984 (write ? FAULT_FLAG_WRITE : 0);
957 985
958 tsk = current; 986 tsk = current;
959 mm = tsk->mm; 987 mm = tsk->mm;
@@ -1064,6 +1092,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1064 bad_area_nosemaphore(regs, error_code, address); 1092 bad_area_nosemaphore(regs, error_code, address);
1065 return; 1093 return;
1066 } 1094 }
1095retry:
1067 down_read(&mm->mmap_sem); 1096 down_read(&mm->mmap_sem);
1068 } else { 1097 } else {
1069 /* 1098 /*
@@ -1107,9 +1136,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code)
1107 * we can handle it.. 1136 * we can handle it..
1108 */ 1137 */
1109good_area: 1138good_area:
1110 write = error_code & PF_WRITE; 1139 if (unlikely(access_error(error_code, vma))) {
1111
1112 if (unlikely(access_error(error_code, write, vma))) {
1113 bad_area_access_error(regs, error_code, address); 1140 bad_area_access_error(regs, error_code, address);
1114 return; 1141 return;
1115 } 1142 }
@@ -1119,21 +1146,34 @@ good_area:
1119 * make sure we exit gracefully rather than endlessly redo 1146 * make sure we exit gracefully rather than endlessly redo
1120 * the fault: 1147 * the fault:
1121 */ 1148 */
1122 fault = handle_mm_fault(mm, vma, address, write ? FAULT_FLAG_WRITE : 0); 1149 fault = handle_mm_fault(mm, vma, address, flags);
1123 1150
1124 if (unlikely(fault & VM_FAULT_ERROR)) { 1151 if (unlikely(fault & (VM_FAULT_RETRY|VM_FAULT_ERROR))) {
1125 mm_fault_error(regs, error_code, address, fault); 1152 if (mm_fault_error(regs, error_code, address, fault))
1126 return; 1153 return;
1127 } 1154 }
1128 1155
1129 if (fault & VM_FAULT_MAJOR) { 1156 /*
1130 tsk->maj_flt++; 1157 * Major/minor page fault accounting is only done on the
1131 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0, 1158 * initial attempt. If we go through a retry, it is extremely
1132 regs, address); 1159 * likely that the page will be found in page cache at that point.
1133 } else { 1160 */
1134 tsk->min_flt++; 1161 if (flags & FAULT_FLAG_ALLOW_RETRY) {
1135 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0, 1162 if (fault & VM_FAULT_MAJOR) {
1136 regs, address); 1163 tsk->maj_flt++;
1164 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, 0,
1165 regs, address);
1166 } else {
1167 tsk->min_flt++;
1168 perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, 0,
1169 regs, address);
1170 }
1171 if (fault & VM_FAULT_RETRY) {
1172 /* Clear FAULT_FLAG_ALLOW_RETRY to avoid any risk
1173 * of starvation. */
1174 flags &= ~FAULT_FLAG_ALLOW_RETRY;
1175 goto retry;
1176 }
1137 } 1177 }
1138 1178
1139 check_v8086_mode(regs, address, tsk); 1179 check_v8086_mode(regs, address, tsk);
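The do_page_fault() changes above amount to one bounded retry: the first attempt may drop mmap_sem and come back with VM_FAULT_RETRY, and the second attempt runs with FAULT_FLAG_ALLOW_RETRY cleared so it cannot loop. A small user-space model of that control flow, with handle_mm_fault() replaced by a fake that asks for exactly one retry:

    #include <stdio.h>

    #define FAULT_FLAG_ALLOW_RETRY 0x1
    #define FAULT_FLAG_WRITE       0x2
    #define VM_FAULT_RETRY         0x4
    #define VM_FAULT_MAJOR         0x8

    /* fake fault handler: asks for a retry exactly once when allowed */
    static unsigned int fake_handle_mm_fault(unsigned int flags)
    {
        static int first_try = 1;

        if ((flags & FAULT_FLAG_ALLOW_RETRY) && first_try) {
            first_try = 0;
            return VM_FAULT_RETRY;
        }
        return VM_FAULT_MAJOR;
    }

    int main(void)
    {
        unsigned int flags = FAULT_FLAG_ALLOW_RETRY | FAULT_FLAG_WRITE;
        unsigned int fault;
        int attempts = 0, maj_flt = 0;

    retry:
        attempts++;
        fault = fake_handle_mm_fault(flags);

        /* fault accounting only while a retry was still allowed */
        if (flags & FAULT_FLAG_ALLOW_RETRY) {
            if (fault & VM_FAULT_MAJOR)
                maj_flt++;
            if (fault & VM_FAULT_RETRY) {
                /* clear the flag so the second pass cannot loop forever */
                flags &= ~FAULT_FLAG_ALLOW_RETRY;
                goto retry;
            }
        }
        printf("attempts=%d major faults accounted=%d\n", attempts, maj_flt);
        return 0;
    }

It prints two attempts and zero accounted major faults, matching the rule in the hunk that accounting only happens while a retry is still allowed.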
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 738e6593799d..dbe34b931374 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -8,6 +8,7 @@
8#include <linux/mm.h> 8#include <linux/mm.h>
9#include <linux/vmstat.h> 9#include <linux/vmstat.h>
10#include <linux/highmem.h> 10#include <linux/highmem.h>
11#include <linux/swap.h>
11 12
12#include <asm/pgtable.h> 13#include <asm/pgtable.h>
13 14
@@ -89,6 +90,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
89 VM_BUG_ON(!pfn_valid(pte_pfn(pte))); 90 VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
90 page = pte_page(pte); 91 page = pte_page(pte);
91 get_page(page); 92 get_page(page);
93 SetPageReferenced(page);
92 pages[*nr] = page; 94 pages[*nr] = page;
93 (*nr)++; 95 (*nr)++;
94 96
@@ -103,6 +105,17 @@ static inline void get_head_page_multiple(struct page *page, int nr)
103 VM_BUG_ON(page != compound_head(page)); 105 VM_BUG_ON(page != compound_head(page));
104 VM_BUG_ON(page_count(page) == 0); 106 VM_BUG_ON(page_count(page) == 0);
105 atomic_add(nr, &page->_count); 107 atomic_add(nr, &page->_count);
108 SetPageReferenced(page);
109}
110
111static inline void get_huge_page_tail(struct page *page)
112{
113 /*
114 * __split_huge_page_refcount() cannot run
115 * from under us.
116 */
117 VM_BUG_ON(atomic_read(&page->_count) < 0);
118 atomic_inc(&page->_count);
106} 119}
107 120
108static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr, 121static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
@@ -128,6 +141,8 @@ static noinline int gup_huge_pmd(pmd_t pmd, unsigned long addr,
128 do { 141 do {
129 VM_BUG_ON(compound_head(page) != head); 142 VM_BUG_ON(compound_head(page) != head);
130 pages[*nr] = page; 143 pages[*nr] = page;
144 if (PageTail(page))
145 get_huge_page_tail(page);
131 (*nr)++; 146 (*nr)++;
132 page++; 147 page++;
133 refs++; 148 refs++;
@@ -148,7 +163,18 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
148 pmd_t pmd = *pmdp; 163 pmd_t pmd = *pmdp;
149 164
150 next = pmd_addr_end(addr, end); 165 next = pmd_addr_end(addr, end);
151 if (pmd_none(pmd)) 166 /*
167 * The pmd_trans_splitting() check below explains why
168 * pmdp_splitting_flush has to flush the tlb, to stop
169 * this gup-fast code from running while we set the
170 * splitting bit in the pmd. Returning zero will take
171 * the slow path that will call wait_split_huge_page()
172 * if the pmd is still in splitting state. gup-fast
173 * can't wait itself because it runs with irqs disabled and
174 * wait_split_huge_page() would never return as the
175 * tlb flush IPI wouldn't run.
176 */
177 if (pmd_none(pmd) || pmd_trans_splitting(pmd))
152 return 0; 178 return 0;
153 if (unlikely(pmd_large(pmd))) { 179 if (unlikely(pmd_large(pmd))) {
154 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) 180 if (!gup_huge_pmd(pmd, addr, next, write, pages, nr))
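The gup.c hunks do two things: tail pages of a huge page now take their own reference via get_huge_page_tail(), and the lockless walk backs off whenever the pmd is absent or being split. A tiny user-space model of that back-off decision; the pmd is reduced to a few invented flag bits rather than the real pmd_trans_splitting()/pmd_large() helpers.

    #include <stdio.h>

    #define PMD_PRESENT   0x1
    #define PMD_LARGE     0x2
    #define PMD_SPLITTING 0x4

    /*
     * Returns 1 if the lockless fast path may keep walking, 0 if the caller
     * must fall back to the slow path (which can sleep waiting for the split).
     */
    static int gup_fast_may_walk(unsigned int pmd)
    {
        if (!(pmd & PMD_PRESENT) || (pmd & PMD_SPLITTING))
            return 0;
        return 1;
    }

    int main(void)
    {
        printf("regular pmd:   %d\n", gup_fast_may_walk(PMD_PRESENT));
        printf("huge pmd:      %d\n", gup_fast_may_walk(PMD_PRESENT | PMD_LARGE));
        printf("splitting pmd: %d\n",
               gup_fast_may_walk(PMD_PRESENT | PMD_LARGE | PMD_SPLITTING));
        return 0;
    }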
diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c
index 5e8fa12ef861..b49962662101 100644
--- a/arch/x86/mm/highmem_32.c
+++ b/arch/x86/mm/highmem_32.c
@@ -9,6 +9,7 @@ void *kmap(struct page *page)
9 return page_address(page); 9 return page_address(page);
10 return kmap_high(page); 10 return kmap_high(page);
11} 11}
12EXPORT_SYMBOL(kmap);
12 13
13void kunmap(struct page *page) 14void kunmap(struct page *page)
14{ 15{
@@ -18,6 +19,7 @@ void kunmap(struct page *page)
18 return; 19 return;
19 kunmap_high(page); 20 kunmap_high(page);
20} 21}
22EXPORT_SYMBOL(kunmap);
21 23
22/* 24/*
23 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because 25 * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because
@@ -27,10 +29,10 @@ void kunmap(struct page *page)
27 * However when holding an atomic kmap it is not legal to sleep, so atomic 29 * However when holding an atomic kmap it is not legal to sleep, so atomic
28 * kmaps are appropriate for short, tight code paths only. 30 * kmaps are appropriate for short, tight code paths only.
29 */ 31 */
30void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) 32void *kmap_atomic_prot(struct page *page, pgprot_t prot)
31{ 33{
32 enum fixed_addresses idx;
33 unsigned long vaddr; 34 unsigned long vaddr;
35 int idx, type;
34 36
35 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */ 37 /* even !CONFIG_PREEMPT needs this, for in_atomic in do_page_fault */
36 pagefault_disable(); 38 pagefault_disable();
@@ -38,8 +40,7 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
38 if (!PageHighMem(page)) 40 if (!PageHighMem(page))
39 return page_address(page); 41 return page_address(page);
40 42
41 debug_kmap_atomic(type); 43 type = kmap_atomic_idx_push();
42
43 idx = type + KM_TYPE_NR*smp_processor_id(); 44 idx = type + KM_TYPE_NR*smp_processor_id();
44 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 45 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
45 BUG_ON(!pte_none(*(kmap_pte-idx))); 46 BUG_ON(!pte_none(*(kmap_pte-idx)));
@@ -47,44 +48,57 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot)
47 48
48 return (void *)vaddr; 49 return (void *)vaddr;
49} 50}
51EXPORT_SYMBOL(kmap_atomic_prot);
52
53void *__kmap_atomic(struct page *page)
54{
55 return kmap_atomic_prot(page, kmap_prot);
56}
57EXPORT_SYMBOL(__kmap_atomic);
50 58
51void *kmap_atomic(struct page *page, enum km_type type) 59/*
60 * This is the same as kmap_atomic() but can map memory that doesn't
61 * have a struct page associated with it.
62 */
63void *kmap_atomic_pfn(unsigned long pfn)
52{ 64{
53 return kmap_atomic_prot(page, type, kmap_prot); 65 return kmap_atomic_prot_pfn(pfn, kmap_prot);
54} 66}
67EXPORT_SYMBOL_GPL(kmap_atomic_pfn);
55 68
56void kunmap_atomic_notypecheck(void *kvaddr, enum km_type type) 69void __kunmap_atomic(void *kvaddr)
57{ 70{
58 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 71 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
59 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id(); 72
60 73 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
61 /* 74 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
62 * Force other mappings to Oops if they'll try to access this pte 75 int idx, type;
63 * without first remap it. Keeping stale mappings around is a bad idea 76
64 * also, in case the page changes cacheability attributes or becomes 77 type = kmap_atomic_idx();
65 * a protected page in a hypervisor. 78 idx = type + KM_TYPE_NR * smp_processor_id();
66 */ 79
67 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 80#ifdef CONFIG_DEBUG_HIGHMEM
81 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
82#endif
83 /*
84 * Force other mappings to Oops if they'll try to access this
85 * pte without first remapping it. Keeping stale mappings around
86 * is a bad idea also, in case the page changes cacheability
87 * attributes or becomes a protected page in a hypervisor.
88 */
68 kpte_clear_flush(kmap_pte-idx, vaddr); 89 kpte_clear_flush(kmap_pte-idx, vaddr);
69 else { 90 kmap_atomic_idx_pop();
91 }
70#ifdef CONFIG_DEBUG_HIGHMEM 92#ifdef CONFIG_DEBUG_HIGHMEM
93 else {
71 BUG_ON(vaddr < PAGE_OFFSET); 94 BUG_ON(vaddr < PAGE_OFFSET);
72 BUG_ON(vaddr >= (unsigned long)high_memory); 95 BUG_ON(vaddr >= (unsigned long)high_memory);
73#endif
74 } 96 }
97#endif
75 98
76 pagefault_enable(); 99 pagefault_enable();
77} 100}
78 101EXPORT_SYMBOL(__kunmap_atomic);
79/*
80 * This is the same as kmap_atomic() but can map memory that doesn't
81 * have a struct page associated with it.
82 */
83void *kmap_atomic_pfn(unsigned long pfn, enum km_type type)
84{
85 return kmap_atomic_prot_pfn(pfn, type, kmap_prot);
86}
87EXPORT_SYMBOL_GPL(kmap_atomic_pfn); /* temporarily in use by i915 GEM until vmap */
88 102
89struct page *kmap_atomic_to_page(void *ptr) 103struct page *kmap_atomic_to_page(void *ptr)
90{ 104{
@@ -98,12 +112,6 @@ struct page *kmap_atomic_to_page(void *ptr)
98 pte = kmap_pte - (idx - FIX_KMAP_BEGIN); 112 pte = kmap_pte - (idx - FIX_KMAP_BEGIN);
99 return pte_page(*pte); 113 return pte_page(*pte);
100} 114}
101
102EXPORT_SYMBOL(kmap);
103EXPORT_SYMBOL(kunmap);
104EXPORT_SYMBOL(kmap_atomic);
105EXPORT_SYMBOL(kunmap_atomic_notypecheck);
106EXPORT_SYMBOL(kmap_atomic_prot);
107EXPORT_SYMBOL(kmap_atomic_to_page); 115EXPORT_SYMBOL(kmap_atomic_to_page);
108 116
109void __init set_highmem_pages_init(void) 117void __init set_highmem_pages_init(void)
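The highmem_32.c conversion drops the caller-supplied km_type and instead keeps a small per-CPU stack of fixmap slot indices behind kmap_atomic_idx_push()/kmap_atomic_idx(). A minimal single-CPU model of that slot stack; the depth and the helper bodies below are stand-ins, not the kernel's implementation.

    #include <assert.h>
    #include <stdio.h>

    #define KM_TYPE_NR 8    /* stand-in for the kernel's per-CPU slot count */

    static int kmap_idx;    /* would be per-CPU data in the kernel */

    static int kmap_atomic_idx_push(void)
    {
        assert(kmap_idx < KM_TYPE_NR);
        return kmap_idx++;
    }

    static int kmap_atomic_idx(void)
    {
        return kmap_idx - 1;
    }

    static void kmap_atomic_idx_pop(void)
    {
        assert(kmap_idx > 0);
        kmap_idx--;
    }

    int main(void)
    {
        /* nested atomic mappings simply take the next free slot ... */
        int a = kmap_atomic_idx_push();
        int b = kmap_atomic_idx_push();

        printf("outer slot %d, nested slot %d\n", a, b);

        /* ... and must be released in reverse order */
        assert(kmap_atomic_idx() == b);
        kmap_atomic_idx_pop();
        assert(kmap_atomic_idx() == a);
        kmap_atomic_idx_pop();
        return 0;
    }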
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index 069ce7c37c01..f581a18c0d4d 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -72,7 +72,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
72 if (!vma_shareable(vma, addr)) 72 if (!vma_shareable(vma, addr))
73 return; 73 return;
74 74
75 spin_lock(&mapping->i_mmap_lock); 75 mutex_lock(&mapping->i_mmap_mutex);
76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) { 76 vma_prio_tree_foreach(svma, &iter, &mapping->i_mmap, idx, idx) {
77 if (svma == vma) 77 if (svma == vma)
78 continue; 78 continue;
@@ -97,7 +97,7 @@ static void huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
97 put_page(virt_to_page(spte)); 97 put_page(virt_to_page(spte));
98 spin_unlock(&mm->page_table_lock); 98 spin_unlock(&mm->page_table_lock);
99out: 99out:
100 spin_unlock(&mapping->i_mmap_lock); 100 mutex_unlock(&mapping->i_mmap_mutex);
101} 101}
102 102
103/* 103/*
@@ -326,7 +326,7 @@ try_again:
326 if (mm->free_area_cache < len) 326 if (mm->free_area_cache < len)
327 goto fail; 327 goto fail;
328 328
329 /* either no address requested or cant fit in requested address hole */ 329 /* either no address requested or can't fit in requested address hole */
330 addr = (mm->free_area_cache - len) & huge_page_mask(h); 330 addr = (mm->free_area_cache - len) & huge_page_mask(h);
331 do { 331 do {
332 /* 332 /*
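The second hugetlbpage.c hunk only touches a comment, but the line under it is the interesting bit: the candidate address is the cached top of the search area minus the request length, rounded down to a huge-page boundary. A quick stand-alone illustration of that rounding, assuming 2 MB huge pages and a made-up cache value:

    #include <stdio.h>

    #define HPAGE_SIZE (2UL * 1024 * 1024)      /* assume 2 MB huge pages */
    #define HPAGE_MASK (~(HPAGE_SIZE - 1))

    int main(void)
    {
        unsigned long free_area_cache = 0x7f53a1c00000UL; /* invented cache value */
        unsigned long len = 3 * HPAGE_SIZE + 4096;        /* not huge-page aligned */

        /* either no address requested or can't fit in requested address hole */
        unsigned long addr = (free_area_cache - len) & HPAGE_MASK;

        printf("cache %#lx, len %#lx -> candidate %#lx\n",
               free_area_cache, len, addr);
        return 0;
    }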
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index b278535b14aa..30326443ab81 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -2,6 +2,7 @@
2#include <linux/initrd.h> 2#include <linux/initrd.h>
3#include <linux/ioport.h> 3#include <linux/ioport.h>
4#include <linux/swap.h> 4#include <linux/swap.h>
5#include <linux/memblock.h>
5 6
6#include <asm/cacheflush.h> 7#include <asm/cacheflush.h>
7#include <asm/e820.h> 8#include <asm/e820.h>
@@ -15,11 +16,9 @@
15#include <asm/tlb.h> 16#include <asm/tlb.h>
16#include <asm/proto.h> 17#include <asm/proto.h>
17 18
18DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); 19unsigned long __initdata pgt_buf_start;
19 20unsigned long __meminitdata pgt_buf_end;
20unsigned long __initdata e820_table_start; 21unsigned long __meminitdata pgt_buf_top;
21unsigned long __meminitdata e820_table_end;
22unsigned long __meminitdata e820_table_top;
23 22
24int after_bootmem; 23int after_bootmem;
25 24
@@ -32,7 +31,8 @@ int direct_gbpages
32static void __init find_early_table_space(unsigned long end, int use_pse, 31static void __init find_early_table_space(unsigned long end, int use_pse,
33 int use_gbpages) 32 int use_gbpages)
34{ 33{
35 unsigned long puds, pmds, ptes, tables, start; 34 unsigned long puds, pmds, ptes, tables, start = 0, good_end = end;
35 phys_addr_t base;
36 36
37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT; 37 puds = (end + PUD_SIZE - 1) >> PUD_SHIFT;
38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE); 38 tables = roundup(puds * sizeof(pud_t), PAGE_SIZE);
@@ -63,29 +63,25 @@ static void __init find_early_table_space(unsigned long end, int use_pse,
63#ifdef CONFIG_X86_32 63#ifdef CONFIG_X86_32
64 /* for fixmap */ 64 /* for fixmap */
65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE); 65 tables += roundup(__end_of_fixed_addresses * sizeof(pte_t), PAGE_SIZE);
66#endif
67 66
68 /* 67 good_end = max_pfn_mapped << PAGE_SHIFT;
69 * RED-PEN putting page tables only on node 0 could
70 * cause a hotspot and fill up ZONE_DMA. The page tables
71 * need roughly 0.5KB per GB.
72 */
73#ifdef CONFIG_X86_32
74 start = 0x7000;
75#else
76 start = 0x8000;
77#endif 68#endif
78 e820_table_start = find_e820_area(start, max_pfn_mapped<<PAGE_SHIFT, 69
79 tables, PAGE_SIZE); 70 base = memblock_find_in_range(start, good_end, tables, PAGE_SIZE);
80 if (e820_table_start == -1UL) 71 if (base == MEMBLOCK_ERROR)
81 panic("Cannot find space for the kernel page tables"); 72 panic("Cannot find space for the kernel page tables");
82 73
83 e820_table_start >>= PAGE_SHIFT; 74 pgt_buf_start = base >> PAGE_SHIFT;
84 e820_table_end = e820_table_start; 75 pgt_buf_end = pgt_buf_start;
85 e820_table_top = e820_table_start + (tables >> PAGE_SHIFT); 76 pgt_buf_top = pgt_buf_start + (tables >> PAGE_SHIFT);
86 77
87 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n", 78 printk(KERN_DEBUG "kernel direct mapping tables up to %lx @ %lx-%lx\n",
88 end, e820_table_start << PAGE_SHIFT, e820_table_top << PAGE_SHIFT); 79 end, pgt_buf_start << PAGE_SHIFT, pgt_buf_top << PAGE_SHIFT);
80}
81
82void __init native_pagetable_reserve(u64 start, u64 end)
83{
84 memblock_x86_reserve_range(start, end, "PGTABLE");
89} 85}
90 86
91struct map_range { 87struct map_range {
@@ -277,30 +273,26 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
277 load_cr3(swapper_pg_dir); 273 load_cr3(swapper_pg_dir);
278#endif 274#endif
279 275
280#ifdef CONFIG_X86_64
281 if (!after_bootmem && !start) {
282 pud_t *pud;
283 pmd_t *pmd;
284
285 mmu_cr4_features = read_cr4();
286
287 /*
288 * _brk_end cannot change anymore, but it and _end may be
289 * located on different 2M pages. cleanup_highmap(), however,
290 * can only consider _end when it runs, so destroy any
291 * mappings beyond _brk_end here.
292 */
293 pud = pud_offset(pgd_offset_k(_brk_end), _brk_end);
294 pmd = pmd_offset(pud, _brk_end - 1);
295 while (++pmd <= pmd_offset(pud, (unsigned long)_end - 1))
296 pmd_clear(pmd);
297 }
298#endif
299 __flush_tlb_all(); 276 __flush_tlb_all();
300 277
301 if (!after_bootmem && e820_table_end > e820_table_start) 278 /*
302 reserve_early(e820_table_start << PAGE_SHIFT, 279 * Reserve the kernel pagetable pages we used (pgt_buf_start -
303 e820_table_end << PAGE_SHIFT, "PGTABLE"); 280 * pgt_buf_end) and free the other ones (pgt_buf_end - pgt_buf_top)
281 * so that they can be reused for other purposes.
282 *
283 * On native it just means calling memblock_x86_reserve_range, on Xen it
284 * also means marking RW the pagetable pages that we allocated before
285 * but that haven't been used.
286 *
287 * In fact, on Xen we mark RO the whole range pgt_buf_start -
288 * pgt_buf_top, because we have to make sure that when
289 * init_memory_mapping reaches the pagetable pages area, it maps
290 * RO all the pagetable pages, including the ones that are beyond
291 * pgt_buf_end at that time.
292 */
293 if (!after_bootmem && pgt_buf_end > pgt_buf_start)
294 x86_init.mapping.pagetable_reserve(PFN_PHYS(pgt_buf_start),
295 PFN_PHYS(pgt_buf_end));
304 296
305 if (!after_bootmem) 297 if (!after_bootmem)
306 early_memtest(start, end); 298 early_memtest(start, end);
@@ -362,8 +354,9 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
362 /* 354 /*
363 * We just marked the kernel text read only above, now that 355 * We just marked the kernel text read only above, now that
364 * we are going to free part of that, we need to make that 356 * we are going to free part of that, we need to make that
365 * writeable first. 357 * writeable and non-executable first.
366 */ 358 */
359 set_memory_nx(begin, (end - begin) >> PAGE_SHIFT);
367 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT); 360 set_memory_rw(begin, (end - begin) >> PAGE_SHIFT);
368 361
369 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10); 362 printk(KERN_INFO "Freeing %s: %luk freed\n", what, (end - begin) >> 10);
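find_early_table_space() in the init.c hunk above sizes the early page-table buffer before asking memblock for a range. The sizing itself is plain arithmetic; the sketch below redoes it in user space for a worst-case 4 KB mapping, with the shift values and the 8-byte entry size hard-coded as assumptions rather than taken from the kernel headers.

    #include <stdio.h>

    #define PAGE_SHIFT 12
    #define PMD_SHIFT  21
    #define PUD_SHIFT  30
    #define PAGE_SIZE  (1ULL << PAGE_SHIFT)

    static unsigned long long roundup_page(unsigned long long x)
    {
        return (x + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
    }

    /* bytes of page tables needed to map [0, end) with 4 KB pages only */
    static unsigned long long table_space(unsigned long long end)
    {
        unsigned long long puds = (end + (1ULL << PUD_SHIFT) - 1) >> PUD_SHIFT;
        unsigned long long pmds = (end + (1ULL << PMD_SHIFT) - 1) >> PMD_SHIFT;
        unsigned long long ptes = (end + PAGE_SIZE - 1) >> PAGE_SHIFT;

        return roundup_page(puds * 8) +     /* assume 8 bytes per entry */
               roundup_page(pmds * 8) +
               roundup_page(ptes * 8);
    }

    int main(void)
    {
        unsigned long long end = 4ULL << 30;        /* map the first 4 GB */

        printf("page tables for 4 GB: %llu KB\n", table_space(end) >> 10);
        return 0;
    }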
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index bca79091b9d6..29f7c6d98179 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,6 +25,7 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/memblock.h>
28#include <linux/proc_fs.h> 29#include <linux/proc_fs.h>
29#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
30#include <linux/initrd.h> 31#include <linux/initrd.h>
@@ -44,6 +45,7 @@
44#include <asm/bugs.h> 45#include <asm/bugs.h>
45#include <asm/tlb.h> 46#include <asm/tlb.h>
46#include <asm/tlbflush.h> 47#include <asm/tlbflush.h>
48#include <asm/olpc_ofw.h>
47#include <asm/pgalloc.h> 49#include <asm/pgalloc.h>
48#include <asm/sections.h> 50#include <asm/sections.h>
49#include <asm/paravirt.h> 51#include <asm/paravirt.h>
@@ -60,14 +62,14 @@ bool __read_mostly __vmalloc_start_set = false;
60 62
61static __init void *alloc_low_page(void) 63static __init void *alloc_low_page(void)
62{ 64{
63 unsigned long pfn = e820_table_end++; 65 unsigned long pfn = pgt_buf_end++;
64 void *adr; 66 void *adr;
65 67
66 if (pfn >= e820_table_top) 68 if (pfn >= pgt_buf_top)
67 panic("alloc_low_page: ran out of memory"); 69 panic("alloc_low_page: ran out of memory");
68 70
69 adr = __va(pfn * PAGE_SIZE); 71 adr = __va(pfn * PAGE_SIZE);
70 memset(adr, 0, PAGE_SIZE); 72 clear_page(adr);
71 return adr; 73 return adr;
72} 74}
73 75
@@ -161,8 +163,8 @@ static pte_t *__init page_table_kmap_check(pte_t *pte, pmd_t *pmd,
161 if (pmd_idx_kmap_begin != pmd_idx_kmap_end 163 if (pmd_idx_kmap_begin != pmd_idx_kmap_end
162 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin 164 && (vaddr >> PMD_SHIFT) >= pmd_idx_kmap_begin
163 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end 165 && (vaddr >> PMD_SHIFT) <= pmd_idx_kmap_end
164 && ((__pa(pte) >> PAGE_SHIFT) < e820_table_start 166 && ((__pa(pte) >> PAGE_SHIFT) < pgt_buf_start
165 || (__pa(pte) >> PAGE_SHIFT) >= e820_table_end)) { 167 || (__pa(pte) >> PAGE_SHIFT) >= pgt_buf_end)) {
166 pte_t *newpte; 168 pte_t *newpte;
167 int i; 169 int i;
168 170
@@ -225,7 +227,7 @@ page_table_range_init(unsigned long start, unsigned long end, pgd_t *pgd_base)
225 227
226static inline int is_kernel_text(unsigned long addr) 228static inline int is_kernel_text(unsigned long addr)
227{ 229{
228 if (addr >= PAGE_OFFSET && addr <= (unsigned long)__init_end) 230 if (addr >= (unsigned long)_text && addr <= (unsigned long)__init_end)
229 return 1; 231 return 1;
230 return 0; 232 return 0;
231} 233}
@@ -422,49 +424,28 @@ static void __init add_one_highpage_init(struct page *page)
422 totalhigh_pages++; 424 totalhigh_pages++;
423} 425}
424 426
425struct add_highpages_data { 427void __init add_highpages_with_active_regions(int nid,
426 unsigned long start_pfn; 428 unsigned long start_pfn, unsigned long end_pfn)
427 unsigned long end_pfn;
428};
429
430static int __init add_highpages_work_fn(unsigned long start_pfn,
431 unsigned long end_pfn, void *datax)
432{ 429{
433 int node_pfn; 430 struct range *range;
434 struct page *page; 431 int nr_range;
435 unsigned long final_start_pfn, final_end_pfn; 432 int i;
436 struct add_highpages_data *data;
437 433
438 data = (struct add_highpages_data *)datax; 434 nr_range = __get_free_all_memory_range(&range, nid, start_pfn, end_pfn);
439 435
440 final_start_pfn = max(start_pfn, data->start_pfn); 436 for (i = 0; i < nr_range; i++) {
441 final_end_pfn = min(end_pfn, data->end_pfn); 437 struct page *page;
442 if (final_start_pfn >= final_end_pfn) 438 int node_pfn;
443 return 0;
444 439
445 for (node_pfn = final_start_pfn; node_pfn < final_end_pfn; 440 for (node_pfn = range[i].start; node_pfn < range[i].end;
446 node_pfn++) { 441 node_pfn++) {
447 if (!pfn_valid(node_pfn)) 442 if (!pfn_valid(node_pfn))
448 continue; 443 continue;
449 page = pfn_to_page(node_pfn); 444 page = pfn_to_page(node_pfn);
450 add_one_highpage_init(page); 445 add_one_highpage_init(page);
446 }
451 } 447 }
452
453 return 0;
454
455} 448}
456
457void __init add_highpages_with_active_regions(int nid, unsigned long start_pfn,
458 unsigned long end_pfn)
459{
460 struct add_highpages_data data;
461
462 data.start_pfn = start_pfn;
463 data.end_pfn = end_pfn;
464
465 work_with_active_regions(nid, add_highpages_work_fn, &data);
466}
467
468#else 449#else
469static inline void permanent_kmaps_init(pgd_t *pgd_base) 450static inline void permanent_kmaps_init(pgd_t *pgd_base)
470{ 451{
@@ -548,48 +529,6 @@ static void __init pagetable_init(void)
548 permanent_kmaps_init(pgd_base); 529 permanent_kmaps_init(pgd_base);
549} 530}
550 531
551#ifdef CONFIG_ACPI_SLEEP
552/*
553 * ACPI suspend needs this for resume, because things like the intel-agp
554 * driver might have split up a kernel 4MB mapping.
555 */
556char swsusp_pg_dir[PAGE_SIZE]
557 __attribute__ ((aligned(PAGE_SIZE)));
558
559static inline void save_pg_dir(void)
560{
561 memcpy(swsusp_pg_dir, swapper_pg_dir, PAGE_SIZE);
562}
563#else /* !CONFIG_ACPI_SLEEP */
564static inline void save_pg_dir(void)
565{
566}
567#endif /* !CONFIG_ACPI_SLEEP */
568
569void zap_low_mappings(bool early)
570{
571 int i;
572
573 /*
574 * Zap initial low-memory mappings.
575 *
576 * Note that "pgd_clear()" doesn't do it for
577 * us, because pgd_clear() is a no-op on i386.
578 */
579 for (i = 0; i < KERNEL_PGD_BOUNDARY; i++) {
580#ifdef CONFIG_X86_PAE
581 set_pgd(swapper_pg_dir+i, __pgd(1 + __pa(empty_zero_page)));
582#else
583 set_pgd(swapper_pg_dir+i, __pgd(0));
584#endif
585 }
586
587 if (early)
588 __flush_tlb();
589 else
590 flush_tlb_all();
591}
592
593pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); 532pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
594EXPORT_SYMBOL_GPL(__supported_pte_mask); 533EXPORT_SYMBOL_GPL(__supported_pte_mask);
595 534
@@ -705,21 +644,20 @@ void __init find_low_pfn_range(void)
705} 644}
706 645
707#ifndef CONFIG_NEED_MULTIPLE_NODES 646#ifndef CONFIG_NEED_MULTIPLE_NODES
708void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 647void __init initmem_init(void)
709 int acpi, int k8)
710{ 648{
711#ifdef CONFIG_HIGHMEM 649#ifdef CONFIG_HIGHMEM
712 highstart_pfn = highend_pfn = max_pfn; 650 highstart_pfn = highend_pfn = max_pfn;
713 if (max_pfn > max_low_pfn) 651 if (max_pfn > max_low_pfn)
714 highstart_pfn = max_low_pfn; 652 highstart_pfn = max_low_pfn;
715 e820_register_active_regions(0, 0, highend_pfn); 653 memblock_x86_register_active_regions(0, 0, highend_pfn);
716 sparse_memory_present_with_active_regions(0); 654 sparse_memory_present_with_active_regions(0);
717 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", 655 printk(KERN_NOTICE "%ldMB HIGHMEM available.\n",
718 pages_to_mb(highend_pfn - highstart_pfn)); 656 pages_to_mb(highend_pfn - highstart_pfn));
719 num_physpages = highend_pfn; 657 num_physpages = highend_pfn;
720 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; 658 high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1;
721#else 659#else
722 e820_register_active_regions(0, 0, max_low_pfn); 660 memblock_x86_register_active_regions(0, 0, max_low_pfn);
723 sparse_memory_present_with_active_regions(0); 661 sparse_memory_present_with_active_regions(0);
724 num_physpages = max_low_pfn; 662 num_physpages = max_low_pfn;
725 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; 663 high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1;
@@ -740,8 +678,10 @@ static void __init zone_sizes_init(void)
740{ 678{
741 unsigned long max_zone_pfns[MAX_NR_ZONES]; 679 unsigned long max_zone_pfns[MAX_NR_ZONES];
742 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 680 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
681#ifdef CONFIG_ZONE_DMA
743 max_zone_pfns[ZONE_DMA] = 682 max_zone_pfns[ZONE_DMA] =
744 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT; 683 virt_to_phys((char *)MAX_DMA_ADDRESS) >> PAGE_SHIFT;
684#endif
745 max_zone_pfns[ZONE_NORMAL] = max_low_pfn; 685 max_zone_pfns[ZONE_NORMAL] = max_low_pfn;
746#ifdef CONFIG_HIGHMEM 686#ifdef CONFIG_HIGHMEM
747 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn; 687 max_zone_pfns[ZONE_HIGHMEM] = highend_pfn;
@@ -750,68 +690,12 @@ static void __init zone_sizes_init(void)
750 free_area_init_nodes(max_zone_pfns); 690 free_area_init_nodes(max_zone_pfns);
751} 691}
752 692
753#ifndef CONFIG_NO_BOOTMEM
754static unsigned long __init setup_node_bootmem(int nodeid,
755 unsigned long start_pfn,
756 unsigned long end_pfn,
757 unsigned long bootmap)
758{
759 unsigned long bootmap_size;
760
761 /* don't touch min_low_pfn */
762 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
763 bootmap >> PAGE_SHIFT,
764 start_pfn, end_pfn);
765 printk(KERN_INFO " node %d low ram: %08lx - %08lx\n",
766 nodeid, start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
768 nodeid, bootmap, bootmap + bootmap_size);
769 free_bootmem_with_active_regions(nodeid, end_pfn);
770
771 return bootmap + bootmap_size;
772}
773#endif
774
775void __init setup_bootmem_allocator(void) 693void __init setup_bootmem_allocator(void)
776{ 694{
777#ifndef CONFIG_NO_BOOTMEM
778 int nodeid;
779 unsigned long bootmap_size, bootmap;
780 /*
781 * Initialize the boot-time allocator (with low memory only):
782 */
783 bootmap_size = bootmem_bootmap_pages(max_low_pfn)<<PAGE_SHIFT;
784 bootmap = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, bootmap_size,
785 PAGE_SIZE);
786 if (bootmap == -1L)
787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
790
791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 695 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
792 max_pfn_mapped<<PAGE_SHIFT); 696 max_pfn_mapped<<PAGE_SHIFT);
793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 697 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
794 698
795#ifndef CONFIG_NO_BOOTMEM
796 for_each_online_node(nodeid) {
797 unsigned long start_pfn, end_pfn;
798
799#ifdef CONFIG_NEED_MULTIPLE_NODES
800 start_pfn = node_start_pfn[nodeid];
801 end_pfn = node_end_pfn[nodeid];
802 if (start_pfn > max_low_pfn)
803 continue;
804 if (end_pfn > max_low_pfn)
805 end_pfn = max_low_pfn;
806#else
807 start_pfn = 0;
808 end_pfn = max_low_pfn;
809#endif
810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
811 bootmap);
812 }
813#endif
814
815 after_bootmem = 1; 699 after_bootmem = 1;
816} 700}
817 701
@@ -833,6 +717,8 @@ void __init paging_init(void)
833 /* 717 /*
834 * NOTE: at this point the bootmem allocator is fully available. 718 * NOTE: at this point the bootmem allocator is fully available.
835 */ 719 */
720 olpc_dt_build_devicetree();
721 sparse_memory_present_with_active_regions(MAX_NUMNODES);
836 sparse_init(); 722 sparse_init();
837 zone_sizes_init(); 723 zone_sizes_init();
838} 724}
@@ -958,9 +844,6 @@ void __init mem_init(void)
958 844
959 if (boot_cpu_data.wp_works_ok < 0) 845 if (boot_cpu_data.wp_works_ok < 0)
960 test_wp_bit(); 846 test_wp_bit();
961
962 save_pg_dir();
963 zap_low_mappings(true);
964} 847}
965 848
966#ifdef CONFIG_MEMORY_HOTPLUG 849#ifdef CONFIG_MEMORY_HOTPLUG
@@ -1033,6 +916,23 @@ void set_kernel_text_ro(void)
1033 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 916 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1034} 917}
1035 918
919static void mark_nxdata_nx(void)
920{
921 /*
922 * When this is called, init has already been executed and released,
923 * so everything past _etext should be NX.
924 */
925 unsigned long start = PFN_ALIGN(_etext);
926 /*
927 * This comes from the is_kernel_text() upper limit, rounded up to HPAGE_SIZE where huge pages are used:
928 */
929 unsigned long size = (((unsigned long)__init_end + HPAGE_SIZE) & HPAGE_MASK) - start;
930
931 if (__supported_pte_mask & _PAGE_NX)
932 printk(KERN_INFO "NX-protecting the kernel data: %luk\n", size >> 10);
933 set_pages_nx(virt_to_page(start), size >> PAGE_SHIFT);
934}
935
1036void mark_rodata_ro(void) 936void mark_rodata_ro(void)
1037{ 937{
1038 unsigned long start = PFN_ALIGN(_text); 938 unsigned long start = PFN_ALIGN(_text);
@@ -1067,11 +967,7 @@ void mark_rodata_ro(void)
1067 printk(KERN_INFO "Testing CPA: write protecting again\n"); 967 printk(KERN_INFO "Testing CPA: write protecting again\n");
1068 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); 968 set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT);
1069#endif 969#endif
970 mark_nxdata_nx();
1070} 971}
1071#endif 972#endif
1072 973
1073int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
1074 int flags)
1075{
1076 return reserve_bootmem(phys, len, flags);
1077}
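mark_nxdata_nx(), added in the init_32.c hunk above, marks everything from the page-aligned end of text up to __init_end rounded to the next huge page as non-executable. Here is the range arithmetic pulled out into a compilable sketch; the link addresses and the 2 MB huge-page size are invented for illustration.

    #include <stdio.h>

    #define PAGE_SIZE  4096UL
    #define HPAGE_SIZE (2UL * 1024 * 1024)
    #define HPAGE_MASK (~(HPAGE_SIZE - 1))

    static unsigned long pfn_align(unsigned long addr)
    {
        return (addr + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);
    }

    int main(void)
    {
        /* made-up link addresses standing in for _etext and __init_end */
        unsigned long etext    = 0xc1400123UL;
        unsigned long init_end = 0xc16a0000UL;

        unsigned long start = pfn_align(etext);
        unsigned long size  = ((init_end + HPAGE_SIZE) & HPAGE_MASK) - start;

        printf("NX range starts at %#lx, %lu KB (%lu pages)\n",
               start, size >> 10, size / PAGE_SIZE);
        return 0;
    }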
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 9a6674689a20..bbaaa005bf0e 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -21,12 +21,14 @@
21#include <linux/initrd.h> 21#include <linux/initrd.h>
22#include <linux/pagemap.h> 22#include <linux/pagemap.h>
23#include <linux/bootmem.h> 23#include <linux/bootmem.h>
24#include <linux/memblock.h>
24#include <linux/proc_fs.h> 25#include <linux/proc_fs.h>
25#include <linux/pci.h> 26#include <linux/pci.h>
26#include <linux/pfn.h> 27#include <linux/pfn.h>
27#include <linux/poison.h> 28#include <linux/poison.h>
28#include <linux/dma-mapping.h> 29#include <linux/dma-mapping.h>
29#include <linux/module.h> 30#include <linux/module.h>
31#include <linux/memory.h>
30#include <linux/memory_hotplug.h> 32#include <linux/memory_hotplug.h>
31#include <linux/nmi.h> 33#include <linux/nmi.h>
32#include <linux/gfp.h> 34#include <linux/gfp.h>
@@ -50,9 +52,8 @@
50#include <asm/numa.h> 52#include <asm/numa.h>
51#include <asm/cacheflush.h> 53#include <asm/cacheflush.h>
52#include <asm/init.h> 54#include <asm/init.h>
53#include <linux/bootmem.h> 55#include <asm/uv/uv.h>
54 56#include <asm/setup.h>
55static unsigned long dma_reserve __initdata;
56 57
57static int __init parse_direct_gbpages_off(char *arg) 58static int __init parse_direct_gbpages_off(char *arg)
58{ 59{
@@ -98,6 +99,43 @@ static int __init nonx32_setup(char *str)
98__setup("noexec32=", nonx32_setup); 99__setup("noexec32=", nonx32_setup);
99 100
100/* 101/*
102 * When memory is added or removed, make sure every process's MM has
103 * suitable PGD entries in its local PGD-level page.
104 */
105void sync_global_pgds(unsigned long start, unsigned long end)
106{
107 unsigned long address;
108
109 for (address = start; address <= end; address += PGDIR_SIZE) {
110 const pgd_t *pgd_ref = pgd_offset_k(address);
111 struct page *page;
112
113 if (pgd_none(*pgd_ref))
114 continue;
115
116 spin_lock(&pgd_lock);
117 list_for_each_entry(page, &pgd_list, lru) {
118 pgd_t *pgd;
119 spinlock_t *pgt_lock;
120
121 pgd = (pgd_t *)page_address(page) + pgd_index(address);
122 /* the pgt_lock is only needed for Xen */
123 pgt_lock = &pgd_page_get_mm(page)->page_table_lock;
124 spin_lock(pgt_lock);
125
126 if (pgd_none(*pgd))
127 set_pgd(pgd, *pgd_ref);
128 else
129 BUG_ON(pgd_page_vaddr(*pgd)
130 != pgd_page_vaddr(*pgd_ref));
131
132 spin_unlock(pgt_lock);
133 }
134 spin_unlock(&pgd_lock);
135 }
136}
137
138/*
101 * NOTE: This function is marked __ref because it calls __init function 139 * NOTE: This function is marked __ref because it calls __init function
102 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0. 140 * (alloc_bootmem_pages). It's safe to do it ONLY when after_bootmem == 0.
103 */ 141 */
@@ -258,18 +296,18 @@ void __init init_extra_mapping_uc(unsigned long phys, unsigned long size)
258 * to the compile time generated pmds. This results in invalid pmds up 296 * to the compile time generated pmds. This results in invalid pmds up
259 * to the point where we hit the physaddr 0 mapping. 297 * to the point where we hit the physaddr 0 mapping.
260 * 298 *
261 * We limit the mappings to the region from _text to _end. _end is 299 * We limit the mappings to the region from _text to _brk_end. _brk_end
262 * rounded up to the 2MB boundary. This catches the invalid pmds as 300 * is rounded up to the 2MB boundary. This catches the invalid pmds as
263 * well, as they are located before _text: 301 * well, as they are located before _text:
264 */ 302 */
265void __init cleanup_highmap(void) 303void __init cleanup_highmap(void)
266{ 304{
267 unsigned long vaddr = __START_KERNEL_map; 305 unsigned long vaddr = __START_KERNEL_map;
268 unsigned long end = roundup((unsigned long)_end, PMD_SIZE) - 1; 306 unsigned long vaddr_end = __START_KERNEL_map + (max_pfn_mapped << PAGE_SHIFT);
307 unsigned long end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1;
269 pmd_t *pmd = level2_kernel_pgt; 308 pmd_t *pmd = level2_kernel_pgt;
270 pmd_t *last_pmd = pmd + PTRS_PER_PMD;
271 309
272 for (; pmd < last_pmd; pmd++, vaddr += PMD_SIZE) { 310 for (; vaddr + PMD_SIZE - 1 < vaddr_end; pmd++, vaddr += PMD_SIZE) {
273 if (pmd_none(*pmd)) 311 if (pmd_none(*pmd))
274 continue; 312 continue;
275 if (vaddr < (unsigned long) _text || vaddr > end) 313 if (vaddr < (unsigned long) _text || vaddr > end)
@@ -279,7 +317,7 @@ void __init cleanup_highmap(void)
279 317
280static __ref void *alloc_low_page(unsigned long *phys) 318static __ref void *alloc_low_page(unsigned long *phys)
281{ 319{
282 unsigned long pfn = e820_table_end++; 320 unsigned long pfn = pgt_buf_end++;
283 void *adr; 321 void *adr;
284 322
285 if (after_bootmem) { 323 if (after_bootmem) {
@@ -289,21 +327,37 @@ static __ref void *alloc_low_page(unsigned long *phys)
289 return adr; 327 return adr;
290 } 328 }
291 329
292 if (pfn >= e820_table_top) 330 if (pfn >= pgt_buf_top)
293 panic("alloc_low_page: ran out of memory"); 331 panic("alloc_low_page: ran out of memory");
294 332
295 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE); 333 adr = early_memremap(pfn * PAGE_SIZE, PAGE_SIZE);
296 memset(adr, 0, PAGE_SIZE); 334 clear_page(adr);
297 *phys = pfn * PAGE_SIZE; 335 *phys = pfn * PAGE_SIZE;
298 return adr; 336 return adr;
299} 337}
300 338
339static __ref void *map_low_page(void *virt)
340{
341 void *adr;
342 unsigned long phys, left;
343
344 if (after_bootmem)
345 return virt;
346
347 phys = __pa(virt);
348 left = phys & (PAGE_SIZE - 1);
349 adr = early_memremap(phys & PAGE_MASK, PAGE_SIZE);
350 adr = (void *)(((unsigned long)adr) | left);
351
352 return adr;
353}
354
301static __ref void unmap_low_page(void *adr) 355static __ref void unmap_low_page(void *adr)
302{ 356{
303 if (after_bootmem) 357 if (after_bootmem)
304 return; 358 return;
305 359
306 early_iounmap(adr, PAGE_SIZE); 360 early_iounmap((void *)((unsigned long)adr & PAGE_MASK), PAGE_SIZE);
307} 361}
308 362
309static unsigned long __meminit 363static unsigned long __meminit
@@ -351,15 +405,6 @@ phys_pte_init(pte_t *pte_page, unsigned long addr, unsigned long end,
351} 405}
352 406
353static unsigned long __meminit 407static unsigned long __meminit
354phys_pte_update(pmd_t *pmd, unsigned long address, unsigned long end,
355 pgprot_t prot)
356{
357 pte_t *pte = (pte_t *)pmd_page_vaddr(*pmd);
358
359 return phys_pte_init(pte, address, end, prot);
360}
361
362static unsigned long __meminit
363phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end, 408phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
364 unsigned long page_size_mask, pgprot_t prot) 409 unsigned long page_size_mask, pgprot_t prot)
365{ 410{
@@ -385,8 +430,10 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
385 if (pmd_val(*pmd)) { 430 if (pmd_val(*pmd)) {
386 if (!pmd_large(*pmd)) { 431 if (!pmd_large(*pmd)) {
387 spin_lock(&init_mm.page_table_lock); 432 spin_lock(&init_mm.page_table_lock);
388 last_map_addr = phys_pte_update(pmd, address, 433 pte = map_low_page((pte_t *)pmd_page_vaddr(*pmd));
434 last_map_addr = phys_pte_init(pte, address,
389 end, prot); 435 end, prot);
436 unmap_low_page(pte);
390 spin_unlock(&init_mm.page_table_lock); 437 spin_unlock(&init_mm.page_table_lock);
391 continue; 438 continue;
392 } 439 }
@@ -433,18 +480,6 @@ phys_pmd_init(pmd_t *pmd_page, unsigned long address, unsigned long end,
433} 480}
434 481
435static unsigned long __meminit 482static unsigned long __meminit
436phys_pmd_update(pud_t *pud, unsigned long address, unsigned long end,
437 unsigned long page_size_mask, pgprot_t prot)
438{
439 pmd_t *pmd = pmd_offset(pud, 0);
440 unsigned long last_map_addr;
441
442 last_map_addr = phys_pmd_init(pmd, address, end, page_size_mask, prot);
443 __flush_tlb_all();
444 return last_map_addr;
445}
446
447static unsigned long __meminit
448phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end, 483phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
449 unsigned long page_size_mask) 484 unsigned long page_size_mask)
450{ 485{
@@ -469,8 +504,11 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
469 504
470 if (pud_val(*pud)) { 505 if (pud_val(*pud)) {
471 if (!pud_large(*pud)) { 506 if (!pud_large(*pud)) {
472 last_map_addr = phys_pmd_update(pud, addr, end, 507 pmd = map_low_page(pmd_offset(pud, 0));
508 last_map_addr = phys_pmd_init(pmd, addr, end,
473 page_size_mask, prot); 509 page_size_mask, prot);
510 unmap_low_page(pmd);
511 __flush_tlb_all();
474 continue; 512 continue;
475 } 513 }
476 /* 514 /*
@@ -518,27 +556,18 @@ phys_pud_init(pud_t *pud_page, unsigned long addr, unsigned long end,
518 return last_map_addr; 556 return last_map_addr;
519} 557}
520 558
521static unsigned long __meminit
522phys_pud_update(pgd_t *pgd, unsigned long addr, unsigned long end,
523 unsigned long page_size_mask)
524{
525 pud_t *pud;
526
527 pud = (pud_t *)pgd_page_vaddr(*pgd);
528
529 return phys_pud_init(pud, addr, end, page_size_mask);
530}
531
532unsigned long __meminit 559unsigned long __meminit
533kernel_physical_mapping_init(unsigned long start, 560kernel_physical_mapping_init(unsigned long start,
534 unsigned long end, 561 unsigned long end,
535 unsigned long page_size_mask) 562 unsigned long page_size_mask)
536{ 563{
537 564 bool pgd_changed = false;
538 unsigned long next, last_map_addr = end; 565 unsigned long next, last_map_addr = end;
566 unsigned long addr;
539 567
540 start = (unsigned long)__va(start); 568 start = (unsigned long)__va(start);
541 end = (unsigned long)__va(end); 569 end = (unsigned long)__va(end);
570 addr = start;
542 571
543 for (; start < end; start = next) { 572 for (; start < end; start = next) {
544 pgd_t *pgd = pgd_offset_k(start); 573 pgd_t *pgd = pgd_offset_k(start);
@@ -550,8 +579,10 @@ kernel_physical_mapping_init(unsigned long start,
550 next = end; 579 next = end;
551 580
552 if (pgd_val(*pgd)) { 581 if (pgd_val(*pgd)) {
553 last_map_addr = phys_pud_update(pgd, __pa(start), 582 pud = map_low_page((pud_t *)pgd_page_vaddr(*pgd));
583 last_map_addr = phys_pud_init(pud, __pa(start),
554 __pa(end), page_size_mask); 584 __pa(end), page_size_mask);
585 unmap_low_page(pud);
555 continue; 586 continue;
556 } 587 }
557 588
@@ -563,33 +594,21 @@ kernel_physical_mapping_init(unsigned long start,
563 spin_lock(&init_mm.page_table_lock); 594 spin_lock(&init_mm.page_table_lock);
564 pgd_populate(&init_mm, pgd, __va(pud_phys)); 595 pgd_populate(&init_mm, pgd, __va(pud_phys));
565 spin_unlock(&init_mm.page_table_lock); 596 spin_unlock(&init_mm.page_table_lock);
597 pgd_changed = true;
566 } 598 }
599
600 if (pgd_changed)
601 sync_global_pgds(addr, end);
602
567 __flush_tlb_all(); 603 __flush_tlb_all();
568 604
569 return last_map_addr; 605 return last_map_addr;
570} 606}
571 607
572#ifndef CONFIG_NUMA 608#ifndef CONFIG_NUMA
573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn, 609void __init initmem_init(void)
574 int acpi, int k8) 610{
575{ 611 memblock_x86_register_active_regions(0, 0, max_pfn);
576#ifndef CONFIG_NO_BOOTMEM
577 unsigned long bootmap_size, bootmap;
578
579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
580 bootmap = find_e820_area(0, end_pfn<<PAGE_SHIFT, bootmap_size,
581 PAGE_SIZE);
582 if (bootmap == -1L)
583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
585 /* don't touch min_low_pfn */
586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
587 0, end_pfn);
588 e820_register_active_regions(0, start_pfn, end_pfn);
589 free_bootmem_with_active_regions(0, end_pfn);
590#else
591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
593} 612}
594#endif 613#endif
595 614
@@ -598,7 +617,9 @@ void __init paging_init(void)
598 unsigned long max_zone_pfns[MAX_NR_ZONES]; 617 unsigned long max_zone_pfns[MAX_NR_ZONES];
599 618
600 memset(max_zone_pfns, 0, sizeof(max_zone_pfns)); 619 memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
620#ifdef CONFIG_ZONE_DMA
601 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; 621 max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN;
622#endif
602 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; 623 max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN;
603 max_zone_pfns[ZONE_NORMAL] = max_pfn; 624 max_zone_pfns[ZONE_NORMAL] = max_pfn;
604 625
@@ -661,14 +682,6 @@ int arch_add_memory(int nid, u64 start, u64 size)
661} 682}
662EXPORT_SYMBOL_GPL(arch_add_memory); 683EXPORT_SYMBOL_GPL(arch_add_memory);
663 684
664#if !defined(CONFIG_ACPI_NUMA) && defined(CONFIG_NUMA)
665int memory_add_physaddr_to_nid(u64 start)
666{
667 return 0;
668}
669EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
670#endif
671
672#endif /* CONFIG_MEMORY_HOTPLUG */ 685#endif /* CONFIG_MEMORY_HOTPLUG */
673 686
674static struct kcore_list kcore_vsyscall; 687static struct kcore_list kcore_vsyscall;
@@ -799,52 +812,6 @@ void mark_rodata_ro(void)
799 812
800#endif 813#endif
801 814
802int __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
803 int flags)
804{
805#ifdef CONFIG_NUMA
806 int nid, next_nid;
807 int ret;
808#endif
809 unsigned long pfn = phys >> PAGE_SHIFT;
810
811 if (pfn >= max_pfn) {
812 /*
813 * This can happen with kdump kernels when accessing
814 * firmware tables:
815 */
816 if (pfn < max_pfn_mapped)
817 return -EFAULT;
818
819 printk(KERN_ERR "reserve_bootmem: illegal reserve %lx %lu\n",
820 phys, len);
821 return -EFAULT;
822 }
823
824 /* Should check here against the e820 map to avoid double free */
825#ifdef CONFIG_NUMA
826 nid = phys_to_nid(phys);
827 next_nid = phys_to_nid(phys + len - 1);
828 if (nid == next_nid)
829 ret = reserve_bootmem_node(NODE_DATA(nid), phys, len, flags);
830 else
831 ret = reserve_bootmem(phys, len, flags);
832
833 if (ret != 0)
834 return ret;
835
836#else
837 reserve_bootmem(phys, len, flags);
838#endif
839
840 if (phys+len <= MAX_DMA_PFN*PAGE_SIZE) {
841 dma_reserve += len / PAGE_SIZE;
842 set_dma_reserve(dma_reserve);
843 }
844
845 return 0;
846}
847
848int kern_addr_valid(unsigned long addr) 815int kern_addr_valid(unsigned long addr)
849{ 816{
850 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT; 817 unsigned long above = ((long)addr) >> __VIRTUAL_MASK_SHIFT;
@@ -890,18 +857,18 @@ static struct vm_area_struct gate_vma = {
890 .vm_flags = VM_READ | VM_EXEC 857 .vm_flags = VM_READ | VM_EXEC
891}; 858};
892 859
893struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 860struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
894{ 861{
895#ifdef CONFIG_IA32_EMULATION 862#ifdef CONFIG_IA32_EMULATION
896 if (test_tsk_thread_flag(tsk, TIF_IA32)) 863 if (!mm || mm->context.ia32_compat)
897 return NULL; 864 return NULL;
898#endif 865#endif
899 return &gate_vma; 866 return &gate_vma;
900} 867}
901 868
902int in_gate_area(struct task_struct *task, unsigned long addr) 869int in_gate_area(struct mm_struct *mm, unsigned long addr)
903{ 870{
904 struct vm_area_struct *vma = get_gate_vma(task); 871 struct vm_area_struct *vma = get_gate_vma(mm);
905 872
906 if (!vma) 873 if (!vma)
907 return 0; 874 return 0;
@@ -910,11 +877,11 @@ int in_gate_area(struct task_struct *task, unsigned long addr)
910} 877}
911 878
912/* 879/*
913 * Use this when you have no reliable task/vma, typically from interrupt 880 * Use this when you have no reliable mm, typically from interrupt
914 * context. It is less reliable than using the task's vma and may give 881 * context. It is less reliable than using a task's mm and may give
915 * false positives: 882 * false positives.
916 */ 883 */
917int in_gate_area_no_task(unsigned long addr) 884int in_gate_area_no_mm(unsigned long addr)
918{ 885{
919 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END); 886 return (addr >= VSYSCALL_START) && (addr < VSYSCALL_END);
920} 887}
@@ -928,6 +895,17 @@ const char *arch_vma_name(struct vm_area_struct *vma)
928 return NULL; 895 return NULL;
929} 896}
930 897
898#ifdef CONFIG_X86_UV
899unsigned long memory_block_size_bytes(void)
900{
901 if (is_uv_system()) {
902 printk(KERN_INFO "UV: memory block size 2GB\n");
903 return 2UL * 1024 * 1024 * 1024;
904 }
905 return MIN_MEMORY_BLOCK_SIZE;
906}
907#endif
908
931#ifdef CONFIG_SPARSEMEM_VMEMMAP 909#ifdef CONFIG_SPARSEMEM_VMEMMAP
932/* 910/*
933 * Initialise the sparsemem vmemmap using huge-pages at the PMD level. 911 * Initialise the sparsemem vmemmap using huge-pages at the PMD level.
@@ -1003,6 +981,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
1003 } 981 }
1004 982
1005 } 983 }
984 sync_global_pgds((unsigned long)start_page, end);
1006 return 0; 985 return 0;
1007} 986}
1008 987
diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c
index 72fc70cf6184..7b179b499fa3 100644
--- a/arch/x86/mm/iomap_32.c
+++ b/arch/x86/mm/iomap_32.c
@@ -48,21 +48,20 @@ int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot)
48} 48}
49EXPORT_SYMBOL_GPL(iomap_create_wc); 49EXPORT_SYMBOL_GPL(iomap_create_wc);
50 50
51void 51void iomap_free(resource_size_t base, unsigned long size)
52iomap_free(resource_size_t base, unsigned long size)
53{ 52{
54 io_free_memtype(base, base + size); 53 io_free_memtype(base, base + size);
55} 54}
56EXPORT_SYMBOL_GPL(iomap_free); 55EXPORT_SYMBOL_GPL(iomap_free);
57 56
58void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 57void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
59{ 58{
60 enum fixed_addresses idx;
61 unsigned long vaddr; 59 unsigned long vaddr;
60 int idx, type;
62 61
63 pagefault_disable(); 62 pagefault_disable();
64 63
65 debug_kmap_atomic(type); 64 type = kmap_atomic_idx_push();
66 idx = type + KM_TYPE_NR * smp_processor_id(); 65 idx = type + KM_TYPE_NR * smp_processor_id();
67 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); 66 vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
68 set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); 67 set_pte(kmap_pte - idx, pfn_pte(pfn, prot));
@@ -72,10 +71,10 @@ void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
72} 71}
73 72
74/* 73/*
75 * Map 'pfn' using fixed map 'type' and protections 'prot' 74 * Map 'pfn' using protections 'prot'
76 */ 75 */
77void __iomem * 76void __iomem *
78iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot) 77iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot)
79{ 78{
80 /* 79 /*
81 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS. 80 * For non-PAT systems, promote PAGE_KERNEL_WC to PAGE_KERNEL_UC_MINUS.
@@ -86,24 +85,34 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot)
86 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC)) 85 if (!pat_enabled && pgprot_val(prot) == pgprot_val(PAGE_KERNEL_WC))
87 prot = PAGE_KERNEL_UC_MINUS; 86 prot = PAGE_KERNEL_UC_MINUS;
88 87
89 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, type, prot); 88 return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot);
90} 89}
91EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); 90EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn);
92 91
93void 92void
94iounmap_atomic(void __iomem *kvaddr, enum km_type type) 93iounmap_atomic(void __iomem *kvaddr)
95{ 94{
96 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; 95 unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK;
97 enum fixed_addresses idx = type + KM_TYPE_NR*smp_processor_id();
98 96
99 /* 97 if (vaddr >= __fix_to_virt(FIX_KMAP_END) &&
100 * Force other mappings to Oops if they'll try to access this pte 98 vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) {
101 * without first remap it. Keeping stale mappings around is a bad idea 99 int idx, type;
102 * also, in case the page changes cacheability attributes or becomes 100
103 * a protected page in a hypervisor. 101 type = kmap_atomic_idx();
104 */ 102 idx = type + KM_TYPE_NR * smp_processor_id();
105 if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) 103
104#ifdef CONFIG_DEBUG_HIGHMEM
105 WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
106#endif
107 /*
 	 108		 * Force other mappings to Oops if they'll try to access this
 	 109		 * pte without first remapping it. Keeping stale mappings around
 	 110		 * is also a bad idea, in case the page changes cacheability
111 * attributes or becomes a protected page in a hypervisor.
112 */
106 kpte_clear_flush(kmap_pte-idx, vaddr); 113 kpte_clear_flush(kmap_pte-idx, vaddr);
114 kmap_atomic_idx_pop();
115 }
107 116
108 pagefault_enable(); 117 pagefault_enable();
109} 118}
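With the km_type parameter gone, the atomic iomap helpers pair up like a stack: each mapping is undone by a matching unmap on the same CPU. A hedged sketch of a caller after this change; the pfn and the protection value are placeholders:

/* Hedged sketch: temporary WC mapping of one page of MMIO, new-style API. */
void __iomem *va = iomap_atomic_prot_pfn(pfn, PAGE_KERNEL_WC);

/* ... read/write through va while pagefaults stay disabled ... */

iounmap_atomic(va);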
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 3ba6e0608c55..be1ef574ce9a 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -91,13 +91,6 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
91 return (__force void __iomem *)phys_to_virt(phys_addr); 91 return (__force void __iomem *)phys_to_virt(phys_addr);
92 92
93 /* 93 /*
94 * Check if the request spans more than any BAR in the iomem resource
95 * tree.
96 */
97 WARN_ONCE(iomem_map_sanity_check(phys_addr, size),
98 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
99
100 /*
101 * Don't allow anybody to remap normal RAM that we're using.. 94 * Don't allow anybody to remap normal RAM that we're using..
102 */ 95 */
103 last_pfn = last_addr >> PAGE_SHIFT; 96 last_pfn = last_addr >> PAGE_SHIFT;
@@ -170,6 +163,13 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
170 ret_addr = (void __iomem *) (vaddr + offset); 163 ret_addr = (void __iomem *) (vaddr + offset);
171 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr); 164 mmiotrace_ioremap(unaligned_phys_addr, unaligned_size, ret_addr);
172 165
166 /*
167 * Check if the request spans more than any BAR in the iomem resource
168 * tree.
169 */
170 WARN_ONCE(iomem_map_sanity_check(unaligned_phys_addr, unaligned_size),
171 KERN_INFO "Info: mapping multiple BARs. Your kernel is fine.");
172
173 return ret_addr; 173 return ret_addr;
174err_free_area: 174err_free_area:
175 free_vm_area(area); 175 free_vm_area(area);
@@ -362,6 +362,11 @@ static inline pte_t * __init early_ioremap_pte(unsigned long addr)
362 return &bm_pte[pte_index(addr)]; 362 return &bm_pte[pte_index(addr)];
363} 363}
364 364
365bool __init is_early_ioremap_ptep(pte_t *ptep)
366{
367 return ptep >= &bm_pte[0] && ptep < &bm_pte[PAGE_SIZE/sizeof(pte_t)];
368}
369
365static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata; 370static unsigned long slot_virt[FIX_BTMAPS_SLOTS] __initdata;
366 371
367void __init early_ioremap_init(void) 372void __init early_ioremap_init(void)
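is_early_ioremap_ptep() lets other early code recognise ptes that belong to the boot-time fixmap page. A hedged sketch of the kind of check a page-table walker might add; the surrounding function and its policy are hypothetical:

/*
 * Hedged sketch: treat early-ioremap ptes as off limits for attribute
 * changes, since they are owned by the early_ioremap() machinery.
 */
static bool __init pte_may_be_modified(pte_t *ptep)
{
	if (is_early_ioremap_ptep(ptep))
		return false;
	return true;
}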
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index af3b6c8a436f..704a37cedddb 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -185,7 +185,7 @@ void kmemcheck_error_save(enum kmemcheck_shadow state,
185 e->trace.entries = e->trace_entries; 185 e->trace.entries = e->trace_entries;
186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries); 186 e->trace.max_entries = ARRAY_SIZE(e->trace_entries);
187 e->trace.skip = 0; 187 e->trace.skip = 0;
188 save_stack_trace_bp(&e->trace, regs->bp); 188 save_stack_trace_regs(&e->trace, regs);
189 189
190 /* Round address down to nearest 16 bytes */ 190 /* Round address down to nearest 16 bytes */
191 shadow_copy = kmemcheck_shadow_lookup(address 191 shadow_copy = kmemcheck_shadow_lookup(address
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index b3b531a4f8e5..d87dd6d042d6 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -631,6 +631,8 @@ bool kmemcheck_fault(struct pt_regs *regs, unsigned long address,
631 if (!pte) 631 if (!pte)
632 return false; 632 return false;
633 633
634 WARN_ON_ONCE(in_nmi());
635
634 if (error_code & 2) 636 if (error_code & 2)
635 kmemcheck_access(regs, address, KMEMCHECK_WRITE); 637 kmemcheck_access(regs, address, KMEMCHECK_WRITE);
636 else 638 else
diff --git a/arch/x86/mm/kmemcheck/opcode.c b/arch/x86/mm/kmemcheck/opcode.c
index 63c19e27aa6f..324aa3f07237 100644
--- a/arch/x86/mm/kmemcheck/opcode.c
+++ b/arch/x86/mm/kmemcheck/opcode.c
@@ -9,7 +9,7 @@ static bool opcode_is_prefix(uint8_t b)
9 b == 0xf0 || b == 0xf2 || b == 0xf3 9 b == 0xf0 || b == 0xf2 || b == 0xf3
10 /* Group 2 */ 10 /* Group 2 */
11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26 11 || b == 0x2e || b == 0x36 || b == 0x3e || b == 0x26
12 || b == 0x64 || b == 0x65 || b == 0x2e || b == 0x3e 12 || b == 0x64 || b == 0x65
13 /* Group 3 */ 13 /* Group 3 */
14 || b == 0x66 14 || b == 0x66
15 /* Group 4 */ 15 /* Group 4 */
diff --git a/arch/x86/mm/memblock.c b/arch/x86/mm/memblock.c
new file mode 100644
index 000000000000..992da5ec5a64
--- /dev/null
+++ b/arch/x86/mm/memblock.c
@@ -0,0 +1,348 @@
1#include <linux/kernel.h>
2#include <linux/types.h>
3#include <linux/init.h>
4#include <linux/bitops.h>
5#include <linux/memblock.h>
6#include <linux/bootmem.h>
7#include <linux/mm.h>
8#include <linux/range.h>
9
10/* Check for already reserved areas */
11bool __init memblock_x86_check_reserved_size(u64 *addrp, u64 *sizep, u64 align)
12{
13 struct memblock_region *r;
14 u64 addr = *addrp, last;
15 u64 size = *sizep;
16 bool changed = false;
17
18again:
19 last = addr + size;
20 for_each_memblock(reserved, r) {
21 if (last > r->base && addr < r->base) {
22 size = r->base - addr;
23 changed = true;
24 goto again;
25 }
26 if (last > (r->base + r->size) && addr < (r->base + r->size)) {
27 addr = round_up(r->base + r->size, align);
28 size = last - addr;
29 changed = true;
30 goto again;
31 }
32 if (last <= (r->base + r->size) && addr >= r->base) {
33 *sizep = 0;
34 return false;
35 }
36 }
37 if (changed) {
38 *addrp = addr;
39 *sizep = size;
40 }
41 return changed;
42}
43
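The loop above repeatedly clips the candidate range against each reserved region until nothing overlaps. Below is a standalone illustration of the same clipping idea with a single reserved region (plain userspace C, not kernel code; all values are examples):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t r_base = 8u << 20, r_size = 4u << 20;	/* reserved [8M, 12M) */
	uint64_t addr = 6u << 20, size = 10u << 20;	/* candidate [6M, 16M) */
	uint64_t last = addr + size;

	if (last > r_base && addr < r_base)
		size = r_base - addr;			/* keep the prefix [6M, 8M) */

	printf("usable: [%#llx, %#llx)\n",
	       (unsigned long long)addr,
	       (unsigned long long)(addr + size));
	return 0;
}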
44/*
 45 * Find the next free range after start; its size is returned in *sizep.
46 */
47u64 __init memblock_x86_find_in_range_size(u64 start, u64 *sizep, u64 align)
48{
49 struct memblock_region *r;
50
51 for_each_memblock(memory, r) {
52 u64 ei_start = r->base;
53 u64 ei_last = ei_start + r->size;
54 u64 addr;
55
56 addr = round_up(ei_start, align);
57 if (addr < start)
58 addr = round_up(start, align);
59 if (addr >= ei_last)
60 continue;
61 *sizep = ei_last - addr;
62 while (memblock_x86_check_reserved_size(&addr, sizep, align))
63 ;
64
65 if (*sizep)
66 return addr;
67 }
68
69 return MEMBLOCK_ERROR;
70}
71
72static __init struct range *find_range_array(int count)
73{
74 u64 end, size, mem;
75 struct range *range;
76
77 size = sizeof(struct range) * count;
78 end = memblock.current_limit;
79
80 mem = memblock_find_in_range(0, end, size, sizeof(struct range));
81 if (mem == MEMBLOCK_ERROR)
82 panic("can not find more space for range array");
83
84 /*
 85	 * This range is temporary, so don't reserve it; it will not be
 86	 * overlapped because we will not allocate a new buffer before
 87	 * we discard this one.
88 */
89 range = __va(mem);
90 memset(range, 0, size);
91
92 return range;
93}
94
95static void __init memblock_x86_subtract_reserved(struct range *range, int az)
96{
97 u64 final_start, final_end;
98 struct memblock_region *r;
99
100 /* Take out region array itself at first*/
101 memblock_free_reserved_regions();
102
103 memblock_dbg("Subtract (%ld early reservations)\n", memblock.reserved.cnt);
104
105 for_each_memblock(reserved, r) {
106 memblock_dbg(" [%010llx-%010llx]\n", (u64)r->base, (u64)r->base + r->size - 1);
107 final_start = PFN_DOWN(r->base);
108 final_end = PFN_UP(r->base + r->size);
109 if (final_start >= final_end)
110 continue;
111 subtract_range(range, az, final_start, final_end);
112 }
113
 114	/* Put the region array back? */
115 memblock_reserve_reserved_regions();
116}
117
118struct count_data {
119 int nr;
120};
121
122static int __init count_work_fn(unsigned long start_pfn,
123 unsigned long end_pfn, void *datax)
124{
125 struct count_data *data = datax;
126
127 data->nr++;
128
129 return 0;
130}
131
132static int __init count_early_node_map(int nodeid)
133{
134 struct count_data data;
135
136 data.nr = 0;
137 work_with_active_regions(nodeid, count_work_fn, &data);
138
139 return data.nr;
140}
141
142int __init __get_free_all_memory_range(struct range **rangep, int nodeid,
143 unsigned long start_pfn, unsigned long end_pfn)
144{
145 int count;
146 struct range *range;
147 int nr_range;
148
149 count = (memblock.reserved.cnt + count_early_node_map(nodeid)) * 2;
150
151 range = find_range_array(count);
152 nr_range = 0;
153
154 /*
 155	 * Use early_node_map[] and memblock.reserved.region to build the
 156	 * range array first.
157 */
158 nr_range = add_from_early_node_map(range, count, nr_range, nodeid);
159 subtract_range(range, count, 0, start_pfn);
160 subtract_range(range, count, end_pfn, -1ULL);
161
162 memblock_x86_subtract_reserved(range, count);
163 nr_range = clean_sort_range(range, count);
164
165 *rangep = range;
166 return nr_range;
167}
168
169int __init get_free_all_memory_range(struct range **rangep, int nodeid)
170{
171 unsigned long end_pfn = -1UL;
172
173#ifdef CONFIG_X86_32
174 end_pfn = max_low_pfn;
175#endif
176 return __get_free_all_memory_range(rangep, nodeid, 0, end_pfn);
177}
178
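The helpers above hand back an array of free pfn ranges with all reservations punched out. A hedged sketch of how a consumer might walk the result; the function below is illustrative, the real caller lives in the core bootmem/nobootmem code:

/* Hedged sketch: print the free pfn ranges computed for a node. */
static void __init dump_free_ranges(int nid)
{
	struct range *range;
	int i, nr = get_free_all_memory_range(&range, nid);

	for (i = 0; i < nr; i++)
		pr_debug("node %d free pfns: [%#lx-%#lx)\n", nid,
			 (unsigned long)range[i].start,
			 (unsigned long)range[i].end);
}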
179static u64 __init __memblock_x86_memory_in_range(u64 addr, u64 limit, bool get_free)
180{
181 int i, count;
182 struct range *range;
183 int nr_range;
184 u64 final_start, final_end;
185 u64 free_size;
186 struct memblock_region *r;
187
188 count = (memblock.reserved.cnt + memblock.memory.cnt) * 2;
189
190 range = find_range_array(count);
191 nr_range = 0;
192
193 addr = PFN_UP(addr);
194 limit = PFN_DOWN(limit);
195
196 for_each_memblock(memory, r) {
197 final_start = PFN_UP(r->base);
198 final_end = PFN_DOWN(r->base + r->size);
199 if (final_start >= final_end)
200 continue;
201 if (final_start >= limit || final_end <= addr)
202 continue;
203
204 nr_range = add_range(range, count, nr_range, final_start, final_end);
205 }
206 subtract_range(range, count, 0, addr);
207 subtract_range(range, count, limit, -1ULL);
208
 209	/* Subtract memblock.reserved.region entries within the range? */
210 if (!get_free)
211 goto sort_and_count_them;
212 for_each_memblock(reserved, r) {
213 final_start = PFN_DOWN(r->base);
214 final_end = PFN_UP(r->base + r->size);
215 if (final_start >= final_end)
216 continue;
217 if (final_start >= limit || final_end <= addr)
218 continue;
219
220 subtract_range(range, count, final_start, final_end);
221 }
222
223sort_and_count_them:
224 nr_range = clean_sort_range(range, count);
225
226 free_size = 0;
227 for (i = 0; i < nr_range; i++)
228 free_size += range[i].end - range[i].start;
229
230 return free_size << PAGE_SHIFT;
231}
232
233u64 __init memblock_x86_free_memory_in_range(u64 addr, u64 limit)
234{
235 return __memblock_x86_memory_in_range(addr, limit, true);
236}
237
238u64 __init memblock_x86_memory_in_range(u64 addr, u64 limit)
239{
240 return __memblock_x86_memory_in_range(addr, limit, false);
241}
242
243void __init memblock_x86_reserve_range(u64 start, u64 end, char *name)
244{
245 if (start == end)
246 return;
247
248 if (WARN_ONCE(start > end, "memblock_x86_reserve_range: wrong range [%#llx, %#llx)\n", start, end))
249 return;
250
251 memblock_dbg(" memblock_x86_reserve_range: [%#010llx-%#010llx] %16s\n", start, end - 1, name);
252
253 memblock_reserve(start, end - start);
254}
255
256void __init memblock_x86_free_range(u64 start, u64 end)
257{
258 if (start == end)
259 return;
260
261 if (WARN_ONCE(start > end, "memblock_x86_free_range: wrong range [%#llx, %#llx)\n", start, end))
262 return;
263
264 memblock_dbg(" memblock_x86_free_range: [%#010llx-%#010llx]\n", start, end - 1);
265
266 memblock_free(start, end - start);
267}
268
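memblock_x86_reserve_range()/memblock_x86_free_range() are thin, logging wrappers around memblock_reserve()/memblock_free() taking [start, end) in bytes. A hedged usage sketch; the address, size and label are placeholders:

/* Hedged sketch: claim a boot-time scratch buffer, then give it back. */
static void __init scratch_area_example(u64 scratch_pa)
{
	u64 start = scratch_pa, end = scratch_pa + PAGE_SIZE;

	memblock_x86_reserve_range(start, end, "SCRATCH");
	/* ... early-boot use of the buffer ... */
	memblock_x86_free_range(start, end);
}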
269/*
 270 * This must be called after memblock_x86_register_active_regions(),
 271 * so that early_node_map[] is already filled.
272 */
273u64 __init memblock_x86_find_in_range_node(int nid, u64 start, u64 end, u64 size, u64 align)
274{
275 u64 addr;
276 addr = find_memory_core_early(nid, size, align, start, end);
277 if (addr != MEMBLOCK_ERROR)
278 return addr;
279
280 /* Fallback, should already have start end within node range */
281 return memblock_find_in_range(start, end, size, align);
282}
283
284/*
285 * Finds an active region in the address range from start_pfn to last_pfn and
286 * returns its range in ei_startpfn and ei_endpfn for the memblock entry.
287 */
288static int __init memblock_x86_find_active_region(const struct memblock_region *ei,
289 unsigned long start_pfn,
290 unsigned long last_pfn,
291 unsigned long *ei_startpfn,
292 unsigned long *ei_endpfn)
293{
294 u64 align = PAGE_SIZE;
295
296 *ei_startpfn = round_up(ei->base, align) >> PAGE_SHIFT;
297 *ei_endpfn = round_down(ei->base + ei->size, align) >> PAGE_SHIFT;
298
299 /* Skip map entries smaller than a page */
300 if (*ei_startpfn >= *ei_endpfn)
301 return 0;
302
303 /* Skip if map is outside the node */
304 if (*ei_endpfn <= start_pfn || *ei_startpfn >= last_pfn)
305 return 0;
306
307 /* Check for overlaps */
308 if (*ei_startpfn < start_pfn)
309 *ei_startpfn = start_pfn;
310 if (*ei_endpfn > last_pfn)
311 *ei_endpfn = last_pfn;
312
313 return 1;
314}
315
316/* Walk the memblock.memory map and register active regions within a node */
317void __init memblock_x86_register_active_regions(int nid, unsigned long start_pfn,
318 unsigned long last_pfn)
319{
320 unsigned long ei_startpfn;
321 unsigned long ei_endpfn;
322 struct memblock_region *r;
323
324 for_each_memblock(memory, r)
325 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
326 &ei_startpfn, &ei_endpfn))
327 add_active_range(nid, ei_startpfn, ei_endpfn);
328}
329
330/*
331 * Find the hole size (in bytes) in the memory range.
332 * @start: starting address of the memory range to scan
333 * @end: ending address of the memory range to scan
334 */
335u64 __init memblock_x86_hole_size(u64 start, u64 end)
336{
337 unsigned long start_pfn = start >> PAGE_SHIFT;
338 unsigned long last_pfn = end >> PAGE_SHIFT;
339 unsigned long ei_startpfn, ei_endpfn, ram = 0;
340 struct memblock_region *r;
341
342 for_each_memblock(memory, r)
343 if (memblock_x86_find_active_region(r, start_pfn, last_pfn,
344 &ei_startpfn, &ei_endpfn))
345 ram += ei_endpfn - ei_startpfn;
346
347 return end - start - ((u64)ram << PAGE_SHIFT);
348}
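memblock_x86_hole_size() returns the number of bytes in [start, end) that are not covered by memblock memory. A hedged sketch of the accounting pattern used later in this series by numa_meminfo_cover_memory():

/* Hedged sketch: how much of the address span below max_pfn is real RAM. */
u64 span = PFN_PHYS(max_pfn);
u64 ram  = span - memblock_x86_hole_size(0, span);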
diff --git a/arch/x86/mm/memtest.c b/arch/x86/mm/memtest.c
index 18d244f70205..92faf3a1c53e 100644
--- a/arch/x86/mm/memtest.c
+++ b/arch/x86/mm/memtest.c
@@ -6,8 +6,7 @@
6#include <linux/smp.h> 6#include <linux/smp.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/pfn.h> 8#include <linux/pfn.h>
9 9#include <linux/memblock.h>
10#include <asm/e820.h>
11 10
12static u64 patterns[] __initdata = { 11static u64 patterns[] __initdata = {
13 0, 12 0,
@@ -35,7 +34,7 @@ static void __init reserve_bad_mem(u64 pattern, u64 start_bad, u64 end_bad)
35 (unsigned long long) pattern, 34 (unsigned long long) pattern,
36 (unsigned long long) start_bad, 35 (unsigned long long) start_bad,
37 (unsigned long long) end_bad); 36 (unsigned long long) end_bad);
38 reserve_early(start_bad, end_bad, "BAD RAM"); 37 memblock_x86_reserve_range(start_bad, end_bad, "BAD RAM");
39} 38}
40 39
41static void __init memtest(u64 pattern, u64 start_phys, u64 size) 40static void __init memtest(u64 pattern, u64 start_phys, u64 size)
@@ -74,7 +73,7 @@ static void __init do_one_pass(u64 pattern, u64 start, u64 end)
74 u64 size = 0; 73 u64 size = 0;
75 74
76 while (start < end) { 75 while (start < end) {
77 start = find_e820_area_size(start, &size, 1); 76 start = memblock_x86_find_in_range_size(start, &size, 1);
78 77
79 /* done ? */ 78 /* done ? */
80 if (start >= end) 79 if (start >= end)
diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c
index 787c52ca49c3..f5510d889a22 100644
--- a/arch/x86/mm/numa.c
+++ b/arch/x86/mm/numa.c
@@ -1,15 +1,112 @@
1/* Common code for 32 and 64-bit NUMA */ 1/* Common code for 32 and 64-bit NUMA */
2#include <linux/topology.h> 2#include <linux/kernel.h>
3#include <linux/module.h> 3#include <linux/mm.h>
4#include <linux/string.h>
5#include <linux/init.h>
4#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/memblock.h>
8#include <linux/mmzone.h>
9#include <linux/ctype.h>
10#include <linux/module.h>
11#include <linux/nodemask.h>
12#include <linux/sched.h>
13#include <linux/topology.h>
14
15#include <asm/e820.h>
16#include <asm/proto.h>
17#include <asm/dma.h>
18#include <asm/acpi.h>
19#include <asm/amd_nb.h>
20
21#include "numa_internal.h"
22
23int __initdata numa_off;
24nodemask_t numa_nodes_parsed __initdata;
25
26struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
27EXPORT_SYMBOL(node_data);
28
29static struct numa_meminfo numa_meminfo
30#ifndef CONFIG_MEMORY_HOTPLUG
31__initdata
32#endif
33;
34
35static int numa_distance_cnt;
36static u8 *numa_distance;
37
38static __init int numa_setup(char *opt)
39{
40 if (!opt)
41 return -EINVAL;
42 if (!strncmp(opt, "off", 3))
43 numa_off = 1;
44#ifdef CONFIG_NUMA_EMU
45 if (!strncmp(opt, "fake=", 5))
46 numa_emu_cmdline(opt + 5);
47#endif
48#ifdef CONFIG_ACPI_NUMA
49 if (!strncmp(opt, "noacpi", 6))
50 acpi_numa = -1;
51#endif
52 return 0;
53}
54early_param("numa", numa_setup);
5 55
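numa_setup() is wired up as an early_param, so it runs on the raw kernel command line. Illustrative option strings it accepts (values are examples):

/* numa=off      -> numa_off = 1, all detection methods are skipped     */
/* numa=fake=4   -> numa_emu_cmdline("4"), only with CONFIG_NUMA_EMU    */
/* numa=noacpi   -> acpi_numa = -1, only with CONFIG_ACPI_NUMA          */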
6/* 56/*
7 * Which logical CPUs are on which nodes 57 * apicid, cpu, node mappings
8 */ 58 */
59s16 __apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
60 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
61};
62
63int __cpuinit numa_cpu_node(int cpu)
64{
65 int apicid = early_per_cpu(x86_cpu_to_apicid, cpu);
66
67 if (apicid != BAD_APICID)
68 return __apicid_to_node[apicid];
69 return NUMA_NO_NODE;
70}
71
9cpumask_var_t node_to_cpumask_map[MAX_NUMNODES]; 72cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
10EXPORT_SYMBOL(node_to_cpumask_map); 73EXPORT_SYMBOL(node_to_cpumask_map);
11 74
12/* 75/*
76 * Map cpu index to node index
77 */
78DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
79EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
80
81void __cpuinit numa_set_node(int cpu, int node)
82{
83 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
84
85 /* early setting, no percpu area yet */
86 if (cpu_to_node_map) {
87 cpu_to_node_map[cpu] = node;
88 return;
89 }
90
91#ifdef CONFIG_DEBUG_PER_CPU_MAPS
92 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
93 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
94 dump_stack();
95 return;
96 }
97#endif
98 per_cpu(x86_cpu_to_node_map, cpu) = node;
99
100 if (node != NUMA_NO_NODE)
101 set_cpu_numa_node(cpu, node);
102}
103
104void __cpuinit numa_clear_node(int cpu)
105{
106 numa_set_node(cpu, NUMA_NO_NODE);
107}
108
109/*
13 * Allocate node_to_cpumask_map based on number of available nodes 110 * Allocate node_to_cpumask_map based on number of available nodes
14 * Requires node_possible_map to be valid. 111 * Requires node_possible_map to be valid.
15 * 112 *
@@ -35,7 +132,659 @@ void __init setup_node_to_cpumask_map(void)
35 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids); 132 pr_debug("Node to cpumask map for %d nodes\n", nr_node_ids);
36} 133}
37 134
38#ifdef CONFIG_DEBUG_PER_CPU_MAPS 135static int __init numa_add_memblk_to(int nid, u64 start, u64 end,
136 struct numa_meminfo *mi)
137{
138 /* ignore zero length blks */
139 if (start == end)
140 return 0;
141
142 /* whine about and ignore invalid blks */
143 if (start > end || nid < 0 || nid >= MAX_NUMNODES) {
144 pr_warning("NUMA: Warning: invalid memblk node %d (%Lx-%Lx)\n",
145 nid, start, end);
146 return 0;
147 }
148
149 if (mi->nr_blks >= NR_NODE_MEMBLKS) {
150 pr_err("NUMA: too many memblk ranges\n");
151 return -EINVAL;
152 }
153
154 mi->blk[mi->nr_blks].start = start;
155 mi->blk[mi->nr_blks].end = end;
156 mi->blk[mi->nr_blks].nid = nid;
157 mi->nr_blks++;
158 return 0;
159}
160
161/**
162 * numa_remove_memblk_from - Remove one numa_memblk from a numa_meminfo
163 * @idx: Index of memblk to remove
164 * @mi: numa_meminfo to remove memblk from
165 *
166 * Remove @idx'th numa_memblk from @mi by shifting @mi->blk[] and
167 * decrementing @mi->nr_blks.
168 */
169void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi)
170{
171 mi->nr_blks--;
172 memmove(&mi->blk[idx], &mi->blk[idx + 1],
173 (mi->nr_blks - idx) * sizeof(mi->blk[0]));
174}
175
176/**
177 * numa_add_memblk - Add one numa_memblk to numa_meminfo
178 * @nid: NUMA node ID of the new memblk
179 * @start: Start address of the new memblk
180 * @end: End address of the new memblk
181 *
182 * Add a new memblk to the default numa_meminfo.
183 *
184 * RETURNS:
185 * 0 on success, -errno on failure.
186 */
187int __init numa_add_memblk(int nid, u64 start, u64 end)
188{
189 return numa_add_memblk_to(nid, start, end, &numa_meminfo);
190}
191
192/* Initialize NODE_DATA for a node on the local memory */
193static void __init setup_node_data(int nid, u64 start, u64 end)
194{
195 const u64 nd_low = PFN_PHYS(MAX_DMA_PFN);
196 const u64 nd_high = PFN_PHYS(max_pfn_mapped);
197 const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
198 bool remapped = false;
199 u64 nd_pa;
200 void *nd;
201 int tnid;
202
203 /*
204 * Don't confuse VM with a node that doesn't have the
205 * minimum amount of memory:
206 */
207 if (end && (end - start) < NODE_MIN_SIZE)
208 return;
209
210 /* initialize remap allocator before aligning to ZONE_ALIGN */
211 init_alloc_remap(nid, start, end);
212
213 start = roundup(start, ZONE_ALIGN);
214
215 printk(KERN_INFO "Initmem setup node %d %016Lx-%016Lx\n",
216 nid, start, end);
217
218 /*
219 * Allocate node data. Try remap allocator first, node-local
220 * memory and then any node. Never allocate in DMA zone.
221 */
222 nd = alloc_remap(nid, nd_size);
223 if (nd) {
224 nd_pa = __pa(nd);
225 remapped = true;
226 } else {
227 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
228 nd_size, SMP_CACHE_BYTES);
229 if (nd_pa == MEMBLOCK_ERROR)
230 nd_pa = memblock_find_in_range(nd_low, nd_high,
231 nd_size, SMP_CACHE_BYTES);
232 if (nd_pa == MEMBLOCK_ERROR) {
233 pr_err("Cannot find %zu bytes in node %d\n",
234 nd_size, nid);
235 return;
236 }
237 memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
238 nd = __va(nd_pa);
239 }
240
241 /* report and initialize */
242 printk(KERN_INFO " NODE_DATA [%016Lx - %016Lx]%s\n",
243 nd_pa, nd_pa + nd_size - 1, remapped ? " (remapped)" : "");
244 tnid = early_pfn_to_nid(nd_pa >> PAGE_SHIFT);
245 if (!remapped && tnid != nid)
246 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nid, tnid);
247
248 node_data[nid] = nd;
249 memset(NODE_DATA(nid), 0, sizeof(pg_data_t));
250 NODE_DATA(nid)->node_id = nid;
251 NODE_DATA(nid)->node_start_pfn = start >> PAGE_SHIFT;
252 NODE_DATA(nid)->node_spanned_pages = (end - start) >> PAGE_SHIFT;
253
254 node_set_online(nid);
255}
256
257/**
258 * numa_cleanup_meminfo - Cleanup a numa_meminfo
259 * @mi: numa_meminfo to clean up
260 *
 	 261 * Sanitize @mi by merging and removing unnecessary memblks. Also check for
262 * conflicts and clear unused memblks.
263 *
264 * RETURNS:
265 * 0 on success, -errno on failure.
266 */
267int __init numa_cleanup_meminfo(struct numa_meminfo *mi)
268{
269 const u64 low = 0;
270 const u64 high = PFN_PHYS(max_pfn);
271 int i, j, k;
272
273 /* first, trim all entries */
274 for (i = 0; i < mi->nr_blks; i++) {
275 struct numa_memblk *bi = &mi->blk[i];
276
277 /* make sure all blocks are inside the limits */
278 bi->start = max(bi->start, low);
279 bi->end = min(bi->end, high);
280
281 /* and there's no empty block */
282 if (bi->start >= bi->end)
283 numa_remove_memblk_from(i--, mi);
284 }
285
286 /* merge neighboring / overlapping entries */
287 for (i = 0; i < mi->nr_blks; i++) {
288 struct numa_memblk *bi = &mi->blk[i];
289
290 for (j = i + 1; j < mi->nr_blks; j++) {
291 struct numa_memblk *bj = &mi->blk[j];
292 u64 start, end;
293
294 /*
 	 295			 * See whether there are overlapping blocks. Whine
 	 296			 * about, but allow, overlaps of the same nid; they
 	 297			 * will be merged below.
298 */
299 if (bi->end > bj->start && bi->start < bj->end) {
300 if (bi->nid != bj->nid) {
301 pr_err("NUMA: node %d (%Lx-%Lx) overlaps with node %d (%Lx-%Lx)\n",
302 bi->nid, bi->start, bi->end,
303 bj->nid, bj->start, bj->end);
304 return -EINVAL;
305 }
306 pr_warning("NUMA: Warning: node %d (%Lx-%Lx) overlaps with itself (%Lx-%Lx)\n",
307 bi->nid, bi->start, bi->end,
308 bj->start, bj->end);
309 }
310
311 /*
312 * Join together blocks on the same node, holes
313 * between which don't overlap with memory on other
314 * nodes.
315 */
316 if (bi->nid != bj->nid)
317 continue;
318 start = min(bi->start, bj->start);
319 end = max(bi->end, bj->end);
320 for (k = 0; k < mi->nr_blks; k++) {
321 struct numa_memblk *bk = &mi->blk[k];
322
323 if (bi->nid == bk->nid)
324 continue;
325 if (start < bk->end && end > bk->start)
326 break;
327 }
328 if (k < mi->nr_blks)
329 continue;
330 printk(KERN_INFO "NUMA: Node %d [%Lx,%Lx) + [%Lx,%Lx) -> [%Lx,%Lx)\n",
331 bi->nid, bi->start, bi->end, bj->start, bj->end,
332 start, end);
333 bi->start = start;
334 bi->end = end;
335 numa_remove_memblk_from(j--, mi);
336 }
337 }
338
339 /* clear unused ones */
340 for (i = mi->nr_blks; i < ARRAY_SIZE(mi->blk); i++) {
341 mi->blk[i].start = mi->blk[i].end = 0;
342 mi->blk[i].nid = NUMA_NO_NODE;
343 }
344
345 return 0;
346}
347
348/*
349 * Set nodes, which have memory in @mi, in *@nodemask.
350 */
351static void __init numa_nodemask_from_meminfo(nodemask_t *nodemask,
352 const struct numa_meminfo *mi)
353{
354 int i;
355
356 for (i = 0; i < ARRAY_SIZE(mi->blk); i++)
357 if (mi->blk[i].start != mi->blk[i].end &&
358 mi->blk[i].nid != NUMA_NO_NODE)
359 node_set(mi->blk[i].nid, *nodemask);
360}
361
362/**
363 * numa_reset_distance - Reset NUMA distance table
364 *
365 * The current table is freed. The next numa_set_distance() call will
366 * create a new one.
367 */
368void __init numa_reset_distance(void)
369{
370 size_t size = numa_distance_cnt * numa_distance_cnt * sizeof(numa_distance[0]);
371
372 /* numa_distance could be 1LU marking allocation failure, test cnt */
373 if (numa_distance_cnt)
374 memblock_x86_free_range(__pa(numa_distance),
375 __pa(numa_distance) + size);
376 numa_distance_cnt = 0;
377 numa_distance = NULL; /* enable table creation */
378}
379
380static int __init numa_alloc_distance(void)
381{
382 nodemask_t nodes_parsed;
383 size_t size;
384 int i, j, cnt = 0;
385 u64 phys;
386
387 /* size the new table and allocate it */
388 nodes_parsed = numa_nodes_parsed;
389 numa_nodemask_from_meminfo(&nodes_parsed, &numa_meminfo);
390
391 for_each_node_mask(i, nodes_parsed)
392 cnt = i;
393 cnt++;
394 size = cnt * cnt * sizeof(numa_distance[0]);
395
396 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
397 size, PAGE_SIZE);
398 if (phys == MEMBLOCK_ERROR) {
399 pr_warning("NUMA: Warning: can't allocate distance table!\n");
400 /* don't retry until explicitly reset */
401 numa_distance = (void *)1LU;
402 return -ENOMEM;
403 }
404 memblock_x86_reserve_range(phys, phys + size, "NUMA DIST");
405
406 numa_distance = __va(phys);
407 numa_distance_cnt = cnt;
408
409 /* fill with the default distances */
410 for (i = 0; i < cnt; i++)
411 for (j = 0; j < cnt; j++)
412 numa_distance[i * cnt + j] = i == j ?
413 LOCAL_DISTANCE : REMOTE_DISTANCE;
414 printk(KERN_DEBUG "NUMA: Initialized distance table, cnt=%d\n", cnt);
415
416 return 0;
417}
418
419/**
420 * numa_set_distance - Set NUMA distance from one NUMA to another
421 * @from: the 'from' node to set distance
422 * @to: the 'to' node to set distance
423 * @distance: NUMA distance
424 *
 	 425 * Set the distance from node @from to @to to @distance. If the distance
 	 426 * table doesn't exist, one large enough to accommodate all the currently
 	 427 * known nodes will be created.
 	 428 *
 	 429 * If such a table cannot be allocated, a warning is printed and further
430 * calls are ignored until the distance table is reset with
431 * numa_reset_distance().
432 *
433 * If @from or @to is higher than the highest known node at the time of
434 * table creation or @distance doesn't make sense, the call is ignored.
435 * This is to allow simplification of specific NUMA config implementations.
436 */
437void __init numa_set_distance(int from, int to, int distance)
438{
439 if (!numa_distance && numa_alloc_distance() < 0)
440 return;
441
442 if (from >= numa_distance_cnt || to >= numa_distance_cnt) {
443 printk_once(KERN_DEBUG "NUMA: Debug: distance out of bound, from=%d to=%d distance=%d\n",
444 from, to, distance);
445 return;
446 }
447
448 if ((u8)distance != distance ||
449 (from == to && distance != LOCAL_DISTANCE)) {
450 pr_warn_once("NUMA: Warning: invalid distance parameter, from=%d to=%d distance=%d\n",
451 from, to, distance);
452 return;
453 }
454
455 numa_distance[from * numa_distance_cnt + to] = distance;
456}
457
458int __node_distance(int from, int to)
459{
460 if (from >= numa_distance_cnt || to >= numa_distance_cnt)
461 return from == to ? LOCAL_DISTANCE : REMOTE_DISTANCE;
462 return numa_distance[from * numa_distance_cnt + to];
463}
464EXPORT_SYMBOL(__node_distance);
465
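numa_set_distance() populates a flat cnt*cnt byte table indexed as from*cnt+to; __node_distance() reads it back, falling back to LOCAL_DISTANCE/REMOTE_DISTANCE when the table is absent. A hedged sketch; the node ids and the distance value are examples only:

/* Hedged sketch: record a SLIT-style distance and read it back. */
numa_set_distance(0, 1, 21);
pr_debug("NUMA: distance 0->1 is %d\n", __node_distance(0, 1));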
466/*
467 * Sanity check to catch more bad NUMA configurations (they are amazingly
468 * common). Make sure the nodes cover all memory.
469 */
470static bool __init numa_meminfo_cover_memory(const struct numa_meminfo *mi)
471{
472 u64 numaram, e820ram;
473 int i;
474
475 numaram = 0;
476 for (i = 0; i < mi->nr_blks; i++) {
477 u64 s = mi->blk[i].start >> PAGE_SHIFT;
478 u64 e = mi->blk[i].end >> PAGE_SHIFT;
479 numaram += e - s;
480 numaram -= __absent_pages_in_range(mi->blk[i].nid, s, e);
481 if ((s64)numaram < 0)
482 numaram = 0;
483 }
484
485 e820ram = max_pfn - (memblock_x86_hole_size(0,
486 PFN_PHYS(max_pfn)) >> PAGE_SHIFT);
487 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
488 if ((s64)(e820ram - numaram) >= (1 << (20 - PAGE_SHIFT))) {
489 printk(KERN_ERR "NUMA: nodes only cover %LuMB of your %LuMB e820 RAM. Not used.\n",
490 (numaram << PAGE_SHIFT) >> 20,
491 (e820ram << PAGE_SHIFT) >> 20);
492 return false;
493 }
494 return true;
495}
496
497static int __init numa_register_memblks(struct numa_meminfo *mi)
498{
499 int i, nid;
500
501 /* Account for nodes with cpus and no memory */
502 node_possible_map = numa_nodes_parsed;
503 numa_nodemask_from_meminfo(&node_possible_map, mi);
504 if (WARN_ON(nodes_empty(node_possible_map)))
505 return -EINVAL;
506
507 for (i = 0; i < mi->nr_blks; i++)
508 memblock_x86_register_active_regions(mi->blk[i].nid,
509 mi->blk[i].start >> PAGE_SHIFT,
510 mi->blk[i].end >> PAGE_SHIFT);
511
512 /* for out of order entries */
513 sort_node_map();
514 if (!numa_meminfo_cover_memory(mi))
515 return -EINVAL;
516
517 /* Finally register nodes. */
518 for_each_node_mask(nid, node_possible_map) {
519 u64 start = PFN_PHYS(max_pfn);
520 u64 end = 0;
521
522 for (i = 0; i < mi->nr_blks; i++) {
523 if (nid != mi->blk[i].nid)
524 continue;
525 start = min(mi->blk[i].start, start);
526 end = max(mi->blk[i].end, end);
527 }
528
529 if (start < end)
530 setup_node_data(nid, start, end);
531 }
532
533 return 0;
534}
535
536/*
537 * There are unfortunately some poorly designed mainboards around that
538 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
539 * mapping. To avoid this fill in the mapping for all possible CPUs,
540 * as the number of CPUs is not known yet. We round robin the existing
541 * nodes.
542 */
543static void __init numa_init_array(void)
544{
545 int rr, i;
546
547 rr = first_node(node_online_map);
548 for (i = 0; i < nr_cpu_ids; i++) {
549 if (early_cpu_to_node(i) != NUMA_NO_NODE)
550 continue;
551 numa_set_node(i, rr);
552 rr = next_node(rr, node_online_map);
553 if (rr == MAX_NUMNODES)
554 rr = first_node(node_online_map);
555 }
556}
557
558static int __init numa_init(int (*init_func)(void))
559{
560 int i;
561 int ret;
562
563 for (i = 0; i < MAX_LOCAL_APIC; i++)
564 set_apicid_to_node(i, NUMA_NO_NODE);
565
566 nodes_clear(numa_nodes_parsed);
567 nodes_clear(node_possible_map);
568 nodes_clear(node_online_map);
569 memset(&numa_meminfo, 0, sizeof(numa_meminfo));
570 remove_all_active_ranges();
571 numa_reset_distance();
572
573 ret = init_func();
574 if (ret < 0)
575 return ret;
576 ret = numa_cleanup_meminfo(&numa_meminfo);
577 if (ret < 0)
578 return ret;
579
580 numa_emulation(&numa_meminfo, numa_distance_cnt);
581
582 ret = numa_register_memblks(&numa_meminfo);
583 if (ret < 0)
584 return ret;
585
586 for (i = 0; i < nr_cpu_ids; i++) {
587 int nid = early_cpu_to_node(i);
588
589 if (nid == NUMA_NO_NODE)
590 continue;
591 if (!node_online(nid))
592 numa_clear_node(i);
593 }
594 numa_init_array();
595 return 0;
596}
597
598/**
599 * dummy_numa_init - Fallback dummy NUMA init
600 *
601 * Used if there's no underlying NUMA architecture, NUMA initialization
602 * fails, or NUMA is disabled on the command line.
603 *
604 * Must online at least one node and add memory blocks that cover all
605 * allowed memory. This function must not fail.
606 */
607static int __init dummy_numa_init(void)
608{
609 printk(KERN_INFO "%s\n",
610 numa_off ? "NUMA turned off" : "No NUMA configuration found");
611 printk(KERN_INFO "Faking a node at %016Lx-%016Lx\n",
612 0LLU, PFN_PHYS(max_pfn));
613
614 node_set(0, numa_nodes_parsed);
615 numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
616
617 return 0;
618}
619
620/**
621 * x86_numa_init - Initialize NUMA
622 *
623 * Try each configured NUMA initialization method until one succeeds. The
 	 624 * last fallback is a dummy single-node config encompassing all of memory and
625 * never fails.
626 */
627void __init x86_numa_init(void)
628{
629 if (!numa_off) {
630#ifdef CONFIG_X86_NUMAQ
631 if (!numa_init(numaq_numa_init))
632 return;
633#endif
634#ifdef CONFIG_ACPI_NUMA
635 if (!numa_init(x86_acpi_numa_init))
636 return;
637#endif
638#ifdef CONFIG_AMD_NUMA
639 if (!numa_init(amd_numa_init))
640 return;
641#endif
642 }
643
644 numa_init(dummy_numa_init);
645}
646
647static __init int find_near_online_node(int node)
648{
649 int n, val;
650 int min_val = INT_MAX;
651 int best_node = -1;
652
653 for_each_online_node(n) {
654 val = node_distance(node, n);
655
656 if (val < min_val) {
657 min_val = val;
658 best_node = n;
659 }
660 }
661
662 return best_node;
663}
664
665/*
666 * Setup early cpu_to_node.
667 *
 	 668 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[]
 	 669 * and apicid_to_node[] tables have valid entries for a CPU.
 	 670 * This means we skip cpu_to_node[] initialisation for NUMA
 	 671 * emulation and the fake-node case (when running a kernel compiled
 	 672 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 	 673 * is already initialized in a round-robin manner by numa_init_array()
 	 674 * prior to this call, and that initialization is good enough
 	 675 * for the fake NUMA cases.
676 *
677 * Called before the per_cpu areas are setup.
678 */
679void __init init_cpu_to_node(void)
680{
681 int cpu;
682 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
683
684 BUG_ON(cpu_to_apicid == NULL);
685
686 for_each_possible_cpu(cpu) {
687 int node = numa_cpu_node(cpu);
688
689 if (node == NUMA_NO_NODE)
690 continue;
691 if (!node_online(node))
692 node = find_near_online_node(node);
693 numa_set_node(cpu, node);
694 }
695}
696
697#ifndef CONFIG_DEBUG_PER_CPU_MAPS
698
699# ifndef CONFIG_NUMA_EMU
700void __cpuinit numa_add_cpu(int cpu)
701{
702 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
703}
704
705void __cpuinit numa_remove_cpu(int cpu)
706{
707 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
708}
709# endif /* !CONFIG_NUMA_EMU */
710
711#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
712
713int __cpu_to_node(int cpu)
714{
715 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
716 printk(KERN_WARNING
717 "cpu_to_node(%d): usage too early!\n", cpu);
718 dump_stack();
719 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
720 }
721 return per_cpu(x86_cpu_to_node_map, cpu);
722}
723EXPORT_SYMBOL(__cpu_to_node);
724
725/*
726 * Same function as cpu_to_node() but used if called before the
727 * per_cpu areas are setup.
728 */
729int early_cpu_to_node(int cpu)
730{
731 if (early_per_cpu_ptr(x86_cpu_to_node_map))
732 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
733
734 if (!cpu_possible(cpu)) {
735 printk(KERN_WARNING
736 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
737 dump_stack();
738 return NUMA_NO_NODE;
739 }
740 return per_cpu(x86_cpu_to_node_map, cpu);
741}
742
743void debug_cpumask_set_cpu(int cpu, int node, bool enable)
744{
745 struct cpumask *mask;
746 char buf[64];
747
748 if (node == NUMA_NO_NODE) {
749 /* early_cpu_to_node() already emits a warning and trace */
750 return;
751 }
752 mask = node_to_cpumask_map[node];
753 if (!mask) {
754 pr_err("node_to_cpumask_map[%i] NULL\n", node);
755 dump_stack();
756 return;
757 }
758
759 if (enable)
760 cpumask_set_cpu(cpu, mask);
761 else
762 cpumask_clear_cpu(cpu, mask);
763
764 cpulist_scnprintf(buf, sizeof(buf), mask);
765 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
766 enable ? "numa_add_cpu" : "numa_remove_cpu",
767 cpu, node, buf);
768 return;
769}
770
771# ifndef CONFIG_NUMA_EMU
772static void __cpuinit numa_set_cpumask(int cpu, bool enable)
773{
774 debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
775}
776
777void __cpuinit numa_add_cpu(int cpu)
778{
779 numa_set_cpumask(cpu, true);
780}
781
782void __cpuinit numa_remove_cpu(int cpu)
783{
784 numa_set_cpumask(cpu, false);
785}
786# endif /* !CONFIG_NUMA_EMU */
787
39/* 788/*
40 * Returns a pointer to the bitmask of CPUs on Node 'node'. 789 * Returns a pointer to the bitmask of CPUs on Node 'node'.
41 */ 790 */
@@ -58,4 +807,20 @@ const struct cpumask *cpumask_of_node(int node)
58 return node_to_cpumask_map[node]; 807 return node_to_cpumask_map[node];
59} 808}
60EXPORT_SYMBOL(cpumask_of_node); 809EXPORT_SYMBOL(cpumask_of_node);
810
811#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
812
813#ifdef CONFIG_MEMORY_HOTPLUG
814int memory_add_physaddr_to_nid(u64 start)
815{
816 struct numa_meminfo *mi = &numa_meminfo;
817 int nid = mi->blk[0].nid;
818 int i;
819
820 for (i = 0; i < mi->nr_blks; i++)
821 if (mi->blk[i].start <= start && mi->blk[i].end > start)
822 nid = mi->blk[i].nid;
823 return nid;
824}
825EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
61#endif 826#endif
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index 809baaaf48b1..849a975d3fa0 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -22,38 +22,11 @@
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. 22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 */ 23 */
24 24
25#include <linux/mm.h>
26#include <linux/bootmem.h> 25#include <linux/bootmem.h>
27#include <linux/mmzone.h> 26#include <linux/memblock.h>
28#include <linux/highmem.h>
29#include <linux/initrd.h>
30#include <linux/nodemask.h>
31#include <linux/module.h> 27#include <linux/module.h>
32#include <linux/kexec.h>
33#include <linux/pfn.h>
34#include <linux/swap.h>
35#include <linux/acpi.h>
36
37#include <asm/e820.h>
38#include <asm/setup.h>
39#include <asm/mmzone.h>
40#include <asm/bios_ebda.h>
41#include <asm/proto.h>
42
43struct pglist_data *node_data[MAX_NUMNODES] __read_mostly;
44EXPORT_SYMBOL(node_data);
45
46/*
47 * numa interface - we expect the numa architecture specific code to have
48 * populated the following initialisation.
49 *
50 * 1) node_online_map - the map of all nodes configured (online) in the system
51 * 2) node_start_pfn - the starting page frame number for a node
52 * 3) node_end_pfn - the ending page fram number for a node
53 */
54unsigned long node_start_pfn[MAX_NUMNODES] __read_mostly;
55unsigned long node_end_pfn[MAX_NUMNODES] __read_mostly;
56 28
29#include "numa_internal.h"
57 30
58#ifdef CONFIG_DISCONTIGMEM 31#ifdef CONFIG_DISCONTIGMEM
59/* 32/*
@@ -98,102 +71,46 @@ unsigned long node_memmap_size_bytes(int nid, unsigned long start_pfn,
98} 71}
99#endif 72#endif
100 73
101extern unsigned long find_max_low_pfn(void);
102extern unsigned long highend_pfn, highstart_pfn; 74extern unsigned long highend_pfn, highstart_pfn;
103 75
104#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE) 76#define LARGE_PAGE_BYTES (PTRS_PER_PTE * PAGE_SIZE)
105 77
106unsigned long node_remap_size[MAX_NUMNODES];
107static void *node_remap_start_vaddr[MAX_NUMNODES]; 78static void *node_remap_start_vaddr[MAX_NUMNODES];
108void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 79void set_pmd_pfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
109 80
110static unsigned long kva_start_pfn;
111static unsigned long kva_pages;
112/*
113 * FLAT - support for basic PC memory model with discontig enabled, essentially
114 * a single node with all available processors in it with a flat
115 * memory map.
116 */
117int __init get_memcfg_numa_flat(void)
118{
119 printk(KERN_DEBUG "NUMA - single node, flat memory mode\n");
120
121 node_start_pfn[0] = 0;
122 node_end_pfn[0] = max_pfn;
123 e820_register_active_regions(0, 0, max_pfn);
124 memory_present(0, 0, max_pfn);
125 node_remap_size[0] = node_memmap_size_bytes(0, 0, max_pfn);
126
127 /* Indicate there is one node available. */
128 nodes_clear(node_online_map);
129 node_set_online(0);
130 return 1;
131}
132
133/*
134 * Find the highest page frame number we have available for the node
135 */
136static void __init propagate_e820_map_node(int nid)
137{
138 if (node_end_pfn[nid] > max_pfn)
139 node_end_pfn[nid] = max_pfn;
140 /*
141 * if a user has given mem=XXXX, then we need to make sure
142 * that the node _starts_ before that, too, not just ends
143 */
144 if (node_start_pfn[nid] > max_pfn)
145 node_start_pfn[nid] = max_pfn;
146 BUG_ON(node_start_pfn[nid] > node_end_pfn[nid]);
147}
148
149/*
150 * Allocate memory for the pg_data_t for this node via a crude pre-bootmem
151 * method. For node zero take this from the bottom of memory, for
152 * subsequent nodes place them at node_remap_start_vaddr which contains
153 * node local data in physically node local memory. See setup_memory()
154 * for details.
155 */
156static void __init allocate_pgdat(int nid)
157{
158 char buf[16];
159
160 if (node_has_online_mem(nid) && node_remap_start_vaddr[nid])
161 NODE_DATA(nid) = (pg_data_t *)node_remap_start_vaddr[nid];
162 else {
163 unsigned long pgdat_phys;
164 pgdat_phys = find_e820_area(min_low_pfn<<PAGE_SHIFT,
165 max_pfn_mapped<<PAGE_SHIFT,
166 sizeof(pg_data_t),
167 PAGE_SIZE);
168 NODE_DATA(nid) = (pg_data_t *)(pfn_to_kaddr(pgdat_phys>>PAGE_SHIFT));
169 memset(buf, 0, sizeof(buf));
170 sprintf(buf, "NODE_DATA %d", nid);
171 reserve_early(pgdat_phys, pgdat_phys + sizeof(pg_data_t), buf);
172 }
173 printk(KERN_DEBUG "allocate_pgdat: node %d NODE_DATA %08lx\n",
174 nid, (unsigned long)NODE_DATA(nid));
175}
176
177/* 81/*
178 * In the DISCONTIGMEM and SPARSEMEM memory model, a portion of the kernel 82 * Remap memory allocator
179 * virtual address space (KVA) is reserved and portions of nodes are mapped
180 * using it. This is to allow node-local memory to be allocated for
181 * structures that would normally require ZONE_NORMAL. The memory is
182 * allocated with alloc_remap() and callers should be prepared to allocate
183 * from the bootmem allocator instead.
184 */ 83 */
185static unsigned long node_remap_start_pfn[MAX_NUMNODES]; 84static unsigned long node_remap_start_pfn[MAX_NUMNODES];
186static void *node_remap_end_vaddr[MAX_NUMNODES]; 85static void *node_remap_end_vaddr[MAX_NUMNODES];
187static void *node_remap_alloc_vaddr[MAX_NUMNODES]; 86static void *node_remap_alloc_vaddr[MAX_NUMNODES];
188static unsigned long node_remap_offset[MAX_NUMNODES];
189 87
88/**
89 * alloc_remap - Allocate remapped memory
90 * @nid: NUMA node to allocate memory from
91 * @size: The size of allocation
92 *
93 * Allocate @size bytes from the remap area of NUMA node @nid. The
94 * size of the remap area is predetermined by init_alloc_remap() and
 	 95 * only the callers accounted for there should use this function. For
96 * more info, please read the comment on top of init_alloc_remap().
97 *
98 * The caller must be ready to handle allocation failure from this
99 * function and fall back to regular memory allocator in such cases.
100 *
101 * CONTEXT:
102 * Single CPU early boot context.
103 *
104 * RETURNS:
105 * Pointer to the allocated memory on success, %NULL on failure.
106 */
190void *alloc_remap(int nid, unsigned long size) 107void *alloc_remap(int nid, unsigned long size)
191{ 108{
192 void *allocation = node_remap_alloc_vaddr[nid]; 109 void *allocation = node_remap_alloc_vaddr[nid];
193 110
194 size = ALIGN(size, L1_CACHE_BYTES); 111 size = ALIGN(size, L1_CACHE_BYTES);
195 112
196 if (!allocation || (allocation + size) >= node_remap_end_vaddr[nid]) 113 if (!allocation || (allocation + size) > node_remap_end_vaddr[nid])
197 return NULL; 114 return NULL;
198 115
199 node_remap_alloc_vaddr[nid] += size; 116 node_remap_alloc_vaddr[nid] += size;
@@ -202,26 +119,6 @@ void *alloc_remap(int nid, unsigned long size)
202 return allocation; 119 return allocation;
203} 120}
204 121
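As the kernel-doc above says, alloc_remap() callers must be prepared for NULL and fall back to a regular early allocator, which is exactly what setup_node_data() in numa.c does in this series. A condensed, hedged sketch of that pattern (nd_low, nd_high and nd_size as defined there):

/* Hedged sketch: try the remap area first, fall back to memblock. */
void *nd = alloc_remap(nid, nd_size);

if (!nd) {
	u64 nd_pa = memblock_x86_find_in_range_node(nid, nd_low, nd_high,
						    nd_size, SMP_CACHE_BYTES);
	if (nd_pa == MEMBLOCK_ERROR)
		return;				/* give up on this node */
	memblock_x86_reserve_range(nd_pa, nd_pa + nd_size, "NODE_DATA");
	nd = __va(nd_pa);
}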
205static void __init remap_numa_kva(void)
206{
207 void *vaddr;
208 unsigned long pfn;
209 int node;
210
211 for_each_online_node(node) {
212 printk(KERN_DEBUG "remap_numa_kva: node %d\n", node);
213 for (pfn=0; pfn < node_remap_size[node]; pfn += PTRS_PER_PTE) {
214 vaddr = node_remap_start_vaddr[node]+(pfn<<PAGE_SHIFT);
215 printk(KERN_DEBUG "remap_numa_kva: %08lx to pfn %08lx\n",
216 (unsigned long)vaddr,
217 node_remap_start_pfn[node] + pfn);
218 set_pmd_pfn((ulong) vaddr,
219 node_remap_start_pfn[node] + pfn,
220 PAGE_KERNEL_LARGE);
221 }
222 }
223}
224
225#ifdef CONFIG_HIBERNATION 122#ifdef CONFIG_HIBERNATION
226/** 123/**
227 * resume_map_numa_kva - add KVA mapping to the temporary page tables created 124 * resume_map_numa_kva - add KVA mapping to the temporary page tables created
@@ -233,15 +130,16 @@ void resume_map_numa_kva(pgd_t *pgd_base)
233 int node; 130 int node;
234 131
235 for_each_online_node(node) { 132 for_each_online_node(node) {
236 unsigned long start_va, start_pfn, size, pfn; 133 unsigned long start_va, start_pfn, nr_pages, pfn;
237 134
238 start_va = (unsigned long)node_remap_start_vaddr[node]; 135 start_va = (unsigned long)node_remap_start_vaddr[node];
239 start_pfn = node_remap_start_pfn[node]; 136 start_pfn = node_remap_start_pfn[node];
240 size = node_remap_size[node]; 137 nr_pages = (node_remap_end_vaddr[node] -
138 node_remap_start_vaddr[node]) >> PAGE_SHIFT;
241 139
242 printk(KERN_DEBUG "%s: node %d\n", __func__, node); 140 printk(KERN_DEBUG "%s: node %d\n", __func__, node);
243 141
244 for (pfn = 0; pfn < size; pfn += PTRS_PER_PTE) { 142 for (pfn = 0; pfn < nr_pages; pfn += PTRS_PER_PTE) {
245 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT); 143 unsigned long vaddr = start_va + (pfn << PAGE_SHIFT);
246 pgd_t *pgd = pgd_base + pgd_index(vaddr); 144 pgd_t *pgd = pgd_base + pgd_index(vaddr);
247 pud_t *pud = pud_offset(pgd, vaddr); 145 pud_t *pud = pud_offset(pgd, vaddr);
@@ -257,134 +155,89 @@ void resume_map_numa_kva(pgd_t *pgd_base)
257} 155}
258#endif 156#endif
259 157
260static __init unsigned long calculate_numa_remap_pages(void) 158/**
261{ 159 * init_alloc_remap - Initialize remap allocator for a NUMA node
262	int nid;			 160	 * @nid: NUMA node to initialize the remap allocator for
263 unsigned long size, reserve_pages = 0; 161 *
264 162 * NUMA nodes may end up without any lowmem. As allocating pgdat and
265 for_each_online_node(nid) { 163 * memmap on a different node with lowmem is inefficient, a special
266 u64 node_kva_target; 164 * remap allocator is implemented which can be used by alloc_remap().
267 u64 node_kva_final; 165 *
268 166 * For each node, the amount of memory which will be necessary for
269 /* 167 * pgdat and memmap is calculated and two memory areas of the size are
270 * The acpi/srat node info can show hot-add memroy zones 168 * allocated - one in the node and the other in lowmem; then, the area
271 * where memory could be added but not currently present. 169 * in the node is remapped to the lowmem area.
272 */ 170 *
273 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n", 171 * As pgdat and memmap must be allocated in lowmem anyway, this
274 nid, node_start_pfn[nid], node_end_pfn[nid]); 172 * doesn't waste lowmem address space; however, the actual lowmem
275 if (node_start_pfn[nid] > max_pfn) 173 * which gets remapped over is wasted. The amount shouldn't be
276 			continue;	 174	 * problematic on machines where this feature will be used.
277 if (!node_end_pfn[nid]) 175 *
278 continue; 176 * Initialization failure isn't fatal. alloc_remap() is used
279 if (node_end_pfn[nid] > max_pfn) 177 * opportunistically and the callers will fall back to other memory
280 node_end_pfn[nid] = max_pfn; 178 * allocation mechanisms on failure.
281 179 */
282 /* ensure the remap includes space for the pgdat. */ 180void __init init_alloc_remap(int nid, u64 start, u64 end)
283 size = node_remap_size[nid] + sizeof(pg_data_t);
284
285 /* convert size to large (pmd size) pages, rounding up */
286 size = (size + LARGE_PAGE_BYTES - 1) / LARGE_PAGE_BYTES;
287 /* now the roundup is correct, convert to PAGE_SIZE pages */
288 size = size * PTRS_PER_PTE;
289
290 node_kva_target = round_down(node_end_pfn[nid] - size,
291 PTRS_PER_PTE);
292 node_kva_target <<= PAGE_SHIFT;
293 do {
294 node_kva_final = find_e820_area(node_kva_target,
295 ((u64)node_end_pfn[nid])<<PAGE_SHIFT,
296 ((u64)size)<<PAGE_SHIFT,
297 LARGE_PAGE_BYTES);
298 node_kva_target -= LARGE_PAGE_BYTES;
299 } while (node_kva_final == -1ULL &&
300 (node_kva_target>>PAGE_SHIFT) > (node_start_pfn[nid]));
301
302 if (node_kva_final == -1ULL)
303 panic("Can not get kva ram\n");
304
305 node_remap_size[nid] = size;
306 node_remap_offset[nid] = reserve_pages;
307 reserve_pages += size;
308 printk(KERN_DEBUG "Reserving %ld pages of KVA for lmem_map of"
309 " node %d at %llx\n",
310 size, nid, node_kva_final>>PAGE_SHIFT);
311
312 /*
313 * prevent kva address below max_low_pfn want it on system
314 * with less memory later.
315 * layout will be: KVA address , KVA RAM
316 *
317 * we are supposed to only record the one less then max_low_pfn
318 * but we could have some hole in high memory, and it will only
319 * check page_is_ram(pfn) && !page_is_reserved_early(pfn) to decide
320 * to use it as free.
321 * So reserve_early here, hope we don't run out of that array
322 */
323 reserve_early(node_kva_final,
324 node_kva_final+(((u64)size)<<PAGE_SHIFT),
325 "KVA RAM");
326
327 node_remap_start_pfn[nid] = node_kva_final>>PAGE_SHIFT;
328 remove_active_range(nid, node_remap_start_pfn[nid],
329 node_remap_start_pfn[nid] + size);
330 }
331 printk(KERN_INFO "Reserving total of %lx pages for numa KVA remap\n",
332 reserve_pages);
333 return reserve_pages;
334}
335
336static void init_remap_allocator(int nid)
337{
338 node_remap_start_vaddr[nid] = pfn_to_kaddr(
339 kva_start_pfn + node_remap_offset[nid]);
340 node_remap_end_vaddr[nid] = node_remap_start_vaddr[nid] +
341 (node_remap_size[nid] * PAGE_SIZE);
342 node_remap_alloc_vaddr[nid] = node_remap_start_vaddr[nid] +
343 ALIGN(sizeof(pg_data_t), PAGE_SIZE);
344
345 printk(KERN_DEBUG "node %d will remap to vaddr %08lx - %08lx\n", nid,
346 (ulong) node_remap_start_vaddr[nid],
347 (ulong) node_remap_end_vaddr[nid]);
348}
349
350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 int acpi, int k8)
352{ 181{
353 int nid; 182 unsigned long start_pfn = start >> PAGE_SHIFT;
354 long kva_target_pfn; 183 unsigned long end_pfn = end >> PAGE_SHIFT;
184 unsigned long size, pfn;
185 u64 node_pa, remap_pa;
186 void *remap_va;
355 187
356 /* 188 /*
357 	 * When mapping a NUMA machine we allocate the node_mem_map arrays	 189	 * The acpi/srat node info can show hot-add memory zones where
358 * from node local memory. They are then mapped directly into KVA 190 * memory could be added but not currently present.
359 * between zone normal and vmalloc space. Calculate the size of
360 * this space and use it to adjust the boundary between ZONE_NORMAL
361 * and ZONE_HIGHMEM.
362 */ 191 */
192 printk(KERN_DEBUG "node %d pfn: [%lx - %lx]\n",
193 nid, start_pfn, end_pfn);
194
195 /* calculate the necessary space aligned to large page size */
196 size = node_memmap_size_bytes(nid, start_pfn, end_pfn);
197 size += ALIGN(sizeof(pg_data_t), PAGE_SIZE);
198 size = ALIGN(size, LARGE_PAGE_BYTES);
199
200 /* allocate node memory and the lowmem remap area */
201 node_pa = memblock_find_in_range(start, end, size, LARGE_PAGE_BYTES);
202 if (node_pa == MEMBLOCK_ERROR) {
203 pr_warning("remap_alloc: failed to allocate %lu bytes for node %d\n",
204 size, nid);
205 return;
206 }
207 memblock_x86_reserve_range(node_pa, node_pa + size, "KVA RAM");
208
209 remap_pa = memblock_find_in_range(min_low_pfn << PAGE_SHIFT,
210 max_low_pfn << PAGE_SHIFT,
211 size, LARGE_PAGE_BYTES);
212 if (remap_pa == MEMBLOCK_ERROR) {
213 pr_warning("remap_alloc: failed to allocate %lu bytes remap area for node %d\n",
214 size, nid);
215 memblock_x86_free_range(node_pa, node_pa + size);
216 return;
217 }
218 memblock_x86_reserve_range(remap_pa, remap_pa + size, "KVA PG");
219 remap_va = phys_to_virt(remap_pa);
220
221 /* perform actual remap */
222 for (pfn = 0; pfn < size >> PAGE_SHIFT; pfn += PTRS_PER_PTE)
223 set_pmd_pfn((unsigned long)remap_va + (pfn << PAGE_SHIFT),
224 (node_pa >> PAGE_SHIFT) + pfn,
225 PAGE_KERNEL_LARGE);
226
227 /* initialize remap allocator parameters */
228 node_remap_start_pfn[nid] = node_pa >> PAGE_SHIFT;
229 node_remap_start_vaddr[nid] = remap_va;
230 node_remap_end_vaddr[nid] = remap_va + size;
231 node_remap_alloc_vaddr[nid] = remap_va;
232
233 printk(KERN_DEBUG "remap_alloc: node %d [%08llx-%08llx) -> [%p-%p)\n",
234 nid, node_pa, node_pa + size, remap_va, remap_va + size);
235}
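
To make the remap area above concrete: the node's pg_data_t and memmap are later carved out of [remap_va, remap_va + size) by advancing a cursor, which is what node_remap_alloc_vaddr tracks. Below is a minimal userspace sketch of that bump-allocation idea; struct remap_area and remap_alloc() are invented names, the sizes are examples, and page alignment is chosen purely for the illustration.

/* Userspace model of the per-node remap bump allocator; illustrative only. */
#include <stdio.h>
#include <stdlib.h>

#define PAGE_SIZE 4096UL
#define ALIGN_UP(x, a) (((x) + (a) - 1) & ~((a) - 1))

struct remap_area {
	char *start, *end, *cursor;	/* models node_remap_{start,end,alloc}_vaddr */
};

static void *remap_alloc(struct remap_area *ra, size_t size)
{
	size = ALIGN_UP(size, PAGE_SIZE);	/* keep allocations page aligned */
	if ((size_t)(ra->end - ra->cursor) < size)
		return NULL;			/* area exhausted */
	void *p = ra->cursor;
	ra->cursor += size;
	return p;
}

int main(void)
{
	size_t area = 16 * PAGE_SIZE;
	struct remap_area ra;

	ra.start = ra.cursor = malloc(area);	/* stands in for the remapped KVA */
	ra.end = ra.start + area;

	void *pgdat  = remap_alloc(&ra, 3000);			/* e.g. the node's pg_data_t */
	void *memmap = remap_alloc(&ra, 10 * PAGE_SIZE);	/* e.g. the node's memmap */
	printf("pgdat=%p memmap=%p used=%zu of %zu bytes\n",
	       pgdat, memmap, (size_t)(ra.cursor - ra.start), area);
	free(ra.start);
	return 0;
}

The in-tree alloc_remap() helper behaves much the same way, handing back NULL once the area is exhausted so callers can fall back to the normal allocator.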
363 236
364 get_memcfg_numa(); 237void __init initmem_init(void)
365 238{
366 kva_pages = roundup(calculate_numa_remap_pages(), PTRS_PER_PTE); 239 x86_numa_init();
367
368 kva_target_pfn = round_down(max_low_pfn - kva_pages, PTRS_PER_PTE);
369 do {
370 kva_start_pfn = find_e820_area(kva_target_pfn<<PAGE_SHIFT,
371 max_low_pfn<<PAGE_SHIFT,
372 kva_pages<<PAGE_SHIFT,
373 PTRS_PER_PTE<<PAGE_SHIFT) >> PAGE_SHIFT;
374 kva_target_pfn -= PTRS_PER_PTE;
375 } while (kva_start_pfn == -1UL && kva_target_pfn > min_low_pfn);
376
377 if (kva_start_pfn == -1UL)
378 panic("Can not get kva space\n");
379
380 printk(KERN_INFO "kva_start_pfn ~ %lx max_low_pfn ~ %lx\n",
381 kva_start_pfn, max_low_pfn);
382 printk(KERN_INFO "max_pfn = %lx\n", max_pfn);
383 240
384 /* avoid clash with initrd */
385 reserve_early(kva_start_pfn<<PAGE_SHIFT,
386 (kva_start_pfn + kva_pages)<<PAGE_SHIFT,
387 "KVA PG");
388#ifdef CONFIG_HIGHMEM 241#ifdef CONFIG_HIGHMEM
389 highstart_pfn = highend_pfn = max_pfn; 242 highstart_pfn = highend_pfn = max_pfn;
390 if (max_pfn > max_low_pfn) 243 if (max_pfn > max_low_pfn)
@@ -404,54 +257,9 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
404 257
405 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n", 258 printk(KERN_DEBUG "Low memory ends at vaddr %08lx\n",
406 (ulong) pfn_to_kaddr(max_low_pfn)); 259 (ulong) pfn_to_kaddr(max_low_pfn));
407 for_each_online_node(nid) {
408 init_remap_allocator(nid);
409
410 allocate_pgdat(nid);
411 }
412 remap_numa_kva();
413 260
414 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n", 261 printk(KERN_DEBUG "High memory starts at vaddr %08lx\n",
415 (ulong) pfn_to_kaddr(highstart_pfn)); 262 (ulong) pfn_to_kaddr(highstart_pfn));
416 for_each_online_node(nid)
417 propagate_e820_map_node(nid);
418
419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
425 }
426 263
427 setup_bootmem_allocator(); 264 setup_bootmem_allocator();
428} 265}
429
430#ifdef CONFIG_MEMORY_HOTPLUG
431static int paddr_to_nid(u64 addr)
432{
433 int nid;
434 unsigned long pfn = PFN_DOWN(addr);
435
436 for_each_node(nid)
437 if (node_start_pfn[nid] <= pfn &&
438 pfn < node_end_pfn[nid])
439 return nid;
440
441 return -1;
442}
443
444/*
 445 * This function is used to look up the node id BEFORE the memmap and
 446 * mem_section are initialized (pfn_to_nid() can't be used yet).
 447 * If _PXM is not defined in the ACPI DSDT, the node id must be found this way.
448 */
449int memory_add_physaddr_to_nid(u64 addr)
450{
451 int nid = paddr_to_nid(addr);
452 return (nid >= 0) ? nid : 0;
453}
454
455EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
456#endif
457
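
The hotplug helper removed above answers "which node owns this physical address" before the memmap exists by a linear scan of the per-node pfn ranges. A standalone sketch of that lookup follows; the node_start_pfn/node_end_pfn values are made-up sample data, not real hardware ranges.

/* Userspace model of the removed paddr_to_nid() range scan; sample data only. */
#include <stdio.h>
#include <stdint.h>

#define MAX_NODES  4
#define PAGE_SHIFT 12

static unsigned long node_start_pfn[MAX_NODES] = { 0x00000, 0x40000 };
static unsigned long node_end_pfn[MAX_NODES]   = { 0x40000, 0x80000 };

static int paddr_to_nid(uint64_t addr)
{
	unsigned long pfn = addr >> PAGE_SHIFT;

	for (int nid = 0; nid < MAX_NODES; nid++)
		if (node_start_pfn[nid] <= pfn && pfn < node_end_pfn[nid])
			return nid;
	return -1;	/* unknown: memory_add_physaddr_to_nid() falls back to 0 */
}

int main(void)
{
	printf("%d\n", paddr_to_nid(0x50000000ULL));	/* pfn 0x50000 -> node 1 */
	printf("%d\n", paddr_to_nid(0x1000));		/* pfn 0x1     -> node 0 */
	printf("%d\n", paddr_to_nid(0x900000000ULL));	/* outside map -> -1     */
	return 0;
}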
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index a7bcc23ef96c..dd27f401f0a0 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -2,697 +2,13 @@
2 * Generic VM initialization for x86-64 NUMA setups. 2 * Generic VM initialization for x86-64 NUMA setups.
3 * Copyright 2002,2003 Andi Kleen, SuSE Labs. 3 * Copyright 2002,2003 Andi Kleen, SuSE Labs.
4 */ 4 */
5#include <linux/kernel.h>
6#include <linux/mm.h>
7#include <linux/string.h>
8#include <linux/init.h>
9#include <linux/bootmem.h> 5#include <linux/bootmem.h>
10#include <linux/mmzone.h>
11#include <linux/ctype.h>
12#include <linux/module.h>
13#include <linux/nodemask.h>
14#include <linux/sched.h>
15 6
16#include <asm/e820.h> 7#include "numa_internal.h"
17#include <asm/proto.h>
18#include <asm/dma.h>
19#include <asm/numa.h>
20#include <asm/acpi.h>
21#include <asm/k8.h>
22 8
23struct pglist_data *node_data[MAX_NUMNODES] __read_mostly; 9void __init initmem_init(void)
24EXPORT_SYMBOL(node_data);
25
26struct memnode memnode;
27
28s16 apicid_to_node[MAX_LOCAL_APIC] __cpuinitdata = {
29 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
30};
31
32int numa_off __initdata;
33static unsigned long __initdata nodemap_addr;
34static unsigned long __initdata nodemap_size;
35
36/*
37 * Map cpu index to node index
38 */
39DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
40EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);
41
42/*
43 * Given a shift value, try to populate memnodemap[]
44 * Returns :
45 * 1 if OK
 46 * 0 if memnodemap[] too small (or shift too small)
47 * -1 if node overlap or lost ram (shift too big)
48 */
49static int __init populate_memnodemap(const struct bootnode *nodes,
50 int numnodes, int shift, int *nodeids)
51{
52 unsigned long addr, end;
53 int i, res = -1;
54
55 memset(memnodemap, 0xff, sizeof(s16)*memnodemapsize);
56 for (i = 0; i < numnodes; i++) {
57 addr = nodes[i].start;
58 end = nodes[i].end;
59 if (addr >= end)
60 continue;
61 if ((end >> shift) >= memnodemapsize)
62 return 0;
63 do {
64 if (memnodemap[addr >> shift] != NUMA_NO_NODE)
65 return -1;
66
67 if (!nodeids)
68 memnodemap[addr >> shift] = i;
69 else
70 memnodemap[addr >> shift] = nodeids[i];
71
72 addr += (1UL << shift);
73 } while (addr < end);
74 res = 1;
75 }
76 return res;
77}
78
79static int __init allocate_cachealigned_memnodemap(void)
80{
81 unsigned long addr;
82
83 memnodemap = memnode.embedded_map;
84 if (memnodemapsize <= ARRAY_SIZE(memnode.embedded_map))
85 return 0;
86
87 addr = 0x8000;
88 nodemap_size = roundup(sizeof(s16) * memnodemapsize, L1_CACHE_BYTES);
89 nodemap_addr = find_e820_area(addr, max_pfn<<PAGE_SHIFT,
90 nodemap_size, L1_CACHE_BYTES);
91 if (nodemap_addr == -1UL) {
92 printk(KERN_ERR
93 "NUMA: Unable to allocate Memory to Node hash map\n");
94 nodemap_addr = nodemap_size = 0;
95 return -1;
96 }
97 memnodemap = phys_to_virt(nodemap_addr);
98 reserve_early(nodemap_addr, nodemap_addr + nodemap_size, "MEMNODEMAP");
99
100 printk(KERN_DEBUG "NUMA: Allocated memnodemap from %lx - %lx\n",
101 nodemap_addr, nodemap_addr + nodemap_size);
102 return 0;
103}
104
105/*
106 * The LSB of all start and end addresses in the node map is the value of the
107 * maximum possible shift.
108 */
109static int __init extract_lsb_from_nodes(const struct bootnode *nodes,
110 int numnodes)
111{
112 int i, nodes_used = 0;
113 unsigned long start, end;
114 unsigned long bitfield = 0, memtop = 0;
115
116 for (i = 0; i < numnodes; i++) {
117 start = nodes[i].start;
118 end = nodes[i].end;
119 if (start >= end)
120 continue;
121 bitfield |= start;
122 nodes_used++;
123 if (end > memtop)
124 memtop = end;
125 }
126 if (nodes_used <= 1)
127 i = 63;
128 else
129 i = find_first_bit(&bitfield, sizeof(unsigned long)*8);
130 memnodemapsize = (memtop >> i)+1;
131 return i;
132}
133
134int __init compute_hash_shift(struct bootnode *nodes, int numnodes,
135 int *nodeids)
136{
137 int shift;
138
139 shift = extract_lsb_from_nodes(nodes, numnodes);
140 if (allocate_cachealigned_memnodemap())
141 return -1;
142 printk(KERN_DEBUG "NUMA: Using %d for the hash shift.\n",
143 shift);
144
145 if (populate_memnodemap(nodes, numnodes, shift, nodeids) != 1) {
 146 printk(KERN_INFO "Your memory is not aligned; you need to "
 147 "rebuild your kernel with a bigger NODEMAPSIZE, "
148 "shift=%d\n", shift);
149 return -1;
150 }
151 return shift;
152}
153
154int __meminit __early_pfn_to_nid(unsigned long pfn)
155{
156 return phys_to_nid(pfn << PAGE_SHIFT);
157}
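
The memnodemap machinery being deleted resolves phys_to_nid() with a single array index: every node boundary is a multiple of 1 << memnode_shift (the shift comes from the lowest set bit across the node boundaries), so addr >> shift lands directly in a small table. A self-contained model of that lookup under an assumed two-node layout; SHIFT, MAP_SIZE and the ranges are example values, not the kernel's.

/* Userspace model of the memnodemap hash lookup; sample node layout. */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define SHIFT    27		/* 128 MB granularity for this example */
#define MAP_SIZE 64		/* covers 8 GB at that granularity */

static int8_t memnodemap[MAP_SIZE];

struct range { uint64_t start, end; };

static void populate(const struct range *nodes, int n)
{
	memset(memnodemap, -1, sizeof(memnodemap));
	for (int i = 0; i < n; i++)
		for (uint64_t a = nodes[i].start; a < nodes[i].end; a += 1ULL << SHIFT)
			memnodemap[a >> SHIFT] = i;	/* one entry per 128 MB chunk */
}

static int phys_to_nid(uint64_t addr)
{
	return memnodemap[addr >> SHIFT];		/* O(1) lookup */
}

int main(void)
{
	struct range nodes[] = {
		{ 0x000000000ULL, 0x100000000ULL },	/* node 0: 0-4 GB */
		{ 0x100000000ULL, 0x180000000ULL },	/* node 1: 4-6 GB */
	};

	populate(nodes, 2);
	printf("0x%x -> node %d\n", 0x1000, phys_to_nid(0x1000));
	printf("5 GB  -> node %d\n", phys_to_nid(0x140000000ULL));
	return 0;
}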
158
159static void * __init early_node_mem(int nodeid, unsigned long start,
160 unsigned long end, unsigned long size,
161 unsigned long align)
162{
163 unsigned long mem;
164
165 /*
 166 * put it as high as possible;
 167 * other early allocations will go next to NODE_DATA
168 */
169 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
170 start = MAX_DMA_PFN<<PAGE_SHIFT;
171 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
172 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
173 start = MAX_DMA32_PFN<<PAGE_SHIFT;
174 mem = find_e820_area(start, end, size, align);
175 if (mem != -1L)
176 return __va(mem);
177
178 /* extend the search scope */
179 end = max_pfn_mapped << PAGE_SHIFT;
180 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
181 start = MAX_DMA32_PFN<<PAGE_SHIFT;
182 else
183 start = MAX_DMA_PFN<<PAGE_SHIFT;
184 mem = find_e820_area(start, end, size, align);
185 if (mem != -1L)
186 return __va(mem);
187
188 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
189 size, nodeid);
190
191 return NULL;
192}
193
194/* Initialize bootmem allocator for a node */
195void __init
196setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
197{
198 unsigned long start_pfn, last_pfn, nodedata_phys;
199 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
200 int nid;
201#ifndef CONFIG_NO_BOOTMEM
202 unsigned long bootmap_start, bootmap_pages, bootmap_size;
203 void *bootmap;
204#endif
205
206 if (!end)
207 return;
208
209 /*
210 * Don't confuse VM with a node that doesn't have the
211 * minimum amount of memory:
212 */
213 if (end && (end - start) < NODE_MIN_SIZE)
214 return;
215
216 start = roundup(start, ZONE_ALIGN);
217
218 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
219 start, end);
220
221 start_pfn = start >> PAGE_SHIFT;
222 last_pfn = end >> PAGE_SHIFT;
223
224 node_data[nodeid] = early_node_mem(nodeid, start, end, pgdat_size,
225 SMP_CACHE_BYTES);
226 if (node_data[nodeid] == NULL)
227 return;
228 nodedata_phys = __pa(node_data[nodeid]);
229 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
230 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
231 nodedata_phys + pgdat_size - 1);
232 nid = phys_to_nid(nodedata_phys);
233 if (nid != nodeid)
234 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
235
236 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
237 NODE_DATA(nodeid)->node_id = nodeid;
238 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
239 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
240
241#ifndef CONFIG_NO_BOOTMEM
242 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
243
244 /*
245 * Find a place for the bootmem map
 246 * nodedata_phys could end up on another node via alloc_bootmem,
 247 * so make sure bootmap_start is not too low; otherwise
 248 * early_node_mem will grab it with find_e820_area instead
 249 * of alloc_bootmem, which could clash with a reserved range
250 */
251 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
252 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
253 /*
 254 * SMP_CACHE_BYTES could be enough, but init_bootmem_node likes
 255 * to have it aligned to PAGE_SIZE
256 */
257 bootmap = early_node_mem(nodeid, bootmap_start, end,
258 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
259 if (bootmap == NULL) {
260 free_early(nodedata_phys, nodedata_phys + pgdat_size);
261 node_data[nodeid] = NULL;
262 return;
263 }
264 bootmap_start = __pa(bootmap);
265 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
266 "BOOTMAP");
267
268 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
269 bootmap_start >> PAGE_SHIFT,
270 start_pfn, last_pfn);
271
272 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
273 bootmap_start, bootmap_start + bootmap_size - 1,
274 bootmap_pages);
275 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278
279 free_bootmem_with_active_regions(nodeid, end);
280#endif
281
282 node_set_online(nodeid);
283}
284
285/*
286 * There are unfortunately some poorly designed mainboards around that
287 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
288 * mapping. To avoid this fill in the mapping for all possible CPUs,
289 * as the number of CPUs is not known yet. We round robin the existing
290 * nodes.
291 */
292void __init numa_init_array(void)
293{
294 int rr, i;
295
296 rr = first_node(node_online_map);
297 for (i = 0; i < nr_cpu_ids; i++) {
298 if (early_cpu_to_node(i) != NUMA_NO_NODE)
299 continue;
300 numa_set_node(i, rr);
301 rr = next_node(rr, node_online_map);
302 if (rr == MAX_NUMNODES)
303 rr = first_node(node_online_map);
304 }
305}
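
The round-robin fallback described in the comment above is easy to picture in isolation; here is a tiny standalone model, with an invented node list and CPU count.

/* Standalone model of the round-robin cpu->node fallback; values invented. */
#include <stdio.h>

int main(void)
{
	int online_nodes[] = { 0, 1 };	/* nodes that actually have memory */
	int nr_nodes = 2, nr_cpus = 8;
	int rr = 0;

	for (int cpu = 0; cpu < nr_cpus; cpu++) {
		printf("cpu %d -> node %d\n", cpu, online_nodes[rr]);
		rr = (rr + 1) % nr_nodes;	/* wrap around the online nodes */
	}
	return 0;
}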
306
307#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */
309static struct bootnode nodes[MAX_NUMNODES] __initdata;
310static struct bootnode physnodes[MAX_NUMNODES] __initdata;
311static char *cmdline __initdata;
312
313static int __init setup_physnodes(unsigned long start, unsigned long end,
314 int acpi, int k8)
315{
316 int nr_nodes = 0;
317 int ret = 0;
318 int i;
319
320#ifdef CONFIG_ACPI_NUMA
321 if (acpi)
322 nr_nodes = acpi_get_nodes(physnodes);
323#endif
324#ifdef CONFIG_K8_NUMA
325 if (k8)
326 nr_nodes = k8_get_nodes(physnodes);
327#endif
328 /*
329 * Basic sanity checking on the physical node map: there may be errors
330 * if the SRAT or K8 incorrectly reported the topology or the mem=
331 * kernel parameter is used.
332 */
333 for (i = 0; i < nr_nodes; i++) {
334 if (physnodes[i].start == physnodes[i].end)
335 continue;
336 if (physnodes[i].start > end) {
337 physnodes[i].end = physnodes[i].start;
338 continue;
339 }
340 if (physnodes[i].end < start) {
341 physnodes[i].start = physnodes[i].end;
342 continue;
343 }
344 if (physnodes[i].start < start)
345 physnodes[i].start = start;
346 if (physnodes[i].end > end)
347 physnodes[i].end = end;
348 }
349
350 /*
351 * Remove all nodes that have no memory or were truncated because of the
352 * limited address range.
353 */
354 for (i = 0; i < nr_nodes; i++) {
355 if (physnodes[i].start == physnodes[i].end)
356 continue;
357 physnodes[ret].start = physnodes[i].start;
358 physnodes[ret].end = physnodes[i].end;
359 ret++;
360 }
361
362 /*
363 * If no physical topology was detected, a single node is faked to cover
364 * the entire address space.
365 */
366 if (!ret) {
367 physnodes[ret].start = start;
368 physnodes[ret].end = end;
369 ret = 1;
370 }
371 return ret;
372}
373
374/*
 375 * Sets up nid to range from addr to addr + size. If the end
376 * boundary is greater than max_addr, then max_addr is used instead.
377 * The return value is 0 if there is additional memory left for
378 * allocation past addr and -1 otherwise. addr is adjusted to be at
379 * the end of the node.
380 */
381static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
382{
383 int ret = 0;
384 nodes[nid].start = *addr;
385 *addr += size;
386 if (*addr >= max_addr) {
387 *addr = max_addr;
388 ret = -1;
389 }
390 nodes[nid].end = *addr;
391 node_set(nid, node_possible_map);
392 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
393 nodes[nid].start, nodes[nid].end,
394 (nodes[nid].end - nodes[nid].start) >> 20);
395 return ret;
396}
397
398/*
399 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
400 * to max_addr. The return value is the number of nodes allocated.
401 */
402static int __init split_nodes_interleave(u64 addr, u64 max_addr,
403 int nr_phys_nodes, int nr_nodes)
404{ 10{
405 nodemask_t physnode_mask = NODE_MASK_NONE; 11 x86_numa_init();
406 u64 size;
407 int big;
408 int ret = 0;
409 int i;
410
411 if (nr_nodes <= 0)
412 return -1;
413 if (nr_nodes > MAX_NUMNODES) {
414 pr_info("numa=fake=%d too large, reducing to %d\n",
415 nr_nodes, MAX_NUMNODES);
416 nr_nodes = MAX_NUMNODES;
417 }
418
419 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
420 /*
421 * Calculate the number of big nodes that can be allocated as a result
422 * of consolidating the remainder.
423 */
424 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
425 FAKE_NODE_MIN_SIZE;
426
427 size &= FAKE_NODE_MIN_HASH_MASK;
428 if (!size) {
429 pr_err("Not enough memory for each node. "
430 "NUMA emulation disabled.\n");
431 return -1;
432 }
433
434 for (i = 0; i < nr_phys_nodes; i++)
435 if (physnodes[i].start != physnodes[i].end)
436 node_set(i, physnode_mask);
437
438 /*
439 * Continue to fill physical nodes with fake nodes until there is no
440 * memory left on any of them.
441 */
442 while (nodes_weight(physnode_mask)) {
443 for_each_node_mask(i, physnode_mask) {
444 u64 end = physnodes[i].start + size;
445 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
446
447 if (ret < big)
448 end += FAKE_NODE_MIN_SIZE;
449
450 /*
451 * Continue to add memory to this fake node if its
452 * non-reserved memory is less than the per-node size.
453 */
454 while (end - physnodes[i].start -
455 e820_hole_size(physnodes[i].start, end) < size) {
456 end += FAKE_NODE_MIN_SIZE;
457 if (end > physnodes[i].end) {
458 end = physnodes[i].end;
459 break;
460 }
461 }
462
463 /*
464 * If there won't be at least FAKE_NODE_MIN_SIZE of
465 * non-reserved memory in ZONE_DMA32 for the next node,
466 * this one must extend to the boundary.
467 */
468 if (end < dma32_end && dma32_end - end -
469 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
470 end = dma32_end;
471
472 /*
473 * If there won't be enough non-reserved memory for the
474 * next node, this one must extend to the end of the
475 * physical node.
476 */
477 if (physnodes[i].end - end -
478 e820_hole_size(end, physnodes[i].end) < size)
479 end = physnodes[i].end;
480
481 /*
482 * Avoid allocating more nodes than requested, which can
483 * happen as a result of rounding down each node's size
484 * to FAKE_NODE_MIN_SIZE.
485 */
486 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
487 end = physnodes[i].end;
488
489 if (setup_node_range(ret++, &physnodes[i].start,
490 end - physnodes[i].start,
491 physnodes[i].end) < 0)
492 node_clear(i, physnode_mask);
493 }
494 }
495 return ret;
496}
497
498/*
499 * Returns the end address of a node so that there is at least `size' amount of
500 * non-reserved memory or `max_addr' is reached.
501 */
502static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
503{
504 u64 end = start + size;
505
506 while (end - start - e820_hole_size(start, end) < size) {
507 end += FAKE_NODE_MIN_SIZE;
508 if (end > max_addr) {
509 end = max_addr;
510 break;
511 }
512 }
513 return end;
514}
515
516/*
517 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
518 * `addr' to `max_addr'. The return value is the number of nodes allocated.
519 */
520static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
521{
522 nodemask_t physnode_mask = NODE_MASK_NONE;
523 u64 min_size;
524 int ret = 0;
525 int i;
526
527 if (!size)
528 return -1;
529 /*
530 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
531 * increased accordingly if the requested size is too small. This
532 * creates a uniform distribution of node sizes across the entire
533 * machine (but not necessarily over physical nodes).
534 */
535 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
536 MAX_NUMNODES;
537 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
538 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
539 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
540 FAKE_NODE_MIN_HASH_MASK;
541 if (size < min_size) {
542 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
543 size >> 20, min_size >> 20);
544 size = min_size;
545 }
546 size &= FAKE_NODE_MIN_HASH_MASK;
547
548 for (i = 0; i < MAX_NUMNODES; i++)
549 if (physnodes[i].start != physnodes[i].end)
550 node_set(i, physnode_mask);
551 /*
552 * Fill physical nodes with fake nodes of size until there is no memory
553 * left on any of them.
554 */
555 while (nodes_weight(physnode_mask)) {
556 for_each_node_mask(i, physnode_mask) {
557 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
558 u64 end;
559
560 end = find_end_of_node(physnodes[i].start,
561 physnodes[i].end, size);
562 /*
563 * If there won't be at least FAKE_NODE_MIN_SIZE of
564 * non-reserved memory in ZONE_DMA32 for the next node,
565 * this one must extend to the boundary.
566 */
567 if (end < dma32_end && dma32_end - end -
568 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
569 end = dma32_end;
570
571 /*
572 * If there won't be enough non-reserved memory for the
573 * next node, this one must extend to the end of the
574 * physical node.
575 */
576 if (physnodes[i].end - end -
577 e820_hole_size(end, physnodes[i].end) < size)
578 end = physnodes[i].end;
579
580 /*
581 * Setup the fake node that will be allocated as bootmem
582 * later. If setup_node_range() returns non-zero, there
583 * is no more memory available on this physical node.
584 */
585 if (setup_node_range(ret++, &physnodes[i].start,
586 end - physnodes[i].start,
587 physnodes[i].end) < 0)
588 node_clear(i, physnode_mask);
589 }
590 }
591 return ret;
592}
593
594/*
595 * Sets up the system RAM area from start_pfn to last_pfn according to the
596 * numa=fake command-line option.
597 */
598static int __init numa_emulation(unsigned long start_pfn,
599 unsigned long last_pfn, int acpi, int k8)
600{
601 u64 addr = start_pfn << PAGE_SHIFT;
602 u64 max_addr = last_pfn << PAGE_SHIFT;
603 int num_phys_nodes;
604 int num_nodes;
605 int i;
606
607 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
608 /*
609 * If the numa=fake command-line contains a 'M' or 'G', it represents
610 * the fixed node size. Otherwise, if it is just a single number N,
611 * split the system RAM into N fake nodes.
612 */
613 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
614 u64 size;
615
616 size = memparse(cmdline, &cmdline);
617 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
618 } else {
619 unsigned long n;
620
621 n = simple_strtoul(cmdline, NULL, 0);
622 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
623 }
624
625 if (num_nodes < 0)
626 return num_nodes;
627 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
628 if (memnode_shift < 0) {
629 memnode_shift = 0;
630 printk(KERN_ERR "No NUMA hash function found. NUMA emulation "
631 "disabled.\n");
632 return -1;
633 }
634
635 /*
636 * We need to vacate all active ranges that may have been registered for
637 * the e820 memory map.
638 */
639 remove_all_active_ranges();
640 for_each_node_mask(i, node_possible_map) {
641 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
642 nodes[i].end >> PAGE_SHIFT);
643 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
644 }
645 acpi_fake_nodes(nodes, num_nodes);
646 numa_init_array();
647 return 0;
648}
649#endif /* CONFIG_NUMA_EMU */
650
651void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
652 int acpi, int k8)
653{
654 int i;
655
656 nodes_clear(node_possible_map);
657 nodes_clear(node_online_map);
658
659#ifdef CONFIG_NUMA_EMU
660 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
661 return;
662 nodes_clear(node_possible_map);
663 nodes_clear(node_online_map);
664#endif
665
666#ifdef CONFIG_ACPI_NUMA
667 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
668 last_pfn << PAGE_SHIFT))
669 return;
670 nodes_clear(node_possible_map);
671 nodes_clear(node_online_map);
672#endif
673
674#ifdef CONFIG_K8_NUMA
675 if (!numa_off && k8 && !k8_scan_nodes())
676 return;
677 nodes_clear(node_possible_map);
678 nodes_clear(node_online_map);
679#endif
680 printk(KERN_INFO "%s\n",
681 numa_off ? "NUMA turned off" : "No NUMA configuration found");
682
683 printk(KERN_INFO "Faking a node at %016lx-%016lx\n",
684 start_pfn << PAGE_SHIFT,
685 last_pfn << PAGE_SHIFT);
686 /* setup dummy node covering all memory */
687 memnode_shift = 63;
688 memnodemap = memnode.embedded_map;
689 memnodemap[0] = 0;
690 node_set_online(0);
691 node_set(0, node_possible_map);
692 for (i = 0; i < nr_cpu_ids; i++)
693 numa_set_node(i, 0);
694 e820_register_active_regions(0, start_pfn, last_pfn);
695 setup_node_bootmem(0, start_pfn << PAGE_SHIFT, last_pfn << PAGE_SHIFT);
696} 12}
697 13
698unsigned long __init numa_free_all_bootmem(void) 14unsigned long __init numa_free_all_bootmem(void)
@@ -703,199 +19,7 @@ unsigned long __init numa_free_all_bootmem(void)
703 for_each_online_node(i) 19 for_each_online_node(i)
704 pages += free_all_bootmem_node(NODE_DATA(i)); 20 pages += free_all_bootmem_node(NODE_DATA(i));
705 21
706#ifdef CONFIG_NO_BOOTMEM
707 pages += free_all_memory_core_early(MAX_NUMNODES); 22 pages += free_all_memory_core_early(MAX_NUMNODES);
708#endif
709 23
710 return pages; 24 return pages;
711} 25}
712
713static __init int numa_setup(char *opt)
714{
715 if (!opt)
716 return -EINVAL;
717 if (!strncmp(opt, "off", 3))
718 numa_off = 1;
719#ifdef CONFIG_NUMA_EMU
720 if (!strncmp(opt, "fake=", 5))
721 cmdline = opt + 5;
722#endif
723#ifdef CONFIG_ACPI_NUMA
724 if (!strncmp(opt, "noacpi", 6))
725 acpi_numa = -1;
726#endif
727 return 0;
728}
729early_param("numa", numa_setup);
730
731#ifdef CONFIG_NUMA
732
733static __init int find_near_online_node(int node)
734{
735 int n, val;
736 int min_val = INT_MAX;
737 int best_node = -1;
738
739 for_each_online_node(n) {
740 val = node_distance(node, n);
741
742 if (val < min_val) {
743 min_val = val;
744 best_node = n;
745 }
746 }
747
748 return best_node;
749}
750
751/*
752 * Setup early cpu_to_node.
753 *
754 * Populate cpu_to_node[] only if x86_cpu_to_apicid[],
755 * and apicid_to_node[] tables have valid entries for a CPU.
756 * This means we skip cpu_to_node[] initialisation for NUMA
757 * emulation and faking node case (when running a kernel compiled
758 * for NUMA on a non NUMA box), which is OK as cpu_to_node[]
759 * is already initialized in a round robin manner at numa_init_array,
760 * prior to this call, and this initialization is good enough
761 * for the fake NUMA cases.
762 *
763 * Called before the per_cpu areas are setup.
764 */
765void __init init_cpu_to_node(void)
766{
767 int cpu;
768 u16 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);
769
770 BUG_ON(cpu_to_apicid == NULL);
771
772 for_each_possible_cpu(cpu) {
773 int node;
774 u16 apicid = cpu_to_apicid[cpu];
775
776 if (apicid == BAD_APICID)
777 continue;
778 node = apicid_to_node[apicid];
779 if (node == NUMA_NO_NODE)
780 continue;
781 if (!node_online(node))
782 node = find_near_online_node(node);
783 numa_set_node(cpu, node);
784 }
785}
786#endif
787
788
789void __cpuinit numa_set_node(int cpu, int node)
790{
791 int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);
792
793 /* early setting, no percpu area yet */
794 if (cpu_to_node_map) {
795 cpu_to_node_map[cpu] = node;
796 return;
797 }
798
799#ifdef CONFIG_DEBUG_PER_CPU_MAPS
800 if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
801 printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
802 dump_stack();
803 return;
804 }
805#endif
806 per_cpu(x86_cpu_to_node_map, cpu) = node;
807
808 if (node != NUMA_NO_NODE)
809 set_cpu_numa_node(cpu, node);
810}
811
812void __cpuinit numa_clear_node(int cpu)
813{
814 numa_set_node(cpu, NUMA_NO_NODE);
815}
816
817#ifndef CONFIG_DEBUG_PER_CPU_MAPS
818
819void __cpuinit numa_add_cpu(int cpu)
820{
821 cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
822}
823
824void __cpuinit numa_remove_cpu(int cpu)
825{
826 cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
827}
828
829#else /* CONFIG_DEBUG_PER_CPU_MAPS */
830
831/*
832 * --------- debug versions of the numa functions ---------
833 */
834static void __cpuinit numa_set_cpumask(int cpu, int enable)
835{
836 int node = early_cpu_to_node(cpu);
837 struct cpumask *mask;
838 char buf[64];
839
840 mask = node_to_cpumask_map[node];
841 if (mask == NULL) {
842 printk(KERN_ERR "node_to_cpumask_map[%i] NULL\n", node);
843 dump_stack();
844 return;
845 }
846
847 if (enable)
848 cpumask_set_cpu(cpu, mask);
849 else
850 cpumask_clear_cpu(cpu, mask);
851
852 cpulist_scnprintf(buf, sizeof(buf), mask);
853 printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n",
854 enable ? "numa_add_cpu" : "numa_remove_cpu", cpu, node, buf);
855}
856
857void __cpuinit numa_add_cpu(int cpu)
858{
859 numa_set_cpumask(cpu, 1);
860}
861
862void __cpuinit numa_remove_cpu(int cpu)
863{
864 numa_set_cpumask(cpu, 0);
865}
866
867int __cpu_to_node(int cpu)
868{
869 if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
870 printk(KERN_WARNING
871 "cpu_to_node(%d): usage too early!\n", cpu);
872 dump_stack();
873 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
874 }
875 return per_cpu(x86_cpu_to_node_map, cpu);
876}
877EXPORT_SYMBOL(__cpu_to_node);
878
879/*
880 * Same function as cpu_to_node() but used if called before the
881 * per_cpu areas are setup.
882 */
883int early_cpu_to_node(int cpu)
884{
885 if (early_per_cpu_ptr(x86_cpu_to_node_map))
886 return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
887
888 if (!cpu_possible(cpu)) {
889 printk(KERN_WARNING
890 "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
891 dump_stack();
892 return NUMA_NO_NODE;
893 }
894 return per_cpu(x86_cpu_to_node_map, cpu);
895}
896
897/*
898 * --------- end of debug versions of the numa functions ---------
899 */
900
901#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_emulation.c b/arch/x86/mm/numa_emulation.c
new file mode 100644
index 000000000000..d0ed086b6247
--- /dev/null
+++ b/arch/x86/mm/numa_emulation.c
@@ -0,0 +1,492 @@
1/*
2 * NUMA emulation
3 */
4#include <linux/kernel.h>
5#include <linux/errno.h>
6#include <linux/topology.h>
7#include <linux/memblock.h>
8#include <linux/bootmem.h>
9#include <asm/dma.h>
10
11#include "numa_internal.h"
12
13static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
14static char *emu_cmdline __initdata;
15
16void __init numa_emu_cmdline(char *str)
17{
18 emu_cmdline = str;
19}
20
21static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
22{
23 int i;
24
25 for (i = 0; i < mi->nr_blks; i++)
26 if (mi->blk[i].nid == nid)
27 return i;
28 return -ENOENT;
29}
30
31/*
32 * Sets up nid to range from @start to @end. The return value is -errno if
33 * something went wrong, 0 otherwise.
34 */
35static int __init emu_setup_memblk(struct numa_meminfo *ei,
36 struct numa_meminfo *pi,
37 int nid, int phys_blk, u64 size)
38{
39 struct numa_memblk *eb = &ei->blk[ei->nr_blks];
40 struct numa_memblk *pb = &pi->blk[phys_blk];
41
42 if (ei->nr_blks >= NR_NODE_MEMBLKS) {
43 pr_err("NUMA: Too many emulated memblks, failing emulation\n");
44 return -EINVAL;
45 }
46
47 ei->nr_blks++;
48 eb->start = pb->start;
49 eb->end = pb->start + size;
50 eb->nid = nid;
51
52 if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
53 emu_nid_to_phys[nid] = pb->nid;
54
55 pb->start += size;
56 if (pb->start >= pb->end) {
57 WARN_ON_ONCE(pb->start > pb->end);
58 numa_remove_memblk_from(phys_blk, pi);
59 }
60
61 printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
62 eb->start, eb->end, (eb->end - eb->start) >> 20);
63 return 0;
64}
65
66/*
67 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
68 * to max_addr. The return value is the number of nodes allocated.
69 */
70static int __init split_nodes_interleave(struct numa_meminfo *ei,
71 struct numa_meminfo *pi,
72 u64 addr, u64 max_addr, int nr_nodes)
73{
74 nodemask_t physnode_mask = NODE_MASK_NONE;
75 u64 size;
76 int big;
77 int nid = 0;
78 int i, ret;
79
80 if (nr_nodes <= 0)
81 return -1;
82 if (nr_nodes > MAX_NUMNODES) {
83 pr_info("numa=fake=%d too large, reducing to %d\n",
84 nr_nodes, MAX_NUMNODES);
85 nr_nodes = MAX_NUMNODES;
86 }
87
88 /*
89 * Calculate target node size. x86_32 freaks on __udivdi3() so do
90 * the division in ulong number of pages and convert back.
91 */
92 size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
93 size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);
94
95 /*
96 * Calculate the number of big nodes that can be allocated as a result
97 * of consolidating the remainder.
98 */
99 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
100 FAKE_NODE_MIN_SIZE;
101
102 size &= FAKE_NODE_MIN_HASH_MASK;
103 if (!size) {
104 pr_err("Not enough memory for each node. "
105 "NUMA emulation disabled.\n");
106 return -1;
107 }
108
109 for (i = 0; i < pi->nr_blks; i++)
110 node_set(pi->blk[i].nid, physnode_mask);
111
112 /*
113 * Continue to fill physical nodes with fake nodes until there is no
114 * memory left on any of them.
115 */
116 while (nodes_weight(physnode_mask)) {
117 for_each_node_mask(i, physnode_mask) {
118 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
119 u64 start, limit, end;
120 int phys_blk;
121
122 phys_blk = emu_find_memblk_by_nid(i, pi);
123 if (phys_blk < 0) {
124 node_clear(i, physnode_mask);
125 continue;
126 }
127 start = pi->blk[phys_blk].start;
128 limit = pi->blk[phys_blk].end;
129 end = start + size;
130
131 if (nid < big)
132 end += FAKE_NODE_MIN_SIZE;
133
134 /*
135 * Continue to add memory to this fake node if its
136 * non-reserved memory is less than the per-node size.
137 */
138 while (end - start -
139 memblock_x86_hole_size(start, end) < size) {
140 end += FAKE_NODE_MIN_SIZE;
141 if (end > limit) {
142 end = limit;
143 break;
144 }
145 }
146
147 /*
148 * If there won't be at least FAKE_NODE_MIN_SIZE of
149 * non-reserved memory in ZONE_DMA32 for the next node,
150 * this one must extend to the boundary.
151 */
152 if (end < dma32_end && dma32_end - end -
153 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
154 end = dma32_end;
155
156 /*
157 * If there won't be enough non-reserved memory for the
158 * next node, this one must extend to the end of the
159 * physical node.
160 */
161 if (limit - end -
162 memblock_x86_hole_size(end, limit) < size)
163 end = limit;
164
165 ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
166 phys_blk,
167 min(end, limit) - start);
168 if (ret < 0)
169 return ret;
170 }
171 }
172 return 0;
173}
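
The size/big computation near the top of split_nodes_interleave() is the subtle part: the per-node target is computed in pages (the 32-bit build cannot use 64-bit division directly), rounded down to the minimum node granularity, and the rounding remainder is handed back to the first "big" nodes. A standalone model of just that arithmetic follows; the 10 GB figure and the numa=fake=7 request are invented inputs, and a 64 MB minimum chunk is assumed purely for illustration.

/* Standalone model of the fake-node size arithmetic; inputs are examples. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define FAKE_NODE_MIN_SIZE	(64ULL << 20)	/* assumed 64 MB chunk for the example */
#define FAKE_NODE_MIN_HASH_MASK	(~(FAKE_NODE_MIN_SIZE - 1))

int main(void)
{
	uint64_t usable = 10ULL << 30;	/* 10 GB of non-hole memory */
	int nr_nodes = 7;		/* as if booted with numa=fake=7 */

	/* division done in pages, as the 32-bit build requires */
	uint64_t size = ((usable >> PAGE_SHIFT) / nr_nodes) << PAGE_SHIFT;

	/* nodes that absorb the remainder lost to rounding down */
	uint64_t big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		       FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	printf("per-node size %llu MB, %llu of %d nodes get one extra %llu MB chunk\n",
	       (unsigned long long)(size >> 20), (unsigned long long)big,
	       nr_nodes, (unsigned long long)(FAKE_NODE_MIN_SIZE >> 20));
	return 0;
}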
174
175/*
176 * Returns the end address of a node so that there is at least `size' amount of
177 * non-reserved memory or `max_addr' is reached.
178 */
179static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
180{
181 u64 end = start + size;
182
183 while (end - start - memblock_x86_hole_size(start, end) < size) {
184 end += FAKE_NODE_MIN_SIZE;
185 if (end > max_addr) {
186 end = max_addr;
187 break;
188 }
189 }
190 return end;
191}
192
193/*
194 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
195 * `addr' to `max_addr'. The return value is the number of nodes allocated.
196 */
197static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
198 struct numa_meminfo *pi,
199 u64 addr, u64 max_addr, u64 size)
200{
201 nodemask_t physnode_mask = NODE_MASK_NONE;
202 u64 min_size;
203 int nid = 0;
204 int i, ret;
205
206 if (!size)
207 return -1;
208 /*
209 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
210 * increased accordingly if the requested size is too small. This
211 * creates a uniform distribution of node sizes across the entire
212 * machine (but not necessarily over physical nodes).
213 */
214 min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
215 MAX_NUMNODES;
216 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
217 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
218 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
219 FAKE_NODE_MIN_HASH_MASK;
220 if (size < min_size) {
221 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
222 size >> 20, min_size >> 20);
223 size = min_size;
224 }
225 size &= FAKE_NODE_MIN_HASH_MASK;
226
227 for (i = 0; i < pi->nr_blks; i++)
228 node_set(pi->blk[i].nid, physnode_mask);
229
230 /*
231 * Fill physical nodes with fake nodes of size until there is no memory
232 * left on any of them.
233 */
234 while (nodes_weight(physnode_mask)) {
235 for_each_node_mask(i, physnode_mask) {
236 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
237 u64 start, limit, end;
238 int phys_blk;
239
240 phys_blk = emu_find_memblk_by_nid(i, pi);
241 if (phys_blk < 0) {
242 node_clear(i, physnode_mask);
243 continue;
244 }
245 start = pi->blk[phys_blk].start;
246 limit = pi->blk[phys_blk].end;
247
248 end = find_end_of_node(start, limit, size);
249 /*
250 * If there won't be at least FAKE_NODE_MIN_SIZE of
251 * non-reserved memory in ZONE_DMA32 for the next node,
252 * this one must extend to the boundary.
253 */
254 if (end < dma32_end && dma32_end - end -
255 memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
256 end = dma32_end;
257
258 /*
259 * If there won't be enough non-reserved memory for the
260 * next node, this one must extend to the end of the
261 * physical node.
262 */
263 if (limit - end -
264 memblock_x86_hole_size(end, limit) < size)
265 end = limit;
266
267 ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
268 phys_blk,
269 min(end, limit) - start);
270 if (ret < 0)
271 return ret;
272 }
273 }
274 return 0;
275}
276
277/**
278 * numa_emulation - Emulate NUMA nodes
279 * @numa_meminfo: NUMA configuration to massage
280 * @numa_dist_cnt: The size of the physical NUMA distance table
281 *
282 * Emulate NUMA nodes according to the numa=fake kernel parameter.
283 * @numa_meminfo contains the physical memory configuration and is modified
284 * to reflect the emulated configuration on success. @numa_dist_cnt is
285 * used to determine the size of the physical distance table.
286 *
287 * On success, the following modifications are made.
288 *
289 * - @numa_meminfo is updated to reflect the emulated nodes.
290 *
291 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
292 * emulated nodes.
293 *
294 * - NUMA distance table is rebuilt to represent distances between emulated
295 * nodes. The distances are determined considering how emulated nodes
296 * are mapped to physical nodes and match the actual distances.
297 *
298 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
299 * nodes. This is used by numa_add_cpu() and numa_remove_cpu().
300 *
301 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
302 * identity mapping and no other modification is made.
303 */
304void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
305{
306 static struct numa_meminfo ei __initdata;
307 static struct numa_meminfo pi __initdata;
308 const u64 max_addr = PFN_PHYS(max_pfn);
309 u8 *phys_dist = NULL;
310 size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
311 int max_emu_nid, dfl_phys_nid;
312 int i, j, ret;
313
314 if (!emu_cmdline)
315 goto no_emu;
316
317 memset(&ei, 0, sizeof(ei));
318 pi = *numa_meminfo;
319
320 for (i = 0; i < MAX_NUMNODES; i++)
321 emu_nid_to_phys[i] = NUMA_NO_NODE;
322
323 /*
324 * If the numa=fake command-line contains a 'M' or 'G', it represents
325 * the fixed node size. Otherwise, if it is just a single number N,
326 * split the system RAM into N fake nodes.
327 */
328 if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
329 u64 size;
330
331 size = memparse(emu_cmdline, &emu_cmdline);
332 ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
333 } else {
334 unsigned long n;
335
336 n = simple_strtoul(emu_cmdline, NULL, 0);
337 ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
338 }
339
340 if (ret < 0)
341 goto no_emu;
342
343 if (numa_cleanup_meminfo(&ei) < 0) {
344 pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
345 goto no_emu;
346 }
347
348 /* copy the physical distance table */
349 if (numa_dist_cnt) {
350 u64 phys;
351
352 phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
353 phys_size, PAGE_SIZE);
354 if (phys == MEMBLOCK_ERROR) {
355 pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
356 goto no_emu;
357 }
358 memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
359 phys_dist = __va(phys);
360
361 for (i = 0; i < numa_dist_cnt; i++)
362 for (j = 0; j < numa_dist_cnt; j++)
363 phys_dist[i * numa_dist_cnt + j] =
364 node_distance(i, j);
365 }
366
367 /*
368 * Determine the max emulated nid and the default phys nid to use
369 * for unmapped nodes.
370 */
371 max_emu_nid = 0;
372 dfl_phys_nid = NUMA_NO_NODE;
373 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
374 if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
375 max_emu_nid = i;
376 if (dfl_phys_nid == NUMA_NO_NODE)
377 dfl_phys_nid = emu_nid_to_phys[i];
378 }
379 }
380 if (dfl_phys_nid == NUMA_NO_NODE) {
381 pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
382 goto no_emu;
383 }
384
385 /* commit */
386 *numa_meminfo = ei;
387
388 /*
389 * Transform __apicid_to_node table to use emulated nids by
390 * reverse-mapping phys_nid. The maps should always exist but fall
391 * back to zero just in case.
392 */
393 for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
394 if (__apicid_to_node[i] == NUMA_NO_NODE)
395 continue;
396 for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
397 if (__apicid_to_node[i] == emu_nid_to_phys[j])
398 break;
399 __apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
400 }
401
402 /* make sure all emulated nodes are mapped to a physical node */
403 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
404 if (emu_nid_to_phys[i] == NUMA_NO_NODE)
405 emu_nid_to_phys[i] = dfl_phys_nid;
406
407 /* transform distance table */
408 numa_reset_distance();
409 for (i = 0; i < max_emu_nid + 1; i++) {
410 for (j = 0; j < max_emu_nid + 1; j++) {
411 int physi = emu_nid_to_phys[i];
412 int physj = emu_nid_to_phys[j];
413 int dist;
414
415 if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
416 dist = physi == physj ?
417 LOCAL_DISTANCE : REMOTE_DISTANCE;
418 else
419 dist = phys_dist[physi * numa_dist_cnt + physj];
420
421 numa_set_distance(i, j, dist);
422 }
423 }
424
425 /* free the copied physical distance table */
426 if (phys_dist)
427 memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
428 return;
429
430no_emu:
431 /* No emulation. Build identity emu_nid_to_phys[] for numa_add_cpu() */
432 for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
433 emu_nid_to_phys[i] = i;
434}
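
The distance-table rebuild at the end of numa_emulation() maps each pair of emulated nodes back to their physical owners and copies the physical distance, falling back to local/remote defaults when an owner lies outside the saved table. A standalone model with a made-up 2-node physical matrix and a 4-way emulated mapping; the 10/20 local/remote values follow the usual SLIT convention.

/* Userspace model of the emulated distance-table rebuild; sample data only. */
#include <stdio.h>

#define LOCAL_DISTANCE  10
#define REMOTE_DISTANCE 20

int main(void)
{
	int phys_cnt = 2;
	int phys_dist[2][2] = { { 10, 21 }, { 21, 10 } };	/* saved physical SLIT */
	int emu_nid_to_phys[4] = { 0, 0, 1, 1 };		/* two fake nodes per phys node */

	for (int i = 0; i < 4; i++) {
		for (int j = 0; j < 4; j++) {
			int pi = emu_nid_to_phys[i], pj = emu_nid_to_phys[j];
			int d;

			if (pi >= phys_cnt || pj >= phys_cnt)
				d = (pi == pj) ? LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				d = phys_dist[pi][pj];	/* inherit the physical distance */
			printf("%3d", d);
		}
		printf("\n");
	}
	return 0;
}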
435
436#ifndef CONFIG_DEBUG_PER_CPU_MAPS
437void __cpuinit numa_add_cpu(int cpu)
438{
439 int physnid, nid;
440
441 nid = early_cpu_to_node(cpu);
442 BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));
443
444 physnid = emu_nid_to_phys[nid];
445
446 /*
447 * Map the cpu to each emulated node that is allocated on the physical
448 * node of the cpu's apic id.
449 */
450 for_each_online_node(nid)
451 if (emu_nid_to_phys[nid] == physnid)
452 cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
453}
454
455void __cpuinit numa_remove_cpu(int cpu)
456{
457 int i;
458
459 for_each_online_node(i)
460 cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
461}
462#else /* !CONFIG_DEBUG_PER_CPU_MAPS */
463static void __cpuinit numa_set_cpumask(int cpu, bool enable)
464{
465 int nid, physnid;
466
467 nid = early_cpu_to_node(cpu);
468 if (nid == NUMA_NO_NODE) {
469 /* early_cpu_to_node() already emits a warning and trace */
470 return;
471 }
472
473 physnid = emu_nid_to_phys[nid];
474
475 for_each_online_node(nid) {
476 if (emu_nid_to_phys[nid] != physnid)
477 continue;
478
479 debug_cpumask_set_cpu(cpu, nid, enable);
480 }
481}
482
483void __cpuinit numa_add_cpu(int cpu)
484{
485 numa_set_cpumask(cpu, true);
486}
487
488void __cpuinit numa_remove_cpu(int cpu)
489{
490 numa_set_cpumask(cpu, false);
491}
492#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */
diff --git a/arch/x86/mm/numa_internal.h b/arch/x86/mm/numa_internal.h
new file mode 100644
index 000000000000..7178c3afe05e
--- /dev/null
+++ b/arch/x86/mm/numa_internal.h
@@ -0,0 +1,39 @@
1#ifndef __X86_MM_NUMA_INTERNAL_H
2#define __X86_MM_NUMA_INTERNAL_H
3
4#include <linux/types.h>
5#include <asm/numa.h>
6
7struct numa_memblk {
8 u64 start;
9 u64 end;
10 int nid;
11};
12
13struct numa_meminfo {
14 int nr_blks;
15 struct numa_memblk blk[NR_NODE_MEMBLKS];
16};
17
18void __init numa_remove_memblk_from(int idx, struct numa_meminfo *mi);
19int __init numa_cleanup_meminfo(struct numa_meminfo *mi);
20void __init numa_reset_distance(void);
21
22void __init x86_numa_init(void);
23
24#ifdef CONFIG_X86_64
25static inline void init_alloc_remap(int nid, u64 start, u64 end) { }
26#else
27void __init init_alloc_remap(int nid, u64 start, u64 end);
28#endif
29
30#ifdef CONFIG_NUMA_EMU
31void __init numa_emulation(struct numa_meminfo *numa_meminfo,
32 int numa_dist_cnt);
33#else
34static inline void numa_emulation(struct numa_meminfo *numa_meminfo,
35 int numa_dist_cnt)
36{ }
37#endif
38
39#endif /* __X86_MM_NUMA_INTERNAL_H */
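
struct numa_meminfo above is just a counted array of (start, end, nid) blocks, and helpers such as emu_find_memblk_by_nid() scan it linearly. A userspace sketch of the same lookup over sample data; find_memblk_by_nid() and the ranges below are illustrative, not the in-tree helper.

/* Userspace model of a numa_meminfo lookup; sample ranges only. */
#include <stdio.h>
#include <stdint.h>

struct numa_memblk { uint64_t start, end; int nid; };
struct numa_meminfo { int nr_blks; struct numa_memblk blk[8]; };

static int find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	for (int i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -1;	/* mirrors the -ENOENT case */
}

int main(void)
{
	struct numa_meminfo mi = {
		.nr_blks = 2,
		.blk = {
			{ 0,          1ULL << 32, 0 },	/* node 0: 0-4 GB */
			{ 1ULL << 32, 3ULL << 31, 1 },	/* node 1: 4-6 GB */
		},
	};

	printf("node 1 is block %d\n", find_memblk_by_nid(1, &mi));
	return 0;
}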
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 532e7933d606..f9e526742fa1 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -13,6 +13,7 @@
13#include <linux/pfn.h> 13#include <linux/pfn.h>
14#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h> 15#include <linux/gfp.h>
16#include <linux/pci.h>
16 17
17#include <asm/e820.h> 18#include <asm/e820.h>
18#include <asm/processor.h> 19#include <asm/processor.h>
@@ -56,12 +57,10 @@ static unsigned long direct_pages_count[PG_LEVEL_NUM];
56 57
57void update_page_count(int level, unsigned long pages) 58void update_page_count(int level, unsigned long pages)
58{ 59{
59 unsigned long flags;
60
61 /* Protect against CPA */ 60 /* Protect against CPA */
62 spin_lock_irqsave(&pgd_lock, flags); 61 spin_lock(&pgd_lock);
63 direct_pages_count[level] += pages; 62 direct_pages_count[level] += pages;
64 spin_unlock_irqrestore(&pgd_lock, flags); 63 spin_unlock(&pgd_lock);
65} 64}
66 65
67static void split_page_count(int level) 66static void split_page_count(int level)
@@ -260,8 +259,10 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
260 * The BIOS area between 640k and 1Mb needs to be executable for 259 * The BIOS area between 640k and 1Mb needs to be executable for
261 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support. 260 * PCI BIOS based config access (CONFIG_PCI_GOBIOS) support.
262 */ 261 */
263 if (within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT)) 262#ifdef CONFIG_PCI_BIOS
263 if (pcibios_enabled && within(pfn, BIOS_BEGIN >> PAGE_SHIFT, BIOS_END >> PAGE_SHIFT))
264 pgprot_val(forbidden) |= _PAGE_NX; 264 pgprot_val(forbidden) |= _PAGE_NX;
265#endif
265 266
266 /* 267 /*
267 * The kernel text needs to be executable for obvious reasons 268 * The kernel text needs to be executable for obvious reasons
@@ -309,7 +310,7 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
309 * these shared mappings are made of small page mappings. 310 * these shared mappings are made of small page mappings.
310 * Thus this don't enforce !RW mapping for small page kernel 311 * Thus this don't enforce !RW mapping for small page kernel
311 * text mapping logic will help Linux Xen parvirt guest boot 312 * text mapping logic will help Linux Xen parvirt guest boot
312 * aswell. 313 * as well.
313 */ 314 */
314 if (lookup_address(address, &level) && (level != PG_LEVEL_4K)) 315 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315 pgprot_val(forbidden) |= _PAGE_RW; 316 pgprot_val(forbidden) |= _PAGE_RW;
@@ -391,16 +392,16 @@ static int
391try_preserve_large_page(pte_t *kpte, unsigned long address, 392try_preserve_large_page(pte_t *kpte, unsigned long address,
392 struct cpa_data *cpa) 393 struct cpa_data *cpa)
393{ 394{
394 unsigned long nextpage_addr, numpages, pmask, psize, flags, addr, pfn; 395 unsigned long nextpage_addr, numpages, pmask, psize, addr, pfn;
395 pte_t new_pte, old_pte, *tmp; 396 pte_t new_pte, old_pte, *tmp;
396 pgprot_t old_prot, new_prot; 397 pgprot_t old_prot, new_prot, req_prot;
397 int i, do_split = 1; 398 int i, do_split = 1;
398 unsigned int level; 399 unsigned int level;
399 400
400 if (cpa->force_split) 401 if (cpa->force_split)
401 return 1; 402 return 1;
402 403
403 spin_lock_irqsave(&pgd_lock, flags); 404 spin_lock(&pgd_lock);
404 /* 405 /*
405 * Check for races, another CPU might have split this page 406 * Check for races, another CPU might have split this page
406 * up already: 407 * up already:
@@ -438,10 +439,10 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
438 * We are safe now. Check whether the new pgprot is the same: 439 * We are safe now. Check whether the new pgprot is the same:
439 */ 440 */
440 old_pte = *kpte; 441 old_pte = *kpte;
441 old_prot = new_prot = pte_pgprot(old_pte); 442 old_prot = new_prot = req_prot = pte_pgprot(old_pte);
442 443
443 pgprot_val(new_prot) &= ~pgprot_val(cpa->mask_clr); 444 pgprot_val(req_prot) &= ~pgprot_val(cpa->mask_clr);
444 pgprot_val(new_prot) |= pgprot_val(cpa->mask_set); 445 pgprot_val(req_prot) |= pgprot_val(cpa->mask_set);
445 446
446 /* 447 /*
447 * old_pte points to the large page base address. So we need 448 * old_pte points to the large page base address. So we need
@@ -450,17 +451,17 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
450 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT); 451 pfn = pte_pfn(old_pte) + ((address & (psize - 1)) >> PAGE_SHIFT);
451 cpa->pfn = pfn; 452 cpa->pfn = pfn;
452 453
453 new_prot = static_protections(new_prot, address, pfn); 454 new_prot = static_protections(req_prot, address, pfn);
454 455
455 /* 456 /*
456 * We need to check the full range, whether 457 * We need to check the full range, whether
457 * static_protection() requires a different pgprot for one of 458 * static_protection() requires a different pgprot for one of
458 * the pages in the range we try to preserve: 459 * the pages in the range we try to preserve:
459 */ 460 */
460 addr = address + PAGE_SIZE; 461 addr = address & pmask;
461 pfn++; 462 pfn = pte_pfn(old_pte);
462 for (i = 1; i < cpa->numpages; i++, addr += PAGE_SIZE, pfn++) { 463 for (i = 0; i < (psize >> PAGE_SHIFT); i++, addr += PAGE_SIZE, pfn++) {
463 pgprot_t chk_prot = static_protections(new_prot, addr, pfn); 464 pgprot_t chk_prot = static_protections(req_prot, addr, pfn);
464 465
465 if (pgprot_val(chk_prot) != pgprot_val(new_prot)) 466 if (pgprot_val(chk_prot) != pgprot_val(new_prot))
466 goto out_unlock; 467 goto out_unlock;
@@ -483,7 +484,7 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
483 * that we limited the number of possible pages already to 484 * that we limited the number of possible pages already to
484 * the number of pages in the large page. 485 * the number of pages in the large page.
485 */ 486 */
486 if (address == (nextpage_addr - psize) && cpa->numpages == numpages) { 487 if (address == (address & pmask) && cpa->numpages == (psize >> PAGE_SHIFT)) {
487 /* 488 /*
488 * The address is aligned and the number of pages 489 * The address is aligned and the number of pages
489 * covers the full page. 490 * covers the full page.
@@ -495,14 +496,14 @@ try_preserve_large_page(pte_t *kpte, unsigned long address,
495 } 496 }
496 497
497out_unlock: 498out_unlock:
498 spin_unlock_irqrestore(&pgd_lock, flags); 499 spin_unlock(&pgd_lock);
499 500
500 return do_split; 501 return do_split;
501} 502}
502 503
503static int split_large_page(pte_t *kpte, unsigned long address) 504static int split_large_page(pte_t *kpte, unsigned long address)
504{ 505{
505 unsigned long flags, pfn, pfninc = 1; 506 unsigned long pfn, pfninc = 1;
506 unsigned int i, level; 507 unsigned int i, level;
507 pte_t *pbase, *tmp; 508 pte_t *pbase, *tmp;
508 pgprot_t ref_prot; 509 pgprot_t ref_prot;
@@ -516,7 +517,7 @@ static int split_large_page(pte_t *kpte, unsigned long address)
516 if (!base) 517 if (!base)
517 return -ENOMEM; 518 return -ENOMEM;
518 519
519 spin_lock_irqsave(&pgd_lock, flags); 520 spin_lock(&pgd_lock);
520 /* 521 /*
521 * Check for races, another CPU might have split this page 522 * Check for races, another CPU might have split this page
522 * up for us already: 523 * up for us already:
@@ -588,7 +589,7 @@ out_unlock:
588 */ 589 */
589 if (base) 590 if (base)
590 __free_page(base); 591 __free_page(base);
591 spin_unlock_irqrestore(&pgd_lock, flags); 592 spin_unlock(&pgd_lock);
592 593
593 return 0; 594 return 0;
594} 595}
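
The rewritten check in try_preserve_large_page() above asks, once per-page static protections have been verified, whether the request viewed from the large page's base (address & pmask) spans every 4k page of that mapping. A small standalone model of that final alignment test, assuming a 2 MB large page; the addresses and PMD_SIZE are example values.

/* Userspace model of the "covers the whole large page" test; 2 MB assumed. */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SHIFT 12
#define PMD_SIZE   (1UL << 21)			/* 2 MB large page */
#define PMD_MASK   (~(PMD_SIZE - 1))

static int covers_whole_large_page(uintptr_t address, unsigned long numpages)
{
	/* same shape as: address == (address & pmask) &&
	 *                numpages == (psize >> PAGE_SHIFT)   */
	return address == (address & PMD_MASK) &&
	       numpages == (PMD_SIZE >> PAGE_SHIFT);
}

int main(void)
{
	printf("%d\n", covers_whole_large_page(0xc0200000UL, 512));	/* 1: aligned, full 512 pages */
	printf("%d\n", covers_whole_large_page(0xc0201000UL, 512));	/* 0: not aligned to the base */
	printf("%d\n", covers_whole_large_page(0xc0200000UL, 256));	/* 0: only part of the mapping */
	return 0;
}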
diff --git a/arch/x86/mm/pf_in.c b/arch/x86/mm/pf_in.c
index 38e6d174c497..9f0614daea85 100644
--- a/arch/x86/mm/pf_in.c
+++ b/arch/x86/mm/pf_in.c
@@ -414,22 +414,17 @@ unsigned long get_ins_reg_val(unsigned long ins_addr, struct pt_regs *regs)
414 unsigned char *p; 414 unsigned char *p;
415 struct prefix_bits prf; 415 struct prefix_bits prf;
416 int i; 416 int i;
417 unsigned long rv;
418 417
419 p = (unsigned char *)ins_addr; 418 p = (unsigned char *)ins_addr;
420 p += skip_prefix(p, &prf); 419 p += skip_prefix(p, &prf);
421 p += get_opcode(p, &opcode); 420 p += get_opcode(p, &opcode);
422 for (i = 0; i < ARRAY_SIZE(reg_rop); i++) 421 for (i = 0; i < ARRAY_SIZE(reg_rop); i++)
423 if (reg_rop[i] == opcode) { 422 if (reg_rop[i] == opcode)
424 rv = REG_READ;
425 goto do_work; 423 goto do_work;
426 }
427 424
428 for (i = 0; i < ARRAY_SIZE(reg_wop); i++) 425 for (i = 0; i < ARRAY_SIZE(reg_wop); i++)
429 if (reg_wop[i] == opcode) { 426 if (reg_wop[i] == opcode)
430 rv = REG_WRITE;
431 goto do_work; 427 goto do_work;
432 }
433 428
434 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode " 429 printk(KERN_ERR "mmiotrace: Not a register instruction, opcode "
435 "0x%02x\n", opcode); 430 "0x%02x\n", opcode);
@@ -474,16 +469,13 @@ unsigned long get_ins_imm_val(unsigned long ins_addr)
474 unsigned char *p; 469 unsigned char *p;
475 struct prefix_bits prf; 470 struct prefix_bits prf;
476 int i; 471 int i;
477 unsigned long rv;
478 472
479 p = (unsigned char *)ins_addr; 473 p = (unsigned char *)ins_addr;
480 p += skip_prefix(p, &prf); 474 p += skip_prefix(p, &prf);
481 p += get_opcode(p, &opcode); 475 p += get_opcode(p, &opcode);
482 for (i = 0; i < ARRAY_SIZE(imm_wop); i++) 476 for (i = 0; i < ARRAY_SIZE(imm_wop); i++)
483 if (imm_wop[i] == opcode) { 477 if (imm_wop[i] == opcode)
484 rv = IMM_WRITE;
485 goto do_work; 478 goto do_work;
486 }
487 479
488 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode " 480 printk(KERN_ERR "mmiotrace: Not an immediate instruction, opcode "
489 "0x%02x\n", opcode); 481 "0x%02x\n", opcode);
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 5c4ee422590e..8573b83a63d0 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -87,7 +87,19 @@ static inline void pgd_list_del(pgd_t *pgd)
87#define UNSHARED_PTRS_PER_PGD \ 87#define UNSHARED_PTRS_PER_PGD \
88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD) 88 (SHARED_KERNEL_PMD ? KERNEL_PGD_BOUNDARY : PTRS_PER_PGD)
89 89
90static void pgd_ctor(pgd_t *pgd) 90
91static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm)
92{
93 BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm));
94 virt_to_page(pgd)->index = (pgoff_t)mm;
95}
96
97struct mm_struct *pgd_page_get_mm(struct page *page)
98{
99 return (struct mm_struct *)page->index;
100}
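
pgd_set_mm()/pgd_page_get_mm() above reuse the pgd page's otherwise-idle index field as a back-pointer to the owning mm, so code walking pgd_list can get back to the mm for a given pgd. A userspace model of the same trick; struct mm and struct page here are stand-ins, not the kernel types.

/* Userspace model of stashing an owner pointer in a spare per-page field. */
#include <stdio.h>
#include <assert.h>

struct mm { int id; };
struct page { unsigned long index; };	/* spare field reused as storage */

static void pgd_set_mm(struct page *pg, struct mm *mm)
{
	assert(sizeof(pg->index) >= sizeof(mm));	/* mirrors the BUILD_BUG_ON */
	pg->index = (unsigned long)mm;
}

static struct mm *pgd_page_get_mm(struct page *pg)
{
	return (struct mm *)pg->index;
}

int main(void)
{
	struct mm owner = { .id = 42 };
	struct page pg;

	pgd_set_mm(&pg, &owner);
	printf("owner id: %d\n", pgd_page_get_mm(&pg)->id);
	return 0;
}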
101
102static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd)
91{ 103{
92 /* If the pgd points to a shared pagetable level (either the 104 /* If the pgd points to a shared pagetable level (either the
93 ptes in non-PAE, or shared PMD in PAE), then just copy the 105 ptes in non-PAE, or shared PMD in PAE), then just copy the
@@ -98,27 +110,23 @@ static void pgd_ctor(pgd_t *pgd)
98 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY, 110 clone_pgd_range(pgd + KERNEL_PGD_BOUNDARY,
99 swapper_pg_dir + KERNEL_PGD_BOUNDARY, 111 swapper_pg_dir + KERNEL_PGD_BOUNDARY,
100 KERNEL_PGD_PTRS); 112 KERNEL_PGD_PTRS);
101 paravirt_alloc_pmd_clone(__pa(pgd) >> PAGE_SHIFT,
102 __pa(swapper_pg_dir) >> PAGE_SHIFT,
103 KERNEL_PGD_BOUNDARY,
104 KERNEL_PGD_PTRS);
105 } 113 }
106 114
107 /* list required to sync kernel mapping updates */ 115 /* list required to sync kernel mapping updates */
108 if (!SHARED_KERNEL_PMD) 116 if (!SHARED_KERNEL_PMD) {
117 pgd_set_mm(pgd, mm);
109 pgd_list_add(pgd); 118 pgd_list_add(pgd);
119 }
110} 120}
111 121
112static void pgd_dtor(pgd_t *pgd) 122static void pgd_dtor(pgd_t *pgd)
113{ 123{
114 unsigned long flags; /* can be called from interrupt context */
115
116 if (SHARED_KERNEL_PMD) 124 if (SHARED_KERNEL_PMD)
117 return; 125 return;
118 126
119 spin_lock_irqsave(&pgd_lock, flags); 127 spin_lock(&pgd_lock);
120 pgd_list_del(pgd); 128 pgd_list_del(pgd);
121 spin_unlock_irqrestore(&pgd_lock, flags); 129 spin_unlock(&pgd_lock);
122} 130}
123 131
124/* 132/*
@@ -160,8 +168,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
160 * section 8.1: in PAE mode we explicitly have to flush the 168 * section 8.1: in PAE mode we explicitly have to flush the
161 * TLB via cr3 if the top-level pgd is changed... 169 * TLB via cr3 if the top-level pgd is changed...
162 */ 170 */
163 if (mm == current->active_mm) 171 flush_tlb_mm(mm);
164 write_cr3(read_cr3());
165} 172}
166#else /* !CONFIG_X86_PAE */ 173#else /* !CONFIG_X86_PAE */
167 174
@@ -250,7 +257,6 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
250{ 257{
251 pgd_t *pgd; 258 pgd_t *pgd;
252 pmd_t *pmds[PREALLOCATED_PMDS]; 259 pmd_t *pmds[PREALLOCATED_PMDS];
253 unsigned long flags;
254 260
255 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP); 261 pgd = (pgd_t *)__get_free_page(PGALLOC_GFP);
256 262
@@ -270,12 +276,12 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
270 * respect to anything walking the pgd_list, so that they 276 * respect to anything walking the pgd_list, so that they
271 * never see a partially populated pgd. 277 * never see a partially populated pgd.
272 */ 278 */
273 spin_lock_irqsave(&pgd_lock, flags); 279 spin_lock(&pgd_lock);
274 280
275 pgd_ctor(pgd);
281 pgd_ctor(mm, pgd);
276 282 pgd_prepopulate_pmd(mm, pgd, pmds);
277 283
278 spin_unlock_irqrestore(&pgd_lock, flags);
284 spin_unlock(&pgd_lock);
279 285
280 286 return pgd;
281 287
@@ -310,6 +316,25 @@ int ptep_set_access_flags(struct vm_area_struct *vma,
310 316 return changed;
311 317 }
312 318
319#ifdef CONFIG_TRANSPARENT_HUGEPAGE
320int pmdp_set_access_flags(struct vm_area_struct *vma,
321 unsigned long address, pmd_t *pmdp,
322 pmd_t entry, int dirty)
323{
324 int changed = !pmd_same(*pmdp, entry);
325
326 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
327
328 if (changed && dirty) {
329 *pmdp = entry;
330 pmd_update_defer(vma->vm_mm, address, pmdp);
331 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
332 }
333
334 return changed;
335}
336#endif
337
313 338 int ptep_test_and_clear_young(struct vm_area_struct *vma,
314 339 unsigned long addr, pte_t *ptep)
315 340 {
@@ -325,6 +350,23 @@ int ptep_test_and_clear_young(struct vm_area_struct *vma,
325 350 return ret;
326 351 }
327 352
353#ifdef CONFIG_TRANSPARENT_HUGEPAGE
354int pmdp_test_and_clear_young(struct vm_area_struct *vma,
355 unsigned long addr, pmd_t *pmdp)
356{
357 int ret = 0;
358
359 if (pmd_young(*pmdp))
360 ret = test_and_clear_bit(_PAGE_BIT_ACCESSED,
361 (unsigned long *)pmdp);
362
363 if (ret)
364 pmd_update(vma->vm_mm, addr, pmdp);
365
366 return ret;
367}
368#endif
369
328 370 int ptep_clear_flush_young(struct vm_area_struct *vma,
329 371 unsigned long address, pte_t *ptep)
330 372 {
@@ -337,6 +379,36 @@ int ptep_clear_flush_young(struct vm_area_struct *vma,
337 379 return young;
338 380 }
339 381
382#ifdef CONFIG_TRANSPARENT_HUGEPAGE
383int pmdp_clear_flush_young(struct vm_area_struct *vma,
384 unsigned long address, pmd_t *pmdp)
385{
386 int young;
387
388 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
389
390 young = pmdp_test_and_clear_young(vma, address, pmdp);
391 if (young)
392 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
393
394 return young;
395}
396
397void pmdp_splitting_flush(struct vm_area_struct *vma,
398 unsigned long address, pmd_t *pmdp)
399{
400 int set;
401 VM_BUG_ON(address & ~HPAGE_PMD_MASK);
402 set = !test_and_set_bit(_PAGE_BIT_SPLITTING,
403 (unsigned long *)pmdp);
404 if (set) {
405 pmd_update(vma->vm_mm, address, pmdp);
406 /* need tlb flush only to serialize against gup-fast */
407 flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
408 }
409}
410#endif
411
340 412 /**
341 413 * reserve_top_address - reserves a hole in the top of kernel address space
342 414 * @reserve - size of hole to reserve
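The pgd_set_mm()/pgd_page_get_mm() pair added in the pgtable.c hunks above parks a back-pointer to the owning mm in the pgd page's otherwise unused page->index field, so code walking pgd_list can get from a pgd back to its mm_struct without extra bookkeeping. The following stand-alone sketch uses made-up stand-in types (fake_page, fake_mm); it only illustrates the store/lookup pattern and the compile-time size check, it is not kernel code.

/*
 * Stand-alone sketch: stash an mm back-pointer in a page's 'index' field,
 * mirroring pgd_set_mm()/pgd_page_get_mm() above.  All names are invented.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_page { uintptr_t index; };	/* stands in for struct page      */
struct fake_mm   { int id; };		/* stands in for struct mm_struct */

static void pgd_set_mm_sketch(struct fake_page *pg, struct fake_mm *mm)
{
	/* mirrors the BUILD_BUG_ON(sizeof(...->index) < sizeof(mm)) check */
	_Static_assert(sizeof(((struct fake_page *)0)->index) >= sizeof(void *),
		       "index field must be wide enough to hold a pointer");
	pg->index = (uintptr_t)mm;
}

static struct fake_mm *pgd_page_get_mm_sketch(struct fake_page *pg)
{
	return (struct fake_mm *)pg->index;
}

int main(void)
{
	struct fake_mm mm = { .id = 42 };
	struct fake_page pgd_page = { 0 };

	pgd_set_mm_sketch(&pgd_page, &mm);
	printf("mm id recovered from the pgd page: %d\n",
	       pgd_page_get_mm_sketch(&pgd_page)->id);
	return 0;
}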
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index a3250aa34086..410531d3c292 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -41,7 +41,7 @@ void __init x86_report_nx(void)
41 41 {
42 42 if (!cpu_has_nx) {
43 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
44 "missing in CPU or disabled in BIOS!\n");
44 "missing in CPU!\n");
45 45 } else {
46 46 #if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 47 if (disable_nx) {
diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c
new file mode 100644
index 000000000000..81dbfdeb080d
--- /dev/null
+++ b/arch/x86/mm/srat.c
@@ -0,0 +1,184 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/memblock.h>
20#include <linux/mm.h>
21#include <asm/proto.h>
22#include <asm/numa.h>
23#include <asm/e820.h>
24#include <asm/apic.h>
25#include <asm/uv/uv.h>
26
27int acpi_numa __initdata;
28
29static __init int setup_node(int pxm)
30{
31 return acpi_map_pxm_to_node(pxm);
32}
33
34static __init void bad_srat(void)
35{
36 printk(KERN_ERR "SRAT: SRAT not used.\n");
37 acpi_numa = -1;
38}
39
40static __init inline int srat_disabled(void)
41{
42 return acpi_numa < 0;
43}
44
45/* Callback for SLIT parsing */
46void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
47{
48 int i, j;
49
50 for (i = 0; i < slit->locality_count; i++)
51 for (j = 0; j < slit->locality_count; j++)
52 numa_set_distance(pxm_to_node(i), pxm_to_node(j),
53 slit->entry[slit->locality_count * i + j]);
54}
55
56/* Callback for Proximity Domain -> x2APIC mapping */
57void __init
58acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
59{
60 int pxm, node;
61 int apic_id;
62
63 if (srat_disabled())
64 return;
65 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
66 bad_srat();
67 return;
68 }
69 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
70 return;
71 pxm = pa->proximity_domain;
72 node = setup_node(pxm);
73 if (node < 0) {
74 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
75 bad_srat();
76 return;
77 }
78
79 apic_id = pa->apic_id;
80 if (apic_id >= MAX_LOCAL_APIC) {
81 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
82 return;
83 }
84 set_apicid_to_node(apic_id, node);
85 node_set(node, numa_nodes_parsed);
86 acpi_numa = 1;
87 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
88 pxm, apic_id, node);
89}
90
91/* Callback for Proximity Domain -> LAPIC mapping */
92void __init
93acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
94{
95 int pxm, node;
96 int apic_id;
97
98 if (srat_disabled())
99 return;
100 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
101 bad_srat();
102 return;
103 }
104 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
105 return;
106 pxm = pa->proximity_domain_lo;
107 node = setup_node(pxm);
108 if (node < 0) {
109 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
110 bad_srat();
111 return;
112 }
113
114 if (get_uv_system_type() >= UV_X2APIC)
115 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
116 else
117 apic_id = pa->apic_id;
118
119 if (apic_id >= MAX_LOCAL_APIC) {
120 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u skipped apicid that is too big\n", pxm, apic_id, node);
121 return;
122 }
123
124 set_apicid_to_node(apic_id, node);
125 node_set(node, numa_nodes_parsed);
126 acpi_numa = 1;
127 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
128 pxm, apic_id, node);
129}
130
131#ifdef CONFIG_MEMORY_HOTPLUG
132static inline int save_add_info(void) {return 1;}
133#else
134static inline int save_add_info(void) {return 0;}
135#endif
136
137/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
138void __init
139acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
140{
141 u64 start, end;
142 int node, pxm;
143
144 if (srat_disabled())
145 return;
146 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
147 bad_srat();
148 return;
149 }
150 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
151 return;
152
153 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
154 return;
155 start = ma->base_address;
156 end = start + ma->length;
157 pxm = ma->proximity_domain;
158 node = setup_node(pxm);
159 if (node < 0) {
160 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
161 bad_srat();
162 return;
163 }
164
165 if (numa_add_memblk(node, start, end) < 0) {
166 bad_srat();
167 return;
168 }
169
170 printk(KERN_INFO "SRAT: Node %u PXM %u %Lx-%Lx\n", node, pxm,
171 start, end);
172}
173
174void __init acpi_numa_arch_fixup(void) {}
175
176int __init x86_acpi_numa_init(void)
177{
178 int ret;
179
180 ret = acpi_numa_init();
181 if (ret < 0)
182 return ret;
183 return srat_disabled() ? -EINVAL : 0;
184}
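acpi_numa_slit_init() in the new srat.c treats the SLIT as a flat locality_count x locality_count byte matrix and indexes it as entry[locality_count * i + j] before handing each distance to numa_set_distance(). A small stand-alone sketch of that indexing follows; the 3x3 matrix and the helper names are invented for illustration only.

/*
 * Stand-alone sketch: flat N*N SLIT distance matrix indexed as
 * entry[N * from + to], as in the acpi_numa_slit_init() loop above.
 */
#include <stdio.h>

#define N 3	/* assumed locality_count for the example */

static const unsigned char slit_entry[N * N] = {
	10, 20, 20,
	20, 10, 20,
	20, 20, 10,
};

static unsigned char slit_distance(int from, int to)
{
	return slit_entry[N * from + to];	/* same indexing as the kernel loop */
}

int main(void)
{
	int i, j;

	for (i = 0; i < N; i++)
		for (j = 0; j < N; j++)
			printf("node %d -> node %d : %u\n", i, j, slit_distance(i, j));
	return 0;
}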
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
deleted file mode 100644
index 9324f13492d5..000000000000
--- a/arch/x86/mm/srat_32.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * Some of the code in this file has been gleaned from the 64 bit
3 * discontigmem support code base.
4 *
5 * Copyright (C) 2002, IBM Corp.
6 *
7 * All rights reserved.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or
17 * NON INFRINGEMENT. See the GNU General Public License for more
18 * details.
19 *
20 * You should have received a copy of the GNU General Public License
21 * along with this program; if not, write to the Free Software
22 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23 *
24 * Send feedback to Pat Gaughen <gone@us.ibm.com>
25 */
26#include <linux/mm.h>
27#include <linux/bootmem.h>
28#include <linux/mmzone.h>
29#include <linux/acpi.h>
30#include <linux/nodemask.h>
31#include <asm/srat.h>
32#include <asm/topology.h>
33#include <asm/smp.h>
34#include <asm/e820.h>
35
36/*
37 * proximity macros and definitions
38 */
39#define NODE_ARRAY_INDEX(x) ((x) / 8) /* 8 bits/char */
40#define NODE_ARRAY_OFFSET(x) ((x) % 8) /* 8 bits/char */
41#define BMAP_SET(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
42#define BMAP_TEST(bmap, bit) ((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))
43/* bitmap length; _PXM is at most 255 */
44#define PXM_BITMAP_LEN (MAX_PXM_DOMAINS / 8)
45static u8 __initdata pxm_bitmap[PXM_BITMAP_LEN]; /* bitmap of proximity domains */
46
47#define MAX_CHUNKS_PER_NODE 3
48#define MAXCHUNKS (MAX_CHUNKS_PER_NODE * MAX_NUMNODES)
49struct node_memory_chunk_s {
50 unsigned long start_pfn;
51 unsigned long end_pfn;
52 u8 pxm; // proximity domain of node
53 u8 nid; // which cnode contains this chunk?
54 u8 bank; // which mem bank on this node
55};
56static struct node_memory_chunk_s __initdata node_memory_chunk[MAXCHUNKS];
57
58static int __initdata num_memory_chunks; /* total number of memory chunks */
59static u8 __initdata apicid_to_pxm[MAX_APICID];
60
61int numa_off __initdata;
62int acpi_numa __initdata;
63
64static __init void bad_srat(void)
65{
66 printk(KERN_ERR "SRAT: SRAT not used.\n");
67 acpi_numa = -1;
68 num_memory_chunks = 0;
69}
70
71static __init inline int srat_disabled(void)
72{
73 return numa_off || acpi_numa < 0;
74}
75
76/* Identify CPU proximity domains */
77void __init
78acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *cpu_affinity)
79{
80 if (srat_disabled())
81 return;
82 if (cpu_affinity->header.length !=
83 sizeof(struct acpi_srat_cpu_affinity)) {
84 bad_srat();
85 return;
86 }
87
88 if ((cpu_affinity->flags & ACPI_SRAT_CPU_ENABLED) == 0)
89 return; /* empty entry */
90
91 /* mark this node as "seen" in node bitmap */
92 BMAP_SET(pxm_bitmap, cpu_affinity->proximity_domain_lo);
93
94 apicid_to_pxm[cpu_affinity->apic_id] = cpu_affinity->proximity_domain_lo;
95
96 printk(KERN_DEBUG "CPU %02x in proximity domain %02x\n",
97 cpu_affinity->apic_id, cpu_affinity->proximity_domain_lo);
98}
99
100/*
101 * Identify memory proximity domains and hot-remove capabilities.
102 * Fill node memory chunk list structure.
103 */
104void __init
105acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *memory_affinity)
106{
107 unsigned long long paddr, size;
108 unsigned long start_pfn, end_pfn;
109 u8 pxm;
110 struct node_memory_chunk_s *p, *q, *pend;
111
112 if (srat_disabled())
113 return;
114 if (memory_affinity->header.length !=
115 sizeof(struct acpi_srat_mem_affinity)) {
116 bad_srat();
117 return;
118 }
119
120 if ((memory_affinity->flags & ACPI_SRAT_MEM_ENABLED) == 0)
121 return; /* empty entry */
122
123 pxm = memory_affinity->proximity_domain & 0xff;
124
125 /* mark this node as "seen" in node bitmap */
126 BMAP_SET(pxm_bitmap, pxm);
127
128 /* calculate info for memory chunk structure */
129 paddr = memory_affinity->base_address;
130 size = memory_affinity->length;
131
132 start_pfn = paddr >> PAGE_SHIFT;
133 end_pfn = (paddr + size) >> PAGE_SHIFT;
134
135
136 if (num_memory_chunks >= MAXCHUNKS) {
137 printk(KERN_WARNING "Too many mem chunks in SRAT."
138 " Ignoring %lld MBytes at %llx\n",
139 size/(1024*1024), paddr);
140 return;
141 }
142
143 /* Insertion sort based on base address */
144 pend = &node_memory_chunk[num_memory_chunks];
145 for (p = &node_memory_chunk[0]; p < pend; p++) {
146 if (start_pfn < p->start_pfn)
147 break;
148 }
149 if (p < pend) {
150 for (q = pend; q >= p; q--)
151 *(q + 1) = *q;
152 }
153 p->start_pfn = start_pfn;
154 p->end_pfn = end_pfn;
155 p->pxm = pxm;
156
157 num_memory_chunks++;
158
159 printk(KERN_DEBUG "Memory range %08lx to %08lx"
160 " in proximity domain %02x %s\n",
161 start_pfn, end_pfn,
162 pxm,
163 ((memory_affinity->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) ?
164 "enabled and removable" : "enabled" ) );
165}
166
167/* Callback for SLIT parsing */
168void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
169{
170}
171
172void acpi_numa_arch_fixup(void)
173{
174}
175/*
176 * The SRAT table always lists ascending addresses, so can always
177 * assume that the first "start" address that you see is the real
178 * start of the node, and that the current "end" address is after
179 * the previous one.
180 */
181static __init int node_read_chunk(int nid, struct node_memory_chunk_s *memory_chunk)
182{
183 /*
184 * Only add present memory as told by the e820.
185 * There is no guarantee from the SRAT that the memory it
186 * enumerates is present at boot time because it represents
187 * *possible* memory hotplug areas the same as normal RAM.
188 */
189 if (memory_chunk->start_pfn >= max_pfn) {
190 printk(KERN_INFO "Ignoring SRAT pfns: %08lx - %08lx\n",
191 memory_chunk->start_pfn, memory_chunk->end_pfn);
192 return -1;
193 }
194 if (memory_chunk->nid != nid)
195 return -1;
196
197 if (!node_has_online_mem(nid))
198 node_start_pfn[nid] = memory_chunk->start_pfn;
199
200 if (node_start_pfn[nid] > memory_chunk->start_pfn)
201 node_start_pfn[nid] = memory_chunk->start_pfn;
202
203 if (node_end_pfn[nid] < memory_chunk->end_pfn)
204 node_end_pfn[nid] = memory_chunk->end_pfn;
205
206 return 0;
207}
208
209int __init get_memcfg_from_srat(void)
210{
211 int i, j, nid;
212
213
214 if (srat_disabled())
215 goto out_fail;
216
217 if (num_memory_chunks == 0) {
218 printk(KERN_DEBUG
219 "could not find any ACPI SRAT memory areas.\n");
220 goto out_fail;
221 }
222
223 /* Calculate total number of nodes in system from PXM bitmap and create
224 * a set of sequential node IDs starting at zero. (ACPI doesn't seem
225 * to specify the range of _PXM values.)
226 */
227 /*
228 * MCD - we no longer HAVE to number nodes sequentially. PXM domain
229 * numbers could go as high as 256, and MAX_NUMNODES for i386 is typically
230 * 32, so we will continue numbering them in this manner until MAX_NUMNODES
231 * approaches MAX_PXM_DOMAINS for i386.
232 */
233 nodes_clear(node_online_map);
234 for (i = 0; i < MAX_PXM_DOMAINS; i++) {
235 if (BMAP_TEST(pxm_bitmap, i)) {
236 int nid = acpi_map_pxm_to_node(i);
237 node_set_online(nid);
238 }
239 }
240 BUG_ON(num_online_nodes() == 0);
241
242 /* set cnode id in memory chunk structure */
243 for (i = 0; i < num_memory_chunks; i++)
244 node_memory_chunk[i].nid = pxm_to_node(node_memory_chunk[i].pxm);
245
246 printk(KERN_DEBUG "pxm bitmap: ");
247 for (i = 0; i < sizeof(pxm_bitmap); i++) {
248 printk(KERN_CONT "%02x ", pxm_bitmap[i]);
249 }
250 printk(KERN_CONT "\n");
251 printk(KERN_DEBUG "Number of logical nodes in system = %d\n",
252 num_online_nodes());
253 printk(KERN_DEBUG "Number of memory chunks in system = %d\n",
254 num_memory_chunks);
255
256 for (i = 0; i < MAX_APICID; i++)
257 apicid_2_node[i] = pxm_to_node(apicid_to_pxm[i]);
258
259 for (j = 0; j < num_memory_chunks; j++){
260 struct node_memory_chunk_s * chunk = &node_memory_chunk[j];
261 printk(KERN_DEBUG
262 "chunk %d nid %d start_pfn %08lx end_pfn %08lx\n",
263 j, chunk->nid, chunk->start_pfn, chunk->end_pfn);
264 if (node_read_chunk(chunk->nid, chunk))
265 continue;
266
267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn));
269 }
270 /* for out of order entries in SRAT */
271 sort_node_map();
272
273 for_each_online_node(nid) {
274 unsigned long start = node_start_pfn[nid];
275 unsigned long end = min(node_end_pfn[nid], max_pfn);
276
277 memory_present(nid, start, end);
278 node_remap_size[nid] = node_memmap_size_bytes(nid, start, end);
279 }
280 return 1;
281out_fail:
282 printk(KERN_DEBUG "failed to get NUMA memory information from SRAT"
283 " table\n");
284 return 0;
285}
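The deleted srat_32.c tracked "seen" proximity domains in pxm_bitmap via the BMAP_SET/BMAP_TEST macros, one bit per domain packed eight to a byte. A stand-alone restatement of those macros outside the kernel, for illustration only (nothing here is kernel API):

/*
 * Stand-alone sketch of the pxm_bitmap helpers: byte index = bit / 8,
 * bit offset = bit % 8, exactly as in the deleted BMAP_* macros above.
 */
#include <stdio.h>

#define MAX_PXM_DOMAINS		256
#define NODE_ARRAY_INDEX(x)	((x) / 8)
#define NODE_ARRAY_OFFSET(x)	((x) % 8)
#define BMAP_SET(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] |= 1 << NODE_ARRAY_OFFSET(bit))
#define BMAP_TEST(bmap, bit)	((bmap)[NODE_ARRAY_INDEX(bit)] & (1 << NODE_ARRAY_OFFSET(bit)))

int main(void)
{
	unsigned char pxm_bitmap[MAX_PXM_DOMAINS / 8] = { 0 };

	BMAP_SET(pxm_bitmap, 0);	/* mark proximity domains 0 and 130 as seen */
	BMAP_SET(pxm_bitmap, 130);

	printf("pxm 0:   %s\n", BMAP_TEST(pxm_bitmap, 0) ? "set" : "clear");
	printf("pxm 129: %s\n", BMAP_TEST(pxm_bitmap, 129) ? "set" : "clear");
	printf("pxm 130: %s\n", BMAP_TEST(pxm_bitmap, 130) ? "set" : "clear");
	return 0;
}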
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
deleted file mode 100644
index 9c0d0d399c30..000000000000
--- a/arch/x86/mm/srat_64.c
+++ /dev/null
@@ -1,564 +0,0 @@
1/*
2 * ACPI 3.0 based NUMA setup
3 * Copyright 2004 Andi Kleen, SuSE Labs.
4 *
5 * Reads the ACPI SRAT table to figure out what memory belongs to which CPUs.
6 *
7 * Called from acpi_numa_init while reading the SRAT and SLIT tables.
8 * Assumes all memory regions belonging to a single proximity domain
9 * are in one chunk. Holes between them will be included in the node.
10 */
11
12#include <linux/kernel.h>
13#include <linux/acpi.h>
14#include <linux/mmzone.h>
15#include <linux/bitmap.h>
16#include <linux/module.h>
17#include <linux/topology.h>
18#include <linux/bootmem.h>
19#include <linux/mm.h>
20#include <asm/proto.h>
21#include <asm/numa.h>
22#include <asm/e820.h>
23#include <asm/apic.h>
24#include <asm/uv/uv.h>
25
26int acpi_numa __initdata;
27
28static struct acpi_table_slit *acpi_slit;
29
30static nodemask_t nodes_parsed __initdata;
31static nodemask_t cpu_nodes_parsed __initdata;
32static struct bootnode nodes[MAX_NUMNODES] __initdata;
33static struct bootnode nodes_add[MAX_NUMNODES];
34
35static int num_node_memblks __initdata;
36static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata;
37static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata;
38
39static __init int setup_node(int pxm)
40{
41 return acpi_map_pxm_to_node(pxm);
42}
43
44static __init int conflicting_memblks(unsigned long start, unsigned long end)
45{
46 int i;
47 for (i = 0; i < num_node_memblks; i++) {
48 struct bootnode *nd = &node_memblk_range[i];
49 if (nd->start == nd->end)
50 continue;
51 if (nd->end > start && nd->start < end)
52 return memblk_nodeid[i];
53 if (nd->end == end && nd->start == start)
54 return memblk_nodeid[i];
55 }
56 return -1;
57}
58
59static __init void cutoff_node(int i, unsigned long start, unsigned long end)
60{
61 struct bootnode *nd = &nodes[i];
62
63 if (nd->start < start) {
64 nd->start = start;
65 if (nd->end < nd->start)
66 nd->start = nd->end;
67 }
68 if (nd->end > end) {
69 nd->end = end;
70 if (nd->start > nd->end)
71 nd->start = nd->end;
72 }
73}
74
75static __init void bad_srat(void)
76{
77 int i;
78 printk(KERN_ERR "SRAT: SRAT not used.\n");
79 acpi_numa = -1;
80 for (i = 0; i < MAX_LOCAL_APIC; i++)
81 apicid_to_node[i] = NUMA_NO_NODE;
82 for (i = 0; i < MAX_NUMNODES; i++) {
83 nodes[i].start = nodes[i].end = 0;
84 nodes_add[i].start = nodes_add[i].end = 0;
85 }
86 remove_all_active_ranges();
87}
88
89static __init inline int srat_disabled(void)
90{
91 return numa_off || acpi_numa < 0;
92}
93
94/* Callback for SLIT parsing */
95void __init acpi_numa_slit_init(struct acpi_table_slit *slit)
96{
97 unsigned length;
98 unsigned long phys;
99
100 length = slit->header.length;
101 phys = find_e820_area(0, max_pfn_mapped<<PAGE_SHIFT, length,
102 PAGE_SIZE);
103
104 if (phys == -1L)
105 panic(" Can not save slit!\n");
106
107 acpi_slit = __va(phys);
108 memcpy(acpi_slit, slit, length);
109 reserve_early(phys, phys + length, "ACPI SLIT");
110}
111
112/* Callback for Proximity Domain -> x2APIC mapping */
113void __init
114acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
115{
116 int pxm, node;
117 int apic_id;
118
119 if (srat_disabled())
120 return;
121 if (pa->header.length < sizeof(struct acpi_srat_x2apic_cpu_affinity)) {
122 bad_srat();
123 return;
124 }
125 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
126 return;
127 pxm = pa->proximity_domain;
128 node = setup_node(pxm);
129 if (node < 0) {
130 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
131 bad_srat();
132 return;
133 }
134
135 apic_id = pa->apic_id;
136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node);
141}
142
143/* Callback for Proximity Domain -> LAPIC mapping */
144void __init
145acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
146{
147 int pxm, node;
148 int apic_id;
149
150 if (srat_disabled())
151 return;
152 if (pa->header.length != sizeof(struct acpi_srat_cpu_affinity)) {
153 bad_srat();
154 return;
155 }
156 if ((pa->flags & ACPI_SRAT_CPU_ENABLED) == 0)
157 return;
158 pxm = pa->proximity_domain_lo;
159 node = setup_node(pxm);
160 if (node < 0) {
161 printk(KERN_ERR "SRAT: Too many proximity domains %x\n", pxm);
162 bad_srat();
163 return;
164 }
165
166 if (get_uv_system_type() >= UV_X2APIC)
167 apic_id = (pa->apic_id << 8) | pa->local_sapic_eid;
168 else
169 apic_id = pa->apic_id;
170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node);
175}
176
177#ifdef CONFIG_MEMORY_HOTPLUG_SPARSE
178static inline int save_add_info(void) {return 1;}
179#else
180static inline int save_add_info(void) {return 0;}
181#endif
182/*
183 * Update nodes_add[]
184 * This code supports one contiguous hot add area per node
185 */
186static void __init
187update_nodes_add(int node, unsigned long start, unsigned long end)
188{
189 unsigned long s_pfn = start >> PAGE_SHIFT;
190 unsigned long e_pfn = end >> PAGE_SHIFT;
191 int changed = 0;
192 struct bootnode *nd = &nodes_add[node];
193
194 /* I had some trouble with strange memory hotadd regions breaking
195 the boot. Be very strict here and reject anything unexpected.
196 If you want working memory hotadd write correct SRATs.
197
198 The node size check is a basic sanity check to guard against
199 mistakes */
200 if ((signed long)(end - start) < NODE_MIN_SIZE) {
201 printk(KERN_ERR "SRAT: Hotplug area too small\n");
202 return;
203 }
204
205 /* This check might be a bit too strict, but I'm keeping it for now. */
206 if (absent_pages_in_range(s_pfn, e_pfn) != e_pfn - s_pfn) {
207 printk(KERN_ERR
208 "SRAT: Hotplug area %lu -> %lu has existing memory\n",
209 s_pfn, e_pfn);
210 return;
211 }
212
213 /* Looks good */
214
215 if (nd->start == nd->end) {
216 nd->start = start;
217 nd->end = end;
218 changed = 1;
219 } else {
220 if (nd->start == end) {
221 nd->start = start;
222 changed = 1;
223 }
224 if (nd->end == start) {
225 nd->end = end;
226 changed = 1;
227 }
228 if (!changed)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 }
231
232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
235 nd->start, nd->end);
236 }
237}
238
239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
240void __init
241acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
242{
243 struct bootnode *nd, oldnode;
244 unsigned long start, end;
245 int node, pxm;
246 int i;
247
248 if (srat_disabled())
249 return;
250 if (ma->header.length != sizeof(struct acpi_srat_mem_affinity)) {
251 bad_srat();
252 return;
253 }
254 if ((ma->flags & ACPI_SRAT_MEM_ENABLED) == 0)
255 return;
256
257 if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && !save_add_info())
258 return;
259 start = ma->base_address;
260 end = start + ma->length;
261 pxm = ma->proximity_domain;
262 node = setup_node(pxm);
263 if (node < 0) {
264 printk(KERN_ERR "SRAT: Too many proximity domains.\n");
265 bad_srat();
266 return;
267 }
268 i = conflicting_memblks(start, end);
269 if (i == node) {
270 printk(KERN_WARNING
271 "SRAT: Warning: PXM %d (%lx-%lx) overlaps with itself (%Lx-%Lx)\n",
272 pxm, start, end, nodes[i].start, nodes[i].end);
273 } else if (i >= 0) {
274 printk(KERN_ERR
275 "SRAT: PXM %d (%lx-%lx) overlaps with PXM %d (%Lx-%Lx)\n",
276 pxm, start, end, node_to_pxm(i),
277 nodes[i].start, nodes[i].end);
278 bad_srat();
279 return;
280 }
281 nd = &nodes[node];
282 oldnode = *nd;
283 if (!node_test_and_set(node, nodes_parsed)) {
284 nd->start = start;
285 nd->end = end;
286 } else {
287 if (start < nd->start)
288 nd->start = start;
289 if (nd->end < end)
290 nd->end = end;
291 }
292
293 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
294 start, end);
295
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end);
298 /* restore nodes[node] */
299 *nd = oldnode;
300 if ((nd->start | nd->end) == 0)
301 node_clear(node, nodes_parsed);
302 }
303
304 node_memblk_range[num_node_memblks].start = start;
305 node_memblk_range[num_node_memblks].end = end;
306 memblk_nodeid[num_node_memblks] = node;
307 num_node_memblks++;
308}
309
310/* Sanity check to catch more bad SRATs (they are amazingly common).
311 Make sure the PXMs cover all memory. */
312static int __init nodes_cover_memory(const struct bootnode *nodes)
313{
314 int i;
315 unsigned long pxmram, e820ram;
316
317 pxmram = 0;
318 for_each_node_mask(i, nodes_parsed) {
319 unsigned long s = nodes[i].start >> PAGE_SHIFT;
320 unsigned long e = nodes[i].end >> PAGE_SHIFT;
321 pxmram += e - s;
322 pxmram -= __absent_pages_in_range(i, s, e);
323 if ((long)pxmram < 0)
324 pxmram = 0;
325 }
326
327 e820ram = max_pfn - (e820_hole_size(0, max_pfn<<PAGE_SHIFT)>>PAGE_SHIFT);
328 /* We seem to lose 3 pages somewhere. Allow 1M of slack. */
329 if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) {
330 printk(KERN_ERR
331 "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n",
332 (pxmram << PAGE_SHIFT) >> 20,
333 (e820ram << PAGE_SHIFT) >> 20);
334 return 0;
335 }
336 return 1;
337}
338
339void __init acpi_numa_arch_fixup(void) {}
340
341int __init acpi_get_nodes(struct bootnode *physnodes)
342{
343 int i;
344 int ret = 0;
345
346 for_each_node_mask(i, nodes_parsed) {
347 physnodes[ret].start = nodes[i].start;
348 physnodes[ret].end = nodes[i].end;
349 ret++;
350 }
351 return ret;
352}
353
354/* Use the information discovered above to actually set up the nodes. */
355int __init acpi_scan_nodes(unsigned long start, unsigned long end)
356{
357 int i;
358
359 if (acpi_numa <= 0)
360 return -1;
361
362 /* First clean up the node list */
363 for (i = 0; i < MAX_NUMNODES; i++)
364 cutoff_node(i, start, end);
365
366 /*
367 * Join together blocks on the same node, holes between
368 * which don't overlap with memory on other nodes.
369 */
370 for (i = 0; i < num_node_memblks; ++i) {
371 int j, k;
372
373 for (j = i + 1; j < num_node_memblks; ++j) {
374 unsigned long start, end;
375
376 if (memblk_nodeid[i] != memblk_nodeid[j])
377 continue;
378 start = min(node_memblk_range[i].end,
379 node_memblk_range[j].end);
380 end = max(node_memblk_range[i].start,
381 node_memblk_range[j].start);
382 for (k = 0; k < num_node_memblks; ++k) {
383 if (memblk_nodeid[i] == memblk_nodeid[k])
384 continue;
385 if (start < node_memblk_range[k].end &&
386 end > node_memblk_range[k].start)
387 break;
388 }
389 if (k < num_node_memblks)
390 continue;
391 start = min(node_memblk_range[i].start,
392 node_memblk_range[j].start);
393 end = max(node_memblk_range[i].end,
394 node_memblk_range[j].end);
395 printk(KERN_INFO "SRAT: Node %d "
396 "[%Lx,%Lx) + [%Lx,%Lx) -> [%lx,%lx)\n",
397 memblk_nodeid[i],
398 node_memblk_range[i].start,
399 node_memblk_range[i].end,
400 node_memblk_range[j].start,
401 node_memblk_range[j].end,
402 start, end);
403 node_memblk_range[i].start = start;
404 node_memblk_range[i].end = end;
405 k = --num_node_memblks - j;
406 memmove(memblk_nodeid + j, memblk_nodeid + j+1,
407 k * sizeof(*memblk_nodeid));
408 memmove(node_memblk_range + j, node_memblk_range + j+1,
409 k * sizeof(*node_memblk_range));
410 --j;
411 }
412 }
413
414 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
415 memblk_nodeid);
416 if (memnode_shift < 0) {
417 printk(KERN_ERR
418 "SRAT: No NUMA node hash function found. Contact maintainer\n");
419 bad_srat();
420 return -1;
421 }
422
423 for (i = 0; i < num_node_memblks; i++)
424 e820_register_active_regions(memblk_nodeid[i],
425 node_memblk_range[i].start >> PAGE_SHIFT,
426 node_memblk_range[i].end >> PAGE_SHIFT);
427
428 /* for out of order entries in SRAT */
429 sort_node_map();
430 if (!nodes_cover_memory(nodes)) {
431 bad_srat();
432 return -1;
433 }
434
435 /* Account for nodes with cpus and no memory */
436 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
437
438 /* Finally register nodes */
439 for_each_node_mask(i, node_possible_map)
440 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
441 /* Try again in case setup_node_bootmem missed one due
442 to missing bootmem */
443 for_each_node_mask(i, node_possible_map)
444 if (!node_online(i))
445 setup_node_bootmem(i, nodes[i].start, nodes[i].end);
446
447 for (i = 0; i < nr_cpu_ids; i++) {
448 int node = early_cpu_to_node(i);
449
450 if (node == NUMA_NO_NODE)
451 continue;
452 if (!node_online(node))
453 numa_clear_node(i);
454 }
455 numa_init_array();
456 return 0;
457}
458
459#ifdef CONFIG_NUMA_EMU
460static int fake_node_to_pxm_map[MAX_NUMNODES] __initdata = {
461 [0 ... MAX_NUMNODES-1] = PXM_INVAL
462};
463static s16 fake_apicid_to_node[MAX_LOCAL_APIC] __initdata = {
464 [0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
465};
466static int __init find_node_by_addr(unsigned long addr)
467{
468 int ret = NUMA_NO_NODE;
469 int i;
470
471 for_each_node_mask(i, nodes_parsed) {
472 /*
473 * Find the real node that this emulated node appears on. For
474 * the sake of simplicity, we only use a real node's starting
475 * address to determine which emulated node it appears on.
476 */
477 if (addr >= nodes[i].start && addr < nodes[i].end) {
478 ret = i;
479 break;
480 }
481 }
482 return ret;
483}
484
485/*
486 * In NUMA emulation, we need to setup proximity domain (_PXM) to node ID
487 * mappings that respect the real ACPI topology but reflect our emulated
488 * environment. For each emulated node, we find which real node it appears on
489 * and create PXM to NID mappings for those fake nodes which mirror that
490 * locality. SLIT will now represent the correct distances between emulated
491 * nodes as a result of the real topology.
492 */
493void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
494{
495 int i, j;
496
497 printk(KERN_INFO "Faking PXM affinity for fake nodes on real "
498 "topology.\n");
499 for (i = 0; i < num_nodes; i++) {
500 int nid, pxm;
501
502 nid = find_node_by_addr(fake_nodes[i].start);
503 if (nid == NUMA_NO_NODE)
504 continue;
505 pxm = node_to_pxm(nid);
506 if (pxm == PXM_INVAL)
507 continue;
508 fake_node_to_pxm_map[i] = pxm;
509 /*
510 * For each apicid_to_node mapping that exists for this real
511 * node, it must now point to the fake node ID.
512 */
513 for (j = 0; j < MAX_LOCAL_APIC; j++)
514 if (apicid_to_node[j] == nid &&
515 fake_apicid_to_node[j] == NUMA_NO_NODE)
516 fake_apicid_to_node[j] = i;
517 }
518 for (i = 0; i < num_nodes; i++)
519 __acpi_map_pxm_to_node(fake_node_to_pxm_map[i], i);
520 memcpy(apicid_to_node, fake_apicid_to_node, sizeof(apicid_to_node));
521
522 nodes_clear(nodes_parsed);
523 for (i = 0; i < num_nodes; i++)
524 if (fake_nodes[i].start != fake_nodes[i].end)
525 node_set(i, nodes_parsed);
526}
527
528static int null_slit_node_compare(int a, int b)
529{
530 return node_to_pxm(a) == node_to_pxm(b);
531}
532#else
533static int null_slit_node_compare(int a, int b)
534{
535 return a == b;
536}
537#endif /* CONFIG_NUMA_EMU */
538
539int __node_distance(int a, int b)
540{
541 int index;
542
543 if (!acpi_slit)
544 return null_slit_node_compare(a, b) ? LOCAL_DISTANCE :
545 REMOTE_DISTANCE;
546 index = acpi_slit->locality_count * node_to_pxm(a);
547 return acpi_slit->entry[index + node_to_pxm(b)];
548}
549
550EXPORT_SYMBOL(__node_distance);
551
552#if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) || defined(CONFIG_ACPI_HOTPLUG_MEMORY)
553int memory_add_physaddr_to_nid(u64 start)
554{
555 int i, ret = 0;
556
557 for_each_node(i)
558 if (nodes_add[i].start <= start && nodes_add[i].end > start)
559 ret = i;
560
561 return ret;
562}
563EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
564#endif
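conflicting_memblks() in the deleted srat_64.c rejects overlapping SRAT memory ranges with the usual half-open interval test: two ranges intersect iff each starts before the other ends. A minimal stand-alone sketch of that check; the helper name and sample addresses are invented.

/* Stand-alone sketch of the [start, end) overlap test used above. */
#include <stdio.h>

static int ranges_overlap(unsigned long a_start, unsigned long a_end,
			  unsigned long b_start, unsigned long b_end)
{
	return a_end > b_start && a_start < b_end;
}

int main(void)
{
	/* same shape as the SRAT memory affinity ranges */
	printf("%d\n", ranges_overlap(0x000000, 0x100000, 0x0f0000, 0x200000)); /* 1 */
	printf("%d\n", ranges_overlap(0x000000, 0x100000, 0x100000, 0x200000)); /* 0 */
	return 0;
}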
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index c03f14ab6667..d6c0418c3e47 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -5,6 +5,7 @@
5 5 #include <linux/smp.h>
6 6 #include <linux/interrupt.h>
7 7 #include <linux/module.h>
8#include <linux/cpu.h>
8 9
9 10 #include <asm/tlbflush.h>
10 11 #include <asm/mmu_context.h>
@@ -52,6 +53,8 @@ union smp_flush_state {
52 53 want false sharing in the per cpu data segment. */
53 54 static union smp_flush_state flush_state[NUM_INVALIDATE_TLB_VECTORS];
54 55
56static DEFINE_PER_CPU_READ_MOSTLY(int, tlb_vector_offset);
57
55 58 /*
56 59 * We cannot call mmdrop() because we are in interrupt context,
57 60 * instead update mm->cpu_vm_mask.
@@ -173,15 +176,11 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
173 176 union smp_flush_state *f;
174 177
175 178 /* Caller has disabled preemption */
176 sender = smp_processor_id() % NUM_INVALIDATE_TLB_VECTORS;
179 sender = this_cpu_read(tlb_vector_offset);
177 180 f = &flush_state[sender];
178 181
179 /*
180 * Could avoid this lock when
181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
182 * probably not worth checking this for a cache-hot lock.
183 */
184 raw_spin_lock(&f->tlbstate_lock);
182 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
183 raw_spin_lock(&f->tlbstate_lock);
185 184
186 185 f->flush_mm = mm;
187 186 f->flush_va = va;
@@ -199,7 +198,8 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
199 198
200 199 f->flush_mm = NULL;
201 200 f->flush_va = 0;
202 raw_spin_unlock(&f->tlbstate_lock);
201 if (nr_cpu_ids > NUM_INVALIDATE_TLB_VECTORS)
202 raw_spin_unlock(&f->tlbstate_lock);
203 203 }
204 204
205 205 void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -208,16 +208,57 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
208 208 if (is_uv_system()) {
209 209 unsigned int cpu;
210 210
211 cpu = get_cpu();
211 cpu = smp_processor_id();
212 212 cpumask = uv_flush_tlb_others(cpumask, mm, va, cpu);
213 213 if (cpumask)
214 214 flush_tlb_others_ipi(cpumask, mm, va);
215 put_cpu();
216 215 return;
217 216 }
218 217 flush_tlb_others_ipi(cpumask, mm, va);
219 218 }
220 219
220static void __cpuinit calculate_tlb_offset(void)
221{
222 int cpu, node, nr_node_vecs, idx = 0;
223 /*
224 * we are changing tlb_vector_offset for each CPU in runtime, but this
225 * will not cause inconsistency, as the write is atomic under X86. we
226 * might see more lock contentions in a short time, but after all CPU's
227 * tlb_vector_offset are changed, everything should go normal
228 *
229 * Note: if NUM_INVALIDATE_TLB_VECTORS % nr_online_nodes !=0, we might
230 * waste some vectors.
231 **/
232 if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
233 nr_node_vecs = 1;
234 else
235 nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS/nr_online_nodes;
236
237 for_each_online_node(node) {
238 int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) *
239 nr_node_vecs;
240 int cpu_offset = 0;
241 for_each_cpu(cpu, cpumask_of_node(node)) {
242 per_cpu(tlb_vector_offset, cpu) = node_offset +
243 cpu_offset;
244 cpu_offset++;
245 cpu_offset = cpu_offset % nr_node_vecs;
246 }
247 idx++;
248 }
249}
250
251static int __cpuinit tlb_cpuhp_notify(struct notifier_block *n,
252 unsigned long action, void *hcpu)
253{
254 switch (action & 0xf) {
255 case CPU_ONLINE:
256 case CPU_DEAD:
257 calculate_tlb_offset();
258 }
259 return NOTIFY_OK;
260}
261
221 262 static int __cpuinit init_smp_flush(void)
222 263 {
223 264 int i;
@@ -225,6 +266,8 @@ static int __cpuinit init_smp_flush(void)
225 266 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
226 267 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
227 268
269 calculate_tlb_offset();
270 hotcpu_notifier(tlb_cpuhp_notify, 0);
228 271 return 0;
229 272 }
230 273 core_initcall(init_smp_flush);
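calculate_tlb_offset() in the tlb.c hunk above splits the NUM_INVALIDATE_TLB_VECTORS invalidate vectors into per-node shares and lets each node's CPUs round-robin within their share, so flush IPIs from different nodes tend to land on different vectors and locks. A stand-alone sketch of the same assignment; the two-node, six-CPUs-per-node topology is made up for the example.

/* Stand-alone sketch of the per-node TLB vector partitioning above. */
#include <stdio.h>

#define NUM_INVALIDATE_TLB_VECTORS 8

int main(void)
{
	const int nr_online_nodes = 2;		/* assumed topology */
	const int cpus_per_node = 6;
	int nr_node_vecs, node, cpu, idx = 0;

	if (nr_online_nodes > NUM_INVALIDATE_TLB_VECTORS)
		nr_node_vecs = 1;
	else
		nr_node_vecs = NUM_INVALIDATE_TLB_VECTORS / nr_online_nodes;

	for (node = 0; node < nr_online_nodes; node++) {
		int node_offset = (idx % NUM_INVALIDATE_TLB_VECTORS) * nr_node_vecs;
		int cpu_offset = 0;

		for (cpu = 0; cpu < cpus_per_node; cpu++) {
			/* each CPU gets a vector inside its node's share */
			printf("node %d cpu %d -> tlb_vector_offset %d\n",
			       node, cpu, node_offset + cpu_offset);
			cpu_offset = (cpu_offset + 1) % nr_node_vecs;
		}
		idx++;
	}
	return 0;
}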
diff --git a/arch/x86/net/Makefile b/arch/x86/net/Makefile
new file mode 100644
index 000000000000..90568c33ddb0
--- /dev/null
+++ b/arch/x86/net/Makefile
@@ -0,0 +1,4 @@
1#
2# Arch-specific network modules
3#
4obj-$(CONFIG_BPF_JIT) += bpf_jit.o bpf_jit_comp.o
diff --git a/arch/x86/net/bpf_jit.S b/arch/x86/net/bpf_jit.S
new file mode 100644
index 000000000000..66870223f8c5
--- /dev/null
+++ b/arch/x86/net/bpf_jit.S
@@ -0,0 +1,140 @@
1/* bpf_jit.S : BPF JIT helper functions
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/linkage.h>
11#include <asm/dwarf2.h>
12
13/*
14 * Calling convention :
15 * rdi : skb pointer
16 * esi : offset of byte(s) to fetch in skb (can be scratched)
17 * r8 : copy of skb->data
18 * r9d : hlen = skb->len - skb->data_len
19 */
20#define SKBDATA %r8
21
22sk_load_word_ind:
23 .globl sk_load_word_ind
24
25 add %ebx,%esi /* offset += X */
26# test %esi,%esi /* if (offset < 0) goto bpf_error; */
27 js bpf_error
28
29sk_load_word:
30 .globl sk_load_word
31
32 mov %r9d,%eax # hlen
33 sub %esi,%eax # hlen - offset
34 cmp $3,%eax
35 jle bpf_slow_path_word
36 mov (SKBDATA,%rsi),%eax
37 bswap %eax /* ntohl() */
38 ret
39
40
41sk_load_half_ind:
42 .globl sk_load_half_ind
43
44 add %ebx,%esi /* offset += X */
45 js bpf_error
46
47sk_load_half:
48 .globl sk_load_half
49
50 mov %r9d,%eax
51 sub %esi,%eax # hlen - offset
52 cmp $1,%eax
53 jle bpf_slow_path_half
54 movzwl (SKBDATA,%rsi),%eax
55 rol $8,%ax # ntohs()
56 ret
57
58sk_load_byte_ind:
59 .globl sk_load_byte_ind
60 add %ebx,%esi /* offset += X */
61 js bpf_error
62
63sk_load_byte:
64 .globl sk_load_byte
65
66 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte */
67 jle bpf_slow_path_byte
68 movzbl (SKBDATA,%rsi),%eax
69 ret
70
71/**
72 * sk_load_byte_msh - BPF_S_LDX_B_MSH helper
73 *
74 * Implements BPF_S_LDX_B_MSH : ldxb 4*([offset]&0xf)
75 * Must preserve A accumulator (%eax)
76 * Inputs : %esi is the offset value, already known positive
77 */
78ENTRY(sk_load_byte_msh)
79 CFI_STARTPROC
80 cmp %esi,%r9d /* if (offset >= hlen) goto bpf_slow_path_byte_msh */
81 jle bpf_slow_path_byte_msh
82 movzbl (SKBDATA,%rsi),%ebx
83 and $15,%bl
84 shl $2,%bl
85 ret
86 CFI_ENDPROC
87ENDPROC(sk_load_byte_msh)
88
89bpf_error:
90# force a return 0 from jit handler
91 xor %eax,%eax
92 mov -8(%rbp),%rbx
93 leaveq
94 ret
95
96/* rsi contains offset and can be scratched */
97#define bpf_slow_path_common(LEN) \
98 push %rdi; /* save skb */ \
99 push %r9; \
100 push SKBDATA; \
101/* rsi already has offset */ \
102 mov $LEN,%ecx; /* len */ \
103 lea -12(%rbp),%rdx; \
104 call skb_copy_bits; \
105 test %eax,%eax; \
106 pop SKBDATA; \
107 pop %r9; \
108 pop %rdi
109
110
111bpf_slow_path_word:
112 bpf_slow_path_common(4)
113 js bpf_error
114 mov -12(%rbp),%eax
115 bswap %eax
116 ret
117
118bpf_slow_path_half:
119 bpf_slow_path_common(2)
120 js bpf_error
121 mov -12(%rbp),%ax
122 rol $8,%ax
123 movzwl %ax,%eax
124 ret
125
126bpf_slow_path_byte:
127 bpf_slow_path_common(1)
128 js bpf_error
129 movzbl -12(%rbp),%eax
130 ret
131
132bpf_slow_path_byte_msh:
133 xchg %eax,%ebx /* dont lose A , X is about to be scratched */
134 bpf_slow_path_common(1)
135 js bpf_error
136 movzbl -12(%rbp),%eax
137 and $15,%al
138 shl $2,%al
139 xchg %eax,%ebx
140 ret
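The fast paths in bpf_jit.S use "bswap %eax" for ntohl() and "rol $8,%ax" for ntohs(): on little-endian x86 both simply reverse the byte order of the value just loaded from the packet. A stand-alone C illustration of those two byte swaps; the helper names are invented.

/* Stand-alone sketch: what 'bswap %eax' and 'rol $8,%ax' compute above. */
#include <stdint.h>
#include <stdio.h>

static uint32_t bswap32(uint32_t v)		/* models 'bswap %eax' (ntohl) */
{
	return (v >> 24) | ((v >> 8) & 0x0000ff00) |
	       ((v << 8) & 0x00ff0000) | (v << 24);
}

static uint16_t rol16_8(uint16_t v)		/* models 'rol $8,%ax' (ntohs) */
{
	return (uint16_t)((v << 8) | (v >> 8));
}

int main(void)
{
	/* 0x12345678 stored little-endian reads back byte-reversed;
	 * the swap restores network (big-endian) order. */
	printf("%08x\n", bswap32(0x78563412u));	/* prints 12345678 */
	printf("%04x\n", rol16_8(0x3412));	/* prints 1234 */
	return 0;
}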
diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c
new file mode 100644
index 000000000000..bfab3fa10edc
--- /dev/null
+++ b/arch/x86/net/bpf_jit_comp.c
@@ -0,0 +1,654 @@
1/* bpf_jit_comp.c : BPF JIT compiler
2 *
3 * Copyright (C) 2011 Eric Dumazet (eric.dumazet@gmail.com)
4 *
5 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License
7 * as published by the Free Software Foundation; version 2
8 * of the License.
9 */
10#include <linux/moduleloader.h>
11#include <asm/cacheflush.h>
12#include <linux/netdevice.h>
13#include <linux/filter.h>
14
15/*
16 * Conventions :
17 * EAX : BPF A accumulator
18 * EBX : BPF X accumulator
19 * RDI : pointer to skb (first argument given to JIT function)
20 * RBP : frame pointer (even if CONFIG_FRAME_POINTER=n)
21 * ECX,EDX,ESI : scratch registers
22 * r9d : skb->len - skb->data_len (headlen)
23 * r8 : skb->data
24 * -8(RBP) : saved RBX value
25 * -16(RBP)..-80(RBP) : BPF_MEMWORDS values
26 */
27int bpf_jit_enable __read_mostly;
28
29/*
30 * assembly code in arch/x86/net/bpf_jit.S
31 */
32extern u8 sk_load_word[], sk_load_half[], sk_load_byte[], sk_load_byte_msh[];
33extern u8 sk_load_word_ind[], sk_load_half_ind[], sk_load_byte_ind[];
34
35static inline u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len)
36{
37 if (len == 1)
38 *ptr = bytes;
39 else if (len == 2)
40 *(u16 *)ptr = bytes;
41 else {
42 *(u32 *)ptr = bytes;
43 barrier();
44 }
45 return ptr + len;
46}
47
48#define EMIT(bytes, len) do { prog = emit_code(prog, bytes, len); } while (0)
49
50#define EMIT1(b1) EMIT(b1, 1)
51#define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2)
52#define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3)
53#define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4)
54#define EMIT1_off32(b1, off) do { EMIT1(b1); EMIT(off, 4);} while (0)
55
56#define CLEAR_A() EMIT2(0x31, 0xc0) /* xor %eax,%eax */
57#define CLEAR_X() EMIT2(0x31, 0xdb) /* xor %ebx,%ebx */
58
59static inline bool is_imm8(int value)
60{
61 return value <= 127 && value >= -128;
62}
63
64static inline bool is_near(int offset)
65{
66 return offset <= 127 && offset >= -128;
67}
68
69#define EMIT_JMP(offset) \
70do { \
71 if (offset) { \
72 if (is_near(offset)) \
73 EMIT2(0xeb, offset); /* jmp .+off8 */ \
74 else \
75 EMIT1_off32(0xe9, offset); /* jmp .+off32 */ \
76 } \
77} while (0)
78
79/* list of x86 cond jumps opcodes (. + s8)
80 * Add 0x10 (and an extra 0x0f) to generate far jumps (. + s32)
81 */
82#define X86_JB 0x72
83#define X86_JAE 0x73
84#define X86_JE 0x74
85#define X86_JNE 0x75
86#define X86_JBE 0x76
87#define X86_JA 0x77
88
89#define EMIT_COND_JMP(op, offset) \
90do { \
91 if (is_near(offset)) \
92 EMIT2(op, offset); /* jxx .+off8 */ \
93 else { \
94 EMIT2(0x0f, op + 0x10); \
95 EMIT(offset, 4); /* jxx .+off32 */ \
96 } \
97} while (0)
98
99#define COND_SEL(CODE, TOP, FOP) \
100 case CODE: \
101 t_op = TOP; \
102 f_op = FOP; \
103 goto cond_branch
104
105
106#define SEEN_DATAREF 1 /* might call external helpers */
107#define SEEN_XREG 2 /* ebx is used */
108#define SEEN_MEM 4 /* use mem[] for temporary storage */
109
110static inline void bpf_flush_icache(void *start, void *end)
111{
112 mm_segment_t old_fs = get_fs();
113
114 set_fs(KERNEL_DS);
115 smp_wmb();
116 flush_icache_range((unsigned long)start, (unsigned long)end);
117 set_fs(old_fs);
118}
119
120
121void bpf_jit_compile(struct sk_filter *fp)
122{
123 u8 temp[64];
124 u8 *prog;
125 unsigned int proglen, oldproglen = 0;
126 int ilen, i;
127 int t_offset, f_offset;
128 u8 t_op, f_op, seen = 0, pass;
129 u8 *image = NULL;
130 u8 *func;
131 int pc_ret0 = -1; /* bpf index of first RET #0 instruction (if any) */
132 unsigned int cleanup_addr; /* epilogue code offset */
133 unsigned int *addrs;
134 const struct sock_filter *filter = fp->insns;
135 int flen = fp->len;
136
137 if (!bpf_jit_enable)
138 return;
139
140 addrs = kmalloc(flen * sizeof(*addrs), GFP_KERNEL);
141 if (addrs == NULL)
142 return;
143
144 /* Before first pass, make a rough estimation of addrs[]
145 * each bpf instruction is translated to less than 64 bytes
146 */
147 for (proglen = 0, i = 0; i < flen; i++) {
148 proglen += 64;
149 addrs[i] = proglen;
150 }
151 cleanup_addr = proglen; /* epilogue address */
152
153 for (pass = 0; pass < 10; pass++) {
154 /* no prologue/epilogue for trivial filters (RET something) */
155 proglen = 0;
156 prog = temp;
157
158 if (seen) {
159 EMIT4(0x55, 0x48, 0x89, 0xe5); /* push %rbp; mov %rsp,%rbp */
160 EMIT4(0x48, 0x83, 0xec, 96); /* subq $96,%rsp */
161 /* note : must save %rbx in case bpf_error is hit */
162 if (seen & (SEEN_XREG | SEEN_DATAREF))
163 EMIT4(0x48, 0x89, 0x5d, 0xf8); /* mov %rbx, -8(%rbp) */
164 if (seen & SEEN_XREG)
165 CLEAR_X(); /* make sure we dont leek kernel memory */
166
167 /*
168 * If this filter needs to access skb data,
169 * loads r9 and r8 with :
170 * r9 = skb->len - skb->data_len
171 * r8 = skb->data
172 */
173 if (seen & SEEN_DATAREF) {
174 if (offsetof(struct sk_buff, len) <= 127)
175 /* mov off8(%rdi),%r9d */
176 EMIT4(0x44, 0x8b, 0x4f, offsetof(struct sk_buff, len));
177 else {
178 /* mov off32(%rdi),%r9d */
179 EMIT3(0x44, 0x8b, 0x8f);
180 EMIT(offsetof(struct sk_buff, len), 4);
181 }
182 if (is_imm8(offsetof(struct sk_buff, data_len)))
183 /* sub off8(%rdi),%r9d */
184 EMIT4(0x44, 0x2b, 0x4f, offsetof(struct sk_buff, data_len));
185 else {
186 EMIT3(0x44, 0x2b, 0x8f);
187 EMIT(offsetof(struct sk_buff, data_len), 4);
188 }
189
190 if (is_imm8(offsetof(struct sk_buff, data)))
191 /* mov off8(%rdi),%r8 */
192 EMIT4(0x4c, 0x8b, 0x47, offsetof(struct sk_buff, data));
193 else {
194 /* mov off32(%rdi),%r8 */
195 EMIT3(0x4c, 0x8b, 0x87);
196 EMIT(offsetof(struct sk_buff, data), 4);
197 }
198 }
199 }
200
201 switch (filter[0].code) {
202 case BPF_S_RET_K:
203 case BPF_S_LD_W_LEN:
204 case BPF_S_ANC_PROTOCOL:
205 case BPF_S_ANC_IFINDEX:
206 case BPF_S_ANC_MARK:
207 case BPF_S_ANC_RXHASH:
208 case BPF_S_ANC_CPU:
209 case BPF_S_ANC_QUEUE:
210 case BPF_S_LD_W_ABS:
211 case BPF_S_LD_H_ABS:
212 case BPF_S_LD_B_ABS:
213 /* first instruction sets A register (or is RET 'constant') */
214 break;
215 default:
216 /* make sure we dont leak kernel information to user */
217 CLEAR_A(); /* A = 0 */
218 }
219
220 for (i = 0; i < flen; i++) {
221 unsigned int K = filter[i].k;
222
223 switch (filter[i].code) {
224 case BPF_S_ALU_ADD_X: /* A += X; */
225 seen |= SEEN_XREG;
226 EMIT2(0x01, 0xd8); /* add %ebx,%eax */
227 break;
228 case BPF_S_ALU_ADD_K: /* A += K; */
229 if (!K)
230 break;
231 if (is_imm8(K))
232 EMIT3(0x83, 0xc0, K); /* add imm8,%eax */
233 else
234 EMIT1_off32(0x05, K); /* add imm32,%eax */
235 break;
236 case BPF_S_ALU_SUB_X: /* A -= X; */
237 seen |= SEEN_XREG;
238 EMIT2(0x29, 0xd8); /* sub %ebx,%eax */
239 break;
240 case BPF_S_ALU_SUB_K: /* A -= K */
241 if (!K)
242 break;
243 if (is_imm8(K))
244 EMIT3(0x83, 0xe8, K); /* sub imm8,%eax */
245 else
246 EMIT1_off32(0x2d, K); /* sub imm32,%eax */
247 break;
248 case BPF_S_ALU_MUL_X: /* A *= X; */
249 seen |= SEEN_XREG;
250 EMIT3(0x0f, 0xaf, 0xc3); /* imul %ebx,%eax */
251 break;
252 case BPF_S_ALU_MUL_K: /* A *= K */
253 if (is_imm8(K))
254 EMIT3(0x6b, 0xc0, K); /* imul imm8,%eax,%eax */
255 else {
256 EMIT2(0x69, 0xc0); /* imul imm32,%eax */
257 EMIT(K, 4);
258 }
259 break;
260 case BPF_S_ALU_DIV_X: /* A /= X; */
261 seen |= SEEN_XREG;
262 EMIT2(0x85, 0xdb); /* test %ebx,%ebx */
263 if (pc_ret0 != -1)
264 EMIT_COND_JMP(X86_JE, addrs[pc_ret0] - (addrs[i] - 4));
265 else {
266 EMIT_COND_JMP(X86_JNE, 2 + 5);
267 CLEAR_A();
268 EMIT1_off32(0xe9, cleanup_addr - (addrs[i] - 4)); /* jmp .+off32 */
269 }
270 EMIT4(0x31, 0xd2, 0xf7, 0xf3); /* xor %edx,%edx; div %ebx */
271 break;
272 case BPF_S_ALU_DIV_K: /* A = reciprocal_divide(A, K); */
273 EMIT3(0x48, 0x69, 0xc0); /* imul imm32,%rax,%rax */
274 EMIT(K, 4);
275 EMIT4(0x48, 0xc1, 0xe8, 0x20); /* shr $0x20,%rax */
276 break;
277 case BPF_S_ALU_AND_X:
278 seen |= SEEN_XREG;
279 EMIT2(0x21, 0xd8); /* and %ebx,%eax */
280 break;
281 case BPF_S_ALU_AND_K:
282 if (K >= 0xFFFFFF00) {
283 EMIT2(0x24, K & 0xFF); /* and imm8,%al */
284 } else if (K >= 0xFFFF0000) {
285 EMIT2(0x66, 0x25); /* and imm16,%ax */
286 EMIT2(K, 2);
287 } else {
288 EMIT1_off32(0x25, K); /* and imm32,%eax */
289 }
290 break;
291 case BPF_S_ALU_OR_X:
292 seen |= SEEN_XREG;
293 EMIT2(0x09, 0xd8); /* or %ebx,%eax */
294 break;
295 case BPF_S_ALU_OR_K:
296 if (is_imm8(K))
297 EMIT3(0x83, 0xc8, K); /* or imm8,%eax */
298 else
299 EMIT1_off32(0x0d, K); /* or imm32,%eax */
300 break;
301 case BPF_S_ALU_LSH_X: /* A <<= X; */
302 seen |= SEEN_XREG;
303 EMIT4(0x89, 0xd9, 0xd3, 0xe0); /* mov %ebx,%ecx; shl %cl,%eax */
304 break;
305 case BPF_S_ALU_LSH_K:
306 if (K == 0)
307 break;
308 else if (K == 1)
309 EMIT2(0xd1, 0xe0); /* shl %eax */
310 else
311 EMIT3(0xc1, 0xe0, K);
312 break;
313 case BPF_S_ALU_RSH_X: /* A >>= X; */
314 seen |= SEEN_XREG;
315 EMIT4(0x89, 0xd9, 0xd3, 0xe8); /* mov %ebx,%ecx; shr %cl,%eax */
316 break;
317 case BPF_S_ALU_RSH_K: /* A >>= K; */
318 if (K == 0)
319 break;
320 else if (K == 1)
321 EMIT2(0xd1, 0xe8); /* shr %eax */
322 else
323 EMIT3(0xc1, 0xe8, K);
324 break;
325 case BPF_S_ALU_NEG:
326 EMIT2(0xf7, 0xd8); /* neg %eax */
327 break;
328 case BPF_S_RET_K:
329 if (!K) {
330 if (pc_ret0 == -1)
331 pc_ret0 = i;
332 CLEAR_A();
333 } else {
334 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
335 }
336 /* fallinto */
337 case BPF_S_RET_A:
338 if (seen) {
339 if (i != flen - 1) {
340 EMIT_JMP(cleanup_addr - addrs[i]);
341 break;
342 }
343 if (seen & SEEN_XREG)
344 EMIT4(0x48, 0x8b, 0x5d, 0xf8); /* mov -8(%rbp),%rbx */
345 EMIT1(0xc9); /* leaveq */
346 }
347 EMIT1(0xc3); /* ret */
348 break;
349 case BPF_S_MISC_TAX: /* X = A */
350 seen |= SEEN_XREG;
351 EMIT2(0x89, 0xc3); /* mov %eax,%ebx */
352 break;
353 case BPF_S_MISC_TXA: /* A = X */
354 seen |= SEEN_XREG;
355 EMIT2(0x89, 0xd8); /* mov %ebx,%eax */
356 break;
357 case BPF_S_LD_IMM: /* A = K */
358 if (!K)
359 CLEAR_A();
360 else
361 EMIT1_off32(0xb8, K); /* mov $imm32,%eax */
362 break;
363 case BPF_S_LDX_IMM: /* X = K */
364 seen |= SEEN_XREG;
365 if (!K)
366 CLEAR_X();
367 else
368 EMIT1_off32(0xbb, K); /* mov $imm32,%ebx */
369 break;
370 case BPF_S_LD_MEM: /* A = mem[K] : mov off8(%rbp),%eax */
371 seen |= SEEN_MEM;
372 EMIT3(0x8b, 0x45, 0xf0 - K*4);
373 break;
374 case BPF_S_LDX_MEM: /* X = mem[K] : mov off8(%rbp),%ebx */
375 seen |= SEEN_XREG | SEEN_MEM;
376 EMIT3(0x8b, 0x5d, 0xf0 - K*4);
377 break;
378 case BPF_S_ST: /* mem[K] = A : mov %eax,off8(%rbp) */
379 seen |= SEEN_MEM;
380 EMIT3(0x89, 0x45, 0xf0 - K*4);
381 break;
382 case BPF_S_STX: /* mem[K] = X : mov %ebx,off8(%rbp) */
383 seen |= SEEN_XREG | SEEN_MEM;
384 EMIT3(0x89, 0x5d, 0xf0 - K*4);
385 break;
386 case BPF_S_LD_W_LEN: /* A = skb->len; */
387 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, len) != 4);
388 if (is_imm8(offsetof(struct sk_buff, len)))
389 /* mov off8(%rdi),%eax */
390 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, len));
391 else {
392 EMIT2(0x8b, 0x87);
393 EMIT(offsetof(struct sk_buff, len), 4);
394 }
395 break;
396 case BPF_S_LDX_W_LEN: /* X = skb->len; */
397 seen |= SEEN_XREG;
398 if (is_imm8(offsetof(struct sk_buff, len)))
399 /* mov off8(%rdi),%ebx */
400 EMIT3(0x8b, 0x5f, offsetof(struct sk_buff, len));
401 else {
402 EMIT2(0x8b, 0x9f);
403 EMIT(offsetof(struct sk_buff, len), 4);
404 }
405 break;
406 case BPF_S_ANC_PROTOCOL: /* A = ntohs(skb->protocol); */
407 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, protocol) != 2);
408 if (is_imm8(offsetof(struct sk_buff, protocol))) {
409 /* movzwl off8(%rdi),%eax */
410 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, protocol));
411 } else {
412 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
413 EMIT(offsetof(struct sk_buff, protocol), 4);
414 }
415 EMIT2(0x86, 0xc4); /* ntohs() : xchg %al,%ah */
416 break;
417 case BPF_S_ANC_IFINDEX:
418 if (is_imm8(offsetof(struct sk_buff, dev))) {
419 /* movq off8(%rdi),%rax */
420 EMIT4(0x48, 0x8b, 0x47, offsetof(struct sk_buff, dev));
421 } else {
422 EMIT3(0x48, 0x8b, 0x87); /* movq off32(%rdi),%rax */
423 EMIT(offsetof(struct sk_buff, dev), 4);
424 }
425 EMIT3(0x48, 0x85, 0xc0); /* test %rax,%rax */
426 EMIT_COND_JMP(X86_JE, cleanup_addr - (addrs[i] - 6));
427 BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4);
428 EMIT2(0x8b, 0x80); /* mov off32(%rax),%eax */
429 EMIT(offsetof(struct net_device, ifindex), 4);
430 break;
431 case BPF_S_ANC_MARK:
432 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, mark) != 4);
433 if (is_imm8(offsetof(struct sk_buff, mark))) {
434 /* mov off8(%rdi),%eax */
435 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, mark));
436 } else {
437 EMIT2(0x8b, 0x87);
438 EMIT(offsetof(struct sk_buff, mark), 4);
439 }
440 break;
441 case BPF_S_ANC_RXHASH:
442 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, rxhash) != 4);
443 if (is_imm8(offsetof(struct sk_buff, rxhash))) {
444 /* mov off8(%rdi),%eax */
445 EMIT3(0x8b, 0x47, offsetof(struct sk_buff, rxhash));
446 } else {
447 EMIT2(0x8b, 0x87);
448 EMIT(offsetof(struct sk_buff, rxhash), 4);
449 }
450 break;
451 case BPF_S_ANC_QUEUE:
452 BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, queue_mapping) != 2);
453 if (is_imm8(offsetof(struct sk_buff, queue_mapping))) {
454 /* movzwl off8(%rdi),%eax */
455 EMIT4(0x0f, 0xb7, 0x47, offsetof(struct sk_buff, queue_mapping));
456 } else {
457 EMIT3(0x0f, 0xb7, 0x87); /* movzwl off32(%rdi),%eax */
458 EMIT(offsetof(struct sk_buff, queue_mapping), 4);
459 }
460 break;
461 case BPF_S_ANC_CPU:
462#ifdef CONFIG_SMP
463 EMIT4(0x65, 0x8b, 0x04, 0x25); /* mov %gs:off32,%eax */
464 EMIT((u32)(unsigned long)&cpu_number, 4); /* A = smp_processor_id(); */
465#else
466 CLEAR_A();
467#endif
468 break;
469 case BPF_S_LD_W_ABS:
470 func = sk_load_word;
471common_load: seen |= SEEN_DATAREF;
472 if ((int)K < 0)
473 goto out;
474 t_offset = func - (image + addrs[i]);
475 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
476 EMIT1_off32(0xe8, t_offset); /* call */
477 break;
478 case BPF_S_LD_H_ABS:
479 func = sk_load_half;
480 goto common_load;
481 case BPF_S_LD_B_ABS:
482 func = sk_load_byte;
483 goto common_load;
484 case BPF_S_LDX_B_MSH:
485 if ((int)K < 0) {
486 if (pc_ret0 != -1) {
487 EMIT_JMP(addrs[pc_ret0] - addrs[i]);
488 break;
489 }
490 CLEAR_A();
491 EMIT_JMP(cleanup_addr - addrs[i]);
492 break;
493 }
494 seen |= SEEN_DATAREF | SEEN_XREG;
495 t_offset = sk_load_byte_msh - (image + addrs[i]);
496 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
497 EMIT1_off32(0xe8, t_offset); /* call sk_load_byte_msh */
498 break;
499 case BPF_S_LD_W_IND:
500 func = sk_load_word_ind;
501common_load_ind: seen |= SEEN_DATAREF | SEEN_XREG;
502 t_offset = func - (image + addrs[i]);
503 EMIT1_off32(0xbe, K); /* mov imm32,%esi */
504 EMIT1_off32(0xe8, t_offset); /* call sk_load_xxx_ind */
505 break;
506 case BPF_S_LD_H_IND:
507 func = sk_load_half_ind;
508 goto common_load_ind;
509 case BPF_S_LD_B_IND:
510 func = sk_load_byte_ind;
511 goto common_load_ind;
512 case BPF_S_JMP_JA:
513 t_offset = addrs[i + K] - addrs[i];
514 EMIT_JMP(t_offset);
515 break;
516 COND_SEL(BPF_S_JMP_JGT_K, X86_JA, X86_JBE);
517 COND_SEL(BPF_S_JMP_JGE_K, X86_JAE, X86_JB);
518 COND_SEL(BPF_S_JMP_JEQ_K, X86_JE, X86_JNE);
519 COND_SEL(BPF_S_JMP_JSET_K,X86_JNE, X86_JE);
520 COND_SEL(BPF_S_JMP_JGT_X, X86_JA, X86_JBE);
521 COND_SEL(BPF_S_JMP_JGE_X, X86_JAE, X86_JB);
522 COND_SEL(BPF_S_JMP_JEQ_X, X86_JE, X86_JNE);
523 COND_SEL(BPF_S_JMP_JSET_X,X86_JNE, X86_JE);
524
525cond_branch: f_offset = addrs[i + filter[i].jf] - addrs[i];
526 t_offset = addrs[i + filter[i].jt] - addrs[i];
527
528 /* same targets, can avoid doing the test :) */
529 if (filter[i].jt == filter[i].jf) {
530 EMIT_JMP(t_offset);
531 break;
532 }
533
534 switch (filter[i].code) {
535 case BPF_S_JMP_JGT_X:
536 case BPF_S_JMP_JGE_X:
537 case BPF_S_JMP_JEQ_X:
538 seen |= SEEN_XREG;
539 EMIT2(0x39, 0xd8); /* cmp %ebx,%eax */
540 break;
541 case BPF_S_JMP_JSET_X:
542 seen |= SEEN_XREG;
543 EMIT2(0x85, 0xd8); /* test %ebx,%eax */
544 break;
545 case BPF_S_JMP_JEQ_K:
546 if (K == 0) {
547 EMIT2(0x85, 0xc0); /* test %eax,%eax */
548 break;
549 }
550 case BPF_S_JMP_JGT_K:
551 case BPF_S_JMP_JGE_K:
552 if (K <= 127)
553 EMIT3(0x83, 0xf8, K); /* cmp imm8,%eax */
554 else
555 EMIT1_off32(0x3d, K); /* cmp imm32,%eax */
556 break;
557 case BPF_S_JMP_JSET_K:
558 if (K <= 0xFF)
559 EMIT2(0xa8, K); /* test imm8,%al */
560 else if (!(K & 0xFFFF00FF))
561 EMIT3(0xf6, 0xc4, K >> 8); /* test imm8,%ah */
562 else if (K <= 0xFFFF) {
563 EMIT2(0x66, 0xa9); /* test imm16,%ax */
564 EMIT(K, 2);
565 } else {
566 EMIT1_off32(0xa9, K); /* test imm32,%eax */
567 }
568 break;
569 }
570 if (filter[i].jt != 0) {
571 if (filter[i].jf)
572 t_offset += is_near(f_offset) ? 2 : 6;
573 EMIT_COND_JMP(t_op, t_offset);
574 if (filter[i].jf)
575 EMIT_JMP(f_offset);
576 break;
577 }
578 EMIT_COND_JMP(f_op, f_offset);
579 break;
580 default:
581 /* hmm, too complex filter, give up with jit compiler */
582 goto out;
583 }
584 ilen = prog - temp;
585 if (image) {
586 if (unlikely(proglen + ilen > oldproglen)) {
587 			pr_err("bpf_jit_compile fatal error\n");
588 kfree(addrs);
589 module_free(NULL, image);
590 return;
591 }
592 memcpy(image + proglen, temp, ilen);
593 }
594 proglen += ilen;
595 addrs[i] = proglen;
596 prog = temp;
597 }
598 /* last bpf instruction is always a RET :
599 * use it to give the cleanup instruction(s) addr
600 */
601 cleanup_addr = proglen - 1; /* ret */
602 if (seen)
603 cleanup_addr -= 1; /* leaveq */
604 if (seen & SEEN_XREG)
605 cleanup_addr -= 4; /* mov -8(%rbp),%rbx */
606
607 if (image) {
608 WARN_ON(proglen != oldproglen);
609 break;
610 }
611 if (proglen == oldproglen) {
612 image = module_alloc(max_t(unsigned int,
613 proglen,
614 sizeof(struct work_struct)));
615 if (!image)
616 goto out;
617 }
618 oldproglen = proglen;
619 }
620 if (bpf_jit_enable > 1)
621 pr_err("flen=%d proglen=%u pass=%d image=%p\n",
622 flen, proglen, pass, image);
623
624 if (image) {
625 if (bpf_jit_enable > 1)
626 print_hex_dump(KERN_ERR, "JIT code: ", DUMP_PREFIX_ADDRESS,
627 16, 1, image, proglen, false);
628
629 bpf_flush_icache(image, image + proglen);
630
631 fp->bpf_func = (void *)image;
632 }
633out:
634 kfree(addrs);
635 return;
636}
637
638static void jit_free_defer(struct work_struct *arg)
639{
640 module_free(NULL, arg);
641}
642
643/* run from softirq, we must use a work_struct to call
644 * module_free() from process context
645 */
646void bpf_jit_free(struct sk_filter *fp)
647{
648 if (fp->bpf_func != sk_run_filter) {
649 struct work_struct *work = (struct work_struct *)fp->bpf_func;
650
651 INIT_WORK(work, jit_free_defer);
652 schedule_work(work);
653 }
654}
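
A note on bpf_jit_free() above: because a filter can be torn down from softirq context, the JIT cannot call module_free() directly. Instead it reuses the first bytes of the generated image as a work_struct and defers the free to a workqueue, which is also why the image is allocated with at least sizeof(struct work_struct) bytes. A minimal sketch of the same defer-to-process-context pattern, with illustrative names and vmalloc()/vfree() standing in for module_alloc()/module_free():

#include <linux/workqueue.h>
#include <linux/vmalloc.h>

static void buf_free_work(struct work_struct *w)
{
        /* 'w' points at the start of the buffer itself; runs in process context */
        vfree(w);
}

/* May be called from softirq context; the buffer must be at least
 * sizeof(struct work_struct) bytes, as in the JIT above.
 */
static void buf_free_deferred(void *buf)
{
        struct work_struct *work = buf;

        INIT_WORK(work, buf_free_work);
        schedule_work(work);
}
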
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 3855096c59b8..a5b64ab4cd6e 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -14,17 +14,7 @@
14#include <asm/ptrace.h> 14#include <asm/ptrace.h>
15#include <asm/uaccess.h> 15#include <asm/uaccess.h>
16#include <asm/stacktrace.h> 16#include <asm/stacktrace.h>
17 17#include <linux/compat.h>
18static void backtrace_warning_symbol(void *data, char *msg,
19 unsigned long symbol)
20{
21 /* Ignore warnings */
22}
23
24static void backtrace_warning(void *data, char *msg)
25{
26 /* Ignore warnings */
27}
28 18
29static int backtrace_stack(void *data, char *name) 19static int backtrace_stack(void *data, char *name)
30{ 20{
@@ -41,21 +31,17 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
41} 31}
42 32
43static struct stacktrace_ops backtrace_ops = { 33static struct stacktrace_ops backtrace_ops = {
44 .warning = backtrace_warning,
45 .warning_symbol = backtrace_warning_symbol,
46 .stack = backtrace_stack, 34 .stack = backtrace_stack,
47 .address = backtrace_address, 35 .address = backtrace_address,
48 .walk_stack = print_context_stack, 36 .walk_stack = print_context_stack,
49}; 37};
50 38
51struct frame_head { 39#ifdef CONFIG_COMPAT
52 struct frame_head *bp; 40static struct stack_frame_ia32 *
53 unsigned long ret; 41dump_user_backtrace_32(struct stack_frame_ia32 *head)
54} __attribute__((packed));
55
56static struct frame_head *dump_user_backtrace(struct frame_head *head)
57{ 42{
58 struct frame_head bufhead[2]; 43 struct stack_frame_ia32 bufhead[2];
44 struct stack_frame_ia32 *fp;
59 45
60 /* Also check accessibility of one struct frame_head beyond */ 46 /* Also check accessibility of one struct frame_head beyond */
61 if (!access_ok(VERIFY_READ, head, sizeof(bufhead))) 47 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
@@ -63,20 +49,66 @@ static struct frame_head *dump_user_backtrace(struct frame_head *head)
63 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead))) 49 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
64 return NULL; 50 return NULL;
65 51
66 oprofile_add_trace(bufhead[0].ret); 52 fp = (struct stack_frame_ia32 *) compat_ptr(bufhead[0].next_frame);
53
54 oprofile_add_trace(bufhead[0].return_address);
55
56 /* frame pointers should strictly progress back up the stack
57 * (towards higher addresses) */
58 if (head >= fp)
59 return NULL;
60
61 return fp;
62}
63
64static inline int
65x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
66{
67 struct stack_frame_ia32 *head;
68
69 /* User process is 32-bit */
70 if (!current || !test_thread_flag(TIF_IA32))
71 return 0;
72
73 head = (struct stack_frame_ia32 *) regs->bp;
74 while (depth-- && head)
75 head = dump_user_backtrace_32(head);
76
77 return 1;
78}
79
80#else
81static inline int
82x86_backtrace_32(struct pt_regs * const regs, unsigned int depth)
83{
84 return 0;
85}
86#endif /* CONFIG_COMPAT */
87
88static struct stack_frame *dump_user_backtrace(struct stack_frame *head)
89{
90 struct stack_frame bufhead[2];
91
92 /* Also check accessibility of one struct stack_frame beyond */
93 if (!access_ok(VERIFY_READ, head, sizeof(bufhead)))
94 return NULL;
95 if (__copy_from_user_inatomic(bufhead, head, sizeof(bufhead)))
96 return NULL;
97
98 oprofile_add_trace(bufhead[0].return_address);
67 99
68 /* frame pointers should strictly progress back up the stack 100 /* frame pointers should strictly progress back up the stack
69 * (towards higher addresses) */ 101 * (towards higher addresses) */
70 if (head >= bufhead[0].bp) 102 if (head >= bufhead[0].next_frame)
71 return NULL; 103 return NULL;
72 104
73 return bufhead[0].bp; 105 return bufhead[0].next_frame;
74} 106}
75 107
76void 108void
77x86_backtrace(struct pt_regs * const regs, unsigned int depth) 109x86_backtrace(struct pt_regs * const regs, unsigned int depth)
78{ 110{
79 struct frame_head *head = (struct frame_head *)frame_pointer(regs); 111 struct stack_frame *head = (struct stack_frame *)frame_pointer(regs);
80 112
81 if (!user_mode_vm(regs)) { 113 if (!user_mode_vm(regs)) {
82 unsigned long stack = kernel_stack_pointer(regs); 114 unsigned long stack = kernel_stack_pointer(regs);
@@ -86,6 +118,9 @@ x86_backtrace(struct pt_regs * const regs, unsigned int depth)
86 return; 118 return;
87 } 119 }
88 120
121 if (x86_backtrace_32(regs, depth))
122 return;
123
89 while (depth-- && head) 124 while (depth-- && head)
90 head = dump_user_backtrace(head); 125 head = dump_user_backtrace(head);
91} 126}
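
The backtrace.c rework above splits the user-stack walk into a native and a compat (ia32) variant, but both follow the same frame-pointer walk: copy one frame (plus one beyond, for the access_ok() check) with __copy_from_user_inatomic(), record the return address, and only follow next_frame if it moves towards higher addresses. A stand-alone sketch of that walk, assuming the usual {next_frame, return_address} frame layout; the names here are illustrative, and in the kernel each dereference additionally goes through access_ok()/__copy_from_user_inatomic() as shown in the hunk above:

struct frame {
        struct frame *next_frame;
        unsigned long return_address;
};

static unsigned int walk_frames(const struct frame *fp,
                                unsigned long *out, unsigned int max)
{
        unsigned int n = 0;

        while (fp && n < max) {
                out[n++] = fp->return_address;
                /* frame pointers must strictly progress towards higher addresses */
                if (fp->next_frame <= fp)
                        break;
                fp = fp->next_frame;
        }
        return n;
}
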
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index f1575c9a2572..68894fdc034b 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -15,7 +15,7 @@
15#include <linux/notifier.h> 15#include <linux/notifier.h>
16#include <linux/smp.h> 16#include <linux/smp.h>
17#include <linux/oprofile.h> 17#include <linux/oprofile.h>
18#include <linux/sysdev.h> 18#include <linux/syscore_ops.h>
19#include <linux/slab.h> 19#include <linux/slab.h>
20#include <linux/moduleparam.h> 20#include <linux/moduleparam.h>
21#include <linux/kdebug.h> 21#include <linux/kdebug.h>
@@ -49,6 +49,10 @@ u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0; 49 val |= counter_config->user ? ARCH_PERFMON_EVENTSEL_USR : 0;
50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0; 50 val |= counter_config->kernel ? ARCH_PERFMON_EVENTSEL_OS : 0;
51 val |= (counter_config->unit_mask & 0xFF) << 8; 51 val |= (counter_config->unit_mask & 0xFF) << 8;
52 counter_config->extra &= (ARCH_PERFMON_EVENTSEL_INV |
53 ARCH_PERFMON_EVENTSEL_EDGE |
54 ARCH_PERFMON_EVENTSEL_CMASK);
55 val |= counter_config->extra;
52 event &= model->event_mask ? model->event_mask : 0xFF; 56 event &= model->event_mask ? model->event_mask : 0xFF;
53 val |= event & 0xFF; 57 val |= event & 0xFF;
54 val |= (event & 0x0F00) << 24; 58 val |= (event & 0x0F00) << 24;
@@ -65,7 +69,6 @@ static int profile_exceptions_notify(struct notifier_block *self,
65 69
66 switch (val) { 70 switch (val) {
67 case DIE_NMI: 71 case DIE_NMI:
68 case DIE_NMI_IPI:
69 if (ctr_running) 72 if (ctr_running)
70 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs)); 73 model->check_ctrs(args->regs, &__get_cpu_var(cpu_msrs));
71 else if (!nmi_enabled) 74 else if (!nmi_enabled)
@@ -109,8 +112,10 @@ static void nmi_cpu_start(void *dummy)
109static int nmi_start(void) 112static int nmi_start(void)
110{ 113{
111 get_online_cpus(); 114 get_online_cpus();
112 on_each_cpu(nmi_cpu_start, NULL, 1);
113 ctr_running = 1; 115 ctr_running = 1;
116 /* make ctr_running visible to the nmi handler: */
117 smp_mb();
118 on_each_cpu(nmi_cpu_start, NULL, 1);
114 put_online_cpus(); 119 put_online_cpus();
115 return 0; 120 return 0;
116} 121}
@@ -143,7 +148,7 @@ static inline int has_mux(void)
143 148
144inline int op_x86_phys_to_virt(int phys) 149inline int op_x86_phys_to_virt(int phys)
145{ 150{
146 return __get_cpu_var(switch_index) + phys; 151 return __this_cpu_read(switch_index) + phys;
147} 152}
148 153
149inline int op_x86_virt_to_phys(int virt) 154inline int op_x86_virt_to_phys(int virt)
@@ -361,7 +366,7 @@ static void nmi_cpu_setup(void *dummy)
361static struct notifier_block profile_exceptions_nb = { 366static struct notifier_block profile_exceptions_nb = {
362 .notifier_call = profile_exceptions_notify, 367 .notifier_call = profile_exceptions_notify,
363 .next = NULL, 368 .next = NULL,
364 .priority = 2 369 .priority = NMI_LOCAL_LOW_PRIOR,
365}; 370};
366 371
367static void nmi_cpu_restore_registers(struct op_msrs *msrs) 372static void nmi_cpu_restore_registers(struct op_msrs *msrs)
@@ -441,6 +446,7 @@ static int nmi_create_files(struct super_block *sb, struct dentry *root)
441 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask); 446 oprofilefs_create_ulong(sb, dir, "unit_mask", &counter_config[i].unit_mask);
442 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel); 447 oprofilefs_create_ulong(sb, dir, "kernel", &counter_config[i].kernel);
443 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user); 448 oprofilefs_create_ulong(sb, dir, "user", &counter_config[i].user);
449 oprofilefs_create_ulong(sb, dir, "extra", &counter_config[i].extra);
444 } 450 }
445 451
446 return 0; 452 return 0;
@@ -500,15 +506,18 @@ static int nmi_setup(void)
500 506
501 nmi_enabled = 0; 507 nmi_enabled = 0;
502 ctr_running = 0; 508 ctr_running = 0;
503 barrier(); 509 /* make variables visible to the nmi handler: */
510 smp_mb();
504 err = register_die_notifier(&profile_exceptions_nb); 511 err = register_die_notifier(&profile_exceptions_nb);
505 if (err) 512 if (err)
506 goto fail; 513 goto fail;
507 514
508 get_online_cpus(); 515 get_online_cpus();
509 register_cpu_notifier(&oprofile_cpu_nb); 516 register_cpu_notifier(&oprofile_cpu_nb);
510 on_each_cpu(nmi_cpu_setup, NULL, 1);
511 nmi_enabled = 1; 517 nmi_enabled = 1;
518 /* make nmi_enabled visible to the nmi handler: */
519 smp_mb();
520 on_each_cpu(nmi_cpu_setup, NULL, 1);
512 put_online_cpus(); 521 put_online_cpus();
513 522
514 return 0; 523 return 0;
@@ -527,7 +536,8 @@ static void nmi_shutdown(void)
527 nmi_enabled = 0; 536 nmi_enabled = 0;
528 ctr_running = 0; 537 ctr_running = 0;
529 put_online_cpus(); 538 put_online_cpus();
530 barrier(); 539 /* make variables visible to the nmi handler: */
540 smp_mb();
531 unregister_die_notifier(&profile_exceptions_nb); 541 unregister_die_notifier(&profile_exceptions_nb);
532 msrs = &get_cpu_var(cpu_msrs); 542 msrs = &get_cpu_var(cpu_msrs);
533 model->shutdown(msrs); 543 model->shutdown(msrs);
@@ -537,7 +547,7 @@ static void nmi_shutdown(void)
537 547
538#ifdef CONFIG_PM 548#ifdef CONFIG_PM
539 549
540static int nmi_suspend(struct sys_device *dev, pm_message_t state) 550static int nmi_suspend(void)
541{ 551{
542 /* Only one CPU left, just stop that one */ 552 /* Only one CPU left, just stop that one */
543 if (nmi_enabled == 1) 553 if (nmi_enabled == 1)
@@ -545,49 +555,31 @@ static int nmi_suspend(struct sys_device *dev, pm_message_t state)
545 return 0; 555 return 0;
546} 556}
547 557
548static int nmi_resume(struct sys_device *dev) 558static void nmi_resume(void)
549{ 559{
550 if (nmi_enabled == 1) 560 if (nmi_enabled == 1)
551 nmi_cpu_start(NULL); 561 nmi_cpu_start(NULL);
552 return 0;
553} 562}
554 563
555static struct sysdev_class oprofile_sysclass = { 564static struct syscore_ops oprofile_syscore_ops = {
556 .name = "oprofile",
557 .resume = nmi_resume, 565 .resume = nmi_resume,
558 .suspend = nmi_suspend, 566 .suspend = nmi_suspend,
559}; 567};
560 568
561static struct sys_device device_oprofile = { 569static void __init init_suspend_resume(void)
562 .id = 0,
563 .cls = &oprofile_sysclass,
564};
565
566static int __init init_sysfs(void)
567{ 570{
568 int error; 571 register_syscore_ops(&oprofile_syscore_ops);
569
570 error = sysdev_class_register(&oprofile_sysclass);
571 if (error)
572 return error;
573
574 error = sysdev_register(&device_oprofile);
575 if (error)
576 sysdev_class_unregister(&oprofile_sysclass);
577
578 return error;
579} 572}
580 573
581static void exit_sysfs(void) 574static void exit_suspend_resume(void)
582{ 575{
583 sysdev_unregister(&device_oprofile); 576 unregister_syscore_ops(&oprofile_syscore_ops);
584 sysdev_class_unregister(&oprofile_sysclass);
585} 577}
586 578
587#else 579#else
588 580
589static inline int init_sysfs(void) { return 0; } 581static inline void init_suspend_resume(void) { }
590static inline void exit_sysfs(void) { } 582static inline void exit_suspend_resume(void) { }
591 583
592#endif /* CONFIG_PM */ 584#endif /* CONFIG_PM */
593 585
@@ -695,9 +687,6 @@ static int __init ppro_init(char **cpu_type)
695 return 1; 687 return 1;
696} 688}
697 689
698/* in order to get sysfs right */
699static int using_nmi;
700
701int __init op_nmi_init(struct oprofile_operations *ops) 690int __init op_nmi_init(struct oprofile_operations *ops)
702{ 691{
703 __u8 vendor = boot_cpu_data.x86_vendor; 692 __u8 vendor = boot_cpu_data.x86_vendor;
@@ -705,8 +694,6 @@ int __init op_nmi_init(struct oprofile_operations *ops)
705 char *cpu_type = NULL; 694 char *cpu_type = NULL;
706 int ret = 0; 695 int ret = 0;
707 696
708 using_nmi = 0;
709
710 if (!cpu_has_apic) 697 if (!cpu_has_apic)
711 return -ENODEV; 698 return -ENODEV;
712 699
@@ -731,6 +718,15 @@ int __init op_nmi_init(struct oprofile_operations *ops)
731 case 0x11: 718 case 0x11:
732 cpu_type = "x86-64/family11h"; 719 cpu_type = "x86-64/family11h";
733 break; 720 break;
721 case 0x12:
722 cpu_type = "x86-64/family12h";
723 break;
724 case 0x14:
725 cpu_type = "x86-64/family14h";
726 break;
727 case 0x15:
728 cpu_type = "x86-64/family15h";
729 break;
734 default: 730 default:
735 return -ENODEV; 731 return -ENODEV;
736 } 732 }
@@ -786,17 +782,13 @@ int __init op_nmi_init(struct oprofile_operations *ops)
786 782
787 mux_init(ops); 783 mux_init(ops);
788 784
789 ret = init_sysfs(); 785 init_suspend_resume();
790 if (ret)
791 return ret;
792 786
793 using_nmi = 1;
794 printk(KERN_INFO "oprofile: using NMI interrupt.\n"); 787 printk(KERN_INFO "oprofile: using NMI interrupt.\n");
795 return 0; 788 return 0;
796} 789}
797 790
798void op_nmi_exit(void) 791void op_nmi_exit(void)
799{ 792{
800 if (using_nmi) 793 exit_suspend_resume();
801 exit_sysfs();
802} 794}
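
The nmi_int.c changes above replace the per-class sysdev suspend/resume hooks with syscore_ops, which take no device argument and run late with only the boot CPU online, so no class or device objects need to be registered. The general shape of that conversion, with illustrative names:

#include <linux/syscore_ops.h>

static int foo_suspend(void)
{
        /* stop hardware activity; return 0 on success */
        return 0;
}

static void foo_resume(void)
{
        /* restart hardware activity */
}

static struct syscore_ops foo_syscore_ops = {
        .suspend = foo_suspend,
        .resume  = foo_resume,
};

static void __init foo_pm_init(void)
{
        register_syscore_ops(&foo_syscore_ops);
}

static void foo_pm_exit(void)
{
        unregister_syscore_ops(&foo_syscore_ops);
}

The other recurring change in this file, setting nmi_enabled/ctr_running before the on_each_cpu() calls and replacing barrier() with smp_mb(), is about making those stores visible to the NMI handler on other CPUs, not just constraining the compiler.
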
diff --git a/arch/x86/oprofile/nmi_timer_int.c b/arch/x86/oprofile/nmi_timer_int.c
index e3ecb71b5790..720bf5a53c51 100644
--- a/arch/x86/oprofile/nmi_timer_int.c
+++ b/arch/x86/oprofile/nmi_timer_int.c
@@ -38,7 +38,7 @@ static int profile_timer_exceptions_notify(struct notifier_block *self,
38static struct notifier_block profile_timer_exceptions_nb = { 38static struct notifier_block profile_timer_exceptions_nb = {
39 .notifier_call = profile_timer_exceptions_notify, 39 .notifier_call = profile_timer_exceptions_notify,
40 .next = NULL, 40 .next = NULL,
41 .priority = 0 41 .priority = NMI_LOW_PRIOR,
42}; 42};
43 43
44static int timer_start(void) 44static int timer_start(void)
@@ -58,9 +58,6 @@ static void timer_stop(void)
58 58
59int __init op_nmi_timer_init(struct oprofile_operations *ops) 59int __init op_nmi_timer_init(struct oprofile_operations *ops)
60{ 60{
61 if ((nmi_watchdog != NMI_IO_APIC) || (atomic_read(&nmi_active) <= 0))
62 return -ENODEV;
63
64 ops->start = timer_start; 61 ops->start = timer_start;
65 ops->stop = timer_stop; 62 ops->stop = timer_stop;
66 ops->cpu_type = "timer"; 63 ops->cpu_type = "timer";
diff --git a/arch/x86/oprofile/op_counter.h b/arch/x86/oprofile/op_counter.h
index e28398df0df2..0b7b7b179cbe 100644
--- a/arch/x86/oprofile/op_counter.h
+++ b/arch/x86/oprofile/op_counter.h
@@ -22,6 +22,7 @@ struct op_counter_config {
22 unsigned long kernel; 22 unsigned long kernel;
23 unsigned long user; 23 unsigned long user;
24 unsigned long unit_mask; 24 unsigned long unit_mask;
25 unsigned long extra;
25}; 26};
26 27
27extern struct op_counter_config counter_config[]; 28extern struct op_counter_config counter_config[];
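
The "extra" field added to struct op_counter_config above is exposed as an oprofilefs file in nmi_int.c and folded into the event-select value by op_x86_get_ctrl(), where only the inv, edge and cmask bits survive. A small sketch of that masking step, assuming the kernel's u64 type and the ARCH_PERFMON_* constants from asm/perf_event.h; the helper name is illustrative:

#define EXTRA_ALLOWED   (ARCH_PERFMON_EVENTSEL_INV |    \
                         ARCH_PERFMON_EVENTSEL_EDGE |   \
                         ARCH_PERFMON_EVENTSEL_CMASK)

static u64 apply_extra(u64 val, unsigned long extra)
{
        /* drop anything userland wrote beyond inv/edge/cmask */
        return val | (extra & EXTRA_ALLOWED);
}
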
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index b67a6b5aa8d4..9cbb710dc94b 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -29,11 +29,12 @@
29#include "op_x86_model.h" 29#include "op_x86_model.h"
30#include "op_counter.h" 30#include "op_counter.h"
31 31
32#define NUM_COUNTERS 4 32#define NUM_COUNTERS 4
33#define NUM_COUNTERS_F15H 6
33#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 34#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
34#define NUM_VIRT_COUNTERS 32 35#define NUM_VIRT_COUNTERS 32
35#else 36#else
36#define NUM_VIRT_COUNTERS NUM_COUNTERS 37#define NUM_VIRT_COUNTERS 0
37#endif 38#endif
38 39
39#define OP_EVENT_MASK 0x0FFF 40#define OP_EVENT_MASK 0x0FFF
@@ -41,38 +42,61 @@
41 42
42#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21)) 43#define MSR_AMD_EVENTSEL_RESERVED ((0xFFFFFCF0ULL<<32)|(1ULL<<21))
43 44
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 45static int num_counters;
46static unsigned long reset_value[OP_MAX_COUNTER];
45 47
46#define IBS_FETCH_SIZE 6 48#define IBS_FETCH_SIZE 6
47#define IBS_OP_SIZE 12 49#define IBS_OP_SIZE 12
48 50
49static u32 ibs_caps; 51static u32 ibs_caps;
50 52
51struct op_ibs_config { 53struct ibs_config {
52 unsigned long op_enabled; 54 unsigned long op_enabled;
53 unsigned long fetch_enabled; 55 unsigned long fetch_enabled;
54 unsigned long max_cnt_fetch; 56 unsigned long max_cnt_fetch;
55 unsigned long max_cnt_op; 57 unsigned long max_cnt_op;
56 unsigned long rand_en; 58 unsigned long rand_en;
57 unsigned long dispatched_ops; 59 unsigned long dispatched_ops;
60 unsigned long branch_target;
58}; 61};
59 62
60static struct op_ibs_config ibs_config; 63struct ibs_state {
61static u64 ibs_op_ctl; 64 u64 ibs_op_ctl;
65 int branch_target;
66 unsigned long sample_size;
67};
68
69static struct ibs_config ibs_config;
70static struct ibs_state ibs_state;
62 71
63/* 72/*
64 * IBS cpuid feature detection 73 * IBS cpuid feature detection
65 */ 74 */
66 75
67#define IBS_CPUID_FEATURES 0x8000001b 76#define IBS_CPUID_FEATURES 0x8000001b
68 77
69/* 78/*
70 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but 79 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
71 * bit 0 is used to indicate the existence of IBS. 80 * bit 0 is used to indicate the existence of IBS.
72 */ 81 */
73#define IBS_CAPS_AVAIL (1LL<<0) 82#define IBS_CAPS_AVAIL (1U<<0)
74#define IBS_CAPS_RDWROPCNT (1LL<<3) 83#define IBS_CAPS_FETCHSAM (1U<<1)
75#define IBS_CAPS_OPCNT (1LL<<4) 84#define IBS_CAPS_OPSAM (1U<<2)
85#define IBS_CAPS_RDWROPCNT (1U<<3)
86#define IBS_CAPS_OPCNT (1U<<4)
87#define IBS_CAPS_BRNTRGT (1U<<5)
88#define IBS_CAPS_OPCNTEXT (1U<<6)
89
90#define IBS_CAPS_DEFAULT (IBS_CAPS_AVAIL \
91 | IBS_CAPS_FETCHSAM \
92 | IBS_CAPS_OPSAM)
93
94/*
95 * IBS APIC setup
96 */
97#define IBSCTL 0x1cc
98#define IBSCTL_LVT_OFFSET_VALID (1ULL<<8)
99#define IBSCTL_LVT_OFFSET_MASK 0x0F
76 100
77/* 101/*
78 * IBS randomization macros 102 * IBS randomization macros
@@ -92,12 +116,12 @@ static u32 get_ibs_caps(void)
92 /* check IBS cpuid feature flags */ 116 /* check IBS cpuid feature flags */
93 max_level = cpuid_eax(0x80000000); 117 max_level = cpuid_eax(0x80000000);
94 if (max_level < IBS_CPUID_FEATURES) 118 if (max_level < IBS_CPUID_FEATURES)
95 return IBS_CAPS_AVAIL; 119 return IBS_CAPS_DEFAULT;
96 120
97 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES); 121 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
98 if (!(ibs_caps & IBS_CAPS_AVAIL)) 122 if (!(ibs_caps & IBS_CAPS_AVAIL))
99 /* cpuid flags not valid */ 123 /* cpuid flags not valid */
100 return IBS_CAPS_AVAIL; 124 return IBS_CAPS_DEFAULT;
101 125
102 return ibs_caps; 126 return ibs_caps;
103} 127}
@@ -190,8 +214,8 @@ op_amd_handle_ibs(struct pt_regs * const regs,
190 rdmsrl(MSR_AMD64_IBSOPCTL, ctl); 214 rdmsrl(MSR_AMD64_IBSOPCTL, ctl);
191 if (ctl & IBS_OP_VAL) { 215 if (ctl & IBS_OP_VAL) {
192 rdmsrl(MSR_AMD64_IBSOPRIP, val); 216 rdmsrl(MSR_AMD64_IBSOPRIP, val);
193 oprofile_write_reserve(&entry, regs, val, 217 oprofile_write_reserve(&entry, regs, val, IBS_OP_CODE,
194 IBS_OP_CODE, IBS_OP_SIZE); 218 ibs_state.sample_size);
195 oprofile_add_data64(&entry, val); 219 oprofile_add_data64(&entry, val);
196 rdmsrl(MSR_AMD64_IBSOPDATA, val); 220 rdmsrl(MSR_AMD64_IBSOPDATA, val);
197 oprofile_add_data64(&entry, val); 221 oprofile_add_data64(&entry, val);
@@ -203,10 +227,14 @@ op_amd_handle_ibs(struct pt_regs * const regs,
203 oprofile_add_data64(&entry, val); 227 oprofile_add_data64(&entry, val);
204 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val); 228 rdmsrl(MSR_AMD64_IBSDCPHYSAD, val);
205 oprofile_add_data64(&entry, val); 229 oprofile_add_data64(&entry, val);
230 if (ibs_state.branch_target) {
231 rdmsrl(MSR_AMD64_IBSBRTARGET, val);
232 oprofile_add_data(&entry, (unsigned long)val);
233 }
206 oprofile_write_commit(&entry); 234 oprofile_write_commit(&entry);
207 235
208 /* reenable the IRQ */ 236 /* reenable the IRQ */
209 ctl = op_amd_randomize_ibs_op(ibs_op_ctl); 237 ctl = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
210 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 238 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
211 } 239 }
212 } 240 }
@@ -219,21 +247,32 @@ static inline void op_amd_start_ibs(void)
219 if (!ibs_caps) 247 if (!ibs_caps)
220 return; 248 return;
221 249
250 memset(&ibs_state, 0, sizeof(ibs_state));
251
252 /*
253 * Note: Since the max count settings may out of range we
254 * write back the actual used values so that userland can read
255 * it.
256 */
257
222 if (ibs_config.fetch_enabled) { 258 if (ibs_config.fetch_enabled) {
223 val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT; 259 val = ibs_config.max_cnt_fetch >> 4;
260 val = min(val, IBS_FETCH_MAX_CNT);
261 ibs_config.max_cnt_fetch = val << 4;
224 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 262 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
225 val |= IBS_FETCH_ENABLE; 263 val |= IBS_FETCH_ENABLE;
226 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 264 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
227 } 265 }
228 266
229 if (ibs_config.op_enabled) { 267 if (ibs_config.op_enabled) {
230 ibs_op_ctl = ibs_config.max_cnt_op >> 4; 268 val = ibs_config.max_cnt_op >> 4;
231 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) { 269 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
232 /* 270 /*
233 * IbsOpCurCnt not supported. See 271 * IbsOpCurCnt not supported. See
234 * op_amd_randomize_ibs_op() for details. 272 * op_amd_randomize_ibs_op() for details.
235 */ 273 */
236 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL); 274 val = clamp(val, 0x0081ULL, 0xFF80ULL);
275 ibs_config.max_cnt_op = val << 4;
237 } else { 276 } else {
238 /* 277 /*
239 * The start value is randomized with a 278 * The start value is randomized with a
@@ -241,13 +280,24 @@ static inline void op_amd_start_ibs(void)
241 * with the half of the randomized range. Also 280 * with the half of the randomized range. Also
242 * avoid underflows. 281 * avoid underflows.
243 */ 282 */
244 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET, 283 val += IBS_RANDOM_MAXCNT_OFFSET;
245 IBS_OP_MAX_CNT); 284 if (ibs_caps & IBS_CAPS_OPCNTEXT)
285 val = min(val, IBS_OP_MAX_CNT_EXT);
286 else
287 val = min(val, IBS_OP_MAX_CNT);
288 ibs_config.max_cnt_op =
289 (val - IBS_RANDOM_MAXCNT_OFFSET) << 4;
246 } 290 }
247 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops) 291 val = ((val & ~IBS_OP_MAX_CNT) << 4) | (val & IBS_OP_MAX_CNT);
248 ibs_op_ctl |= IBS_OP_CNT_CTL; 292 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0;
249 ibs_op_ctl |= IBS_OP_ENABLE; 293 val |= IBS_OP_ENABLE;
250 val = op_amd_randomize_ibs_op(ibs_op_ctl); 294 ibs_state.ibs_op_ctl = val;
295 ibs_state.sample_size = IBS_OP_SIZE;
296 if (ibs_config.branch_target) {
297 ibs_state.branch_target = 1;
298 ibs_state.sample_size++;
299 }
300 val = op_amd_randomize_ibs_op(ibs_state.ibs_op_ctl);
251 wrmsrl(MSR_AMD64_IBSOPCTL, val); 301 wrmsrl(MSR_AMD64_IBSOPCTL, val);
252 } 302 }
253} 303}
@@ -266,6 +316,81 @@ static void op_amd_stop_ibs(void)
266 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 316 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
267} 317}
268 318
319static inline int get_eilvt(int offset)
320{
321 return !setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 1);
322}
323
324static inline int put_eilvt(int offset)
325{
326 return !setup_APIC_eilvt(offset, 0, 0, 1);
327}
328
329static inline int ibs_eilvt_valid(void)
330{
331 int offset;
332 u64 val;
333 int valid = 0;
334
335 preempt_disable();
336
337 rdmsrl(MSR_AMD64_IBSCTL, val);
338 offset = val & IBSCTL_LVT_OFFSET_MASK;
339
340 if (!(val & IBSCTL_LVT_OFFSET_VALID)) {
341 pr_err(FW_BUG "cpu %d, invalid IBS interrupt offset %d (MSR%08X=0x%016llx)\n",
342 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
343 goto out;
344 }
345
346 if (!get_eilvt(offset)) {
347 pr_err(FW_BUG "cpu %d, IBS interrupt offset %d not available (MSR%08X=0x%016llx)\n",
348 smp_processor_id(), offset, MSR_AMD64_IBSCTL, val);
349 goto out;
350 }
351
352 valid = 1;
353out:
354 preempt_enable();
355
356 return valid;
357}
358
359static inline int get_ibs_offset(void)
360{
361 u64 val;
362
363 rdmsrl(MSR_AMD64_IBSCTL, val);
364 if (!(val & IBSCTL_LVT_OFFSET_VALID))
365 return -EINVAL;
366
367 return val & IBSCTL_LVT_OFFSET_MASK;
368}
369
370static void setup_APIC_ibs(void)
371{
372 int offset;
373
374 offset = get_ibs_offset();
375 if (offset < 0)
376 goto failed;
377
378 if (!setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_NMI, 0))
379 return;
380failed:
381 pr_warn("oprofile: IBS APIC setup failed on cpu #%d\n",
382 smp_processor_id());
383}
384
385static void clear_APIC_ibs(void)
386{
387 int offset;
388
389 offset = get_ibs_offset();
390 if (offset >= 0)
391 setup_APIC_eilvt(offset, 0, APIC_EILVT_MSG_FIX, 1);
392}
393
269#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 394#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
270 395
271static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 396static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
@@ -275,7 +400,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
275 int i; 400 int i;
276 401
277 /* enable active counters */ 402 /* enable active counters */
278 for (i = 0; i < NUM_COUNTERS; ++i) { 403 for (i = 0; i < num_counters; ++i) {
279 int virt = op_x86_phys_to_virt(i); 404 int virt = op_x86_phys_to_virt(i);
280 if (!reset_value[virt]) 405 if (!reset_value[virt])
281 continue; 406 continue;
@@ -294,7 +419,7 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
294{ 419{
295 int i; 420 int i;
296 421
297 for (i = 0; i < NUM_COUNTERS; ++i) { 422 for (i = 0; i < num_counters; ++i) {
298 if (!msrs->counters[i].addr) 423 if (!msrs->counters[i].addr)
299 continue; 424 continue;
300 release_perfctr_nmi(MSR_K7_PERFCTR0 + i); 425 release_perfctr_nmi(MSR_K7_PERFCTR0 + i);
@@ -306,7 +431,7 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
306{ 431{
307 int i; 432 int i;
308 433
309 for (i = 0; i < NUM_COUNTERS; i++) { 434 for (i = 0; i < num_counters; i++) {
310 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 435 if (!reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
311 goto fail; 436 goto fail;
312 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) { 437 if (!reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) {
@@ -314,8 +439,13 @@ static int op_amd_fill_in_addresses(struct op_msrs * const msrs)
314 goto fail; 439 goto fail;
315 } 440 }
316 /* both registers must be reserved */ 441 /* both registers must be reserved */
317 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 442 if (num_counters == NUM_COUNTERS_F15H) {
318 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 443 msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
444 msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
445 } else {
446 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
447 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
448 }
319 continue; 449 continue;
320 fail: 450 fail:
321 if (!counter_config[i].enabled) 451 if (!counter_config[i].enabled)
@@ -335,7 +465,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
335 int i; 465 int i;
336 466
337 /* setup reset_value */ 467 /* setup reset_value */
338 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 468 for (i = 0; i < OP_MAX_COUNTER; ++i) {
339 if (counter_config[i].enabled 469 if (counter_config[i].enabled
340 && msrs->counters[op_x86_virt_to_phys(i)].addr) 470 && msrs->counters[op_x86_virt_to_phys(i)].addr)
341 reset_value[i] = counter_config[i].count; 471 reset_value[i] = counter_config[i].count;
@@ -344,7 +474,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
344 } 474 }
345 475
346 /* clear all counters */ 476 /* clear all counters */
347 for (i = 0; i < NUM_COUNTERS; ++i) { 477 for (i = 0; i < num_counters; ++i) {
348 if (!msrs->controls[i].addr) 478 if (!msrs->controls[i].addr)
349 continue; 479 continue;
350 rdmsrl(msrs->controls[i].addr, val); 480 rdmsrl(msrs->controls[i].addr, val);
@@ -360,7 +490,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
360 } 490 }
361 491
362 /* enable active counters */ 492 /* enable active counters */
363 for (i = 0; i < NUM_COUNTERS; ++i) { 493 for (i = 0; i < num_counters; ++i) {
364 int virt = op_x86_phys_to_virt(i); 494 int virt = op_x86_phys_to_virt(i);
365 if (!reset_value[virt]) 495 if (!reset_value[virt])
366 continue; 496 continue;
@@ -376,13 +506,13 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
376 } 506 }
377 507
378 if (ibs_caps) 508 if (ibs_caps)
379 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_NMI, 0); 509 setup_APIC_ibs();
380} 510}
381 511
382static void op_amd_cpu_shutdown(void) 512static void op_amd_cpu_shutdown(void)
383{ 513{
384 if (ibs_caps) 514 if (ibs_caps)
385 setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1); 515 clear_APIC_ibs();
386} 516}
387 517
388static int op_amd_check_ctrs(struct pt_regs * const regs, 518static int op_amd_check_ctrs(struct pt_regs * const regs,
@@ -391,7 +521,7 @@ static int op_amd_check_ctrs(struct pt_regs * const regs,
391 u64 val; 521 u64 val;
392 int i; 522 int i;
393 523
394 for (i = 0; i < NUM_COUNTERS; ++i) { 524 for (i = 0; i < num_counters; ++i) {
395 int virt = op_x86_phys_to_virt(i); 525 int virt = op_x86_phys_to_virt(i);
396 if (!reset_value[virt]) 526 if (!reset_value[virt])
397 continue; 527 continue;
@@ -414,7 +544,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
414 u64 val; 544 u64 val;
415 int i; 545 int i;
416 546
417 for (i = 0; i < NUM_COUNTERS; ++i) { 547 for (i = 0; i < num_counters; ++i) {
418 if (!reset_value[op_x86_phys_to_virt(i)]) 548 if (!reset_value[op_x86_phys_to_virt(i)])
419 continue; 549 continue;
420 rdmsrl(msrs->controls[i].addr, val); 550 rdmsrl(msrs->controls[i].addr, val);
@@ -434,7 +564,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
434 * Subtle: stop on all counters to avoid race with setting our 564 * Subtle: stop on all counters to avoid race with setting our
435 * pm callback 565 * pm callback
436 */ 566 */
437 for (i = 0; i < NUM_COUNTERS; ++i) { 567 for (i = 0; i < num_counters; ++i) {
438 if (!reset_value[op_x86_phys_to_virt(i)]) 568 if (!reset_value[op_x86_phys_to_virt(i)])
439 continue; 569 continue;
440 rdmsrl(msrs->controls[i].addr, val); 570 rdmsrl(msrs->controls[i].addr, val);
@@ -445,16 +575,11 @@ static void op_amd_stop(struct op_msrs const * const msrs)
445 op_amd_stop_ibs(); 575 op_amd_stop_ibs();
446} 576}
447 577
448static int __init_ibs_nmi(void) 578static int setup_ibs_ctl(int ibs_eilvt_off)
449{ 579{
450#define IBSCTL_LVTOFFSETVAL (1 << 8)
451#define IBSCTL 0x1cc
452 struct pci_dev *cpu_cfg; 580 struct pci_dev *cpu_cfg;
453 int nodes; 581 int nodes;
454 u32 value = 0; 582 u32 value = 0;
455 u8 ibs_eilvt_off;
456
457 ibs_eilvt_off = setup_APIC_eilvt_ibs(0, APIC_EILVT_MSG_FIX, 1);
458 583
459 nodes = 0; 584 nodes = 0;
460 cpu_cfg = NULL; 585 cpu_cfg = NULL;
@@ -466,25 +591,75 @@ static int __init_ibs_nmi(void)
466 break; 591 break;
467 ++nodes; 592 ++nodes;
468 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off 593 pci_write_config_dword(cpu_cfg, IBSCTL, ibs_eilvt_off
469 | IBSCTL_LVTOFFSETVAL); 594 | IBSCTL_LVT_OFFSET_VALID);
470 pci_read_config_dword(cpu_cfg, IBSCTL, &value); 595 pci_read_config_dword(cpu_cfg, IBSCTL, &value);
471 if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { 596 if (value != (ibs_eilvt_off | IBSCTL_LVT_OFFSET_VALID)) {
472 pci_dev_put(cpu_cfg); 597 pci_dev_put(cpu_cfg);
473 printk(KERN_DEBUG "Failed to setup IBS LVT offset, " 598 printk(KERN_DEBUG "Failed to setup IBS LVT offset, "
474 "IBSCTL = 0x%08x", value); 599 "IBSCTL = 0x%08x\n", value);
475 return 1; 600 return -EINVAL;
476 } 601 }
477 } while (1); 602 } while (1);
478 603
479 if (!nodes) { 604 if (!nodes) {
480 printk(KERN_DEBUG "No CPU node configured for IBS"); 605 printk(KERN_DEBUG "No CPU node configured for IBS\n");
481 return 1; 606 return -ENODEV;
607 }
608
609 return 0;
610}
611
612/*
 613 * This runs only on the current cpu. We try to find an LVT offset and
 614 * set up the local APIC. For this we must disable preemption. On
 615 * success we initialize all nodes with this offset; this then updates
 616 * the offset in the per-node IBS_CTL MSR. The per-core APIC setup of
 617 * the IBS interrupt vector is done from op_amd_setup_ctrs() and
 618 * op_amd_cpu_shutdown() using the new offset.
619 */
620static int force_ibs_eilvt_setup(void)
621{
622 int offset;
623 int ret;
624
625 preempt_disable();
626 /* find the next free available EILVT entry, skip offset 0 */
627 for (offset = 1; offset < APIC_EILVT_NR_MAX; offset++) {
628 if (get_eilvt(offset))
629 break;
630 }
631 preempt_enable();
632
633 if (offset == APIC_EILVT_NR_MAX) {
634 printk(KERN_DEBUG "No EILVT entry available\n");
635 return -EBUSY;
482 } 636 }
483 637
638 ret = setup_ibs_ctl(offset);
639 if (ret)
640 goto out;
641
642 if (!ibs_eilvt_valid()) {
643 ret = -EFAULT;
644 goto out;
645 }
646
647 pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset);
648 pr_err(FW_BUG "workaround enabled for IBS LVT offset\n");
649
484 return 0; 650 return 0;
651out:
652 preempt_disable();
653 put_eilvt(offset);
654 preempt_enable();
655 return ret;
485} 656}
486 657
487/* initialize the APIC for the IBS interrupts if available */ 658/*
659 * check and reserve APIC extended interrupt LVT offset for IBS if
660 * available
661 */
662
488static void init_ibs(void) 663static void init_ibs(void)
489{ 664{
490 ibs_caps = get_ibs_caps(); 665 ibs_caps = get_ibs_caps();
@@ -492,13 +667,18 @@ static void init_ibs(void)
492 if (!ibs_caps) 667 if (!ibs_caps)
493 return; 668 return;
494 669
495 if (__init_ibs_nmi()) { 670 if (ibs_eilvt_valid())
496 ibs_caps = 0; 671 goto out;
497 return; 672
498 } 673 if (!force_ibs_eilvt_setup())
674 goto out;
499 675
500 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", 676 /* Failed to setup ibs */
501 (unsigned)ibs_caps); 677 ibs_caps = 0;
678 return;
679
680out:
681 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n", ibs_caps);
502} 682}
503 683
504static int (*create_arch_files)(struct super_block *sb, struct dentry *root); 684static int (*create_arch_files)(struct super_block *sb, struct dentry *root);
@@ -521,44 +701,60 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
521 /* model specific files */ 701 /* model specific files */
522 702
523 /* setup some reasonable defaults */ 703 /* setup some reasonable defaults */
704 memset(&ibs_config, 0, sizeof(ibs_config));
524 ibs_config.max_cnt_fetch = 250000; 705 ibs_config.max_cnt_fetch = 250000;
525 ibs_config.fetch_enabled = 0;
526 ibs_config.max_cnt_op = 250000; 706 ibs_config.max_cnt_op = 250000;
527 ibs_config.op_enabled = 0; 707
528 ibs_config.dispatched_ops = 0; 708 if (ibs_caps & IBS_CAPS_FETCHSAM) {
529 709 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
530 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 710 oprofilefs_create_ulong(sb, dir, "enable",
531 oprofilefs_create_ulong(sb, dir, "enable", 711 &ibs_config.fetch_enabled);
532 &ibs_config.fetch_enabled); 712 oprofilefs_create_ulong(sb, dir, "max_count",
533 oprofilefs_create_ulong(sb, dir, "max_count", 713 &ibs_config.max_cnt_fetch);
534 &ibs_config.max_cnt_fetch); 714 oprofilefs_create_ulong(sb, dir, "rand_enable",
535 oprofilefs_create_ulong(sb, dir, "rand_enable", 715 &ibs_config.rand_en);
536 &ibs_config.rand_en); 716 }
537 717
538 dir = oprofilefs_mkdir(sb, root, "ibs_op"); 718 if (ibs_caps & IBS_CAPS_OPSAM) {
539 oprofilefs_create_ulong(sb, dir, "enable", 719 dir = oprofilefs_mkdir(sb, root, "ibs_op");
540 &ibs_config.op_enabled); 720 oprofilefs_create_ulong(sb, dir, "enable",
541 oprofilefs_create_ulong(sb, dir, "max_count", 721 &ibs_config.op_enabled);
542 &ibs_config.max_cnt_op); 722 oprofilefs_create_ulong(sb, dir, "max_count",
543 if (ibs_caps & IBS_CAPS_OPCNT) 723 &ibs_config.max_cnt_op);
544 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 724 if (ibs_caps & IBS_CAPS_OPCNT)
545 &ibs_config.dispatched_ops); 725 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
726 &ibs_config.dispatched_ops);
727 if (ibs_caps & IBS_CAPS_BRNTRGT)
728 oprofilefs_create_ulong(sb, dir, "branch_target",
729 &ibs_config.branch_target);
730 }
546 731
547 return 0; 732 return 0;
548} 733}
549 734
735struct op_x86_model_spec op_amd_spec;
736
550static int op_amd_init(struct oprofile_operations *ops) 737static int op_amd_init(struct oprofile_operations *ops)
551{ 738{
552 init_ibs(); 739 init_ibs();
553 create_arch_files = ops->create_files; 740 create_arch_files = ops->create_files;
554 ops->create_files = setup_ibs_files; 741 ops->create_files = setup_ibs_files;
742
743 if (boot_cpu_data.x86 == 0x15) {
744 num_counters = NUM_COUNTERS_F15H;
745 } else {
746 num_counters = NUM_COUNTERS;
747 }
748
749 op_amd_spec.num_counters = num_counters;
750 op_amd_spec.num_controls = num_counters;
751 op_amd_spec.num_virt_counters = max(num_counters, NUM_VIRT_COUNTERS);
752
555 return 0; 753 return 0;
556} 754}
557 755
558struct op_x86_model_spec op_amd_spec = { 756struct op_x86_model_spec op_amd_spec = {
559 .num_counters = NUM_COUNTERS, 757 /* num_counters/num_controls filled in at runtime */
560 .num_controls = NUM_COUNTERS,
561 .num_virt_counters = NUM_VIRT_COUNTERS,
562 .reserved = MSR_AMD_EVENTSEL_RESERVED, 758 .reserved = MSR_AMD_EVENTSEL_RESERVED,
563 .event_mask = OP_EVENT_MASK, 759 .event_mask = OP_EVENT_MASK,
564 .init = op_amd_init, 760 .init = op_amd_init,
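
One structural point in the op_model_amd.c changes above: the counter count and MSR layout are now chosen at init time rather than compile time, because family 15h exposes six counters at MSR_F15H_PERF_CTL/CTR with a stride of two, while older families keep four counters at MSR_K7_EVNTSEL0/PERFCTR0 with a stride of one. A condensed sketch of that selection; the constants mirror the patch, the helper itself is illustrative:

static void fill_counter_addrs(struct op_msrs *msrs, int num_counters)
{
        int i;

        for (i = 0; i < num_counters; i++) {
                if (num_counters == NUM_COUNTERS_F15H) {
                        /* family 15h: paired CTL/CTR MSRs, stride 2 */
                        msrs->controls[i].addr = MSR_F15H_PERF_CTL + (i << 1);
                        msrs->counters[i].addr = MSR_F15H_PERF_CTR + (i << 1);
                } else {
                        /* K7 .. family 10h: legacy layout, stride 1 */
                        msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
                        msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
                }
        }
}
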
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index 182558dd5515..98ab13058f89 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -11,7 +11,7 @@
11#include <linux/oprofile.h> 11#include <linux/oprofile.h>
12#include <linux/smp.h> 12#include <linux/smp.h>
13#include <linux/ptrace.h> 13#include <linux/ptrace.h>
14#include <linux/nmi.h> 14#include <asm/nmi.h>
15#include <asm/msr.h> 15#include <asm/msr.h>
16#include <asm/fixmap.h> 16#include <asm/fixmap.h>
17#include <asm/apic.h> 17#include <asm/apic.h>
@@ -50,7 +50,7 @@ static inline void setup_num_counters(void)
50#endif 50#endif
51} 51}
52 52
53static int inline addr_increment(void) 53static inline int addr_increment(void)
54{ 54{
55#ifdef CONFIG_SMP 55#ifdef CONFIG_SMP
56 return smp_num_siblings == 2 ? 2 : 1; 56 return smp_num_siblings == 2 ? 2 : 1;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index d769cda54082..94b745045e45 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -95,8 +95,8 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
95 * counter width: 95 * counter width:
96 */ 96 */
97 if (!(eax.split.version_id == 0 && 97 if (!(eax.split.version_id == 0 &&
98 current_cpu_data.x86 == 6 && 98 __this_cpu_read(cpu_info.x86) == 6 &&
99 current_cpu_data.x86_model == 15)) { 99 __this_cpu_read(cpu_info.x86_model) == 15)) {
100 100
101 if (counter_width < eax.split.bit_width) 101 if (counter_width < eax.split.bit_width)
102 counter_width = eax.split.bit_width; 102 counter_width = eax.split.bit_width;
@@ -235,8 +235,8 @@ static void arch_perfmon_setup_counters(void)
235 eax.full = cpuid_eax(0xa); 235 eax.full = cpuid_eax(0xa);
236 236
237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */ 237 /* Workaround for BIOS bugs in 6/15. Taken from perfmon2 */
238 if (eax.split.version_id == 0 && current_cpu_data.x86 == 6 && 238 if (eax.split.version_id == 0 && __this_cpu_read(cpu_info.x86) == 6 &&
239 current_cpu_data.x86_model == 15) { 239 __this_cpu_read(cpu_info.x86_model) == 15) {
240 eax.split.version_id = 2; 240 eax.split.version_id = 2;
241 eax.split.num_counters = 2; 241 eax.split.num_counters = 2;
242 eax.split.bit_width = 40; 242 eax.split.bit_width = 40;
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index a0207a7fdf39..6b8759f7634e 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -4,8 +4,10 @@ obj-$(CONFIG_PCI_BIOS) += pcbios.o
4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o 4obj-$(CONFIG_PCI_MMCONFIG) += mmconfig_$(BITS).o direct.o mmconfig-shared.o
5obj-$(CONFIG_PCI_DIRECT) += direct.o 5obj-$(CONFIG_PCI_DIRECT) += direct.o
6obj-$(CONFIG_PCI_OLPC) += olpc.o 6obj-$(CONFIG_PCI_OLPC) += olpc.o
7obj-$(CONFIG_PCI_XEN) += xen.o
7 8
8obj-y += fixup.o 9obj-y += fixup.o
10obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
9obj-$(CONFIG_ACPI) += acpi.o 11obj-$(CONFIG_ACPI) += acpi.o
10obj-y += legacy.o irq.o 12obj-y += legacy.o irq.o
11 13
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 15466c096ba5..68c3c1395202 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -138,7 +138,6 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
138 struct acpi_resource_address64 addr; 138 struct acpi_resource_address64 addr;
139 acpi_status status; 139 acpi_status status;
140 unsigned long flags; 140 unsigned long flags;
141 struct resource *root, *conflict;
142 u64 start, end; 141 u64 start, end;
143 142
144 status = resource_to_addr(acpi_res, &addr); 143 status = resource_to_addr(acpi_res, &addr);
@@ -146,12 +145,10 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
146 return AE_OK; 145 return AE_OK;
147 146
148 if (addr.resource_type == ACPI_MEMORY_RANGE) { 147 if (addr.resource_type == ACPI_MEMORY_RANGE) {
149 root = &iomem_resource;
150 flags = IORESOURCE_MEM; 148 flags = IORESOURCE_MEM;
151 if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY) 149 if (addr.info.mem.caching == ACPI_PREFETCHABLE_MEMORY)
152 flags |= IORESOURCE_PREFETCH; 150 flags |= IORESOURCE_PREFETCH;
153 } else if (addr.resource_type == ACPI_IO_RANGE) { 151 } else if (addr.resource_type == ACPI_IO_RANGE) {
154 root = &ioport_resource;
155 flags = IORESOURCE_IO; 152 flags = IORESOURCE_IO;
156 } else 153 } else
157 return AE_OK; 154 return AE_OK;
@@ -172,25 +169,90 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
172 return AE_OK; 169 return AE_OK;
173 } 170 }
174 171
175 conflict = insert_resource_conflict(root, res); 172 info->res_num++;
176 if (conflict) { 173 if (addr.translation_offset)
177 dev_err(&info->bridge->dev, 174 dev_info(&info->bridge->dev, "host bridge window %pR "
178 "address space collision: host bridge window %pR " 175 "(PCI address [%#llx-%#llx])\n",
179 "conflicts with %s %pR\n", 176 res, res->start - addr.translation_offset,
180 res, conflict->name, conflict); 177 res->end - addr.translation_offset);
181 } else { 178 else
182 pci_bus_add_resource(info->bus, res, 0); 179 dev_info(&info->bridge->dev, "host bridge window %pR\n", res);
183 info->res_num++; 180
184 if (addr.translation_offset) 181 return AE_OK;
185 dev_info(&info->bridge->dev, "host bridge window %pR " 182}
186 "(PCI address [%#llx-%#llx])\n", 183
187 res, res->start - addr.translation_offset, 184static bool resource_contains(struct resource *res, resource_size_t point)
188 res->end - addr.translation_offset); 185{
186 if (res->start <= point && point <= res->end)
187 return true;
188 return false;
189}
190
191static void coalesce_windows(struct pci_root_info *info, unsigned long type)
192{
193 int i, j;
194 struct resource *res1, *res2;
195
196 for (i = 0; i < info->res_num; i++) {
197 res1 = &info->res[i];
198 if (!(res1->flags & type))
199 continue;
200
201 for (j = i + 1; j < info->res_num; j++) {
202 res2 = &info->res[j];
203 if (!(res2->flags & type))
204 continue;
205
206 /*
207 * I don't like throwing away windows because then
208 * our resources no longer match the ACPI _CRS, but
209 * the kernel resource tree doesn't allow overlaps.
210 */
211 if (resource_contains(res1, res2->start) ||
212 resource_contains(res1, res2->end) ||
213 resource_contains(res2, res1->start) ||
214 resource_contains(res2, res1->end)) {
215 res1->start = min(res1->start, res2->start);
216 res1->end = max(res1->end, res2->end);
217 dev_info(&info->bridge->dev,
218 "host bridge window expanded to %pR; %pR ignored\n",
219 res1, res2);
220 res2->flags = 0;
221 }
222 }
223 }
224}
225
226static void add_resources(struct pci_root_info *info)
227{
228 int i;
229 struct resource *res, *root, *conflict;
230
231 if (!pci_use_crs)
232 return;
233
234 coalesce_windows(info, IORESOURCE_MEM);
235 coalesce_windows(info, IORESOURCE_IO);
236
237 for (i = 0; i < info->res_num; i++) {
238 res = &info->res[i];
239
240 if (res->flags & IORESOURCE_MEM)
241 root = &iomem_resource;
242 else if (res->flags & IORESOURCE_IO)
243 root = &ioport_resource;
189 else 244 else
190 dev_info(&info->bridge->dev, 245 continue;
191 "host bridge window %pR\n", res); 246
247 conflict = insert_resource_conflict(root, res);
248 if (conflict)
249 dev_err(&info->bridge->dev,
250 "address space collision: host bridge window %pR "
251 "conflicts with %s %pR\n",
252 res, conflict->name, conflict);
253 else
254 pci_bus_add_resource(info->bus, res, 0);
192 } 255 }
193 return AE_OK;
194} 256}
195 257
196static void 258static void
@@ -224,6 +286,7 @@ get_current_resources(struct acpi_device *device, int busnum,
224 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource, 286 acpi_walk_resources(device->handle, METHOD_NAME__CRS, setup_resource,
225 &info); 287 &info);
226 288
289 add_resources(&info);
227 return; 290 return;
228 291
229name_alloc_fail: 292name_alloc_fail:
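
The coalesce_windows() logic added above merges _CRS host bridge windows of the same type whenever they overlap at either end, because insert_resource_conflict() cannot accept overlapping siblings in the resource tree. A stand-alone version of the overlap test and merge, with illustrative types:

struct window { unsigned long start, end, flags; };

static int overlaps(const struct window *a, const struct window *b)
{
        return (a->start <= b->start && b->start <= a->end) ||
               (a->start <= b->end   && b->end   <= a->end) ||
               (b->start <= a->start && a->start <= b->end) ||
               (b->start <= a->end   && a->end   <= b->end);
}

static void merge(struct window *a, struct window *b)
{
        if (!overlaps(a, b))
                return;
        if (b->start < a->start)
                a->start = b->start;
        if (b->end > a->end)
                a->end = b->end;
        b->flags = 0;   /* second window is dropped, as in the patch */
}
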
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index fc1e8fe07e5c..026e4931d162 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -4,6 +4,7 @@
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h> 5#include <linux/range.h>
6 6
7#include <asm/amd_nb.h>
7#include <asm/pci_x86.h> 8#include <asm/pci_x86.h>
8 9
9#include <asm/pci-direct.h> 10#include <asm/pci-direct.h>
@@ -349,7 +350,7 @@ static int __init early_fill_mp_bus_info(void)
349 350
350#define ENABLE_CF8_EXT_CFG (1ULL << 46) 351#define ENABLE_CF8_EXT_CFG (1ULL << 46)
351 352
352static void enable_pci_io_ecs(void *unused) 353static void __cpuinit enable_pci_io_ecs(void *unused)
353{ 354{
354 u64 reg; 355 u64 reg;
355 rdmsrl(MSR_AMD64_NB_CFG, reg); 356 rdmsrl(MSR_AMD64_NB_CFG, reg);
@@ -378,6 +379,34 @@ static struct notifier_block __cpuinitdata amd_cpu_notifier = {
378 .notifier_call = amd_cpu_notify, 379 .notifier_call = amd_cpu_notify,
379}; 380};
380 381
382static void __init pci_enable_pci_io_ecs(void)
383{
384#ifdef CONFIG_AMD_NB
385 unsigned int i, n;
386
387 for (n = i = 0; !n && amd_nb_bus_dev_ranges[i].dev_limit; ++i) {
388 u8 bus = amd_nb_bus_dev_ranges[i].bus;
389 u8 slot = amd_nb_bus_dev_ranges[i].dev_base;
390 u8 limit = amd_nb_bus_dev_ranges[i].dev_limit;
391
392 for (; slot < limit; ++slot) {
393 u32 val = read_pci_config(bus, slot, 3, 0);
394
395 if (!early_is_amd_nb(val))
396 continue;
397
398 val = read_pci_config(bus, slot, 3, 0x8c);
399 if (!(val & (ENABLE_CF8_EXT_CFG >> 32))) {
400 val |= ENABLE_CF8_EXT_CFG >> 32;
401 write_pci_config(bus, slot, 3, 0x8c, val);
402 }
403 ++n;
404 }
405 }
406 pr_info("Extended Config Space enabled on %u nodes\n", n);
407#endif
408}
409
381static int __init pci_io_ecs_init(void) 410static int __init pci_io_ecs_init(void)
382{ 411{
383 int cpu; 412 int cpu;
@@ -386,6 +415,10 @@ static int __init pci_io_ecs_init(void)
386 if (boot_cpu_data.x86 < 0x10) 415 if (boot_cpu_data.x86 < 0x10)
387 return 0; 416 return 0;
388 417
418 /* Try the PCI method first. */
419 if (early_pci_allowed())
420 pci_enable_pci_io_ecs();
421
389 register_cpu_notifier(&amd_cpu_notifier); 422 register_cpu_notifier(&amd_cpu_notifier);
390 for_each_online_cpu(cpu) 423 for_each_online_cpu(cpu)
391 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE, 424 amd_cpu_notify(&amd_cpu_notifier, (unsigned long)CPU_ONLINE,
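
The early-PCI path added to amd_bus.c above writes the upper dword of the northbridge configuration register at function 3, offset 0x8c, which is why the 64-bit ENABLE_CF8_EXT_CFG bit appears shifted right by 32 there. A quick stand-alone check of that arithmetic (plain C, not from the patch):

#include <stdio.h>

int main(void)
{
        unsigned long long enable_cf8_ext_cfg = 1ULL << 46;
        unsigned int upper_dword_mask = enable_cf8_ext_cfg >> 32;

        /* bit 46 of the 64-bit register is bit 14 of the upper dword */
        printf("upper-dword mask = %#x\n", upper_dword_mask);  /* prints 0x4000 */
        return 0;
}
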
diff --git a/arch/x86/pci/broadcom_bus.c b/arch/x86/pci/broadcom_bus.c
index 0846a5bbbfbd..ab8269b0da29 100644
--- a/arch/x86/pci/broadcom_bus.c
+++ b/arch/x86/pci/broadcom_bus.c
@@ -9,6 +9,7 @@
9 * option) any later version. 9 * option) any later version.
10 */ 10 */
11 11
12#include <linux/acpi.h>
12#include <linux/delay.h> 13#include <linux/delay.h>
13#include <linux/dmi.h> 14#include <linux/dmi.h>
14#include <linux/pci.h> 15#include <linux/pci.h>
@@ -25,12 +26,14 @@ static void __devinit cnb20le_res(struct pci_dev *dev)
25 u8 fbus, lbus; 26 u8 fbus, lbus;
26 int i; 27 int i;
27 28
29#ifdef CONFIG_ACPI
28 /* 30 /*
29 * The x86_pci_root_bus_res_quirks() function already refuses to use 31 * We should get host bridge information from ACPI unless the BIOS
30 * this information if ACPI _CRS was used. Therefore, we don't bother 32 * doesn't support it.
31 * checking if ACPI is enabled, and just generate the information
32 * for both the ACPI _CRS and no ACPI cases.
33 */ 33 */
34 if (acpi_os_get_root_pointer())
35 return;
36#endif
34 37
35 info = &pci_root_info[pci_root_num]; 38 info = &pci_root_info[pci_root_num];
36 pci_root_num++; 39 pci_root_num++;
diff --git a/arch/x86/pci/ce4100.c b/arch/x86/pci/ce4100.c
new file mode 100644
index 000000000000..67858be4b52b
--- /dev/null
+++ b/arch/x86/pci/ce4100.c
@@ -0,0 +1,316 @@
1/*
2 * GPL LICENSE SUMMARY
3 *
4 * Copyright(c) 2010 Intel Corporation. All rights reserved.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of version 2 of the GNU General Public License as
8 * published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful, but
11 * WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, write to the Free Software
17 * Foundation, Inc., 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
18 * The full GNU General Public License is included in this distribution
19 * in the file called LICENSE.GPL.
20 *
21 * Contact Information:
22 * Intel Corporation
23 * 2200 Mission College Blvd.
24 * Santa Clara, CA 97052
25 *
26 * This provides access methods for PCI registers that mis-behave on
27 * the CE4100. Each register can be assigned a private init, read and
28 * write routine. The exception to this is the bridge device. The
29 * bridge device is the only device on bus zero (0) that requires any
30 * fixup so it is a special case ATM
31 */
32
33#include <linux/kernel.h>
34#include <linux/pci.h>
35#include <linux/init.h>
36
37#include <asm/ce4100.h>
38#include <asm/pci_x86.h>
39
40struct sim_reg {
41 u32 value;
42 u32 mask;
43};
44
45struct sim_dev_reg {
46 int dev_func;
47 int reg;
48 void (*init)(struct sim_dev_reg *reg);
49 void (*read)(struct sim_dev_reg *reg, u32 *value);
50 void (*write)(struct sim_dev_reg *reg, u32 value);
51 struct sim_reg sim_reg;
52};
53
54struct sim_reg_op {
55 void (*init)(struct sim_dev_reg *reg);
56 void (*read)(struct sim_dev_reg *reg, u32 value);
57 void (*write)(struct sim_dev_reg *reg, u32 value);
58};
59
60#define MB (1024 * 1024)
61#define KB (1024)
62#define SIZE_TO_MASK(size) (~(size - 1))
63
64#define DEFINE_REG(device, func, offset, size, init_op, read_op, write_op)\
65{ PCI_DEVFN(device, func), offset, init_op, read_op, write_op,\
66 {0, SIZE_TO_MASK(size)} },
67
68static void reg_init(struct sim_dev_reg *reg)
69{
70 pci_direct_conf1.read(0, 1, reg->dev_func, reg->reg, 4,
71 &reg->sim_reg.value);
72}
73
74static void reg_read(struct sim_dev_reg *reg, u32 *value)
75{
76 unsigned long flags;
77
78 raw_spin_lock_irqsave(&pci_config_lock, flags);
79 *value = reg->sim_reg.value;
80 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
81}
82
83static void reg_write(struct sim_dev_reg *reg, u32 value)
84{
85 unsigned long flags;
86
87 raw_spin_lock_irqsave(&pci_config_lock, flags);
88 reg->sim_reg.value = (value & reg->sim_reg.mask) |
89 (reg->sim_reg.value & ~reg->sim_reg.mask);
90 raw_spin_unlock_irqrestore(&pci_config_lock, flags);
91}
92
93static void sata_reg_init(struct sim_dev_reg *reg)
94{
95 pci_direct_conf1.read(0, 1, PCI_DEVFN(14, 0), 0x10, 4,
96 &reg->sim_reg.value);
97 reg->sim_reg.value += 0x400;
98}
99
100static void ehci_reg_read(struct sim_dev_reg *reg, u32 *value)
101{
102 reg_read(reg, value);
103 if (*value != reg->sim_reg.mask)
104 *value |= 0x100;
105}
106
107void sata_revid_init(struct sim_dev_reg *reg)
108{
109 reg->sim_reg.value = 0x01060100;
110 reg->sim_reg.mask = 0;
111}
112
113static void sata_revid_read(struct sim_dev_reg *reg, u32 *value)
114{
115 reg_read(reg, value);
116}
117
118static struct sim_dev_reg bus1_fixups[] = {
119 DEFINE_REG(2, 0, 0x10, (16*MB), reg_init, reg_read, reg_write)
120 DEFINE_REG(2, 0, 0x14, (256), reg_init, reg_read, reg_write)
121 DEFINE_REG(2, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
122 DEFINE_REG(3, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
123 DEFINE_REG(4, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
124 DEFINE_REG(4, 1, 0x10, (128*KB), reg_init, reg_read, reg_write)
125 DEFINE_REG(6, 0, 0x10, (512*KB), reg_init, reg_read, reg_write)
126 DEFINE_REG(6, 1, 0x10, (512*KB), reg_init, reg_read, reg_write)
127 DEFINE_REG(6, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
128 DEFINE_REG(8, 0, 0x10, (1*MB), reg_init, reg_read, reg_write)
129 DEFINE_REG(8, 1, 0x10, (64*KB), reg_init, reg_read, reg_write)
130 DEFINE_REG(8, 2, 0x10, (64*KB), reg_init, reg_read, reg_write)
131 DEFINE_REG(9, 0, 0x10 , (1*MB), reg_init, reg_read, reg_write)
132 DEFINE_REG(9, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
133 DEFINE_REG(10, 0, 0x10, (256), reg_init, reg_read, reg_write)
134 DEFINE_REG(10, 0, 0x14, (256*MB), reg_init, reg_read, reg_write)
135 DEFINE_REG(11, 0, 0x10, (256), reg_init, reg_read, reg_write)
136 DEFINE_REG(11, 0, 0x14, (256), reg_init, reg_read, reg_write)
137 DEFINE_REG(11, 1, 0x10, (256), reg_init, reg_read, reg_write)
138 DEFINE_REG(11, 2, 0x10, (256), reg_init, reg_read, reg_write)
139 DEFINE_REG(11, 2, 0x14, (256), reg_init, reg_read, reg_write)
140 DEFINE_REG(11, 2, 0x18, (256), reg_init, reg_read, reg_write)
141 DEFINE_REG(11, 3, 0x10, (256), reg_init, reg_read, reg_write)
142 DEFINE_REG(11, 3, 0x14, (256), reg_init, reg_read, reg_write)
143 DEFINE_REG(11, 4, 0x10, (256), reg_init, reg_read, reg_write)
144 DEFINE_REG(11, 5, 0x10, (64*KB), reg_init, reg_read, reg_write)
145 DEFINE_REG(11, 6, 0x10, (256), reg_init, reg_read, reg_write)
146 DEFINE_REG(11, 7, 0x10, (64*KB), reg_init, reg_read, reg_write)
147 DEFINE_REG(12, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
148 DEFINE_REG(12, 0, 0x14, (256), reg_init, reg_read, reg_write)
149 DEFINE_REG(12, 1, 0x10, (1024), reg_init, reg_read, reg_write)
150 DEFINE_REG(13, 0, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
151 DEFINE_REG(13, 1, 0x10, (32*KB), reg_init, ehci_reg_read, reg_write)
152 DEFINE_REG(14, 0, 0x8, 0, sata_revid_init, sata_revid_read, 0)
153 DEFINE_REG(14, 0, 0x10, 0, reg_init, reg_read, reg_write)
154 DEFINE_REG(14, 0, 0x14, 0, reg_init, reg_read, reg_write)
155 DEFINE_REG(14, 0, 0x18, 0, reg_init, reg_read, reg_write)
156 DEFINE_REG(14, 0, 0x1C, 0, reg_init, reg_read, reg_write)
157 DEFINE_REG(14, 0, 0x20, 0, reg_init, reg_read, reg_write)
158 DEFINE_REG(14, 0, 0x24, (0x200), sata_reg_init, reg_read, reg_write)
159 DEFINE_REG(15, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
160 DEFINE_REG(15, 0, 0x14, (64*KB), reg_init, reg_read, reg_write)
161 DEFINE_REG(16, 0, 0x10, (64*KB), reg_init, reg_read, reg_write)
162 DEFINE_REG(16, 0, 0x14, (64*MB), reg_init, reg_read, reg_write)
163 DEFINE_REG(16, 0, 0x18, (64*MB), reg_init, reg_read, reg_write)
164 DEFINE_REG(17, 0, 0x10, (128*KB), reg_init, reg_read, reg_write)
165 DEFINE_REG(18, 0, 0x10, (1*KB), reg_init, reg_read, reg_write)
166};
167
168static void __init init_sim_regs(void)
169{
170 int i;
171
172 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
173 if (bus1_fixups[i].init)
174 bus1_fixups[i].init(&bus1_fixups[i]);
175 }
176}
177
178static inline void extract_bytes(u32 *value, int reg, int len)
179{
180 uint32_t mask;
181
182 *value >>= ((reg & 3) * 8);
183 mask = 0xFFFFFFFF >> ((4 - len) * 8);
184 *value &= mask;
185}
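/*
 * Illustrative note, not in the original source: for a 2-byte config read at
 * offset 0x12, ce4100_conf_read() below fetches the aligned 32-bit simulated
 * register at 0x10 and extract_bytes() trims it. With *value = 0xAABBCCDD,
 * reg = 0x12, len = 2:
 *
 *   *value >>= (0x12 & 3) * 8;              ->  0x0000AABB
 *   mask    = 0xFFFFFFFF >> ((4 - 2) * 8);  ->  0x0000FFFF
 *   *value &= mask;                         ->  0x0000AABB
 */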
186
187int bridge_read(unsigned int devfn, int reg, int len, u32 *value)
188{
189 u32 av_bridge_base, av_bridge_limit;
190 int retval = 0;
191
192 switch (reg) {
193 /* Make BARs appear to not request any memory. */
194 case PCI_BASE_ADDRESS_0:
195 case PCI_BASE_ADDRESS_0 + 1:
196 case PCI_BASE_ADDRESS_0 + 2:
197 case PCI_BASE_ADDRESS_0 + 3:
198 *value = 0;
199 break;
200
201	/* The subordinate bus number register is hardwired
202	 * to zero and read-only, so simulate it here.
203	 */
204 case PCI_PRIMARY_BUS:
205 if (len == 4)
206 *value = 0x00010100;
207 break;
208
209 case PCI_SUBORDINATE_BUS:
210 *value = 1;
211 break;
212
213 case PCI_MEMORY_BASE:
214 case PCI_MEMORY_LIMIT:
215 /* Get the A/V bridge base address. */
216 pci_direct_conf1.read(0, 0, devfn,
217 PCI_BASE_ADDRESS_0, 4, &av_bridge_base);
218
219 av_bridge_limit = av_bridge_base + (512*MB - 1);
220 av_bridge_limit >>= 16;
221 av_bridge_limit &= 0xFFF0;
222
223 av_bridge_base >>= 16;
224 av_bridge_base &= 0xFFF0;
225
226 if (reg == PCI_MEMORY_LIMIT)
227 *value = av_bridge_limit;
228 else if (len == 2)
229 *value = av_bridge_base;
230 else
231 *value = (av_bridge_limit << 16) | av_bridge_base;
232 break;
233	/* Make the prefetchable memory limit smaller than the prefetchable
234	 * memory base, so that no prefetchable memory space is claimed.
235	 */
236 case PCI_PREF_MEMORY_BASE:
237 *value = 0xFFF0;
238 break;
239 case PCI_PREF_MEMORY_LIMIT:
240 *value = 0x0;
241 break;
242	/* Make the IO limit smaller than the IO base so that no IO space is claimed. */
243 case PCI_IO_BASE:
244 *value = 0xF0;
245 break;
246 case PCI_IO_LIMIT:
247 *value = 0;
248 break;
249 default:
250 retval = 1;
251 }
252 return retval;
253}
254
255static int ce4100_conf_read(unsigned int seg, unsigned int bus,
256 unsigned int devfn, int reg, int len, u32 *value)
257{
258 int i;
259
260 if (bus == 1) {
261 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
262 if (bus1_fixups[i].dev_func == devfn &&
263 bus1_fixups[i].reg == (reg & ~3) &&
264 bus1_fixups[i].read) {
265 bus1_fixups[i].read(&(bus1_fixups[i]),
266 value);
267 extract_bytes(value, reg, len);
268 return 0;
269 }
270 }
271 }
272
273 if (bus == 0 && (PCI_DEVFN(1, 0) == devfn) &&
274 !bridge_read(devfn, reg, len, value))
275 return 0;
276
277 return pci_direct_conf1.read(seg, bus, devfn, reg, len, value);
278}
279
280static int ce4100_conf_write(unsigned int seg, unsigned int bus,
281 unsigned int devfn, int reg, int len, u32 value)
282{
283 int i;
284
285 if (bus == 1) {
286 for (i = 0; i < ARRAY_SIZE(bus1_fixups); i++) {
287 if (bus1_fixups[i].dev_func == devfn &&
288 bus1_fixups[i].reg == (reg & ~3) &&
289 bus1_fixups[i].write) {
290 bus1_fixups[i].write(&(bus1_fixups[i]),
291 value);
292 return 0;
293 }
294 }
295 }
296
297 /* Discard writes to A/V bridge BAR. */
298 if (bus == 0 && PCI_DEVFN(1, 0) == devfn &&
299 ((reg & ~3) == PCI_BASE_ADDRESS_0))
300 return 0;
301
302 return pci_direct_conf1.write(seg, bus, devfn, reg, len, value);
303}
304
305struct pci_raw_ops ce4100_pci_conf = {
306 .read = ce4100_conf_read,
307 .write = ce4100_conf_write,
308};
309
310int __init ce4100_pci_init(void)
311{
312 init_sim_regs();
313 raw_pci_ops = &ce4100_pci_conf;
314	/* Indicate to the caller that it should invoke pci_legacy_init() */
315 return 1;
316}
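For illustration only, a hedged sketch (not part of the file above) of how the simulated bus-1 config space behaves once ce4100_pci_init() has installed ce4100_pci_conf as raw_pci_ops: BAR accesses on bus 1 are served from bus1_fixups[] rather than the hardware, so the usual write-ones-and-read-back BAR sizing works against the simulated mask. The demo function name is made up.

#include <linux/init.h>
#include <linux/pci.h>
#include <asm/pci_x86.h>

static u32 __init ce4100_sim_bar_demo(void)
{
	u32 bar;

	/* BAR0 of dev 2, func 0 on bus 1: answered by reg_read(), not hardware */
	raw_pci_ops->read(0, 1, PCI_DEVFN(2, 0), 0x10, 4, &bar);

	/* Writing all ones only touches the writable bits (the size mask), so
	 * the read-back is ~(16*MB - 1) plus the preserved low bits, which is
	 * what the PCI core expects when it sizes the BAR. */
	raw_pci_ops->write(0, 1, PCI_DEVFN(2, 0), 0x10, 4, 0xFFFFFFFF);
	raw_pci_ops->read(0, 1, PCI_DEVFN(2, 0), 0x10, 4, &bar);

	return bar;
}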
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index a0772af64efb..5fe75026ecc2 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -22,6 +22,7 @@ unsigned int pci_probe = PCI_PROBE_BIOS | PCI_PROBE_CONF1 | PCI_PROBE_CONF2 |
22 22
23unsigned int pci_early_dump_regs; 23unsigned int pci_early_dump_regs;
24static int pci_bf_sort; 24static int pci_bf_sort;
25static int smbios_type_b1_flag;
25int pci_routeirq; 26int pci_routeirq;
26int noioapicquirk; 27int noioapicquirk;
27#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS 28#ifdef CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS
@@ -185,6 +186,39 @@ static int __devinit set_bf_sort(const struct dmi_system_id *d)
185 return 0; 186 return 0;
186} 187}
187 188
189static void __devinit read_dmi_type_b1(const struct dmi_header *dm,
190 void *private_data)
191{
192 u8 *d = (u8 *)dm + 4;
193
194 if (dm->type != 0xB1)
195 return;
196 switch (((*(u32 *)d) >> 9) & 0x03) {
197 case 0x00:
198 printk(KERN_INFO "dmi type 0xB1 record - unknown flag\n");
199 break;
200 case 0x01: /* set pci=bfsort */
201 smbios_type_b1_flag = 1;
202 break;
203 case 0x02: /* do not set pci=bfsort */
204 smbios_type_b1_flag = 2;
205 break;
206 default:
207 break;
208 }
209}
210
211static int __devinit find_sort_method(const struct dmi_system_id *d)
212{
213 dmi_walk(read_dmi_type_b1, NULL);
214
215 if (smbios_type_b1_flag == 1) {
216 set_bf_sort(d);
217 return 0;
218 }
219 return -1;
220}
221
188/* 222/*
189 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus) 223 * Enable renumbering of PCI bus# ranges to reach all PCI busses (Cardbus)
190 */ 224 */
@@ -213,6 +247,13 @@ static const struct dmi_system_id __devinitconst pciprobe_dmi_table[] = {
213 }, 247 },
214#endif /* __i386__ */ 248#endif /* __i386__ */
215 { 249 {
250 .callback = find_sort_method,
251 .ident = "Dell System",
252 .matches = {
253 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc"),
254 },
255 },
256 {
216 .callback = set_bf_sort, 257 .callback = set_bf_sort,
217 .ident = "Dell PowerEdge 1950", 258 .ident = "Dell PowerEdge 1950",
218 .matches = { 259 .matches = {
@@ -421,16 +462,10 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
421 462
422 return bus; 463 return bus;
423} 464}
424 465void __init pcibios_set_cache_line_size(void)
425int __init pcibios_init(void)
426{ 466{
427 struct cpuinfo_x86 *c = &boot_cpu_data; 467 struct cpuinfo_x86 *c = &boot_cpu_data;
428 468
429 if (!raw_pci_ops) {
430 printk(KERN_WARNING "PCI: System does not support PCI\n");
431 return 0;
432 }
433
434 /* 469 /*
435 * Set PCI cacheline size to that of the CPU if the CPU has reported it. 470 * Set PCI cacheline size to that of the CPU if the CPU has reported it.
436 * (For older CPUs that don't support cpuid, we se it to 32 bytes 471 * (For older CPUs that don't support cpuid, we se it to 32 bytes
@@ -445,7 +480,16 @@ int __init pcibios_init(void)
445 pci_dfl_cache_line_size = 32 >> 2; 480 pci_dfl_cache_line_size = 32 >> 2;
446 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n"); 481 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
447 } 482 }
483}
484
485int __init pcibios_init(void)
486{
487 if (!raw_pci_ops) {
488 printk(KERN_WARNING "PCI: System does not support PCI\n");
489 return 0;
490 }
448 491
492 pcibios_set_cache_line_size();
449 pcibios_resource_survey(); 493 pcibios_resource_survey();
450 494
451 if (pci_bf_sort >= pci_force_bf) 495 if (pci_bf_sort >= pci_force_bf)
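For reference, a hedged standalone sketch (plain C, not kernel code) of the bit extraction read_dmi_type_b1() performs on the Dell OEM 0xB1 record above: the two policy bits sit at bits 9-10 of the dword that follows the 4-byte SMBIOS record header. The record layout shown in main() is a made-up example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Returns 0 = unknown, 1 = set pci=bfsort, 2 = do not set pci=bfsort. */
static int decode_b1_flag(const uint8_t *dm)
{
	uint32_t d;

	memcpy(&d, dm + 4, sizeof(d));	/* data starts after type/len/handle */
	return (d >> 9) & 0x03;
}

int main(void)
{
	/* type 0xB1, length 8, handle 0, then data dword 0x00000200 (bit 9 set) */
	uint8_t rec[8] = { 0xB1, 8, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00 };

	printf("flag = %d\n", decode_b1_flag(rec));	/* prints "flag = 1" */
	return 0;
}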
diff --git a/arch/x86/pci/direct.c b/arch/x86/pci/direct.c
index bd33620b0071..e6fd8473fb7b 100644
--- a/arch/x86/pci/direct.c
+++ b/arch/x86/pci/direct.c
@@ -280,12 +280,9 @@ void __init pci_direct_init(int type)
280 280
281int __init pci_direct_probe(void) 281int __init pci_direct_probe(void)
282{ 282{
283 struct resource *region, *region2;
284
285 if ((pci_probe & PCI_PROBE_CONF1) == 0) 283 if ((pci_probe & PCI_PROBE_CONF1) == 0)
286 goto type2; 284 goto type2;
287 region = request_region(0xCF8, 8, "PCI conf1"); 285 if (!request_region(0xCF8, 8, "PCI conf1"))
288 if (!region)
289 goto type2; 286 goto type2;
290 287
291 if (pci_check_type1()) { 288 if (pci_check_type1()) {
@@ -293,16 +290,14 @@ int __init pci_direct_probe(void)
293 port_cf9_safe = true; 290 port_cf9_safe = true;
294 return 1; 291 return 1;
295 } 292 }
296 release_resource(region); 293 release_region(0xCF8, 8);
297 294
298 type2: 295 type2:
299 if ((pci_probe & PCI_PROBE_CONF2) == 0) 296 if ((pci_probe & PCI_PROBE_CONF2) == 0)
300 return 0; 297 return 0;
301 region = request_region(0xCF8, 4, "PCI conf2"); 298 if (!request_region(0xCF8, 4, "PCI conf2"))
302 if (!region)
303 return 0; 299 return 0;
304 region2 = request_region(0xC000, 0x1000, "PCI conf2"); 300 if (!request_region(0xC000, 0x1000, "PCI conf2"))
305 if (!region2)
306 goto fail2; 301 goto fail2;
307 302
308 if (pci_check_type2()) { 303 if (pci_check_type2()) {
@@ -311,8 +306,8 @@ int __init pci_direct_probe(void)
311 return 2; 306 return 2;
312 } 307 }
313 308
314 release_resource(region2); 309 release_region(0xC000, 0x1000);
315 fail2: 310 fail2:
316 release_resource(region); 311 release_region(0xCF8, 4);
317 return 0; 312 return 0;
318} 313}
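A hedged sketch of the idiom the probe now relies on: request_region() returns NULL when the range is already claimed, and release_region() pairs with it by base address and length, so no struct resource pointer has to be carried around. The function name is illustrative only.

#include <linux/init.h>
#include <linux/ioport.h>
#include <linux/errno.h>

static int __init claim_conf1_ports_demo(void)
{
	if (!request_region(0xCF8, 8, "PCI conf1 demo"))
		return -EBUSY;		/* 0xCF8-0xCFF already owned */

	/* ... probe the configuration mechanism here ... */

	release_region(0xCF8, 8);	/* matched by base and length */
	return 0;
}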
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 55253095be84..494f2e7ea2b4 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -72,9 +72,6 @@ pcibios_align_resource(void *data, const struct resource *res,
72 return start; 72 return start;
73 if (start & 0x300) 73 if (start & 0x300)
74 start = (start + 0x3ff) & ~0x3ff; 74 start = (start + 0x3ff) & ~0x3ff;
75 } else if (res->flags & IORESOURCE_MEM) {
76 if (start < BIOS_END)
77 start = BIOS_END;
78 } 75 }
79 return start; 76 return start;
80} 77}
@@ -244,7 +241,7 @@ void __init pcibios_resource_survey(void)
244 e820_reserve_resources_late(); 241 e820_reserve_resources_late();
245 /* 242 /*
246 * Insert the IO APIC resources after PCI initialization has 243 * Insert the IO APIC resources after PCI initialization has
247 * occured to handle IO APICS that are mapped in on a BAR in 244 * occurred to handle IO APICS that are mapped in on a BAR in
248 * PCI space, but before trying to assign unassigned pci res. 245 * PCI space, but before trying to assign unassigned pci res.
249 */ 246 */
250 ioapic_insert_resources(); 247 ioapic_insert_resources();
@@ -307,10 +304,12 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
307 /* 304 /*
308 * ioremap() and ioremap_nocache() defaults to UC MINUS for now. 305 * ioremap() and ioremap_nocache() defaults to UC MINUS for now.
309 * To avoid attribute conflicts, request UC MINUS here 306 * To avoid attribute conflicts, request UC MINUS here
310 * aswell. 307 * as well.
311 */ 308 */
312 prot |= _PAGE_CACHE_UC_MINUS; 309 prot |= _PAGE_CACHE_UC_MINUS;
313 310
311 prot |= _PAGE_IOMAP; /* creating a mapping for IO */
312
314 vma->vm_page_prot = __pgprot(prot); 313 vma->vm_page_prot = __pgprot(prot);
315 314
316 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, 315 if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index f547ee05f715..372e9b8989b3 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -584,32 +584,33 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
584 case PCI_DEVICE_ID_INTEL_ICH9_3: 584 case PCI_DEVICE_ID_INTEL_ICH9_3:
585 case PCI_DEVICE_ID_INTEL_ICH9_4: 585 case PCI_DEVICE_ID_INTEL_ICH9_4:
586 case PCI_DEVICE_ID_INTEL_ICH9_5: 586 case PCI_DEVICE_ID_INTEL_ICH9_5:
587 case PCI_DEVICE_ID_INTEL_TOLAPAI_0: 587 case PCI_DEVICE_ID_INTEL_EP80579_0:
588 case PCI_DEVICE_ID_INTEL_ICH10_0: 588 case PCI_DEVICE_ID_INTEL_ICH10_0:
589 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
590 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
591 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_0:
593 case PCI_DEVICE_ID_INTEL_PATSBURG_LPC_1:
592 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
593 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
594 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
595 return 1; 597 return 1;
596 } 598 }
597 599
598 if ((device >= PCI_DEVICE_ID_INTEL_PCH_LPC_MIN) && 600 if ((device >= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MIN &&
599 (device <= PCI_DEVICE_ID_INTEL_PCH_LPC_MAX)) { 601 device <= PCI_DEVICE_ID_INTEL_5_3400_SERIES_LPC_MAX)
602 || (device >= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN &&
603 device <= PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX)
604 || (device >= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MIN &&
605 device <= PCI_DEVICE_ID_INTEL_DH89XXCC_LPC_MAX)
606 || (device >= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MIN &&
607 device <= PCI_DEVICE_ID_INTEL_PANTHERPOINT_LPC_MAX)) {
600 r->name = "PIIX/ICH"; 608 r->name = "PIIX/ICH";
601 r->get = pirq_piix_get; 609 r->get = pirq_piix_get;
602 r->set = pirq_piix_set; 610 r->set = pirq_piix_set;
603 return 1; 611 return 1;
604 } 612 }
605 613
606 if ((device >= PCI_DEVICE_ID_INTEL_CPT_LPC_MIN) &&
607 (device <= PCI_DEVICE_ID_INTEL_CPT_LPC_MAX)) {
608 r->name = "PIIX/ICH";
609 r->get = pirq_piix_get;
610 r->set = pirq_piix_set;
611 return 1;
612 }
613 return 0; 614 return 0;
614} 615}
615 616
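The probe above now tests four separate LPC device-ID windows with the same comparison pattern; a hedged helper (not in the patch) expressing one such test might look like this.

#include <linux/types.h>

static bool intel_lpc_id_in_range(u16 device, u16 min, u16 max)
{
	return device >= min && device <= max;
}

/* e.g.: intel_lpc_id_in_range(device,
 *		PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MIN,
 *		PCI_DEVICE_ID_INTEL_COUGARPOINT_LPC_MAX) */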
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index a918553ebc75..750c346ef50a 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -65,7 +65,6 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
65 int end, u64 addr) 65 int end, u64 addr)
66{ 66{
67 struct pci_mmcfg_region *new; 67 struct pci_mmcfg_region *new;
68 int num_buses;
69 struct resource *res; 68 struct resource *res;
70 69
71 if (addr == 0) 70 if (addr == 0)
@@ -82,10 +81,9 @@ static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
82 81
83 list_add_sorted(new); 82 list_add_sorted(new);
84 83
85 num_buses = end - start + 1;
86 res = &new->res; 84 res = &new->res;
87 res->start = addr + PCI_MMCFG_BUS_OFFSET(start); 85 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
88 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1; 86 res->end = addr + PCI_MMCFG_BUS_OFFSET(end + 1) - 1;
89 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; 87 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
90 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN, 88 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
91 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end); 89 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
@@ -608,6 +606,16 @@ static void __init __pci_mmcfg_init(int early)
608 if (list_empty(&pci_mmcfg_list)) 606 if (list_empty(&pci_mmcfg_list))
609 return; 607 return;
610 608
609 if (pcibios_last_bus < 0) {
610 const struct pci_mmcfg_region *cfg;
611
612 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
613 if (cfg->segment)
614 break;
615 pcibios_last_bus = cfg->end_bus;
616 }
617 }
618
611 if (pci_mmcfg_arch_init()) 619 if (pci_mmcfg_arch_init())
612 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF; 620 pci_probe = (pci_probe & ~PCI_PROBE_MASK) | PCI_PROBE_MMCONF;
613 else { 621 else {
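A hedged worked example of the resource arithmetic fixed above, assuming the usual PCI_MMCFG_BUS_OFFSET(bus) definition of bus << 20 (each bus decodes 1 MiB of config space: 32 devices x 8 functions x 4 KiB).

/*
 * For an MMCONFIG window at 0xE0000000 covering buses 0x20-0x3F:
 *
 *   res->start = 0xE0000000 + (0x20 << 20)     = 0xE2000000
 *   res->end   = 0xE0000000 + (0x40 << 20) - 1 = 0xE3FFFFFF
 *
 * The old "num_buses" arithmetic produced 0xE1FFFFFF, i.e. an end below the
 * start whenever the window does not begin at bus 0.
 */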
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b34815408f58..13700ec8e2e4 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,7 +304,7 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC XO-1\n");
308 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
309 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
310 return 0; 310 return 0;
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 2492d165096a..a5f7d0d63de0 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -9,6 +9,7 @@
9#include <linux/uaccess.h> 9#include <linux/uaccess.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11#include <asm/pci-functions.h> 11#include <asm/pci-functions.h>
12#include <asm/cacheflush.h>
12 13
13/* BIOS32 signature: "_32_" */ 14/* BIOS32 signature: "_32_" */
14#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24)) 15#define BIOS32_SIGNATURE (('_' << 0) + ('3' << 8) + ('2' << 16) + ('_' << 24))
@@ -25,6 +26,27 @@
25#define PCIBIOS_HW_TYPE1_SPEC 0x10 26#define PCIBIOS_HW_TYPE1_SPEC 0x10
26#define PCIBIOS_HW_TYPE2_SPEC 0x20 27#define PCIBIOS_HW_TYPE2_SPEC 0x20
27 28
29int pcibios_enabled;
30
31/* According to the BIOS specification at:
32 * http://members.datafast.net.au/dft0802/specs/bios21.pdf, we could
33 * restrict the executable zone to a few pages and make it read-only,
34 * but that may be broken on some BIOSes and is complex to handle with
35 * static_protections. We could make the 0xe0000-0x100000 range rox,
36 * but this can break some ISA mappings.
37 *
38 * So we leave an rw and x hole when pcibios is used. This shouldn't
39 * happen on a modern system with mmconfig, and if you don't want it
40 * you can disable pcibios...
41 */
42static inline void set_bios_x(void)
43{
44 pcibios_enabled = 1;
45 set_memory_x(PAGE_OFFSET + BIOS_BEGIN, (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT);
46 if (__supported_pte_mask & _PAGE_NX)
47		printk(KERN_INFO "PCI: PCI BIOS area is rw and x. Use pci=nobios if you want it NX.\n");
48}
49
28/* 50/*
29 * This is the standard structure used to identify the entry point 51 * This is the standard structure used to identify the entry point
30 * to the BIOS32 Service Directory, as documented in 52 * to the BIOS32 Service Directory, as documented in
@@ -332,6 +354,7 @@ static struct pci_raw_ops * __devinit pci_find_bios(void)
332 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n", 354 DBG("PCI: BIOS32 Service Directory entry at 0x%lx\n",
333 bios32_entry); 355 bios32_entry);
334 bios32_indirect.address = bios32_entry + PAGE_OFFSET; 356 bios32_indirect.address = bios32_entry + PAGE_OFFSET;
357 set_bios_x();
335 if (check_pcibios()) 358 if (check_pcibios())
336 return &pci_bios_access; 359 return &pci_bios_access;
337 } 360 }
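A hedged arithmetic note on set_bios_x() above, assuming the conventional BIOS_BEGIN = 0xa0000 and BIOS_END = 0x100000 from <asm/e820.h>.

/*
 *   (BIOS_END - BIOS_BEGIN) >> PAGE_SHIFT
 *     = (0x100000 - 0xa0000) >> 12
 *     = 0x60000 >> 12
 *     = 96 pages marked executable, starting at PAGE_OFFSET + 0xa0000.
 */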
diff --git a/arch/x86/pci/xen.c b/arch/x86/pci/xen.c
new file mode 100644
index 000000000000..f567965c0620
--- /dev/null
+++ b/arch/x86/pci/xen.c
@@ -0,0 +1,571 @@
1/*
2 * Xen PCI Frontend Stub - puts some "dummy" functions in to the Linux
3 * x86 PCI core to support the Xen PCI Frontend
4 *
5 * Author: Ryan Wilson <hap9@epoch.ncsc.mil>
6 */
7#include <linux/module.h>
8#include <linux/init.h>
9#include <linux/pci.h>
10#include <linux/acpi.h>
11
12#include <linux/io.h>
13#include <asm/io_apic.h>
14#include <asm/pci_x86.h>
15
16#include <asm/xen/hypervisor.h>
17
18#include <xen/features.h>
19#include <xen/events.h>
20#include <asm/xen/pci.h>
21
22#ifdef CONFIG_ACPI
23static int acpi_register_gsi_xen_hvm(struct device *dev, u32 gsi,
24 int trigger, int polarity)
25{
26 int rc, irq;
27 struct physdev_map_pirq map_irq;
28 int shareable = 0;
29 char *name;
30
31 if (!xen_hvm_domain())
32 return -1;
33
34 map_irq.domid = DOMID_SELF;
35 map_irq.type = MAP_PIRQ_TYPE_GSI;
36 map_irq.index = gsi;
37 map_irq.pirq = -1;
38
39 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
40 if (rc) {
41 printk(KERN_WARNING "xen map irq failed %d\n", rc);
42 return -1;
43 }
44
45 if (trigger == ACPI_EDGE_SENSITIVE) {
46 shareable = 0;
47 name = "ioapic-edge";
48 } else {
49 shareable = 1;
50 name = "ioapic-level";
51 }
52
53 irq = xen_bind_pirq_gsi_to_irq(gsi, map_irq.pirq, shareable, name);
54
55 printk(KERN_DEBUG "xen: --> irq=%d, pirq=%d\n", irq, map_irq.pirq);
56
57 return irq;
58}
59#endif
60
61#if defined(CONFIG_PCI_MSI)
62#include <linux/msi.h>
63#include <asm/msidef.h>
64
65struct xen_pci_frontend_ops *xen_pci_frontend;
66EXPORT_SYMBOL_GPL(xen_pci_frontend);
67
68#define XEN_PIRQ_MSI_DATA (MSI_DATA_TRIGGER_EDGE | \
69 MSI_DATA_LEVEL_ASSERT | (3 << 8) | MSI_DATA_VECTOR(0))
70
71static void xen_msi_compose_msg(struct pci_dev *pdev, unsigned int pirq,
72 struct msi_msg *msg)
73{
74 /* We set vector == 0 to tell the hypervisor we don't care about it,
75	 * but we want a pirq set up instead.
76 * We use the dest_id field to pass the pirq that we want. */
77 msg->address_hi = MSI_ADDR_BASE_HI | MSI_ADDR_EXT_DEST_ID(pirq);
78 msg->address_lo =
79 MSI_ADDR_BASE_LO |
80 MSI_ADDR_DEST_MODE_PHYSICAL |
81 MSI_ADDR_REDIRECTION_CPU |
82 MSI_ADDR_DEST_ID(pirq);
83
84 msg->data = XEN_PIRQ_MSI_DATA;
85}
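/*
 * Illustrative note, not in the original source: the pirq travels through the
 * MSI address in two pieces - its low 8 bits in the dest_id field of
 * address_lo and the remaining bits via MSI_ADDR_EXT_DEST_ID() in address_hi.
 * xen_hvm_setup_msi_irqs() below reassembles it with the inverse expression
 * before deciding whether a pirq is already bound.
 */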
86
87static int xen_hvm_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
88{
89 int irq, pirq;
90 struct msi_desc *msidesc;
91 struct msi_msg msg;
92
93 list_for_each_entry(msidesc, &dev->msi_list, list) {
94 __read_msi_msg(msidesc, &msg);
95 pirq = MSI_ADDR_EXT_DEST_ID(msg.address_hi) |
96 ((msg.address_lo >> MSI_ADDR_DEST_ID_SHIFT) & 0xff);
97 if (msg.data != XEN_PIRQ_MSI_DATA ||
98 xen_irq_from_pirq(pirq) < 0) {
99 pirq = xen_allocate_pirq_msi(dev, msidesc);
100 if (pirq < 0)
101 goto error;
102 xen_msi_compose_msg(dev, pirq, &msg);
103 __write_msi_msg(msidesc, &msg);
104 dev_dbg(&dev->dev, "xen: msi bound to pirq=%d\n", pirq);
105 } else {
106 dev_dbg(&dev->dev,
107 "xen: msi already bound to pirq=%d\n", pirq);
108 }
109 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, pirq, 0,
110 (type == PCI_CAP_ID_MSIX) ?
111 "msi-x" : "msi",
112 DOMID_SELF);
113 if (irq < 0)
114 goto error;
115 dev_dbg(&dev->dev,
116 "xen: msi --> pirq=%d --> irq=%d\n", pirq, irq);
117 }
118 return 0;
119
120error:
121 dev_err(&dev->dev,
122 "Xen PCI frontend has not registered MSI/MSI-X support!\n");
123 return -ENODEV;
124}
125
126/*
127 * For MSI interrupts we have to use the drivers/xen/events.c functions to
128 * allocate an irq_desc and set it up correctly. */
129
130
131static int xen_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
132{
133 int irq, ret, i;
134 struct msi_desc *msidesc;
135 int *v;
136
137 v = kzalloc(sizeof(int) * max(1, nvec), GFP_KERNEL);
138 if (!v)
139 return -ENOMEM;
140
141 if (type == PCI_CAP_ID_MSIX)
142 ret = xen_pci_frontend_enable_msix(dev, v, nvec);
143 else
144 ret = xen_pci_frontend_enable_msi(dev, v);
145 if (ret)
146 goto error;
147 i = 0;
148 list_for_each_entry(msidesc, &dev->msi_list, list) {
149 irq = xen_bind_pirq_msi_to_irq(dev, msidesc, v[i], 0,
150 (type == PCI_CAP_ID_MSIX) ?
151 "pcifront-msi-x" :
152 "pcifront-msi",
153 DOMID_SELF);
154 if (irq < 0)
155 goto free;
156 i++;
157 }
158 kfree(v);
159 return 0;
160
161error:
162 dev_err(&dev->dev, "Xen PCI frontend has not registered MSI/MSI-X support!\n");
163free:
164 kfree(v);
165 return ret;
166}
167
168static void xen_teardown_msi_irqs(struct pci_dev *dev)
169{
170 struct msi_desc *msidesc;
171
172 msidesc = list_entry(dev->msi_list.next, struct msi_desc, list);
173 if (msidesc->msi_attrib.is_msix)
174 xen_pci_frontend_disable_msix(dev);
175 else
176 xen_pci_frontend_disable_msi(dev);
177
178 /* Free the IRQ's and the msidesc using the generic code. */
179 default_teardown_msi_irqs(dev);
180}
181
182static void xen_teardown_msi_irq(unsigned int irq)
183{
184 xen_destroy_irq(irq);
185}
186
187#ifdef CONFIG_XEN_DOM0
188static int xen_initdom_setup_msi_irqs(struct pci_dev *dev, int nvec, int type)
189{
190 int ret = 0;
191 struct msi_desc *msidesc;
192
193 list_for_each_entry(msidesc, &dev->msi_list, list) {
194 struct physdev_map_pirq map_irq;
195 domid_t domid;
196
197 domid = ret = xen_find_device_domain_owner(dev);
198 /* N.B. Casting int's -ENODEV to uint16_t results in 0xFFED,
199 * hence check ret value for < 0. */
200 if (ret < 0)
201 domid = DOMID_SELF;
202
203 memset(&map_irq, 0, sizeof(map_irq));
204 map_irq.domid = domid;
205 map_irq.type = MAP_PIRQ_TYPE_MSI;
206 map_irq.index = -1;
207 map_irq.pirq = -1;
208 map_irq.bus = dev->bus->number;
209 map_irq.devfn = dev->devfn;
210
211 if (type == PCI_CAP_ID_MSIX) {
212 int pos;
213 u32 table_offset, bir;
214
215 pos = pci_find_capability(dev, PCI_CAP_ID_MSIX);
216
217 pci_read_config_dword(dev, pos + PCI_MSIX_TABLE,
218 &table_offset);
219 bir = (u8)(table_offset & PCI_MSIX_FLAGS_BIRMASK);
220
221 map_irq.table_base = pci_resource_start(dev, bir);
222 map_irq.entry_nr = msidesc->msi_attrib.entry_nr;
223 }
224
225 ret = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
226 if (ret) {
227 dev_warn(&dev->dev, "xen map irq failed %d for %d domain\n",
228 ret, domid);
229 goto out;
230 }
231
232 ret = xen_bind_pirq_msi_to_irq(dev, msidesc,
233 map_irq.pirq, map_irq.index,
234 (type == PCI_CAP_ID_MSIX) ?
235 "msi-x" : "msi",
236 domid);
237 if (ret < 0)
238 goto out;
239 }
240 ret = 0;
241out:
242 return ret;
243}
244#endif
245#endif
246
247static int xen_pcifront_enable_irq(struct pci_dev *dev)
248{
249 int rc;
250 int share = 1;
251 int pirq;
252 u8 gsi;
253
254 rc = pci_read_config_byte(dev, PCI_INTERRUPT_LINE, &gsi);
255 if (rc < 0) {
256 dev_warn(&dev->dev, "Xen PCI: failed to read interrupt line: %d\n",
257 rc);
258 return rc;
259 }
260
261 rc = xen_allocate_pirq_gsi(gsi);
262 if (rc < 0) {
263 dev_warn(&dev->dev, "Xen PCI: failed to allocate a PIRQ for GSI%d: %d\n",
264 gsi, rc);
265 return rc;
266 }
267 pirq = rc;
268
269 if (gsi < NR_IRQS_LEGACY)
270 share = 0;
271
272 rc = xen_bind_pirq_gsi_to_irq(gsi, pirq, share, "pcifront");
273 if (rc < 0) {
274 dev_warn(&dev->dev, "Xen PCI: failed to bind GSI%d (PIRQ%d) to IRQ: %d\n",
275 gsi, pirq, rc);
276 return rc;
277 }
278
279 dev->irq = rc;
280 dev_info(&dev->dev, "Xen PCI mapped GSI%d to IRQ%d\n", gsi, dev->irq);
281 return 0;
282}
283
284int __init pci_xen_init(void)
285{
286 if (!xen_pv_domain() || xen_initial_domain())
287 return -ENODEV;
288
289 printk(KERN_INFO "PCI: setting up Xen PCI frontend stub\n");
290
291 pcibios_set_cache_line_size();
292
293 pcibios_enable_irq = xen_pcifront_enable_irq;
294 pcibios_disable_irq = NULL;
295
296#ifdef CONFIG_ACPI
297 /* Keep ACPI out of the picture */
298 acpi_noirq = 1;
299#endif
300
301#ifdef CONFIG_PCI_MSI
302 x86_msi.setup_msi_irqs = xen_setup_msi_irqs;
303 x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
304 x86_msi.teardown_msi_irqs = xen_teardown_msi_irqs;
305#endif
306 return 0;
307}
308
309int __init pci_xen_hvm_init(void)
310{
311 if (!xen_feature(XENFEAT_hvm_pirqs))
312 return 0;
313
314#ifdef CONFIG_ACPI
315 /*
316 * We don't want to change the actual ACPI delivery model,
317 * just how GSIs get registered.
318 */
319 __acpi_register_gsi = acpi_register_gsi_xen_hvm;
320#endif
321
322#ifdef CONFIG_PCI_MSI
323 x86_msi.setup_msi_irqs = xen_hvm_setup_msi_irqs;
324 x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
325#endif
326 return 0;
327}
328
329#ifdef CONFIG_XEN_DOM0
330static int xen_register_pirq(u32 gsi, int gsi_override, int triggering)
331{
332 int rc, pirq, irq = -1;
333 struct physdev_map_pirq map_irq;
334 int shareable = 0;
335 char *name;
336
337 if (!xen_pv_domain())
338 return -1;
339
340 if (triggering == ACPI_EDGE_SENSITIVE) {
341 shareable = 0;
342 name = "ioapic-edge";
343 } else {
344 shareable = 1;
345 name = "ioapic-level";
346 }
347 pirq = xen_allocate_pirq_gsi(gsi);
348 if (pirq < 0)
349 goto out;
350
351 if (gsi_override >= 0)
352 irq = xen_bind_pirq_gsi_to_irq(gsi_override, pirq, shareable, name);
353 else
354 irq = xen_bind_pirq_gsi_to_irq(gsi, pirq, shareable, name);
355 if (irq < 0)
356 goto out;
357
358 printk(KERN_DEBUG "xen: --> pirq=%d -> irq=%d (gsi=%d)\n", pirq, irq, gsi);
359
360 map_irq.domid = DOMID_SELF;
361 map_irq.type = MAP_PIRQ_TYPE_GSI;
362 map_irq.index = gsi;
363 map_irq.pirq = pirq;
364
365 rc = HYPERVISOR_physdev_op(PHYSDEVOP_map_pirq, &map_irq);
366 if (rc) {
367 printk(KERN_WARNING "xen map irq failed %d\n", rc);
368 return -1;
369 }
370
371out:
372 return irq;
373}
374
375static int xen_register_gsi(u32 gsi, int gsi_override, int triggering, int polarity)
376{
377 int rc, irq;
378 struct physdev_setup_gsi setup_gsi;
379
380 if (!xen_pv_domain())
381 return -1;
382
383 printk(KERN_DEBUG "xen: registering gsi %u triggering %d polarity %d\n",
384 gsi, triggering, polarity);
385
386 irq = xen_register_pirq(gsi, gsi_override, triggering);
387
388 setup_gsi.gsi = gsi;
389 setup_gsi.triggering = (triggering == ACPI_EDGE_SENSITIVE ? 0 : 1);
390 setup_gsi.polarity = (polarity == ACPI_ACTIVE_HIGH ? 0 : 1);
391
392 rc = HYPERVISOR_physdev_op(PHYSDEVOP_setup_gsi, &setup_gsi);
393 if (rc == -EEXIST)
394		printk(KERN_INFO "Already set up GSI %d\n", gsi);
395 else if (rc) {
396		printk(KERN_ERR "Failed to set up GSI %d, err_code: %d\n",
397 gsi, rc);
398 }
399
400 return irq;
401}
402
403static __init void xen_setup_acpi_sci(void)
404{
405 int rc;
406 int trigger, polarity;
407 int gsi = acpi_sci_override_gsi;
408 int irq = -1;
409 int gsi_override = -1;
410
411 if (!gsi)
412 return;
413
414 rc = acpi_get_override_irq(gsi, &trigger, &polarity);
415 if (rc) {
416 printk(KERN_WARNING "xen: acpi_get_override_irq failed for acpi"
417 " sci, rc=%d\n", rc);
418 return;
419 }
420 trigger = trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE;
421 polarity = polarity ? ACPI_ACTIVE_LOW : ACPI_ACTIVE_HIGH;
422
423 printk(KERN_INFO "xen: sci override: global_irq=%d trigger=%d "
424 "polarity=%d\n", gsi, trigger, polarity);
425
426 /* Before we bind the GSI to a Linux IRQ, check whether
427	 * we need to override it with the bus_irq (IRQ) value. Usually for
428	 * IRQs below NR_IRQS_LEGACY this holds IRQ == GSI, like so:
429	 * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 9 low level)
430	 * but there are oddballs where the IRQ != GSI:
431	 * ACPI: INT_SRC_OVR (bus 0 bus_irq 9 global_irq 20 low level)
432	 * which ends up being: gsi_to_irq[9] == 20
433	 * (which is what acpi_gsi_to_irq reports when the ACPI interpreter
434	 * starts up, and it keels over since IRQ 9 has not been set up
435	 * because we had set up IRQ 20 for it instead).
436 */
437 /* Check whether the GSI != IRQ */
438 if (acpi_gsi_to_irq(gsi, &irq) == 0) {
439 if (irq >= 0 && irq != gsi)
440 /* Bugger, we MUST have that IRQ. */
441 gsi_override = irq;
442 }
443
444 gsi = xen_register_gsi(gsi, gsi_override, trigger, polarity);
445 printk(KERN_INFO "xen: acpi sci %d\n", gsi);
446
447 return;
448}
449
450static int acpi_register_gsi_xen(struct device *dev, u32 gsi,
451 int trigger, int polarity)
452{
453 return xen_register_gsi(gsi, -1 /* no GSI override */, trigger, polarity);
454}
455
456static int __init pci_xen_initial_domain(void)
457{
458#ifdef CONFIG_PCI_MSI
459 x86_msi.setup_msi_irqs = xen_initdom_setup_msi_irqs;
460 x86_msi.teardown_msi_irq = xen_teardown_msi_irq;
461#endif
462 xen_setup_acpi_sci();
463 __acpi_register_gsi = acpi_register_gsi_xen;
464
465 return 0;
466}
467
468void __init xen_setup_pirqs(void)
469{
470 int pirq, irq;
471
472 pci_xen_initial_domain();
473
474 if (0 == nr_ioapics) {
475 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
476 pirq = xen_allocate_pirq_gsi(irq);
477 if (WARN(pirq < 0,
478 "Could not allocate PIRQ for legacy interrupt\n"))
479 break;
480 irq = xen_bind_pirq_gsi_to_irq(irq, pirq, 0, "xt-pic");
481 }
482 return;
483 }
484
485 /* Pre-allocate legacy irqs */
486 for (irq = 0; irq < NR_IRQS_LEGACY; irq++) {
487 int trigger, polarity;
488
489 if (acpi_get_override_irq(irq, &trigger, &polarity) == -1)
490 continue;
491
492 xen_register_pirq(irq, -1 /* no GSI override */,
493 trigger ? ACPI_LEVEL_SENSITIVE : ACPI_EDGE_SENSITIVE);
494 }
495}
496#endif
497
498#ifdef CONFIG_XEN_DOM0
499struct xen_device_domain_owner {
500 domid_t domain;
501 struct pci_dev *dev;
502 struct list_head list;
503};
504
505static DEFINE_SPINLOCK(dev_domain_list_spinlock);
506static struct list_head dev_domain_list = LIST_HEAD_INIT(dev_domain_list);
507
508static struct xen_device_domain_owner *find_device(struct pci_dev *dev)
509{
510 struct xen_device_domain_owner *owner;
511
512 list_for_each_entry(owner, &dev_domain_list, list) {
513 if (owner->dev == dev)
514 return owner;
515 }
516 return NULL;
517}
518
519int xen_find_device_domain_owner(struct pci_dev *dev)
520{
521 struct xen_device_domain_owner *owner;
522 int domain = -ENODEV;
523
524 spin_lock(&dev_domain_list_spinlock);
525 owner = find_device(dev);
526 if (owner)
527 domain = owner->domain;
528 spin_unlock(&dev_domain_list_spinlock);
529 return domain;
530}
531EXPORT_SYMBOL_GPL(xen_find_device_domain_owner);
532
533int xen_register_device_domain_owner(struct pci_dev *dev, uint16_t domain)
534{
535 struct xen_device_domain_owner *owner;
536
537 owner = kzalloc(sizeof(struct xen_device_domain_owner), GFP_KERNEL);
538 if (!owner)
539 return -ENODEV;
540
541 spin_lock(&dev_domain_list_spinlock);
542 if (find_device(dev)) {
543 spin_unlock(&dev_domain_list_spinlock);
544 kfree(owner);
545 return -EEXIST;
546 }
547 owner->domain = domain;
548 owner->dev = dev;
549 list_add_tail(&owner->list, &dev_domain_list);
550 spin_unlock(&dev_domain_list_spinlock);
551 return 0;
552}
553EXPORT_SYMBOL_GPL(xen_register_device_domain_owner);
554
555int xen_unregister_device_domain_owner(struct pci_dev *dev)
556{
557 struct xen_device_domain_owner *owner;
558
559 spin_lock(&dev_domain_list_spinlock);
560 owner = find_device(dev);
561 if (!owner) {
562 spin_unlock(&dev_domain_list_spinlock);
563 return -ENODEV;
564 }
565 list_del(&owner->list);
566 spin_unlock(&dev_domain_list_spinlock);
567 kfree(owner);
568 return 0;
569}
570EXPORT_SYMBOL_GPL(xen_unregister_device_domain_owner);
571#endif
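A hedged usage sketch, not part of the file above, of the dom0 ownership helpers exported at the end of xen.c; the demo function name is made up and the prototypes are assumed to come from <asm/xen/pci.h>.

#include <linux/kernel.h>
#include <linux/pci.h>
#include <asm/xen/pci.h>	/* assumed home of the prototypes */

static int demo_track_device_owner(struct pci_dev *dev, uint16_t domid)
{
	int err = xen_register_device_domain_owner(dev, domid);

	if (err)		/* -EEXIST if already tracked, -ENODEV on OOM */
		return err;

	/* Lookups now return domid; after unregistering they return -ENODEV. */
	WARN_ON(xen_find_device_domain_owner(dev) != domid);

	return xen_unregister_device_domain_owner(dev);
}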
diff --git a/arch/x86/platform/Makefile b/arch/x86/platform/Makefile
new file mode 100644
index 000000000000..021eee91c056
--- /dev/null
+++ b/arch/x86/platform/Makefile
@@ -0,0 +1,10 @@
1# Platform specific code goes here
2obj-y += ce4100/
3obj-y += efi/
4obj-y += iris/
5obj-y += mrst/
6obj-y += olpc/
7obj-y += scx200/
8obj-y += sfi/
9obj-y += visws/
10obj-y += uv/
diff --git a/arch/x86/platform/ce4100/Makefile b/arch/x86/platform/ce4100/Makefile
new file mode 100644
index 000000000000..91fc92971d94
--- /dev/null
+++ b/arch/x86/platform/ce4100/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_INTEL_CE) += ce4100.o
diff --git a/arch/x86/platform/ce4100/ce4100.c b/arch/x86/platform/ce4100/ce4100.c
new file mode 100644
index 000000000000..28071bb31db7
--- /dev/null
+++ b/arch/x86/platform/ce4100/ce4100.c
@@ -0,0 +1,146 @@
1/*
2 * Intel CE4100 platform specific setup code
3 *
4 * (C) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#include <linux/init.h>
12#include <linux/kernel.h>
13#include <linux/irq.h>
14#include <linux/module.h>
15#include <linux/serial_reg.h>
16#include <linux/serial_8250.h>
17
18#include <asm/ce4100.h>
19#include <asm/prom.h>
20#include <asm/setup.h>
21#include <asm/i8259.h>
22#include <asm/io.h>
23#include <asm/io_apic.h>
24
25static int ce4100_i8042_detect(void)
26{
27 return 0;
28}
29
30#ifdef CONFIG_SERIAL_8250
31
32static unsigned int mem_serial_in(struct uart_port *p, int offset)
33{
34 offset = offset << p->regshift;
35 return readl(p->membase + offset);
36}
37
38/*
39 * The UART Tx interrupts are not set under some conditions and therefore
40 * serial transmission hangs. This is a silicon issue that has not been root
41 * caused. The workaround checks the UART_LSR_THRE and UART_LSR_TEMT bits of
42 * the LSR register in the interrupt handler; if at least one of these two
43 * bits is set, the transmit request is processed. Without this workaround
44 * serial transmission may hang. This workaround is for errata number 9 in
45 * the B-step errata.
46 */
47
48static unsigned int ce4100_mem_serial_in(struct uart_port *p, int offset)
49{
50 unsigned int ret, ier, lsr;
51
52 if (offset == UART_IIR) {
53 offset = offset << p->regshift;
54 ret = readl(p->membase + offset);
55 if (ret & UART_IIR_NO_INT) {
56			/* see if the TX interrupt should really have been set */
57 ier = mem_serial_in(p, UART_IER);
58 /* see if the UART's XMIT interrupt is enabled */
59 if (ier & UART_IER_THRI) {
60 lsr = mem_serial_in(p, UART_LSR);
61 /* now check to see if the UART should be
62 generating an interrupt (but isn't) */
63 if (lsr & (UART_LSR_THRE | UART_LSR_TEMT))
64 ret &= ~UART_IIR_NO_INT;
65 }
66 }
67 } else
68 ret = mem_serial_in(p, offset);
69 return ret;
70}
71
72static void ce4100_mem_serial_out(struct uart_port *p, int offset, int value)
73{
74 offset = offset << p->regshift;
75 writel(value, p->membase + offset);
76}
77
78static void ce4100_serial_fixup(int port, struct uart_port *up,
79 unsigned short *capabilites)
80{
81#ifdef CONFIG_EARLY_PRINTK
82 /*
83	 * Override the legacy port configuration that comes from
84	 * asm/serial.h. Using the ioport driver and then switching to the
85	 * PCI memory-mapped driver hangs the IOAPIC.
86 */
87 if (up->iotype != UPIO_MEM32) {
88 up->uartclk = 14745600;
89 up->mapbase = 0xdffe0200;
90 set_fixmap_nocache(FIX_EARLYCON_MEM_BASE,
91 up->mapbase & PAGE_MASK);
92 up->membase =
93 (void __iomem *)__fix_to_virt(FIX_EARLYCON_MEM_BASE);
94 up->membase += up->mapbase & ~PAGE_MASK;
95 up->iotype = UPIO_MEM32;
96 up->regshift = 2;
97 }
98#endif
99 up->iobase = 0;
100 up->serial_in = ce4100_mem_serial_in;
101 up->serial_out = ce4100_mem_serial_out;
102
103 *capabilites |= (1 << 12);
104}
105
106static __init void sdv_serial_fixup(void)
107{
108 serial8250_set_isa_configurator(ce4100_serial_fixup);
109}
110
111#else
112static inline void sdv_serial_fixup(void) { }
113#endif
114
115static void __init sdv_arch_setup(void)
116{
117 sdv_serial_fixup();
118}
119
120#ifdef CONFIG_X86_IO_APIC
121static void __cpuinit sdv_pci_init(void)
122{
123 x86_of_pci_init();
124 /* We can't set this earlier, because we need to calibrate the timer */
125 legacy_pic = &null_legacy_pic;
126}
127#endif
128
129/*
130 * CE4100 specific x86_init function overrides and early setup
131 * calls.
132 */
133void __init x86_ce4100_early_setup(void)
134{
135 x86_init.oem.arch_setup = sdv_arch_setup;
136 x86_platform.i8042_detect = ce4100_i8042_detect;
137 x86_init.resources.probe_roms = x86_init_noop;
138 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
139 x86_init.mpparse.find_smp_config = x86_init_noop;
140 x86_init.pci.init = ce4100_pci_init;
141
142#ifdef CONFIG_X86_IO_APIC
143 x86_init.pci.init_irq = sdv_pci_init;
144 x86_init.mpparse.setup_ioapic_ids = setup_ioapic_ids_from_mpc_nocheck;
145#endif
146}
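A hedged arithmetic note on the UART accessors above: with regshift = 2 the byte-spaced 8250 registers each occupy one 32-bit word, which is why mem_serial_in()/ce4100_mem_serial_out() use readl()/writel().

/*
 *   UART_RX/UART_TX   (0) << 2  ->  membase + 0x00
 *   UART_IER          (1) << 2  ->  membase + 0x04
 *   UART_IIR/UART_FCR (2) << 2  ->  membase + 0x08
 *   UART_LSR          (5) << 2  ->  membase + 0x14
 */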
diff --git a/arch/x86/platform/ce4100/falconfalls.dts b/arch/x86/platform/ce4100/falconfalls.dts
new file mode 100644
index 000000000000..e70be38ce039
--- /dev/null
+++ b/arch/x86/platform/ce4100/falconfalls.dts
@@ -0,0 +1,430 @@
1/*
2 * CE4100 on Falcon Falls
3 *
4 * (c) Copyright 2010 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or modify it
7 * under the terms of the GNU General Public License as published by the
8 * Free Software Foundation; version 2 of the License.
9 */
10/dts-v1/;
11/ {
12 model = "intel,falconfalls";
13 compatible = "intel,falconfalls";
14 #address-cells = <1>;
15 #size-cells = <1>;
16
17 cpus {
18 #address-cells = <1>;
19 #size-cells = <0>;
20
21 cpu@0 {
22 device_type = "cpu";
23 compatible = "intel,ce4100";
24 reg = <0>;
25 lapic = <&lapic0>;
26 };
27 };
28
29 soc@0 {
30 #address-cells = <1>;
31 #size-cells = <1>;
32 compatible = "intel,ce4100-cp";
33 ranges;
34
35 ioapic1: interrupt-controller@fec00000 {
36 #interrupt-cells = <2>;
37 compatible = "intel,ce4100-ioapic";
38 interrupt-controller;
39 reg = <0xfec00000 0x1000>;
40 };
41
42 timer@fed00000 {
43 compatible = "intel,ce4100-hpet";
44 reg = <0xfed00000 0x200>;
45 };
46
47 lapic0: interrupt-controller@fee00000 {
48 compatible = "intel,ce4100-lapic";
49 reg = <0xfee00000 0x1000>;
50 };
51
52 pci@3fc {
53 #address-cells = <3>;
54 #size-cells = <2>;
55 compatible = "intel,ce4100-pci", "pci";
56 device_type = "pci";
57 bus-range = <0 0>;
58 ranges = <0x2000000 0 0xbffff000 0xbffff000 0 0x1000
59 0x2000000 0 0xdffe0000 0xdffe0000 0 0x1000
60 0x0000000 0 0x0 0x0 0 0x100>;
61
62 /* Secondary IO-APIC */
63 ioapic2: interrupt-controller@0,1 {
64 #interrupt-cells = <2>;
65 compatible = "intel,ce4100-ioapic";
66 interrupt-controller;
67 reg = <0x100 0x0 0x0 0x0 0x0>;
68 assigned-addresses = <0x02000000 0x0 0xbffff000 0x0 0x1000>;
69 };
70
71 pci@1,0 {
72 #address-cells = <3>;
73 #size-cells = <2>;
74 compatible = "intel,ce4100-pci", "pci";
75 device_type = "pci";
76 bus-range = <1 1>;
77 reg = <0x0800 0x0 0x0 0x0 0x0>;
78 ranges = <0x2000000 0 0xdffe0000 0x2000000 0 0xdffe0000 0 0x1000>;
79
80 interrupt-parent = <&ioapic2>;
81
82 display@2,0 {
83 compatible = "pci8086,2e5b.2",
84 "pci8086,2e5b",
85 "pciclass038000",
86 "pciclass0380";
87
88 reg = <0x11000 0x0 0x0 0x0 0x0>;
89 interrupts = <0 1>;
90 };
91
92 multimedia@3,0 {
93 compatible = "pci8086,2e5c.2",
94 "pci8086,2e5c",
95 "pciclass048000",
96 "pciclass0480";
97
98 reg = <0x11800 0x0 0x0 0x0 0x0>;
99 interrupts = <2 1>;
100 };
101
102 multimedia@4,0 {
103 compatible = "pci8086,2e5d.2",
104 "pci8086,2e5d",
105 "pciclass048000",
106 "pciclass0480";
107
108 reg = <0x12000 0x0 0x0 0x0 0x0>;
109 interrupts = <4 1>;
110 };
111
112 multimedia@4,1 {
113 compatible = "pci8086,2e5e.2",
114 "pci8086,2e5e",
115 "pciclass048000",
116 "pciclass0480";
117
118 reg = <0x12100 0x0 0x0 0x0 0x0>;
119 interrupts = <5 1>;
120 };
121
122 sound@6,0 {
123 compatible = "pci8086,2e5f.2",
124 "pci8086,2e5f",
125 "pciclass040100",
126 "pciclass0401";
127
128 reg = <0x13000 0x0 0x0 0x0 0x0>;
129 interrupts = <6 1>;
130 };
131
132 sound@6,1 {
133 compatible = "pci8086,2e5f.2",
134 "pci8086,2e5f",
135 "pciclass040100",
136 "pciclass0401";
137
138 reg = <0x13100 0x0 0x0 0x0 0x0>;
139 interrupts = <7 1>;
140 };
141
142 sound@6,2 {
143 compatible = "pci8086,2e60.2",
144 "pci8086,2e60",
145 "pciclass040100",
146 "pciclass0401";
147
148 reg = <0x13200 0x0 0x0 0x0 0x0>;
149 interrupts = <8 1>;
150 };
151
152 display@8,0 {
153 compatible = "pci8086,2e61.2",
154 "pci8086,2e61",
155 "pciclass038000",
156 "pciclass0380";
157
158 reg = <0x14000 0x0 0x0 0x0 0x0>;
159 interrupts = <9 1>;
160 };
161
162 display@8,1 {
163 compatible = "pci8086,2e62.2",
164 "pci8086,2e62",
165 "pciclass038000",
166 "pciclass0380";
167
168 reg = <0x14100 0x0 0x0 0x0 0x0>;
169 interrupts = <10 1>;
170 };
171
172 multimedia@8,2 {
173 compatible = "pci8086,2e63.2",
174 "pci8086,2e63",
175 "pciclass048000",
176 "pciclass0480";
177
178 reg = <0x14200 0x0 0x0 0x0 0x0>;
179 interrupts = <11 1>;
180 };
181
182 entertainment-encryption@9,0 {
183 compatible = "pci8086,2e64.2",
184 "pci8086,2e64",
185 "pciclass101000",
186 "pciclass1010";
187
188 reg = <0x14800 0x0 0x0 0x0 0x0>;
189 interrupts = <12 1>;
190 };
191
192 localbus@a,0 {
193 compatible = "pci8086,2e65.2",
194 "pci8086,2e65",
195 "pciclassff0000",
196 "pciclassff00";
197
198 reg = <0x15000 0x0 0x0 0x0 0x0>;
199 };
200
201 serial@b,0 {
202 compatible = "pci8086,2e66.2",
203 "pci8086,2e66",
204 "pciclass070003",
205 "pciclass0700";
206
207 reg = <0x15800 0x0 0x0 0x0 0x0>;
208 interrupts = <14 1>;
209 };
210
211 gpio@b,1 {
212 compatible = "pci8086,2e67.2",
213 "pci8086,2e67",
214 "pciclassff0000",
215 "pciclassff00";
216
217 #gpio-cells = <2>;
218 reg = <0x15900 0x0 0x0 0x0 0x0>;
219 interrupts = <15 1>;
220 gpio-controller;
221 };
222
223 i2c-controller@b,2 {
224 #address-cells = <2>;
225 #size-cells = <1>;
226 compatible = "pci8086,2e68.2",
227 "pci8086,2e68",
228 "pciclass,ff0000",
229 "pciclass,ff00";
230
231 reg = <0x15a00 0x0 0x0 0x0 0x0>;
232 interrupts = <16 1>;
233 ranges = <0 0 0x02000000 0 0xdffe0500 0x100
234 1 0 0x02000000 0 0xdffe0600 0x100
235 2 0 0x02000000 0 0xdffe0700 0x100>;
236
237 i2c@0 {
238 #address-cells = <1>;
239 #size-cells = <0>;
240 compatible = "intel,ce4100-i2c-controller";
241 reg = <0 0 0x100>;
242 };
243
244 i2c@1 {
245 #address-cells = <1>;
246 #size-cells = <0>;
247 compatible = "intel,ce4100-i2c-controller";
248 reg = <1 0 0x100>;
249
250 gpio@26 {
251 #gpio-cells = <2>;
252 compatible = "ti,pcf8575";
253 reg = <0x26>;
254 gpio-controller;
255 };
256 };
257
258 i2c@2 {
259 #address-cells = <1>;
260 #size-cells = <0>;
261 compatible = "intel,ce4100-i2c-controller";
262 reg = <2 0 0x100>;
263
264 gpio@26 {
265 #gpio-cells = <2>;
266 compatible = "ti,pcf8575";
267 reg = <0x26>;
268 gpio-controller;
269 };
270 };
271 };
272
273 smard-card@b,3 {
274 compatible = "pci8086,2e69.2",
275 "pci8086,2e69",
276 "pciclass070500",
277 "pciclass0705";
278
279 reg = <0x15b00 0x0 0x0 0x0 0x0>;
280 interrupts = <15 1>;
281 };
282
283 spi-controller@b,4 {
284 #address-cells = <1>;
285 #size-cells = <0>;
286 compatible =
287 "pci8086,2e6a.2",
288 "pci8086,2e6a",
289 "pciclass,ff0000",
290 "pciclass,ff00";
291
292 reg = <0x15c00 0x0 0x0 0x0 0x0>;
293 interrupts = <15 1>;
294
295 dac@0 {
296 compatible = "ti,pcm1755";
297 reg = <0>;
298 spi-max-frequency = <115200>;
299 };
300
301 dac@1 {
302 compatible = "ti,pcm1609a";
303 reg = <1>;
304 spi-max-frequency = <115200>;
305 };
306
307 eeprom@2 {
308 compatible = "atmel,at93c46";
309 reg = <2>;
310 spi-max-frequency = <115200>;
311 };
312 };
313
314 multimedia@b,7 {
315 compatible = "pci8086,2e6d.2",
316 "pci8086,2e6d",
317 "pciclassff0000",
318 "pciclassff00";
319
320 reg = <0x15f00 0x0 0x0 0x0 0x0>;
321 };
322
323 ethernet@c,0 {
324 compatible = "pci8086,2e6e.2",
325 "pci8086,2e6e",
326 "pciclass020000",
327 "pciclass0200";
328
329 reg = <0x16000 0x0 0x0 0x0 0x0>;
330 interrupts = <21 1>;
331 };
332
333 clock@c,1 {
334 compatible = "pci8086,2e6f.2",
335 "pci8086,2e6f",
336 "pciclassff0000",
337 "pciclassff00";
338
339 reg = <0x16100 0x0 0x0 0x0 0x0>;
340 interrupts = <3 1>;
341 };
342
343 usb@d,0 {
344 compatible = "pci8086,2e70.2",
345 "pci8086,2e70",
346 "pciclass0c0320",
347 "pciclass0c03";
348
349 reg = <0x16800 0x0 0x0 0x0 0x0>;
350 interrupts = <22 1>;
351 };
352
353 usb@d,1 {
354 compatible = "pci8086,2e70.2",
355 "pci8086,2e70",
356 "pciclass0c0320",
357 "pciclass0c03";
358
359 reg = <0x16900 0x0 0x0 0x0 0x0>;
360 interrupts = <22 1>;
361 };
362
363 sata@e,0 {
364 compatible = "pci8086,2e71.0",
365 "pci8086,2e71",
366 "pciclass010601",
367 "pciclass0106";
368
369 reg = <0x17000 0x0 0x0 0x0 0x0>;
370 interrupts = <23 1>;
371 };
372
373 flash@f,0 {
374 compatible = "pci8086,701.1",
375 "pci8086,701",
376 "pciclass050100",
377 "pciclass0501";
378
379 reg = <0x17800 0x0 0x0 0x0 0x0>;
380 interrupts = <13 1>;
381 };
382
383 entertainment-encryption@10,0 {
384 compatible = "pci8086,702.1",
385 "pci8086,702",
386 "pciclass101000",
387 "pciclass1010";
388
389 reg = <0x18000 0x0 0x0 0x0 0x0>;
390 };
391
392 co-processor@11,0 {
393 compatible = "pci8086,703.1",
394 "pci8086,703",
395 "pciclass0b4000",
396 "pciclass0b40";
397
398 reg = <0x18800 0x0 0x0 0x0 0x0>;
399 interrupts = <1 1>;
400 };
401
402 multimedia@12,0 {
403 compatible = "pci8086,704.0",
404 "pci8086,704",
405 "pciclass048000",
406 "pciclass0480";
407
408 reg = <0x19000 0x0 0x0 0x0 0x0>;
409 };
410 };
411
412 isa@1f,0 {
413 #address-cells = <2>;
414 #size-cells = <1>;
415 compatible = "isa";
416 reg = <0xf800 0x0 0x0 0x0 0x0>;
417 ranges = <1 0 0 0 0 0x100>;
418
419 rtc@70 {
420 compatible = "intel,ce4100-rtc", "motorola,mc146818";
421 interrupts = <8 3>;
422 interrupt-parent = <&ioapic1>;
423 ctrl-reg = <2>;
424 freq-reg = <0x26>;
425 reg = <1 0x70 2>;
426 };
427 };
428 };
429 };
430};
diff --git a/arch/x86/platform/efi/Makefile b/arch/x86/platform/efi/Makefile
new file mode 100644
index 000000000000..73b8be0f3675
--- /dev/null
+++ b/arch/x86/platform/efi/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_EFI) += efi.o efi_$(BITS).o efi_stub_$(BITS).o
diff --git a/arch/x86/kernel/efi.c b/arch/x86/platform/efi/efi.c
index c2fa9b8b497e..899e393d8e73 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/platform/efi/efi.c
@@ -30,6 +30,7 @@
30#include <linux/init.h> 30#include <linux/init.h>
31#include <linux/efi.h> 31#include <linux/efi.h>
32#include <linux/bootmem.h> 32#include <linux/bootmem.h>
33#include <linux/memblock.h>
33#include <linux/spinlock.h> 34#include <linux/spinlock.h>
34#include <linux/uaccess.h> 35#include <linux/uaccess.h>
35#include <linux/time.h> 36#include <linux/time.h>
@@ -144,17 +145,6 @@ static void virt_efi_reset_system(int reset_type,
144 data_size, data); 145 data_size, data);
145} 146}
146 147
147static efi_status_t virt_efi_set_virtual_address_map(
148 unsigned long memory_map_size,
149 unsigned long descriptor_size,
150 u32 descriptor_version,
151 efi_memory_desc_t *virtual_map)
152{
153 return efi_call_virt4(set_virtual_address_map,
154 memory_map_size, descriptor_size,
155 descriptor_version, virtual_map);
156}
157
158static efi_status_t __init phys_efi_set_virtual_address_map( 148static efi_status_t __init phys_efi_set_virtual_address_map(
159 unsigned long memory_map_size, 149 unsigned long memory_map_size,
160 unsigned long descriptor_size, 150 unsigned long descriptor_size,
@@ -275,7 +265,7 @@ static void __init do_add_efi_memmap(void)
275 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 265 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
276} 266}
277 267
278void __init efi_reserve_early(void) 268void __init efi_memblock_x86_reserve_range(void)
279{ 269{
280 unsigned long pmap; 270 unsigned long pmap;
281 271
@@ -290,7 +280,7 @@ void __init efi_reserve_early(void)
290 boot_params.efi_info.efi_memdesc_size; 280 boot_params.efi_info.efi_memdesc_size;
291 memmap.desc_version = boot_params.efi_info.efi_memdesc_version; 281 memmap.desc_version = boot_params.efi_info.efi_memdesc_version;
292 memmap.desc_size = boot_params.efi_info.efi_memdesc_size; 282 memmap.desc_size = boot_params.efi_info.efi_memdesc_size;
293 reserve_early(pmap, pmap + memmap.nr_map * memmap.desc_size, 283 memblock_x86_reserve_range(pmap, pmap + memmap.nr_map * memmap.desc_size,
294 "EFI memmap"); 284 "EFI memmap");
295} 285}
296 286
@@ -314,6 +304,61 @@ static void __init print_efi_memmap(void)
314} 304}
315#endif /* EFI_DEBUG */ 305#endif /* EFI_DEBUG */
316 306
307void __init efi_reserve_boot_services(void)
308{
309 void *p;
310
311 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
312 efi_memory_desc_t *md = p;
313 u64 start = md->phys_addr;
314 u64 size = md->num_pages << EFI_PAGE_SHIFT;
315
316 if (md->type != EFI_BOOT_SERVICES_CODE &&
317 md->type != EFI_BOOT_SERVICES_DATA)
318 continue;
319 /* Only reserve where possible:
320 * - Not within any already allocated areas
321 * - Not over any memory area (really needed, if above?)
322 * - Not within any part of the kernel
323 * - Not the bios reserved area
324 */
325 if ((start+size >= virt_to_phys(_text)
326 && start <= virt_to_phys(_end)) ||
327 !e820_all_mapped(start, start+size, E820_RAM) ||
328 memblock_x86_check_reserved_size(&start, &size,
329 1<<EFI_PAGE_SHIFT)) {
330 /* Could not reserve, skip it */
331 md->num_pages = 0;
332 memblock_dbg(PFX "Could not reserve boot range "
333 "[0x%010llx-0x%010llx]\n",
334 start, start+size-1);
335 } else
336 memblock_x86_reserve_range(start, start+size,
337 "EFI Boot");
338 }
339}
340
341static void __init efi_free_boot_services(void)
342{
343 void *p;
344
345 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
346 efi_memory_desc_t *md = p;
347 unsigned long long start = md->phys_addr;
348 unsigned long long size = md->num_pages << EFI_PAGE_SHIFT;
349
350 if (md->type != EFI_BOOT_SERVICES_CODE &&
351 md->type != EFI_BOOT_SERVICES_DATA)
352 continue;
353
354 /* Could not reserve boot area */
355 if (!size)
356 continue;
357
358 free_bootmem_late(start, size);
359 }
360}
361
317void __init efi_init(void) 362void __init efi_init(void)
318{ 363{
319 efi_config_table_t *config_tables; 364 efi_config_table_t *config_tables;
@@ -459,19 +504,30 @@ void __init efi_init(void)
459 x86_platform.set_wallclock = efi_set_rtc_mmss; 504 x86_platform.set_wallclock = efi_set_rtc_mmss;
460#endif 505#endif
461 506
462 /* Setup for EFI runtime service */
463 reboot_type = BOOT_EFI;
464
465#if EFI_DEBUG 507#if EFI_DEBUG
466 print_efi_memmap(); 508 print_efi_memmap();
467#endif 509#endif
468} 510}
469 511
512void __init efi_set_executable(efi_memory_desc_t *md, bool executable)
513{
514 u64 addr, npages;
515
516 addr = md->virt_addr;
517 npages = md->num_pages;
518
519 memrange_efi_to_native(&addr, &npages);
520
521 if (executable)
522 set_memory_x(addr, npages);
523 else
524 set_memory_nx(addr, npages);
525}
526
470static void __init runtime_code_page_mkexec(void) 527static void __init runtime_code_page_mkexec(void)
471{ 528{
472 efi_memory_desc_t *md; 529 efi_memory_desc_t *md;
473 void *p; 530 void *p;
474 u64 addr, npages;
475 531
476 /* Make EFI runtime service code area executable */ 532 /* Make EFI runtime service code area executable */
477 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 533 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
@@ -480,10 +536,7 @@ static void __init runtime_code_page_mkexec(void)
480 if (md->type != EFI_RUNTIME_SERVICES_CODE) 536 if (md->type != EFI_RUNTIME_SERVICES_CODE)
481 continue; 537 continue;
482 538
483 addr = md->virt_addr; 539 efi_set_executable(md, true);
484 npages = md->num_pages;
485 memrange_efi_to_native(&addr, &npages);
486 set_memory_x(addr, npages);
487 } 540 }
488} 541}
489 542
@@ -497,16 +550,47 @@ static void __init runtime_code_page_mkexec(void)
497 */ 550 */
498void __init efi_enter_virtual_mode(void) 551void __init efi_enter_virtual_mode(void)
499{ 552{
500 efi_memory_desc_t *md; 553 efi_memory_desc_t *md, *prev_md = NULL;
501 efi_status_t status; 554 efi_status_t status;
502 unsigned long size; 555 unsigned long size;
503 u64 end, systab, addr, npages, end_pfn; 556 u64 end, systab, addr, npages, end_pfn;
504 void *p, *va; 557 void *p, *va, *new_memmap = NULL;
558 int count = 0;
505 559
506 efi.systab = NULL; 560 efi.systab = NULL;
561
562 /* Merge contiguous regions of the same type and attribute */
507 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 563 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
564 u64 prev_size;
508 md = p; 565 md = p;
509 if (!(md->attribute & EFI_MEMORY_RUNTIME)) 566
567 if (!prev_md) {
568 prev_md = md;
569 continue;
570 }
571
572 if (prev_md->type != md->type ||
573 prev_md->attribute != md->attribute) {
574 prev_md = md;
575 continue;
576 }
577
578 prev_size = prev_md->num_pages << EFI_PAGE_SHIFT;
579
580 if (md->phys_addr == (prev_md->phys_addr + prev_size)) {
581 prev_md->num_pages += md->num_pages;
582 md->type = EFI_RESERVED_TYPE;
583 md->attribute = 0;
584 continue;
585 }
586 prev_md = md;
587 }
588
589 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
590 md = p;
591 if (!(md->attribute & EFI_MEMORY_RUNTIME) &&
592 md->type != EFI_BOOT_SERVICES_CODE &&
593 md->type != EFI_BOOT_SERVICES_DATA)
510 continue; 594 continue;
511 595
512 size = md->num_pages << EFI_PAGE_SHIFT; 596 size = md->num_pages << EFI_PAGE_SHIFT;
@@ -540,15 +624,21 @@ void __init efi_enter_virtual_mode(void)
540 systab += md->virt_addr - md->phys_addr; 624 systab += md->virt_addr - md->phys_addr;
541 efi.systab = (efi_system_table_t *) (unsigned long) systab; 625 efi.systab = (efi_system_table_t *) (unsigned long) systab;
542 } 626 }
627 new_memmap = krealloc(new_memmap,
628 (count + 1) * memmap.desc_size,
629 GFP_KERNEL);
630 memcpy(new_memmap + (count * memmap.desc_size), md,
631 memmap.desc_size);
632 count++;
543 } 633 }
544 634
545 BUG_ON(!efi.systab); 635 BUG_ON(!efi.systab);
546 636
547 status = phys_efi_set_virtual_address_map( 637 status = phys_efi_set_virtual_address_map(
548 memmap.desc_size * memmap.nr_map, 638 memmap.desc_size * count,
549 memmap.desc_size, 639 memmap.desc_size,
550 memmap.desc_version, 640 memmap.desc_version,
551 memmap.phys_map); 641 (efi_memory_desc_t *)__pa(new_memmap));
552 642
553 if (status != EFI_SUCCESS) { 643 if (status != EFI_SUCCESS) {
554 printk(KERN_ALERT "Unable to switch EFI into virtual mode " 644 printk(KERN_ALERT "Unable to switch EFI into virtual mode "
@@ -557,6 +647,13 @@ void __init efi_enter_virtual_mode(void)
557 } 647 }
558 648
559 /* 649 /*
650 * Thankfully, it does seem that no runtime services other than
651 * SetVirtualAddressMap() will touch boot services code, so we can
652 * get rid of it all at this point
653 */
654 efi_free_boot_services();
655
656 /*
560 * Now that EFI is in virtual mode, update the function 657 * Now that EFI is in virtual mode, update the function
561 * pointers in the runtime service table to the new virtual addresses. 658 * pointers in the runtime service table to the new virtual addresses.
562 * 659 *
@@ -571,11 +668,12 @@ void __init efi_enter_virtual_mode(void)
571 efi.set_variable = virt_efi_set_variable; 668 efi.set_variable = virt_efi_set_variable;
572 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count; 669 efi.get_next_high_mono_count = virt_efi_get_next_high_mono_count;
573 efi.reset_system = virt_efi_reset_system; 670 efi.reset_system = virt_efi_reset_system;
574 efi.set_virtual_address_map = virt_efi_set_virtual_address_map; 671 efi.set_virtual_address_map = NULL;
575 if (__supported_pte_mask & _PAGE_NX) 672 if (__supported_pte_mask & _PAGE_NX)
576 runtime_code_page_mkexec(); 673 runtime_code_page_mkexec();
577 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size); 674 early_iounmap(memmap.map, memmap.nr_map * memmap.desc_size);
578 memmap.map = NULL; 675 memmap.map = NULL;
676 kfree(new_memmap);
579} 677}
580 678
581/* 679/*
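The new first pass over memmap in efi_enter_virtual_mode() coalesces descriptors that are physically contiguous and share both type and attributes, marking the absorbed entry as EFI_RESERVED_TYPE. As a minimal sketch, the mergeability test boils down to the following (hypothetical helper, not part of the patch):

static bool efi_md_can_merge(const efi_memory_desc_t *prev,
			     const efi_memory_desc_t *md)
{
	u64 prev_size = prev->num_pages << EFI_PAGE_SHIFT;

	/* same kind of memory, and md starts exactly where prev ends */
	return prev->type == md->type &&
	       prev->attribute == md->attribute &&
	       md->phys_addr == prev->phys_addr + prev_size;
}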
diff --git a/arch/x86/kernel/efi_32.c b/arch/x86/platform/efi/efi_32.c
index 5cab48ee61a4..5cab48ee61a4 100644
--- a/arch/x86/kernel/efi_32.c
+++ b/arch/x86/platform/efi/efi_32.c
diff --git a/arch/x86/kernel/efi_64.c b/arch/x86/platform/efi/efi_64.c
index ac0621a7ac3d..ac3aa54e2654 100644
--- a/arch/x86/kernel/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -41,22 +41,7 @@
41static pgd_t save_pgd __initdata; 41static pgd_t save_pgd __initdata;
42static unsigned long efi_flags __initdata; 42static unsigned long efi_flags __initdata;
43 43
44static void __init early_mapping_set_exec(unsigned long start, 44static void __init early_code_mapping_set_exec(int executable)
45 unsigned long end,
46 int executable)
47{
48 unsigned long num_pages;
49
50 start &= PMD_MASK;
51 end = (end + PMD_SIZE - 1) & PMD_MASK;
52 num_pages = (end - start) >> PAGE_SHIFT;
53 if (executable)
54 set_memory_x((unsigned long)__va(start), num_pages);
55 else
56 set_memory_nx((unsigned long)__va(start), num_pages);
57}
58
59static void __init early_runtime_code_mapping_set_exec(int executable)
60{ 45{
61 efi_memory_desc_t *md; 46 efi_memory_desc_t *md;
62 void *p; 47 void *p;
@@ -64,14 +49,12 @@ static void __init early_runtime_code_mapping_set_exec(int executable)
64 if (!(__supported_pte_mask & _PAGE_NX)) 49 if (!(__supported_pte_mask & _PAGE_NX))
65 return; 50 return;
66 51
67 /* Make EFI runtime service code area executable */ 52 /* Make EFI service code area executable */
68 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) { 53 for (p = memmap.map; p < memmap.map_end; p += memmap.desc_size) {
69 md = p; 54 md = p;
70 if (md->type == EFI_RUNTIME_SERVICES_CODE) { 55 if (md->type == EFI_RUNTIME_SERVICES_CODE ||
71 unsigned long end; 56 md->type == EFI_BOOT_SERVICES_CODE)
72 end = md->phys_addr + (md->num_pages << EFI_PAGE_SHIFT); 57 efi_set_executable(md, executable);
73 early_mapping_set_exec(md->phys_addr, end, executable);
74 }
75 } 58 }
76} 59}
77 60
@@ -79,7 +62,7 @@ void __init efi_call_phys_prelog(void)
79{ 62{
80 unsigned long vaddress; 63 unsigned long vaddress;
81 64
82 early_runtime_code_mapping_set_exec(1); 65 early_code_mapping_set_exec(1);
83 local_irq_save(efi_flags); 66 local_irq_save(efi_flags);
84 vaddress = (unsigned long)__va(0x0UL); 67 vaddress = (unsigned long)__va(0x0UL);
85 save_pgd = *pgd_offset_k(0x0UL); 68 save_pgd = *pgd_offset_k(0x0UL);
@@ -95,7 +78,7 @@ void __init efi_call_phys_epilog(void)
95 set_pgd(pgd_offset_k(0x0UL), save_pgd); 78 set_pgd(pgd_offset_k(0x0UL), save_pgd);
96 __flush_tlb_all(); 79 __flush_tlb_all();
97 local_irq_restore(efi_flags); 80 local_irq_restore(efi_flags);
98 early_runtime_code_mapping_set_exec(0); 81 early_code_mapping_set_exec(0);
99} 82}
100 83
101void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size, 84void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
@@ -107,8 +90,10 @@ void __iomem *__init efi_ioremap(unsigned long phys_addr, unsigned long size,
107 return ioremap(phys_addr, size); 90 return ioremap(phys_addr, size);
108 91
109 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size); 92 last_map_pfn = init_memory_mapping(phys_addr, phys_addr + size);
110 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) 93 if ((last_map_pfn << PAGE_SHIFT) < phys_addr + size) {
111 return NULL; 94 unsigned long top = last_map_pfn << PAGE_SHIFT;
95 efi_ioremap(top, size - (top - phys_addr), type);
96 }
112 97
113 return (void __iomem *)__va(phys_addr); 98 return (void __iomem *)__va(phys_addr);
114} 99}
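efi_set_executable() (added in the efi.c hunk above and used here for both runtime and boot services code) goes through memrange_efi_to_native() before calling set_memory_x()/set_memory_nx(). Conceptually that step turns a descriptor counted in 4 KiB EFI pages into a kernel-page-aligned start address and native page count; a rough sketch of that conversion, under the assumption that EFI_PAGE_SHIFT is 12 and PAGE_SHIFT may be larger:

/* Illustrative sketch only -- not the kernel's memrange_efi_to_native(). */
static void efi_range_to_native_sketch(u64 *addr, u64 *npages)
{
	u64 end = *addr + (*npages << EFI_PAGE_SHIFT);	/* end of range, in bytes */

	*addr &= PAGE_MASK;				    /* round start down */
	*npages = (PAGE_ALIGN(end) - *addr) >> PAGE_SHIFT;  /* round end up    */
}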
diff --git a/arch/x86/kernel/efi_stub_32.S b/arch/x86/platform/efi/efi_stub_32.S
index fbe66e626c09..fbe66e626c09 100644
--- a/arch/x86/kernel/efi_stub_32.S
+++ b/arch/x86/platform/efi/efi_stub_32.S
diff --git a/arch/x86/kernel/efi_stub_64.S b/arch/x86/platform/efi/efi_stub_64.S
index 4c07ccab8146..4c07ccab8146 100644
--- a/arch/x86/kernel/efi_stub_64.S
+++ b/arch/x86/platform/efi/efi_stub_64.S
diff --git a/arch/x86/platform/iris/Makefile b/arch/x86/platform/iris/Makefile
new file mode 100644
index 000000000000..db921983a102
--- /dev/null
+++ b/arch/x86/platform/iris/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_32_IRIS) += iris.o
diff --git a/arch/x86/platform/iris/iris.c b/arch/x86/platform/iris/iris.c
new file mode 100644
index 000000000000..1ba7f5ed8c9b
--- /dev/null
+++ b/arch/x86/platform/iris/iris.c
@@ -0,0 +1,91 @@
1/*
2 * Eurobraille/Iris power off support.
3 *
4 * Eurobraille's Iris machine is a PC with no APM or ACPI support.
 5 * It is shut down by a special I/O sequence, which this module provides.
6 *
7 * Copyright (C) Shérab <Sebastien.Hinderer@ens-lyon.org>
8 *
9 * This program is free software ; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation ; either version 2 of the License, or
12 * (at your option) any later version.
13 *
14 * This program is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY ; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License
20 * along with the program ; if not, write to the Free Software
21 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
22 */
23
24#include <linux/moduleparam.h>
25#include <linux/module.h>
26#include <linux/kernel.h>
27#include <linux/errno.h>
28#include <linux/delay.h>
29#include <linux/init.h>
30#include <linux/pm.h>
31#include <asm/io.h>
32
33#define IRIS_GIO_BASE 0x340
34#define IRIS_GIO_INPUT IRIS_GIO_BASE
35#define IRIS_GIO_OUTPUT (IRIS_GIO_BASE + 1)
36#define IRIS_GIO_PULSE 0x80 /* First byte to send */
37#define IRIS_GIO_REST 0x00 /* Second byte to send */
38#define IRIS_GIO_NODEV 0xff /* Likely not an Iris */
39
40MODULE_LICENSE("GPL");
41MODULE_AUTHOR("Sébastien Hinderer <Sebastien.Hinderer@ens-lyon.org>");
42MODULE_DESCRIPTION("A power_off handler for Iris devices from EuroBraille");
43MODULE_SUPPORTED_DEVICE("Eurobraille/Iris");
44
45static int force;
46
47module_param(force, bool, 0);
48MODULE_PARM_DESC(force, "Set to one to force poweroff handler installation.");
49
50static void (*old_pm_power_off)(void);
51
52static void iris_power_off(void)
53{
54 outb(IRIS_GIO_PULSE, IRIS_GIO_OUTPUT);
55 msleep(850);
56 outb(IRIS_GIO_REST, IRIS_GIO_OUTPUT);
57}
58
59/*
60 * Before installing the power_off handler, try to make sure the OS is
61 * running on an Iris. Since Iris does not support DMI, this is done
62 * by reading its input port and seeing whether the read value is
63 * meaningful.
64 */
65static int iris_init(void)
66{
67 unsigned char status;
68 if (force != 1) {
 69 printk(KERN_ERR "The force parameter has not been set to 1, so the Iris poweroff handler will not be installed.\n");
70 return -ENODEV;
71 }
72 status = inb(IRIS_GIO_INPUT);
73 if (status == IRIS_GIO_NODEV) {
74 printk(KERN_ERR "This machine does not seem to be an Iris. Power_off handler not installed.\n");
75 return -ENODEV;
76 }
77 old_pm_power_off = pm_power_off;
78 pm_power_off = &iris_power_off;
79 printk(KERN_INFO "Iris power_off handler installed.\n");
80
81 return 0;
82}
83
84static void iris_exit(void)
85{
86 pm_power_off = old_pm_power_off;
87 printk(KERN_INFO "Iris power_off handler uninstalled.\n");
88}
89
90module_init(iris_init);
91module_exit(iris_exit);
diff --git a/arch/x86/platform/mrst/Makefile b/arch/x86/platform/mrst/Makefile
new file mode 100644
index 000000000000..f61ccdd49341
--- /dev/null
+++ b/arch/x86/platform/mrst/Makefile
@@ -0,0 +1,3 @@
1obj-$(CONFIG_X86_MRST) += mrst.o
2obj-$(CONFIG_X86_MRST) += vrtc.o
3obj-$(CONFIG_EARLY_PRINTK_MRST) += early_printk_mrst.o
diff --git a/arch/x86/platform/mrst/early_printk_mrst.c b/arch/x86/platform/mrst/early_printk_mrst.c
new file mode 100644
index 000000000000..25bfdbb5b130
--- /dev/null
+++ b/arch/x86/platform/mrst/early_printk_mrst.c
@@ -0,0 +1,319 @@
1/*
2 * early_printk_mrst.c - early consoles for Intel MID platforms
3 *
4 * Copyright (c) 2008-2010, Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11
12/*
13 * This file implements two early consoles named mrst and hsu.
14 * mrst is based on Maxim3110 spi-uart device, it exists in both
15 * Moorestown and Medfield platforms, while hsu is based on a High
16 * Speed UART device which only exists in the Medfield platform
17 */
18
19#include <linux/serial_reg.h>
20#include <linux/serial_mfd.h>
21#include <linux/kmsg_dump.h>
22#include <linux/console.h>
23#include <linux/kernel.h>
24#include <linux/delay.h>
25#include <linux/init.h>
26#include <linux/io.h>
27
28#include <asm/fixmap.h>
29#include <asm/pgtable.h>
30#include <asm/mrst.h>
31
32#define MRST_SPI_TIMEOUT 0x200000
33#define MRST_REGBASE_SPI0 0xff128000
34#define MRST_REGBASE_SPI1 0xff128400
35#define MRST_CLK_SPI0_REG 0xff11d86c
36
37/* Bit fields in CTRLR0 */
38#define SPI_DFS_OFFSET 0
39
40#define SPI_FRF_OFFSET 4
41#define SPI_FRF_SPI 0x0
42#define SPI_FRF_SSP 0x1
43#define SPI_FRF_MICROWIRE 0x2
44#define SPI_FRF_RESV 0x3
45
46#define SPI_MODE_OFFSET 6
47#define SPI_SCPH_OFFSET 6
48#define SPI_SCOL_OFFSET 7
49#define SPI_TMOD_OFFSET 8
50#define SPI_TMOD_TR 0x0 /* xmit & recv */
51#define SPI_TMOD_TO 0x1 /* xmit only */
52#define SPI_TMOD_RO 0x2 /* recv only */
53#define SPI_TMOD_EPROMREAD 0x3 /* eeprom read mode */
54
55#define SPI_SLVOE_OFFSET 10
56#define SPI_SRL_OFFSET 11
57#define SPI_CFS_OFFSET 12
58
59/* Bit fields in SR, 7 bits */
60#define SR_MASK 0x7f /* cover 7 bits */
61#define SR_BUSY (1 << 0)
62#define SR_TF_NOT_FULL (1 << 1)
63#define SR_TF_EMPT (1 << 2)
64#define SR_RF_NOT_EMPT (1 << 3)
65#define SR_RF_FULL (1 << 4)
66#define SR_TX_ERR (1 << 5)
67#define SR_DCOL (1 << 6)
68
69struct dw_spi_reg {
70 u32 ctrl0;
71 u32 ctrl1;
72 u32 ssienr;
73 u32 mwcr;
74 u32 ser;
75 u32 baudr;
76 u32 txfltr;
77 u32 rxfltr;
78 u32 txflr;
79 u32 rxflr;
80 u32 sr;
81 u32 imr;
82 u32 isr;
83 u32 risr;
84 u32 txoicr;
85 u32 rxoicr;
86 u32 rxuicr;
87 u32 msticr;
88 u32 icr;
89 u32 dmacr;
90 u32 dmatdlr;
91 u32 dmardlr;
92 u32 idr;
93 u32 version;
94
95 /* Currently operates as 32 bits, though only the low 16 bits matter */
96 u32 dr;
97} __packed;
98
99#define dw_readl(dw, name) __raw_readl(&(dw)->name)
100#define dw_writel(dw, name, val) __raw_writel((val), &(dw)->name)
101
102/* Default to the SPI0 registers for mrst; if we detect Penwell, use SPI1 */
103static unsigned long mrst_spi_paddr = MRST_REGBASE_SPI0;
104
105static u32 *pclk_spi0;
106/* Always contains an accessible address once set up; starts out as 0 */
107static struct dw_spi_reg *pspi;
108
109static struct kmsg_dumper dw_dumper;
110static int dumper_registered;
111
112static void dw_kmsg_dump(struct kmsg_dumper *dumper,
113 enum kmsg_dump_reason reason,
114 const char *s1, unsigned long l1,
115 const char *s2, unsigned long l2)
116{
117 int i;
118
 119 /* Whenever this runs, re-initialize the HW to be safe */
120 mrst_early_console_init();
121
122 for (i = 0; i < l1; i++)
123 early_mrst_console.write(&early_mrst_console, s1 + i, 1);
124 for (i = 0; i < l2; i++)
125 early_mrst_console.write(&early_mrst_console, s2 + i, 1);
126}
127
128/* Set the baud rate to 115200, 8n1, IRQs disabled */
129static void max3110_write_config(void)
130{
131 u16 config;
132
133 config = 0xc001;
134 dw_writel(pspi, dr, config);
135}
136
137/* Translate the char into an eligible word and send it to the max3110 */
138static void max3110_write_data(char c)
139{
140 u16 data;
141
142 data = 0x8000 | c;
143 dw_writel(pspi, dr, data);
144}
145
146void mrst_early_console_init(void)
147{
148 u32 ctrlr0 = 0;
149 u32 spi0_cdiv;
 150 u32 freq; /* Frequency info only needs to be read once */
151
152 /* Base clk is 100 MHz, the actual clk = 100M / (clk_divider + 1) */
153 pclk_spi0 = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
154 MRST_CLK_SPI0_REG);
155 spi0_cdiv = ((*pclk_spi0) & 0xe00) >> 9;
156 freq = 100000000 / (spi0_cdiv + 1);
157
158 if (mrst_identify_cpu() == MRST_CPU_CHIP_PENWELL)
159 mrst_spi_paddr = MRST_REGBASE_SPI1;
160
161 pspi = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
162 mrst_spi_paddr);
163
164 /* Disable SPI controller */
165 dw_writel(pspi, ssienr, 0);
166
167 /* Set control param, 8 bits, transmit only mode */
168 ctrlr0 = dw_readl(pspi, ctrl0);
169
170 ctrlr0 &= 0xfcc0;
171 ctrlr0 |= 0xf | (SPI_FRF_SPI << SPI_FRF_OFFSET)
172 | (SPI_TMOD_TO << SPI_TMOD_OFFSET);
173 dw_writel(pspi, ctrl0, ctrlr0);
174
175 /*
 176 * Change the spi0 clk to comply with 115200 bps; use 100000 to
 177 * calculate the clk divisor so the clock runs a little slower
 178 * than the real baud rate.
179 */
180 dw_writel(pspi, baudr, freq/100000);
181
182 /* Disable all INT for early phase */
183 dw_writel(pspi, imr, 0x0);
184
185 /* Set the cs to spi-uart */
186 dw_writel(pspi, ser, 0x2);
187
188 /* Enable the HW, the last step for HW init */
189 dw_writel(pspi, ssienr, 0x1);
190
191 /* Set the default configuration */
192 max3110_write_config();
193
194 /* Register the kmsg dumper */
195 if (!dumper_registered) {
196 dw_dumper.dump = dw_kmsg_dump;
197 kmsg_dump_register(&dw_dumper);
198 dumper_registered = 1;
199 }
200}
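A worked example of the divisor arithmetic in mrst_early_console_init() above (illustrative numbers only): if spi0_cdiv reads back as 0, then freq = 100000000 / (0 + 1) = 100 MHz, the baudr register is written with 100000000 / 100000 = 1000, and the resulting SPI bit clock is 100000000 / 1000 = 100 kHz, i.e. just under the 115200 bps target, as the in-code comment intends.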
201
202/* Slave select should be asserted in the read/write function */
203static void early_mrst_spi_putc(char c)
204{
205 unsigned int timeout;
206 u32 sr;
207
208 timeout = MRST_SPI_TIMEOUT;
209 /* Early putc needs to make sure the TX FIFO is not full */
210 while (--timeout) {
211 sr = dw_readl(pspi, sr);
212 if (!(sr & SR_TF_NOT_FULL))
213 cpu_relax();
214 else
215 break;
216 }
217
218 if (!timeout)
219 pr_warning("MRST earlycon: timed out\n");
220 else
221 max3110_write_data(c);
222}
223
224/* Early SPI only uses polling mode */
225static void early_mrst_spi_write(struct console *con, const char *str, unsigned n)
226{
227 int i;
228
229 for (i = 0; i < n && *str; i++) {
230 if (*str == '\n')
231 early_mrst_spi_putc('\r');
232 early_mrst_spi_putc(*str);
233 str++;
234 }
235}
236
237struct console early_mrst_console = {
238 .name = "earlymrst",
239 .write = early_mrst_spi_write,
240 .flags = CON_PRINTBUFFER,
241 .index = -1,
242};
243
244/*
245 * Following is the early console based on Medfield HSU (High
246 * Speed UART) device.
247 */
248#define HSU_PORT2_PADDR 0xffa28180
249
250static void __iomem *phsu;
251
252void hsu_early_console_init(void)
253{
254 u8 lcr;
255
256 phsu = (void *)set_fixmap_offset_nocache(FIX_EARLYCON_MEM_BASE,
257 HSU_PORT2_PADDR);
258
259 /* Disable FIFO */
260 writeb(0x0, phsu + UART_FCR);
261
262 /* Set to default 115200 bps, 8n1 */
263 lcr = readb(phsu + UART_LCR);
264 writeb((0x80 | lcr), phsu + UART_LCR);
265 writeb(0x18, phsu + UART_DLL);
266 writeb(lcr, phsu + UART_LCR);
267 writel(0x3600, phsu + UART_MUL*4);
268
269 writeb(0x8, phsu + UART_MCR);
270 writeb(0x7, phsu + UART_FCR);
271 writeb(0x3, phsu + UART_LCR);
272
273 /* Clear IRQ status */
274 readb(phsu + UART_LSR);
275 readb(phsu + UART_RX);
276 readb(phsu + UART_IIR);
277 readb(phsu + UART_MSR);
278
279 /* Enable FIFO */
280 writeb(0x7, phsu + UART_FCR);
281}
282
283#define BOTH_EMPTY (UART_LSR_TEMT | UART_LSR_THRE)
284
285static void early_hsu_putc(char ch)
286{
287 unsigned int timeout = 10000; /* 10ms */
288 u8 status;
289
290 while (--timeout) {
291 status = readb(phsu + UART_LSR);
292 if (status & BOTH_EMPTY)
293 break;
294 udelay(1);
295 }
296
297 /* Only write the char when there was no timeout */
298 if (timeout)
299 writeb(ch, phsu + UART_TX);
300}
301
302static void early_hsu_write(struct console *con, const char *str, unsigned n)
303{
304 int i;
305
306 for (i = 0; i < n && *str; i++) {
307 if (*str == '\n')
308 early_hsu_putc('\r');
309 early_hsu_putc(*str);
310 str++;
311 }
312}
313
314struct console early_hsu_console = {
315 .name = "earlyhsu",
316 .write = early_hsu_write,
317 .flags = CON_PRINTBUFFER,
318 .index = -1,
319};
diff --git a/arch/x86/platform/mrst/mrst.c b/arch/x86/platform/mrst/mrst.c
new file mode 100644
index 000000000000..7000e74b3087
--- /dev/null
+++ b/arch/x86/platform/mrst/mrst.c
@@ -0,0 +1,811 @@
1/*
2 * mrst.c: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2008 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 */
12
13#define pr_fmt(fmt) "mrst: " fmt
14
15#include <linux/init.h>
16#include <linux/kernel.h>
17#include <linux/sfi.h>
18#include <linux/intel_pmic_gpio.h>
19#include <linux/spi/spi.h>
20#include <linux/i2c.h>
21#include <linux/i2c/pca953x.h>
22#include <linux/gpio_keys.h>
23#include <linux/input.h>
24#include <linux/platform_device.h>
25#include <linux/irq.h>
26#include <linux/module.h>
27
28#include <asm/setup.h>
29#include <asm/mpspec_def.h>
30#include <asm/hw_irq.h>
31#include <asm/apic.h>
32#include <asm/io_apic.h>
33#include <asm/mrst.h>
34#include <asm/mrst-vrtc.h>
35#include <asm/io.h>
36#include <asm/i8259.h>
37#include <asm/intel_scu_ipc.h>
38#include <asm/apb_timer.h>
39#include <asm/reboot.h>
40
41/*
 42 * The clockevent device on Moorestown/Medfield can be the APBT or the LAPIC
 43 * clock; the cmdline option x86_mrst_timer can be used to override this
 44 * configuration and prefer one or the other.
 45 * At runtime, there are basically three timer configurations:
 46 * 1. per-cpu APBT clock only
 47 * 2. per-cpu always-on LAPIC clocks only (Penwell/Medfield only)
 48 * 3. per-cpu LAPIC clock (C3STOP) and one APBT clock, with broadcast.
 49 *
 50 * By default (without a cmdline option), the platform code first detects the
 51 * CPU type to see whether we are on Lincroft or Penwell, then sets up either
 52 * the LAPIC or the APBT clocks accordingly.
 53 * I.e. by default, Medfield uses configuration #2 and Moorestown uses #1.
 54 * Config #3 is supported but not recommended on Medfield.
55 *
56 * rating and feature summary:
57 * lapic (with C3STOP) --------- 100
58 * apbt (always-on) ------------ 110
59 * lapic (always-on,ARAT) ------ 150
60 */
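For reference, the cmdline handling later in this file (setup_x86_mrst_timer()) together with mrst_time_init() appears to map onto these configurations roughly as follows (a sketch derived from the code below, not an authoritative table):

/*
 *   x86_mrst_timer=apbt_only       ->  MRST_TIMER_APBT_ONLY   ->  config 1
 *   x86_mrst_timer=lapic_and_apbt  ->  MRST_TIMER_LAPIC_APBT  ->  config 3
 *   no option, CPU has ARAT        ->  default case           ->  config 2
 *   no option, no ARAT             ->  default case           ->  config 1
 */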
61
62__cpuinitdata enum mrst_timer_options mrst_timer_options;
63
64static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
65static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
66enum mrst_cpu_type __mrst_cpu_chip;
67EXPORT_SYMBOL_GPL(__mrst_cpu_chip);
68
69int sfi_mtimer_num;
70
71struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
72EXPORT_SYMBOL_GPL(sfi_mrtc_array);
73int sfi_mrtc_num;
74
75/* parse all the mtimer info to a static mtimer array */
76static int __init sfi_parse_mtmr(struct sfi_table_header *table)
77{
78 struct sfi_table_simple *sb;
79 struct sfi_timer_table_entry *pentry;
80 struct mpc_intsrc mp_irq;
81 int totallen;
82
83 sb = (struct sfi_table_simple *)table;
84 if (!sfi_mtimer_num) {
85 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
86 struct sfi_timer_table_entry);
87 pentry = (struct sfi_timer_table_entry *) sb->pentry;
88 totallen = sfi_mtimer_num * sizeof(*pentry);
89 memcpy(sfi_mtimer_array, pentry, totallen);
90 }
91
92 pr_debug("SFI MTIMER info (num = %d):\n", sfi_mtimer_num);
93 pentry = sfi_mtimer_array;
94 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
95 pr_debug("timer[%d]: paddr = 0x%08x, freq = %dHz,"
96 " irq = %d\n", totallen, (u32)pentry->phys_addr,
97 pentry->freq_hz, pentry->irq);
98 if (!pentry->irq)
99 continue;
100 mp_irq.type = MP_INTSRC;
101 mp_irq.irqtype = mp_INT;
102/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
103 mp_irq.irqflag = 5;
104 mp_irq.srcbus = MP_BUS_ISA;
105 mp_irq.srcbusirq = pentry->irq; /* IRQ */
106 mp_irq.dstapic = MP_APIC_ALL;
107 mp_irq.dstirq = pentry->irq;
108 mp_save_irq(&mp_irq);
109 }
110
111 return 0;
112}
113
114struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
115{
116 int i;
117 if (hint < sfi_mtimer_num) {
118 if (!sfi_mtimer_usage[hint]) {
119 pr_debug("hint taken for timer %d irq %d\n",\
120 hint, sfi_mtimer_array[hint].irq);
121 sfi_mtimer_usage[hint] = 1;
122 return &sfi_mtimer_array[hint];
123 }
124 }
125 /* take the first timer available */
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (!sfi_mtimer_usage[i]) {
128 sfi_mtimer_usage[i] = 1;
129 return &sfi_mtimer_array[i];
130 }
131 i++;
132 }
133 return NULL;
134}
135
136void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
137{
138 int i;
139 for (i = 0; i < sfi_mtimer_num;) {
140 if (mtmr->irq == sfi_mtimer_array[i].irq) {
141 sfi_mtimer_usage[i] = 0;
142 return;
143 }
144 i++;
145 }
146}
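sfi_get_mtmr() and sfi_free_mtmr() implement a simple claim/release scheme over the static timer array; the APB timer code is the intended consumer. A hypothetical caller would look roughly like this:

	struct sfi_timer_table_entry *tmr;

	tmr = sfi_get_mtmr(0);			/* prefer entry 0 if it is free */
	if (tmr) {
		/* ... program the timer at tmr->phys_addr, wire up tmr->irq ... */
		sfi_free_mtmr(tmr);		/* release the claim when done */
	}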
147
148/* parse all the mrtc info to a global mrtc array */
149int __init sfi_parse_mrtc(struct sfi_table_header *table)
150{
151 struct sfi_table_simple *sb;
152 struct sfi_rtc_table_entry *pentry;
153 struct mpc_intsrc mp_irq;
154
155 int totallen;
156
157 sb = (struct sfi_table_simple *)table;
158 if (!sfi_mrtc_num) {
159 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
160 struct sfi_rtc_table_entry);
161 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
162 totallen = sfi_mrtc_num * sizeof(*pentry);
163 memcpy(sfi_mrtc_array, pentry, totallen);
164 }
165
166 pr_debug("SFI RTC info (num = %d):\n", sfi_mrtc_num);
167 pentry = sfi_mrtc_array;
168 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
169 pr_debug("RTC[%d]: paddr = 0x%08x, irq = %d\n",
170 totallen, (u32)pentry->phys_addr, pentry->irq);
171 mp_irq.type = MP_INTSRC;
172 mp_irq.irqtype = mp_INT;
173 mp_irq.irqflag = 0xf; /* level trigger and active low */
174 mp_irq.srcbus = MP_BUS_ISA;
175 mp_irq.srcbusirq = pentry->irq; /* IRQ */
176 mp_irq.dstapic = MP_APIC_ALL;
177 mp_irq.dstirq = pentry->irq;
178 mp_save_irq(&mp_irq);
179 }
180 return 0;
181}
182
183static unsigned long __init mrst_calibrate_tsc(void)
184{
185 unsigned long flags, fast_calibrate;
186
187 local_irq_save(flags);
188 fast_calibrate = apbt_quick_calibrate();
189 local_irq_restore(flags);
190
191 if (fast_calibrate)
192 return fast_calibrate;
193
194 return 0;
195}
196
197static void __init mrst_time_init(void)
198{
199 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
200 switch (mrst_timer_options) {
201 case MRST_TIMER_APBT_ONLY:
202 break;
203 case MRST_TIMER_LAPIC_APBT:
204 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
205 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
206 break;
207 default:
208 if (!boot_cpu_has(X86_FEATURE_ARAT))
209 break;
210 x86_init.timers.setup_percpu_clockev = setup_boot_APIC_clock;
211 x86_cpuinit.setup_percpu_clockev = setup_secondary_APIC_clock;
212 return;
213 }
214 /* we need at least one APB timer */
215 pre_init_apic_IRQ0();
216 apbt_time_init();
217}
218
219static void __cpuinit mrst_arch_setup(void)
220{
221 if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x27)
222 __mrst_cpu_chip = MRST_CPU_CHIP_PENWELL;
223 else if (boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x26)
224 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
225 else {
226 pr_err("Unknown Moorestown CPU (%d:%d), default to Lincroft\n",
227 boot_cpu_data.x86, boot_cpu_data.x86_model);
228 __mrst_cpu_chip = MRST_CPU_CHIP_LINCROFT;
229 }
230 pr_debug("Moorestown CPU %s identified\n",
231 (__mrst_cpu_chip == MRST_CPU_CHIP_LINCROFT) ?
232 "Lincroft" : "Penwell");
233}
234
235/* MID systems don't have i8042 controller */
236static int mrst_i8042_detect(void)
237{
238 return 0;
239}
240
241/* Reboot and power off are handled by the SCU on a MID device */
242static void mrst_power_off(void)
243{
244 intel_scu_ipc_simple_command(0xf1, 1);
245}
246
247static void mrst_reboot(void)
248{
249 intel_scu_ipc_simple_command(0xf1, 0);
250}
251
252/*
253 * Moorestown specific x86_init function overrides and early setup
254 * calls.
255 */
256void __init x86_mrst_early_setup(void)
257{
258 x86_init.resources.probe_roms = x86_init_noop;
259 x86_init.resources.reserve_resources = x86_init_noop;
260
261 x86_init.timers.timer_init = mrst_time_init;
262 x86_init.timers.setup_percpu_clockev = x86_init_noop;
263
264 x86_init.irqs.pre_vector_init = x86_init_noop;
265
266 x86_init.oem.arch_setup = mrst_arch_setup;
267
268 x86_cpuinit.setup_percpu_clockev = apbt_setup_secondary_clock;
269
270 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
271 x86_platform.i8042_detect = mrst_i8042_detect;
272 x86_init.timers.wallclock_init = mrst_rtc_init;
273 x86_init.pci.init = pci_mrst_init;
274 x86_init.pci.fixup_irqs = x86_init_noop;
275
276 legacy_pic = &null_legacy_pic;
277
278 /* Moorestown specific power_off/restart method */
279 pm_power_off = mrst_power_off;
280 machine_ops.emergency_restart = mrst_reboot;
281
282 /* Avoid searching for BIOS MP tables */
283 x86_init.mpparse.find_smp_config = x86_init_noop;
284 x86_init.mpparse.get_smp_config = x86_init_uint_noop;
285 set_bit(MP_BUS_ISA, mp_bus_not_pci);
286}
287
288/*
 289 * If the user does not want to use the per-CPU APB timer, just give it a
 290 * lower rating than the local APIC timer and skip the late per-CPU timer init.
291 */
292static inline int __init setup_x86_mrst_timer(char *arg)
293{
294 if (!arg)
295 return -EINVAL;
296
297 if (strcmp("apbt_only", arg) == 0)
298 mrst_timer_options = MRST_TIMER_APBT_ONLY;
299 else if (strcmp("lapic_and_apbt", arg) == 0)
300 mrst_timer_options = MRST_TIMER_LAPIC_APBT;
301 else {
 302 pr_warning("X86 MRST timer option %s not recognised;"
303 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
304 arg);
305 return -EINVAL;
306 }
307 return 0;
308}
309__setup("x86_mrst_timer=", setup_x86_mrst_timer);
310
311/*
 312 * Parse the GPIO table first, since the DEVS table will need it
 313 * to map pin names to actual pins.
314 */
315static struct sfi_gpio_table_entry *gpio_table;
316static int gpio_num_entry;
317
318static int __init sfi_parse_gpio(struct sfi_table_header *table)
319{
320 struct sfi_table_simple *sb;
321 struct sfi_gpio_table_entry *pentry;
322 int num, i;
323
324 if (gpio_table)
325 return 0;
326 sb = (struct sfi_table_simple *)table;
327 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_gpio_table_entry);
328 pentry = (struct sfi_gpio_table_entry *)sb->pentry;
329
330 gpio_table = (struct sfi_gpio_table_entry *)
331 kmalloc(num * sizeof(*pentry), GFP_KERNEL);
332 if (!gpio_table)
333 return -1;
334 memcpy(gpio_table, pentry, num * sizeof(*pentry));
335 gpio_num_entry = num;
336
337 pr_debug("GPIO pin info:\n");
338 for (i = 0; i < num; i++, pentry++)
339 pr_debug("info[%2d]: controller = %16.16s, pin_name = %16.16s,"
340 " pin = %d\n", i,
341 pentry->controller_name,
342 pentry->pin_name,
343 pentry->pin_no);
344 return 0;
345}
346
347static int get_gpio_by_name(const char *name)
348{
349 struct sfi_gpio_table_entry *pentry = gpio_table;
350 int i;
351
352 if (!pentry)
353 return -1;
354 for (i = 0; i < gpio_num_entry; i++, pentry++) {
355 if (!strncmp(name, pentry->pin_name, SFI_NAME_LEN))
356 return pentry->pin_no;
357 }
358 return -1;
359}
360
361/*
 362 * This defines the array of device platform data that IAFW exports
 363 * through the SFI "DEVS" table; we use name and type to match a device
 364 * with its platform data.
365 */
366struct devs_id {
367 char name[SFI_NAME_LEN + 1];
368 u8 type;
369 u8 delay;
370 void *(*get_platform_data)(void *info);
371};
372
373/* the offset for the mapping of global gpio pin to irq */
374#define MRST_IRQ_OFFSET 0x100
375
376static void __init *pmic_gpio_platform_data(void *info)
377{
378 static struct intel_pmic_gpio_platform_data pmic_gpio_pdata;
379 int gpio_base = get_gpio_by_name("pmic_gpio_base");
380
381 if (gpio_base == -1)
382 gpio_base = 64;
383 pmic_gpio_pdata.gpio_base = gpio_base;
384 pmic_gpio_pdata.irq_base = gpio_base + MRST_IRQ_OFFSET;
385 pmic_gpio_pdata.gpiointr = 0xffffeff8;
386
387 return &pmic_gpio_pdata;
388}
389
390static void __init *max3111_platform_data(void *info)
391{
392 struct spi_board_info *spi_info = info;
393 int intr = get_gpio_by_name("max3111_int");
394
395 if (intr == -1)
396 return NULL;
397 spi_info->irq = intr + MRST_IRQ_OFFSET;
398 return NULL;
399}
400
401/* we have multiple max7315 on the board ... */
402#define MAX7315_NUM 2
403static void __init *max7315_platform_data(void *info)
404{
405 static struct pca953x_platform_data max7315_pdata[MAX7315_NUM];
406 static int nr;
407 struct pca953x_platform_data *max7315 = &max7315_pdata[nr];
408 struct i2c_board_info *i2c_info = info;
409 int gpio_base, intr;
410 char base_pin_name[SFI_NAME_LEN + 1];
411 char intr_pin_name[SFI_NAME_LEN + 1];
412
413 if (nr == MAX7315_NUM) {
414 pr_err("too many max7315s, we only support %d\n",
415 MAX7315_NUM);
416 return NULL;
417 }
 418 /* We have several max7315s on the board; we only need to load several
 419 * instances of the same pca953x driver to cover them.
420 */
421 strcpy(i2c_info->type, "max7315");
422 if (nr++) {
423 sprintf(base_pin_name, "max7315_%d_base", nr);
424 sprintf(intr_pin_name, "max7315_%d_int", nr);
425 } else {
426 strcpy(base_pin_name, "max7315_base");
427 strcpy(intr_pin_name, "max7315_int");
428 }
429
430 gpio_base = get_gpio_by_name(base_pin_name);
431 intr = get_gpio_by_name(intr_pin_name);
432
433 if (gpio_base == -1)
434 return NULL;
435 max7315->gpio_base = gpio_base;
436 if (intr != -1) {
437 i2c_info->irq = intr + MRST_IRQ_OFFSET;
438 max7315->irq_base = gpio_base + MRST_IRQ_OFFSET;
439 } else {
440 i2c_info->irq = -1;
441 max7315->irq_base = -1;
442 }
443 return max7315;
444}
445
446static void __init *emc1403_platform_data(void *info)
447{
448 static short intr2nd_pdata;
449 struct i2c_board_info *i2c_info = info;
450 int intr = get_gpio_by_name("thermal_int");
451 int intr2nd = get_gpio_by_name("thermal_alert");
452
453 if (intr == -1 || intr2nd == -1)
454 return NULL;
455
456 i2c_info->irq = intr + MRST_IRQ_OFFSET;
457 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
458
459 return &intr2nd_pdata;
460}
461
462static void __init *lis331dl_platform_data(void *info)
463{
464 static short intr2nd_pdata;
465 struct i2c_board_info *i2c_info = info;
466 int intr = get_gpio_by_name("accel_int");
467 int intr2nd = get_gpio_by_name("accel_2");
468
469 if (intr == -1 || intr2nd == -1)
470 return NULL;
471
472 i2c_info->irq = intr + MRST_IRQ_OFFSET;
473 intr2nd_pdata = intr2nd + MRST_IRQ_OFFSET;
474
475 return &intr2nd_pdata;
476}
477
478static void __init *no_platform_data(void *info)
479{
480 return NULL;
481}
482
483static const struct devs_id __initconst device_ids[] = {
484 {"pmic_gpio", SFI_DEV_TYPE_SPI, 1, &pmic_gpio_platform_data},
485 {"spi_max3111", SFI_DEV_TYPE_SPI, 0, &max3111_platform_data},
486 {"i2c_max7315", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
487 {"i2c_max7315_2", SFI_DEV_TYPE_I2C, 1, &max7315_platform_data},
488 {"emc1403", SFI_DEV_TYPE_I2C, 1, &emc1403_platform_data},
489 {"i2c_accel", SFI_DEV_TYPE_I2C, 0, &lis331dl_platform_data},
490 {"pmic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
491 {"msic_audio", SFI_DEV_TYPE_IPC, 1, &no_platform_data},
492 {},
493};
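Supporting an additional SFI-enumerated device is then a matter of supplying a get_platform_data() callback and a table row. A hypothetical entry for an extra I2C device (all names made up for illustration) would look like:

static void __init *foo_sensor_platform_data(void *info)
{
	/* info is the struct i2c_board_info being filled in by sfi_parse_devs() */
	return NULL;	/* or a pointer to board-specific platform data */
}

	/* added to device_ids[] before the terminating {} entry: */
	{"i2c_foo", SFI_DEV_TYPE_I2C, 0, &foo_sensor_platform_data},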
494
495#define MAX_IPCDEVS 24
496static struct platform_device *ipc_devs[MAX_IPCDEVS];
497static int ipc_next_dev;
498
499#define MAX_SCU_SPI 24
500static struct spi_board_info *spi_devs[MAX_SCU_SPI];
501static int spi_next_dev;
502
503#define MAX_SCU_I2C 24
504static struct i2c_board_info *i2c_devs[MAX_SCU_I2C];
505static int i2c_bus[MAX_SCU_I2C];
506static int i2c_next_dev;
507
508static void __init intel_scu_device_register(struct platform_device *pdev)
509{
 510 if (ipc_next_dev == MAX_IPCDEVS)
511 pr_err("too many SCU IPC devices");
512 else
513 ipc_devs[ipc_next_dev++] = pdev;
514}
515
516static void __init intel_scu_spi_device_register(struct spi_board_info *sdev)
517{
518 struct spi_board_info *new_dev;
519
520 if (spi_next_dev == MAX_SCU_SPI) {
521 pr_err("too many SCU SPI devices");
522 return;
523 }
524
525 new_dev = kzalloc(sizeof(*sdev), GFP_KERNEL);
526 if (!new_dev) {
527 pr_err("failed to alloc mem for delayed spi dev %s\n",
528 sdev->modalias);
529 return;
530 }
531 memcpy(new_dev, sdev, sizeof(*sdev));
532
533 spi_devs[spi_next_dev++] = new_dev;
534}
535
536static void __init intel_scu_i2c_device_register(int bus,
537 struct i2c_board_info *idev)
538{
539 struct i2c_board_info *new_dev;
540
541 if (i2c_next_dev == MAX_SCU_I2C) {
542 pr_err("too many SCU I2C devices");
543 return;
544 }
545
546 new_dev = kzalloc(sizeof(*idev), GFP_KERNEL);
547 if (!new_dev) {
548 pr_err("failed to alloc mem for delayed i2c dev %s\n",
549 idev->type);
550 return;
551 }
552 memcpy(new_dev, idev, sizeof(*idev));
553
554 i2c_bus[i2c_next_dev] = bus;
555 i2c_devs[i2c_next_dev++] = new_dev;
556}
557
558/* Called by IPC driver */
559void intel_scu_devices_create(void)
560{
561 int i;
562
563 for (i = 0; i < ipc_next_dev; i++)
564 platform_device_add(ipc_devs[i]);
565
566 for (i = 0; i < spi_next_dev; i++)
567 spi_register_board_info(spi_devs[i], 1);
568
569 for (i = 0; i < i2c_next_dev; i++) {
570 struct i2c_adapter *adapter;
571 struct i2c_client *client;
572
573 adapter = i2c_get_adapter(i2c_bus[i]);
574 if (adapter) {
575 client = i2c_new_device(adapter, i2c_devs[i]);
576 if (!client)
577 pr_err("can't create i2c device %s\n",
578 i2c_devs[i]->type);
579 } else
580 i2c_register_board_info(i2c_bus[i], i2c_devs[i], 1);
581 }
582}
583EXPORT_SYMBOL_GPL(intel_scu_devices_create);
584
585/* Called by IPC driver */
586void intel_scu_devices_destroy(void)
587{
588 int i;
589
590 for (i = 0; i < ipc_next_dev; i++)
591 platform_device_del(ipc_devs[i]);
592}
593EXPORT_SYMBOL_GPL(intel_scu_devices_destroy);
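Note that the three *_device_register() helpers above only queue board info; nothing is registered with the driver core until the SCU IPC driver is up and calls intel_scu_devices_create(). A sketch of the expected call sequence from that driver (hypothetical placement):

	/* in the SCU IPC driver, once the firmware mailbox is usable: */
	intel_scu_devices_create();

	/* and on teardown: */
	intel_scu_devices_destroy();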
594
595static void __init install_irq_resource(struct platform_device *pdev, int irq)
596{
597 /* Single threaded */
598 static struct resource __initdata res = {
599 .name = "IRQ",
600 .flags = IORESOURCE_IRQ,
601 };
602 res.start = irq;
603 platform_device_add_resources(pdev, &res, 1);
604}
605
606static void __init sfi_handle_ipc_dev(struct platform_device *pdev)
607{
608 const struct devs_id *dev = device_ids;
609 void *pdata = NULL;
610
611 while (dev->name[0]) {
612 if (dev->type == SFI_DEV_TYPE_IPC &&
613 !strncmp(dev->name, pdev->name, SFI_NAME_LEN)) {
614 pdata = dev->get_platform_data(pdev);
615 break;
616 }
617 dev++;
618 }
619 pdev->dev.platform_data = pdata;
620 intel_scu_device_register(pdev);
621}
622
623static void __init sfi_handle_spi_dev(struct spi_board_info *spi_info)
624{
625 const struct devs_id *dev = device_ids;
626 void *pdata = NULL;
627
628 while (dev->name[0]) {
629 if (dev->type == SFI_DEV_TYPE_SPI &&
630 !strncmp(dev->name, spi_info->modalias, SFI_NAME_LEN)) {
631 pdata = dev->get_platform_data(spi_info);
632 break;
633 }
634 dev++;
635 }
636 spi_info->platform_data = pdata;
637 if (dev->delay)
638 intel_scu_spi_device_register(spi_info);
639 else
640 spi_register_board_info(spi_info, 1);
641}
642
643static void __init sfi_handle_i2c_dev(int bus, struct i2c_board_info *i2c_info)
644{
645 const struct devs_id *dev = device_ids;
646 void *pdata = NULL;
647
648 while (dev->name[0]) {
649 if (dev->type == SFI_DEV_TYPE_I2C &&
650 !strncmp(dev->name, i2c_info->type, SFI_NAME_LEN)) {
651 pdata = dev->get_platform_data(i2c_info);
652 break;
653 }
654 dev++;
655 }
656 i2c_info->platform_data = pdata;
657
658 if (dev->delay)
659 intel_scu_i2c_device_register(bus, i2c_info);
660 else
661 i2c_register_board_info(bus, i2c_info, 1);
662}
663
664
665static int __init sfi_parse_devs(struct sfi_table_header *table)
666{
667 struct sfi_table_simple *sb;
668 struct sfi_device_table_entry *pentry;
669 struct spi_board_info spi_info;
670 struct i2c_board_info i2c_info;
671 struct platform_device *pdev;
672 int num, i, bus;
673 int ioapic;
674 struct io_apic_irq_attr irq_attr;
675
676 sb = (struct sfi_table_simple *)table;
677 num = SFI_GET_NUM_ENTRIES(sb, struct sfi_device_table_entry);
678 pentry = (struct sfi_device_table_entry *)sb->pentry;
679
680 for (i = 0; i < num; i++, pentry++) {
681 if (pentry->irq != (u8)0xff) { /* native RTE case */
 682 /* These SPI2 devices are not exposed to the system as PCI
 683 * devices, but they have separate RTE entries in the IOAPIC,
 684 * so we have to enable them one by one here
685 */
686 ioapic = mp_find_ioapic(pentry->irq);
687 irq_attr.ioapic = ioapic;
688 irq_attr.ioapic_pin = pentry->irq;
689 irq_attr.trigger = 1;
690 irq_attr.polarity = 1;
691 io_apic_set_pci_routing(NULL, pentry->irq, &irq_attr);
692 }
693 switch (pentry->type) {
694 case SFI_DEV_TYPE_IPC:
695 /* ID as IRQ is a hack that will go away */
696 pdev = platform_device_alloc(pentry->name, pentry->irq);
697 if (pdev == NULL) {
698 pr_err("out of memory for SFI platform device '%s'.\n",
699 pentry->name);
700 continue;
701 }
702 install_irq_resource(pdev, pentry->irq);
703 pr_debug("info[%2d]: IPC bus, name = %16.16s, "
704 "irq = 0x%2x\n", i, pentry->name, pentry->irq);
705 sfi_handle_ipc_dev(pdev);
706 break;
707 case SFI_DEV_TYPE_SPI:
708 memset(&spi_info, 0, sizeof(spi_info));
709 strncpy(spi_info.modalias, pentry->name, SFI_NAME_LEN);
710 spi_info.irq = pentry->irq;
711 spi_info.bus_num = pentry->host_num;
712 spi_info.chip_select = pentry->addr;
713 spi_info.max_speed_hz = pentry->max_freq;
714 pr_debug("info[%2d]: SPI bus = %d, name = %16.16s, "
715 "irq = 0x%2x, max_freq = %d, cs = %d\n", i,
716 spi_info.bus_num,
717 spi_info.modalias,
718 spi_info.irq,
719 spi_info.max_speed_hz,
720 spi_info.chip_select);
721 sfi_handle_spi_dev(&spi_info);
722 break;
723 case SFI_DEV_TYPE_I2C:
724 memset(&i2c_info, 0, sizeof(i2c_info));
725 bus = pentry->host_num;
726 strncpy(i2c_info.type, pentry->name, SFI_NAME_LEN);
727 i2c_info.irq = pentry->irq;
728 i2c_info.addr = pentry->addr;
729 pr_debug("info[%2d]: I2C bus = %d, name = %16.16s, "
730 "irq = 0x%2x, addr = 0x%x\n", i, bus,
731 i2c_info.type,
732 i2c_info.irq,
733 i2c_info.addr);
734 sfi_handle_i2c_dev(bus, &i2c_info);
735 break;
736 case SFI_DEV_TYPE_UART:
737 case SFI_DEV_TYPE_HSI:
738 default:
739 ;
740 }
741 }
742 return 0;
743}
744
745static int __init mrst_platform_init(void)
746{
747 sfi_table_parse(SFI_SIG_GPIO, NULL, NULL, sfi_parse_gpio);
748 sfi_table_parse(SFI_SIG_DEVS, NULL, NULL, sfi_parse_devs);
749 return 0;
750}
751arch_initcall(mrst_platform_init);
752
753/*
 754 * We will look these buttons up in the SFI GPIO table (by name)
 755 * and register them dynamically. Please add all possible
 756 * buttons here; we will drop the ones for which no GPIO is found.
757 */
758static struct gpio_keys_button gpio_button[] = {
759 {KEY_POWER, -1, 1, "power_btn", EV_KEY, 0, 3000},
760 {KEY_PROG1, -1, 1, "prog_btn1", EV_KEY, 0, 20},
761 {KEY_PROG2, -1, 1, "prog_btn2", EV_KEY, 0, 20},
762 {SW_LID, -1, 1, "lid_switch", EV_SW, 0, 20},
763 {KEY_VOLUMEUP, -1, 1, "vol_up", EV_KEY, 0, 20},
764 {KEY_VOLUMEDOWN, -1, 1, "vol_down", EV_KEY, 0, 20},
765 {KEY_CAMERA, -1, 1, "camera_full", EV_KEY, 0, 20},
766 {KEY_CAMERA_FOCUS, -1, 1, "camera_half", EV_KEY, 0, 20},
767 {SW_KEYPAD_SLIDE, -1, 1, "MagSw1", EV_SW, 0, 20},
768 {SW_KEYPAD_SLIDE, -1, 1, "MagSw2", EV_SW, 0, 20},
769};
770
771static struct gpio_keys_platform_data mrst_gpio_keys = {
772 .buttons = gpio_button,
773 .rep = 1,
774 .nbuttons = -1, /* will fill it after search */
775};
776
777static struct platform_device pb_device = {
778 .name = "gpio-keys",
779 .id = -1,
780 .dev = {
781 .platform_data = &mrst_gpio_keys,
782 },
783};
784
785/*
 786 * Drop the buttons whose GPIOs were not found, and register the
 787 * gpio button device if any remain
788 */
789static int __init pb_keys_init(void)
790{
791 struct gpio_keys_button *gb = gpio_button;
792 int i, num, good = 0;
793
794 num = sizeof(gpio_button) / sizeof(struct gpio_keys_button);
795 for (i = 0; i < num; i++) {
796 gb[i].gpio = get_gpio_by_name(gb[i].desc);
797 if (gb[i].gpio == -1)
798 continue;
799
800 if (i != good)
801 gb[good] = gb[i];
802 good++;
803 }
804
805 if (good) {
806 mrst_gpio_keys.nbuttons = good;
807 return platform_device_register(&pb_device);
808 }
809 return 0;
810}
811late_initcall(pb_keys_init);
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
new file mode 100644
index 000000000000..73d70d65e76e
--- /dev/null
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -0,0 +1,159 @@
1/*
2 * vrtc.c: Driver for virtual RTC device on Intel MID platform
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 *
11 * Note:
 12 * The vRTC is emulated by the system controller firmware; the real HW
 13 * RTC is located in the PMIC device. The SCU FW shadows the PMIC RTC
14 * in a memory mapped IO space that is visible to the host IA
15 * processor.
16 *
17 * This driver is based on RTC CMOS driver.
18 */
19
20#include <linux/kernel.h>
21#include <linux/init.h>
22#include <linux/sfi.h>
23#include <linux/platform_device.h>
24
25#include <asm/mrst.h>
26#include <asm/mrst-vrtc.h>
27#include <asm/time.h>
28#include <asm/fixmap.h>
29
30static unsigned char __iomem *vrtc_virt_base;
31
32unsigned char vrtc_cmos_read(unsigned char reg)
33{
34 unsigned char retval;
35
36 /* vRTC's registers range from 0x0 to 0xD */
37 if (reg > 0xd || !vrtc_virt_base)
38 return 0xff;
39
40 lock_cmos_prefix(reg);
41 retval = __raw_readb(vrtc_virt_base + (reg << 2));
42 lock_cmos_suffix(reg);
43 return retval;
44}
45EXPORT_SYMBOL_GPL(vrtc_cmos_read);
46
47void vrtc_cmos_write(unsigned char val, unsigned char reg)
48{
49 if (reg > 0xd || !vrtc_virt_base)
50 return;
51
52 lock_cmos_prefix(reg);
53 __raw_writeb(val, vrtc_virt_base + (reg << 2));
54 lock_cmos_suffix(reg);
55}
56EXPORT_SYMBOL_GPL(vrtc_cmos_write);
57
58unsigned long vrtc_get_time(void)
59{
60 u8 sec, min, hour, mday, mon;
61 u32 year;
62
63 while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
64 cpu_relax();
65
66 sec = vrtc_cmos_read(RTC_SECONDS);
67 min = vrtc_cmos_read(RTC_MINUTES);
68 hour = vrtc_cmos_read(RTC_HOURS);
69 mday = vrtc_cmos_read(RTC_DAY_OF_MONTH);
70 mon = vrtc_cmos_read(RTC_MONTH);
71 year = vrtc_cmos_read(RTC_YEAR);
72
 73 /* The vRTC YEAR reg contains the year as an offset from 1960 */
74 year += 1960;
75
76 printk(KERN_INFO "vRTC: sec: %d min: %d hour: %d day: %d "
77 "mon: %d year: %d\n", sec, min, hour, mday, mon, year);
78
79 return mktime(year, mon, mday, hour, min, sec);
80}
81
82/* Only care about the minutes and seconds */
83int vrtc_set_mmss(unsigned long nowtime)
84{
85 int real_sec, real_min;
86 int vrtc_min;
87
88 vrtc_min = vrtc_cmos_read(RTC_MINUTES);
89
90 real_sec = nowtime % 60;
91 real_min = nowtime / 60;
92 if (((abs(real_min - vrtc_min) + 15)/30) & 1)
93 real_min += 30;
94 real_min %= 60;
95
96 vrtc_cmos_write(real_sec, RTC_SECONDS);
97 vrtc_cmos_write(real_min, RTC_MINUTES);
98 return 0;
99}
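The ((abs(real_min - vrtc_min) + 15) / 30) & 1 test above mirrors the classic CMOS set_mmss logic; it appears to compensate for an RTC kept at a half-hour offset by shifting the written minutes by 30, so that the hours register (which is never written here) stays consistent. A worked example with illustrative numbers:

/*
 * nowtime = 60010 s  ->  real_sec = 10, real_min = 1000 (total minutes)
 * vrtc_min = 10      ->  abs(1000 - 10) = 990, (990 + 15) / 30 = 33, odd
 *                    ->  real_min += 30, and (1030 % 60) = 10 is written
 * instead of 1000 % 60 = 40, preserving the 30-minute offset already in vRTC.
 */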
100
101void __init mrst_rtc_init(void)
102{
103 unsigned long vrtc_paddr;
104
105 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
106
107 vrtc_paddr = sfi_mrtc_array[0].phys_addr;
108 if (!sfi_mrtc_num || !vrtc_paddr)
109 return;
110
111 vrtc_virt_base = (void __iomem *)set_fixmap_offset_nocache(FIX_LNW_VRTC,
112 vrtc_paddr);
113 x86_platform.get_wallclock = vrtc_get_time;
114 x86_platform.set_wallclock = vrtc_set_mmss;
115}
116
117/*
118 * The Moorestown platform has a memory mapped virtual RTC device that emulates
119 * the programming interface of the RTC.
120 */
121
122static struct resource vrtc_resources[] = {
123 [0] = {
124 .flags = IORESOURCE_MEM,
125 },
126 [1] = {
127 .flags = IORESOURCE_IRQ,
128 }
129};
130
131static struct platform_device vrtc_device = {
132 .name = "rtc_mrst",
133 .id = -1,
134 .resource = vrtc_resources,
135 .num_resources = ARRAY_SIZE(vrtc_resources),
136};
137
138/* Register the RTC device if appropriate */
139static int __init mrst_device_create(void)
140{
141 /* No Moorestown, no device */
142 if (!mrst_identify_cpu())
143 return -ENODEV;
144 /* No timer, no device */
145 if (!sfi_mrtc_num)
146 return -ENODEV;
147
148 /* iomem resource */
149 vrtc_resources[0].start = sfi_mrtc_array[0].phys_addr;
150 vrtc_resources[0].end = sfi_mrtc_array[0].phys_addr +
151 MRST_VRTC_MAP_SZ;
152 /* irq resource */
153 vrtc_resources[1].start = sfi_mrtc_array[0].irq;
154 vrtc_resources[1].end = sfi_mrtc_array[0].irq;
155
156 return platform_device_register(&vrtc_device);
157}
158
159module_init(mrst_device_create);
diff --git a/arch/x86/platform/olpc/Makefile b/arch/x86/platform/olpc/Makefile
new file mode 100644
index 000000000000..81c5e2165c24
--- /dev/null
+++ b/arch/x86/platform/olpc/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_OLPC) += olpc.o olpc_ofw.o olpc_dt.o
2obj-$(CONFIG_OLPC_XO1) += olpc-xo1.o
diff --git a/arch/x86/platform/olpc/olpc-xo1.c b/arch/x86/platform/olpc/olpc-xo1.c
new file mode 100644
index 000000000000..ab81fb271760
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc-xo1.c
@@ -0,0 +1,146 @@
1/*
2 * Support for features of the OLPC XO-1 laptop
3 *
4 * Copyright (C) 2010 Andres Salomon <dilinger@queued.net>
5 * Copyright (C) 2010 One Laptop per Child
6 * Copyright (C) 2006 Red Hat, Inc.
7 * Copyright (C) 2006 Advanced Micro Devices, Inc.
8 *
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
13 */
14
15#include <linux/module.h>
16#include <linux/platform_device.h>
17#include <linux/pm.h>
18#include <linux/mfd/core.h>
19
20#include <asm/io.h>
21#include <asm/olpc.h>
22
23#define DRV_NAME "olpc-xo1"
24
25/* PMC registers (PMS block) */
26#define PM_SCLK 0x10
27#define PM_IN_SLPCTL 0x20
28#define PM_WKXD 0x34
29#define PM_WKD 0x30
30#define PM_SSC 0x54
31
32/* PM registers (ACPI block) */
33#define PM1_CNT 0x08
34#define PM_GPE0_STS 0x18
35
36static unsigned long acpi_base;
37static unsigned long pms_base;
38
39static void xo1_power_off(void)
40{
41 printk(KERN_INFO "OLPC XO-1 power off sequence...\n");
42
43 /* Enable all of these controls with 0 delay */
44 outl(0x40000000, pms_base + PM_SCLK);
45 outl(0x40000000, pms_base + PM_IN_SLPCTL);
46 outl(0x40000000, pms_base + PM_WKXD);
47 outl(0x40000000, pms_base + PM_WKD);
48
49 /* Clear status bits (possibly unnecessary) */
50 outl(0x0002ffff, pms_base + PM_SSC);
51 outl(0xffffffff, acpi_base + PM_GPE0_STS);
52
53 /* Write SLP_EN bit to start the machinery */
54 outl(0x00002000, acpi_base + PM1_CNT);
55}
56
57static int __devinit olpc_xo1_probe(struct platform_device *pdev)
58{
59 struct resource *res;
60 int err;
61
62 /* don't run on non-XOs */
63 if (!machine_is_olpc())
64 return -ENODEV;
65
66 err = mfd_cell_enable(pdev);
67 if (err)
68 return err;
69
70 res = platform_get_resource(pdev, IORESOURCE_IO, 0);
71 if (!res) {
72 dev_err(&pdev->dev, "can't fetch device resource info\n");
73 return -EIO;
74 }
75 if (strcmp(pdev->name, "cs5535-pms") == 0)
76 pms_base = res->start;
77 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
78 acpi_base = res->start;
79
80 /* If we have both addresses, we can override the poweroff hook */
81 if (pms_base && acpi_base) {
82 pm_power_off = xo1_power_off;
83 printk(KERN_INFO "OLPC XO-1 support registered\n");
84 }
85
86 return 0;
87}
88
89static int __devexit olpc_xo1_remove(struct platform_device *pdev)
90{
91 mfd_cell_disable(pdev);
92
93 if (strcmp(pdev->name, "cs5535-pms") == 0)
94 pms_base = 0;
95 else if (strcmp(pdev->name, "olpc-xo1-pm-acpi") == 0)
96 acpi_base = 0;
97
98 pm_power_off = NULL;
99 return 0;
100}
101
102static struct platform_driver cs5535_pms_drv = {
103 .driver = {
104 .name = "cs5535-pms",
105 .owner = THIS_MODULE,
106 },
107 .probe = olpc_xo1_probe,
108 .remove = __devexit_p(olpc_xo1_remove),
109};
110
111static struct platform_driver cs5535_acpi_drv = {
112 .driver = {
113 .name = "olpc-xo1-pm-acpi",
114 .owner = THIS_MODULE,
115 },
116 .probe = olpc_xo1_probe,
117 .remove = __devexit_p(olpc_xo1_remove),
118};
119
120static int __init olpc_xo1_init(void)
121{
122 int r;
123
124 r = platform_driver_register(&cs5535_pms_drv);
125 if (r)
126 return r;
127
128 r = platform_driver_register(&cs5535_acpi_drv);
129 if (r)
130 platform_driver_unregister(&cs5535_pms_drv);
131
132 return r;
133}
134
135static void __exit olpc_xo1_exit(void)
136{
137 platform_driver_unregister(&cs5535_acpi_drv);
138 platform_driver_unregister(&cs5535_pms_drv);
139}
140
141MODULE_AUTHOR("Daniel Drake <dsd@laptop.org>");
142MODULE_LICENSE("GPL");
143MODULE_ALIAS("platform:cs5535-pms");
144
145module_init(olpc_xo1_init);
146module_exit(olpc_xo1_exit);
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/platform/olpc/olpc.c
index 0e0cdde519be..0060fd59ea00 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/platform/olpc/olpc.c
@@ -17,6 +17,8 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20#include <linux/platform_device.h>
21#include <linux/of.h>
20 22
21#include <asm/geode.h> 23#include <asm/geode.h>
22#include <asm/setup.h> 24#include <asm/setup.h>
@@ -114,6 +116,7 @@ int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
114 unsigned long flags; 116 unsigned long flags;
115 int ret = -EIO; 117 int ret = -EIO;
116 int i; 118 int i;
119 int restarts = 0;
117 120
118 spin_lock_irqsave(&ec_lock, flags); 121 spin_lock_irqsave(&ec_lock, flags);
119 122
@@ -169,7 +172,9 @@ restart:
169 if (wait_on_obf(0x6c, 1)) { 172 if (wait_on_obf(0x6c, 1)) {
170 printk(KERN_ERR "olpc-ec: timeout waiting for" 173 printk(KERN_ERR "olpc-ec: timeout waiting for"
171 " EC to provide data!\n"); 174 " EC to provide data!\n");
172 goto restart; 175 if (restarts++ < 10)
176 goto restart;
177 goto err;
173 } 178 }
174 outbuf[i] = inb(0x68); 179 outbuf[i] = inb(0x68);
175 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]); 180 pr_devel("olpc-ec: received 0x%x\n", outbuf[i]);
@@ -183,55 +188,68 @@ err:
183} 188}
184EXPORT_SYMBOL_GPL(olpc_ec_cmd); 189EXPORT_SYMBOL_GPL(olpc_ec_cmd);
185 190
186#ifdef CONFIG_OLPC_OPENFIRMWARE 191static bool __init check_ofw_architecture(struct device_node *root)
187static void __init platform_detect(void)
188{
189 size_t propsize;
190 __be32 rev;
191 const void *args[] = { NULL, "board-revision-int", &rev, (void *)4 };
192 void *res[] = { &propsize };
193
194 if (olpc_ofw("getprop", args, res) || propsize != 4) {
195 printk(KERN_ERR "ofw: getprop call failed!\n");
196 rev = cpu_to_be32(0);
197 }
198 olpc_platform_info.boardrev = be32_to_cpu(rev);
199}
200#else
201static void __init platform_detect(void)
202{ 192{
203 /* stopgap until OFW support is added to the kernel */ 193 const char *olpc_arch;
204 olpc_platform_info.boardrev = olpc_board(0xc2); 194 int propsize;
195
196 olpc_arch = of_get_property(root, "architecture", &propsize);
197 return propsize == 5 && strncmp("OLPC", olpc_arch, 5) == 0;
205} 198}
206#endif
207 199
208static int __init olpc_init(void) 200static u32 __init get_board_revision(struct device_node *root)
209{ 201{
210 unsigned char *romsig; 202 int propsize;
203 const __be32 *rev;
211 204
212 /* The ioremap check is dangerous; limit what we run it on */ 205 rev = of_get_property(root, "board-revision-int", &propsize);
213 if (!is_geode() || cs5535_has_vsa2()) 206 if (propsize != 4)
214 return 0; 207 return 0;
215 208
216 spin_lock_init(&ec_lock); 209 return be32_to_cpu(*rev);
210}
217 211
218 romsig = ioremap(0xffffffc0, 16); 212static bool __init platform_detect(void)
219 if (!romsig) 213{
220 return 0; 214 struct device_node *root = of_find_node_by_path("/");
215 bool success;
221 216
222 if (strncmp(romsig, "CL1 Q", 7)) 217 if (!root)
223 goto unmap; 218 return false;
224 if (strncmp(romsig+6, romsig+13, 3)) { 219
225 printk(KERN_INFO "OLPC BIOS signature looks invalid. " 220 success = check_ofw_architecture(root);
226 "Assuming not OLPC\n"); 221 if (success) {
227 goto unmap; 222 olpc_platform_info.boardrev = get_board_revision(root);
223 olpc_platform_info.flags |= OLPC_F_PRESENT;
228 } 224 }
229 225
230 printk(KERN_INFO "OLPC board with OpenFirmware %.16s\n", romsig); 226 of_node_put(root);
231 olpc_platform_info.flags |= OLPC_F_PRESENT; 227 return success;
228}
229
230static int __init add_xo1_platform_devices(void)
231{
232 struct platform_device *pdev;
233
234 pdev = platform_device_register_simple("xo1-rfkill", -1, NULL, 0);
235 if (IS_ERR(pdev))
236 return PTR_ERR(pdev);
237
238 pdev = platform_device_register_simple("olpc-xo1", -1, NULL, 0);
239 if (IS_ERR(pdev))
240 return PTR_ERR(pdev);
241
242 return 0;
243}
244
245static int __init olpc_init(void)
246{
247 int r = 0;
232 248
233 /* get the platform revision */ 249 if (!olpc_ofw_present() || !platform_detect())
234 platform_detect(); 250 return 0;
251
252 spin_lock_init(&ec_lock);
235 253
236 /* assume B1 and above models always have a DCON */ 254 /* assume B1 and above models always have a DCON */
237 if (olpc_board_at_least(olpc_board(0xb1))) 255 if (olpc_board_at_least(olpc_board(0xb1)))
@@ -242,8 +260,10 @@ static int __init olpc_init(void)
242 (unsigned char *) &olpc_platform_info.ecver, 1); 260 (unsigned char *) &olpc_platform_info.ecver, 1);
243 261
244#ifdef CONFIG_PCI_OLPC 262#ifdef CONFIG_PCI_OLPC
245 /* If the VSA exists let it emulate PCI, if not emulate in kernel */ 263 /* If the VSA exists let it emulate PCI, if not emulate in kernel.
246 if (!cs5535_has_vsa2()) 264 * XO-1 only. */
265 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0) &&
266 !cs5535_has_vsa2())
247 x86_init.pci.arch_init = pci_olpc_init; 267 x86_init.pci.arch_init = pci_olpc_init;
248#endif 268#endif
249 269
@@ -252,8 +272,12 @@ static int __init olpc_init(void)
252 olpc_platform_info.boardrev >> 4, 272 olpc_platform_info.boardrev >> 4,
253 olpc_platform_info.ecver); 273 olpc_platform_info.ecver);
254 274
255unmap: 275 if (olpc_platform_info.boardrev < olpc_board_pre(0xd0)) { /* XO-1 */
256 iounmap(romsig); 276 r = add_xo1_platform_devices();
277 if (r)
278 return r;
279 }
280
257 return 0; 281 return 0;
258} 282}
259 283
diff --git a/arch/x86/platform/olpc/olpc_dt.c b/arch/x86/platform/olpc/olpc_dt.c
new file mode 100644
index 000000000000..d39f63d017d2
--- /dev/null
+++ b/arch/x86/platform/olpc/olpc_dt.c
@@ -0,0 +1,201 @@
1/*
2 * OLPC-specific OFW device tree support code.
3 *
4 * Paul Mackerras August 1996.
5 * Copyright (C) 1996-2005 Paul Mackerras.
6 *
7 * Adapted for 64bit PowerPC by Dave Engebretsen and Peter Bergner.
8 * {engebret|bergner}@us.ibm.com
9 *
10 * Adapted for sparc by David S. Miller davem@davemloft.net
11 * Adapted for x86/OLPC by Andres Salomon <dilinger@queued.net>
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 */
18
19#include <linux/kernel.h>
20#include <linux/bootmem.h>
21#include <linux/of.h>
22#include <linux/of_platform.h>
23#include <linux/of_pdt.h>
24#include <asm/olpc.h>
25#include <asm/olpc_ofw.h>
26
27static phandle __init olpc_dt_getsibling(phandle node)
28{
29 const void *args[] = { (void *)node };
30 void *res[] = { &node };
31
32 if ((s32)node == -1)
33 return 0;
34
35 if (olpc_ofw("peer", args, res) || (s32)node == -1)
36 return 0;
37
38 return node;
39}
40
41static phandle __init olpc_dt_getchild(phandle node)
42{
43 const void *args[] = { (void *)node };
44 void *res[] = { &node };
45
46 if ((s32)node == -1)
47 return 0;
48
49 if (olpc_ofw("child", args, res) || (s32)node == -1) {
50 pr_err("PROM: %s: fetching child failed!\n", __func__);
51 return 0;
52 }
53
54 return node;
55}
56
57static int __init olpc_dt_getproplen(phandle node, const char *prop)
58{
59 const void *args[] = { (void *)node, prop };
60 int len;
61 void *res[] = { &len };
62
63 if ((s32)node == -1)
64 return -1;
65
66 if (olpc_ofw("getproplen", args, res)) {
67 pr_err("PROM: %s: getproplen failed!\n", __func__);
68 return -1;
69 }
70
71 return len;
72}
73
74static int __init olpc_dt_getproperty(phandle node, const char *prop,
75 char *buf, int bufsize)
76{
77 int plen;
78
79 plen = olpc_dt_getproplen(node, prop);
80 if (plen > bufsize || plen < 1) {
81 return -1;
82 } else {
83 const void *args[] = { (void *)node, prop, buf, (void *)plen };
84 void *res[] = { &plen };
85
86 if (olpc_ofw("getprop", args, res)) {
87 pr_err("PROM: %s: getprop failed!\n", __func__);
88 return -1;
89 }
90 }
91
92 return plen;
93}
94
95static int __init olpc_dt_nextprop(phandle node, char *prev, char *buf)
96{
97 const void *args[] = { (void *)node, prev, buf };
98 int success;
99 void *res[] = { &success };
100
101 buf[0] = '\0';
102
103 if ((s32)node == -1)
104 return -1;
105
106 if (olpc_ofw("nextprop", args, res) || success != 1)
107 return -1;
108
109 return 0;
110}
111
112static int __init olpc_dt_pkg2path(phandle node, char *buf,
113 const int buflen, int *len)
114{
115 const void *args[] = { (void *)node, buf, (void *)buflen };
116 void *res[] = { len };
117
118 if ((s32)node == -1)
119 return -1;
120
121 if (olpc_ofw("package-to-path", args, res) || *len < 1)
122 return -1;
123
124 return 0;
125}
126
127static unsigned int prom_early_allocated __initdata;
128
129void * __init prom_early_alloc(unsigned long size)
130{
131 static u8 *mem;
132 static size_t free_mem;
133 void *res;
134
135 if (free_mem < size) {
136 const size_t chunk_size = max(PAGE_SIZE, size);
137
138 /*
 139	 * To minimize the number of allocations, grab at least
140 * PAGE_SIZE of memory (that's an arbitrary choice that's
141 * fast enough on the platforms we care about while minimizing
142 * wasted bootmem) and hand off chunks of it to callers.
143 */
144 res = alloc_bootmem(chunk_size);
145 BUG_ON(!res);
146 prom_early_allocated += chunk_size;
147 memset(res, 0, chunk_size);
148 free_mem = chunk_size;
149 mem = res;
150 }
151
152 /* allocate from the local cache */
153 free_mem -= size;
154 res = mem;
155 mem += size;
156 return res;
157}
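prom_early_alloc() above is a simple bump allocator: when the current chunk runs out it grabs at least a page of bootmem, then hands out pieces from the front of that chunk and never frees them. A stand-alone user-space sketch of the same pattern, with hypothetical names (malloc() stands in for alloc_bootmem(); none of this is part of the patch):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define CHUNK_SIZE 4096			/* stand-in for PAGE_SIZE */

static unsigned char *pool;		/* current chunk being handed out */
static size_t pool_free;		/* bytes still unused in that chunk */

static void *early_alloc(size_t size)
{
	void *res;

	if (pool_free < size) {
		/* grab at least one chunk so small requests can share it */
		size_t chunk = size > CHUNK_SIZE ? size : CHUNK_SIZE;

		pool = malloc(chunk);
		if (!pool)
			abort();
		memset(pool, 0, chunk);
		pool_free = chunk;
	}

	/* carve the request off the front of the current chunk */
	res = pool;
	pool += size;
	pool_free -= size;
	return res;
}

int main(void)
{
	char *a = early_alloc(16);	/* both land in the same chunk */
	char *b = early_alloc(32);

	printf("a=%p b=%p gap=%td\n", (void *)a, (void *)b, b - a);
	return 0;
}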
158
159static struct of_pdt_ops prom_olpc_ops __initdata = {
160 .nextprop = olpc_dt_nextprop,
161 .getproplen = olpc_dt_getproplen,
162 .getproperty = olpc_dt_getproperty,
163 .getchild = olpc_dt_getchild,
164 .getsibling = olpc_dt_getsibling,
165 .pkg2path = olpc_dt_pkg2path,
166};
167
168void __init olpc_dt_build_devicetree(void)
169{
170 phandle root;
171
172 if (!olpc_ofw_is_installed())
173 return;
174
175 root = olpc_dt_getsibling(0);
176 if (!root) {
177 pr_err("PROM: unable to get root node from OFW!\n");
178 return;
179 }
180 of_pdt_build_devicetree(root, &prom_olpc_ops);
181
182 pr_info("PROM DT: Built device tree with %u bytes of memory.\n",
183 prom_early_allocated);
184}
185
186/* A list of DT node/bus matches that we want to expose as platform devices */
187static struct of_device_id __initdata of_ids[] = {
188 { .compatible = "olpc,xo1-battery" },
189 { .compatible = "olpc,xo1-dcon" },
190 { .compatible = "olpc,xo1-rtc" },
191 {},
192};
193
194static int __init olpc_create_platform_devices(void)
195{
196 if (machine_is_olpc())
197 return of_platform_bus_probe(NULL, of_ids, NULL);
198 else
199 return 0;
200}
201device_initcall(olpc_create_platform_devices);
diff --git a/arch/x86/kernel/olpc_ofw.c b/arch/x86/platform/olpc/olpc_ofw.c
index 3218aa71ab5e..e7604f62870d 100644
--- a/arch/x86/kernel/olpc_ofw.c
+++ b/arch/x86/platform/olpc/olpc_ofw.c
@@ -74,6 +74,12 @@ int __olpc_ofw(const char *name, int nr_args, const void **args, int nr_res,
 }
 EXPORT_SYMBOL_GPL(__olpc_ofw);
 
+bool olpc_ofw_present(void)
+{
+	return olpc_ofw_cif != NULL;
+}
+EXPORT_SYMBOL_GPL(olpc_ofw_present);
+
 /* OFW cif _should_ be above this address */
 #define OFW_MIN 0xff000000
 
@@ -104,3 +110,8 @@ void __init olpc_ofw_detect(void)
 		(unsigned long)olpc_ofw_cif, (-start) >> 20);
 	reserve_top_address(-start);
 }
+
+bool __init olpc_ofw_is_installed(void)
+{
+	return olpc_ofw_cif != NULL;
+}
diff --git a/arch/x86/platform/scx200/Makefile b/arch/x86/platform/scx200/Makefile
new file mode 100644
index 000000000000..762b4c7f4314
--- /dev/null
+++ b/arch/x86/platform/scx200/Makefile
@@ -0,0 +1,2 @@
1obj-$(CONFIG_SCx200) += scx200.o
2scx200-y += scx200_32.o
diff --git a/arch/x86/kernel/scx200_32.c b/arch/x86/platform/scx200/scx200_32.c
index 7e004acbe526..7e004acbe526 100644
--- a/arch/x86/kernel/scx200_32.c
+++ b/arch/x86/platform/scx200/scx200_32.c
diff --git a/arch/x86/platform/sfi/Makefile b/arch/x86/platform/sfi/Makefile
new file mode 100644
index 000000000000..cc5db1168a5e
--- /dev/null
+++ b/arch/x86/platform/sfi/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_SFI) += sfi.o
diff --git a/arch/x86/kernel/sfi.c b/arch/x86/platform/sfi/sfi.c
index cb22acf3ed09..7785b72ecc3a 100644
--- a/arch/x86/kernel/sfi.c
+++ b/arch/x86/platform/sfi/sfi.c
@@ -34,23 +34,12 @@
 #ifdef CONFIG_X86_LOCAL_APIC
 static unsigned long sfi_lapic_addr __initdata = APIC_DEFAULT_PHYS_BASE;
 
-void __init mp_sfi_register_lapic_address(unsigned long address)
-{
-	mp_lapic_addr = address;
-
-	set_fixmap_nocache(FIX_APIC_BASE, mp_lapic_addr);
-	if (boot_cpu_physical_apicid == -1U)
-		boot_cpu_physical_apicid = read_apic_id();
-
-	pr_info("Boot CPU = %d\n", boot_cpu_physical_apicid);
-}
-
 /* All CPUs enumerated by SFI must be present and enabled */
-void __cpuinit mp_sfi_register_lapic(u8 id)
+static void __cpuinit mp_sfi_register_lapic(u8 id)
 {
-	if (MAX_APICS - id <= 0) {
+	if (MAX_LOCAL_APIC - id <= 0) {
 		pr_warning("Processor #%d invalid (max %d)\n",
-			id, MAX_APICS);
+			id, MAX_LOCAL_APIC);
 		return;
 	}
 
@@ -110,7 +99,7 @@ static int __init sfi_parse_ioapic(struct sfi_table_header *table)
 int __init sfi_platform_init(void)
 {
 #ifdef CONFIG_X86_LOCAL_APIC
-	mp_sfi_register_lapic_address(sfi_lapic_addr);
+	register_lapic_address(sfi_lapic_addr);
 	sfi_table_parse(SFI_SIG_CPUS, NULL, NULL, sfi_parse_cpus);
 #endif
 #ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/platform/uv/Makefile b/arch/x86/platform/uv/Makefile
new file mode 100644
index 000000000000..6c40995fefb8
--- /dev/null
+++ b/arch/x86/platform/uv/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_UV) += tlb_uv.o bios_uv.o uv_irq.o uv_sysfs.o uv_time.o
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/platform/uv/bios_uv.c
index 8bc57baaa9ad..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/platform/uv/bios_uv.c
diff --git a/arch/x86/platform/uv/tlb_uv.c b/arch/x86/platform/uv/tlb_uv.c
new file mode 100644
index 000000000000..68e467f69fec
--- /dev/null
+++ b/arch/x86/platform/uv/tlb_uv.c
@@ -0,0 +1,1857 @@
1/*
2 * SGI UltraViolet TLB flush routines.
3 *
4 * (c) 2008-2011 Cliff Wickman <cpw@sgi.com>, SGI.
5 *
6 * This code is released under the GNU General Public License version 2 or
7 * later.
8 */
9#include <linux/seq_file.h>
10#include <linux/proc_fs.h>
11#include <linux/debugfs.h>
12#include <linux/kernel.h>
13#include <linux/slab.h>
14#include <linux/delay.h>
15
16#include <asm/mmu_context.h>
17#include <asm/uv/uv.h>
18#include <asm/uv/uv_mmrs.h>
19#include <asm/uv/uv_hub.h>
20#include <asm/uv/uv_bau.h>
21#include <asm/apic.h>
22#include <asm/idle.h>
23#include <asm/tsc.h>
24#include <asm/irq_vectors.h>
25#include <asm/timer.h>
26
27/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
28static int timeout_base_ns[] = {
29 20,
30 160,
31 1280,
32 10240,
33 81920,
34 655360,
35 5242880,
36 167772160
37};
38
39static int timeout_us;
40static int nobau;
41static int baudisabled;
42static spinlock_t disable_lock;
43static cycles_t congested_cycles;
44
45/* tunables: */
46static int max_concurr = MAX_BAU_CONCURRENT;
47static int max_concurr_const = MAX_BAU_CONCURRENT;
48static int plugged_delay = PLUGGED_DELAY;
49static int plugsb4reset = PLUGSB4RESET;
50static int timeoutsb4reset = TIMEOUTSB4RESET;
51static int ipi_reset_limit = IPI_RESET_LIMIT;
52static int complete_threshold = COMPLETE_THRESHOLD;
53static int congested_respns_us = CONGESTED_RESPONSE_US;
54static int congested_reps = CONGESTED_REPS;
55static int congested_period = CONGESTED_PERIOD;
56
57static struct tunables tunables[] = {
58 {&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
59 {&plugged_delay, PLUGGED_DELAY},
60 {&plugsb4reset, PLUGSB4RESET},
61 {&timeoutsb4reset, TIMEOUTSB4RESET},
62 {&ipi_reset_limit, IPI_RESET_LIMIT},
63 {&complete_threshold, COMPLETE_THRESHOLD},
64 {&congested_respns_us, CONGESTED_RESPONSE_US},
65 {&congested_reps, CONGESTED_REPS},
66 {&congested_period, CONGESTED_PERIOD}
67};
68
69static struct dentry *tunables_dir;
70static struct dentry *tunables_file;
71
72/* these correspond to the statistics printed by ptc_seq_show() */
73static char *stat_description[] = {
74 "sent: number of shootdown messages sent",
75 "stime: time spent sending messages",
76 "numuvhubs: number of hubs targeted with shootdown",
77 "numuvhubs16: number times 16 or more hubs targeted",
78 "numuvhubs8: number times 8 or more hubs targeted",
79 "numuvhubs4: number times 4 or more hubs targeted",
80 "numuvhubs2: number times 2 or more hubs targeted",
81 "numuvhubs1: number times 1 hub targeted",
82 "numcpus: number of cpus targeted with shootdown",
83 "dto: number of destination timeouts",
84 "retries: destination timeout retries sent",
85 "rok: : destination timeouts successfully retried",
86 "resetp: ipi-style resource resets for plugs",
87 "resett: ipi-style resource resets for timeouts",
88 "giveup: fall-backs to ipi-style shootdowns",
89 "sto: number of source timeouts",
90 "bz: number of stay-busy's",
91 "throt: number times spun in throttle",
92 "swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
93 "recv: shootdown messages received",
94 "rtime: time spent processing messages",
95 "all: shootdown all-tlb messages",
96 "one: shootdown one-tlb messages",
97 "mult: interrupts that found multiple messages",
98 "none: interrupts that found no messages",
99 "retry: number of retry messages processed",
100 "canc: number messages canceled by retries",
101 "nocan: number retries that found nothing to cancel",
102 "reset: number of ipi-style reset requests processed",
103 "rcan: number messages canceled by reset requests",
104 "disable: number times use of the BAU was disabled",
105 "enable: number times use of the BAU was re-enabled"
106};
107
108static int __init
109setup_nobau(char *arg)
110{
111 nobau = 1;
112 return 0;
113}
114early_param("nobau", setup_nobau);
115
116/* base pnode in this partition */
117static int uv_base_pnode __read_mostly;
118/* position of pnode (which is nasid>>1): */
119static int uv_nshift __read_mostly;
120static unsigned long uv_mmask __read_mostly;
121
122static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
123static DEFINE_PER_CPU(struct bau_control, bau_control);
124static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
125
126/*
127 * Determine the first node on a uvhub. 'Nodes' are used for kernel
128 * memory allocation.
129 */
130static int __init uvhub_to_first_node(int uvhub)
131{
132 int node, b;
133
134 for_each_online_node(node) {
135 b = uv_node_to_blade_id(node);
136 if (uvhub == b)
137 return node;
138 }
139 return -1;
140}
141
142/*
143 * Determine the apicid of the first cpu on a uvhub.
144 */
145static int __init uvhub_to_first_apicid(int uvhub)
146{
147 int cpu;
148
149 for_each_present_cpu(cpu)
150 if (uvhub == uv_cpu_to_blade_id(cpu))
151 return per_cpu(x86_cpu_to_apicid, cpu);
152 return -1;
153}
154
155/*
156 * Free a software acknowledge hardware resource by clearing its Pending
157 * bit. This will return a reply to the sender.
158 * If the message has timed out, a reply has already been sent by the
159 * hardware but the resource has not been released. In that case our
160 * clear of the Timeout bit (as well) will free the resource. No reply will
161 * be sent (the hardware will only do one reply per message).
162 */
163static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
164{
165 unsigned long dw;
166 struct bau_pq_entry *msg;
167
168 msg = mdp->msg;
169 if (!msg->canceled) {
170 dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
171 write_mmr_sw_ack(dw);
172 }
173 msg->replied_to = 1;
174 msg->swack_vec = 0;
175}
176
177/*
178 * Process the receipt of a RETRY message
179 */
180static void bau_process_retry_msg(struct msg_desc *mdp,
181 struct bau_control *bcp)
182{
183 int i;
184 int cancel_count = 0;
185 unsigned long msg_res;
186 unsigned long mmr = 0;
187 struct bau_pq_entry *msg = mdp->msg;
188 struct bau_pq_entry *msg2;
189 struct ptc_stats *stat = bcp->statp;
190
191 stat->d_retries++;
192 /*
193 * cancel any message from msg+1 to the retry itself
194 */
195 for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
196 if (msg2 > mdp->queue_last)
197 msg2 = mdp->queue_first;
198 if (msg2 == msg)
199 break;
200
201 /* same conditions for cancellation as do_reset */
202 if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
203 (msg2->swack_vec) && ((msg2->swack_vec &
204 msg->swack_vec) == 0) &&
205 (msg2->sending_cpu == msg->sending_cpu) &&
206 (msg2->msg_type != MSG_NOOP)) {
207 mmr = read_mmr_sw_ack();
208 msg_res = msg2->swack_vec;
209 /*
210 * This is a message retry; clear the resources held
211 * by the previous message only if they timed out.
212 * If it has not timed out we have an unexpected
213 * situation to report.
214 */
215 if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
216 unsigned long mr;
217 /*
218 * is the resource timed out?
219 * make everyone ignore the cancelled message.
220 */
221 msg2->canceled = 1;
222 stat->d_canceled++;
223 cancel_count++;
224 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
225 write_mmr_sw_ack(mr);
226 }
227 }
228 }
229 if (!cancel_count)
230 stat->d_nocanceled++;
231}
232
233/*
234 * Do all the things a cpu should do for a TLB shootdown message.
235 * Other cpu's may come here at the same time for this message.
236 */
237static void bau_process_message(struct msg_desc *mdp,
238 struct bau_control *bcp)
239{
240 short socket_ack_count = 0;
241 short *sp;
242 struct atomic_short *asp;
243 struct ptc_stats *stat = bcp->statp;
244 struct bau_pq_entry *msg = mdp->msg;
245 struct bau_control *smaster = bcp->socket_master;
246
247 /*
248 * This must be a normal message, or retry of a normal message
249 */
250 if (msg->address == TLB_FLUSH_ALL) {
251 local_flush_tlb();
252 stat->d_alltlb++;
253 } else {
254 __flush_tlb_one(msg->address);
255 stat->d_onetlb++;
256 }
257 stat->d_requestee++;
258
259 /*
260 * One cpu on each uvhub has the additional job on a RETRY
261 * of releasing the resource held by the message that is
262 * being retried. That message is identified by sending
263 * cpu number.
264 */
265 if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
266 bau_process_retry_msg(mdp, bcp);
267
268 /*
269 * This is a swack message, so we have to reply to it.
270 * Count each responding cpu on the socket. This avoids
271 * pinging the count's cache line back and forth between
272 * the sockets.
273 */
274 sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
275 asp = (struct atomic_short *)sp;
276 socket_ack_count = atom_asr(1, asp);
277 if (socket_ack_count == bcp->cpus_in_socket) {
278 int msg_ack_count;
279 /*
280 * Both sockets dump their completed count total into
281 * the message's count.
282 */
283 smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
284 asp = (struct atomic_short *)&msg->acknowledge_count;
285 msg_ack_count = atom_asr(socket_ack_count, asp);
286
287 if (msg_ack_count == bcp->cpus_in_uvhub) {
288 /*
289 * All cpus in uvhub saw it; reply
290 */
291 reply_to_message(mdp, bcp);
292 }
293 }
294
295 return;
296}
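The acknowledge bookkeeping in bau_process_message() is hierarchical: each cpu bumps a per-socket counter, and only the last cpu on a socket folds that total into the message's own count, so the message cache line is written once per socket instead of once per cpu. A single-threaded illustration of the idea with made-up cpu counts (plain ints in place of the kernel's atomic_short operations):

#include <stdio.h>

#define SOCKETS		2
#define CPUS_PER_SOCKET	4

int main(void)
{
	int socket_count[SOCKETS] = {0, 0};
	int msg_count = 0;
	int s, c;

	for (s = 0; s < SOCKETS; s++) {
		for (c = 0; c < CPUS_PER_SOCKET; c++) {
			/* every cpu acknowledges on its own socket first */
			socket_count[s]++;
			if (socket_count[s] == CPUS_PER_SOCKET) {
				/* last cpu on the socket updates the message */
				msg_count += socket_count[s];
				socket_count[s] = 0;
			}
		}
	}
	if (msg_count == SOCKETS * CPUS_PER_SOCKET)
		printf("all %d cpus saw the message; reply now\n", msg_count);
	return 0;
}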
297
298/*
299 * Determine the first cpu on a uvhub.
300 */
301static int uvhub_to_first_cpu(int uvhub)
302{
303 int cpu;
304 for_each_present_cpu(cpu)
305 if (uvhub == uv_cpu_to_blade_id(cpu))
306 return cpu;
307 return -1;
308}
309
310/*
311 * Last resort when we get a large number of destination timeouts is
312 * to clear resources held by a given cpu.
313 * Do this with IPI so that all messages in the BAU message queue
314 * can be identified by their nonzero swack_vec field.
315 *
316 * This is entered for a single cpu on the uvhub.
 317 * The sender wants this uvhub to free a specific message's
318 * swack resources.
319 */
320static void do_reset(void *ptr)
321{
322 int i;
323 struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
324 struct reset_args *rap = (struct reset_args *)ptr;
325 struct bau_pq_entry *msg;
326 struct ptc_stats *stat = bcp->statp;
327
328 stat->d_resets++;
329 /*
330 * We're looking for the given sender, and
331 * will free its swack resource.
332 * If all cpu's finally responded after the timeout, its
333 * message 'replied_to' was set.
334 */
335 for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
336 unsigned long msg_res;
337 /* do_reset: same conditions for cancellation as
338 bau_process_retry_msg() */
339 if ((msg->replied_to == 0) &&
340 (msg->canceled == 0) &&
341 (msg->sending_cpu == rap->sender) &&
342 (msg->swack_vec) &&
343 (msg->msg_type != MSG_NOOP)) {
344 unsigned long mmr;
345 unsigned long mr;
346 /*
347 * make everyone else ignore this message
348 */
349 msg->canceled = 1;
350 /*
351 * only reset the resource if it is still pending
352 */
353 mmr = read_mmr_sw_ack();
354 msg_res = msg->swack_vec;
355 mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
356 if (mmr & msg_res) {
357 stat->d_rcanceled++;
358 write_mmr_sw_ack(mr);
359 }
360 }
361 }
362 return;
363}
364
365/*
366 * Use IPI to get all target uvhubs to release resources held by
367 * a given sending cpu number.
368 */
369static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender)
370{
371 int uvhub;
372 int maskbits;
373 cpumask_t mask;
374 struct reset_args reset_args;
375
376 reset_args.sender = sender;
377 cpus_clear(mask);
378 /* find a single cpu for each uvhub in this distribution mask */
379 maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE;
380 for (uvhub = 0; uvhub < maskbits; uvhub++) {
381 int cpu;
382 if (!bau_uvhub_isset(uvhub, distribution))
383 continue;
384 /* find a cpu for this uvhub */
385 cpu = uvhub_to_first_cpu(uvhub);
386 cpu_set(cpu, mask);
387 }
388
389 /* IPI all cpus; preemption is already disabled */
390 smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1);
391 return;
392}
393
394static inline unsigned long cycles_2_us(unsigned long long cyc)
395{
396 unsigned long long ns;
397 unsigned long us;
398 int cpu = smp_processor_id();
399
400 ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
401 us = ns / 1000;
402 return us;
403}
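cycles_2_us() is a fixed-point conversion: ns = (cyc * cyc2ns) >> CYC2NS_SCALE_FACTOR, then us = ns / 1000, where cyc2ns is the per-cpu scale the kernel maintains. A worked example with made-up numbers (SCALE and the 512 scale value are illustrative, not the kernel's actual constants):

#include <stdio.h>

#define SCALE 10	/* ns = (cyc * cyc2ns) >> SCALE */

int main(void)
{
	unsigned long long cyc2ns = 512;	/* 512 >> 10 = 0.5 ns/cycle, i.e. ~2 GHz */
	unsigned long long cyc = 4000000;	/* cycles measured */

	unsigned long long ns = (cyc * cyc2ns) >> SCALE;
	unsigned long long us = ns / 1000;

	printf("%llu cycles ~= %llu ns ~= %llu us\n", cyc, ns, us);
	return 0;
}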
404
405/*
406 * wait for all cpus on this hub to finish their sends and go quiet
407 * leaves uvhub_quiesce set so that no new broadcasts are started by
408 * bau_flush_send_and_wait()
409 */
410static inline void quiesce_local_uvhub(struct bau_control *hmaster)
411{
412 atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
413}
414
415/*
416 * mark this quiet-requestor as done
417 */
418static inline void end_uvhub_quiesce(struct bau_control *hmaster)
419{
420 atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
421}
422
423static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
424{
425 unsigned long descriptor_status;
426
427 descriptor_status = uv_read_local_mmr(mmr_offset);
428 descriptor_status >>= right_shift;
429 descriptor_status &= UV_ACT_STATUS_MASK;
430 return descriptor_status;
431}
432
433/*
434 * Wait for completion of a broadcast software ack message
435 * return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
436 */
437static int uv1_wait_completion(struct bau_desc *bau_desc,
438 unsigned long mmr_offset, int right_shift,
439 struct bau_control *bcp, long try)
440{
441 unsigned long descriptor_status;
442 cycles_t ttm;
443 struct ptc_stats *stat = bcp->statp;
444
445 descriptor_status = uv1_read_status(mmr_offset, right_shift);
446 /* spin on the status MMR, waiting for it to go idle */
447 while ((descriptor_status != DS_IDLE)) {
448 /*
449 * Our software ack messages may be blocked because
450 * there are no swack resources available. As long
451 * as none of them has timed out hardware will NACK
452 * our message and its state will stay IDLE.
453 */
454 if (descriptor_status == DS_SOURCE_TIMEOUT) {
455 stat->s_stimeout++;
456 return FLUSH_GIVEUP;
457 } else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
458 stat->s_dtimeout++;
459 ttm = get_cycles();
460
461 /*
462 * Our retries may be blocked by all destination
463 * swack resources being consumed, and a timeout
464 * pending. In that case hardware returns the
465 * ERROR that looks like a destination timeout.
466 */
467 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
468 bcp->conseccompletes = 0;
469 return FLUSH_RETRY_PLUGGED;
470 }
471
472 bcp->conseccompletes = 0;
473 return FLUSH_RETRY_TIMEOUT;
474 } else {
475 /*
476 * descriptor_status is still BUSY
477 */
478 cpu_relax();
479 }
480 descriptor_status = uv1_read_status(mmr_offset, right_shift);
481 }
482 bcp->conseccompletes++;
483 return FLUSH_COMPLETE;
484}
485
486/*
487 * UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
488 */
489static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
490{
491 unsigned long descriptor_status;
492 unsigned long descriptor_status2;
493
494 descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
495 descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
496 descriptor_status = (descriptor_status << 1) | descriptor_status2;
497 return descriptor_status;
498}
499
500static int uv2_wait_completion(struct bau_desc *bau_desc,
501 unsigned long mmr_offset, int right_shift,
502 struct bau_control *bcp, long try)
503{
504 unsigned long descriptor_stat;
505 cycles_t ttm;
506 int cpu = bcp->uvhub_cpu;
507 struct ptc_stats *stat = bcp->statp;
508
509 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
510
511 /* spin on the status MMR, waiting for it to go idle */
512 while (descriptor_stat != UV2H_DESC_IDLE) {
513 /*
514 * Our software ack messages may be blocked because
515 * there are no swack resources available. As long
516 * as none of them has timed out hardware will NACK
517 * our message and its state will stay IDLE.
518 */
519 if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
520 (descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
521 (descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
522 stat->s_stimeout++;
523 return FLUSH_GIVEUP;
524 } else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
525 stat->s_dtimeout++;
526 ttm = get_cycles();
527 /*
528 * Our retries may be blocked by all destination
529 * swack resources being consumed, and a timeout
530 * pending. In that case hardware returns the
531 * ERROR that looks like a destination timeout.
532 */
533 if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
534 bcp->conseccompletes = 0;
535 return FLUSH_RETRY_PLUGGED;
536 }
537 bcp->conseccompletes = 0;
538 return FLUSH_RETRY_TIMEOUT;
539 } else {
540 /*
541 * descriptor_stat is still BUSY
542 */
543 cpu_relax();
544 }
545 descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
546 }
547 bcp->conseccompletes++;
548 return FLUSH_COMPLETE;
549}
550
551/*
 551 * There are 2 status registers; each an array[32] of 2 bits. Set up for
553 * which register to read and position in that register based on cpu in
554 * current hub.
555 */
556static int wait_completion(struct bau_desc *bau_desc,
557 struct bau_control *bcp, long try)
558{
559 int right_shift;
560 unsigned long mmr_offset;
561 int cpu = bcp->uvhub_cpu;
562
563 if (cpu < UV_CPUS_PER_AS) {
564 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
565 right_shift = cpu * UV_ACT_STATUS_SIZE;
566 } else {
567 mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
568 right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
569 }
570
571 if (is_uv1_hub())
572 return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
573 bcp, try);
574 else
575 return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
576 bcp, try);
577}
578
579static inline cycles_t sec_2_cycles(unsigned long sec)
580{
581 unsigned long ns;
582 cycles_t cyc;
583
584 ns = sec * 1000000000;
585 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
586 return cyc;
587}
588
589/*
590 * Our retries are blocked by all destination sw ack resources being
591 * in use, and a timeout is pending. In that case hardware immediately
592 * returns the ERROR that looks like a destination timeout.
593 */
594static void destination_plugged(struct bau_desc *bau_desc,
595 struct bau_control *bcp,
596 struct bau_control *hmaster, struct ptc_stats *stat)
597{
598 udelay(bcp->plugged_delay);
599 bcp->plugged_tries++;
600
601 if (bcp->plugged_tries >= bcp->plugsb4reset) {
602 bcp->plugged_tries = 0;
603
604 quiesce_local_uvhub(hmaster);
605
606 spin_lock(&hmaster->queue_lock);
607 reset_with_ipi(&bau_desc->distribution, bcp->cpu);
608 spin_unlock(&hmaster->queue_lock);
609
610 end_uvhub_quiesce(hmaster);
611
612 bcp->ipi_attempts++;
613 stat->s_resets_plug++;
614 }
615}
616
617static void destination_timeout(struct bau_desc *bau_desc,
618 struct bau_control *bcp, struct bau_control *hmaster,
619 struct ptc_stats *stat)
620{
621 hmaster->max_concurr = 1;
622 bcp->timeout_tries++;
623 if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
624 bcp->timeout_tries = 0;
625
626 quiesce_local_uvhub(hmaster);
627
628 spin_lock(&hmaster->queue_lock);
629 reset_with_ipi(&bau_desc->distribution, bcp->cpu);
630 spin_unlock(&hmaster->queue_lock);
631
632 end_uvhub_quiesce(hmaster);
633
634 bcp->ipi_attempts++;
635 stat->s_resets_timeout++;
636 }
637}
638
639/*
640 * Completions are taking a very long time due to a congested numalink
641 * network.
642 */
643static void disable_for_congestion(struct bau_control *bcp,
644 struct ptc_stats *stat)
645{
646 /* let only one cpu do this disabling */
647 spin_lock(&disable_lock);
648
649 if (!baudisabled && bcp->period_requests &&
650 ((bcp->period_time / bcp->period_requests) > congested_cycles)) {
651 int tcpu;
652 struct bau_control *tbcp;
653 /* it becomes this cpu's job to turn on the use of the
654 BAU again */
655 baudisabled = 1;
656 bcp->set_bau_off = 1;
657 bcp->set_bau_on_time = get_cycles();
658 bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
659 stat->s_bau_disabled++;
660 for_each_present_cpu(tcpu) {
661 tbcp = &per_cpu(bau_control, tcpu);
662 tbcp->baudisabled = 1;
663 }
664 }
665
666 spin_unlock(&disable_lock);
667}
668
669static void count_max_concurr(int stat, struct bau_control *bcp,
670 struct bau_control *hmaster)
671{
672 bcp->plugged_tries = 0;
673 bcp->timeout_tries = 0;
674 if (stat != FLUSH_COMPLETE)
675 return;
676 if (bcp->conseccompletes <= bcp->complete_threshold)
677 return;
678 if (hmaster->max_concurr >= hmaster->max_concurr_const)
679 return;
680 hmaster->max_concurr++;
681}
682
683static void record_send_stats(cycles_t time1, cycles_t time2,
684 struct bau_control *bcp, struct ptc_stats *stat,
685 int completion_status, int try)
686{
687 cycles_t elapsed;
688
689 if (time2 > time1) {
690 elapsed = time2 - time1;
691 stat->s_time += elapsed;
692
693 if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
694 bcp->period_requests++;
695 bcp->period_time += elapsed;
696 if ((elapsed > congested_cycles) &&
697 (bcp->period_requests > bcp->cong_reps))
698 disable_for_congestion(bcp, stat);
699 }
700 } else
701 stat->s_requestor--;
702
703 if (completion_status == FLUSH_COMPLETE && try > 1)
704 stat->s_retriesok++;
705 else if (completion_status == FLUSH_GIVEUP)
706 stat->s_giveup++;
707}
708
709/*
710 * Because of a uv1 hardware bug only a limited number of concurrent
711 * requests can be made.
712 */
713static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
714{
715 spinlock_t *lock = &hmaster->uvhub_lock;
716 atomic_t *v;
717
718 v = &hmaster->active_descriptor_count;
719 if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
720 stat->s_throttles++;
721 do {
722 cpu_relax();
723 } while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
724 }
725}
726
727/*
728 * Handle the completion status of a message send.
729 */
730static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
731 struct bau_control *bcp, struct bau_control *hmaster,
732 struct ptc_stats *stat)
733{
734 if (completion_status == FLUSH_RETRY_PLUGGED)
735 destination_plugged(bau_desc, bcp, hmaster, stat);
736 else if (completion_status == FLUSH_RETRY_TIMEOUT)
737 destination_timeout(bau_desc, bcp, hmaster, stat);
738}
739
740/*
741 * Send a broadcast and wait for it to complete.
742 *
743 * The flush_mask contains the cpus the broadcast is to be sent to including
744 * cpus that are on the local uvhub.
745 *
746 * Returns 0 if all flushing represented in the mask was done.
747 * Returns 1 if it gives up entirely and the original cpu mask is to be
748 * returned to the kernel.
749 */
750int uv_flush_send_and_wait(struct bau_desc *bau_desc,
751 struct cpumask *flush_mask, struct bau_control *bcp)
752{
753 int seq_number = 0;
754 int completion_stat = 0;
755 long try = 0;
756 unsigned long index;
757 cycles_t time1;
758 cycles_t time2;
759 struct ptc_stats *stat = bcp->statp;
760 struct bau_control *hmaster = bcp->uvhub_master;
761
762 if (is_uv1_hub())
763 uv1_throttle(hmaster, stat);
764
765 while (hmaster->uvhub_quiesce)
766 cpu_relax();
767
768 time1 = get_cycles();
769 do {
770 if (try == 0) {
771 bau_desc->header.msg_type = MSG_REGULAR;
772 seq_number = bcp->message_number++;
773 } else {
774 bau_desc->header.msg_type = MSG_RETRY;
775 stat->s_retry_messages++;
776 }
777
778 bau_desc->header.sequence = seq_number;
779 index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
780 bcp->send_message = get_cycles();
781
782 write_mmr_activation(index);
783
784 try++;
785 completion_stat = wait_completion(bau_desc, bcp, try);
786
787 handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
788
789 if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
790 bcp->ipi_attempts = 0;
791 completion_stat = FLUSH_GIVEUP;
792 break;
793 }
794 cpu_relax();
795 } while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
796 (completion_stat == FLUSH_RETRY_TIMEOUT));
797
798 time2 = get_cycles();
799
800 count_max_concurr(completion_stat, bcp, hmaster);
801
802 while (hmaster->uvhub_quiesce)
803 cpu_relax();
804
805 atomic_dec(&hmaster->active_descriptor_count);
806
807 record_send_stats(time1, time2, bcp, stat, completion_stat, try);
808
809 if (completion_stat == FLUSH_GIVEUP)
810 return 1;
811 return 0;
812}
813
814/*
815 * The BAU is disabled. When the disabled time period has expired, the cpu
816 * that disabled it must re-enable it.
817 * Return 0 if it is re-enabled for all cpus.
818 */
819static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
820{
821 int tcpu;
822 struct bau_control *tbcp;
823
824 if (bcp->set_bau_off) {
825 if (get_cycles() >= bcp->set_bau_on_time) {
826 stat->s_bau_reenabled++;
827 baudisabled = 0;
828 for_each_present_cpu(tcpu) {
829 tbcp = &per_cpu(bau_control, tcpu);
830 tbcp->baudisabled = 0;
831 tbcp->period_requests = 0;
832 tbcp->period_time = 0;
833 }
834 return 0;
835 }
836 }
837 return -1;
838}
839
840static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
841 int remotes, struct bau_desc *bau_desc)
842{
843 stat->s_requestor++;
844 stat->s_ntargcpu += remotes + locals;
845 stat->s_ntargremotes += remotes;
846 stat->s_ntarglocals += locals;
847
848 /* uvhub statistics */
849 hubs = bau_uvhub_weight(&bau_desc->distribution);
850 if (locals) {
851 stat->s_ntarglocaluvhub++;
852 stat->s_ntargremoteuvhub += (hubs - 1);
853 } else
854 stat->s_ntargremoteuvhub += hubs;
855
856 stat->s_ntarguvhub += hubs;
857
858 if (hubs >= 16)
859 stat->s_ntarguvhub16++;
860 else if (hubs >= 8)
861 stat->s_ntarguvhub8++;
862 else if (hubs >= 4)
863 stat->s_ntarguvhub4++;
864 else if (hubs >= 2)
865 stat->s_ntarguvhub2++;
866 else
867 stat->s_ntarguvhub1++;
868}
869
870/*
871 * Translate a cpu mask to the uvhub distribution mask in the BAU
872 * activation descriptor.
873 */
874static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
875 struct bau_desc *bau_desc, int *localsp, int *remotesp)
876{
877 int cpu;
878 int pnode;
879 int cnt = 0;
880 struct hub_and_pnode *hpp;
881
882 for_each_cpu(cpu, flush_mask) {
883 /*
884 * The distribution vector is a bit map of pnodes, relative
885 * to the partition base pnode (and the partition base nasid
886 * in the header).
887 * Translate cpu to pnode and hub using a local memory array.
888 */
889 hpp = &bcp->socket_master->thp[cpu];
890 pnode = hpp->pnode - bcp->partition_base_pnode;
891 bau_uvhub_set(pnode, &bau_desc->distribution);
892 cnt++;
893 if (hpp->uvhub == bcp->uvhub)
894 (*localsp)++;
895 else
896 (*remotesp)++;
897 }
898 if (!cnt)
899 return 1;
900 return 0;
901}
902
903/*
904 * globally purge translation cache of a virtual address or all TLB's
905 * @cpumask: mask of all cpu's in which the address is to be removed
906 * @mm: mm_struct containing virtual address range
907 * @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
908 * @cpu: the current cpu
909 *
910 * This is the entry point for initiating any UV global TLB shootdown.
911 *
912 * Purges the translation caches of all specified processors of the given
913 * virtual address, or purges all TLB's on specified processors.
914 *
915 * The caller has derived the cpumask from the mm_struct. This function
916 * is called only if there are bits set in the mask. (e.g. flush_tlb_page())
917 *
918 * The cpumask is converted into a uvhubmask of the uvhubs containing
919 * those cpus.
920 *
921 * Note that this function should be called with preemption disabled.
922 *
923 * Returns NULL if all remote flushing was done.
924 * Returns pointer to cpumask if some remote flushing remains to be
925 * done. The returned pointer is valid till preemption is re-enabled.
926 */
927const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
928 struct mm_struct *mm, unsigned long va,
929 unsigned int cpu)
930{
931 int locals = 0;
932 int remotes = 0;
933 int hubs = 0;
934 struct bau_desc *bau_desc;
935 struct cpumask *flush_mask;
936 struct ptc_stats *stat;
937 struct bau_control *bcp;
938
939 /* kernel was booted 'nobau' */
940 if (nobau)
941 return cpumask;
942
943 bcp = &per_cpu(bau_control, cpu);
944 stat = bcp->statp;
945
946 /* bau was disabled due to slow response */
947 if (bcp->baudisabled) {
948 if (check_enable(bcp, stat))
949 return cpumask;
950 }
951
952 /*
953 * Each sending cpu has a per-cpu mask which it fills from the caller's
954 * cpu mask. All cpus are converted to uvhubs and copied to the
955 * activation descriptor.
956 */
957 flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
958 /* don't actually do a shootdown of the local cpu */
959 cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
960
961 if (cpu_isset(cpu, *cpumask))
962 stat->s_ntargself++;
963
964 bau_desc = bcp->descriptor_base;
965 bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
966 bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
967 if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
968 return NULL;
969
970 record_send_statistics(stat, locals, hubs, remotes, bau_desc);
971
972 bau_desc->payload.address = va;
973 bau_desc->payload.sending_cpu = cpu;
974 /*
975 * uv_flush_send_and_wait returns 0 if all cpu's were messaged,
976 * or 1 if it gave up and the original cpumask should be returned.
977 */
978 if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
979 return NULL;
980 else
981 return cpumask;
982}
983
984/*
985 * The BAU message interrupt comes here. (registered by set_intr_gate)
986 * See entry_64.S
987 *
988 * We received a broadcast assist message.
989 *
990 * Interrupts are disabled; this interrupt could represent
991 * the receipt of several messages.
992 *
993 * All cores/threads on this hub get this interrupt.
994 * The last one to see it does the software ack.
995 * (the resource will not be freed until noninterruptable cpus see this
996 * interrupt; hardware may timeout the s/w ack and reply ERROR)
997 */
998void uv_bau_message_interrupt(struct pt_regs *regs)
999{
1000 int count = 0;
1001 cycles_t time_start;
1002 struct bau_pq_entry *msg;
1003 struct bau_control *bcp;
1004 struct ptc_stats *stat;
1005 struct msg_desc msgdesc;
1006
1007 time_start = get_cycles();
1008
1009 bcp = &per_cpu(bau_control, smp_processor_id());
1010 stat = bcp->statp;
1011
1012 msgdesc.queue_first = bcp->queue_first;
1013 msgdesc.queue_last = bcp->queue_last;
1014
1015 msg = bcp->bau_msg_head;
1016 while (msg->swack_vec) {
1017 count++;
1018
1019 msgdesc.msg_slot = msg - msgdesc.queue_first;
1020 msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
1021 msgdesc.msg = msg;
1022 bau_process_message(&msgdesc, bcp);
1023
1024 msg++;
1025 if (msg > msgdesc.queue_last)
1026 msg = msgdesc.queue_first;
1027 bcp->bau_msg_head = msg;
1028 }
1029 stat->d_time += (get_cycles() - time_start);
1030 if (!count)
1031 stat->d_nomsg++;
1032 else if (count > 1)
1033 stat->d_multmsg++;
1034
1035 ack_APIC_irq();
1036}
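uv_bau_message_interrupt() consumes the payload queue as a ring: it advances bau_msg_head one entry at a time, wraps from queue_last back to queue_first, and stops at the first entry whose swack_vec is zero. A stand-alone sketch of that wrap-around walk (the queue size and the plain 'pending' flag are hypothetical stand-ins for the real entry layout):

#include <stdio.h>

#define DEST_Q_SIZE 8

struct entry { int pending; int payload; };

int main(void)
{
	struct entry queue[DEST_Q_SIZE] = {
		[5] = {1, 50}, [6] = {1, 60}, [7] = {1, 70}, [0] = {1, 80},
	};
	struct entry *first = queue, *last = queue + DEST_Q_SIZE - 1;
	struct entry *msg = queue + 5;		/* current head */

	while (msg->pending) {
		printf("handling payload %d (slot %td)\n",
		       msg->payload, msg - first);
		msg->pending = 0;		/* consume it */
		msg++;
		if (msg > last)
			msg = first;		/* wrap to the start */
	}
	return 0;
}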
1037
1038/*
1039 * Each target uvhub (i.e. a uvhub that has cpu's) needs to have
1040 * shootdown message timeouts enabled. The timeout does not cause
1041 * an interrupt, but causes an error message to be returned to
1042 * the sender.
1043 */
1044static void __init enable_timeouts(void)
1045{
1046 int uvhub;
1047 int nuvhubs;
1048 int pnode;
1049 unsigned long mmr_image;
1050
1051 nuvhubs = uv_num_possible_blades();
1052
1053 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1054 if (!uv_blade_nr_possible_cpus(uvhub))
1055 continue;
1056
1057 pnode = uv_blade_to_pnode(uvhub);
1058 mmr_image = read_mmr_misc_control(pnode);
1059 /*
1060 * Set the timeout period and then lock it in, in three
1061 * steps; captures and locks in the period.
1062 *
1063 * To program the period, the SOFT_ACK_MODE must be off.
1064 */
1065 mmr_image &= ~(1L << SOFTACK_MSHIFT);
1066 write_mmr_misc_control(pnode, mmr_image);
1067 /*
1068 * Set the 4-bit period.
1069 */
1070 mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
1071 mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
1072 write_mmr_misc_control(pnode, mmr_image);
1073 /*
1074 * UV1:
1075 * Subsequent reversals of the timebase bit (3) cause an
1076 * immediate timeout of one or all INTD resources as
1077 * indicated in bits 2:0 (7 causes all of them to timeout).
1078 */
1079 mmr_image |= (1L << SOFTACK_MSHIFT);
1080 if (is_uv2_hub()) {
1081 mmr_image |= (1L << UV2_LEG_SHFT);
1082 mmr_image |= (1L << UV2_EXT_SHFT);
1083 }
1084 write_mmr_misc_control(pnode, mmr_image);
1085 }
1086}
1087
1088static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
1089{
1090 if (*offset < num_possible_cpus())
1091 return offset;
1092 return NULL;
1093}
1094
1095static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
1096{
1097 (*offset)++;
1098 if (*offset < num_possible_cpus())
1099 return offset;
1100 return NULL;
1101}
1102
1103static void ptc_seq_stop(struct seq_file *file, void *data)
1104{
1105}
1106
1107static inline unsigned long long usec_2_cycles(unsigned long microsec)
1108{
1109 unsigned long ns;
1110 unsigned long long cyc;
1111
1112 ns = microsec * 1000;
1113 cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
1114 return cyc;
1115}
1116
1117/*
1118 * Display the statistics thru /proc/sgi_uv/ptc_statistics
1119 * 'data' points to the cpu number
1120 * Note: see the descriptions in stat_description[].
1121 */
1122static int ptc_seq_show(struct seq_file *file, void *data)
1123{
1124 struct ptc_stats *stat;
1125 int cpu;
1126
1127 cpu = *(loff_t *)data;
1128 if (!cpu) {
1129 seq_printf(file,
1130 "# cpu sent stime self locals remotes ncpus localhub ");
1131 seq_printf(file,
1132 "remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1133 seq_printf(file,
1134 "numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
1135 seq_printf(file,
1136 "resetp resett giveup sto bz throt swack recv rtime ");
1137 seq_printf(file,
1138 "all one mult none retry canc nocan reset rcan ");
1139 seq_printf(file,
1140 "disable enable\n");
1141 }
1142 if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1143 stat = &per_cpu(ptcstats, cpu);
1144 /* source side statistics */
1145 seq_printf(file,
1146 "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1147 cpu, stat->s_requestor, cycles_2_us(stat->s_time),
1148 stat->s_ntargself, stat->s_ntarglocals,
1149 stat->s_ntargremotes, stat->s_ntargcpu,
1150 stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
1151 stat->s_ntarguvhub, stat->s_ntarguvhub16);
1152 seq_printf(file, "%ld %ld %ld %ld %ld ",
1153 stat->s_ntarguvhub8, stat->s_ntarguvhub4,
1154 stat->s_ntarguvhub2, stat->s_ntarguvhub1,
1155 stat->s_dtimeout);
1156 seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
1157 stat->s_retry_messages, stat->s_retriesok,
1158 stat->s_resets_plug, stat->s_resets_timeout,
1159 stat->s_giveup, stat->s_stimeout,
1160 stat->s_busy, stat->s_throttles);
1161
1162 /* destination side statistics */
1163 seq_printf(file,
1164 "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1165 read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1166 stat->d_requestee, cycles_2_us(stat->d_time),
1167 stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
1168 stat->d_nomsg, stat->d_retries, stat->d_canceled,
1169 stat->d_nocanceled, stat->d_resets,
1170 stat->d_rcanceled);
1171 seq_printf(file, "%ld %ld\n",
1172 stat->s_bau_disabled, stat->s_bau_reenabled);
1173 }
1174 return 0;
1175}
1176
1177/*
1178 * Display the tunables thru debugfs
1179 */
1180static ssize_t tunables_read(struct file *file, char __user *userbuf,
1181 size_t count, loff_t *ppos)
1182{
1183 char *buf;
1184 int ret;
1185
1186 buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1187 "max_concur plugged_delay plugsb4reset",
1188 "timeoutsb4reset ipi_reset_limit complete_threshold",
1189 "congested_response_us congested_reps congested_period",
1190 max_concurr, plugged_delay, plugsb4reset,
1191 timeoutsb4reset, ipi_reset_limit, complete_threshold,
1192 congested_respns_us, congested_reps, congested_period);
1193
1194 if (!buf)
1195 return -ENOMEM;
1196
1197 ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
1198 kfree(buf);
1199 return ret;
1200}
1201
1202/*
1203 * handle a write to /proc/sgi_uv/ptc_statistics
1204 * -1: reset the statistics
1205 * 0: display meaning of the statistics
1206 */
1207static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1208 size_t count, loff_t *data)
1209{
1210 int cpu;
1211 int i;
1212 int elements;
1213 long input_arg;
1214 char optstr[64];
1215 struct ptc_stats *stat;
1216
1217 if (count == 0 || count > sizeof(optstr))
1218 return -EINVAL;
1219 if (copy_from_user(optstr, user, count))
1220 return -EFAULT;
1221 optstr[count - 1] = '\0';
1222
1223 if (strict_strtol(optstr, 10, &input_arg) < 0) {
1224 printk(KERN_DEBUG "%s is invalid\n", optstr);
1225 return -EINVAL;
1226 }
1227
1228 if (input_arg == 0) {
1229 elements = sizeof(stat_description)/sizeof(*stat_description);
1230 printk(KERN_DEBUG "# cpu: cpu number\n");
1231 printk(KERN_DEBUG "Sender statistics:\n");
1232 for (i = 0; i < elements; i++)
1233 printk(KERN_DEBUG "%s\n", stat_description[i]);
1234 } else if (input_arg == -1) {
1235 for_each_present_cpu(cpu) {
1236 stat = &per_cpu(ptcstats, cpu);
1237 memset(stat, 0, sizeof(struct ptc_stats));
1238 }
1239 }
1240
1241 return count;
1242}
1243
1244static int local_atoi(const char *name)
1245{
1246 int val = 0;
1247
1248 for (;; name++) {
1249 switch (*name) {
1250 case '0' ... '9':
1251 val = 10*val+(*name-'0');
1252 break;
1253 default:
1254 return val;
1255 }
1256 }
1257}
1258
1259/*
1260 * Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
1261 * Zero values reset them to defaults.
1262 */
1263static int parse_tunables_write(struct bau_control *bcp, char *instr,
1264 int count)
1265{
1266 char *p;
1267 char *q;
1268 int cnt = 0;
1269 int val;
1270 int e = sizeof(tunables) / sizeof(*tunables);
1271
1272 p = instr + strspn(instr, WHITESPACE);
1273 q = p;
1274 for (; *p; p = q + strspn(q, WHITESPACE)) {
1275 q = p + strcspn(p, WHITESPACE);
1276 cnt++;
1277 if (q == p)
1278 break;
1279 }
1280 if (cnt != e) {
1281 printk(KERN_INFO "bau tunable error: should be %d values\n", e);
1282 return -EINVAL;
1283 }
1284
1285 p = instr + strspn(instr, WHITESPACE);
1286 q = p;
1287 for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1288 q = p + strcspn(p, WHITESPACE);
1289 val = local_atoi(p);
1290 switch (cnt) {
1291 case 0:
1292 if (val == 0) {
1293 max_concurr = MAX_BAU_CONCURRENT;
1294 max_concurr_const = MAX_BAU_CONCURRENT;
1295 continue;
1296 }
1297 if (val < 1 || val > bcp->cpus_in_uvhub) {
1298 printk(KERN_DEBUG
1299 "Error: BAU max concurrent %d is invalid\n",
1300 val);
1301 return -EINVAL;
1302 }
1303 max_concurr = val;
1304 max_concurr_const = val;
1305 continue;
1306 default:
1307 if (val == 0)
1308 *tunables[cnt].tunp = tunables[cnt].deflt;
1309 else
1310 *tunables[cnt].tunp = val;
1311 continue;
1312 }
1313 if (q == p)
1314 break;
1315 }
1316 return 0;
1317}
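parse_tunables_write() scans the input twice with strspn()/strcspn(): the first pass only counts whitespace-separated tokens so a wrong count can be rejected, the second pass converts each token. A user-space sketch of the counting pass (the input string is made up):

#include <stdio.h>
#include <string.h>

#define WS " \t\n"

int main(void)
{
	char instr[] = " 16 10 3 2 2 5 1000 23 30\n";
	char *p, *q;
	int cnt = 0;

	p = instr + strspn(instr, WS);	/* skip leading whitespace */
	q = p;
	for (; *p; p = q + strspn(q, WS)) {
		q = p + strcspn(p, WS);	/* q = end of this token */
		if (q == p)
			break;
		cnt++;
	}
	printf("%d tokens\n", cnt);
	return 0;
}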
1318
1319/*
1320 * Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
1321 */
1322static ssize_t tunables_write(struct file *file, const char __user *user,
1323 size_t count, loff_t *data)
1324{
1325 int cpu;
1326 int ret;
1327 char instr[100];
1328 struct bau_control *bcp;
1329
1330 if (count == 0 || count > sizeof(instr)-1)
1331 return -EINVAL;
1332 if (copy_from_user(instr, user, count))
1333 return -EFAULT;
1334
1335 instr[count] = '\0';
1336
1337 bcp = &per_cpu(bau_control, smp_processor_id());
1338
1339 ret = parse_tunables_write(bcp, instr, count);
1340 if (ret)
1341 return ret;
1342
1343 for_each_present_cpu(cpu) {
1344 bcp = &per_cpu(bau_control, cpu);
1345 bcp->max_concurr = max_concurr;
1346 bcp->max_concurr_const = max_concurr;
1347 bcp->plugged_delay = plugged_delay;
1348 bcp->plugsb4reset = plugsb4reset;
1349 bcp->timeoutsb4reset = timeoutsb4reset;
1350 bcp->ipi_reset_limit = ipi_reset_limit;
1351 bcp->complete_threshold = complete_threshold;
1352 bcp->cong_response_us = congested_respns_us;
1353 bcp->cong_reps = congested_reps;
1354 bcp->cong_period = congested_period;
1355 }
1356 return count;
1357}
1358
1359static const struct seq_operations uv_ptc_seq_ops = {
1360 .start = ptc_seq_start,
1361 .next = ptc_seq_next,
1362 .stop = ptc_seq_stop,
1363 .show = ptc_seq_show
1364};
1365
1366static int ptc_proc_open(struct inode *inode, struct file *file)
1367{
1368 return seq_open(file, &uv_ptc_seq_ops);
1369}
1370
1371static int tunables_open(struct inode *inode, struct file *file)
1372{
1373 return 0;
1374}
1375
1376static const struct file_operations proc_uv_ptc_operations = {
1377 .open = ptc_proc_open,
1378 .read = seq_read,
1379 .write = ptc_proc_write,
1380 .llseek = seq_lseek,
1381 .release = seq_release,
1382};
1383
1384static const struct file_operations tunables_fops = {
1385 .open = tunables_open,
1386 .read = tunables_read,
1387 .write = tunables_write,
1388 .llseek = default_llseek,
1389};
1390
1391static int __init uv_ptc_init(void)
1392{
1393 struct proc_dir_entry *proc_uv_ptc;
1394
1395 if (!is_uv_system())
1396 return 0;
1397
1398 proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1399 &proc_uv_ptc_operations);
1400 if (!proc_uv_ptc) {
1401 printk(KERN_ERR "unable to create %s proc entry\n",
1402 UV_PTC_BASENAME);
1403 return -EINVAL;
1404 }
1405
1406 tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1407 if (!tunables_dir) {
1408 printk(KERN_ERR "unable to create debugfs directory %s\n",
1409 UV_BAU_TUNABLES_DIR);
1410 return -EINVAL;
1411 }
1412 tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1413 tunables_dir, NULL, &tunables_fops);
1414 if (!tunables_file) {
1415 printk(KERN_ERR "unable to create debugfs file %s\n",
1416 UV_BAU_TUNABLES_FILE);
1417 return -EINVAL;
1418 }
1419 return 0;
1420}
1421
1422/*
1423 * Initialize the sending side's sending buffers.
1424 */
1425static void activation_descriptor_init(int node, int pnode, int base_pnode)
1426{
1427 int i;
1428 int cpu;
1429 unsigned long pa;
1430 unsigned long m;
1431 unsigned long n;
1432 size_t dsize;
1433 struct bau_desc *bau_desc;
1434 struct bau_desc *bd2;
1435 struct bau_control *bcp;
1436
1437 /*
1438 * each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
1439 * per cpu; and one per cpu on the uvhub (ADP_SZ)
1440 */
1441 dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
1442 bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1443 BUG_ON(!bau_desc);
1444
1445 pa = uv_gpa(bau_desc); /* need the real nasid*/
1446 n = pa >> uv_nshift;
1447 m = pa & uv_mmask;
1448
1449 /* the 14-bit pnode */
1450 write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
1451 /*
1452 * Initializing all 8 (ITEMS_PER_DESC) descriptors for each
1453 * cpu even though we only use the first one; one descriptor can
1454 * describe a broadcast to 256 uv hubs.
1455 */
1456 for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1457 memset(bd2, 0, sizeof(struct bau_desc));
1458 bd2->header.swack_flag = 1;
1459 /*
1460 * The base_dest_nasid set in the message header is the nasid
1461 * of the first uvhub in the partition. The bit map will
1462 * indicate destination pnode numbers relative to that base.
1463 * They may not be consecutive if nasid striding is being used.
1464 */
1465 bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
1466 bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
1467 bd2->header.command = UV_NET_ENDPOINT_INTD;
1468 bd2->header.int_both = 1;
1469 /*
1470 * all others need to be set to zero:
1471 * fairness chaining multilevel count replied_to
1472 */
1473 }
1474 for_each_present_cpu(cpu) {
1475 if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1476 continue;
1477 bcp = &per_cpu(bau_control, cpu);
1478 bcp->descriptor_base = bau_desc;
1479 }
1480}
1481
1482/*
1483 * initialize the destination side's receiving buffers
1484 * entered for each uvhub in the partition
1485 * - node is first node (kernel memory notion) on the uvhub
1486 * - pnode is the uvhub's physical identifier
1487 */
1488static void pq_init(int node, int pnode)
1489{
1490 int cpu;
1491 size_t plsize;
1492 char *cp;
1493 void *vp;
1494 unsigned long pn;
1495 unsigned long first;
1496 unsigned long pn_first;
1497 unsigned long last;
1498 struct bau_pq_entry *pqp;
1499 struct bau_control *bcp;
1500
1501 plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
1502 vp = kmalloc_node(plsize, GFP_KERNEL, node);
1503 pqp = (struct bau_pq_entry *)vp;
1504 BUG_ON(!pqp);
1505
1506 cp = (char *)pqp + 31;
1507 pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
1508
1509 for_each_present_cpu(cpu) {
1510 if (pnode != uv_cpu_to_pnode(cpu))
1511 continue;
1512 /* for every cpu on this pnode: */
1513 bcp = &per_cpu(bau_control, cpu);
1514 bcp->queue_first = pqp;
1515 bcp->bau_msg_head = pqp;
1516 bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
1517 }
1518 /*
1519 * need the pnode of where the memory was really allocated
1520 */
1521 pn = uv_gpa(pqp) >> uv_nshift;
1522 first = uv_physnodeaddr(pqp);
1523 pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
1524 last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
1525 write_mmr_payload_first(pnode, pn_first);
1526 write_mmr_payload_tail(pnode, first);
1527 write_mmr_payload_last(pnode, last);
1528
1529 /* in effect, all msg_type's are set to MSG_NOOP */
1530 memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
1531}
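pq_init() over-allocates the queue by one entry and then rounds the buffer pointer up to a 32-byte boundary by adding 31 and clearing the low five bits (the >> 5 << 5 above). A stand-alone sketch of that align-up idiom (the buffer size is illustrative):

#include <stdio.h>
#include <stdlib.h>

#define ALIGN_BYTES 32UL

int main(void)
{
	/* one entry's worth of slack lets us shift the start forward */
	void *raw = malloc(1024 + ALIGN_BYTES - 1);
	unsigned long p = (unsigned long)raw;

	/* round up: add 31, then clear the low 5 bits (same as >> 5 << 5) */
	unsigned long aligned = (p + ALIGN_BYTES - 1) & ~(ALIGN_BYTES - 1);

	printf("raw=%#lx aligned=%#lx\n", p, aligned);
	free(raw);
	return 0;
}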
1532
1533/*
1534 * Initialization of each UV hub's structures
1535 */
1536static void __init init_uvhub(int uvhub, int vector, int base_pnode)
1537{
1538 int node;
1539 int pnode;
1540 unsigned long apicid;
1541
1542 node = uvhub_to_first_node(uvhub);
1543 pnode = uv_blade_to_pnode(uvhub);
1544
1545 activation_descriptor_init(node, pnode, base_pnode);
1546
1547 pq_init(node, pnode);
1548 /*
1549 * The below initialization can't be in firmware because the
1550 * messaging IRQ will be determined by the OS.
1551 */
1552 apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1553 write_mmr_data_config(pnode, ((apicid << 32) | vector));
1554}
1555
1556/*
1557 * We will set BAU_MISC_CONTROL with a timeout period.
1558 * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1559 * So the destination timeout period has to be calculated from them.
1560 */
1561static int calculate_destination_timeout(void)
1562{
1563 unsigned long mmr_image;
1564 int mult1;
1565 int mult2;
1566 int index;
1567 int base;
1568 int ret;
1569 unsigned long ts_ns;
1570
1571 if (is_uv1_hub()) {
1572 mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1573 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1574 index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1575 mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1576 mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1577 base = timeout_base_ns[index];
1578 ts_ns = base * mult1 * mult2;
1579 ret = ts_ns / 1000;
1580 } else {
1581 /* 4 bits 0/1 for 10/80us, 3 bits of multiplier */
1582 mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1583 mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
1584 if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1585 mult1 = 80;
1586 else
1587 mult1 = 10;
1588 base = mmr_image & UV2_ACK_MASK;
1589 ret = mult1 * base;
1590 }
1591 return ret;
1592}
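On UV1 the destination timeout is the product of a base period selected by the AGING_PRESCALE urgency field and two multipliers read from BIOS-programmed MMRs, converted to microseconds at the end. A worked example with made-up register values (the index and multipliers are not real hardware readings):

#include <stdio.h>

static const long timeout_base_ns[] = {
	20, 160, 1280, 10240, 81920, 655360, 5242880, 167772160
};

int main(void)
{
	int index = 2;			/* say urgency7 selects 1280 ns */
	int mult1 = 10;			/* SOFTACK period multiplier */
	int mult2 = 8;			/* transaction-timeout multiplier */
	long base = timeout_base_ns[index];

	long ts_ns = base * mult1 * mult2;	/* 1280 * 10 * 8 = 102400 ns */
	long timeout_us = ts_ns / 1000;		/* ~102 us */

	printf("timeout = %ld ns = %ld us\n", ts_ns, timeout_us);
	return 0;
}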
1593
1594static void __init init_per_cpu_tunables(void)
1595{
1596 int cpu;
1597 struct bau_control *bcp;
1598
1599 for_each_present_cpu(cpu) {
1600 bcp = &per_cpu(bau_control, cpu);
1601 bcp->baudisabled = 0;
1602 bcp->statp = &per_cpu(ptcstats, cpu);
1603 /* time interval to catch a hardware stay-busy bug */
1604 bcp->timeout_interval = usec_2_cycles(2*timeout_us);
1605 bcp->max_concurr = max_concurr;
1606 bcp->max_concurr_const = max_concurr;
1607 bcp->plugged_delay = plugged_delay;
1608 bcp->plugsb4reset = plugsb4reset;
1609 bcp->timeoutsb4reset = timeoutsb4reset;
1610 bcp->ipi_reset_limit = ipi_reset_limit;
1611 bcp->complete_threshold = complete_threshold;
1612 bcp->cong_response_us = congested_respns_us;
1613 bcp->cong_reps = congested_reps;
1614 bcp->cong_period = congested_period;
1615 }
1616}
1617
1618/*
1619 * Scan all cpus to collect blade and socket summaries.
1620 */
1621static int __init get_cpu_topology(int base_pnode,
1622 struct uvhub_desc *uvhub_descs,
1623 unsigned char *uvhub_mask)
1624{
1625 int cpu;
1626 int pnode;
1627 int uvhub;
1628 int socket;
1629 struct bau_control *bcp;
1630 struct uvhub_desc *bdp;
1631 struct socket_desc *sdp;
1632
1633 for_each_present_cpu(cpu) {
1634 bcp = &per_cpu(bau_control, cpu);
1635
1636 memset(bcp, 0, sizeof(struct bau_control));
1637
1638 pnode = uv_cpu_hub_info(cpu)->pnode;
1639 if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
1640 printk(KERN_EMERG
1641 "cpu %d pnode %d-%d beyond %d; BAU disabled\n",
1642 cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
1643 return 1;
1644 }
1645
1646 bcp->osnode = cpu_to_node(cpu);
1647 bcp->partition_base_pnode = base_pnode;
1648
1649 uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1650 *(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1651 bdp = &uvhub_descs[uvhub];
1652
1653 bdp->num_cpus++;
1654 bdp->uvhub = uvhub;
1655 bdp->pnode = pnode;
1656
1657		/* kludge: assume one node per socket, and that disabling
1658		   a socket just leaves a gap in the node numbers */
1659 socket = bcp->osnode & 1;
1660 bdp->socket_mask |= (1 << socket);
1661 sdp = &bdp->socket[socket];
1662 sdp->cpu_number[sdp->num_cpus] = cpu;
1663 sdp->num_cpus++;
1664 if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1665 printk(KERN_EMERG "%d cpus per socket invalid\n",
1666 sdp->num_cpus);
1667 return 1;
1668 }
1669 }
1670 return 0;
1671}
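/*
 * The uvhub_mask indexing above stores one bit per hub: hub N lands in byte
 * N / 8, bit N % 8. For a hypothetical uvhub of 11, that is byte 1, bit 3.
 */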
1672
1673/*
1674 * Each socket is to get a local array of pnodes/hubs.
1675 */
1676static void make_per_cpu_thp(struct bau_control *smaster)
1677{
1678 int cpu;
1679 size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
1680
1681 smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
1682 memset(smaster->thp, 0, hpsz);
1683 for_each_present_cpu(cpu) {
1684 smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
1685 smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1686 }
1687}
1688
1689/*
1690 * Initialize all the per_cpu information for the cpus on a given socket,
1691 * given what has been gathered into the socket_desc struct,
1692 * and report the chosen hub and socket masters back to the caller.
1693 */
1694static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1695 struct bau_control **smasterp,
1696 struct bau_control **hmasterp)
1697{
1698 int i;
1699 int cpu;
1700 struct bau_control *bcp;
1701
1702 for (i = 0; i < sdp->num_cpus; i++) {
1703 cpu = sdp->cpu_number[i];
1704 bcp = &per_cpu(bau_control, cpu);
1705 bcp->cpu = cpu;
1706 if (i == 0) {
1707 *smasterp = bcp;
1708 if (!(*hmasterp))
1709 *hmasterp = bcp;
1710 }
1711 bcp->cpus_in_uvhub = bdp->num_cpus;
1712 bcp->cpus_in_socket = sdp->num_cpus;
1713 bcp->socket_master = *smasterp;
1714 bcp->uvhub = bdp->uvhub;
1715 bcp->uvhub_master = *hmasterp;
1716 bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1717 if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1718 printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1719 bcp->uvhub_cpu);
1720 return 1;
1721 }
1722 }
1723 return 0;
1724}
1725
1726/*
1727 * Summarize the blade and socket topology into the per_cpu structures.
1728 */
1729static int __init summarize_uvhub_sockets(int nuvhubs,
1730 struct uvhub_desc *uvhub_descs,
1731 unsigned char *uvhub_mask)
1732{
1733 int socket;
1734 int uvhub;
1735 unsigned short socket_mask;
1736
1737 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1738 struct uvhub_desc *bdp;
1739 struct bau_control *smaster = NULL;
1740 struct bau_control *hmaster = NULL;
1741
1742 if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1743 continue;
1744
1745 bdp = &uvhub_descs[uvhub];
1746 socket_mask = bdp->socket_mask;
1747 socket = 0;
1748 while (socket_mask) {
1749 struct socket_desc *sdp;
1750 if ((socket_mask & 1)) {
1751 sdp = &bdp->socket[socket];
1752 if (scan_sock(sdp, bdp, &smaster, &hmaster))
1753 return 1;
1754 }
1755 socket++;
1756 socket_mask = (socket_mask >> 1);
1757 make_per_cpu_thp(smaster);
1758 }
1759 }
1760 return 0;
1761}
1762
1763/*
1764 * Initialize the bau_control structure for each cpu
1765 */
1766static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
1767{
1768 unsigned char *uvhub_mask;
1769 void *vp;
1770 struct uvhub_desc *uvhub_descs;
1771
1772 timeout_us = calculate_destination_timeout();
1773
1774 vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1775 uvhub_descs = (struct uvhub_desc *)vp;
1776 memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1777 uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1778
1779 if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
1780 return 1;
1781
1782 if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
1783 return 1;
1784
1785 kfree(uvhub_descs);
1786 kfree(uvhub_mask);
1787 init_per_cpu_tunables();
1788 return 0;
1789}
1790
1791/*
1792 * Initialization of BAU-related structures
1793 */
1794static int __init uv_bau_init(void)
1795{
1796 int uvhub;
1797 int pnode;
1798 int nuvhubs;
1799 int cur_cpu;
1800 int cpus;
1801 int vector;
1802 cpumask_var_t *mask;
1803
1804 if (!is_uv_system())
1805 return 0;
1806
1807 if (nobau)
1808 return 0;
1809
1810 for_each_possible_cpu(cur_cpu) {
1811 mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
1812 zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
1813 }
1814
1815 uv_nshift = uv_hub_info->m_val;
1816 uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1817 nuvhubs = uv_num_possible_blades();
1818 spin_lock_init(&disable_lock);
1819 congested_cycles = usec_2_cycles(congested_respns_us);
1820
1821 uv_base_pnode = 0x7fffffff;
1822 for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1823 cpus = uv_blade_nr_possible_cpus(uvhub);
1824 if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
1825 uv_base_pnode = uv_blade_to_pnode(uvhub);
1826 }
1827
1828 if (init_per_cpu(nuvhubs, uv_base_pnode)) {
1829 nobau = 1;
1830 return 0;
1831 }
1832
1833 vector = UV_BAU_MESSAGE;
1834 for_each_possible_blade(uvhub)
1835 if (uv_blade_nr_possible_cpus(uvhub))
1836 init_uvhub(uvhub, vector, uv_base_pnode);
1837
1838 enable_timeouts();
1839 alloc_intr_gate(vector, uv_bau_message_intr1);
1840
1841 for_each_possible_blade(uvhub) {
1842 if (uv_blade_nr_possible_cpus(uvhub)) {
1843 unsigned long val;
1844 unsigned long mmr;
1845 pnode = uv_blade_to_pnode(uvhub);
1846 /* INIT the bau */
1847 val = 1L << 63;
1848 write_gmmr_activation(pnode, val);
1849 mmr = 1; /* should be 1 to broadcast to both sockets */
1850 write_mmr_data_broadcast(pnode, mmr);
1851 }
1852 }
1853
1854 return 0;
1855}
1856core_initcall(uv_bau_init);
1857fs_initcall(uv_ptc_init);
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/platform/uv/uv_irq.c
index 1132129db792..374a05d8ad22 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/platform/uv/uv_irq.c
@@ -28,34 +28,21 @@ struct uv_irq_2_mmr_pnode{
28static spinlock_t uv_irq_lock; 28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root; 29static struct rb_root uv_irq_root;
30 30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *); 31static int uv_set_irq_affinity(struct irq_data *, const struct cpumask *, bool);
32 32
33static void uv_noop(unsigned int irq) 33static void uv_noop(struct irq_data *data) { }
34{
35}
36
37static unsigned int uv_noop_ret(unsigned int irq)
38{
39 return 0;
40}
41 34
42static void uv_ack_apic(unsigned int irq) 35static void uv_ack_apic(struct irq_data *data)
43{ 36{
44 ack_APIC_irq(); 37 ack_APIC_irq();
45} 38}
46 39
47static struct irq_chip uv_irq_chip = { 40static struct irq_chip uv_irq_chip = {
48 .name = "UV-CORE", 41 .name = "UV-CORE",
49 .startup = uv_noop_ret, 42 .irq_mask = uv_noop,
50 .shutdown = uv_noop, 43 .irq_unmask = uv_noop,
51 .enable = uv_noop, 44 .irq_eoi = uv_ack_apic,
52 .disable = uv_noop, 45 .irq_set_affinity = uv_set_irq_affinity,
53 .ack = uv_noop,
54 .mask = uv_noop,
55 .unmask = uv_noop,
56 .eoi = uv_ack_apic,
57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
59}; 46};
60 47
61/* 48/*
@@ -144,28 +131,24 @@ arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int limit) 131 unsigned long mmr_offset, int limit)
145{ 132{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu); 133 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq); 134 struct irq_cfg *cfg = irq_get_chip_data(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value; 135 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry; 136 struct uv_IO_APIC_route_entry *entry;
152 int err; 137 int mmr_pnode, err;
153 138
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != 139 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long)); 140 sizeof(unsigned long));
156 141
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu); 142 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0) 143 if (err != 0)
161 return err; 144 return err;
162 145
163 if (limit == UV_AFFINITY_CPU) 146 if (limit == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING; 147 irq_set_status_flags(irq, IRQ_NO_BALANCING);
165 else 148 else
166 desc->status |= IRQ_MOVE_PCNTXT; 149 irq_set_status_flags(irq, IRQ_MOVE_PCNTXT);
167 150
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq, 151 irq_set_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name); 152 irq_name);
170 153
171 mmr_value = 0; 154 mmr_value = 0;
@@ -206,17 +189,17 @@ static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 189 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207} 190}
208 191
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask) 192static int
193uv_set_irq_affinity(struct irq_data *data, const struct cpumask *mask,
194 bool force)
210{ 195{
211 struct irq_desc *desc = irq_to_desc(irq); 196 struct irq_cfg *cfg = data->chip_data;
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest; 197 unsigned int dest;
214 unsigned long mmr_value; 198 unsigned long mmr_value, mmr_offset;
215 struct uv_IO_APIC_route_entry *entry; 199 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 int mmr_pnode; 200 int mmr_pnode;
218 201
219 if (set_desc_affinity(desc, mask, &dest)) 202 if (__ioapic_set_affinity(data, mask, &dest))
220 return -1; 203 return -1;
221 204
222 mmr_value = 0; 205 mmr_value = 0;
@@ -231,7 +214,7 @@ static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
231 entry->dest = dest; 214 entry->dest = dest;
232 215
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */ 216 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode)) 217 if (uv_irq_2_mmr_info(data->irq, &mmr_offset, &mmr_pnode))
235 return -1; 218 return -1;
236 219
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value); 220 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/platform/uv/uv_sysfs.c
index 309c70fb7759..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/platform/uv/uv_sysfs.c
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/platform/uv/uv_time.c
index 56e421bc379b..9f29a01ee1b3 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/platform/uv/uv_time.c
@@ -40,7 +40,6 @@ static struct clocksource clocksource_uv = {
40 .rating = 400, 40 .rating = 400,
41 .read = uv_read_rtc, 41 .read = uv_read_rtc,
42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK, 42 .mask = (cycle_t)UVH_RTC_REAL_TIME_CLOCK_MASK,
43 .shift = 10,
44 .flags = CLOCK_SOURCE_IS_CONTINUOUS, 43 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
45}; 44};
46 45
@@ -89,6 +88,7 @@ static void uv_rtc_send_IPI(int cpu)
89 88
90 apicid = cpu_physical_id(cpu); 89 apicid = cpu_physical_id(cpu);
91 pnode = uv_apicid_to_pnode(apicid); 90 pnode = uv_apicid_to_pnode(apicid);
91 apicid |= uv_apicid_hibits;
92 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 92 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
@@ -99,25 +99,34 @@ static void uv_rtc_send_IPI(int cpu)
99/* Check for an RTC interrupt pending */ 99/* Check for an RTC interrupt pending */
100static int uv_intr_pending(int pnode) 100static int uv_intr_pending(int pnode)
101{ 101{
102 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) & 102 if (is_uv1_hub())
103 UVH_EVENT_OCCURRED0_RTC1_MASK; 103 return uv_read_global_mmr64(pnode, UVH_EVENT_OCCURRED0) &
104 UV1H_EVENT_OCCURRED0_RTC1_MASK;
105 else
106 return uv_read_global_mmr64(pnode, UV2H_EVENT_OCCURRED2) &
107 UV2H_EVENT_OCCURRED2_RTC_1_MASK;
104} 108}
105 109
106/* Setup interrupt and return non-zero if early expiration occurred. */ 110/* Setup interrupt and return non-zero if early expiration occurred. */
107static int uv_setup_intr(int cpu, u64 expires) 111static int uv_setup_intr(int cpu, u64 expires)
108{ 112{
109 u64 val; 113 u64 val;
114 unsigned long apicid = cpu_physical_id(cpu) | uv_apicid_hibits;
110 int pnode = uv_cpu_to_pnode(cpu); 115 int pnode = uv_cpu_to_pnode(cpu);
111 116
112 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, 117 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG,
113 UVH_RTC1_INT_CONFIG_M_MASK); 118 UVH_RTC1_INT_CONFIG_M_MASK);
114 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L); 119 uv_write_global_mmr64(pnode, UVH_INT_CMPB, -1L);
115 120
116 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 121 if (is_uv1_hub())
117 UVH_EVENT_OCCURRED0_RTC1_MASK); 122 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
123 UV1H_EVENT_OCCURRED0_RTC1_MASK);
124 else
125 uv_write_global_mmr64(pnode, UV2H_EVENT_OCCURRED2_ALIAS,
126 UV2H_EVENT_OCCURRED2_RTC_1_MASK);
118 127
119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 128 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 129 ((u64)apicid << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
121 130
122 /* Set configuration */ 131 /* Set configuration */
123 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val); 132 uv_write_global_mmr64(pnode, UVH_RTC1_INT_CONFIG, val);
@@ -370,14 +379,11 @@ static __init int uv_rtc_setup_clock(void)
370 if (!is_uv_system()) 379 if (!is_uv_system())
371 return -ENODEV; 380 return -ENODEV;
372 381
373 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
374 clocksource_uv.shift);
375
376 /* If single blade, prefer tsc */ 382 /* If single blade, prefer tsc */
377 if (uv_num_possible_blades() == 1) 383 if (uv_num_possible_blades() == 1)
378 clocksource_uv.rating = 250; 384 clocksource_uv.rating = 250;
379 385
380 rc = clocksource_register(&clocksource_uv); 386 rc = clocksource_register_hz(&clocksource_uv, sn_rtc_cycles_per_second);
381 if (rc) 387 if (rc)
382 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc); 388 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
383 else 389 else
diff --git a/arch/x86/platform/visws/Makefile b/arch/x86/platform/visws/Makefile
new file mode 100644
index 000000000000..91bc17ab2fd5
--- /dev/null
+++ b/arch/x86/platform/visws/Makefile
@@ -0,0 +1 @@
obj-$(CONFIG_X86_VISWS) += visws_quirks.o
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/platform/visws/visws_quirks.c
index e680ea52db9b..c7abf13a213f 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/platform/visws/visws_quirks.c
@@ -66,10 +66,7 @@ static void __init visws_time_init(void)
66} 66}
67 67
68/* Replaces the default init_ISA_irqs in the generic setup */ 68/* Replaces the default init_ISA_irqs in the generic setup */
69static void __init visws_pre_intr_init(void) 69static void __init visws_pre_intr_init(void);
70{
71 init_VISWS_APIC_irqs();
72}
73 70
74/* Quirk for machine specific memory setup. */ 71/* Quirk for machine specific memory setup. */
75 72
@@ -174,7 +171,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
174 ver = m->apicver; 171 ver = m->apicver;
175 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) { 172 if ((ver >= 0x14 && m->apicid >= 0xff) || m->apicid >= 0xf) {
176 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n", 173 printk(KERN_ERR "Processor #%d INVALID. (Max ID: %d).\n",
177 m->apicid, MAX_APICS); 174 m->apicid, MAX_LOCAL_APIC);
178 return; 175 return;
179 } 176 }
180 177
@@ -429,67 +426,34 @@ static int is_co_apic(unsigned int irq)
429/* 426/*
430 * This is the SGI Cobalt (IO-)APIC: 427 * This is the SGI Cobalt (IO-)APIC:
431 */ 428 */
432 429static void enable_cobalt_irq(struct irq_data *data)
433static void enable_cobalt_irq(unsigned int irq)
434{ 430{
435 co_apic_set(is_co_apic(irq), irq); 431 co_apic_set(is_co_apic(data->irq), data->irq);
436} 432}
437 433
438static void disable_cobalt_irq(unsigned int irq) 434static void disable_cobalt_irq(struct irq_data *data)
439{ 435{
440 int entry = is_co_apic(irq); 436 int entry = is_co_apic(data->irq);
441 437
442 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK); 438 co_apic_write(CO_APIC_LO(entry), CO_APIC_MASK);
443 co_apic_read(CO_APIC_LO(entry)); 439 co_apic_read(CO_APIC_LO(entry));
444} 440}
445 441
446/* 442static void ack_cobalt_irq(struct irq_data *data)
447 * "irq" really just serves to identify the device. Here is where we
448 * map this to the Cobalt APIC entry where it's physically wired.
449 * This is called via request_irq -> setup_irq -> irq_desc->startup()
450 */
451static unsigned int startup_cobalt_irq(unsigned int irq)
452{ 443{
453 unsigned long flags; 444 unsigned long flags;
454 struct irq_desc *desc = irq_to_desc(irq);
455 445
456 spin_lock_irqsave(&cobalt_lock, flags); 446 spin_lock_irqsave(&cobalt_lock, flags);
457 if ((desc->status & (IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING))) 447 disable_cobalt_irq(data);
458 desc->status &= ~(IRQ_DISABLED | IRQ_INPROGRESS | IRQ_WAITING);
459 enable_cobalt_irq(irq);
460 spin_unlock_irqrestore(&cobalt_lock, flags);
461 return 0;
462}
463
464static void ack_cobalt_irq(unsigned int irq)
465{
466 unsigned long flags;
467
468 spin_lock_irqsave(&cobalt_lock, flags);
469 disable_cobalt_irq(irq);
470 apic_write(APIC_EOI, APIC_EIO_ACK); 448 apic_write(APIC_EOI, APIC_EIO_ACK);
471 spin_unlock_irqrestore(&cobalt_lock, flags); 449 spin_unlock_irqrestore(&cobalt_lock, flags);
472} 450}
473 451
474static void end_cobalt_irq(unsigned int irq)
475{
476 unsigned long flags;
477 struct irq_desc *desc = irq_to_desc(irq);
478
479 spin_lock_irqsave(&cobalt_lock, flags);
480 if (!(desc->status & (IRQ_DISABLED | IRQ_INPROGRESS)))
481 enable_cobalt_irq(irq);
482 spin_unlock_irqrestore(&cobalt_lock, flags);
483}
484
485static struct irq_chip cobalt_irq_type = { 452static struct irq_chip cobalt_irq_type = {
486 .name = "Cobalt-APIC", 453 .name = "Cobalt-APIC",
487 .startup = startup_cobalt_irq, 454 .irq_enable = enable_cobalt_irq,
488 .shutdown = disable_cobalt_irq, 455 .irq_disable = disable_cobalt_irq,
489 .enable = enable_cobalt_irq, 456 .irq_ack = ack_cobalt_irq,
490 .disable = disable_cobalt_irq,
491 .ack = ack_cobalt_irq,
492 .end = end_cobalt_irq,
493}; 457};
494 458
495 459
@@ -503,35 +467,26 @@ static struct irq_chip cobalt_irq_type = {
503 * interrupt controller type, and through a special virtual interrupt- 467 * interrupt controller type, and through a special virtual interrupt-
504 * controller. Device drivers only see the virtual interrupt sources. 468 * controller. Device drivers only see the virtual interrupt sources.
505 */ 469 */
506static unsigned int startup_piix4_master_irq(unsigned int irq) 470static unsigned int startup_piix4_master_irq(struct irq_data *data)
507{ 471{
508 legacy_pic->init(0); 472 legacy_pic->init(0);
509 473 enable_cobalt_irq(data);
510 return startup_cobalt_irq(irq); 474 return 0;
511}
512
513static void end_piix4_master_irq(unsigned int irq)
514{
515 unsigned long flags;
516
517 spin_lock_irqsave(&cobalt_lock, flags);
518 enable_cobalt_irq(irq);
519 spin_unlock_irqrestore(&cobalt_lock, flags);
520} 475}
521 476
522static struct irq_chip piix4_master_irq_type = { 477static struct irq_chip piix4_master_irq_type = {
523 .name = "PIIX4-master", 478 .name = "PIIX4-master",
524 .startup = startup_piix4_master_irq, 479 .irq_startup = startup_piix4_master_irq,
525 .ack = ack_cobalt_irq, 480 .irq_ack = ack_cobalt_irq,
526 .end = end_piix4_master_irq,
527}; 481};
528 482
483static void pii4_mask(struct irq_data *data) { }
529 484
530static struct irq_chip piix4_virtual_irq_type = { 485static struct irq_chip piix4_virtual_irq_type = {
531 .name = "PIIX4-virtual", 486 .name = "PIIX4-virtual",
487 .irq_mask = pii4_mask,
532}; 488};
533 489
534
535/* 490/*
536 * PIIX4-8259 master/virtual functions to handle interrupt requests 491 * PIIX4-8259 master/virtual functions to handle interrupt requests
537 * from legacy devices: floppy, parallel, serial, rtc. 492 * from legacy devices: floppy, parallel, serial, rtc.
@@ -549,9 +504,8 @@ static struct irq_chip piix4_virtual_irq_type = {
549 */ 504 */
550static irqreturn_t piix4_master_intr(int irq, void *dev_id) 505static irqreturn_t piix4_master_intr(int irq, void *dev_id)
551{ 506{
552 int realirq;
553 struct irq_desc *desc;
554 unsigned long flags; 507 unsigned long flags;
508 int realirq;
555 509
556 raw_spin_lock_irqsave(&i8259A_lock, flags); 510 raw_spin_lock_irqsave(&i8259A_lock, flags);
557 511
@@ -592,18 +546,10 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
592 546
593 raw_spin_unlock_irqrestore(&i8259A_lock, flags); 547 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
594 548
595 desc = irq_to_desc(realirq);
596
597 /* 549 /*
598 * handle this 'virtual interrupt' as a Cobalt one now. 550 * handle this 'virtual interrupt' as a Cobalt one now.
599 */ 551 */
600 kstat_incr_irqs_this_cpu(realirq, desc); 552 generic_handle_irq(realirq);
601
602 if (likely(desc->action != NULL))
603 handle_IRQ_event(realirq, desc->action);
604
605 if (!(desc->status & IRQ_DISABLED))
606 legacy_pic->chip->unmask(realirq);
607 553
608 return IRQ_HANDLED; 554 return IRQ_HANDLED;
609 555
@@ -615,50 +561,46 @@ out_unlock:
615static struct irqaction master_action = { 561static struct irqaction master_action = {
616 .handler = piix4_master_intr, 562 .handler = piix4_master_intr,
617 .name = "PIIX4-8259", 563 .name = "PIIX4-8259",
564 .flags = IRQF_NO_THREAD,
618}; 565};
619 566
620static struct irqaction cascade_action = { 567static struct irqaction cascade_action = {
621 .handler = no_action, 568 .handler = no_action,
622 .name = "cascade", 569 .name = "cascade",
570 .flags = IRQF_NO_THREAD,
623}; 571};
624 572
625static inline void set_piix4_virtual_irq_type(void) 573static inline void set_piix4_virtual_irq_type(void)
626{ 574{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask; 575 piix4_virtual_irq_type.irq_enable = i8259A_chip.irq_unmask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask; 576 piix4_virtual_irq_type.irq_disable = i8259A_chip.irq_mask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask; 577 piix4_virtual_irq_type.irq_unmask = i8259A_chip.irq_unmask;
630} 578}
631 579
632void init_VISWS_APIC_irqs(void) 580static void __init visws_pre_intr_init(void)
633{ 581{
634 int i; 582 int i;
635 583
636 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) { 584 set_piix4_virtual_irq_type();
637 struct irq_desc *desc = irq_to_desc(i);
638
639 desc->status = IRQ_DISABLED;
640 desc->action = 0;
641 desc->depth = 1;
642 585
643 if (i == 0) { 586 for (i = 0; i < CO_IRQ_APIC0 + CO_APIC_LAST + 1; i++) {
644 desc->chip = &cobalt_irq_type; 587 struct irq_chip *chip = NULL;
645 } 588
646 else if (i == CO_IRQ_IDE0) { 589 if (i == 0)
647 desc->chip = &cobalt_irq_type; 590 chip = &cobalt_irq_type;
648 } 591 else if (i == CO_IRQ_IDE0)
649 else if (i == CO_IRQ_IDE1) { 592 chip = &cobalt_irq_type;
650 desc->chip = &cobalt_irq_type; 593 else if (i == CO_IRQ_IDE1)
651 } 594 chip = &cobalt_irq_type;
652 else if (i == CO_IRQ_8259) { 595 else if (i == CO_IRQ_8259)
653 desc->chip = &piix4_master_irq_type; 596 chip = &piix4_master_irq_type;
654 } 597 else if (i < CO_IRQ_APIC0)
655 else if (i < CO_IRQ_APIC0) { 598 chip = &piix4_virtual_irq_type;
656 set_piix4_virtual_irq_type(); 599 else if (IS_CO_APIC(i))
657 desc->chip = &piix4_virtual_irq_type; 600 chip = &cobalt_irq_type;
658 } 601
659 else if (IS_CO_APIC(i)) { 602 if (chip)
660 desc->chip = &cobalt_irq_type; 603 irq_set_chip(i, chip);
661 }
662 } 604 }
663 605
664 setup_irq(CO_IRQ_8259, &master_action); 606 setup_irq(CO_IRQ_8259, &master_action);
diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile
index 4a2afa1bac51..bef0bc962400 100644
--- a/arch/x86/vdso/Makefile
+++ b/arch/x86/vdso/Makefile
@@ -11,7 +11,7 @@ vdso-install-$(VDSO32-y) += $(vdso32-images)
11 11
12 12
13# files to link into the vdso 13# files to link into the vdso
14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vvar.o 14vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o
15 15
16# files to link into kernel 16# files to link into kernel
17obj-$(VDSO64-y) += vma.o vdso.o 17obj-$(VDSO64-y) += vma.o vdso.o
@@ -25,7 +25,7 @@ targets += vdso.so vdso.so.dbg vdso.lds $(vobjs-y)
25 25
26export CPPFLAGS_vdso.lds += -P -C 26export CPPFLAGS_vdso.lds += -P -C
27 27
28VDSO_LDFLAGS_vdso.lds = -m elf_x86_64 -Wl,-soname=linux-vdso.so.1 \ 28VDSO_LDFLAGS_vdso.lds = -m64 -Wl,-soname=linux-vdso.so.1 \
29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096 29 -Wl,-z,max-page-size=4096 -Wl,-z,common-page-size=4096
30 30
31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so 31$(obj)/vdso.o: $(src)/vdso.S $(obj)/vdso.so
@@ -37,11 +37,24 @@ $(obj)/%.so: OBJCOPYFLAGS := -S
37$(obj)/%.so: $(obj)/%.so.dbg FORCE 37$(obj)/%.so: $(obj)/%.so.dbg FORCE
38 $(call if_changed,objcopy) 38 $(call if_changed,objcopy)
39 39
40#
41# Don't omit frame pointers for ease of userspace debugging, but do
42# optimize sibling calls.
43#
40CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ 44CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \
41 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) 45 $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \
46 -fno-omit-frame-pointer -foptimize-sibling-calls
42 47
43$(vobjs): KBUILD_CFLAGS += $(CFL) 48$(vobjs): KBUILD_CFLAGS += $(CFL)
44 49
50#
51# vDSO code runs in userspace and -pg doesn't help with profiling anyway.
52#
53CFLAGS_REMOVE_vdso-note.o = -pg
54CFLAGS_REMOVE_vclock_gettime.o = -pg
55CFLAGS_REMOVE_vgetcpu.o = -pg
56CFLAGS_REMOVE_vvar.o = -pg
57
45targets += vdso-syms.lds 58targets += vdso-syms.lds
46obj-$(VDSO64-y) += vdso-syms.lds 59obj-$(VDSO64-y) += vdso-syms.lds
47 60
@@ -69,7 +82,7 @@ vdso32.so-$(VDSO32-y) += sysenter
69vdso32-images = $(vdso32.so-y:%=vdso32-%.so) 82vdso32-images = $(vdso32.so-y:%=vdso32-%.so)
70 83
71CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds) 84CPPFLAGS_vdso32.lds = $(CPPFLAGS_vdso.lds)
72VDSO_LDFLAGS_vdso32.lds = -m elf_i386 -Wl,-soname=linux-gate.so.1 85VDSO_LDFLAGS_vdso32.lds = -m32 -Wl,-soname=linux-gate.so.1
73 86
74# This makes sure the $(obj) subdirectory exists even though vdso32/ 87# This makes sure the $(obj) subdirectory exists even though vdso32/
75# is not a kbuild sub-make subdirectory. 88# is not a kbuild sub-make subdirectory.
diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c
index ee55754cc3c5..a724905fdae7 100644
--- a/arch/x86/vdso/vclock_gettime.c
+++ b/arch/x86/vdso/vclock_gettime.c
@@ -2,7 +2,7 @@
2 * Copyright 2006 Andi Kleen, SUSE Labs. 2 * Copyright 2006 Andi Kleen, SUSE Labs.
3 * Subject to the GNU Public License, v.2 3 * Subject to the GNU Public License, v.2
4 * 4 *
5 * Fast user context implementation of clock_gettime and gettimeofday. 5 * Fast user context implementation of clock_gettime, gettimeofday, and time.
6 * 6 *
7 * The code should have no internal unresolved relocations. 7 * The code should have no internal unresolved relocations.
8 * Check with readelf after changing. 8 * Check with readelf after changing.
@@ -22,9 +22,8 @@
22#include <asm/hpet.h> 22#include <asm/hpet.h>
23#include <asm/unistd.h> 23#include <asm/unistd.h>
24#include <asm/io.h> 24#include <asm/io.h>
25#include "vextern.h"
26 25
27#define gtod vdso_vsyscall_gtod_data 26#define gtod (&VVAR(vsyscall_gtod_data))
28 27
29notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) 28notrace static long vdso_fallback_gettime(long clock, struct timespec *ts)
30{ 29{
@@ -56,22 +55,6 @@ notrace static noinline int do_realtime(struct timespec *ts)
56 return 0; 55 return 0;
57} 56}
58 57
59/* Copy of the version in kernel/time.c which we cannot directly access */
60notrace static void
61vset_normalized_timespec(struct timespec *ts, long sec, long nsec)
62{
63 while (nsec >= NSEC_PER_SEC) {
64 nsec -= NSEC_PER_SEC;
65 ++sec;
66 }
67 while (nsec < 0) {
68 nsec += NSEC_PER_SEC;
69 --sec;
70 }
71 ts->tv_sec = sec;
72 ts->tv_nsec = nsec;
73}
74
75notrace static noinline int do_monotonic(struct timespec *ts) 58notrace static noinline int do_monotonic(struct timespec *ts)
76{ 59{
77 unsigned long seq, ns, secs; 60 unsigned long seq, ns, secs;
@@ -82,7 +65,17 @@ notrace static noinline int do_monotonic(struct timespec *ts)
82 secs += gtod->wall_to_monotonic.tv_sec; 65 secs += gtod->wall_to_monotonic.tv_sec;
83 ns += gtod->wall_to_monotonic.tv_nsec; 66 ns += gtod->wall_to_monotonic.tv_nsec;
84 } while (unlikely(read_seqretry(&gtod->lock, seq))); 67 } while (unlikely(read_seqretry(&gtod->lock, seq)));
85 vset_normalized_timespec(ts, secs, ns); 68
69 /* wall_time_nsec, vgetns(), and wall_to_monotonic.tv_nsec
70 * are all guaranteed to be nonnegative.
71 */
72 while (ns >= NSEC_PER_SEC) {
73 ns -= NSEC_PER_SEC;
74 ++secs;
75 }
76 ts->tv_sec = secs;
77 ts->tv_nsec = ns;
78
86 return 0; 79 return 0;
87} 80}
88 81
@@ -107,7 +100,17 @@ notrace static noinline int do_monotonic_coarse(struct timespec *ts)
107 secs += gtod->wall_to_monotonic.tv_sec; 100 secs += gtod->wall_to_monotonic.tv_sec;
108 ns += gtod->wall_to_monotonic.tv_nsec; 101 ns += gtod->wall_to_monotonic.tv_nsec;
109 } while (unlikely(read_seqretry(&gtod->lock, seq))); 102 } while (unlikely(read_seqretry(&gtod->lock, seq)));
110 vset_normalized_timespec(ts, secs, ns); 103
104 /* wall_time_nsec and wall_to_monotonic.tv_nsec are
105 * guaranteed to be between 0 and NSEC_PER_SEC.
106 */
107 if (ns >= NSEC_PER_SEC) {
108 ns -= NSEC_PER_SEC;
109 ++secs;
110 }
111 ts->tv_sec = secs;
112 ts->tv_nsec = ns;
113
111 return 0; 114 return 0;
112} 115}
113 116
@@ -157,3 +160,32 @@ notrace int __vdso_gettimeofday(struct timeval *tv, struct timezone *tz)
157} 160}
158int gettimeofday(struct timeval *, struct timezone *) 161int gettimeofday(struct timeval *, struct timezone *)
159 __attribute__((weak, alias("__vdso_gettimeofday"))); 162 __attribute__((weak, alias("__vdso_gettimeofday")));
163
164/* This will break when the xtime seconds get inaccurate, but that is
165 * unlikely */
166
167static __always_inline long time_syscall(long *t)
168{
169 long secs;
170 asm volatile("syscall"
171 : "=a" (secs)
172 : "0" (__NR_time), "D" (t) : "cc", "r11", "cx", "memory");
173 return secs;
174}
175
176notrace time_t __vdso_time(time_t *t)
177{
178 time_t result;
179
180 if (unlikely(!VVAR(vsyscall_gtod_data).sysctl_enabled))
181 return time_syscall(t);
182
183 /* This is atomic on x86_64 so we don't need any locks. */
184 result = ACCESS_ONCE(VVAR(vsyscall_gtod_data).wall_time_sec);
185
186 if (t)
187 *t = result;
188 return result;
189}
190int time(time_t *t)
191 __attribute__((weak, alias("__vdso_time")));
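/*
 * A minimal userspace sketch (not part of the patch) of what the new vDSO
 * time() entry enables: an ordinary time() call that the C library can now
 * satisfy from the vDSO instead of entering the kernel.
 */
#include <stdio.h>
#include <time.h>

int main(void)
{
	time_t now = time(NULL);	/* may resolve to __vdso_time */

	printf("seconds since the epoch: %ld\n", (long)now);
	return 0;
}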
diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S
index 4e5dd3b4de7f..b96b2677cad8 100644
--- a/arch/x86/vdso/vdso.lds.S
+++ b/arch/x86/vdso/vdso.lds.S
@@ -23,15 +23,10 @@ VERSION {
23 __vdso_gettimeofday; 23 __vdso_gettimeofday;
24 getcpu; 24 getcpu;
25 __vdso_getcpu; 25 __vdso_getcpu;
26 time;
27 __vdso_time;
26 local: *; 28 local: *;
27 }; 29 };
28} 30}
29 31
30VDSO64_PRELINK = VDSO_PRELINK; 32VDSO64_PRELINK = VDSO_PRELINK;
31
32/*
33 * Define VDSO64_x for each VEXTERN(x), for use via VDSO64_SYMBOL.
34 */
35#define VEXTERN(x) VDSO64_ ## x = vdso_ ## x;
36#include "vextern.h"
37#undef VEXTERN
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 36df991985b2..468d591dde31 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -417,24 +417,25 @@ const char *arch_vma_name(struct vm_area_struct *vma)
417 return NULL; 417 return NULL;
418} 418}
419 419
420struct vm_area_struct *get_gate_vma(struct task_struct *tsk) 420struct vm_area_struct *get_gate_vma(struct mm_struct *mm)
421{ 421{
422 struct mm_struct *mm = tsk->mm; 422 /*
423 423 * Check to see if the corresponding task was created in compat vdso
424 /* Check to see if this task was created in compat vdso mode */ 424 * mode.
425 */
425 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE) 426 if (mm && mm->context.vdso == (void *)VDSO_HIGH_BASE)
426 return &gate_vma; 427 return &gate_vma;
427 return NULL; 428 return NULL;
428} 429}
429 430
430int in_gate_area(struct task_struct *task, unsigned long addr) 431int in_gate_area(struct mm_struct *mm, unsigned long addr)
431{ 432{
432 const struct vm_area_struct *vma = get_gate_vma(task); 433 const struct vm_area_struct *vma = get_gate_vma(mm);
433 434
434 return vma && addr >= vma->vm_start && addr < vma->vm_end; 435 return vma && addr >= vma->vm_start && addr < vma->vm_end;
435} 436}
436 437
437int in_gate_area_no_task(unsigned long addr) 438int in_gate_area_no_mm(unsigned long addr)
438{ 439{
439 return 0; 440 return 0;
440} 441}
diff --git a/arch/x86/vdso/vextern.h b/arch/x86/vdso/vextern.h
deleted file mode 100644
index 1683ba2ae3e8..000000000000
--- a/arch/x86/vdso/vextern.h
+++ /dev/null
@@ -1,16 +0,0 @@
1#ifndef VEXTERN
2#include <asm/vsyscall.h>
3#define VEXTERN(x) \
4 extern typeof(x) *vdso_ ## x __attribute__((visibility("hidden")));
5#endif
6
7#define VMAGIC 0xfeedbabeabcdefabUL
8
9/* Any kernel variables used in the vDSO must be exported in the main
10 kernel's vmlinux.lds.S/vsyscall.h/proper __section and
11 put into vextern.h and be referenced as a pointer with vdso prefix.
12 The main kernel later fills in the values. */
13
14VEXTERN(jiffies)
15VEXTERN(vgetcpu_mode)
16VEXTERN(vsyscall_gtod_data)
diff --git a/arch/x86/vdso/vgetcpu.c b/arch/x86/vdso/vgetcpu.c
index 9fbc6b20026b..5463ad558573 100644
--- a/arch/x86/vdso/vgetcpu.c
+++ b/arch/x86/vdso/vgetcpu.c
@@ -11,14 +11,13 @@
11#include <linux/time.h> 11#include <linux/time.h>
12#include <asm/vsyscall.h> 12#include <asm/vsyscall.h>
13#include <asm/vgtod.h> 13#include <asm/vgtod.h>
14#include "vextern.h"
15 14
16notrace long 15notrace long
17__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused) 16__vdso_getcpu(unsigned *cpu, unsigned *node, struct getcpu_cache *unused)
18{ 17{
19 unsigned int p; 18 unsigned int p;
20 19
21 if (*vdso_vgetcpu_mode == VGETCPU_RDTSCP) { 20 if (VVAR(vgetcpu_mode) == VGETCPU_RDTSCP) {
22 /* Load per CPU data from RDTSCP */ 21 /* Load per CPU data from RDTSCP */
23 native_read_tscp(&p); 22 native_read_tscp(&p);
24 } else { 23 } else {
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 4b5d26f108bb..7abd2be0f9b9 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -15,9 +15,6 @@
15#include <asm/proto.h> 15#include <asm/proto.h>
16#include <asm/vdso.h> 16#include <asm/vdso.h>
17 17
18#include "vextern.h" /* Just for VMAGIC. */
19#undef VEXTERN
20
21unsigned int __read_mostly vdso_enabled = 1; 18unsigned int __read_mostly vdso_enabled = 1;
22 19
23extern char vdso_start[], vdso_end[]; 20extern char vdso_start[], vdso_end[];
@@ -26,20 +23,10 @@ extern unsigned short vdso_sync_cpuid;
26static struct page **vdso_pages; 23static struct page **vdso_pages;
27static unsigned vdso_size; 24static unsigned vdso_size;
28 25
29static inline void *var_ref(void *p, char *name)
30{
31 if (*(void **)p != (void *)VMAGIC) {
32 printk("VDSO: variable %s broken\n", name);
33 vdso_enabled = 0;
34 }
35 return p;
36}
37
38static int __init init_vdso_vars(void) 26static int __init init_vdso_vars(void)
39{ 27{
40 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE; 28 int npages = (vdso_end - vdso_start + PAGE_SIZE - 1) / PAGE_SIZE;
41 int i; 29 int i;
42 char *vbase;
43 30
44 vdso_size = npages << PAGE_SHIFT; 31 vdso_size = npages << PAGE_SHIFT;
45 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL); 32 vdso_pages = kmalloc(sizeof(struct page *) * npages, GFP_KERNEL);
@@ -54,20 +41,6 @@ static int __init init_vdso_vars(void)
54 copy_page(page_address(p), vdso_start + i*PAGE_SIZE); 41 copy_page(page_address(p), vdso_start + i*PAGE_SIZE);
55 } 42 }
56 43
57 vbase = vmap(vdso_pages, npages, 0, PAGE_KERNEL);
58 if (!vbase)
59 goto oom;
60
61 if (memcmp(vbase, "\177ELF", 4)) {
62 printk("VDSO: I'm broken; not ELF\n");
63 vdso_enabled = 0;
64 }
65
66#define VEXTERN(x) \
67 *(typeof(__ ## x) **) var_ref(VDSO64_SYMBOL(vbase, x), #x) = &__ ## x;
68#include "vextern.h"
69#undef VEXTERN
70 vunmap(vbase);
71 return 0; 44 return 0;
72 45
73 oom: 46 oom:
diff --git a/arch/x86/vdso/vvar.c b/arch/x86/vdso/vvar.c
deleted file mode 100644
index 1b7e703684f9..000000000000
--- a/arch/x86/vdso/vvar.c
+++ /dev/null
@@ -1,12 +0,0 @@
1/* Define pointer to external vDSO variables.
2 These are part of the vDSO. The kernel fills in the real addresses
3 at boot time. This is done because when the vdso is linked the
4 kernel isn't yet and we don't know the final addresses. */
5#include <linux/kernel.h>
6#include <linux/time.h>
7#include <asm/vsyscall.h>
8#include <asm/timex.h>
9#include <asm/vgtod.h>
10
11#define VEXTERN(x) typeof (__ ## x) *const vdso_ ## x = (void *)VMAGIC;
12#include "vextern.h"
diff --git a/arch/x86/xen/Kconfig b/arch/x86/xen/Kconfig
index 68128a1b401a..5cc821cb2e09 100644
--- a/arch/x86/xen/Kconfig
+++ b/arch/x86/xen/Kconfig
@@ -13,25 +13,33 @@ config XEN
13 kernel to boot in a paravirtualized environment under the 13 kernel to boot in a paravirtualized environment under the
14 Xen hypervisor. 14 Xen hypervisor.
15 15
16config XEN_DOM0
17 def_bool y
18 depends on XEN && PCI_XEN && SWIOTLB_XEN
19 depends on X86_LOCAL_APIC && X86_IO_APIC && ACPI && PCI
20
21# Dummy symbol since people have come to rely on the PRIVILEGED_GUEST
22# name in tools.
23config XEN_PRIVILEGED_GUEST
24 def_bool XEN_DOM0
25
16config XEN_PVHVM 26config XEN_PVHVM
17 def_bool y 27 def_bool y
18 depends on XEN 28 depends on XEN
19 depends on X86_LOCAL_APIC 29 depends on X86_LOCAL_APIC
20 30
21config XEN_MAX_DOMAIN_MEMORY 31config XEN_MAX_DOMAIN_MEMORY
22 int "Maximum allowed size of a domain in gigabytes" 32 int
23 default 8 if X86_32 33 default 128
24 default 32 if X86_64
25 depends on XEN 34 depends on XEN
26 help 35 help
27 The pseudo-physical to machine address array is sized 36 This only affects the sizing of some bss arrays, the unused
28 according to the maximum possible memory size of a Xen 37 portions of which are freed.
29 domain. This array uses 1 page per gigabyte, so there's no
30 need to be too stingy here.
31 38
32config XEN_SAVE_RESTORE 39config XEN_SAVE_RESTORE
33 bool 40 bool
34 depends on XEN && PM 41 depends on XEN
42 select HIBERNATE_CALLBACKS
35 default y 43 default y
36 44
37config XEN_DEBUG_FS 45config XEN_DEBUG_FS
@@ -41,3 +49,11 @@ config XEN_DEBUG_FS
41 help 49 help
42 Enable statistics output and various tuning options in debugfs. 50 Enable statistics output and various tuning options in debugfs.
43 Enabling this option may incur a significant performance overhead. 51 Enabling this option may incur a significant performance overhead.
52
53config XEN_DEBUG
54 bool "Enable Xen debug checks"
55 depends on XEN
56 default n
57 help
58 Enable various WARN_ON checks in the Xen MMU code.
59 Enabling this option WILL incur a significant performance overhead.
diff --git a/arch/x86/xen/Makefile b/arch/x86/xen/Makefile
index 779385158915..17c565de3d64 100644
--- a/arch/x86/xen/Makefile
+++ b/arch/x86/xen/Makefile
@@ -12,7 +12,8 @@ CFLAGS_mmu.o := $(nostackp)
12 12
13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \ 13obj-y := enlighten.o setup.o multicalls.o mmu.o irq.o \
14 time.o xen-asm.o xen-asm_$(BITS).o \ 14 time.o xen-asm.o xen-asm_$(BITS).o \
15 grant-table.o suspend.o platform-pci-unplug.o 15 grant-table.o suspend.o platform-pci-unplug.o \
16 p2m.o
16 17
17obj-$(CONFIG_SMP) += smp.o 18obj-$(CONFIG_SMP) += smp.o
18obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o 19obj-$(CONFIG_PARAVIRT_SPINLOCKS)+= spinlock.o
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index 1304bcec8ee5..7c0fedd98ea0 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -106,6 +106,7 @@ static const struct file_operations u32_array_fops = {
106 .open = u32_array_open, 106 .open = u32_array_open,
107 .release= xen_array_release, 107 .release= xen_array_release,
108 .read = u32_array_read, 108 .read = u32_array_read,
109 .llseek = no_llseek,
109}; 110};
110 111
111struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode, 112struct dentry *xen_debugfs_create_u32_array(const char *name, mode_t mode,
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 7d46c8441418..5525163a0398 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -30,6 +30,7 @@
30#include <linux/console.h> 30#include <linux/console.h>
31#include <linux/pci.h> 31#include <linux/pci.h>
32#include <linux/gfp.h> 32#include <linux/gfp.h>
33#include <linux/memblock.h>
33 34
34#include <xen/xen.h> 35#include <xen/xen.h>
35#include <xen/interface/xen.h> 36#include <xen/interface/xen.h>
@@ -45,6 +46,7 @@
45#include <asm/paravirt.h> 46#include <asm/paravirt.h>
46#include <asm/apic.h> 47#include <asm/apic.h>
47#include <asm/page.h> 48#include <asm/page.h>
49#include <asm/xen/pci.h>
48#include <asm/xen/hypercall.h> 50#include <asm/xen/hypercall.h>
49#include <asm/xen/hypervisor.h> 51#include <asm/xen/hypervisor.h>
50#include <asm/fixmap.h> 52#include <asm/fixmap.h>
@@ -58,7 +60,6 @@
58#include <asm/pgtable.h> 60#include <asm/pgtable.h>
59#include <asm/tlbflush.h> 61#include <asm/tlbflush.h>
60#include <asm/reboot.h> 62#include <asm/reboot.h>
61#include <asm/setup.h>
62#include <asm/stackprotector.h> 63#include <asm/stackprotector.h>
63#include <asm/hypervisor.h> 64#include <asm/hypervisor.h>
64 65
@@ -74,6 +75,11 @@ DEFINE_PER_CPU(struct vcpu_info, xen_vcpu_info);
74enum xen_domain_type xen_domain_type = XEN_NATIVE; 75enum xen_domain_type xen_domain_type = XEN_NATIVE;
75EXPORT_SYMBOL_GPL(xen_domain_type); 76EXPORT_SYMBOL_GPL(xen_domain_type);
76 77
78unsigned long *machine_to_phys_mapping = (void *)MACH2PHYS_VIRT_START;
79EXPORT_SYMBOL(machine_to_phys_mapping);
80unsigned int machine_to_phys_order;
81EXPORT_SYMBOL(machine_to_phys_order);
82
77struct start_info *xen_start_info; 83struct start_info *xen_start_info;
78EXPORT_SYMBOL_GPL(xen_start_info); 84EXPORT_SYMBOL_GPL(xen_start_info);
79 85
@@ -135,9 +141,6 @@ static void xen_vcpu_setup(int cpu)
135 info.mfn = arbitrary_virt_to_mfn(vcpup); 141 info.mfn = arbitrary_virt_to_mfn(vcpup);
136 info.offset = offset_in_page(vcpup); 142 info.offset = offset_in_page(vcpup);
137 143
138 printk(KERN_DEBUG "trying to map vcpu_info %d at %p, mfn %llx, offset %d\n",
139 cpu, vcpup, info.mfn, info.offset);
140
141 /* Check to see if the hypervisor will put the vcpu_info 144 /* Check to see if the hypervisor will put the vcpu_info
142 structure where we want it, which allows direct access via 145 structure where we want it, which allows direct access via
143 a percpu-variable. */ 146 a percpu-variable. */
@@ -151,9 +154,6 @@ static void xen_vcpu_setup(int cpu)
151 /* This cpu is using the registered vcpu info, even if 154 /* This cpu is using the registered vcpu info, even if
152 later ones fail to. */ 155 later ones fail to. */
153 per_cpu(xen_vcpu, cpu) = vcpup; 156 per_cpu(xen_vcpu, cpu) = vcpup;
154
155 printk(KERN_DEBUG "cpu %d using vcpu_info at %p\n",
156 cpu, vcpup);
157 } 157 }
158} 158}
159 159
@@ -235,37 +235,31 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx,
235 *dx &= maskedx; 235 *dx &= maskedx;
236} 236}
237 237
238static __init void xen_init_cpuid_mask(void) 238static void __init xen_init_cpuid_mask(void)
239{ 239{
240 unsigned int ax, bx, cx, dx; 240 unsigned int ax, bx, cx, dx;
241 unsigned int xsave_mask;
241 242
242 cpuid_leaf1_edx_mask = 243 cpuid_leaf1_edx_mask =
243 ~((1 << X86_FEATURE_MCE) | /* disable MCE */ 244 ~((1 << X86_FEATURE_MCE) | /* disable MCE */
244 (1 << X86_FEATURE_MCA) | /* disable MCA */ 245 (1 << X86_FEATURE_MCA) | /* disable MCA */
246 (1 << X86_FEATURE_MTRR) | /* disable MTRR */
245 (1 << X86_FEATURE_ACC)); /* thermal monitoring */ 247 (1 << X86_FEATURE_ACC)); /* thermal monitoring */
246 248
247 if (!xen_initial_domain()) 249 if (!xen_initial_domain())
248 cpuid_leaf1_edx_mask &= 250 cpuid_leaf1_edx_mask &=
249 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ 251 ~((1 << X86_FEATURE_APIC) | /* disable local APIC */
250 (1 << X86_FEATURE_ACPI)); /* disable ACPI */ 252 (1 << X86_FEATURE_ACPI)); /* disable ACPI */
251
252 ax = 1; 253 ax = 1;
253 cx = 0;
254 xen_cpuid(&ax, &bx, &cx, &dx); 254 xen_cpuid(&ax, &bx, &cx, &dx);
255 255
256 /* cpuid claims we support xsave; try enabling it to see what happens */ 256 xsave_mask =
257 if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { 257 (1 << (X86_FEATURE_XSAVE % 32)) |
258 unsigned long cr4; 258 (1 << (X86_FEATURE_OSXSAVE % 32));
259
260 set_in_cr4(X86_CR4_OSXSAVE);
261
262 cr4 = read_cr4();
263
264 if ((cr4 & X86_CR4_OSXSAVE) == 0)
265 cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32));
266 259
267 clear_in_cr4(X86_CR4_OSXSAVE); 260 /* Xen will set CR4.OSXSAVE if supported and not disabled by force */
268 } 261 if ((cx & xsave_mask) != xsave_mask)
262 cpuid_leaf1_ecx_mask &= ~xsave_mask; /* disable XSAVE & OSXSAVE */
269} 263}
270 264
271static void xen_set_debugreg(int reg, unsigned long val) 265static void xen_set_debugreg(int reg, unsigned long val)
@@ -406,7 +400,7 @@ static void xen_load_gdt(const struct desc_ptr *dtr)
406/* 400/*
407 * load_gdt for early boot, when the gdt is only mapped once 401 * load_gdt for early boot, when the gdt is only mapped once
408 */ 402 */
409static __init void xen_load_gdt_boot(const struct desc_ptr *dtr) 403static void __init xen_load_gdt_boot(const struct desc_ptr *dtr)
410{ 404{
411 unsigned long va = dtr->address; 405 unsigned long va = dtr->address;
412 unsigned int size = dtr->size + 1; 406 unsigned int size = dtr->size + 1;
@@ -573,8 +567,8 @@ static void xen_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g)
573 567
574 preempt_disable(); 568 preempt_disable();
575 569
576 start = __get_cpu_var(idt_desc).address; 570 start = __this_cpu_read(idt_desc.address);
577 end = start + __get_cpu_var(idt_desc).size + 1; 571 end = start + __this_cpu_read(idt_desc.size) + 1;
578 572
579 xen_mc_flush(); 573 xen_mc_flush();
580 574
@@ -668,7 +662,7 @@ static void xen_write_gdt_entry(struct desc_struct *dt, int entry,
668 * Version of write_gdt_entry for use at early boot-time needed to 662 * Version of write_gdt_entry for use at early boot-time needed to
669 * update an entry as simply as possible. 663 * update an entry as simply as possible.
670 */ 664 */
671static __init void xen_write_gdt_entry_boot(struct desc_struct *dt, int entry, 665static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
672 const void *desc, int type) 666 const void *desc, int type)
673{ 667{
674 switch (type) { 668 switch (type) {
@@ -835,6 +829,11 @@ static int xen_write_msr_safe(unsigned int msr, unsigned low, unsigned high)
835 Xen console noise. */ 829 Xen console noise. */
836 break; 830 break;
837 831
832 case MSR_IA32_CR_PAT:
833 if (smp_processor_id() == 0)
834 xen_set_pat(((u64)high << 32) | low);
835 break;
836
838 default: 837 default:
839 ret = native_write_msr_safe(msr, low, high); 838 ret = native_write_msr_safe(msr, low, high);
840 } 839 }
@@ -873,8 +872,6 @@ void xen_setup_vcpu_info_placement(void)
873 /* xen_vcpu_setup managed to place the vcpu_info within the 872 /* xen_vcpu_setup managed to place the vcpu_info within the
874 percpu area for all cpus, so make use of it */ 873 percpu area for all cpus, so make use of it */
875 if (have_vcpu_info_placement) { 874 if (have_vcpu_info_placement) {
876 printk(KERN_INFO "Xen: using vcpu_info placement\n");
877
878 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct); 875 pv_irq_ops.save_fl = __PV_IS_CALLEE_SAVE(xen_save_fl_direct);
879 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct); 876 pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(xen_restore_fl_direct);
880 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct); 877 pv_irq_ops.irq_disable = __PV_IS_CALLEE_SAVE(xen_irq_disable_direct);
@@ -936,18 +933,18 @@ static unsigned xen_patch(u8 type, u16 clobbers, void *insnbuf,
936 return ret; 933 return ret;
937} 934}
938 935
939static const struct pv_info xen_info __initdata = { 936static const struct pv_info xen_info __initconst = {
940 .paravirt_enabled = 1, 937 .paravirt_enabled = 1,
941 .shared_kernel_pmd = 0, 938 .shared_kernel_pmd = 0,
942 939
943 .name = "Xen", 940 .name = "Xen",
944}; 941};
945 942
946static const struct pv_init_ops xen_init_ops __initdata = { 943static const struct pv_init_ops xen_init_ops __initconst = {
947 .patch = xen_patch, 944 .patch = xen_patch,
948}; 945};
949 946
950static const struct pv_cpu_ops xen_cpu_ops __initdata = { 947static const struct pv_cpu_ops xen_cpu_ops __initconst = {
951 .cpuid = xen_cpuid, 948 .cpuid = xen_cpuid,
952 949
953 .set_debugreg = xen_set_debugreg, 950 .set_debugreg = xen_set_debugreg,
@@ -1007,7 +1004,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = {
1007 .end_context_switch = xen_end_context_switch, 1004 .end_context_switch = xen_end_context_switch,
1008}; 1005};
1009 1006
1010static const struct pv_apic_ops xen_apic_ops __initdata = { 1007static const struct pv_apic_ops xen_apic_ops __initconst = {
1011#ifdef CONFIG_X86_LOCAL_APIC 1008#ifdef CONFIG_X86_LOCAL_APIC
1012 .startup_ipi_hook = paravirt_nop, 1009 .startup_ipi_hook = paravirt_nop,
1013#endif 1010#endif
@@ -1017,10 +1014,6 @@ static void xen_reboot(int reason)
1017{ 1014{
1018 struct sched_shutdown r = { .reason = reason }; 1015 struct sched_shutdown r = { .reason = reason };
1019 1016
1020#ifdef CONFIG_SMP
1021 smp_send_stop();
1022#endif
1023
1024 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r)) 1017 if (HYPERVISOR_sched_op(SCHEDOP_shutdown, &r))
1025 BUG(); 1018 BUG();
1026} 1019}
@@ -1040,6 +1033,13 @@ static void xen_machine_halt(void)
1040 xen_reboot(SHUTDOWN_poweroff); 1033 xen_reboot(SHUTDOWN_poweroff);
1041} 1034}
1042 1035
1036static void xen_machine_power_off(void)
1037{
1038 if (pm_power_off)
1039 pm_power_off();
1040 xen_reboot(SHUTDOWN_poweroff);
1041}
1042
1043static void xen_crash_shutdown(struct pt_regs *regs) 1043static void xen_crash_shutdown(struct pt_regs *regs)
1044{ 1044{
1045 xen_reboot(SHUTDOWN_crash); 1045 xen_reboot(SHUTDOWN_crash);
@@ -1062,10 +1062,10 @@ int xen_panic_handler_init(void)
1062 return 0; 1062 return 0;
1063} 1063}
1064 1064
1065static const struct machine_ops __initdata xen_machine_ops = { 1065static const struct machine_ops xen_machine_ops __initconst = {
1066 .restart = xen_restart, 1066 .restart = xen_restart,
1067 .halt = xen_machine_halt, 1067 .halt = xen_machine_halt,
1068 .power_off = xen_machine_halt, 1068 .power_off = xen_machine_power_off,
1069 .shutdown = xen_machine_halt, 1069 .shutdown = xen_machine_halt,
1070 .crash_shutdown = xen_crash_shutdown, 1070 .crash_shutdown = xen_crash_shutdown,
1071 .emergency_restart = xen_emergency_restart, 1071 .emergency_restart = xen_emergency_restart,
@@ -1091,6 +1091,8 @@ static void __init xen_setup_stackprotector(void)
1091/* First C function to be called on Xen boot */ 1091/* First C function to be called on Xen boot */
1092asmlinkage void __init xen_start_kernel(void) 1092asmlinkage void __init xen_start_kernel(void)
1093{ 1093{
1094 struct physdev_set_iopl set_iopl;
1095 int rc;
1094 pgd_t *pgd; 1096 pgd_t *pgd;
1095 1097
1096 if (!xen_start_info) 1098 if (!xen_start_info)
@@ -1098,6 +1100,8 @@ asmlinkage void __init xen_start_kernel(void)
1098 1100
1099 xen_domain_type = XEN_PV_DOMAIN; 1101 xen_domain_type = XEN_PV_DOMAIN;
1100 1102
1103 xen_setup_machphys_mapping();
1104
1101 /* Install Xen paravirt ops */ 1105 /* Install Xen paravirt ops */
1102 pv_info = xen_info; 1106 pv_info = xen_info;
1103 pv_init_ops = xen_init_ops; 1107 pv_init_ops = xen_init_ops;
@@ -1170,6 +1174,15 @@ asmlinkage void __init xen_start_kernel(void)
1170 1174
1171 xen_smp_init(); 1175 xen_smp_init();
1172 1176
1177#ifdef CONFIG_ACPI_NUMA
1178 /*
 1179 * The pages we get from Xen are not related to machine pages, so
1180 * any NUMA information the kernel tries to get from ACPI will
1181 * be meaningless. Prevent it from trying.
1182 */
1183 acpi_numa = -1;
1184#endif
1185
1173 pgd = (pgd_t *)xen_start_info->pt_base; 1186 pgd = (pgd_t *)xen_start_info->pt_base;
1174 1187
1175 if (!xen_initial_domain()) 1188 if (!xen_initial_domain())
@@ -1181,12 +1194,16 @@ asmlinkage void __init xen_start_kernel(void)
1181 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; 1194 per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0];
1182 1195
1183 local_irq_disable(); 1196 local_irq_disable();
1184 early_boot_irqs_off(); 1197 early_boot_irqs_disabled = true;
1198
1199 memblock_init();
1185 1200
1186 xen_raw_console_write("mapping kernel into physical memory\n"); 1201 xen_raw_console_write("mapping kernel into physical memory\n");
1187 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages); 1202 pgd = xen_setup_kernel_pagetable(pgd, xen_start_info->nr_pages);
1203 xen_ident_map_ISA();
1188 1204
1189 init_mm.pgd = pgd; 1205 /* Allocate and initialize top and mid mfn levels for p2m structure */
1206 xen_build_mfn_list_list();
1190 1207
1191 /* keep using Xen gdt for now; no urgent need to change it */ 1208 /* keep using Xen gdt for now; no urgent need to change it */
1192 1209
@@ -1197,10 +1214,18 @@ asmlinkage void __init xen_start_kernel(void)
1197#else 1214#else
1198 pv_info.kernel_rpl = 0; 1215 pv_info.kernel_rpl = 0;
1199#endif 1216#endif
1200
1201 /* set the limit of our address space */ 1217 /* set the limit of our address space */
1202 xen_reserve_top(); 1218 xen_reserve_top();
1203 1219
1220 /* We used to do this in xen_arch_setup, but that is too late on AMD
 1221 * where early_cpu_init (run before ->arch_setup()) calls early_amd_init,
 1222 * which pokes the 0xcf8 port.
1223 */
1224 set_iopl.iopl = 1;
1225 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
1226 if (rc != 0)
1227 xen_raw_printk("physdev_op failed %d\n", rc);
1228
1204#ifdef CONFIG_X86_32 1229#ifdef CONFIG_X86_32
1205 /* set up basic CPUID stuff */ 1230 /* set up basic CPUID stuff */
1206 cpu_detect(&new_cpu_data); 1231 cpu_detect(&new_cpu_data);
@@ -1220,6 +1245,8 @@ asmlinkage void __init xen_start_kernel(void)
1220 add_preferred_console("xenboot", 0, NULL); 1245 add_preferred_console("xenboot", 0, NULL);
1221 add_preferred_console("tty", 0, NULL); 1246 add_preferred_console("tty", 0, NULL);
1222 add_preferred_console("hvc", 0, NULL); 1247 add_preferred_console("hvc", 0, NULL);
1248 if (pci_xen)
1249 x86_init.pci.arch_init = pci_xen_init;
1223 } else { 1250 } else {
1224 /* Make sure ACS will be enabled */ 1251 /* Make sure ACS will be enabled */
1225 pci_request_acs(); 1252 pci_request_acs();
@@ -1238,25 +1265,6 @@ asmlinkage void __init xen_start_kernel(void)
1238#endif 1265#endif
1239} 1266}
1240 1267
1241static uint32_t xen_cpuid_base(void)
1242{
1243 uint32_t base, eax, ebx, ecx, edx;
1244 char signature[13];
1245
1246 for (base = 0x40000000; base < 0x40010000; base += 0x100) {
1247 cpuid(base, &eax, &ebx, &ecx, &edx);
1248 *(uint32_t *)(signature + 0) = ebx;
1249 *(uint32_t *)(signature + 4) = ecx;
1250 *(uint32_t *)(signature + 8) = edx;
1251 signature[12] = 0;
1252
1253 if (!strcmp("XenVMMXenVMM", signature) && ((eax - base) >= 2))
1254 return base;
1255 }
1256
1257 return 0;
1258}
1259
1260static int init_hvm_pv_info(int *major, int *minor) 1268static int init_hvm_pv_info(int *major, int *minor)
1261{ 1269{
1262 uint32_t eax, ebx, ecx, edx, pages, msr, base; 1270 uint32_t eax, ebx, ecx, edx, pages, msr, base;
@@ -1276,15 +1284,14 @@ static int init_hvm_pv_info(int *major, int *minor)
1276 1284
1277 xen_setup_features(); 1285 xen_setup_features();
1278 1286
1279 pv_info = xen_info; 1287 pv_info.name = "Xen HVM";
1280 pv_info.kernel_rpl = 0;
1281 1288
1282 xen_domain_type = XEN_HVM_DOMAIN; 1289 xen_domain_type = XEN_HVM_DOMAIN;
1283 1290
1284 return 0; 1291 return 0;
1285} 1292}
1286 1293
1287void xen_hvm_init_shared_info(void) 1294void __ref xen_hvm_init_shared_info(void)
1288{ 1295{
1289 int cpu; 1296 int cpu;
1290 struct xen_add_to_physmap xatp; 1297 struct xen_add_to_physmap xatp;
@@ -1323,6 +1330,8 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1323 switch (action) { 1330 switch (action) {
1324 case CPU_UP_PREPARE: 1331 case CPU_UP_PREPARE:
1325 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu]; 1332 per_cpu(xen_vcpu, cpu) = &HYPERVISOR_shared_info->vcpu_info[cpu];
1333 if (xen_have_vector_callback)
1334 xen_init_lock_cpu(cpu);
1326 break; 1335 break;
1327 default: 1336 default:
1328 break; 1337 break;
@@ -1330,7 +1339,7 @@ static int __cpuinit xen_hvm_cpu_notify(struct notifier_block *self,
1330 return NOTIFY_OK; 1339 return NOTIFY_OK;
1331} 1340}
1332 1341
1333static struct notifier_block __cpuinitdata xen_hvm_cpu_notifier = { 1342static struct notifier_block xen_hvm_cpu_notifier __cpuinitdata = {
1334 .notifier_call = xen_hvm_cpu_notify, 1343 .notifier_call = xen_hvm_cpu_notify,
1335}; 1344};
1336 1345
@@ -1347,6 +1356,7 @@ static void __init xen_hvm_guest_init(void)
1347 1356
1348 if (xen_feature(XENFEAT_hvm_callback_vector)) 1357 if (xen_feature(XENFEAT_hvm_callback_vector))
1349 xen_have_vector_callback = 1; 1358 xen_have_vector_callback = 1;
1359 xen_hvm_smp_init();
1350 register_cpu_notifier(&xen_hvm_cpu_notifier); 1360 register_cpu_notifier(&xen_hvm_cpu_notifier);
1351 xen_unplug_emulated_devices(); 1361 xen_unplug_emulated_devices();
1352 have_vcpu_info_placement = 0; 1362 have_vcpu_info_placement = 0;
@@ -1366,7 +1376,19 @@ static bool __init xen_hvm_platform(void)
1366 return true; 1376 return true;
1367} 1377}
1368 1378
1369const __refconst struct hypervisor_x86 x86_hyper_xen_hvm = { 1379bool xen_hvm_need_lapic(void)
1380{
1381 if (xen_pv_domain())
1382 return false;
1383 if (!xen_hvm_domain())
1384 return false;
1385 if (xen_feature(XENFEAT_hvm_pirqs) && xen_have_vector_callback)
1386 return false;
1387 return true;
1388}
1389EXPORT_SYMBOL_GPL(xen_hvm_need_lapic);
1390
1391const struct hypervisor_x86 x86_hyper_xen_hvm __refconst = {
1370 .name = "Xen HVM", 1392 .name = "Xen HVM",
1371 .detect = xen_hvm_platform, 1393 .detect = xen_hvm_platform,
1372 .init_platform = xen_hvm_guest_init, 1394 .init_platform = xen_hvm_guest_init,
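xen_hvm_need_lapic() is exported above but no consumer appears in this diff; a hedged sketch of the kind of caller it seems intended for (the surrounding function is purely illustrative):

	/* Illustrative consumer: decide whether to keep the emulated local APIC. */
	static bool example_keep_emulated_lapic(void)
	{
		/*
		 * Per the helper above: PV guests never need it, and HVM guests
		 * with both pirqs and the vector callback can do without it;
		 * a plain HVM guest keeps the emulated LAPIC.
		 */
		return xen_hvm_need_lapic();
	}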
diff --git a/arch/x86/xen/irq.c b/arch/x86/xen/irq.c
index 9d30105a0c4a..8bbb465b6f0a 100644
--- a/arch/x86/xen/irq.c
+++ b/arch/x86/xen/irq.c
@@ -113,7 +113,7 @@ static void xen_halt(void)
113 xen_safe_halt(); 113 xen_safe_halt();
114} 114}
115 115
116static const struct pv_irq_ops xen_irq_ops __initdata = { 116static const struct pv_irq_ops xen_irq_ops __initconst = {
117 .save_fl = PV_CALLEE_SAVE(xen_save_fl), 117 .save_fl = PV_CALLEE_SAVE(xen_save_fl),
118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl), 118 .restore_fl = PV_CALLEE_SAVE(xen_restore_fl),
119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable), 119 .irq_disable = PV_CALLEE_SAVE(xen_irq_disable),
@@ -126,7 +126,7 @@ static const struct pv_irq_ops xen_irq_ops __initdata = {
126#endif 126#endif
127}; 127};
128 128
129void __init xen_init_irq_ops() 129void __init xen_init_irq_ops(void)
130{ 130{
131 pv_irq_ops = xen_irq_ops; 131 pv_irq_ops = xen_irq_ops;
132 x86_init.irqs.intr_init = xen_init_IRQ; 132 x86_init.irqs.intr_init = xen_init_IRQ;
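The two one-line irq.c fixes above follow a general pattern worth spelling out: const init-time data belongs in __initconst rather than __initdata, and an empty C parameter list should be written (void). A minimal illustration with made-up names (the partial initializer is only for brevity):

	static const struct pv_irq_ops example_irq_ops __initconst = {
		.save_fl = PV_CALLEE_SAVE(xen_save_fl),	/* const, init-only data */
	};

	void __init example_init_irq_ops(void)		/* "(void)", not "()" */
	{
		pv_irq_ops = example_irq_ops;		/* struct copy, as above */
	}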
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 42086ac406af..0ccccb67a993 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -45,6 +45,8 @@
45#include <linux/vmalloc.h> 45#include <linux/vmalloc.h>
46#include <linux/module.h> 46#include <linux/module.h>
47#include <linux/gfp.h> 47#include <linux/gfp.h>
48#include <linux/memblock.h>
49#include <linux/seq_file.h>
48 50
49#include <asm/pgtable.h> 51#include <asm/pgtable.h>
50#include <asm/tlbflush.h> 52#include <asm/tlbflush.h>
@@ -55,6 +57,9 @@
55#include <asm/e820.h> 57#include <asm/e820.h>
56#include <asm/linkage.h> 58#include <asm/linkage.h>
57#include <asm/page.h> 59#include <asm/page.h>
60#include <asm/init.h>
61#include <asm/pat.h>
62#include <asm/smp.h>
58 63
59#include <asm/xen/hypercall.h> 64#include <asm/xen/hypercall.h>
60#include <asm/xen/hypervisor.h> 65#include <asm/xen/hypervisor.h>
@@ -71,74 +76,19 @@
71#include "mmu.h" 76#include "mmu.h"
72#include "debugfs.h" 77#include "debugfs.h"
73 78
74#define MMU_UPDATE_HISTO 30
75
76/* 79/*
77 * Protects atomic reservation decrease/increase against concurrent increases. 80 * Protects atomic reservation decrease/increase against concurrent increases.
78 * Also protects non-atomic updates of current_pages and driver_pages, and 81 * Also protects non-atomic updates of current_pages and balloon lists.
79 * balloon lists.
80 */ 82 */
81DEFINE_SPINLOCK(xen_reservation_lock); 83DEFINE_SPINLOCK(xen_reservation_lock);
82 84
83#ifdef CONFIG_XEN_DEBUG_FS
84
85static struct {
86 u32 pgd_update;
87 u32 pgd_update_pinned;
88 u32 pgd_update_batched;
89
90 u32 pud_update;
91 u32 pud_update_pinned;
92 u32 pud_update_batched;
93
94 u32 pmd_update;
95 u32 pmd_update_pinned;
96 u32 pmd_update_batched;
97
98 u32 pte_update;
99 u32 pte_update_pinned;
100 u32 pte_update_batched;
101
102 u32 mmu_update;
103 u32 mmu_update_extended;
104 u32 mmu_update_histo[MMU_UPDATE_HISTO];
105
106 u32 prot_commit;
107 u32 prot_commit_batched;
108
109 u32 set_pte_at;
110 u32 set_pte_at_batched;
111 u32 set_pte_at_pinned;
112 u32 set_pte_at_current;
113 u32 set_pte_at_kernel;
114} mmu_stats;
115
116static u8 zero_stats;
117
118static inline void check_zero(void)
119{
120 if (unlikely(zero_stats)) {
121 memset(&mmu_stats, 0, sizeof(mmu_stats));
122 zero_stats = 0;
123 }
124}
125
126#define ADD_STATS(elem, val) \
127 do { check_zero(); mmu_stats.elem += (val); } while(0)
128
129#else /* !CONFIG_XEN_DEBUG_FS */
130
131#define ADD_STATS(elem, val) do { (void)(val); } while(0)
132
133#endif /* CONFIG_XEN_DEBUG_FS */
134
135
136/* 85/*
137 * Identity map, in addition to plain kernel map. This needs to be 86 * Identity map, in addition to plain kernel map. This needs to be
138 * large enough to allocate page table pages to allocate the rest. 87 * large enough to allocate page table pages to allocate the rest.
139 * Each page can map 2MB. 88 * Each page can map 2MB.
140 */ 89 */
141static pte_t level1_ident_pgt[PTRS_PER_PTE * 4] __page_aligned_bss; 90#define LEVEL1_IDENT_ENTRIES (PTRS_PER_PTE * 4)
91static RESERVE_BRK_ARRAY(pte_t, level1_ident_pgt, LEVEL1_IDENT_ENTRIES);
142 92
143#ifdef CONFIG_X86_64 93#ifdef CONFIG_X86_64
144/* l3 pud for userspace vsyscall mapping */ 94/* l3 pud for userspace vsyscall mapping */
@@ -169,160 +119,6 @@ DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */
169 */ 119 */
170#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) 120#define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK)
171 121
172
173#define P2M_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
174#define TOP_ENTRIES (MAX_DOMAIN_PAGES / P2M_ENTRIES_PER_PAGE)
175
176/* Placeholder for holes in the address space */
177static unsigned long p2m_missing[P2M_ENTRIES_PER_PAGE] __page_aligned_data =
178 { [ 0 ... P2M_ENTRIES_PER_PAGE-1 ] = ~0UL };
179
180 /* Array of pointers to pages containing p2m entries */
181static unsigned long *p2m_top[TOP_ENTRIES] __page_aligned_data =
182 { [ 0 ... TOP_ENTRIES - 1] = &p2m_missing[0] };
183
184/* Arrays of p2m arrays expressed in mfns used for save/restore */
185static unsigned long p2m_top_mfn[TOP_ENTRIES] __page_aligned_bss;
186
187static unsigned long p2m_top_mfn_list[TOP_ENTRIES / P2M_ENTRIES_PER_PAGE]
188 __page_aligned_bss;
189
190static inline unsigned p2m_top_index(unsigned long pfn)
191{
192 BUG_ON(pfn >= MAX_DOMAIN_PAGES);
193 return pfn / P2M_ENTRIES_PER_PAGE;
194}
195
196static inline unsigned p2m_index(unsigned long pfn)
197{
198 return pfn % P2M_ENTRIES_PER_PAGE;
199}
200
201/* Build the parallel p2m_top_mfn structures */
202void xen_build_mfn_list_list(void)
203{
204 unsigned pfn, idx;
205
206 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn += P2M_ENTRIES_PER_PAGE) {
207 unsigned topidx = p2m_top_index(pfn);
208
209 p2m_top_mfn[topidx] = virt_to_mfn(p2m_top[topidx]);
210 }
211
212 for (idx = 0; idx < ARRAY_SIZE(p2m_top_mfn_list); idx++) {
213 unsigned topidx = idx * P2M_ENTRIES_PER_PAGE;
214 p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]);
215 }
216}
217
218void xen_setup_mfn_list_list(void)
219{
220 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
221
222 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
223 virt_to_mfn(p2m_top_mfn_list);
224 HYPERVISOR_shared_info->arch.max_pfn = xen_start_info->nr_pages;
225}
226
227/* Set up p2m_top to point to the domain-builder provided p2m pages */
228void __init xen_build_dynamic_phys_to_machine(void)
229{
230 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
231 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
232 unsigned pfn;
233
234 for (pfn = 0; pfn < max_pfn; pfn += P2M_ENTRIES_PER_PAGE) {
235 unsigned topidx = p2m_top_index(pfn);
236
237 p2m_top[topidx] = &mfn_list[pfn];
238 }
239
240 xen_build_mfn_list_list();
241}
242
243unsigned long get_phys_to_machine(unsigned long pfn)
244{
245 unsigned topidx, idx;
246
247 if (unlikely(pfn >= MAX_DOMAIN_PAGES))
248 return INVALID_P2M_ENTRY;
249
250 topidx = p2m_top_index(pfn);
251 idx = p2m_index(pfn);
252 return p2m_top[topidx][idx];
253}
254EXPORT_SYMBOL_GPL(get_phys_to_machine);
255
256/* install a new p2m_top page */
257bool install_p2mtop_page(unsigned long pfn, unsigned long *p)
258{
259 unsigned topidx = p2m_top_index(pfn);
260 unsigned long **pfnp, *mfnp;
261 unsigned i;
262
263 pfnp = &p2m_top[topidx];
264 mfnp = &p2m_top_mfn[topidx];
265
266 for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++)
267 p[i] = INVALID_P2M_ENTRY;
268
269 if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) {
270 *mfnp = virt_to_mfn(p);
271 return true;
272 }
273
274 return false;
275}
276
277static void alloc_p2m(unsigned long pfn)
278{
279 unsigned long *p;
280
281 p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL);
282 BUG_ON(p == NULL);
283
284 if (!install_p2mtop_page(pfn, p))
285 free_page((unsigned long)p);
286}
287
288/* Try to install p2m mapping; fail if intermediate bits missing */
289bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
290{
291 unsigned topidx, idx;
292
293 if (unlikely(pfn >= MAX_DOMAIN_PAGES)) {
294 BUG_ON(mfn != INVALID_P2M_ENTRY);
295 return true;
296 }
297
298 topidx = p2m_top_index(pfn);
299 if (p2m_top[topidx] == p2m_missing) {
300 if (mfn == INVALID_P2M_ENTRY)
301 return true;
302 return false;
303 }
304
305 idx = p2m_index(pfn);
306 p2m_top[topidx][idx] = mfn;
307
308 return true;
309}
310
311void set_phys_to_machine(unsigned long pfn, unsigned long mfn)
312{
313 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
314 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
315 return;
316 }
317
318 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
319 alloc_p2m(pfn);
320
321 if (!__set_phys_to_machine(pfn, mfn))
322 BUG();
323 }
324}
325
326unsigned long arbitrary_virt_to_mfn(void *vaddr) 122unsigned long arbitrary_virt_to_mfn(void *vaddr)
327{ 123{
328 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr); 124 xmaddr_t maddr = arbitrary_virt_to_machine(vaddr);
@@ -351,6 +147,7 @@ xmaddr_t arbitrary_virt_to_machine(void *vaddr)
351 offset = address & ~PAGE_MASK; 147 offset = address & ~PAGE_MASK;
352 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset); 148 return XMADDR(((phys_addr_t)pte_mfn(*pte) << PAGE_SHIFT) + offset);
353} 149}
150EXPORT_SYMBOL_GPL(arbitrary_virt_to_machine);
354 151
355void make_lowmem_page_readonly(void *vaddr) 152void make_lowmem_page_readonly(void *vaddr)
356{ 153{
@@ -359,7 +156,8 @@ void make_lowmem_page_readonly(void *vaddr)
359 unsigned int level; 156 unsigned int level;
360 157
361 pte = lookup_address(address, &level); 158 pte = lookup_address(address, &level);
362 BUG_ON(pte == NULL); 159 if (pte == NULL)
160 return; /* vaddr missing */
363 161
364 ptev = pte_wrprotect(*pte); 162 ptev = pte_wrprotect(*pte);
365 163
@@ -374,7 +172,8 @@ void make_lowmem_page_readwrite(void *vaddr)
374 unsigned int level; 172 unsigned int level;
375 173
376 pte = lookup_address(address, &level); 174 pte = lookup_address(address, &level);
377 BUG_ON(pte == NULL); 175 if (pte == NULL)
176 return; /* vaddr missing */
378 177
379 ptev = pte_mkwrite(*pte); 178 ptev = pte_mkwrite(*pte);
380 179
@@ -390,12 +189,7 @@ static bool xen_page_pinned(void *ptr)
390 return PagePinned(page); 189 return PagePinned(page);
391} 190}
392 191
393static bool xen_iomap_pte(pte_t pte) 192void xen_set_domain_pte(pte_t *ptep, pte_t pteval, unsigned domid)
394{
395 return pte_flags(pte) & _PAGE_IOMAP;
396}
397
398static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
399{ 193{
400 struct multicall_space mcs; 194 struct multicall_space mcs;
401 struct mmu_update *u; 195 struct mmu_update *u;
@@ -404,13 +198,14 @@ static void xen_set_iomap_pte(pte_t *ptep, pte_t pteval)
404 u = mcs.args; 198 u = mcs.args;
405 199
406 /* ptep might be kmapped when using 32-bit HIGHPTE */ 200 /* ptep might be kmapped when using 32-bit HIGHPTE */
407 u->ptr = arbitrary_virt_to_machine(ptep).maddr; 201 u->ptr = virt_to_machine(ptep).maddr;
408 u->val = pte_val_ma(pteval); 202 u->val = pte_val_ma(pteval);
409 203
410 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_IO); 204 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, domid);
411 205
412 xen_mc_issue(PARAVIRT_LAZY_MMU); 206 xen_mc_issue(PARAVIRT_LAZY_MMU);
413} 207}
208EXPORT_SYMBOL_GPL(xen_set_domain_pte);
414 209
415static void xen_extend_mmu_update(const struct mmu_update *update) 210static void xen_extend_mmu_update(const struct mmu_update *update)
416{ 211{
@@ -420,27 +215,17 @@ static void xen_extend_mmu_update(const struct mmu_update *update)
420 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); 215 mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u));
421 216
422 if (mcs.mc != NULL) { 217 if (mcs.mc != NULL) {
423 ADD_STATS(mmu_update_extended, 1);
424 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], -1);
425
426 mcs.mc->args[1]++; 218 mcs.mc->args[1]++;
427
428 if (mcs.mc->args[1] < MMU_UPDATE_HISTO)
429 ADD_STATS(mmu_update_histo[mcs.mc->args[1]], 1);
430 else
431 ADD_STATS(mmu_update_histo[0], 1);
432 } else { 219 } else {
433 ADD_STATS(mmu_update, 1);
434 mcs = __xen_mc_entry(sizeof(*u)); 220 mcs = __xen_mc_entry(sizeof(*u));
435 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); 221 MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF);
436 ADD_STATS(mmu_update_histo[1], 1);
437 } 222 }
438 223
439 u = mcs.args; 224 u = mcs.args;
440 *u = *update; 225 *u = *update;
441} 226}
442 227
443void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) 228static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
444{ 229{
445 struct mmu_update u; 230 struct mmu_update u;
446 231
@@ -453,17 +238,13 @@ void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val)
453 u.val = pmd_val_ma(val); 238 u.val = pmd_val_ma(val);
454 xen_extend_mmu_update(&u); 239 xen_extend_mmu_update(&u);
455 240
456 ADD_STATS(pmd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
457
458 xen_mc_issue(PARAVIRT_LAZY_MMU); 241 xen_mc_issue(PARAVIRT_LAZY_MMU);
459 242
460 preempt_enable(); 243 preempt_enable();
461} 244}
462 245
463void xen_set_pmd(pmd_t *ptr, pmd_t val) 246static void xen_set_pmd(pmd_t *ptr, pmd_t val)
464{ 247{
465 ADD_STATS(pmd_update, 1);
466
467 /* If page is not pinned, we can just update the entry 248 /* If page is not pinned, we can just update the entry
468 directly */ 249 directly */
469 if (!xen_page_pinned(ptr)) { 250 if (!xen_page_pinned(ptr)) {
@@ -471,8 +252,6 @@ void xen_set_pmd(pmd_t *ptr, pmd_t val)
471 return; 252 return;
472 } 253 }
473 254
474 ADD_STATS(pmd_update_pinned, 1);
475
476 xen_set_pmd_hyper(ptr, val); 255 xen_set_pmd_hyper(ptr, val);
477} 256}
478 257
@@ -485,35 +264,34 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags)
485 set_pte_vaddr(vaddr, mfn_pte(mfn, flags)); 264 set_pte_vaddr(vaddr, mfn_pte(mfn, flags));
486} 265}
487 266
488void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, 267static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval)
489 pte_t *ptep, pte_t pteval)
490{ 268{
491 if (xen_iomap_pte(pteval)) { 269 struct mmu_update u;
492 xen_set_iomap_pte(ptep, pteval); 270
493 goto out; 271 if (paravirt_get_lazy_mode() != PARAVIRT_LAZY_MMU)
494 } 272 return false;
495 273
496 ADD_STATS(set_pte_at, 1); 274 xen_mc_batch();
497// ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep));
498 ADD_STATS(set_pte_at_current, mm == current->mm);
499 ADD_STATS(set_pte_at_kernel, mm == &init_mm);
500 275
501 if (mm == current->mm || mm == &init_mm) { 276 u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE;
502 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { 277 u.val = pte_val_ma(pteval);
503 struct multicall_space mcs; 278 xen_extend_mmu_update(&u);
504 mcs = xen_mc_entry(0);
505 279
506 MULTI_update_va_mapping(mcs.mc, addr, pteval, 0); 280 xen_mc_issue(PARAVIRT_LAZY_MMU);
507 ADD_STATS(set_pte_at_batched, 1); 281
508 xen_mc_issue(PARAVIRT_LAZY_MMU); 282 return true;
509 goto out; 283}
510 } else
511 if (HYPERVISOR_update_va_mapping(addr, pteval, 0) == 0)
512 goto out;
513 }
514 xen_set_pte(ptep, pteval);
515 284
516out: return; 285static void xen_set_pte(pte_t *ptep, pte_t pteval)
286{
287 if (!xen_batched_set_pte(ptep, pteval))
288 native_set_pte(ptep, pteval);
289}
290
291static void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
292 pte_t *ptep, pte_t pteval)
293{
294 xen_set_pte(ptep, pteval);
517} 295}
518 296
519pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, 297pte_t xen_ptep_modify_prot_start(struct mm_struct *mm,
@@ -530,13 +308,10 @@ void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
530 308
531 xen_mc_batch(); 309 xen_mc_batch();
532 310
533 u.ptr = arbitrary_virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; 311 u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD;
534 u.val = pte_val_ma(pte); 312 u.val = pte_val_ma(pte);
535 xen_extend_mmu_update(&u); 313 xen_extend_mmu_update(&u);
536 314
537 ADD_STATS(prot_commit, 1);
538 ADD_STATS(prot_commit_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
539
540 xen_mc_issue(PARAVIRT_LAZY_MMU); 315 xen_mc_issue(PARAVIRT_LAZY_MMU);
541} 316}
542 317
@@ -557,7 +332,34 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
557 if (val & _PAGE_PRESENT) { 332 if (val & _PAGE_PRESENT) {
558 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; 333 unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
559 pteval_t flags = val & PTE_FLAGS_MASK; 334 pteval_t flags = val & PTE_FLAGS_MASK;
560 val = ((pteval_t)pfn_to_mfn(pfn) << PAGE_SHIFT) | flags; 335 unsigned long mfn;
336
337 if (!xen_feature(XENFEAT_auto_translated_physmap))
338 mfn = get_phys_to_machine(pfn);
339 else
340 mfn = pfn;
341 /*
342 * If there's no mfn for the pfn, then just create an
343 * empty non-present pte. Unfortunately this loses
344 * information about the original pfn, so
345 * pte_mfn_to_pfn is asymmetric.
346 */
347 if (unlikely(mfn == INVALID_P2M_ENTRY)) {
348 mfn = 0;
349 flags = 0;
350 } else {
351 /*
 352 * It is paramount to do this test _after_ the
 353 * INVALID_P2M_ENTRY check, as INVALID_P2M_ENTRY &
 354 * IDENTITY_FRAME_BIT resolves to true.
355 */
356 mfn &= ~FOREIGN_FRAME_BIT;
357 if (mfn & IDENTITY_FRAME_BIT) {
358 mfn &= ~IDENTITY_FRAME_BIT;
359 flags |= _PAGE_IOMAP;
360 }
361 }
362 val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
561 } 363 }
562 364
563 return val; 365 return val;
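The new pfn-to-mfn path above folds two flag bits out of the p2m entry before building the pte. A sketch of just that masking, restated from the hunk; the exact bit positions of FOREIGN_FRAME_BIT and IDENTITY_FRAME_BIT are not shown here and are left symbolic, and the helper name is illustrative.

	/* Mirror of the masking order used in pte_pfn_to_mfn() above. */
	static unsigned long example_p2m_entry_to_mfn(unsigned long entry,
						      pteval_t *flags)
	{
		if (entry == INVALID_P2M_ENTRY) {
			*flags = 0;			/* emit an empty, non-present pte */
			return 0;
		}

		entry &= ~FOREIGN_FRAME_BIT;		/* foreign frames map normally */

		if (entry & IDENTITY_FRAME_BIT) {	/* 1:1 (identity) frame */
			entry &= ~IDENTITY_FRAME_BIT;
			*flags |= _PAGE_IOMAP;		/* treat it as an IO mapping */
		}

		return entry;
	}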
@@ -577,25 +379,71 @@ static pteval_t iomap_pte(pteval_t val)
577 return val; 379 return val;
578} 380}
579 381
580pteval_t xen_pte_val(pte_t pte) 382static pteval_t xen_pte_val(pte_t pte)
581{ 383{
582 if (xen_initial_domain() && (pte.pte & _PAGE_IOMAP)) 384 pteval_t pteval = pte.pte;
583 return pte.pte;
584 385
585 return pte_mfn_to_pfn(pte.pte); 386 /* If this is a WC pte, convert back from Xen WC to Linux WC */
387 if ((pteval & (_PAGE_PAT | _PAGE_PCD | _PAGE_PWT)) == _PAGE_PAT) {
388 WARN_ON(!pat_enabled);
389 pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
390 }
391
392 if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
393 return pteval;
394
395 return pte_mfn_to_pfn(pteval);
586} 396}
587PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); 397PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
588 398
589pgdval_t xen_pgd_val(pgd_t pgd) 399static pgdval_t xen_pgd_val(pgd_t pgd)
590{ 400{
591 return pte_mfn_to_pfn(pgd.pgd); 401 return pte_mfn_to_pfn(pgd.pgd);
592} 402}
593PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); 403PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val);
594 404
595pte_t xen_make_pte(pteval_t pte) 405/*
406 * Xen's PAT setup is part of its ABI, though I assume entries 6 & 7
407 * are reserved for now, to correspond to the Intel-reserved PAT
408 * types.
409 *
410 * We expect Linux's PAT set as follows:
411 *
412 * Idx PTE flags Linux Xen Default
413 * 0 WB WB WB
414 * 1 PWT WC WT WT
415 * 2 PCD UC- UC- UC-
416 * 3 PCD PWT UC UC UC
417 * 4 PAT WB WC WB
418 * 5 PAT PWT WC WP WT
419 * 6 PAT PCD UC- UC UC-
420 * 7 PAT PCD PWT UC UC UC
421 */
422
423void xen_set_pat(u64 pat)
424{
425 /* We expect Linux to use a PAT setting of
426 * UC UC- WC WB (ignoring the PAT flag) */
427 WARN_ON(pat != 0x0007010600070106ull);
428}
429
430static pte_t xen_make_pte(pteval_t pte)
596{ 431{
597 phys_addr_t addr = (pte & PTE_PFN_MASK); 432 phys_addr_t addr = (pte & PTE_PFN_MASK);
598 433
434 /* If Linux is trying to set a WC pte, then map to the Xen WC.
435 * If _PAGE_PAT is set, then it probably means it is really
436 * _PAGE_PSE, so avoid fiddling with the PAT mapping and hope
437 * things work out OK...
438 *
439 * (We should never see kernel mappings with _PAGE_PSE set,
440 * but we could see hugetlbfs mappings, I think.).
441 */
442 if (pat_enabled && !WARN_ON(pte & _PAGE_PAT)) {
443 if ((pte & (_PAGE_PCD | _PAGE_PWT)) == _PAGE_PWT)
444 pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
445 }
446
599 /* 447 /*
600 * Unprivileged domains are allowed to do IOMAPpings for 448 * Unprivileged domains are allowed to do IOMAPpings for
601 * PCI passthrough, but not map ISA space. The ISA 449 * PCI passthrough, but not map ISA space. The ISA
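As an aside on the magic number WARN_ON-checked in xen_set_pat() earlier in this hunk: decoded one byte per PAT entry (entry 0 in the low byte, using the architectural memory-type encodings 0=UC, 1=WC, 4=WT, 5=WP, 6=WB, 7=UC-), 0x0007010600070106 is exactly the Linux layout the comment table above describes.

	/*
	 * 0x0007010600070106, low byte first:
	 *   entry 0: 0x06  WB       entry 4: 0x06  WB
	 *   entry 1: 0x01  WC       entry 5: 0x01  WC
	 *   entry 2: 0x07  UC-      entry 6: 0x07  UC-
	 *   entry 3: 0x00  UC       entry 7: 0x00  UC
	 * i.e. "UC UC- WC WB" repeated in both halves, matching the Linux
	 * column of the table above with the PAT bit ignored.
	 */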
@@ -614,20 +462,55 @@ pte_t xen_make_pte(pteval_t pte)
614} 462}
615PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); 463PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte);
616 464
617pgd_t xen_make_pgd(pgdval_t pgd) 465#ifdef CONFIG_XEN_DEBUG
466pte_t xen_make_pte_debug(pteval_t pte)
467{
468 phys_addr_t addr = (pte & PTE_PFN_MASK);
469 phys_addr_t other_addr;
470 bool io_page = false;
471 pte_t _pte;
472
473 if (pte & _PAGE_IOMAP)
474 io_page = true;
475
476 _pte = xen_make_pte(pte);
477
478 if (!addr)
479 return _pte;
480
481 if (io_page &&
482 (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
483 other_addr = pfn_to_mfn(addr >> PAGE_SHIFT) << PAGE_SHIFT;
484 WARN_ONCE(addr != other_addr,
485 "0x%lx is using VM_IO, but it is 0x%lx!\n",
486 (unsigned long)addr, (unsigned long)other_addr);
487 } else {
488 pteval_t iomap_set = (_pte.pte & PTE_FLAGS_MASK) & _PAGE_IOMAP;
489 other_addr = (_pte.pte & PTE_PFN_MASK);
490 WARN_ONCE((addr == other_addr) && (!io_page) && (!iomap_set),
491 "0x%lx is missing VM_IO (and wasn't fixed)!\n",
492 (unsigned long)addr);
493 }
494
495 return _pte;
496}
497PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_debug);
498#endif
499
500static pgd_t xen_make_pgd(pgdval_t pgd)
618{ 501{
619 pgd = pte_pfn_to_mfn(pgd); 502 pgd = pte_pfn_to_mfn(pgd);
620 return native_make_pgd(pgd); 503 return native_make_pgd(pgd);
621} 504}
622PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); 505PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd);
623 506
624pmdval_t xen_pmd_val(pmd_t pmd) 507static pmdval_t xen_pmd_val(pmd_t pmd)
625{ 508{
626 return pte_mfn_to_pfn(pmd.pmd); 509 return pte_mfn_to_pfn(pmd.pmd);
627} 510}
628PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); 511PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val);
629 512
630void xen_set_pud_hyper(pud_t *ptr, pud_t val) 513static void xen_set_pud_hyper(pud_t *ptr, pud_t val)
631{ 514{
632 struct mmu_update u; 515 struct mmu_update u;
633 516
@@ -640,17 +523,13 @@ void xen_set_pud_hyper(pud_t *ptr, pud_t val)
640 u.val = pud_val_ma(val); 523 u.val = pud_val_ma(val);
641 xen_extend_mmu_update(&u); 524 xen_extend_mmu_update(&u);
642 525
643 ADD_STATS(pud_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
644
645 xen_mc_issue(PARAVIRT_LAZY_MMU); 526 xen_mc_issue(PARAVIRT_LAZY_MMU);
646 527
647 preempt_enable(); 528 preempt_enable();
648} 529}
649 530
650void xen_set_pud(pud_t *ptr, pud_t val) 531static void xen_set_pud(pud_t *ptr, pud_t val)
651{ 532{
652 ADD_STATS(pud_update, 1);
653
654 /* If page is not pinned, we can just update the entry 533 /* If page is not pinned, we can just update the entry
655 directly */ 534 directly */
656 if (!xen_page_pinned(ptr)) { 535 if (!xen_page_pinned(ptr)) {
@@ -658,56 +537,28 @@ void xen_set_pud(pud_t *ptr, pud_t val)
658 return; 537 return;
659 } 538 }
660 539
661 ADD_STATS(pud_update_pinned, 1);
662
663 xen_set_pud_hyper(ptr, val); 540 xen_set_pud_hyper(ptr, val);
664} 541}
665 542
666void xen_set_pte(pte_t *ptep, pte_t pte)
667{
668 if (xen_iomap_pte(pte)) {
669 xen_set_iomap_pte(ptep, pte);
670 return;
671 }
672
673 ADD_STATS(pte_update, 1);
674// ADD_STATS(pte_update_pinned, xen_page_pinned(ptep));
675 ADD_STATS(pte_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
676
677#ifdef CONFIG_X86_PAE 543#ifdef CONFIG_X86_PAE
678 ptep->pte_high = pte.pte_high; 544static void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
679 smp_wmb();
680 ptep->pte_low = pte.pte_low;
681#else
682 *ptep = pte;
683#endif
684}
685
686#ifdef CONFIG_X86_PAE
687void xen_set_pte_atomic(pte_t *ptep, pte_t pte)
688{ 545{
689 if (xen_iomap_pte(pte)) {
690 xen_set_iomap_pte(ptep, pte);
691 return;
692 }
693
694 set_64bit((u64 *)ptep, native_pte_val(pte)); 546 set_64bit((u64 *)ptep, native_pte_val(pte));
695} 547}
696 548
697void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) 549static void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
698{ 550{
699 ptep->pte_low = 0; 551 if (!xen_batched_set_pte(ptep, native_make_pte(0)))
700 smp_wmb(); /* make sure low gets written first */ 552 native_pte_clear(mm, addr, ptep);
701 ptep->pte_high = 0;
702} 553}
703 554
704void xen_pmd_clear(pmd_t *pmdp) 555static void xen_pmd_clear(pmd_t *pmdp)
705{ 556{
706 set_pmd(pmdp, __pmd(0)); 557 set_pmd(pmdp, __pmd(0));
707} 558}
708#endif /* CONFIG_X86_PAE */ 559#endif /* CONFIG_X86_PAE */
709 560
710pmd_t xen_make_pmd(pmdval_t pmd) 561static pmd_t xen_make_pmd(pmdval_t pmd)
711{ 562{
712 pmd = pte_pfn_to_mfn(pmd); 563 pmd = pte_pfn_to_mfn(pmd);
713 return native_make_pmd(pmd); 564 return native_make_pmd(pmd);
@@ -715,13 +566,13 @@ pmd_t xen_make_pmd(pmdval_t pmd)
715PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); 566PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
716 567
717#if PAGETABLE_LEVELS == 4 568#if PAGETABLE_LEVELS == 4
718pudval_t xen_pud_val(pud_t pud) 569static pudval_t xen_pud_val(pud_t pud)
719{ 570{
720 return pte_mfn_to_pfn(pud.pud); 571 return pte_mfn_to_pfn(pud.pud);
721} 572}
722PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); 573PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val);
723 574
724pud_t xen_make_pud(pudval_t pud) 575static pud_t xen_make_pud(pudval_t pud)
725{ 576{
726 pud = pte_pfn_to_mfn(pud); 577 pud = pte_pfn_to_mfn(pud);
727 578
@@ -729,7 +580,7 @@ pud_t xen_make_pud(pudval_t pud)
729} 580}
730PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); 581PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud);
731 582
732pgd_t *xen_get_user_pgd(pgd_t *pgd) 583static pgd_t *xen_get_user_pgd(pgd_t *pgd)
733{ 584{
734 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); 585 pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK);
735 unsigned offset = pgd - pgd_page; 586 unsigned offset = pgd - pgd_page;
@@ -761,7 +612,7 @@ static void __xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
761 * 2. It is always pinned 612 * 2. It is always pinned
762 * 3. It has no user pagetable attached to it 613 * 3. It has no user pagetable attached to it
763 */ 614 */
764void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val) 615static void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
765{ 616{
766 preempt_disable(); 617 preempt_disable();
767 618
@@ -774,12 +625,10 @@ void __init xen_set_pgd_hyper(pgd_t *ptr, pgd_t val)
774 preempt_enable(); 625 preempt_enable();
775} 626}
776 627
777void xen_set_pgd(pgd_t *ptr, pgd_t val) 628static void xen_set_pgd(pgd_t *ptr, pgd_t val)
778{ 629{
779 pgd_t *user_ptr = xen_get_user_pgd(ptr); 630 pgd_t *user_ptr = xen_get_user_pgd(ptr);
780 631
781 ADD_STATS(pgd_update, 1);
782
783 /* If page is not pinned, we can just update the entry 632 /* If page is not pinned, we can just update the entry
784 directly */ 633 directly */
785 if (!xen_page_pinned(ptr)) { 634 if (!xen_page_pinned(ptr)) {
@@ -791,9 +640,6 @@ void xen_set_pgd(pgd_t *ptr, pgd_t val)
791 return; 640 return;
792 } 641 }
793 642
794 ADD_STATS(pgd_update_pinned, 1);
795 ADD_STATS(pgd_update_batched, paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU);
796
797 /* If it's pinned, then we can at least batch the kernel and 643 /* If it's pinned, then we can at least batch the kernel and
798 user updates together. */ 644 user updates together. */
799 xen_mc_batch(); 645 xen_mc_batch();
@@ -1068,10 +914,9 @@ static void xen_pgd_pin(struct mm_struct *mm)
1068 */ 914 */
1069void xen_mm_pin_all(void) 915void xen_mm_pin_all(void)
1070{ 916{
1071 unsigned long flags;
1072 struct page *page; 917 struct page *page;
1073 918
1074 spin_lock_irqsave(&pgd_lock, flags); 919 spin_lock(&pgd_lock);
1075 920
1076 list_for_each_entry(page, &pgd_list, lru) { 921 list_for_each_entry(page, &pgd_list, lru) {
1077 if (!PagePinned(page)) { 922 if (!PagePinned(page)) {
@@ -1080,7 +925,7 @@ void xen_mm_pin_all(void)
1080 } 925 }
1081 } 926 }
1082 927
1083 spin_unlock_irqrestore(&pgd_lock, flags); 928 spin_unlock(&pgd_lock);
1084} 929}
1085 930
1086/* 931/*
@@ -1088,7 +933,7 @@ void xen_mm_pin_all(void)
1088 * that's before we have page structures to store the bits. So do all 933 * that's before we have page structures to store the bits. So do all
1089 * the book-keeping now. 934 * the book-keeping now.
1090 */ 935 */
1091static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, 936static int __init xen_mark_pinned(struct mm_struct *mm, struct page *page,
1092 enum pt_level level) 937 enum pt_level level)
1093{ 938{
1094 SetPagePinned(page); 939 SetPagePinned(page);
@@ -1181,10 +1026,9 @@ static void xen_pgd_unpin(struct mm_struct *mm)
1181 */ 1026 */
1182void xen_mm_unpin_all(void) 1027void xen_mm_unpin_all(void)
1183{ 1028{
1184 unsigned long flags;
1185 struct page *page; 1029 struct page *page;
1186 1030
1187 spin_lock_irqsave(&pgd_lock, flags); 1031 spin_lock(&pgd_lock);
1188 1032
1189 list_for_each_entry(page, &pgd_list, lru) { 1033 list_for_each_entry(page, &pgd_list, lru) {
1190 if (PageSavePinned(page)) { 1034 if (PageSavePinned(page)) {
@@ -1194,17 +1038,17 @@ void xen_mm_unpin_all(void)
1194 } 1038 }
1195 } 1039 }
1196 1040
1197 spin_unlock_irqrestore(&pgd_lock, flags); 1041 spin_unlock(&pgd_lock);
1198} 1042}
1199 1043
1200void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next) 1044static void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next)
1201{ 1045{
1202 spin_lock(&next->page_table_lock); 1046 spin_lock(&next->page_table_lock);
1203 xen_pgd_pin(next); 1047 xen_pgd_pin(next);
1204 spin_unlock(&next->page_table_lock); 1048 spin_unlock(&next->page_table_lock);
1205} 1049}
1206 1050
1207void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) 1051static void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm)
1208{ 1052{
1209 spin_lock(&mm->page_table_lock); 1053 spin_lock(&mm->page_table_lock);
1210 xen_pgd_pin(mm); 1054 xen_pgd_pin(mm);
@@ -1222,7 +1066,7 @@ static void drop_other_mm_ref(void *info)
1222 1066
1223 active_mm = percpu_read(cpu_tlbstate.active_mm); 1067 active_mm = percpu_read(cpu_tlbstate.active_mm);
1224 1068
1225 if (active_mm == mm) 1069 if (active_mm == mm && percpu_read(cpu_tlbstate.state) != TLBSTATE_OK)
1226 leave_mm(smp_processor_id()); 1070 leave_mm(smp_processor_id());
1227 1071
1228 /* If this cpu still has a stale cr3 reference, then make sure 1072 /* If this cpu still has a stale cr3 reference, then make sure
@@ -1291,7 +1135,7 @@ static void xen_drop_mm_ref(struct mm_struct *mm)
1291 * pagetable because of lazy tlb flushing. This means we need to 1135 * pagetable because of lazy tlb flushing. This means we need to
1292 * switch all CPUs off this pagetable before we can unpin it. 1136 * switch all CPUs off this pagetable before we can unpin it.
1293 */ 1137 */
1294void xen_exit_mmap(struct mm_struct *mm) 1138static void xen_exit_mmap(struct mm_struct *mm)
1295{ 1139{
1296 get_cpu(); /* make sure we don't move around */ 1140 get_cpu(); /* make sure we don't move around */
1297 xen_drop_mm_ref(mm); 1141 xen_drop_mm_ref(mm);
@@ -1306,13 +1150,27 @@ void xen_exit_mmap(struct mm_struct *mm)
1306 spin_unlock(&mm->page_table_lock); 1150 spin_unlock(&mm->page_table_lock);
1307} 1151}
1308 1152
1309static __init void xen_pagetable_setup_start(pgd_t *base) 1153static void __init xen_pagetable_setup_start(pgd_t *base)
1310{ 1154{
1311} 1155}
1312 1156
1157static __init void xen_mapping_pagetable_reserve(u64 start, u64 end)
1158{
1159 /* reserve the range used */
1160 native_pagetable_reserve(start, end);
1161
1162 /* set as RW the rest */
1163 printk(KERN_DEBUG "xen: setting RW the range %llx - %llx\n", end,
1164 PFN_PHYS(pgt_buf_top));
1165 while (end < PFN_PHYS(pgt_buf_top)) {
1166 make_lowmem_page_readwrite(__va(end));
1167 end += PAGE_SIZE;
1168 }
1169}
1170
1313static void xen_post_allocator_init(void); 1171static void xen_post_allocator_init(void);
1314 1172
1315static __init void xen_pagetable_setup_done(pgd_t *base) 1173static void __init xen_pagetable_setup_done(pgd_t *base)
1316{ 1174{
1317 xen_setup_shared_info(); 1175 xen_setup_shared_info();
1318 xen_post_allocator_init(); 1176 xen_post_allocator_init();
@@ -1374,7 +1232,11 @@ static void xen_flush_tlb_others(const struct cpumask *cpus,
1374{ 1232{
1375 struct { 1233 struct {
1376 struct mmuext_op op; 1234 struct mmuext_op op;
1235#ifdef CONFIG_SMP
1236 DECLARE_BITMAP(mask, num_processors);
1237#else
1377 DECLARE_BITMAP(mask, NR_CPUS); 1238 DECLARE_BITMAP(mask, NR_CPUS);
1239#endif
1378 } *args; 1240 } *args;
1379 struct multicall_space mcs; 1241 struct multicall_space mcs;
1380 1242
@@ -1509,7 +1371,7 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1509} 1371}
1510 1372
1511#ifdef CONFIG_X86_32 1373#ifdef CONFIG_X86_32
1512static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1374static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1513{ 1375{
1514 /* If there's an existing pte, then don't allow _PAGE_RW to be set */ 1376 /* If there's an existing pte, then don't allow _PAGE_RW to be set */
1515 if (pte_val_ma(*ptep) & _PAGE_PRESENT) 1377 if (pte_val_ma(*ptep) & _PAGE_PRESENT)
@@ -1518,16 +1380,34 @@ static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1518 1380
1519 return pte; 1381 return pte;
1520} 1382}
1383#else /* CONFIG_X86_64 */
1384static pte_t __init mask_rw_pte(pte_t *ptep, pte_t pte)
1385{
1386 unsigned long pfn = pte_pfn(pte);
1387
1388 /*
1389 * If the new pfn is within the range of the newly allocated
1390 * kernel pagetable, and it isn't being mapped into an
1391 * early_ioremap fixmap slot as a freshly allocated page, make sure
1392 * it is RO.
1393 */
1394 if (((!is_early_ioremap_ptep(ptep) &&
1395 pfn >= pgt_buf_start && pfn < pgt_buf_top)) ||
1396 (is_early_ioremap_ptep(ptep) && pfn != (pgt_buf_end - 1)))
1397 pte = pte_wrprotect(pte);
1398
1399 return pte;
1400}
1401#endif /* CONFIG_X86_64 */
1521 1402
1522/* Init-time set_pte while constructing initial pagetables, which 1403/* Init-time set_pte while constructing initial pagetables, which
1523 doesn't allow RO pagetable pages to be remapped RW */ 1404 doesn't allow RO pagetable pages to be remapped RW */
1524static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) 1405static void __init xen_set_pte_init(pte_t *ptep, pte_t pte)
1525{ 1406{
1526 pte = mask_rw_pte(ptep, pte); 1407 pte = mask_rw_pte(ptep, pte);
1527 1408
1528 xen_set_pte(ptep, pte); 1409 xen_set_pte(ptep, pte);
1529} 1410}
1530#endif
1531 1411
1532static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) 1412static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1533{ 1413{
@@ -1540,7 +1420,7 @@ static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn)
1540 1420
1541/* Early in boot, while setting up the initial pagetable, assume 1421/* Early in boot, while setting up the initial pagetable, assume
1542 everything is pinned. */ 1422 everything is pinned. */
1543static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) 1423static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1544{ 1424{
1545#ifdef CONFIG_FLATMEM 1425#ifdef CONFIG_FLATMEM
1546 BUG_ON(mem_map); /* should only be used early */ 1426 BUG_ON(mem_map); /* should only be used early */
@@ -1550,7 +1430,7 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn)
1550} 1430}
1551 1431
1552/* Used for pmd and pud */ 1432/* Used for pmd and pud */
1553static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) 1433static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1554{ 1434{
1555#ifdef CONFIG_FLATMEM 1435#ifdef CONFIG_FLATMEM
1556 BUG_ON(mem_map); /* should only be used early */ 1436 BUG_ON(mem_map); /* should only be used early */
@@ -1560,13 +1440,13 @@ static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn)
1560 1440
1561/* Early release_pte assumes that all pts are pinned, since there's 1441/* Early release_pte assumes that all pts are pinned, since there's
1562 only init_mm and anything attached to that is pinned. */ 1442 only init_mm and anything attached to that is pinned. */
1563static __init void xen_release_pte_init(unsigned long pfn) 1443static void __init xen_release_pte_init(unsigned long pfn)
1564{ 1444{
1565 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); 1445 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn);
1566 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1446 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1567} 1447}
1568 1448
1569static __init void xen_release_pmd_init(unsigned long pfn) 1449static void __init xen_release_pmd_init(unsigned long pfn)
1570{ 1450{
1571 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); 1451 make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
1572} 1452}
@@ -1682,6 +1562,7 @@ static void *m2v(phys_addr_t maddr)
1682 return __ka(m2p(maddr)); 1562 return __ka(m2p(maddr));
1683} 1563}
1684 1564
 1565/* Set the page permissions on identity-mapped pages */
1685static void set_page_prot(void *addr, pgprot_t prot) 1566static void set_page_prot(void *addr, pgprot_t prot)
1686{ 1567{
1687 unsigned long pfn = __pa(addr) >> PAGE_SHIFT; 1568 unsigned long pfn = __pa(addr) >> PAGE_SHIFT;
@@ -1691,12 +1572,15 @@ static void set_page_prot(void *addr, pgprot_t prot)
1691 BUG(); 1572 BUG();
1692} 1573}
1693 1574
1694static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn) 1575static void __init xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1695{ 1576{
1696 unsigned pmdidx, pteidx; 1577 unsigned pmdidx, pteidx;
1697 unsigned ident_pte; 1578 unsigned ident_pte;
1698 unsigned long pfn; 1579 unsigned long pfn;
1699 1580
1581 level1_ident_pgt = extend_brk(sizeof(pte_t) * LEVEL1_IDENT_ENTRIES,
1582 PAGE_SIZE);
1583
1700 ident_pte = 0; 1584 ident_pte = 0;
1701 pfn = 0; 1585 pfn = 0;
1702 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) { 1586 for (pmdidx = 0; pmdidx < PTRS_PER_PMD && pfn < max_pfn; pmdidx++) {
@@ -1707,7 +1591,7 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1707 pte_page = m2v(pmd[pmdidx].pmd); 1591 pte_page = m2v(pmd[pmdidx].pmd);
1708 else { 1592 else {
1709 /* Check for free pte pages */ 1593 /* Check for free pte pages */
1710 if (ident_pte == ARRAY_SIZE(level1_ident_pgt)) 1594 if (ident_pte == LEVEL1_IDENT_ENTRIES)
1711 break; 1595 break;
1712 1596
1713 pte_page = &level1_ident_pgt[ident_pte]; 1597 pte_page = &level1_ident_pgt[ident_pte];
@@ -1720,8 +1604,10 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1720 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) { 1604 for (pteidx = 0; pteidx < PTRS_PER_PTE; pteidx++, pfn++) {
1721 pte_t pte; 1605 pte_t pte;
1722 1606
1607#ifdef CONFIG_X86_32
1723 if (pfn > max_pfn_mapped) 1608 if (pfn > max_pfn_mapped)
1724 max_pfn_mapped = pfn; 1609 max_pfn_mapped = pfn;
1610#endif
1725 1611
1726 if (!pte_none(pte_page[pteidx])) 1612 if (!pte_none(pte_page[pteidx]))
1727 continue; 1613 continue;
@@ -1737,6 +1623,20 @@ static __init void xen_map_identity_early(pmd_t *pmd, unsigned long max_pfn)
1737 set_page_prot(pmd, PAGE_KERNEL_RO); 1623 set_page_prot(pmd, PAGE_KERNEL_RO);
1738} 1624}
1739 1625
1626void __init xen_setup_machphys_mapping(void)
1627{
1628 struct xen_machphys_mapping mapping;
1629 unsigned long machine_to_phys_nr_ents;
1630
1631 if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) {
1632 machine_to_phys_mapping = (unsigned long *)mapping.v_start;
1633 machine_to_phys_nr_ents = mapping.max_mfn + 1;
1634 } else {
1635 machine_to_phys_nr_ents = MACH2PHYS_NR_ENTRIES;
1636 }
1637 machine_to_phys_order = fls(machine_to_phys_nr_ents - 1);
1638}
1639
1740#ifdef CONFIG_X86_64 1640#ifdef CONFIG_X86_64
1741static void convert_pfn_mfn(void *v) 1641static void convert_pfn_mfn(void *v)
1742{ 1642{
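A small arithmetic note on the new xen_setup_machphys_mapping() above: fls(n - 1) yields ceil(log2(n)), so machine_to_phys_order ends up as the number of index bits needed for the machine-to-physical table. A worked example with numbers chosen purely for illustration:

	/*
	 * If the hypervisor reports max_mfn = 0xfffff, then
	 *   machine_to_phys_nr_ents = 0x100000   (1M entries)
	 *   fls(0x100000 - 1) = fls(0xfffff) = 20
	 * so machine_to_phys_order = 20, i.e. a 2^20-entry table.
	 */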
@@ -1750,7 +1650,7 @@ static void convert_pfn_mfn(void *v)
1750} 1650}
1751 1651
1752/* 1652/*
1753 * Set up the inital kernel pagetable. 1653 * Set up the initial kernel pagetable.
1754 * 1654 *
1755 * We can construct this by grafting the Xen provided pagetable into 1655 * We can construct this by grafting the Xen provided pagetable into
1756 * head_64.S's preconstructed pagetables. We copy the Xen L2's into 1656 * head_64.S's preconstructed pagetables. We copy the Xen L2's into
@@ -1760,12 +1660,18 @@ static void convert_pfn_mfn(void *v)
1760 * of the physical mapping once some sort of allocator has been set 1660 * of the physical mapping once some sort of allocator has been set
1761 * up. 1661 * up.
1762 */ 1662 */
1763__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1663pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1764 unsigned long max_pfn) 1664 unsigned long max_pfn)
1765{ 1665{
1766 pud_t *l3; 1666 pud_t *l3;
1767 pmd_t *l2; 1667 pmd_t *l2;
1768 1668
1669 /* max_pfn_mapped is the last pfn mapped in the initial memory
1670 * mappings. Considering that on Xen after the kernel mappings we
1671 * have the mappings of some pages that don't exist in pfn space, we
1672 * set max_pfn_mapped to the last real pfn mapped. */
1673 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));
1674
1769 /* Zap identity mapping */ 1675 /* Zap identity mapping */
1770 init_level4_pgt[0] = __pgd(0); 1676 init_level4_pgt[0] = __pgd(0);
1771 1677
@@ -1814,7 +1720,7 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1814 __xen_write_cr3(true, __pa(pgd)); 1720 __xen_write_cr3(true, __pa(pgd));
1815 xen_mc_issue(PARAVIRT_LAZY_CPU); 1721 xen_mc_issue(PARAVIRT_LAZY_CPU);
1816 1722
1817 reserve_early(__pa(xen_start_info->pt_base), 1723 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1818 __pa(xen_start_info->pt_base + 1724 __pa(xen_start_info->pt_base +
1819 xen_start_info->nr_pt_frames * PAGE_SIZE), 1725 xen_start_info->nr_pt_frames * PAGE_SIZE),
1820 "XEN PAGETABLES"); 1726 "XEN PAGETABLES");
@@ -1822,45 +1728,88 @@ __init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd,
1822 return pgd; 1728 return pgd;
1823} 1729}
1824#else /* !CONFIG_X86_64 */ 1730#else /* !CONFIG_X86_64 */
1825static pmd_t level2_kernel_pgt[PTRS_PER_PMD] __page_aligned_bss; 1731static RESERVE_BRK_ARRAY(pmd_t, initial_kernel_pmd, PTRS_PER_PMD);
1732static RESERVE_BRK_ARRAY(pmd_t, swapper_kernel_pmd, PTRS_PER_PMD);
1733
1734static void __init xen_write_cr3_init(unsigned long cr3)
1735{
1736 unsigned long pfn = PFN_DOWN(__pa(swapper_pg_dir));
1737
1738 BUG_ON(read_cr3() != __pa(initial_page_table));
1739 BUG_ON(cr3 != __pa(swapper_pg_dir));
1740
1741 /*
1742 * We are switching to swapper_pg_dir for the first time (from
1743 * initial_page_table) and therefore need to mark that page
1744 * read-only and then pin it.
1745 *
1746 * Xen disallows sharing of kernel PMDs for PAE
1747 * guests. Therefore we must copy the kernel PMD from
1748 * initial_page_table into a new kernel PMD to be used in
1749 * swapper_pg_dir.
1750 */
1751 swapper_kernel_pmd =
1752 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1753 memcpy(swapper_kernel_pmd, initial_kernel_pmd,
1754 sizeof(pmd_t) * PTRS_PER_PMD);
1755 swapper_pg_dir[KERNEL_PGD_BOUNDARY] =
1756 __pgd(__pa(swapper_kernel_pmd) | _PAGE_PRESENT);
1757 set_page_prot(swapper_kernel_pmd, PAGE_KERNEL_RO);
1758
1759 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO);
1760 xen_write_cr3(cr3);
1761 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, pfn);
1762
1763 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE,
1764 PFN_DOWN(__pa(initial_page_table)));
1765 set_page_prot(initial_page_table, PAGE_KERNEL);
1766 set_page_prot(initial_kernel_pmd, PAGE_KERNEL);
1826 1767
1827__init pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, 1768 pv_mmu_ops.write_cr3 = &xen_write_cr3;
1769}
1770
1771pgd_t * __init xen_setup_kernel_pagetable(pgd_t *pgd,
1828 unsigned long max_pfn) 1772 unsigned long max_pfn)
1829{ 1773{
1830 pmd_t *kernel_pmd; 1774 pmd_t *kernel_pmd;
1831 1775
1776 initial_kernel_pmd =
1777 extend_brk(sizeof(pmd_t) * PTRS_PER_PMD, PAGE_SIZE);
1778
1832 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) + 1779 max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->pt_base) +
1833 xen_start_info->nr_pt_frames * PAGE_SIZE + 1780 xen_start_info->nr_pt_frames * PAGE_SIZE +
1834 512*1024); 1781 512*1024);
1835 1782
1836 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd); 1783 kernel_pmd = m2v(pgd[KERNEL_PGD_BOUNDARY].pgd);
1837 memcpy(level2_kernel_pgt, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD); 1784 memcpy(initial_kernel_pmd, kernel_pmd, sizeof(pmd_t) * PTRS_PER_PMD);
1838 1785
1839 xen_map_identity_early(level2_kernel_pgt, max_pfn); 1786 xen_map_identity_early(initial_kernel_pmd, max_pfn);
1840 1787
1841 memcpy(swapper_pg_dir, pgd, sizeof(pgd_t) * PTRS_PER_PGD); 1788 memcpy(initial_page_table, pgd, sizeof(pgd_t) * PTRS_PER_PGD);
1842 set_pgd(&swapper_pg_dir[KERNEL_PGD_BOUNDARY], 1789 initial_page_table[KERNEL_PGD_BOUNDARY] =
1843 __pgd(__pa(level2_kernel_pgt) | _PAGE_PRESENT)); 1790 __pgd(__pa(initial_kernel_pmd) | _PAGE_PRESENT);
1844 1791
1845 set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO); 1792 set_page_prot(initial_kernel_pmd, PAGE_KERNEL_RO);
1846 set_page_prot(swapper_pg_dir, PAGE_KERNEL_RO); 1793 set_page_prot(initial_page_table, PAGE_KERNEL_RO);
1847 set_page_prot(empty_zero_page, PAGE_KERNEL_RO); 1794 set_page_prot(empty_zero_page, PAGE_KERNEL_RO);
1848 1795
1849 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); 1796 pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));
1850 1797
1851 xen_write_cr3(__pa(swapper_pg_dir)); 1798 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
1799 PFN_DOWN(__pa(initial_page_table)));
1800 xen_write_cr3(__pa(initial_page_table));
1852 1801
1853 pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(__pa(swapper_pg_dir))); 1802 memblock_x86_reserve_range(__pa(xen_start_info->pt_base),
1854
1855 reserve_early(__pa(xen_start_info->pt_base),
1856 __pa(xen_start_info->pt_base + 1803 __pa(xen_start_info->pt_base +
1857 xen_start_info->nr_pt_frames * PAGE_SIZE), 1804 xen_start_info->nr_pt_frames * PAGE_SIZE),
1858 "XEN PAGETABLES"); 1805 "XEN PAGETABLES");
1859 1806
1860 return swapper_pg_dir; 1807 return initial_page_table;
1861} 1808}
1862#endif /* CONFIG_X86_64 */ 1809#endif /* CONFIG_X86_64 */
1863 1810
1811static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;
1812
1864static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot) 1813static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1865{ 1814{
1866 pte_t pte; 1815 pte_t pte;
@@ -1881,15 +1830,28 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1881#else 1830#else
1882 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE: 1831 case VSYSCALL_LAST_PAGE ... VSYSCALL_FIRST_PAGE:
1883#endif 1832#endif
1884#ifdef CONFIG_X86_LOCAL_APIC
1885 case FIX_APIC_BASE: /* maps dummy local APIC */
1886#endif
1887 case FIX_TEXT_POKE0: 1833 case FIX_TEXT_POKE0:
1888 case FIX_TEXT_POKE1: 1834 case FIX_TEXT_POKE1:
1889 /* All local page mappings */ 1835 /* All local page mappings */
1890 pte = pfn_pte(phys, prot); 1836 pte = pfn_pte(phys, prot);
1891 break; 1837 break;
1892 1838
1839#ifdef CONFIG_X86_LOCAL_APIC
1840 case FIX_APIC_BASE: /* maps dummy local APIC */
1841 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1842 break;
1843#endif
1844
1845#ifdef CONFIG_X86_IO_APIC
1846 case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
1847 /*
1848 * We just don't map the IO APIC - all access is via
1849 * hypercalls. Keep the address in the pte for reference.
1850 */
1851 pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
1852 break;
1853#endif
1854
1893 case FIX_PARAVIRT_BOOTMAP: 1855 case FIX_PARAVIRT_BOOTMAP:
1894 /* This is an MFN, but it isn't an IO mapping from the 1856 /* This is an MFN, but it isn't an IO mapping from the
1895 IO domain */ 1857 IO domain */
@@ -1914,8 +1876,34 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
1914#endif 1876#endif
1915} 1877}
1916 1878
1917static __init void xen_post_allocator_init(void) 1879void __init xen_ident_map_ISA(void)
1880{
1881 unsigned long pa;
1882
1883 /*
1884 * If we're dom0, then linear map the ISA machine addresses into
1885 * the kernel's address space.
1886 */
1887 if (!xen_initial_domain())
1888 return;
1889
1890 xen_raw_printk("Xen: setup ISA identity maps\n");
1891
1892 for (pa = ISA_START_ADDRESS; pa < ISA_END_ADDRESS; pa += PAGE_SIZE) {
1893 pte_t pte = mfn_pte(PFN_DOWN(pa), PAGE_KERNEL_IO);
1894
1895 if (HYPERVISOR_update_va_mapping(PAGE_OFFSET + pa, pte, 0))
1896 BUG();
1897 }
1898
1899 xen_flush_tlb();
1900}
1901
1902static void __init xen_post_allocator_init(void)
1918{ 1903{
1904#ifdef CONFIG_XEN_DEBUG
1905 pv_mmu_ops.make_pte = PV_CALLEE_SAVE(xen_make_pte_debug);
1906#endif
1919 pv_mmu_ops.set_pte = xen_set_pte; 1907 pv_mmu_ops.set_pte = xen_set_pte;
1920 pv_mmu_ops.set_pmd = xen_set_pmd; 1908 pv_mmu_ops.set_pmd = xen_set_pmd;
1921 pv_mmu_ops.set_pud = xen_set_pud; 1909 pv_mmu_ops.set_pud = xen_set_pud;
@@ -1948,12 +1936,16 @@ static void xen_leave_lazy_mmu(void)
1948 preempt_enable(); 1936 preempt_enable();
1949} 1937}
1950 1938
1951static const struct pv_mmu_ops xen_mmu_ops __initdata = { 1939static const struct pv_mmu_ops xen_mmu_ops __initconst = {
1952 .read_cr2 = xen_read_cr2, 1940 .read_cr2 = xen_read_cr2,
1953 .write_cr2 = xen_write_cr2, 1941 .write_cr2 = xen_write_cr2,
1954 1942
1955 .read_cr3 = xen_read_cr3, 1943 .read_cr3 = xen_read_cr3,
1944#ifdef CONFIG_X86_32
1945 .write_cr3 = xen_write_cr3_init,
1946#else
1956 .write_cr3 = xen_write_cr3, 1947 .write_cr3 = xen_write_cr3,
1948#endif
1957 1949
1958 .flush_tlb_user = xen_flush_tlb, 1950 .flush_tlb_user = xen_flush_tlb,
1959 .flush_tlb_kernel = xen_flush_tlb, 1951 .flush_tlb_kernel = xen_flush_tlb,
@@ -1969,14 +1961,9 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1969 .alloc_pte = xen_alloc_pte_init, 1961 .alloc_pte = xen_alloc_pte_init,
1970 .release_pte = xen_release_pte_init, 1962 .release_pte = xen_release_pte_init,
1971 .alloc_pmd = xen_alloc_pmd_init, 1963 .alloc_pmd = xen_alloc_pmd_init,
1972 .alloc_pmd_clone = paravirt_nop,
1973 .release_pmd = xen_release_pmd_init, 1964 .release_pmd = xen_release_pmd_init,
1974 1965
1975#ifdef CONFIG_X86_64
1976 .set_pte = xen_set_pte,
1977#else
1978 .set_pte = xen_set_pte_init, 1966 .set_pte = xen_set_pte_init,
1979#endif
1980 .set_pte_at = xen_set_pte_at, 1967 .set_pte_at = xen_set_pte_at,
1981 .set_pmd = xen_set_pmd_hyper, 1968 .set_pmd = xen_set_pmd_hyper,
1982 1969
@@ -2022,11 +2009,12 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
2022 2009
2023void __init xen_init_mmu_ops(void) 2010void __init xen_init_mmu_ops(void)
2024{ 2011{
2012 x86_init.mapping.pagetable_reserve = xen_mapping_pagetable_reserve;
2025 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start; 2013 x86_init.paging.pagetable_setup_start = xen_pagetable_setup_start;
2026 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done; 2014 x86_init.paging.pagetable_setup_done = xen_pagetable_setup_done;
2027 pv_mmu_ops = xen_mmu_ops; 2015 pv_mmu_ops = xen_mmu_ops;
2028 2016
2029 vmap_lazy_unmap = false; 2017 memset(dummy_mapping, 0xff, PAGE_SIZE);
2030} 2018}
2031 2019
2032/* Protected by xen_reservation_lock. */ 2020/* Protected by xen_reservation_lock. */
@@ -2049,7 +2037,7 @@ static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
2049 in_frames[i] = virt_to_mfn(vaddr); 2037 in_frames[i] = virt_to_mfn(vaddr);
2050 2038
2051 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0); 2039 MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
2052 set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY); 2040 __set_phys_to_machine(virt_to_pfn(vaddr), INVALID_P2M_ENTRY);
2053 2041
2054 if (out_frames) 2042 if (out_frames)
2055 out_frames[i] = virt_to_pfn(vaddr); 2043 out_frames[i] = virt_to_pfn(vaddr);
@@ -2259,65 +2247,83 @@ void __init xen_hvm_init_mmu_ops(void)
2259} 2247}
2260#endif 2248#endif
2261 2249
2262#ifdef CONFIG_XEN_DEBUG_FS 2250#define REMAP_BATCH_SIZE 16
2263 2251
2264static struct dentry *d_mmu_debug; 2252struct remap_data {
2253 unsigned long mfn;
2254 pgprot_t prot;
2255 struct mmu_update *mmu_update;
2256};
2265 2257
2266static int __init xen_mmu_debugfs(void) 2258static int remap_area_mfn_pte_fn(pte_t *ptep, pgtable_t token,
2259 unsigned long addr, void *data)
2267{ 2260{
2268 struct dentry *d_xen = xen_init_debugfs(); 2261 struct remap_data *rmd = data;
2269 2262 pte_t pte = pte_mkspecial(pfn_pte(rmd->mfn++, rmd->prot));
2270 if (d_xen == NULL)
2271 return -ENOMEM;
2272 2263
2273 d_mmu_debug = debugfs_create_dir("mmu", d_xen); 2264 rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
2274 2265 rmd->mmu_update->val = pte_val_ma(pte);
2275 debugfs_create_u8("zero_stats", 0644, d_mmu_debug, &zero_stats); 2266 rmd->mmu_update++;
2276
2277 debugfs_create_u32("pgd_update", 0444, d_mmu_debug, &mmu_stats.pgd_update);
2278 debugfs_create_u32("pgd_update_pinned", 0444, d_mmu_debug,
2279 &mmu_stats.pgd_update_pinned);
2280 debugfs_create_u32("pgd_update_batched", 0444, d_mmu_debug,
2281 &mmu_stats.pgd_update_pinned);
2282
2283 debugfs_create_u32("pud_update", 0444, d_mmu_debug, &mmu_stats.pud_update);
2284 debugfs_create_u32("pud_update_pinned", 0444, d_mmu_debug,
2285 &mmu_stats.pud_update_pinned);
2286 debugfs_create_u32("pud_update_batched", 0444, d_mmu_debug,
2287 &mmu_stats.pud_update_pinned);
2288
2289 debugfs_create_u32("pmd_update", 0444, d_mmu_debug, &mmu_stats.pmd_update);
2290 debugfs_create_u32("pmd_update_pinned", 0444, d_mmu_debug,
2291 &mmu_stats.pmd_update_pinned);
2292 debugfs_create_u32("pmd_update_batched", 0444, d_mmu_debug,
2293 &mmu_stats.pmd_update_pinned);
2294
2295 debugfs_create_u32("pte_update", 0444, d_mmu_debug, &mmu_stats.pte_update);
2296// debugfs_create_u32("pte_update_pinned", 0444, d_mmu_debug,
2297// &mmu_stats.pte_update_pinned);
2298 debugfs_create_u32("pte_update_batched", 0444, d_mmu_debug,
2299 &mmu_stats.pte_update_pinned);
2300
2301 debugfs_create_u32("mmu_update", 0444, d_mmu_debug, &mmu_stats.mmu_update);
2302 debugfs_create_u32("mmu_update_extended", 0444, d_mmu_debug,
2303 &mmu_stats.mmu_update_extended);
2304 xen_debugfs_create_u32_array("mmu_update_histo", 0444, d_mmu_debug,
2305 mmu_stats.mmu_update_histo, 20);
2306
2307 debugfs_create_u32("set_pte_at", 0444, d_mmu_debug, &mmu_stats.set_pte_at);
2308 debugfs_create_u32("set_pte_at_batched", 0444, d_mmu_debug,
2309 &mmu_stats.set_pte_at_batched);
2310 debugfs_create_u32("set_pte_at_current", 0444, d_mmu_debug,
2311 &mmu_stats.set_pte_at_current);
2312 debugfs_create_u32("set_pte_at_kernel", 0444, d_mmu_debug,
2313 &mmu_stats.set_pte_at_kernel);
2314
2315 debugfs_create_u32("prot_commit", 0444, d_mmu_debug, &mmu_stats.prot_commit);
2316 debugfs_create_u32("prot_commit_batched", 0444, d_mmu_debug,
2317 &mmu_stats.prot_commit_batched);
2318 2267
2319 return 0; 2268 return 0;
2320} 2269}
2321fs_initcall(xen_mmu_debugfs);
2322 2270
2323#endif /* CONFIG_XEN_DEBUG_FS */ 2271int xen_remap_domain_mfn_range(struct vm_area_struct *vma,
2272 unsigned long addr,
2273 unsigned long mfn, int nr,
2274 pgprot_t prot, unsigned domid)
2275{
2276 struct remap_data rmd;
2277 struct mmu_update mmu_update[REMAP_BATCH_SIZE];
2278 int batch;
2279 unsigned long range;
2280 int err = 0;
2281
2282 prot = __pgprot(pgprot_val(prot) | _PAGE_IOMAP);
2283
2284 BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_RESERVED | VM_IO)) ==
2285 (VM_PFNMAP | VM_RESERVED | VM_IO)));
2286
2287 rmd.mfn = mfn;
2288 rmd.prot = prot;
2289
2290 while (nr) {
2291 batch = min(REMAP_BATCH_SIZE, nr);
2292 range = (unsigned long)batch << PAGE_SHIFT;
2293
2294 rmd.mmu_update = mmu_update;
2295 err = apply_to_page_range(vma->vm_mm, addr, range,
2296 remap_area_mfn_pte_fn, &rmd);
2297 if (err)
2298 goto out;
2299
2300 err = -EFAULT;
2301 if (HYPERVISOR_mmu_update(mmu_update, batch, NULL, domid) < 0)
2302 goto out;
2303
2304 nr -= batch;
2305 addr += range;
2306 }
2307
2308 err = 0;
2309out:
2310
2311 flush_tlb_all();
2312
2313 return err;
2314}
2315EXPORT_SYMBOL_GPL(xen_remap_domain_mfn_range);
2316
2317#ifdef CONFIG_XEN_DEBUG_FS
2318static int p2m_dump_open(struct inode *inode, struct file *filp)
2319{
2320 return single_open(filp, p2m_dump_show, NULL);
2321}
2322
2323static const struct file_operations p2m_dump_fops = {
2324 .open = p2m_dump_open,
2325 .read = seq_read,
2326 .llseek = seq_lseek,
2327 .release = single_release,
2328};
2329#endif /* CONFIG_XEN_DEBUG_FS */
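
The xen_remap_domain_mfn_range() helper added above builds its page-table
updates in batches of REMAP_BATCH_SIZE (16): apply_to_page_range() fills a
struct mmu_update array via remap_area_mfn_pte_fn(), and each batch is then
pushed to the hypervisor with a single HYPERVISOR_mmu_update call. A minimal,
hypothetical caller might look as follows (the example_map_foreign name and
the vma handling are illustrative assumptions, not part of this patch):

static int example_map_foreign(struct vm_area_struct *vma,
			       unsigned long first_mfn, int nr_pages,
			       unsigned int domid)
{
	/* The helper BUG()s unless the vma is already marked as a raw
	 * PFN mapping, so a real caller sets these flags up front. */
	vma->vm_flags |= VM_PFNMAP | VM_RESERVED | VM_IO;

	/* Map nr_pages machine frames of domain 'domid', starting at
	 * first_mfn, at the start of the vma; the helper splits the
	 * work into 16-PTE batches internally. */
	return xen_remap_domain_mfn_range(vma, vma->vm_start, first_mfn,
					  nr_pages, vma->vm_page_prot, domid);
}
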
diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h
index fa938c4aa2f7..73809bb951b4 100644
--- a/arch/x86/xen/mmu.h
+++ b/arch/x86/xen/mmu.h
@@ -12,47 +12,9 @@ enum pt_level {
12 12
13 13
14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); 14bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn);
15bool install_p2mtop_page(unsigned long pfn, unsigned long *p);
16 15
17void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); 16void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags);
18 17
19
20void xen_activate_mm(struct mm_struct *prev, struct mm_struct *next);
21void xen_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm);
22void xen_exit_mmap(struct mm_struct *mm);
23
24pteval_t xen_pte_val(pte_t);
25pmdval_t xen_pmd_val(pmd_t);
26pgdval_t xen_pgd_val(pgd_t);
27
28pte_t xen_make_pte(pteval_t);
29pmd_t xen_make_pmd(pmdval_t);
30pgd_t xen_make_pgd(pgdval_t);
31
32void xen_set_pte(pte_t *ptep, pte_t pteval);
33void xen_set_pte_at(struct mm_struct *mm, unsigned long addr,
34 pte_t *ptep, pte_t pteval);
35
36#ifdef CONFIG_X86_PAE
37void xen_set_pte_atomic(pte_t *ptep, pte_t pte);
38void xen_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
39void xen_pmd_clear(pmd_t *pmdp);
40#endif /* CONFIG_X86_PAE */
41
42void xen_set_pmd(pmd_t *pmdp, pmd_t pmdval);
43void xen_set_pud(pud_t *ptr, pud_t val);
44void xen_set_pmd_hyper(pmd_t *pmdp, pmd_t pmdval);
45void xen_set_pud_hyper(pud_t *ptr, pud_t val);
46
47#if PAGETABLE_LEVELS == 4
48pudval_t xen_pud_val(pud_t pud);
49pud_t xen_make_pud(pudval_t pudval);
50void xen_set_pgd(pgd_t *pgdp, pgd_t pgd);
51void xen_set_pgd_hyper(pgd_t *pgdp, pgd_t pgd);
52#endif
53
54pgd_t *xen_get_user_pgd(pgd_t *pgd);
55
56pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep); 18pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, unsigned long addr, pte_t *ptep);
57void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr, 19void xen_ptep_modify_prot_commit(struct mm_struct *mm, unsigned long addr,
58 pte_t *ptep, pte_t pte); 20 pte_t *ptep, pte_t pte);
diff --git a/arch/x86/xen/multicalls.c b/arch/x86/xen/multicalls.c
index 8bff7e7c290b..1b2b73ff0a6e 100644
--- a/arch/x86/xen/multicalls.c
+++ b/arch/x86/xen/multicalls.c
@@ -189,10 +189,10 @@ struct multicall_space __xen_mc_entry(size_t args)
189 unsigned argidx = roundup(b->argidx, sizeof(u64)); 189 unsigned argidx = roundup(b->argidx, sizeof(u64));
190 190
191 BUG_ON(preemptible()); 191 BUG_ON(preemptible());
192 BUG_ON(b->argidx > MC_ARGS); 192 BUG_ON(b->argidx >= MC_ARGS);
193 193
194 if (b->mcidx == MC_BATCH || 194 if (b->mcidx == MC_BATCH ||
195 (argidx + args) > MC_ARGS) { 195 (argidx + args) >= MC_ARGS) {
196 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS); 196 mc_stats_flush(b->mcidx == MC_BATCH ? FL_SLOTS : FL_ARGS);
197 xen_mc_flush(); 197 xen_mc_flush();
198 argidx = roundup(b->argidx, sizeof(u64)); 198 argidx = roundup(b->argidx, sizeof(u64));
@@ -206,7 +206,7 @@ struct multicall_space __xen_mc_entry(size_t args)
206 ret.args = &b->args[argidx]; 206 ret.args = &b->args[argidx];
207 b->argidx = argidx + args; 207 b->argidx = argidx + args;
208 208
209 BUG_ON(b->argidx > MC_ARGS); 209 BUG_ON(b->argidx >= MC_ARGS);
210 return ret; 210 return ret;
211} 211}
212 212
@@ -216,7 +216,7 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
216 struct multicall_space ret = { NULL, NULL }; 216 struct multicall_space ret = { NULL, NULL };
217 217
218 BUG_ON(preemptible()); 218 BUG_ON(preemptible());
219 BUG_ON(b->argidx > MC_ARGS); 219 BUG_ON(b->argidx >= MC_ARGS);
220 220
221 if (b->mcidx == 0) 221 if (b->mcidx == 0)
222 return ret; 222 return ret;
@@ -224,14 +224,14 @@ struct multicall_space xen_mc_extend_args(unsigned long op, size_t size)
224 if (b->entries[b->mcidx - 1].op != op) 224 if (b->entries[b->mcidx - 1].op != op)
225 return ret; 225 return ret;
226 226
227 if ((b->argidx + size) > MC_ARGS) 227 if ((b->argidx + size) >= MC_ARGS)
228 return ret; 228 return ret;
229 229
230 ret.mc = &b->entries[b->mcidx - 1]; 230 ret.mc = &b->entries[b->mcidx - 1];
231 ret.args = &b->args[b->argidx]; 231 ret.args = &b->args[b->argidx];
232 b->argidx += size; 232 b->argidx += size;
233 233
234 BUG_ON(b->argidx > MC_ARGS); 234 BUG_ON(b->argidx >= MC_ARGS);
235 return ret; 235 return ret;
236} 236}
237 237
diff --git a/arch/x86/xen/multicalls.h b/arch/x86/xen/multicalls.h
index 9e565da5d1f7..4ec8035e3216 100644
--- a/arch/x86/xen/multicalls.h
+++ b/arch/x86/xen/multicalls.h
@@ -22,7 +22,7 @@ static inline void xen_mc_batch(void)
22 unsigned long flags; 22 unsigned long flags;
23 /* need to disable interrupts until this entry is complete */ 23 /* need to disable interrupts until this entry is complete */
24 local_irq_save(flags); 24 local_irq_save(flags);
25 __get_cpu_var(xen_mc_irq_flags) = flags; 25 __this_cpu_write(xen_mc_irq_flags, flags);
26} 26}
27 27
28static inline struct multicall_space xen_mc_entry(size_t args) 28static inline struct multicall_space xen_mc_entry(size_t args)
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
new file mode 100644
index 000000000000..58efeb9d5440
--- /dev/null
+++ b/arch/x86/xen/p2m.c
@@ -0,0 +1,859 @@
1/*
2 * Xen leaves the responsibility for maintaining p2m mappings to the
3 * guests themselves, but it must also access and update the p2m array
4 * during suspend/resume when all the pages are reallocated.
5 *
6 * The p2m table is logically a flat array, but we implement it as a
7 * three-level tree to allow the address space to be sparse.
8 *
9 * Xen
10 * |
11 * p2m_top p2m_top_mfn
12 * / \ / \
13 * p2m_mid p2m_mid p2m_mid_mfn p2m_mid_mfn
14 * / \ / \ / /
15 * p2m p2m p2m p2m p2m p2m p2m ...
16 *
17 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
18 *
19 * The p2m_top and p2m_top_mfn levels are limited to 1 page, so the
20 * maximum representable pseudo-physical address space is:
21 * P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
22 *
23 * P2M_PER_PAGE depends on the architecture, as a mfn is always
24 * unsigned long (8 bytes on 64-bit, 4 bytes on 32), leading to
25 * 512 and 1024 entries respectively.
26 *
27 * In short, these structures contain the Machine Frame Number (MFN) of the PFN.
28 *
 29 * However, not all entries are filled with MFNs. Specifically, any leaf
 30 * entry, or any top-level or mid-level entry, that is void is assumed to
 31 * be "missing". So (for example)
32 * pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
33 *
34 * We also have the possibility of setting 1-1 mappings on certain regions, so
35 * that:
36 * pfn_to_mfn(0xc0000)=0xc0000
37 *
 38 * The benefit of this is that for non-RAM regions (think PCI BARs, or
 39 * ACPI spaces) we can create mappings easily, because the PFN value
 40 * matches the MFN.
41 *
42 * For this to work efficiently we have one new page p2m_identity and
 43 * allocate (via reserve_brk) any other pages we need to cover the sides
44 * (1GB or 4MB boundary violations). All entries in p2m_identity are set to
45 * INVALID_P2M_ENTRY type (Xen toolstack only recognizes that and MFNs,
46 * no other fancy value).
47 *
48 * On lookup we spot that the entry points to p2m_identity and return the
49 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
50 * If the entry points to an allocated page, we just proceed as before and
51 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
52 * appropriate functions (pfn_to_mfn).
53 *
54 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
55 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
 56 * non-identity pfn. To protect ourselves against that, we elect to set (and get) the
57 * IDENTITY_FRAME_BIT on all identity mapped PFNs.
58 *
59 * This simplistic diagram is used to explain the more subtle piece of code.
 60 * There is also a diagram of the P2M at the end that can help.
61 * Imagine your E820 looking as so:
62 *
63 * 1GB 2GB
64 * /-------------------+---------\/----\ /----------\ /---+-----\
65 * | System RAM | Sys RAM ||ACPI| | reserved | | Sys RAM |
66 * \-------------------+---------/\----/ \----------/ \---+-----/
67 * ^- 1029MB ^- 2001MB
68 *
69 * [1029MB = 263424 (0x40500), 2001MB = 512256 (0x7D100),
70 * 2048MB = 524288 (0x80000)]
71 *
72 * And dom0_mem=max:3GB,1GB is passed in to the guest, meaning memory past 1GB
73 * is actually not present (would have to kick the balloon driver to put it in).
74 *
75 * When we are told to set the PFNs for identity mapping (see patch: "xen/setup:
76 * Set identity mapping for non-RAM E820 and E820 gaps.") we pass in the start
77 * of the PFN and the end PFN (263424 and 512256 respectively). The first step
78 * is to reserve_brk a top leaf page if the p2m[1] is missing. The top leaf page
79 * covers 512^2 of page estate (1GB) and in case the start or end PFN is not
80 * aligned on 512^2*PAGE_SIZE (1GB) we loop on aligned 1GB PFNs from start pfn
81 * to end pfn. We reserve_brk top leaf pages if they are missing (means they
82 * point to p2m_mid_missing).
83 *
84 * With the E820 example above, 263424 is not 1GB aligned so we allocate a
85 * reserve_brk page which will cover the PFNs estate from 0x40000 to 0x80000.
 86 * Each entry in the allocated page is "missing" (points to p2m_missing).
87 *
88 * Next stage is to determine if we need to do a more granular boundary check
 89 * on the 4MB (or 2MB depending on architecture) boundary of the start and end pfns.
90 * We check if the start pfn and end pfn violate that boundary check, and if
91 * so reserve_brk a middle (p2m[x][y]) leaf page. This way we have a much finer
92 * granularity of setting which PFNs are missing and which ones are identity.
93 * In our example 263424 and 512256 both fail the check so we reserve_brk two
94 * pages. Populate them with INVALID_P2M_ENTRY (so they both have "missing"
95 * values) and assign them to p2m[1][2] and p2m[1][488] respectively.
96 *
97 * At this point we would at minimum reserve_brk one page, but could be up to
98 * three. Each call to set_phys_range_identity has at maximum a three page
99 * cost. If we were to query the P2M at this stage, all those entries from
100 * start PFN through end PFN (so 1029MB -> 2001MB) would return
101 * INVALID_P2M_ENTRY ("missing").
102 *
103 * The next step is to walk from the start pfn to the end pfn setting
104 * the IDENTITY_FRAME_BIT on each PFN. This is done in set_phys_range_identity.
105 * If we find that the middle leaf is pointing to p2m_missing we can swap it
106 * over to p2m_identity - this way covering 4MB (or 2MB) PFN space. At this
 107 * point we do not need to worry about boundary alignment (so no need to
108 * reserve_brk a middle page, figure out which PFNs are "missing" and which
109 * ones are identity), as that has been done earlier. If we find that the
110 * middle leaf is not occupied by p2m_identity or p2m_missing, we dereference
111 * that page (which covers 512 PFNs) and set the appropriate PFN with
112 * IDENTITY_FRAME_BIT. In our example 263424 and 512256 end up there, and we
113 * set from p2m[1][2][256->511] and p2m[1][488][0->256] with
114 * IDENTITY_FRAME_BIT set.
115 *
116 * All other regions that are void (or not filled) either point to p2m_missing
117 * (considered missing) or have the default value of INVALID_P2M_ENTRY (also
118 * considered missing). In our case, p2m[1][2][0->255] and p2m[1][488][257->511]
119 * contain the INVALID_P2M_ENTRY value and are considered "missing."
120 *
121 * This is what the p2m ends up looking (for the E820 above) with this
122 * fabulous drawing:
123 *
124 * p2m /--------------\
125 * /-----\ | &mfn_list[0],| /-----------------\
126 * | 0 |------>| &mfn_list[1],| /---------------\ | ~0, ~0, .. |
127 * |-----| | ..., ~0, ~0 | | ~0, ~0, [x]---+----->| IDENTITY [@256] |
128 * | 1 |---\ \--------------/ | [p2m_identity]+\ | IDENTITY [@257] |
129 * |-----| \ | [p2m_identity]+\\ | .... |
130 * | 2 |--\ \-------------------->| ... | \\ \----------------/
131 * |-----| \ \---------------/ \\
132 * | 3 |\ \ \\ p2m_identity
133 * |-----| \ \-------------------->/---------------\ /-----------------\
134 * | .. +->+ | [p2m_identity]+-->| ~0, ~0, ~0, ... |
135 * \-----/ / | [p2m_identity]+-->| ..., ~0 |
136 * / /---------------\ | .... | \-----------------/
137 * / | IDENTITY[@0] | /-+-[x], ~0, ~0.. |
138 * / | IDENTITY[@256]|<----/ \---------------/
139 * / | ~0, ~0, .... |
140 * | \---------------/
141 * |
142 * p2m_missing p2m_missing
143 * /------------------\ /------------\
144 * | [p2m_mid_missing]+---->| ~0, ~0, ~0 |
145 * | [p2m_mid_missing]+---->| ..., ~0 |
146 * \------------------/ \------------/
147 *
148 * where ~0 is INVALID_P2M_ENTRY. IDENTITY is (PFN | IDENTITY_BIT)
149 */
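/*
 * A worked example (a sketch assuming x86-64 with 4K pages, so 512
 * entries per level): the first identity PFN from the E820 example
 * above, 0x40500 (263424), decomposes as
 *
 *   topidx = 263424 / (512 * 512)  = 1
 *   mididx = (263424 / 512) % 512  = 2
 *   idx    = 263424 % 512          = 256
 *
 * i.e. p2m[1][2][256], which matches the "p2m[1][2][256->511]" range in
 * the diagram above. The index helpers below (p2m_top_index,
 * p2m_mid_index and p2m_index) implement exactly this decomposition.
 */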
150
151#include <linux/init.h>
152#include <linux/module.h>
153#include <linux/list.h>
154#include <linux/hash.h>
155#include <linux/sched.h>
156#include <linux/seq_file.h>
157
158#include <asm/cache.h>
159#include <asm/setup.h>
160
161#include <asm/xen/page.h>
162#include <asm/xen/hypercall.h>
163#include <asm/xen/hypervisor.h>
164
165#include "xen-ops.h"
166
167static void __init m2p_override_init(void);
168
169unsigned long xen_max_p2m_pfn __read_mostly;
170
171#define P2M_PER_PAGE (PAGE_SIZE / sizeof(unsigned long))
172#define P2M_MID_PER_PAGE (PAGE_SIZE / sizeof(unsigned long *))
173#define P2M_TOP_PER_PAGE (PAGE_SIZE / sizeof(unsigned long **))
174
175#define MAX_P2M_PFN (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
176
177/* Placeholders for holes in the address space */
178static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
179static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
180static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_missing_mfn, P2M_MID_PER_PAGE);
181
182static RESERVE_BRK_ARRAY(unsigned long **, p2m_top, P2M_TOP_PER_PAGE);
183static RESERVE_BRK_ARRAY(unsigned long, p2m_top_mfn, P2M_TOP_PER_PAGE);
184static RESERVE_BRK_ARRAY(unsigned long *, p2m_top_mfn_p, P2M_TOP_PER_PAGE);
185
186static RESERVE_BRK_ARRAY(unsigned long, p2m_identity, P2M_PER_PAGE);
187
188RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
189RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
190
191/* We might hit two boundary violations at the start and end, at max each
192 * boundary violation will require three middle nodes. */
193RESERVE_BRK(p2m_mid_identity, PAGE_SIZE * 2 * 3);
194
195static inline unsigned p2m_top_index(unsigned long pfn)
196{
197 BUG_ON(pfn >= MAX_P2M_PFN);
198 return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
199}
200
201static inline unsigned p2m_mid_index(unsigned long pfn)
202{
203 return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
204}
205
206static inline unsigned p2m_index(unsigned long pfn)
207{
208 return pfn % P2M_PER_PAGE;
209}
210
211static void p2m_top_init(unsigned long ***top)
212{
213 unsigned i;
214
215 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
216 top[i] = p2m_mid_missing;
217}
218
219static void p2m_top_mfn_init(unsigned long *top)
220{
221 unsigned i;
222
223 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
224 top[i] = virt_to_mfn(p2m_mid_missing_mfn);
225}
226
227static void p2m_top_mfn_p_init(unsigned long **top)
228{
229 unsigned i;
230
231 for (i = 0; i < P2M_TOP_PER_PAGE; i++)
232 top[i] = p2m_mid_missing_mfn;
233}
234
235static void p2m_mid_init(unsigned long **mid)
236{
237 unsigned i;
238
239 for (i = 0; i < P2M_MID_PER_PAGE; i++)
240 mid[i] = p2m_missing;
241}
242
243static void p2m_mid_mfn_init(unsigned long *mid)
244{
245 unsigned i;
246
247 for (i = 0; i < P2M_MID_PER_PAGE; i++)
248 mid[i] = virt_to_mfn(p2m_missing);
249}
250
251static void p2m_init(unsigned long *p2m)
252{
253 unsigned i;
254
255 for (i = 0; i < P2M_MID_PER_PAGE; i++)
256 p2m[i] = INVALID_P2M_ENTRY;
257}
258
259/*
260 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
261 *
262 * This is called both at boot time, and after resuming from suspend:
263 * - At boot time we're called very early, and must use extend_brk()
264 * to allocate memory.
265 *
266 * - After resume we're called from within stop_machine, but the mfn
 267 * tree should already be completely allocated.
268 */
269void __ref xen_build_mfn_list_list(void)
270{
271 unsigned long pfn;
272
273 /* Pre-initialize p2m_top_mfn to be completely missing */
274 if (p2m_top_mfn == NULL) {
275 p2m_mid_missing_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
276 p2m_mid_mfn_init(p2m_mid_missing_mfn);
277
278 p2m_top_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
279 p2m_top_mfn_p_init(p2m_top_mfn_p);
280
281 p2m_top_mfn = extend_brk(PAGE_SIZE, PAGE_SIZE);
282 p2m_top_mfn_init(p2m_top_mfn);
283 } else {
 284 /* Reinitialise; MFNs all change after migration */
285 p2m_mid_mfn_init(p2m_mid_missing_mfn);
286 }
287
288 for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += P2M_PER_PAGE) {
289 unsigned topidx = p2m_top_index(pfn);
290 unsigned mididx = p2m_mid_index(pfn);
291 unsigned long **mid;
292 unsigned long *mid_mfn_p;
293
294 mid = p2m_top[topidx];
295 mid_mfn_p = p2m_top_mfn_p[topidx];
296
297 /* Don't bother allocating any mfn mid levels if
298 * they're just missing, just update the stored mfn,
299 * since all could have changed over a migrate.
300 */
301 if (mid == p2m_mid_missing) {
302 BUG_ON(mididx);
303 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
304 p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
305 pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
306 continue;
307 }
308
309 if (mid_mfn_p == p2m_mid_missing_mfn) {
310 /*
311 * XXX boot-time only! We should never find
312 * missing parts of the mfn tree after
313 * runtime. extend_brk() will BUG if we call
314 * it too late.
315 */
316 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
317 p2m_mid_mfn_init(mid_mfn_p);
318
319 p2m_top_mfn_p[topidx] = mid_mfn_p;
320 }
321
322 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
323 mid_mfn_p[mididx] = virt_to_mfn(mid[mididx]);
324 }
325}
326
327void xen_setup_mfn_list_list(void)
328{
329 BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);
330
331 HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
332 virt_to_mfn(p2m_top_mfn);
333 HYPERVISOR_shared_info->arch.max_pfn = xen_max_p2m_pfn;
334}
335
336/* Set up p2m_top to point to the domain-builder provided p2m pages */
337void __init xen_build_dynamic_phys_to_machine(void)
338{
339 unsigned long *mfn_list = (unsigned long *)xen_start_info->mfn_list;
340 unsigned long max_pfn = min(MAX_DOMAIN_PAGES, xen_start_info->nr_pages);
341 unsigned long pfn;
342
343 xen_max_p2m_pfn = max_pfn;
344
345 p2m_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
346 p2m_init(p2m_missing);
347
348 p2m_mid_missing = extend_brk(PAGE_SIZE, PAGE_SIZE);
349 p2m_mid_init(p2m_mid_missing);
350
351 p2m_top = extend_brk(PAGE_SIZE, PAGE_SIZE);
352 p2m_top_init(p2m_top);
353
354 p2m_identity = extend_brk(PAGE_SIZE, PAGE_SIZE);
355 p2m_init(p2m_identity);
356
357 /*
358 * The domain builder gives us a pre-constructed p2m array in
359 * mfn_list for all the pages initially given to us, so we just
360 * need to graft that into our tree structure.
361 */
362 for (pfn = 0; pfn < max_pfn; pfn += P2M_PER_PAGE) {
363 unsigned topidx = p2m_top_index(pfn);
364 unsigned mididx = p2m_mid_index(pfn);
365
366 if (p2m_top[topidx] == p2m_mid_missing) {
367 unsigned long **mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
368 p2m_mid_init(mid);
369
370 p2m_top[topidx] = mid;
371 }
372
373 /*
374 * As long as the mfn_list has enough entries to completely
375 * fill a p2m page, pointing into the array is ok. But if
376 * not the entries beyond the last pfn will be undefined.
377 */
378 if (unlikely(pfn + P2M_PER_PAGE > max_pfn)) {
379 unsigned long p2midx;
380
381 p2midx = max_pfn % P2M_PER_PAGE;
382 for ( ; p2midx < P2M_PER_PAGE; p2midx++)
383 mfn_list[pfn + p2midx] = INVALID_P2M_ENTRY;
384 }
385 p2m_top[topidx][mididx] = &mfn_list[pfn];
386 }
387
388 m2p_override_init();
389}
390
391unsigned long get_phys_to_machine(unsigned long pfn)
392{
393 unsigned topidx, mididx, idx;
394
395 if (unlikely(pfn >= MAX_P2M_PFN))
396 return INVALID_P2M_ENTRY;
397
398 topidx = p2m_top_index(pfn);
399 mididx = p2m_mid_index(pfn);
400 idx = p2m_index(pfn);
401
402 /*
403 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
404 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
405 * would be wrong.
406 */
407 if (p2m_top[topidx][mididx] == p2m_identity)
408 return IDENTITY_FRAME(pfn);
409
410 return p2m_top[topidx][mididx][idx];
411}
412EXPORT_SYMBOL_GPL(get_phys_to_machine);
413
414static void *alloc_p2m_page(void)
415{
416 return (void *)__get_free_page(GFP_KERNEL | __GFP_REPEAT);
417}
418
419static void free_p2m_page(void *p)
420{
421 free_page((unsigned long)p);
422}
423
424/*
425 * Fully allocate the p2m structure for a given pfn. We need to check
426 * that both the top and mid levels are allocated, and make sure the
427 * parallel mfn tree is kept in sync. We may race with other cpus, so
428 * the new pages are installed with cmpxchg; if we lose the race then
429 * simply free the page we allocated and use the one that's there.
430 */
431static bool alloc_p2m(unsigned long pfn)
432{
433 unsigned topidx, mididx;
434 unsigned long ***top_p, **mid;
435 unsigned long *top_mfn_p, *mid_mfn;
436
437 topidx = p2m_top_index(pfn);
438 mididx = p2m_mid_index(pfn);
439
440 top_p = &p2m_top[topidx];
441 mid = *top_p;
442
443 if (mid == p2m_mid_missing) {
444 /* Mid level is missing, allocate a new one */
445 mid = alloc_p2m_page();
446 if (!mid)
447 return false;
448
449 p2m_mid_init(mid);
450
451 if (cmpxchg(top_p, p2m_mid_missing, mid) != p2m_mid_missing)
452 free_p2m_page(mid);
453 }
454
455 top_mfn_p = &p2m_top_mfn[topidx];
456 mid_mfn = p2m_top_mfn_p[topidx];
457
458 BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);
459
460 if (mid_mfn == p2m_mid_missing_mfn) {
461 /* Separately check the mid mfn level */
462 unsigned long missing_mfn;
463 unsigned long mid_mfn_mfn;
464
465 mid_mfn = alloc_p2m_page();
466 if (!mid_mfn)
467 return false;
468
469 p2m_mid_mfn_init(mid_mfn);
470
471 missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
472 mid_mfn_mfn = virt_to_mfn(mid_mfn);
473 if (cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn) != missing_mfn)
474 free_p2m_page(mid_mfn);
475 else
476 p2m_top_mfn_p[topidx] = mid_mfn;
477 }
478
479 if (p2m_top[topidx][mididx] == p2m_identity ||
480 p2m_top[topidx][mididx] == p2m_missing) {
481 /* p2m leaf page is missing */
482 unsigned long *p2m;
483 unsigned long *p2m_orig = p2m_top[topidx][mididx];
484
485 p2m = alloc_p2m_page();
486 if (!p2m)
487 return false;
488
489 p2m_init(p2m);
490
491 if (cmpxchg(&mid[mididx], p2m_orig, p2m) != p2m_orig)
492 free_p2m_page(p2m);
493 else
494 mid_mfn[mididx] = virt_to_mfn(p2m);
495 }
496
497 return true;
498}
499
500static bool __init __early_alloc_p2m(unsigned long pfn)
501{
502 unsigned topidx, mididx, idx;
503
504 topidx = p2m_top_index(pfn);
505 mididx = p2m_mid_index(pfn);
506 idx = p2m_index(pfn);
507
 508 /* Pfff.. No boundary cross-over, let's get out. */
509 if (!idx)
510 return false;
511
512 WARN(p2m_top[topidx][mididx] == p2m_identity,
513 "P2M[%d][%d] == IDENTITY, should be MISSING (or alloced)!\n",
514 topidx, mididx);
515
516 /*
517 * Could be done by xen_build_dynamic_phys_to_machine..
518 */
519 if (p2m_top[topidx][mididx] != p2m_missing)
520 return false;
521
522 /* Boundary cross-over for the edges: */
523 if (idx) {
524 unsigned long *p2m = extend_brk(PAGE_SIZE, PAGE_SIZE);
525 unsigned long *mid_mfn_p;
526
527 p2m_init(p2m);
528
529 p2m_top[topidx][mididx] = p2m;
530
 531 /* For save/restore we need the MFN of the P2M saved */
532
533 mid_mfn_p = p2m_top_mfn_p[topidx];
534 WARN(mid_mfn_p[mididx] != virt_to_mfn(p2m_missing),
535 "P2M_TOP_P[%d][%d] != MFN of p2m_missing!\n",
536 topidx, mididx);
537 mid_mfn_p[mididx] = virt_to_mfn(p2m);
538
539 }
540 return idx != 0;
541}
542unsigned long __init set_phys_range_identity(unsigned long pfn_s,
543 unsigned long pfn_e)
544{
545 unsigned long pfn;
546
547 if (unlikely(pfn_s >= MAX_P2M_PFN || pfn_e >= MAX_P2M_PFN))
548 return 0;
549
550 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap)))
551 return pfn_e - pfn_s;
552
553 if (pfn_s > pfn_e)
554 return 0;
555
556 for (pfn = (pfn_s & ~(P2M_MID_PER_PAGE * P2M_PER_PAGE - 1));
557 pfn < ALIGN(pfn_e, (P2M_MID_PER_PAGE * P2M_PER_PAGE));
558 pfn += P2M_MID_PER_PAGE * P2M_PER_PAGE)
559 {
560 unsigned topidx = p2m_top_index(pfn);
561 unsigned long *mid_mfn_p;
562 unsigned long **mid;
563
564 mid = p2m_top[topidx];
565 mid_mfn_p = p2m_top_mfn_p[topidx];
566 if (mid == p2m_mid_missing) {
567 mid = extend_brk(PAGE_SIZE, PAGE_SIZE);
568
569 p2m_mid_init(mid);
570
571 p2m_top[topidx] = mid;
572
573 BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
574 }
575 /* And the save/restore P2M tables.. */
576 if (mid_mfn_p == p2m_mid_missing_mfn) {
577 mid_mfn_p = extend_brk(PAGE_SIZE, PAGE_SIZE);
578 p2m_mid_mfn_init(mid_mfn_p);
579
580 p2m_top_mfn_p[topidx] = mid_mfn_p;
581 p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
 582 /* Note: we don't set mid_mfn_p[mididx] here,
583 * look in __early_alloc_p2m */
584 }
585 }
586
587 __early_alloc_p2m(pfn_s);
588 __early_alloc_p2m(pfn_e);
589
590 for (pfn = pfn_s; pfn < pfn_e; pfn++)
591 if (!__set_phys_to_machine(pfn, IDENTITY_FRAME(pfn)))
592 break;
593
594 if (!WARN((pfn - pfn_s) != (pfn_e - pfn_s),
595 "Identity mapping failed. We are %ld short of 1-1 mappings!\n",
596 (pfn_e - pfn_s) - (pfn - pfn_s)))
597 printk(KERN_DEBUG "1-1 mapping on %lx->%lx\n", pfn_s, pfn);
598
599 return pfn - pfn_s;
600}
601
602/* Try to install p2m mapping; fail if intermediate bits missing */
603bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
604{
605 unsigned topidx, mididx, idx;
606
607 if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) {
608 BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY);
609 return true;
610 }
611 if (unlikely(pfn >= MAX_P2M_PFN)) {
612 BUG_ON(mfn != INVALID_P2M_ENTRY);
613 return true;
614 }
615
616 topidx = p2m_top_index(pfn);
617 mididx = p2m_mid_index(pfn);
618 idx = p2m_index(pfn);
619
 620 /* For sparse holes where the p2m leaf has a real PFN along with
621 * PCI holes, stick in the PFN as the MFN value.
622 */
623 if (mfn != INVALID_P2M_ENTRY && (mfn & IDENTITY_FRAME_BIT)) {
624 if (p2m_top[topidx][mididx] == p2m_identity)
625 return true;
626
627 /* Swap over from MISSING to IDENTITY if needed. */
628 if (p2m_top[topidx][mididx] == p2m_missing) {
629 WARN_ON(cmpxchg(&p2m_top[topidx][mididx], p2m_missing,
630 p2m_identity) != p2m_missing);
631 return true;
632 }
633 }
634
635 if (p2m_top[topidx][mididx] == p2m_missing)
636 return mfn == INVALID_P2M_ENTRY;
637
638 p2m_top[topidx][mididx][idx] = mfn;
639
640 return true;
641}
642
643bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
644{
645 if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
646 if (!alloc_p2m(pfn))
647 return false;
648
649 if (!__set_phys_to_machine(pfn, mfn))
650 return false;
651 }
652
653 return true;
654}
655
656#define M2P_OVERRIDE_HASH_SHIFT 10
657#define M2P_OVERRIDE_HASH (1 << M2P_OVERRIDE_HASH_SHIFT)
658
659static RESERVE_BRK_ARRAY(struct list_head, m2p_overrides, M2P_OVERRIDE_HASH);
660static DEFINE_SPINLOCK(m2p_override_lock);
661
662static void __init m2p_override_init(void)
663{
664 unsigned i;
665
666 m2p_overrides = extend_brk(sizeof(*m2p_overrides) * M2P_OVERRIDE_HASH,
667 sizeof(unsigned long));
668
669 for (i = 0; i < M2P_OVERRIDE_HASH; i++)
670 INIT_LIST_HEAD(&m2p_overrides[i]);
671}
672
673static unsigned long mfn_hash(unsigned long mfn)
674{
675 return hash_long(mfn, M2P_OVERRIDE_HASH_SHIFT);
676}
677
678/* Add an MFN override for a particular page */
679int m2p_add_override(unsigned long mfn, struct page *page, bool clear_pte)
680{
681 unsigned long flags;
682 unsigned long pfn;
683 unsigned long uninitialized_var(address);
684 unsigned level;
685 pte_t *ptep = NULL;
686
687 pfn = page_to_pfn(page);
688 if (!PageHighMem(page)) {
689 address = (unsigned long)__va(pfn << PAGE_SHIFT);
690 ptep = lookup_address(address, &level);
691 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
692 "m2p_add_override: pfn %lx not mapped", pfn))
693 return -EINVAL;
694 }
695
696 page->private = mfn;
697 page->index = pfn_to_mfn(pfn);
698
699 if (unlikely(!set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
700 return -ENOMEM;
701
702 if (clear_pte && !PageHighMem(page))
703 /* Just zap old mapping for now */
704 pte_clear(&init_mm, address, ptep);
705 spin_lock_irqsave(&m2p_override_lock, flags);
706 list_add(&page->lru, &m2p_overrides[mfn_hash(mfn)]);
707 spin_unlock_irqrestore(&m2p_override_lock, flags);
708
709 return 0;
710}
711EXPORT_SYMBOL_GPL(m2p_add_override);
712int m2p_remove_override(struct page *page, bool clear_pte)
713{
714 unsigned long flags;
715 unsigned long mfn;
716 unsigned long pfn;
717 unsigned long uninitialized_var(address);
718 unsigned level;
719 pte_t *ptep = NULL;
720
721 pfn = page_to_pfn(page);
722 mfn = get_phys_to_machine(pfn);
723 if (mfn == INVALID_P2M_ENTRY || !(mfn & FOREIGN_FRAME_BIT))
724 return -EINVAL;
725
726 if (!PageHighMem(page)) {
727 address = (unsigned long)__va(pfn << PAGE_SHIFT);
728 ptep = lookup_address(address, &level);
729
730 if (WARN(ptep == NULL || level != PG_LEVEL_4K,
731 "m2p_remove_override: pfn %lx not mapped", pfn))
732 return -EINVAL;
733 }
734
735 spin_lock_irqsave(&m2p_override_lock, flags);
736 list_del(&page->lru);
737 spin_unlock_irqrestore(&m2p_override_lock, flags);
738 set_phys_to_machine(pfn, page->index);
739
740 if (clear_pte && !PageHighMem(page))
741 set_pte_at(&init_mm, address, ptep,
742 pfn_pte(pfn, PAGE_KERNEL));
743 /* No tlb flush necessary because the caller already
744 * left the pte unmapped. */
745
746 return 0;
747}
748EXPORT_SYMBOL_GPL(m2p_remove_override);
749
750struct page *m2p_find_override(unsigned long mfn)
751{
752 unsigned long flags;
753 struct list_head *bucket = &m2p_overrides[mfn_hash(mfn)];
754 struct page *p, *ret;
755
756 ret = NULL;
757
758 spin_lock_irqsave(&m2p_override_lock, flags);
759
760 list_for_each_entry(p, bucket, lru) {
761 if (p->private == mfn) {
762 ret = p;
763 break;
764 }
765 }
766
767 spin_unlock_irqrestore(&m2p_override_lock, flags);
768
769 return ret;
770}
771
772unsigned long m2p_find_override_pfn(unsigned long mfn, unsigned long pfn)
773{
774 struct page *p = m2p_find_override(mfn);
775 unsigned long ret = pfn;
776
777 if (p)
778 ret = page_to_pfn(p);
779
780 return ret;
781}
782EXPORT_SYMBOL_GPL(m2p_find_override_pfn);
783
784#ifdef CONFIG_XEN_DEBUG_FS
785
786int p2m_dump_show(struct seq_file *m, void *v)
787{
788 static const char * const level_name[] = { "top", "middle",
789 "entry", "abnormal" };
790 static const char * const type_name[] = { "identity", "missing",
791 "pfn", "abnormal"};
792#define TYPE_IDENTITY 0
793#define TYPE_MISSING 1
794#define TYPE_PFN 2
795#define TYPE_UNKNOWN 3
796 unsigned long pfn, prev_pfn_type = 0, prev_pfn_level = 0;
797 unsigned int uninitialized_var(prev_level);
798 unsigned int uninitialized_var(prev_type);
799
800 if (!p2m_top)
801 return 0;
802
803 for (pfn = 0; pfn < MAX_DOMAIN_PAGES; pfn++) {
804 unsigned topidx = p2m_top_index(pfn);
805 unsigned mididx = p2m_mid_index(pfn);
806 unsigned idx = p2m_index(pfn);
807 unsigned lvl, type;
808
809 lvl = 4;
810 type = TYPE_UNKNOWN;
811 if (p2m_top[topidx] == p2m_mid_missing) {
812 lvl = 0; type = TYPE_MISSING;
813 } else if (p2m_top[topidx] == NULL) {
814 lvl = 0; type = TYPE_UNKNOWN;
815 } else if (p2m_top[topidx][mididx] == NULL) {
816 lvl = 1; type = TYPE_UNKNOWN;
817 } else if (p2m_top[topidx][mididx] == p2m_identity) {
818 lvl = 1; type = TYPE_IDENTITY;
819 } else if (p2m_top[topidx][mididx] == p2m_missing) {
820 lvl = 1; type = TYPE_MISSING;
821 } else if (p2m_top[topidx][mididx][idx] == 0) {
822 lvl = 2; type = TYPE_UNKNOWN;
823 } else if (p2m_top[topidx][mididx][idx] == IDENTITY_FRAME(pfn)) {
824 lvl = 2; type = TYPE_IDENTITY;
825 } else if (p2m_top[topidx][mididx][idx] == INVALID_P2M_ENTRY) {
826 lvl = 2; type = TYPE_MISSING;
827 } else if (p2m_top[topidx][mididx][idx] == pfn) {
828 lvl = 2; type = TYPE_PFN;
829 } else if (p2m_top[topidx][mididx][idx] != pfn) {
830 lvl = 2; type = TYPE_PFN;
831 }
832 if (pfn == 0) {
833 prev_level = lvl;
834 prev_type = type;
835 }
836 if (pfn == MAX_DOMAIN_PAGES-1) {
837 lvl = 3;
838 type = TYPE_UNKNOWN;
839 }
840 if (prev_type != type) {
841 seq_printf(m, " [0x%lx->0x%lx] %s\n",
842 prev_pfn_type, pfn, type_name[prev_type]);
843 prev_pfn_type = pfn;
844 prev_type = type;
845 }
846 if (prev_level != lvl) {
847 seq_printf(m, " [0x%lx->0x%lx] level %s\n",
848 prev_pfn_level, pfn, level_name[prev_level]);
849 prev_pfn_level = pfn;
850 prev_level = lvl;
851 }
852 }
853 return 0;
854#undef TYPE_IDENTITY
855#undef TYPE_MISSING
856#undef TYPE_PFN
857#undef TYPE_UNKNOWN
858}
859#endif
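
As a usage sketch of the accessors this new file provides (the
example_record_mapping name, and the assumption that pfn refers to ordinary
populated RAM rather than an identity range, are illustrative only):
set_phys_to_machine() first tries the non-allocating __set_phys_to_machine()
and only falls back to alloc_p2m() when a mid or leaf page is still missing,
while get_phys_to_machine() returns IDENTITY_FRAME(pfn) for 1-1 regions
instead of dereferencing the shared p2m_identity page.

static void example_record_mapping(unsigned long pfn, unsigned long mfn)
{
	/* May allocate p2m pages with GFP_KERNEL via alloc_p2m(), so
	 * this must not be called from atomic context. */
	if (!set_phys_to_machine(pfn, mfn))
		BUG();	/* out of memory for a p2m leaf page */

	/* Reading it back returns the stored MFN for a populated leaf,
	 * IDENTITY_FRAME(pfn) for identity ranges, or INVALID_P2M_ENTRY
	 * for anything still missing. */
	WARN_ON(get_phys_to_machine(pfn) != mfn);
}
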
diff --git a/arch/x86/xen/pci-swiotlb-xen.c b/arch/x86/xen/pci-swiotlb-xen.c
index a013ec9d0c54..b480d4207a4c 100644
--- a/arch/x86/xen/pci-swiotlb-xen.c
+++ b/arch/x86/xen/pci-swiotlb-xen.c
@@ -1,10 +1,12 @@
1/* Glue code to lib/swiotlb-xen.c */ 1/* Glue code to lib/swiotlb-xen.c */
2 2
3#include <linux/dma-mapping.h> 3#include <linux/dma-mapping.h>
4#include <linux/pci.h>
4#include <xen/swiotlb-xen.h> 5#include <xen/swiotlb-xen.h>
5 6
6#include <asm/xen/hypervisor.h> 7#include <asm/xen/hypervisor.h>
7#include <xen/xen.h> 8#include <xen/xen.h>
9#include <asm/iommu_table.h>
8 10
9int xen_swiotlb __read_mostly; 11int xen_swiotlb __read_mostly;
10 12
@@ -34,7 +36,7 @@ int __init pci_xen_swiotlb_detect(void)
34 36
35 /* If running as PV guest, either iommu=soft, or swiotlb=force will 37 /* If running as PV guest, either iommu=soft, or swiotlb=force will
36 * activate this IOMMU. If running as PV privileged, activate it 38 * activate this IOMMU. If running as PV privileged, activate it
37 * irregardlesss. 39 * irregardless.
38 */ 40 */
39 if ((xen_initial_domain() || swiotlb || swiotlb_force) && 41 if ((xen_initial_domain() || swiotlb || swiotlb_force) &&
40 (xen_pv_domain())) 42 (xen_pv_domain()))
@@ -54,5 +56,12 @@ void __init pci_xen_swiotlb_init(void)
54 if (xen_swiotlb) { 56 if (xen_swiotlb) {
55 xen_swiotlb_init(1); 57 xen_swiotlb_init(1);
56 dma_ops = &xen_swiotlb_dma_ops; 58 dma_ops = &xen_swiotlb_dma_ops;
59
60 /* Make sure ACS will be enabled */
61 pci_request_acs();
57 } 62 }
58} 63}
64IOMMU_INIT_FINISH(pci_xen_swiotlb_detect,
65 0,
66 pci_xen_swiotlb_init,
67 0);
diff --git a/arch/x86/xen/platform-pci-unplug.c b/arch/x86/xen/platform-pci-unplug.c
index 0f456386cce5..25c52f94a27c 100644
--- a/arch/x86/xen/platform-pci-unplug.c
+++ b/arch/x86/xen/platform-pci-unplug.c
@@ -68,7 +68,7 @@ static int __init check_platform_magic(void)
68 return 0; 68 return 0;
69} 69}
70 70
71void __init xen_unplug_emulated_devices(void) 71void xen_unplug_emulated_devices(void)
72{ 72{
73 int r; 73 int r;
74 74
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 328b00305426..60aeeb56948f 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -8,6 +8,7 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/pm.h> 10#include <linux/pm.h>
11#include <linux/memblock.h>
11 12
12#include <asm/elf.h> 13#include <asm/elf.h>
13#include <asm/vdso.h> 14#include <asm/vdso.h>
@@ -17,10 +18,11 @@
17#include <asm/xen/hypervisor.h> 18#include <asm/xen/hypervisor.h>
18#include <asm/xen/hypercall.h> 19#include <asm/xen/hypercall.h>
19 20
21#include <xen/xen.h>
20#include <xen/page.h> 22#include <xen/page.h>
21#include <xen/interface/callback.h> 23#include <xen/interface/callback.h>
22#include <xen/interface/physdev.h>
23#include <xen/interface/memory.h> 24#include <xen/interface/memory.h>
25#include <xen/interface/physdev.h>
24#include <xen/features.h> 26#include <xen/features.h>
25 27
26#include "xen-ops.h" 28#include "xen-ops.h"
@@ -33,6 +35,44 @@ extern void xen_sysenter_target(void);
33extern void xen_syscall_target(void); 35extern void xen_syscall_target(void);
34extern void xen_syscall32_target(void); 36extern void xen_syscall32_target(void);
35 37
38/* Amount of extra memory space we add to the e820 ranges */
39phys_addr_t xen_extra_mem_start, xen_extra_mem_size;
40
41/*
42 * The maximum amount of extra memory compared to the base size. The
43 * main scaling factor is the size of struct page. At extreme ratios
44 * of base:extra, all the base memory can be filled with page
45 * structures for the extra memory, leaving no space for anything
46 * else.
47 *
48 * 10x seems like a reasonable balance between scaling flexibility and
49 * leaving a practically usable system.
50 */
51#define EXTRA_MEM_RATIO (10)
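/*
 * Rough arithmetic behind the 10x bound (assuming a struct page of
 * roughly 64 bytes and 4K pages, i.e. metadata costs about 1/64th of
 * the memory it describes): with the full EXTRA_MEM_RATIO of extra
 * memory, the struct pages for that extra memory consume about
 * 10/64 ~= 16% of the base allocation, leaving most of it usable.
 */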
52
53static void __init xen_add_extra_mem(unsigned long pages)
54{
55 unsigned long pfn;
56
57 u64 size = (u64)pages * PAGE_SIZE;
58 u64 extra_start = xen_extra_mem_start + xen_extra_mem_size;
59
60 if (!pages)
61 return;
62
63 e820_add_region(extra_start, size, E820_RAM);
64 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
65
66 memblock_x86_reserve_range(extra_start, extra_start + size, "XEN EXTRA");
67
68 xen_extra_mem_size += size;
69
70 xen_max_p2m_pfn = PFN_DOWN(extra_start + size);
71
72 for (pfn = PFN_DOWN(extra_start); pfn <= xen_max_p2m_pfn; pfn++)
73 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
74}
75
36static unsigned long __init xen_release_chunk(phys_addr_t start_addr, 76static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
37 phys_addr_t end_addr) 77 phys_addr_t end_addr)
38{ 78{
@@ -69,7 +109,7 @@ static unsigned long __init xen_release_chunk(phys_addr_t start_addr,
69 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n", 109 WARN(ret != 1, "Failed to release memory %lx-%lx err=%d\n",
70 start, end, ret); 110 start, end, ret);
71 if (ret == 1) { 111 if (ret == 1) {
72 set_phys_to_machine(pfn, INVALID_P2M_ENTRY); 112 __set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
73 len++; 113 len++;
74 } 114 }
75 } 115 }
@@ -82,16 +122,18 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
82 const struct e820map *e820) 122 const struct e820map *e820)
83{ 123{
84 phys_addr_t max_addr = PFN_PHYS(max_pfn); 124 phys_addr_t max_addr = PFN_PHYS(max_pfn);
85 phys_addr_t last_end = 0; 125 phys_addr_t last_end = ISA_END_ADDRESS;
86 unsigned long released = 0; 126 unsigned long released = 0;
87 int i; 127 int i;
88 128
129 /* Free any unused memory above the low 1Mbyte. */
89 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) { 130 for (i = 0; i < e820->nr_map && last_end < max_addr; i++) {
90 phys_addr_t end = e820->map[i].addr; 131 phys_addr_t end = e820->map[i].addr;
91 end = min(max_addr, end); 132 end = min(max_addr, end);
92 133
93 released += xen_release_chunk(last_end, end); 134 if (last_end < end)
94 last_end = e820->map[i].addr + e820->map[i].size; 135 released += xen_release_chunk(last_end, end);
136 last_end = max(last_end, e820->map[i].addr + e820->map[i].size);
95 } 137 }
96 138
97 if (last_end < max_addr) 139 if (last_end < max_addr)
@@ -101,24 +143,140 @@ static unsigned long __init xen_return_unused_memory(unsigned long max_pfn,
101 return released; 143 return released;
102} 144}
103 145
146static unsigned long __init xen_set_identity(const struct e820entry *list,
147 ssize_t map_size)
148{
149 phys_addr_t last = xen_initial_domain() ? 0 : ISA_END_ADDRESS;
150 phys_addr_t start_pci = last;
151 const struct e820entry *entry;
152 unsigned long identity = 0;
153 int i;
154
155 for (i = 0, entry = list; i < map_size; i++, entry++) {
156 phys_addr_t start = entry->addr;
157 phys_addr_t end = start + entry->size;
158
159 if (start < last)
160 start = last;
161
162 if (end <= start)
163 continue;
164
165 /* Skip over the 1MB region. */
166 if (last > end)
167 continue;
168
169 if ((entry->type == E820_RAM) || (entry->type == E820_UNUSABLE)) {
170 if (start > start_pci)
171 identity += set_phys_range_identity(
172 PFN_UP(start_pci), PFN_DOWN(start));
173
 174 /* Without saving 'last' we would gobble RAM too
175 * at the end of the loop. */
176 last = end;
177 start_pci = end;
178 continue;
179 }
180 start_pci = min(start, start_pci);
181 last = end;
182 }
183 if (last > start_pci)
184 identity += set_phys_range_identity(
185 PFN_UP(start_pci), PFN_DOWN(last));
186 return identity;
187}
104/** 188/**
105 * machine_specific_memory_setup - Hook for machine specific memory setup. 189 * machine_specific_memory_setup - Hook for machine specific memory setup.
106 **/ 190 **/
107
108char * __init xen_memory_setup(void) 191char * __init xen_memory_setup(void)
109{ 192{
193 static struct e820entry map[E820MAX] __initdata;
194 static struct e820entry map_raw[E820MAX] __initdata;
195
110 unsigned long max_pfn = xen_start_info->nr_pages; 196 unsigned long max_pfn = xen_start_info->nr_pages;
197 unsigned long long mem_end;
198 int rc;
199 struct xen_memory_map memmap;
200 unsigned long extra_pages = 0;
201 unsigned long extra_limit;
202 unsigned long identity_pages = 0;
203 int i;
204 int op;
111 205
112 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn); 206 max_pfn = min(MAX_DOMAIN_PAGES, max_pfn);
207 mem_end = PFN_PHYS(max_pfn);
208
209 memmap.nr_entries = E820MAX;
210 set_xen_guest_handle(memmap.buffer, map);
211
212 op = xen_initial_domain() ?
213 XENMEM_machine_memory_map :
214 XENMEM_memory_map;
215 rc = HYPERVISOR_memory_op(op, &memmap);
216 if (rc == -ENOSYS) {
217 BUG_ON(xen_initial_domain());
218 memmap.nr_entries = 1;
219 map[0].addr = 0ULL;
220 map[0].size = mem_end;
221 /* 8MB slack (to balance backend allocations). */
222 map[0].size += 8ULL << 20;
223 map[0].type = E820_RAM;
224 rc = 0;
225 }
226 BUG_ON(rc);
113 227
228 memcpy(map_raw, map, sizeof(map));
114 e820.nr_map = 0; 229 e820.nr_map = 0;
230 xen_extra_mem_start = mem_end;
231 for (i = 0; i < memmap.nr_entries; i++) {
232 unsigned long long end;
233
234 /* Guard against non-page aligned E820 entries. */
235 if (map[i].type == E820_RAM)
236 map[i].size -= (map[i].size + map[i].addr) % PAGE_SIZE;
237
238 end = map[i].addr + map[i].size;
239 if (map[i].type == E820_RAM && end > mem_end) {
240 /* RAM off the end - may be partially included */
241 u64 delta = min(map[i].size, end - mem_end);
242
243 map[i].size -= delta;
244 end -= delta;
245
246 extra_pages += PFN_DOWN(delta);
247 /*
248 * Set RAM below 4GB that is not for us to be unusable.
249 * This prevents "System RAM" address space from being
 250 * used as a potential resource for I/O addresses (happens
251 * when 'allocate_resource' is called).
252 */
253 if (delta &&
254 (xen_initial_domain() && end < 0x100000000ULL))
255 e820_add_region(end, delta, E820_UNUSABLE);
256 }
115 257
116 e820_add_region(0, PFN_PHYS((u64)max_pfn), E820_RAM); 258 if (map[i].size > 0 && end > xen_extra_mem_start)
259 xen_extra_mem_start = end;
260
261 /* Add region if any remains */
262 if (map[i].size > 0)
263 e820_add_region(map[i].addr, map[i].size, map[i].type);
264 }
265 /* Align the balloon area so that max_low_pfn does not get set
266 * to be at the _end_ of the PCI gap at the far end (fee01000).
267 * Note that xen_extra_mem_start gets set in the loop above to be
268 * past the last E820 region. */
269 if (xen_initial_domain() && (xen_extra_mem_start < (1ULL<<32)))
270 xen_extra_mem_start = (1ULL<<32);
117 271
118 /* 272 /*
119 * Even though this is normal, usable memory under Xen, reserve 273 * In domU, the ISA region is normal, usable memory, but we
120 * ISA memory anyway because too many things think they can poke 274 * reserve ISA memory anyway because too many things poke
121 * about in there. 275 * about in there.
276 *
277 * In Dom0, the host E820 information can leave gaps in the
278 * ISA range, which would cause us to release those pages. To
279 * avoid this, we unconditionally reserve them here.
122 */ 280 */
123 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS, 281 e820_add_region(ISA_START_ADDRESS, ISA_END_ADDRESS - ISA_START_ADDRESS,
124 E820_RESERVED); 282 E820_RESERVED);
@@ -129,29 +287,43 @@ char * __init xen_memory_setup(void)
129 * - xen_start_info 287 * - xen_start_info
130 * See comment above "struct start_info" in <xen/interface/xen.h> 288 * See comment above "struct start_info" in <xen/interface/xen.h>
131 */ 289 */
132 reserve_early(__pa(xen_start_info->mfn_list), 290 memblock_x86_reserve_range(__pa(xen_start_info->mfn_list),
133 __pa(xen_start_info->pt_base), 291 __pa(xen_start_info->pt_base),
134 "XEN START INFO"); 292 "XEN START INFO");
135 293
136 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); 294 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
137 295
138 xen_return_unused_memory(xen_start_info->nr_pages, &e820); 296 extra_pages += xen_return_unused_memory(xen_start_info->nr_pages, &e820);
139 297
140 return "Xen"; 298 /*
 141} 299 * Clamp the amount of extra memory to an EXTRA_MEM_RATIO
 300 * factor of the base size. On non-highmem systems, the base
301 * size is the full initial memory allocation; on highmem it
302 * is limited to the max size of lowmem, so that it doesn't
303 * get completely filled.
304 *
305 * In principle there could be a problem in lowmem systems if
306 * the initial memory is also very large with respect to
307 * lowmem, but we won't try to deal with that here.
308 */
309 extra_limit = min(EXTRA_MEM_RATIO * min(max_pfn, PFN_DOWN(MAXMEM)),
310 max_pfn + extra_pages);
142 311
143static void xen_idle(void) 312 if (extra_limit >= max_pfn)
144{ 313 extra_pages = extra_limit - max_pfn;
145 local_irq_disable(); 314 else
146 315 extra_pages = 0;
147 if (need_resched()) 316
148 local_irq_enable(); 317 xen_add_extra_mem(extra_pages);
149 else { 318
150 current_thread_info()->status &= ~TS_POLLING; 319 /*
151 smp_mb__after_clear_bit(); 320 * Set P2M for all non-RAM pages and E820 gaps to be identity
152 safe_halt(); 321 * type PFNs. We supply it with the non-sanitized version
153 current_thread_info()->status |= TS_POLLING; 322 * of the E820.
154 } 323 */
324 identity_pages = xen_set_identity(map_raw, memmap.nr_entries);
325 printk(KERN_INFO "Set %ld page(s) to 1-1 mapping.\n", identity_pages);
326 return "Xen";
155} 327}
156 328
157/* 329/*
@@ -170,7 +342,7 @@ static void __init fiddle_vdso(void)
170#endif 342#endif
171} 343}
172 344
173static __cpuinit int register_callback(unsigned type, const void *func) 345static int __cpuinit register_callback(unsigned type, const void *func)
174{ 346{
175 struct callback_register callback = { 347 struct callback_register callback = {
176 .type = type, 348 .type = type,
@@ -223,9 +395,6 @@ void __cpuinit xen_enable_syscall(void)
223 395
224void __init xen_arch_setup(void) 396void __init xen_arch_setup(void)
225{ 397{
226 struct physdev_set_iopl set_iopl;
227 int rc;
228
229 xen_panic_handler_init(); 398 xen_panic_handler_init();
230 399
231 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments); 400 HYPERVISOR_vm_assist(VMASST_CMD_enable, VMASST_TYPE_4gb_segments);
@@ -242,11 +411,6 @@ void __init xen_arch_setup(void)
242 xen_enable_sysenter(); 411 xen_enable_sysenter();
243 xen_enable_syscall(); 412 xen_enable_syscall();
244 413
245 set_iopl.iopl = 1;
246 rc = HYPERVISOR_physdev_op(PHYSDEVOP_set_iopl, &set_iopl);
247 if (rc != 0)
248 printk(KERN_INFO "physdev_op failed %d\n", rc);
249
250#ifdef CONFIG_ACPI 414#ifdef CONFIG_ACPI
251 if (!(xen_start_info->flags & SIF_INITDOMAIN)) { 415 if (!(xen_start_info->flags & SIF_INITDOMAIN)) {
252 printk(KERN_INFO "ACPI in unprivileged domain disabled\n"); 416 printk(KERN_INFO "ACPI in unprivileged domain disabled\n");
@@ -258,9 +422,12 @@ void __init xen_arch_setup(void)
258 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ? 422 MAX_GUEST_CMDLINE > COMMAND_LINE_SIZE ?
259 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE); 423 COMMAND_LINE_SIZE : MAX_GUEST_CMDLINE);
260 424
261 pm_idle = xen_idle; 425 /* Set up idle, making sure it calls safe_halt() pvop */
262 426#ifdef CONFIG_X86_32
263 paravirt_disable_iospace(); 427 boot_cpu_data.hlt_works_ok = 1;
428#endif
429 pm_idle = default_idle;
430 boot_option_idle_override = IDLE_HALT;
264 431
265 fiddle_vdso(); 432 fiddle_vdso();
266} 433}
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 25f232b18a82..b4533a86d7e4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -28,6 +28,7 @@
28#include <asm/xen/interface.h> 28#include <asm/xen/interface.h>
29#include <asm/xen/hypercall.h> 29#include <asm/xen/hypercall.h>
30 30
31#include <xen/xen.h>
31#include <xen/page.h> 32#include <xen/page.h>
32#include <xen/events.h> 33#include <xen/events.h>
33 34
@@ -45,18 +46,17 @@ static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
45static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 46static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
46 47
47/* 48/*
48 * Reschedule call back. Nothing to do, 49 * Reschedule call back.
49 * all the work is done automatically when
50 * we return from the interrupt.
51 */ 50 */
52static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id) 51static irqreturn_t xen_reschedule_interrupt(int irq, void *dev_id)
53{ 52{
54 inc_irq_stat(irq_resched_count); 53 inc_irq_stat(irq_resched_count);
54 scheduler_ipi();
55 55
56 return IRQ_HANDLED; 56 return IRQ_HANDLED;
57} 57}
58 58
59static __cpuinit void cpu_bringup(void) 59static void __cpuinit cpu_bringup(void)
60{ 60{
61 int cpu = smp_processor_id(); 61 int cpu = smp_processor_id();
62 62
@@ -84,7 +84,7 @@ static __cpuinit void cpu_bringup(void)
84 wmb(); /* make sure everything is out */ 84 wmb(); /* make sure everything is out */
85} 85}
86 86
87static __cpuinit void cpu_bringup_and_idle(void) 87static void __cpuinit cpu_bringup_and_idle(void)
88{ 88{
89 cpu_bringup(); 89 cpu_bringup();
90 cpu_idle(); 90 cpu_idle();
@@ -156,6 +156,9 @@ static void __init xen_fill_possible_map(void)
156{ 156{
157 int i, rc; 157 int i, rc;
158 158
159 if (xen_initial_domain())
160 return;
161
159 for (i = 0; i < nr_cpu_ids; i++) { 162 for (i = 0; i < nr_cpu_ids; i++) {
160 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL); 163 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
161 if (rc >= 0) { 164 if (rc >= 0) {
@@ -165,6 +168,27 @@ static void __init xen_fill_possible_map(void)
165 } 168 }
166} 169}
167 170
171static void __init xen_filter_cpu_maps(void)
172{
173 int i, rc;
174
175 if (!xen_initial_domain())
176 return;
177
178 num_processors = 0;
179 disabled_cpus = 0;
180 for (i = 0; i < nr_cpu_ids; i++) {
181 rc = HYPERVISOR_vcpu_op(VCPUOP_is_up, i, NULL);
182 if (rc >= 0) {
183 num_processors++;
184 set_cpu_possible(i, true);
185 } else {
186 set_cpu_possible(i, false);
187 set_cpu_present(i, false);
188 }
189 }
190}
191
168static void __init xen_smp_prepare_boot_cpu(void) 192static void __init xen_smp_prepare_boot_cpu(void)
169{ 193{
170 BUG_ON(smp_processor_id() != 0); 194 BUG_ON(smp_processor_id() != 0);
@@ -174,17 +198,25 @@ static void __init xen_smp_prepare_boot_cpu(void)
174 old memory can be recycled */ 198 old memory can be recycled */
175 make_lowmem_page_readwrite(xen_initial_gdt); 199 make_lowmem_page_readwrite(xen_initial_gdt);
176 200
201 xen_filter_cpu_maps();
177 xen_setup_vcpu_info_placement(); 202 xen_setup_vcpu_info_placement();
178} 203}
179 204
180static void __init xen_smp_prepare_cpus(unsigned int max_cpus) 205static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
181{ 206{
182 unsigned cpu; 207 unsigned cpu;
208 unsigned int i;
183 209
184 xen_init_lock_cpu(0); 210 xen_init_lock_cpu(0);
185 211
186 smp_store_cpu_info(0); 212 smp_store_cpu_info(0);
187 cpu_data(0).x86_max_cores = 1; 213 cpu_data(0).x86_max_cores = 1;
214
215 for_each_possible_cpu(i) {
216 zalloc_cpumask_var(&per_cpu(cpu_sibling_map, i), GFP_KERNEL);
217 zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL);
218 zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL);
219 }
188 set_cpu_sibling_map(0); 220 set_cpu_sibling_map(0);
189 221
190 if (xen_smp_intr_init(0)) 222 if (xen_smp_intr_init(0))
@@ -216,7 +248,7 @@ static void __init xen_smp_prepare_cpus(unsigned int max_cpus)
216 } 248 }
217} 249}
218 250
219static __cpuinit int 251static int __cpuinit
220cpu_initialize_context(unsigned int cpu, struct task_struct *idle) 252cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
221{ 253{
222 struct vcpu_guest_context *ctxt; 254 struct vcpu_guest_context *ctxt;
@@ -400,9 +432,9 @@ static void stop_self(void *v)
400 BUG(); 432 BUG();
401} 433}
402 434
403static void xen_smp_send_stop(void) 435static void xen_stop_other_cpus(int wait)
404{ 436{
405 smp_call_function(stop_self, NULL, 0); 437 smp_call_function(stop_self, NULL, wait);
406} 438}
407 439
408static void xen_smp_send_reschedule(int cpu) 440static void xen_smp_send_reschedule(int cpu)
@@ -460,7 +492,7 @@ static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id)
460 return IRQ_HANDLED; 492 return IRQ_HANDLED;
461} 493}
462 494
463static const struct smp_ops xen_smp_ops __initdata = { 495static const struct smp_ops xen_smp_ops __initconst = {
464 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu, 496 .smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu,
465 .smp_prepare_cpus = xen_smp_prepare_cpus, 497 .smp_prepare_cpus = xen_smp_prepare_cpus,
466 .smp_cpus_done = xen_smp_cpus_done, 498 .smp_cpus_done = xen_smp_cpus_done,
@@ -470,7 +502,7 @@ static const struct smp_ops xen_smp_ops __initdata = {
470 .cpu_disable = xen_cpu_disable, 502 .cpu_disable = xen_cpu_disable,
471 .play_dead = xen_play_dead, 503 .play_dead = xen_play_dead,
472 504
473 .smp_send_stop = xen_smp_send_stop, 505 .stop_other_cpus = xen_stop_other_cpus,
474 .smp_send_reschedule = xen_smp_send_reschedule, 506 .smp_send_reschedule = xen_smp_send_reschedule,
475 507
476 .send_call_func_ipi = xen_smp_send_call_function_ipi, 508 .send_call_func_ipi = xen_smp_send_call_function_ipi,
@@ -483,3 +515,41 @@ void __init xen_smp_init(void)
483 xen_fill_possible_map(); 515 xen_fill_possible_map();
484 xen_init_spinlocks(); 516 xen_init_spinlocks();
485} 517}
518
519static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
520{
521 native_smp_prepare_cpus(max_cpus);
522 WARN_ON(xen_smp_intr_init(0));
523
524 if (!xen_have_vector_callback)
525 return;
526 xen_init_lock_cpu(0);
527 xen_init_spinlocks();
528}
529
530static int __cpuinit xen_hvm_cpu_up(unsigned int cpu)
531{
532 int rc;
533 rc = native_cpu_up(cpu);
534 WARN_ON (xen_smp_intr_init(cpu));
535 return rc;
536}
537
538static void xen_hvm_cpu_die(unsigned int cpu)
539{
540 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
541 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
542 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
543 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
544 native_cpu_die(cpu);
545}
546
547void __init xen_hvm_smp_init(void)
548{
549 smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
550 smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
551 smp_ops.cpu_up = xen_hvm_cpu_up;
552 smp_ops.cpu_die = xen_hvm_cpu_die;
553 smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
554 smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
555}
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index e0500646585d..cc9b1e182fcf 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -159,8 +159,8 @@ static inline struct xen_spinlock *spinning_lock(struct xen_spinlock *xl)
159{ 159{
160 struct xen_spinlock *prev; 160 struct xen_spinlock *prev;
161 161
162 prev = __get_cpu_var(lock_spinners); 162 prev = __this_cpu_read(lock_spinners);
163 __get_cpu_var(lock_spinners) = xl; 163 __this_cpu_write(lock_spinners, xl);
164 164
165 wmb(); /* set lock of interest before count */ 165 wmb(); /* set lock of interest before count */
166 166
@@ -179,14 +179,14 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
 	asm(LOCK_PREFIX " decw %0"
 	    : "+m" (xl->spinners) : : "memory");
 	wmb();			/* decrement count before restoring lock */
-	__get_cpu_var(lock_spinners) = prev;
+	__this_cpu_write(lock_spinners, prev);
 }
 
 static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
 {
 	struct xen_spinlock *xl = (struct xen_spinlock *)lock;
 	struct xen_spinlock *prev;
-	int irq = __get_cpu_var(lock_kicker_irq);
+	int irq = __this_cpu_read(lock_kicker_irq);
 	int ret;
 	u64 start;
 
@@ -224,7 +224,7 @@ static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enab
 		goto out;
 	}
 
-	flags = __raw_local_save_flags();
+	flags = arch_local_save_flags();
 	if (irq_enable) {
 		ADD_STATS(taken_slow_irqenable, 1);
 		raw_local_irq_enable();
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 1d789d56877c..45329c8c226e 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -12,7 +12,7 @@
 #include "xen-ops.h"
 #include "mmu.h"
 
-void xen_pre_suspend(void)
+void xen_arch_pre_suspend(void)
 {
 	xen_start_info->store_mfn = mfn_to_pfn(xen_start_info->store_mfn);
 	xen_start_info->console.domU.mfn =
@@ -26,19 +26,22 @@ void xen_pre_suspend(void)
 	BUG();
 }
 
-void xen_hvm_post_suspend(int suspend_cancelled)
+void xen_arch_hvm_post_suspend(int suspend_cancelled)
 {
+#ifdef CONFIG_XEN_PVHVM
 	int cpu;
 	xen_hvm_init_shared_info();
 	xen_callback_vector();
+	xen_unplug_emulated_devices();
 	if (xen_feature(XENFEAT_hvm_safe_pvclock)) {
 		for_each_online_cpu(cpu) {
 			xen_setup_runstate_info(cpu);
 		}
 	}
+#endif
 }
 
-void xen_post_suspend(int suspend_cancelled)
+void xen_arch_post_suspend(int suspend_cancelled)
 {
 	xen_build_mfn_list_list();
 
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index b2bb5aa3b054..5158c505bef9 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -26,8 +26,6 @@
 
 #include "xen-ops.h"
 
-#define XEN_SHIFT 22
-
 /* Xen may fire a timer up to this many ns early */
 #define TIMER_SLOP	100000
 #define NS_PER_TICK	(1000000000LL / HZ)
@@ -135,24 +133,24 @@ static void do_stolen_accounting(void)
 
 	/* Add the appropriate number of ticks of stolen time,
 	   including any left-overs from last time. */
-	stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
+	stolen = runnable + offline + __this_cpu_read(xen_residual_stolen);
 
 	if (stolen < 0)
 		stolen = 0;
 
 	ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
-	__get_cpu_var(xen_residual_stolen) = stolen;
+	__this_cpu_write(xen_residual_stolen, stolen);
 	account_steal_ticks(ticks);
 
 	/* Add the appropriate number of ticks of blocked time,
 	   including any left-overs from last time. */
-	blocked += __get_cpu_var(xen_residual_blocked);
+	blocked += __this_cpu_read(xen_residual_blocked);
 
 	if (blocked < 0)
 		blocked = 0;
 
 	ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
-	__get_cpu_var(xen_residual_blocked) = blocked;
+	__this_cpu_write(xen_residual_blocked, blocked);
 	account_idle_ticks(ticks);
 }
 
@@ -211,8 +209,6 @@ static struct clocksource xen_clocksource __read_mostly = {
 	.rating = 400,
 	.read = xen_clocksource_get_cycles,
 	.mask = ~0,
-	.mult = 1<<XEN_SHIFT,		/* time directly in nanoseconds */
-	.shift = XEN_SHIFT,
 	.flags = CLOCK_SOURCE_IS_CONTINUOUS,
 };
 
@@ -397,7 +393,9 @@ void xen_setup_timer(int cpu)
 		name = "<timer kasprintf failed>";
 
 	irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
-				      IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
+				      IRQF_DISABLED|IRQF_PERCPU|
+				      IRQF_NOBALANCING|IRQF_TIMER|
+				      IRQF_FORCE_RESUME,
 				      name, NULL);
 
 	evt = &per_cpu(xen_clock_events, cpu);
@@ -426,6 +424,8 @@ void xen_timer_resume(void)
 {
 	int cpu;
 
+	pvclock_resume();
+
 	if (xen_clockevent != &xen_vcpuop_clockevent)
 		return;
 
@@ -435,16 +435,16 @@ void xen_timer_resume(void)
 	}
 }
 
-static const struct pv_time_ops xen_time_ops __initdata = {
+static const struct pv_time_ops xen_time_ops __initconst = {
 	.sched_clock = xen_clocksource_read,
 };
 
-static __init void xen_time_init(void)
+static void __init xen_time_init(void)
 {
 	int cpu = smp_processor_id();
 	struct timespec tp;
 
-	clocksource_register(&xen_clocksource);
+	clocksource_register_hz(&xen_clocksource, NSEC_PER_SEC);
 
 	if (HYPERVISOR_vcpu_op(VCPUOP_stop_periodic_timer, cpu, NULL) == 0) {
 		/* Successfully turned off 100Hz tick, so we have the
@@ -464,7 +464,7 @@ static __init void xen_time_init(void)
 	xen_setup_cpu_clockevents();
 }
 
-__init void xen_init_time_ops(void)
+void __init xen_init_time_ops(void)
 {
 	pv_time_ops = xen_time_ops;
 
@@ -486,7 +486,7 @@ static void xen_hvm_setup_cpu_clockevents(void)
 	xen_setup_cpu_clockevents();
 }
 
-__init void xen_hvm_init_time_ops(void)
+void __init xen_hvm_init_time_ops(void)
 {
 	/* vector callback is needed otherwise we cannot receive interrupts
 	 * on cpu > 0 and at this point we don't know how many cpus are
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 1a5ff24e29c0..aaa7291c9259 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -28,9 +28,9 @@ ENTRY(startup_xen)
 	__FINIT
 
 .pushsection .text
-	.align PAGE_SIZE_asm
+	.align PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE_asm
+	.skip PAGE_SIZE
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS, .asciz "linux")
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 7c8ab86163e9..97dfdc8757b3 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -30,6 +30,9 @@ void xen_setup_machphys_mapping(void);
 pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
 void xen_ident_map_ISA(void);
 void xen_reserve_top(void);
+extern unsigned long xen_max_p2m_pfn;
+
+void xen_set_pat(u64);
 
 char * __init xen_memory_setup(void);
 void __init xen_arch_setup(void);
@@ -40,7 +43,7 @@ void xen_vcpu_restore(void);
 
 void xen_callback_vector(void);
 void xen_hvm_init_shared_info(void);
-void __init xen_unplug_emulated_devices(void);
+void xen_unplug_emulated_devices(void);
 
 void __init xen_build_dynamic_phys_to_machine(void);
 
@@ -61,15 +64,17 @@ void xen_setup_vcpu_info_placement(void);
 
 #ifdef CONFIG_SMP
 void xen_smp_init(void);
+void __init xen_hvm_smp_init(void);
 
 extern cpumask_var_t xen_cpu_initialized_map;
 #else
 static inline void xen_smp_init(void) {}
+static inline void xen_hvm_smp_init(void) {}
 #endif
 
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 void __init xen_init_spinlocks(void);
-__cpuinit void xen_init_lock_cpu(int cpu);
+void __cpuinit xen_init_lock_cpu(int cpu);
 void xen_uninit_lock_cpu(int cpu);
 #else
 static inline void xen_init_spinlocks(void)