Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig | 79
-rw-r--r--  arch/x86/Kconfig.cpu | 20
-rw-r--r--  arch/x86/Kconfig.debug | 23
-rw-r--r--  arch/x86/Makefile | 5
-rw-r--r--  arch/x86/Makefile_32.cpu | 9
-rw-r--r--  arch/x86/boot/compressed/Makefile | 6
-rw-r--r--  arch/x86/boot/compressed/head_64.S | 3
-rw-r--r--  arch/x86/boot/compressed/misc.c | 19
-rw-r--r--  arch/x86/boot/compressed/relocs.c | 87
-rw-r--r--  arch/x86/boot/compressed/vmlinux.lds.S | 3
-rw-r--r--  arch/x86/boot/header.S | 2
-rw-r--r--  arch/x86/boot/mkcpustr.c | 2
-rw-r--r--  arch/x86/boot/version.c | 4
-rw-r--r--  arch/x86/boot/video-vga.c | 9
-rw-r--r--  arch/x86/boot/video.c | 13
-rw-r--r--  arch/x86/crypto/Makefile | 3
-rw-r--r--  arch/x86/crypto/aesni-intel_asm.S | 517
-rw-r--r--  arch/x86/crypto/fpu.c | 1
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_asm.S | 157
-rw-r--r--  arch/x86/crypto/ghash-clmulni-intel_glue.c | 333
-rw-r--r--  arch/x86/crypto/twofish-i586-asm_32.S | 10
-rw-r--r--  arch/x86/crypto/twofish-x86_64-asm_64.S | 20
-rw-r--r--  arch/x86/ia32/ia32_aout.c | 15
-rw-r--r--  arch/x86/ia32/ia32entry.S | 13
-rw-r--r--  arch/x86/ia32/sys_ia32.c | 176
-rw-r--r--  arch/x86/include/asm/Kbuild | 2
-rw-r--r--  arch/x86/include/asm/a.out-core.h | 10
-rw-r--r--  arch/x86/include/asm/acpi.h | 29
-rw-r--r--  arch/x86/include/asm/alternative-asm.h | 10
-rw-r--r--  arch/x86/include/asm/alternative.h | 17
-rw-r--r--  arch/x86/include/asm/amd_iommu.h | 16
-rw-r--r--  arch/x86/include/asm/amd_iommu_proto.h | 41
-rw-r--r--  arch/x86/include/asm/amd_iommu_types.h | 57
-rw-r--r--  arch/x86/include/asm/apb_timer.h | 70
-rw-r--r--  arch/x86/include/asm/apic.h | 21
-rw-r--r--  arch/x86/include/asm/apicdef.h | 6
-rw-r--r--  arch/x86/include/asm/apicnum.h | 12
-rw-r--r--  arch/x86/include/asm/asm-offsets.h | 1
-rw-r--r--  arch/x86/include/asm/atomic.h | 299
-rw-r--r--  arch/x86/include/asm/atomic64_32.h | 160
-rw-r--r--  arch/x86/include/asm/atomic64_64.h | 224
-rw-r--r--  arch/x86/include/asm/atomic_32.h | 415
-rw-r--r--  arch/x86/include/asm/atomic_64.h | 485
-rw-r--r--  arch/x86/include/asm/bug.h | 4
-rw-r--r--  arch/x86/include/asm/cache.h | 7
-rw-r--r--  arch/x86/include/asm/cacheflush.h | 2
-rw-r--r--  arch/x86/include/asm/calgary.h | 2
-rw-r--r--  arch/x86/include/asm/cmpxchg_32.h | 218
-rw-r--r--  arch/x86/include/asm/cmpxchg_64.h | 234
-rw-r--r--  arch/x86/include/asm/compat.h | 3
-rw-r--r--  arch/x86/include/asm/cpu_debug.h | 127
-rw-r--r--  arch/x86/include/asm/cpufeature.h | 6
-rw-r--r--  arch/x86/include/asm/debugreg.h | 36
-rw-r--r--  arch/x86/include/asm/desc_defs.h | 4
-rw-r--r--  arch/x86/include/asm/device.h | 2
-rw-r--r--  arch/x86/include/asm/dma-mapping.h | 7
-rw-r--r--  arch/x86/include/asm/e820.h | 28
-rw-r--r--  arch/x86/include/asm/elf.h | 36
-rw-r--r--  arch/x86/include/asm/entry_arch.h | 2
-rw-r--r--  arch/x86/include/asm/fb.h | 4
-rw-r--r--  arch/x86/include/asm/fixmap.h | 22
-rw-r--r--  arch/x86/include/asm/gart.h | 9
-rw-r--r--  arch/x86/include/asm/geode.h | 219
-rw-r--r--  arch/x86/include/asm/hardirq.h | 8
-rw-r--r--  arch/x86/include/asm/highmem.h | 4
-rw-r--r--  arch/x86/include/asm/hpet.h | 8
-rw-r--r--  arch/x86/include/asm/hw_breakpoint.h | 72
-rw-r--r--  arch/x86/include/asm/hw_irq.h | 43
-rw-r--r--  arch/x86/include/asm/hyperv.h | 186
-rw-r--r--  arch/x86/include/asm/i387.h | 19
-rw-r--r--  arch/x86/include/asm/i8259.h | 21
-rw-r--r--  arch/x86/include/asm/inat.h | 220
-rw-r--r--  arch/x86/include/asm/inat_types.h | 29
-rw-r--r--  arch/x86/include/asm/insn.h | 184
-rw-r--r--  arch/x86/include/asm/inst.h | 150
-rw-r--r--  arch/x86/include/asm/io.h | 156
-rw-r--r--  arch/x86/include/asm/io_32.h | 196
-rw-r--r--  arch/x86/include/asm/io_64.h | 181
-rw-r--r--  arch/x86/include/asm/io_apic.h | 8
-rw-r--r--  arch/x86/include/asm/iommu.h | 2
-rw-r--r--  arch/x86/include/asm/irq.h | 3
-rw-r--r--  arch/x86/include/asm/irq_vectors.h | 52
-rw-r--r--  arch/x86/include/asm/k8.h | 10
-rw-r--r--  arch/x86/include/asm/kprobes.h | 31
-rw-r--r--  arch/x86/include/asm/kvm.h | 34
-rw-r--r--  arch/x86/include/asm/kvm_emulate.h | 19
-rw-r--r--  arch/x86/include/asm/kvm_host.h | 94
-rw-r--r--  arch/x86/include/asm/kvm_para.h | 1
-rw-r--r--  arch/x86/include/asm/lguest_hcall.h | 29
-rw-r--r--  arch/x86/include/asm/local.h | 37
-rw-r--r--  arch/x86/include/asm/mce.h | 15
-rw-r--r--  arch/x86/include/asm/mmzone_32.h | 2
-rw-r--r--  arch/x86/include/asm/mmzone_64.h | 6
-rw-r--r--  arch/x86/include/asm/mpspec.h | 27
-rw-r--r--  arch/x86/include/asm/mrst.h | 19
-rw-r--r--  arch/x86/include/asm/msr-index.h | 4
-rw-r--r--  arch/x86/include/asm/msr.h | 27
-rw-r--r--  arch/x86/include/asm/nmi.h | 1
-rw-r--r--  arch/x86/include/asm/numa_64.h | 5
-rw-r--r--  arch/x86/include/asm/numaq.h | 5
-rw-r--r--  arch/x86/include/asm/olpc.h | 22
-rw-r--r--  arch/x86/include/asm/page_types.h | 4
-rw-r--r--  arch/x86/include/asm/paravirt.h | 23
-rw-r--r--  arch/x86/include/asm/paravirt_types.h | 18
-rw-r--r--  arch/x86/include/asm/pci.h | 39
-rw-r--r--  arch/x86/include/asm/pci_64.h | 2
-rw-r--r--  arch/x86/include/asm/pci_x86.h | 43
-rw-r--r--  arch/x86/include/asm/percpu.h | 199
-rw-r--r--  arch/x86/include/asm/perf_event.h | 42
-rw-r--r--  arch/x86/include/asm/pgalloc.h | 5
-rw-r--r--  arch/x86/include/asm/pgtable.h | 6
-rw-r--r--  arch/x86/include/asm/pgtable_32.h | 7
-rw-r--r--  arch/x86/include/asm/pgtable_64.h | 2
-rw-r--r--  arch/x86/include/asm/processor.h | 18
-rw-r--r--  arch/x86/include/asm/proto.h | 27
-rw-r--r--  arch/x86/include/asm/ptrace.h | 65
-rw-r--r--  arch/x86/include/asm/rwsem.h | 81
-rw-r--r--  arch/x86/include/asm/sections.h | 6
-rw-r--r--  arch/x86/include/asm/setup.h | 2
-rw-r--r--  arch/x86/include/asm/sigcontext.h | 4
-rw-r--r--  arch/x86/include/asm/smp.h | 9
-rw-r--r--  arch/x86/include/asm/spinlock.h | 62
-rw-r--r--  arch/x86/include/asm/spinlock_types.h | 10
-rw-r--r--  arch/x86/include/asm/stacktrace.h | 24
-rw-r--r--  arch/x86/include/asm/string_32.h | 9
-rw-r--r--  arch/x86/include/asm/svm.h | 5
-rw-r--r--  arch/x86/include/asm/swiotlb.h | 11
-rw-r--r--  arch/x86/include/asm/sys_ia32.h | 20
-rw-r--r--  arch/x86/include/asm/syscall.h | 2
-rw-r--r--  arch/x86/include/asm/syscalls.h | 49
-rw-r--r--  arch/x86/include/asm/system.h | 44
-rw-r--r--  arch/x86/include/asm/thread_info.h | 9
-rw-r--r--  arch/x86/include/asm/topology.h | 9
-rw-r--r--  arch/x86/include/asm/trampoline.h | 1
-rw-r--r--  arch/x86/include/asm/uaccess.h | 1
-rw-r--r--  arch/x86/include/asm/uaccess_32.h | 26
-rw-r--r--  arch/x86/include/asm/uaccess_64.h | 56
-rw-r--r--  arch/x86/include/asm/unistd_32.h | 9
-rw-r--r--  arch/x86/include/asm/unistd_64.h | 5
-rw-r--r--  arch/x86/include/asm/user.h | 58
-rw-r--r--  arch/x86/include/asm/uv/bios.h | 22
-rw-r--r--  arch/x86/include/asm/uv/uv.h | 1
-rw-r--r--  arch/x86/include/asm/uv/uv_bau.h | 2
-rw-r--r--  arch/x86/include/asm/uv/uv_hub.h | 143
-rw-r--r--  arch/x86/include/asm/uv/uv_irq.h | 14
-rw-r--r--  arch/x86/include/asm/visws/cobalt.h | 2
-rw-r--r--  arch/x86/include/asm/vmx.h | 9
-rw-r--r--  arch/x86/include/asm/x86_init.h | 31
-rw-r--r--  arch/x86/include/asm/xen/hypervisor.h | 27
-rw-r--r--  arch/x86/include/asm/xsave.h | 2
-rw-r--r--  arch/x86/kernel/Makefile | 4
-rw-r--r--  arch/x86/kernel/acpi/Makefile | 2
-rw-r--r--  arch/x86/kernel/acpi/boot.c | 173
-rw-r--r--  arch/x86/kernel/acpi/cstate.c | 2
-rw-r--r--  arch/x86/kernel/acpi/processor.c | 101
-rw-r--r--  arch/x86/kernel/acpi/sleep.c | 26
-rw-r--r--  arch/x86/kernel/alternative.c | 83
-rw-r--r--  arch/x86/kernel/amd_iommu.c | 1330
-rw-r--r--  arch/x86/kernel/amd_iommu_init.c | 157
-rw-r--r--  arch/x86/kernel/apb_timer.c | 785
-rw-r--r--  arch/x86/kernel/aperture_64.c | 28
-rw-r--r--  arch/x86/kernel/apic/Makefile | 2
-rw-r--r--  arch/x86/kernel/apic/apic.c | 68
-rw-r--r--  arch/x86/kernel/apic/apic_flat_64.c | 12
-rw-r--r--  arch/x86/kernel/apic/apic_noop.c | 200
-rw-r--r--  arch/x86/kernel/apic/bigsmp_32.c | 18
-rw-r--r--  arch/x86/kernel/apic/es7000_32.c | 29
-rw-r--r--  arch/x86/kernel/apic/io_apic.c | 802
-rw-r--r--  arch/x86/kernel/apic/nmi.c | 28
-rw-r--r--  arch/x86/kernel/apic/numaq_32.c | 21
-rw-r--r--  arch/x86/kernel/apic/probe_32.c | 31
-rw-r--r--  arch/x86/kernel/apic/probe_64.c | 13
-rw-r--r--  arch/x86/kernel/apic/summit_32.c | 10
-rw-r--r--  arch/x86/kernel/apic/x2apic_cluster.c | 5
-rw-r--r--  arch/x86/kernel/apic/x2apic_phys.c | 5
-rw-r--r--  arch/x86/kernel/apic/x2apic_uv_x.c | 164
-rw-r--r--  arch/x86/kernel/apm_32.c | 18
-rw-r--r--  arch/x86/kernel/bios_uv.c | 47
-rw-r--r--  arch/x86/kernel/bootflag.c | 1
-rw-r--r--  arch/x86/kernel/cpu/Makefile | 3
-rw-r--r--  arch/x86/kernel/cpu/addon_cpuid_features.c | 19
-rw-r--r--  arch/x86/kernel/cpu/amd.c | 57
-rw-r--r--  arch/x86/kernel/cpu/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/common.c | 52
-rw-r--r--  arch/x86/kernel/cpu/cpu.h | 2
-rw-r--r--  arch/x86/kernel/cpu/cpu_debug.c | 688
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Kconfig | 14
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/Makefile | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c | 50
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/elanfreq.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/gx-suspmod.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longhaul.c | 2
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/longrun.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/p4-clockmod.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c | 621
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k6.c | 3
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k7.c | 19
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/powernow-k8.c | 44
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c | 1
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-ich.c | 3
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.c | 7
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-lib.h | 24
-rw-r--r--  arch/x86/kernel/cpu/cpufreq/speedstep-smi.c | 3
-rw-r--r--  arch/x86/kernel/cpu/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/intel.c | 33
-rw-r--r--  arch/x86/kernel/cpu/intel_cacheinfo.c | 420
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce-inject.c | 23
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce.c | 129
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_amd.c | 3
-rw-r--r--  arch/x86/kernel/cpu/mcheck/mce_intel.c | 5
-rw-r--r--  arch/x86/kernel/cpu/mcheck/therm_throt.c | 45
-rw-r--r--  arch/x86/kernel/cpu/mtrr/Makefile | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/amd.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/centaur.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cleanup.c | 189
-rw-r--r--  arch/x86/kernel/cpu/mtrr/cyrix.c | 2
-rw-r--r--  arch/x86/kernel/cpu/mtrr/generic.c | 11
-rw-r--r--  arch/x86/kernel/cpu/mtrr/if.c | 12
-rw-r--r--  arch/x86/kernel/cpu/mtrr/main.c | 7
-rw-r--r--  arch/x86/kernel/cpu/mtrr/mtrr.h | 6
-rw-r--r--  arch/x86/kernel/cpu/mtrr/state.c | 94
-rw-r--r--  arch/x86/kernel/cpu/perf_event.c | 1865
-rw-r--r--  arch/x86/kernel/cpu/perf_event_amd.c | 422
-rw-r--r--  arch/x86/kernel/cpu/perf_event_intel.c | 980
-rw-r--r--  arch/x86/kernel/cpu/perf_event_p6.c | 159
-rw-r--r--  arch/x86/kernel/cpu/perfctr-watchdog.c | 15
-rw-r--r--  arch/x86/kernel/cpu/transmeta.c | 2
-rw-r--r--  arch/x86/kernel/cpu/vmware.c | 2
-rw-r--r--  arch/x86/kernel/cpuid.c | 25
-rw-r--r--  arch/x86/kernel/crash.c | 7
-rw-r--r--  arch/x86/kernel/crash_dump_32.c | 1
-rw-r--r--  arch/x86/kernel/ds.c | 4
-rw-r--r--  arch/x86/kernel/dumpstack.c | 50
-rw-r--r--  arch/x86/kernel/dumpstack.h | 28
-rw-r--r--  arch/x86/kernel/dumpstack_32.c | 14
-rw-r--r--  arch/x86/kernel/dumpstack_64.c | 99
-rw-r--r--  arch/x86/kernel/e820.c | 370
-rw-r--r--  arch/x86/kernel/efi.c | 2
-rw-r--r--  arch/x86/kernel/entry_32.S | 100
-rw-r--r--  arch/x86/kernel/entry_64.S | 84
-rw-r--r--  arch/x86/kernel/ftrace.c | 133
-rw-r--r--  arch/x86/kernel/geode_32.c | 196
-rw-r--r--  arch/x86/kernel/head32.c | 14
-rw-r--r--  arch/x86/kernel/head64.c | 5
-rw-r--r--  arch/x86/kernel/head_32.S | 24
-rw-r--r--  arch/x86/kernel/head_64.S | 9
-rw-r--r--  arch/x86/kernel/hpet.c | 95
-rw-r--r--  arch/x86/kernel/hw_breakpoint.c | 530
-rw-r--r--  arch/x86/kernel/i387.c | 72
-rw-r--r--  arch/x86/kernel/i8259.c | 95
-rw-r--r--  arch/x86/kernel/ioport.c | 28
-rw-r--r--  arch/x86/kernel/irq.c | 126
-rw-r--r--  arch/x86/kernel/irq_32.c | 45
-rw-r--r--  arch/x86/kernel/irq_64.c | 58
-rw-r--r--  arch/x86/kernel/irqinit.c | 63
-rw-r--r--  arch/x86/kernel/k8.c | 16
-rw-r--r--  arch/x86/kernel/kdebugfs.c | 1
-rw-r--r--  arch/x86/kernel/kgdb.c | 241
-rw-r--r--  arch/x86/kernel/kprobes.c | 888
-rw-r--r--  arch/x86/kernel/ldt.c | 1
-rw-r--r--  arch/x86/kernel/machine_kexec_32.c | 8
-rw-r--r--  arch/x86/kernel/machine_kexec_64.c | 3
-rw-r--r--  arch/x86/kernel/mca_32.c | 1
-rw-r--r--  arch/x86/kernel/mfgpt_32.c | 410
-rw-r--r--  arch/x86/kernel/microcode_amd.c | 57
-rw-r--r--  arch/x86/kernel/microcode_core.c | 28
-rw-r--r--  arch/x86/kernel/microcode_intel.c | 47
-rw-r--r--  arch/x86/kernel/mmconf-fam10h_64.c | 7
-rw-r--r--  arch/x86/kernel/module.c | 1
-rw-r--r--  arch/x86/kernel/mpparse.c | 54
-rw-r--r--  arch/x86/kernel/mrst.c | 216
-rw-r--r--  arch/x86/kernel/msr.c | 26
-rw-r--r--  arch/x86/kernel/olpc.c | 12
-rw-r--r--  arch/x86/kernel/paravirt-spinlocks.c | 4
-rw-r--r--  arch/x86/kernel/paravirt.c | 4
-rw-r--r--  arch/x86/kernel/pci-calgary_64.c | 102
-rw-r--r--  arch/x86/kernel/pci-dma.c | 55
-rw-r--r--  arch/x86/kernel/pci-gart_64.c | 169
-rw-r--r--  arch/x86/kernel/pci-nommu.c | 12
-rw-r--r--  arch/x86/kernel/pci-swiotlb.c | 21
-rw-r--r--  arch/x86/kernel/process.c | 171
-rw-r--r--  arch/x86/kernel/process_32.c | 115
-rw-r--r--  arch/x86/kernel/process_64.c | 131
-rw-r--r--  arch/x86/kernel/ptrace.c | 494
-rw-r--r--  arch/x86/kernel/quirks.c | 22
-rw-r--r--  arch/x86/kernel/reboot.c | 29
-rw-r--r--  arch/x86/kernel/reboot_fixups_32.c | 3
-rw-r--r--  arch/x86/kernel/setup.c | 186
-rw-r--r--  arch/x86/kernel/setup_percpu.c | 19
-rw-r--r--  arch/x86/kernel/signal.c | 24
-rw-r--r--  arch/x86/kernel/smp.c | 1
-rw-r--r--  arch/x86/kernel/smpboot.c | 86
-rw-r--r--  arch/x86/kernel/stacktrace.c | 18
-rw-r--r--  arch/x86/kernel/sys_i386_32.c | 210
-rw-r--r--  arch/x86/kernel/sys_x86_64.c | 29
-rw-r--r--  arch/x86/kernel/syscall_table_32.S | 9
-rw-r--r--  arch/x86/kernel/time.c | 4
-rw-r--r--  arch/x86/kernel/tlb_uv.c | 5
-rw-r--r--  arch/x86/kernel/trampoline.c | 20
-rw-r--r--  arch/x86/kernel/traps.c | 76
-rw-r--r--  arch/x86/kernel/tsc.c | 7
-rw-r--r--  arch/x86/kernel/tsc_sync.c | 23
-rw-r--r--  arch/x86/kernel/uv_irq.c | 239
-rw-r--r--  arch/x86/kernel/uv_sysfs.c | 6
-rw-r--r--  arch/x86/kernel/uv_time.c | 94
-rw-r--r--  arch/x86/kernel/visws_quirks.c | 37
-rw-r--r--  arch/x86/kernel/vm86_32.c | 11
-rw-r--r--  arch/x86/kernel/vmi_32.c | 36
-rw-r--r--  arch/x86/kernel/vmiclock_32.c | 10
-rw-r--r--  arch/x86/kernel/vmlinux.lds.S | 48
-rw-r--r--  arch/x86/kernel/vsyscall_64.c | 10
-rw-r--r--  arch/x86/kernel/x8664_ksyms_64.c | 14
-rw-r--r--  arch/x86/kernel/x86_init.c | 21
-rw-r--r--  arch/x86/kernel/xsave.c | 1
-rw-r--r--  arch/x86/kvm/Kconfig | 3
-rw-r--r--  arch/x86/kvm/Makefile | 3
-rw-r--r--  arch/x86/kvm/emulate.c | 589
-rw-r--r--  arch/x86/kvm/i8254.c | 41
-rw-r--r--  arch/x86/kvm/i8254.h | 2
-rw-r--r--  arch/x86/kvm/i8259.c | 87
-rw-r--r--  arch/x86/kvm/irq.h | 10
-rw-r--r--  arch/x86/kvm/kvm_cache_regs.h | 31
-rw-r--r--  arch/x86/kvm/lapic.c | 52
-rw-r--r--  arch/x86/kvm/lapic.h | 8
-rw-r--r--  arch/x86/kvm/mmu.c | 152
-rw-r--r--  arch/x86/kvm/mmu.h | 35
-rw-r--r--  arch/x86/kvm/paging_tmpl.h | 36
-rw-r--r--  arch/x86/kvm/svm.c | 660
-rw-r--r--  arch/x86/kvm/trace.h | 224
-rw-r--r--  arch/x86/kvm/vmx.c | 862
-rw-r--r--  arch/x86/kvm/x86.c | 1680
-rw-r--r--  arch/x86/kvm/x86.h | 30
-rw-r--r--  arch/x86/lguest/boot.c | 61
-rw-r--r--  arch/x86/lguest/i386_head.S | 2
-rw-r--r--  arch/x86/lib/.gitignore | 1
-rw-r--r--  arch/x86/lib/Makefile | 20
-rw-r--r--  arch/x86/lib/cache-smp.c | 19
-rw-r--r--  arch/x86/lib/copy_user_64.S | 20
-rw-r--r--  arch/x86/lib/inat.c | 90
-rw-r--r--  arch/x86/lib/insn.c | 516
-rw-r--r--  arch/x86/lib/io_64.c | 25
-rw-r--r--  arch/x86/lib/memcpy_64.S | 23
-rw-r--r--  arch/x86/lib/memset_64.S | 18
-rw-r--r--  arch/x86/lib/msr-smp.c | 204
-rw-r--r--  arch/x86/lib/msr.c | 227
-rw-r--r--  arch/x86/lib/rwsem_64.S | 81
-rw-r--r--  arch/x86/lib/usercopy_32.c | 10
-rw-r--r--  arch/x86/lib/x86-opcode-map.txt | 893
-rw-r--r--  arch/x86/mm/extable.c | 31
-rw-r--r--  arch/x86/mm/fault.c | 13
-rw-r--r--  arch/x86/mm/gup.c | 2
-rw-r--r--  arch/x86/mm/hugetlbpage.c | 1
-rw-r--r--  arch/x86/mm/init.c | 44
-rw-r--r--  arch/x86/mm/init_32.c | 30
-rw-r--r--  arch/x86/mm/init_64.c | 64
-rw-r--r--  arch/x86/mm/ioremap.c | 81
-rw-r--r--  arch/x86/mm/k8topology_64.c | 101
-rw-r--r--  arch/x86/mm/kmemcheck/error.c | 19
-rw-r--r--  arch/x86/mm/kmemcheck/kmemcheck.c | 2
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.c | 16
-rw-r--r--  arch/x86/mm/kmemcheck/shadow.h | 2
-rw-r--r--  arch/x86/mm/kmmio.c | 58
-rw-r--r--  arch/x86/mm/mmap.c | 4
-rw-r--r--  arch/x86/mm/mmio-mod.c | 72
-rw-r--r--  arch/x86/mm/numa_32.c | 7
-rw-r--r--  arch/x86/mm/numa_64.c | 506
-rw-r--r--  arch/x86/mm/pageattr.c | 45
-rw-r--r--  arch/x86/mm/pat.c | 25
-rw-r--r--  arch/x86/mm/pgtable.c | 32
-rw-r--r--  arch/x86/mm/pgtable_32.c | 3
-rw-r--r--  arch/x86/mm/setup_nx.c | 59
-rw-r--r--  arch/x86/mm/srat_32.c | 2
-rw-r--r--  arch/x86/mm/srat_64.c | 44
-rw-r--r--  arch/x86/mm/testmmiotrace.c | 29
-rw-r--r--  arch/x86/mm/tlb.c | 11
-rw-r--r--  arch/x86/oprofile/backtrace.c | 9
-rw-r--r--  arch/x86/oprofile/nmi_int.c | 20
-rw-r--r--  arch/x86/oprofile/op_model_amd.c | 261
-rw-r--r--  arch/x86/oprofile/op_model_p4.c | 6
-rw-r--r--  arch/x86/oprofile/op_model_ppro.c | 21
-rw-r--r--  arch/x86/oprofile/op_x86_model.h | 20
-rw-r--r--  arch/x86/pci/Makefile | 8
-rw-r--r--  arch/x86/pci/acpi.c | 170
-rw-r--r--  arch/x86/pci/amd_bus.c | 239
-rw-r--r--  arch/x86/pci/bus_numa.c | 101
-rw-r--r--  arch/x86/pci/bus_numa.h | 25
-rw-r--r--  arch/x86/pci/common.c | 30
-rw-r--r--  arch/x86/pci/early.c | 7
-rw-r--r--  arch/x86/pci/i386.c | 58
-rw-r--r--  arch/x86/pci/init.c | 8
-rw-r--r--  arch/x86/pci/irq.c | 19
-rw-r--r--  arch/x86/pci/legacy.c | 24
-rw-r--r--  arch/x86/pci/mmconfig-shared.c | 356
-rw-r--r--  arch/x86/pci/mmconfig_32.c | 16
-rw-r--r--  arch/x86/pci/mmconfig_64.c | 88
-rw-r--r--  arch/x86/pci/mrst.c | 266
-rw-r--r--  arch/x86/pci/numaq_32.c | 12
-rw-r--r--  arch/x86/pci/olpc.c | 3
-rw-r--r--  arch/x86/pci/pcbios.c | 1
-rw-r--r--  arch/x86/pci/visws.c | 6
-rw-r--r--  arch/x86/power/cpu.c | 26
-rw-r--r--  arch/x86/power/hibernate_32.c | 1
-rw-r--r--  arch/x86/power/hibernate_64.c | 1
-rw-r--r--  arch/x86/power/hibernate_asm_32.S | 15
-rw-r--r--  arch/x86/tools/Makefile | 31
-rw-r--r--  arch/x86/tools/chkobjdump.awk | 33
-rw-r--r--  arch/x86/tools/distill.awk | 47
-rw-r--r--  arch/x86/tools/gen-insn-attr-x86.awk | 378
-rw-r--r--  arch/x86/tools/test_get_len.c | 173
-rw-r--r--  arch/x86/vdso/vdso32-setup.c | 1
-rw-r--r--  arch/x86/vdso/vma.c | 1
-rw-r--r--  arch/x86/xen/debugfs.c | 1
-rw-r--r--  arch/x86/xen/enlighten.c | 49
-rw-r--r--  arch/x86/xen/mmu.c | 24
-rw-r--r--  arch/x86/xen/smp.c | 47
-rw-r--r--  arch/x86/xen/spinlock.c | 17
-rw-r--r--  arch/x86/xen/suspend.c | 17
-rw-r--r--  arch/x86/xen/time.c | 32
-rw-r--r--  arch/x86/xen/xen-asm_32.S | 4
-rw-r--r--  arch/x86/xen/xen-asm_64.S | 4
-rw-r--r--  arch/x86/xen/xen-ops.h | 2
421 files changed, 21986 insertions(+), 13367 deletions(-)
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index e2cd95ebeeb1..12fbd5b65f1f 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -31,6 +31,7 @@ config X86
31 select ARCH_WANT_FRAME_POINTERS 31 select ARCH_WANT_FRAME_POINTERS
32 select HAVE_DMA_ATTRS 32 select HAVE_DMA_ATTRS
33 select HAVE_KRETPROBES 33 select HAVE_KRETPROBES
34 select HAVE_OPTPROBES
34 select HAVE_FTRACE_MCOUNT_RECORD 35 select HAVE_FTRACE_MCOUNT_RECORD
35 select HAVE_DYNAMIC_FTRACE 36 select HAVE_DYNAMIC_FTRACE
36 select HAVE_FUNCTION_TRACER 37 select HAVE_FUNCTION_TRACER
@@ -45,11 +46,17 @@ config X86
45 select HAVE_GENERIC_DMA_COHERENT if X86_32 46 select HAVE_GENERIC_DMA_COHERENT if X86_32
46 select HAVE_EFFICIENT_UNALIGNED_ACCESS 47 select HAVE_EFFICIENT_UNALIGNED_ACCESS
47 select USER_STACKTRACE_SUPPORT 48 select USER_STACKTRACE_SUPPORT
49 select HAVE_REGS_AND_STACK_ACCESS_API
48 select HAVE_DMA_API_DEBUG 50 select HAVE_DMA_API_DEBUG
49 select HAVE_KERNEL_GZIP 51 select HAVE_KERNEL_GZIP
50 select HAVE_KERNEL_BZIP2 52 select HAVE_KERNEL_BZIP2
51 select HAVE_KERNEL_LZMA 53 select HAVE_KERNEL_LZMA
54 select HAVE_KERNEL_LZO
55 select HAVE_HW_BREAKPOINT
56 select PERF_EVENTS
57 select ANON_INODES
52 select HAVE_ARCH_KMEMCHECK 58 select HAVE_ARCH_KMEMCHECK
59 select HAVE_USER_RETURN_NOTIFIER
53 60
54config OUTPUT_FORMAT 61config OUTPUT_FORMAT
55 string 62 string
@@ -95,6 +102,9 @@ config ZONE_DMA
95config SBUS 102config SBUS
96 bool 103 bool
97 104
105config NEED_DMA_MAP_STATE
106 def_bool (X86_64 || DMAR || DMA_API_DEBUG)
107
98config GENERIC_ISA_DMA 108config GENERIC_ISA_DMA
99 def_bool y 109 def_bool y
100 110
@@ -178,6 +188,9 @@ config ARCH_SUPPORTS_OPTIMIZED_INLINING
178config ARCH_SUPPORTS_DEBUG_PAGEALLOC 188config ARCH_SUPPORTS_DEBUG_PAGEALLOC
179 def_bool y 189 def_bool y
180 190
191config HAVE_EARLY_RES
192 def_bool y
193
181config HAVE_INTEL_TXT 194config HAVE_INTEL_TXT
182 def_bool y 195 def_bool y
183 depends on EXPERIMENTAL && DMAR && ACPI 196 depends on EXPERIMENTAL && DMAR && ACPI
@@ -383,8 +396,12 @@ config X86_ELAN
383 396
384config X86_MRST 397config X86_MRST
385 bool "Moorestown MID platform" 398 bool "Moorestown MID platform"
399 depends on PCI
400 depends on PCI_GOANY
386 depends on X86_32 401 depends on X86_32
387 depends on X86_EXTENDED_PLATFORM 402 depends on X86_EXTENDED_PLATFORM
403 depends on X86_IO_APIC
404 select APB_TIMER
388 ---help--- 405 ---help---
389 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin 406 Moorestown is Intel's Low Power Intel Architecture (LPIA) based Moblin
390 Internet Device(MID) platform. Moorestown consists of two chips: 407 Internet Device(MID) platform. Moorestown consists of two chips:
@@ -419,6 +436,7 @@ config X86_32_NON_STANDARD
419config X86_NUMAQ 436config X86_NUMAQ
420 bool "NUMAQ (IBM/Sequent)" 437 bool "NUMAQ (IBM/Sequent)"
421 depends on X86_32_NON_STANDARD 438 depends on X86_32_NON_STANDARD
439 depends on PCI
422 select NUMA 440 select NUMA
423 select X86_MPPARSE 441 select X86_MPPARSE
424 ---help--- 442 ---help---
@@ -563,6 +581,18 @@ config PARAVIRT_DEBUG
563 Enable to debug paravirt_ops internals. Specifically, BUG if 581 Enable to debug paravirt_ops internals. Specifically, BUG if
564 a paravirt_op is missing when it is called. 582 a paravirt_op is missing when it is called.
565 583
584config NO_BOOTMEM
585 default y
586 bool "Disable Bootmem code"
587 ---help---
588 Use early_res directly instead of bootmem before slab is ready.
589 - allocator (buddy) [generic]
590 - early allocator (bootmem) [generic]
591 - very early allocator (reserve_early*()) [x86]
592 - very very early allocator (early brk model) [x86]
593 So reduce one layer between early allocator to final allocator
594
595
566config MEMTEST 596config MEMTEST
567 bool "Memtest" 597 bool "Memtest"
568 ---help--- 598 ---help---
@@ -607,6 +637,16 @@ config HPET_EMULATE_RTC
607 def_bool y 637 def_bool y
608 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y) 638 depends on HPET_TIMER && (RTC=y || RTC=m || RTC_DRV_CMOS=m || RTC_DRV_CMOS=y)
609 639
640config APB_TIMER
641 def_bool y if MRST
642 prompt "Langwell APB Timer Support" if X86_MRST
643 help
644 APB timer is the replacement for 8254, HPET on X86 MID platforms.
645 The APBT provides a stable time base on SMP
646 systems, unlike the TSC, but it is more expensive to access,
647 as it is off-chip. APB timers are always running regardless of CPU
648 C states, they are used as per CPU clockevent device when possible.
649
610# Mark as embedded because too many people got it wrong. 650# Mark as embedded because too many people got it wrong.
611# The code disables itself when not needed. 651# The code disables itself when not needed.
612config DMI 652config DMI
@@ -622,7 +662,7 @@ config GART_IOMMU
622 bool "GART IOMMU support" if EMBEDDED 662 bool "GART IOMMU support" if EMBEDDED
623 default y 663 default y
624 select SWIOTLB 664 select SWIOTLB
625 depends on X86_64 && PCI 665 depends on X86_64 && PCI && K8_NB
626 ---help--- 666 ---help---
627 Support for full DMA access of devices with 32bit memory access only 667 Support for full DMA access of devices with 32bit memory access only
628 on systems with more than 3GB. This is usually needed for USB, 668 on systems with more than 3GB. This is usually needed for USB,
@@ -984,12 +1024,6 @@ config X86_CPUID
984 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to 1024 with major 203 and minors 0 to 31 for /dev/cpu/0/cpuid to
985 /dev/cpu/31/cpuid. 1025 /dev/cpu/31/cpuid.
986 1026
987config X86_CPU_DEBUG
988 tristate "/sys/kernel/debug/x86/cpu/* - CPU Debug support"
989 ---help---
990 If you select this option, this will provide various x86 CPUs
991 information through debugfs.
992
993choice 1027choice
994 prompt "High Memory Support" 1028 prompt "High Memory Support"
995 default HIGHMEM4G if !X86_NUMAQ 1029 default HIGHMEM4G if !X86_NUMAQ
@@ -1182,8 +1216,8 @@ config NUMA_EMU
1182 1216
1183config NODES_SHIFT 1217config NODES_SHIFT
1184 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP 1218 int "Maximum NUMA Nodes (as a power of 2)" if !MAXSMP
1185 range 1 9 1219 range 1 10
1186 default "9" if MAXSMP 1220 default "10" if MAXSMP
1187 default "6" if X86_64 1221 default "6" if X86_64
1188 default "4" if X86_NUMAQ 1222 default "4" if X86_NUMAQ
1189 default "3" 1223 default "3"
@@ -1242,6 +1276,11 @@ config ARCH_MEMORY_PROBE
1242 def_bool X86_64 1276 def_bool X86_64
1243 depends on MEMORY_HOTPLUG 1277 depends on MEMORY_HOTPLUG
1244 1278
1279config ILLEGAL_POINTER_VALUE
1280 hex
1281 default 0 if X86_32
1282 default 0xdead000000000000 if X86_64
1283
1245source "mm/Kconfig" 1284source "mm/Kconfig"
1246 1285
1247config HIGHPTE 1286config HIGHPTE
@@ -1330,7 +1369,9 @@ config MATH_EMULATION
1330 kernel, it won't hurt. 1369 kernel, it won't hurt.
1331 1370
1332config MTRR 1371config MTRR
1333 bool "MTRR (Memory Type Range Register) support" 1372 bool
1373 default y
1374 prompt "MTRR (Memory Type Range Register) support" if EMBEDDED
1334 ---help--- 1375 ---help---
1335 On Intel P6 family processors (Pentium Pro, Pentium II and later) 1376 On Intel P6 family processors (Pentium Pro, Pentium II and later)
1336 the Memory Type Range Registers (MTRRs) may be used to control 1377 the Memory Type Range Registers (MTRRs) may be used to control
@@ -1396,7 +1437,8 @@ config MTRR_SANITIZER_SPARE_REG_NR_DEFAULT
1396 1437
1397config X86_PAT 1438config X86_PAT
1398 bool 1439 bool
1399 prompt "x86 PAT support" 1440 default y
1441 prompt "x86 PAT support" if EMBEDDED
1400 depends on MTRR 1442 depends on MTRR
1401 ---help--- 1443 ---help---
1402 Use PAT attributes to setup page level cache control. 1444 Use PAT attributes to setup page level cache control.
@@ -1602,7 +1644,7 @@ config COMPAT_VDSO
1602 depends on X86_32 || IA32_EMULATION 1644 depends on X86_32 || IA32_EMULATION
1603 ---help--- 1645 ---help---
1604 Map the 32-bit VDSO to the predictable old-style address too. 1646 Map the 32-bit VDSO to the predictable old-style address too.
1605 ---help--- 1647
1606 Say N here if you are running a sufficiently recent glibc 1648 Say N here if you are running a sufficiently recent glibc
1607 version (2.3.3 or later), to remove the high-mapped 1649 version (2.3.3 or later), to remove the high-mapped
1608 VDSO mapping and to exclusively use the randomized VDSO. 1650 VDSO mapping and to exclusively use the randomized VDSO.
@@ -2007,18 +2049,9 @@ config SCx200HR_TIMER
2007 processor goes idle (as is done by the scheduler). The 2049 processor goes idle (as is done by the scheduler). The
2008 other workaround is idle=poll boot option. 2050 other workaround is idle=poll boot option.
2009 2051
2010config GEODE_MFGPT_TIMER
2011 def_bool y
2012 prompt "Geode Multi-Function General Purpose Timer (MFGPT) events"
2013 depends on MGEODE_LX && GENERIC_TIME && GENERIC_CLOCKEVENTS
2014 ---help---
2015 This driver provides a clock event source based on the MFGPT
2016 timer(s) in the CS5535 and CS5536 companion chip for the geode.
2017 MFGPTs have a better resolution and max interval than the
2018 generic PIT, and are suitable for use as high-res timers.
2019
2020config OLPC 2052config OLPC
2021 bool "One Laptop Per Child support" 2053 bool "One Laptop Per Child support"
2054 select GPIOLIB
2022 default n 2055 default n
2023 ---help--- 2056 ---help---
2024 Add support for detecting the unique features of the OLPC 2057 Add support for detecting the unique features of the OLPC
@@ -2028,7 +2061,7 @@ endif # X86_32
2028 2061
2029config K8_NB 2062config K8_NB
2030 def_bool y 2063 def_bool y
2031 depends on AGP_AMD64 || (X86_64 && (GART_IOMMU || (PCI && NUMA))) 2064 depends on CPU_SUP_AMD && PCI
2032 2065
2033source "drivers/pcmcia/Kconfig" 2066source "drivers/pcmcia/Kconfig"
2034 2067
diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu
index 2649840d888f..a19829374e6a 100644
--- a/arch/x86/Kconfig.cpu
+++ b/arch/x86/Kconfig.cpu
@@ -301,15 +301,11 @@ config X86_CPU
301 301
302# 302#
303# Define implied options from the CPU selection here 303# Define implied options from the CPU selection here
304config X86_L1_CACHE_BYTES 304config X86_INTERNODE_CACHE_SHIFT
305 int 305 int
306 default "128" if MPSC 306 default "12" if X86_VSMP
307 default "64" if GENERIC_CPU || MK8 || MCORE2 || MATOM || X86_32 307 default "7" if NUMA
308 308 default X86_L1_CACHE_SHIFT
309config X86_INTERNODE_CACHE_BYTES
310 int
311 default "4096" if X86_VSMP
312 default X86_L1_CACHE_BYTES if !X86_VSMP
313 309
314config X86_CMPXCHG 310config X86_CMPXCHG
315 def_bool X86_64 || (X86_32 && !M386) 311 def_bool X86_64 || (X86_32 && !M386)
@@ -317,13 +313,13 @@ config X86_CMPXCHG
317config X86_L1_CACHE_SHIFT 313config X86_L1_CACHE_SHIFT
318 int 314 int
319 default "7" if MPENTIUM4 || MPSC 315 default "7" if MPENTIUM4 || MPSC
316 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
320 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1 317 default "4" if X86_ELAN || M486 || M386 || MGEODEGX1
321 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX 318 default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX
322 default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU
323 319
324config X86_XADD 320config X86_XADD
325 def_bool y 321 def_bool y
326 depends on X86_32 && !M386 322 depends on X86_64 || !M386
327 323
328config X86_PPRO_FENCE 324config X86_PPRO_FENCE
329 bool "PentiumPro memory ordering errata workaround" 325 bool "PentiumPro memory ordering errata workaround"
@@ -400,13 +396,13 @@ config X86_TSC
400 396
401config X86_CMPXCHG64 397config X86_CMPXCHG64
402 def_bool y 398 def_bool y
403 depends on !M386 && !M486 399 depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM
404 400
405# this should be set for all -march=.. options where the compiler 401# this should be set for all -march=.. options where the compiler
406# generates cmov. 402# generates cmov.
407config X86_CMOV 403config X86_CMOV
408 def_bool y 404 def_bool y
409 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM) 405 depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX)
410 406
411config X86_MINIMUM_CPU_FAMILY 407config X86_MINIMUM_CPU_FAMILY
412 int 408 int
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index d105f29bb6bb..bc01e3ebfeb2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -186,6 +186,15 @@ config X86_DS_SELFTEST
186config HAVE_MMIOTRACE_SUPPORT 186config HAVE_MMIOTRACE_SUPPORT
187 def_bool y 187 def_bool y
188 188
189config X86_DECODER_SELFTEST
190 bool "x86 instruction decoder selftest"
191 depends on DEBUG_KERNEL && KPROBES
192 ---help---
193 Perform x86 instruction decoder selftests at build time.
194 This option is useful for checking the sanity of x86 instruction
195 decoder code.
196 If unsure, say "N".
197
189# 198#
190# IO delay types: 199# IO delay types:
191# 200#
@@ -287,4 +296,18 @@ config OPTIMIZE_INLINING
287 296
288 If unsure, say N. 297 If unsure, say N.
289 298
299config DEBUG_STRICT_USER_COPY_CHECKS
300 bool "Strict copy size checks"
301 depends on DEBUG_KERNEL && !TRACE_BRANCH_PROFILING
302 ---help---
303 Enabling this option turns a certain set of sanity checks for user
304 copy operations into compile time failures.
305
306 The copy_from_user() etc checks are there to help test if there
307 are sufficient security checks on the length argument of
308 the copy operation, by having gcc prove that the argument is
309 within bounds.
310
311 If unsure, or if you run an older (pre 4.4) gcc, say N.
312
290endmenu 313endmenu
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index d2d24c9ee64d..0a43dc515e4c 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -135,9 +135,7 @@ drivers-$(CONFIG_OPROFILE) += arch/x86/oprofile/
135# suspend and hibernation support 135# suspend and hibernation support
136drivers-$(CONFIG_PM) += arch/x86/power/ 136drivers-$(CONFIG_PM) += arch/x86/power/
137 137
138ifeq ($(CONFIG_X86_32),y)
139drivers-$(CONFIG_FB) += arch/x86/video/ 138drivers-$(CONFIG_FB) += arch/x86/video/
140endif
141 139
142#### 140####
143# boot loader support. Several targets are kept for legacy purposes 141# boot loader support. Several targets are kept for legacy purposes
@@ -155,6 +153,9 @@ all: bzImage
155KBUILD_IMAGE := $(boot)/bzImage 153KBUILD_IMAGE := $(boot)/bzImage
156 154
157bzImage: vmlinux 155bzImage: vmlinux
156ifeq ($(CONFIG_X86_DECODER_SELFTEST),y)
157 $(Q)$(MAKE) $(build)=arch/x86/tools posttest
158endif
158 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE) 159 $(Q)$(MAKE) $(build)=$(boot) $(KBUILD_IMAGE)
159 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot 160 $(Q)mkdir -p $(objtree)/arch/$(UTS_MACHINE)/boot
160 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@ 161 $(Q)ln -fsn ../../x86/boot/bzImage $(objtree)/arch/$(UTS_MACHINE)/boot/$@
diff --git a/arch/x86/Makefile_32.cpu b/arch/x86/Makefile_32.cpu
index 30e9a264f69d..1255d953c65d 100644
--- a/arch/x86/Makefile_32.cpu
+++ b/arch/x86/Makefile_32.cpu
@@ -41,11 +41,18 @@ cflags-$(CONFIG_X86_ELAN) += -march=i486
41 41
42# Geode GX1 support 42# Geode GX1 support
43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx 43cflags-$(CONFIG_MGEODEGX1) += -march=pentium-mmx
44 44cflags-$(CONFIG_MGEODE_LX) += $(call cc-option,-march=geode,-march=pentium-mmx)
45# add at the end to overwrite eventual tuning options from earlier 45# add at the end to overwrite eventual tuning options from earlier
46# cpu entries 46# cpu entries
47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686)) 47cflags-$(CONFIG_X86_GENERIC) += $(call tune,generic,$(call tune,i686))
48 48
49# Work around the pentium-mmx code generator madness of gcc4.4.x which
50# does stack alignment by generating horrible code _before_ the mcount
51# prologue (push %ebp, mov %esp, %ebp) which breaks the function graph
52# tracer assumptions. For i686, generic, core2 this is set by the
53# compiler anyway
54cflags-$(CONFIG_FUNCTION_GRAPH_TRACER) += $(call cc-option,-maccumulate-outgoing-args)
55
49# Bug fix for binutils: this option is required in order to keep 56# Bug fix for binutils: this option is required in order to keep
50# binutils from generating NOPL instructions against our will. 57# binutils from generating NOPL instructions against our will.
51ifneq ($(CONFIG_X86_P6_NOP),y) 58ifneq ($(CONFIG_X86_P6_NOP),y)
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index f8ed0658404c..fbb47daf2459 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -4,11 +4,12 @@
4# create a compressed vmlinux image from the original vmlinux 4# create a compressed vmlinux image from the original vmlinux
5# 5#
6 6
7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma head_$(BITS).o misc.o piggy.o 7targets := vmlinux.lds vmlinux vmlinux.bin vmlinux.bin.gz vmlinux.bin.bz2 vmlinux.bin.lzma vmlinux.bin.lzo head_$(BITS).o misc.o piggy.o
8 8
9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2 9KBUILD_CFLAGS := -m$(BITS) -D__KERNEL__ $(LINUX_INCLUDE) -O2
10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC 10KBUILD_CFLAGS += -fno-strict-aliasing -fPIC
11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING 11KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING
12cflags-$(CONFIG_X86_32) := -march=i386
12cflags-$(CONFIG_X86_64) := -mcmodel=small 13cflags-$(CONFIG_X86_64) := -mcmodel=small
13KBUILD_CFLAGS += $(cflags-y) 14KBUILD_CFLAGS += $(cflags-y)
14KBUILD_CFLAGS += $(call cc-option,-ffreestanding) 15KBUILD_CFLAGS += $(call cc-option,-ffreestanding)
@@ -48,10 +49,13 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE
48 $(call if_changed,bzip2) 49 $(call if_changed,bzip2)
49$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE 50$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE
50 $(call if_changed,lzma) 51 $(call if_changed,lzma)
52$(obj)/vmlinux.bin.lzo: $(vmlinux.bin.all-y) FORCE
53 $(call if_changed,lzo)
51 54
52suffix-$(CONFIG_KERNEL_GZIP) := gz 55suffix-$(CONFIG_KERNEL_GZIP) := gz
53suffix-$(CONFIG_KERNEL_BZIP2) := bz2 56suffix-$(CONFIG_KERNEL_BZIP2) := bz2
54suffix-$(CONFIG_KERNEL_LZMA) := lzma 57suffix-$(CONFIG_KERNEL_LZMA) := lzma
58suffix-$(CONFIG_KERNEL_LZO) := lzo
55 59
56quiet_cmd_mkpiggy = MKPIGGY $@ 60quiet_cmd_mkpiggy = MKPIGGY $@
57 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) 61 cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false )
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 077e1b69198e..faff0dc9c06a 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -107,8 +107,7 @@ ENTRY(startup_32)
107 lgdt gdt(%ebp) 107 lgdt gdt(%ebp)
108 108
109 /* Enable PAE mode */ 109 /* Enable PAE mode */
110 xorl %eax, %eax 110 movl $(X86_CR4_PAE), %eax
111 orl $(X86_CR4_PAE), %eax
112 movl %eax, %cr4 111 movl %eax, %cr4
113 112
114 /* 113 /*
diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c
index 842b2a36174a..51e240779a44 100644
--- a/arch/x86/boot/compressed/misc.c
+++ b/arch/x86/boot/compressed/misc.c
@@ -19,11 +19,6 @@
19#define _ASM_X86_DESC_H 1 19#define _ASM_X86_DESC_H 1
20#endif 20#endif
21 21
22#ifdef CONFIG_X86_64
23#define _LINUX_STRING_H_ 1
24#define __LINUX_BITMAP_H 1
25#endif
26
27#include <linux/linkage.h> 22#include <linux/linkage.h>
28#include <linux/screen_info.h> 23#include <linux/screen_info.h>
29#include <linux/elf.h> 24#include <linux/elf.h>
@@ -131,8 +126,8 @@ static void error(char *m);
131static struct boot_params *real_mode; /* Pointer to real-mode data */ 126static struct boot_params *real_mode; /* Pointer to real-mode data */
132static int quiet; 127static int quiet;
133 128
134static void *memset(void *s, int c, unsigned n); 129void *memset(void *s, int c, size_t n);
135void *memcpy(void *dest, const void *src, unsigned n); 130void *memcpy(void *dest, const void *src, size_t n);
136 131
137static void __putstr(int, const char *); 132static void __putstr(int, const char *);
138#define putstr(__x) __putstr(0, __x) 133#define putstr(__x) __putstr(0, __x)
@@ -162,6 +157,10 @@ static int lines, cols;
162#include "../../../../lib/decompress_unlzma.c" 157#include "../../../../lib/decompress_unlzma.c"
163#endif 158#endif
164 159
160#ifdef CONFIG_KERNEL_LZO
161#include "../../../../lib/decompress_unlzo.c"
162#endif
163
165static void scroll(void) 164static void scroll(void)
166{ 165{
167 int i; 166 int i;
@@ -181,11 +180,9 @@ static void __putstr(int error, const char *s)
181 return; 180 return;
182#endif 181#endif
183 182
184#ifdef CONFIG_X86_32
185 if (real_mode->screen_info.orig_video_mode == 0 && 183 if (real_mode->screen_info.orig_video_mode == 0 &&
186 lines == 0 && cols == 0) 184 lines == 0 && cols == 0)
187 return; 185 return;
188#endif
189 186
190 x = real_mode->screen_info.orig_x; 187 x = real_mode->screen_info.orig_x;
191 y = real_mode->screen_info.orig_y; 188 y = real_mode->screen_info.orig_y;
@@ -219,7 +216,7 @@ static void __putstr(int error, const char *s)
219 outb(0xff & (pos >> 1), vidport+1); 216 outb(0xff & (pos >> 1), vidport+1);
220} 217}
221 218
222static void *memset(void *s, int c, unsigned n) 219void *memset(void *s, int c, size_t n)
223{ 220{
224 int i; 221 int i;
225 char *ss = s; 222 char *ss = s;
@@ -229,7 +226,7 @@ static void *memset(void *s, int c, unsigned n)
229 return s; 226 return s;
230} 227}
231 228
232void *memcpy(void *dest, const void *src, unsigned n) 229void *memcpy(void *dest, const void *src, size_t n)
233{ 230{
234 int i; 231 int i;
235 const char *s = src; 232 const char *s = src;
diff --git a/arch/x86/boot/compressed/relocs.c b/arch/x86/boot/compressed/relocs.c
index bbeb0c3fbd90..89bbf4e4d05d 100644
--- a/arch/x86/boot/compressed/relocs.c
+++ b/arch/x86/boot/compressed/relocs.c
@@ -9,6 +9,9 @@
9#include <byteswap.h> 9#include <byteswap.h>
10#define USE_BSD 10#define USE_BSD
11#include <endian.h> 11#include <endian.h>
12#include <regex.h>
13
14static void die(char *fmt, ...);
12 15
13#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) 16#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
14static Elf32_Ehdr ehdr; 17static Elf32_Ehdr ehdr;
@@ -30,25 +33,47 @@ static struct section *secs;
30 * the address for which it has been compiled. Don't warn user about 33 * the address for which it has been compiled. Don't warn user about
31 * absolute relocations present w.r.t these symbols. 34 * absolute relocations present w.r.t these symbols.
32 */ 35 */
33static const char* safe_abs_relocs[] = { 36static const char abs_sym_regex[] =
34 "xen_irq_disable_direct_reloc", 37 "^(xen_irq_disable_direct_reloc$|"
35 "xen_save_fl_direct_reloc", 38 "xen_save_fl_direct_reloc$|"
36}; 39 "VDSO|"
40 "__crc_)";
41static regex_t abs_sym_regex_c;
42static int is_abs_reloc(const char *sym_name)
43{
44 return !regexec(&abs_sym_regex_c, sym_name, 0, NULL, 0);
45}
37 46
38static int is_safe_abs_reloc(const char* sym_name) 47/*
48 * These symbols are known to be relative, even if the linker marks them
49 * as absolute (typically defined outside any section in the linker script.)
50 */
51static const char rel_sym_regex[] =
52 "^_end$";
53static regex_t rel_sym_regex_c;
54static int is_rel_reloc(const char *sym_name)
39{ 55{
40 int i; 56 return !regexec(&rel_sym_regex_c, sym_name, 0, NULL, 0);
57}
41 58
42 for (i = 0; i < ARRAY_SIZE(safe_abs_relocs); i++) { 59static void regex_init(void)
43 if (!strcmp(sym_name, safe_abs_relocs[i])) 60{
44 /* Match found */ 61 char errbuf[128];
45 return 1; 62 int err;
46 } 63
47 if (strncmp(sym_name, "VDSO", 4) == 0) 64 err = regcomp(&abs_sym_regex_c, abs_sym_regex,
48 return 1; 65 REG_EXTENDED|REG_NOSUB);
49 if (strncmp(sym_name, "__crc_", 6) == 0) 66 if (err) {
50 return 1; 67 regerror(err, &abs_sym_regex_c, errbuf, sizeof errbuf);
51 return 0; 68 die("%s", errbuf);
69 }
70
71 err = regcomp(&rel_sym_regex_c, rel_sym_regex,
72 REG_EXTENDED|REG_NOSUB);
73 if (err) {
74 regerror(err, &rel_sym_regex_c, errbuf, sizeof errbuf);
75 die("%s", errbuf);
76 }
52} 77}
53 78
54static void die(char *fmt, ...) 79static void die(char *fmt, ...)
@@ -131,7 +156,7 @@ static const char *rel_type(unsigned type)
131#undef REL_TYPE 156#undef REL_TYPE
132 }; 157 };
133 const char *name = "unknown type rel type name"; 158 const char *name = "unknown type rel type name";
134 if (type < ARRAY_SIZE(type_name)) { 159 if (type < ARRAY_SIZE(type_name) && type_name[type]) {
135 name = type_name[type]; 160 name = type_name[type];
136 } 161 }
137 return name; 162 return name;
@@ -448,7 +473,7 @@ static void print_absolute_relocs(void)
448 * Before warning check if this absolute symbol 473 * Before warning check if this absolute symbol
449 * relocation is harmless. 474 * relocation is harmless.
450 */ 475 */
451 if (is_safe_abs_reloc(name)) 476 if (is_abs_reloc(name) || is_rel_reloc(name))
452 continue; 477 continue;
453 478
454 if (!printed) { 479 if (!printed) {
@@ -501,21 +526,26 @@ static void walk_relocs(void (*visit)(Elf32_Rel *rel, Elf32_Sym *sym))
501 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)]; 526 sym = &sh_symtab[ELF32_R_SYM(rel->r_info)];
502 r_type = ELF32_R_TYPE(rel->r_info); 527 r_type = ELF32_R_TYPE(rel->r_info);
503 /* Don't visit relocations to absolute symbols */ 528 /* Don't visit relocations to absolute symbols */
504 if (sym->st_shndx == SHN_ABS) { 529 if (sym->st_shndx == SHN_ABS &&
530 !is_rel_reloc(sym_name(sym_strtab, sym))) {
505 continue; 531 continue;
506 } 532 }
507 if (r_type == R_386_NONE || r_type == R_386_PC32) { 533 switch (r_type) {
534 case R_386_NONE:
535 case R_386_PC32:
508 /* 536 /*
509 * NONE can be ignored and and PC relative 537 * NONE can be ignored and and PC relative
510 * relocations don't need to be adjusted. 538 * relocations don't need to be adjusted.
511 */ 539 */
512 } 540 break;
513 else if (r_type == R_386_32) { 541 case R_386_32:
514 /* Visit relocations that need to be adjusted */ 542 /* Visit relocations that need to be adjusted */
515 visit(rel, sym); 543 visit(rel, sym);
516 } 544 break;
517 else { 545 default:
518 die("Unsupported relocation type: %d\n", r_type); 546 die("Unsupported relocation type: %s (%d)\n",
547 rel_type(r_type), r_type);
548 break;
519 } 549 }
520 } 550 }
521 } 551 }
@@ -571,16 +601,15 @@ static void emit_relocs(int as_text)
571 } 601 }
572 else { 602 else {
573 unsigned char buf[4]; 603 unsigned char buf[4];
574 buf[0] = buf[1] = buf[2] = buf[3] = 0;
575 /* Print a stop */ 604 /* Print a stop */
576 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 605 fwrite("\0\0\0\0", 4, 1, stdout);
577 /* Now print each relocation */ 606 /* Now print each relocation */
578 for (i = 0; i < reloc_count; i++) { 607 for (i = 0; i < reloc_count; i++) {
579 buf[0] = (relocs[i] >> 0) & 0xff; 608 buf[0] = (relocs[i] >> 0) & 0xff;
580 buf[1] = (relocs[i] >> 8) & 0xff; 609 buf[1] = (relocs[i] >> 8) & 0xff;
581 buf[2] = (relocs[i] >> 16) & 0xff; 610 buf[2] = (relocs[i] >> 16) & 0xff;
582 buf[3] = (relocs[i] >> 24) & 0xff; 611 buf[3] = (relocs[i] >> 24) & 0xff;
583 printf("%c%c%c%c", buf[0], buf[1], buf[2], buf[3]); 612 fwrite(buf, 4, 1, stdout);
584 } 613 }
585 } 614 }
586} 615}
@@ -598,6 +627,8 @@ int main(int argc, char **argv)
598 FILE *fp; 627 FILE *fp;
599 int i; 628 int i;
600 629
630 regex_init();
631
601 show_absolute_syms = 0; 632 show_absolute_syms = 0;
602 show_absolute_relocs = 0; 633 show_absolute_relocs = 0;
603 as_text = 0; 634 as_text = 0;
diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S
index f4193bb48782..a6f1a59a5b0c 100644
--- a/arch/x86/boot/compressed/vmlinux.lds.S
+++ b/arch/x86/boot/compressed/vmlinux.lds.S
@@ -4,6 +4,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT)
4 4
5#undef i386 5#undef i386
6 6
7#include <asm/cache.h>
7#include <asm/page_types.h> 8#include <asm/page_types.h>
8 9
9#ifdef CONFIG_X86_64 10#ifdef CONFIG_X86_64
@@ -46,7 +47,7 @@ SECTIONS
46 *(.data.*) 47 *(.data.*)
47 _edata = . ; 48 _edata = . ;
48 } 49 }
49 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 50 . = ALIGN(L1_CACHE_BYTES);
50 .bss : { 51 .bss : {
51 _bss = . ; 52 _bss = . ;
52 *(.bss) 53 *(.bss)
diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S
index b31cc54b4641..93e689f4bd86 100644
--- a/arch/x86/boot/header.S
+++ b/arch/x86/boot/header.S
@@ -16,7 +16,7 @@
16 */ 16 */
17 17
18#include <asm/segment.h> 18#include <asm/segment.h>
19#include <linux/utsrelease.h> 19#include <generated/utsrelease.h>
20#include <asm/boot.h> 20#include <asm/boot.h>
21#include <asm/e820.h> 21#include <asm/e820.h>
22#include <asm/page_types.h> 22#include <asm/page_types.h>
diff --git a/arch/x86/boot/mkcpustr.c b/arch/x86/boot/mkcpustr.c
index 8ef60f20b371..919257f526f2 100644
--- a/arch/x86/boot/mkcpustr.c
+++ b/arch/x86/boot/mkcpustr.c
@@ -22,7 +22,7 @@ int main(void)
22 int i, j; 22 int i, j;
23 const char *str; 23 const char *str;
24 24
25 printf("static const char x86_cap_strs[] = \n"); 25 printf("static const char x86_cap_strs[] =\n");
26 26
27 for (i = 0; i < NCAPINTS; i++) { 27 for (i = 0; i < NCAPINTS; i++) {
28 for (j = 0; j < 32; j++) { 28 for (j = 0; j < 32; j++) {
diff --git a/arch/x86/boot/version.c b/arch/x86/boot/version.c
index 2723d9b5ce43..2b15aa488ffb 100644
--- a/arch/x86/boot/version.c
+++ b/arch/x86/boot/version.c
@@ -13,8 +13,8 @@
13 */ 13 */
14 14
15#include "boot.h" 15#include "boot.h"
16#include <linux/utsrelease.h> 16#include <generated/utsrelease.h>
17#include <linux/compile.h> 17#include <generated/compile.h>
18 18
19const char kernel_version[] = 19const char kernel_version[] =
20 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") " 20 UTS_RELEASE " (" LINUX_COMPILE_BY "@" LINUX_COMPILE_HOST ") "
diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c
index 819caa1f2008..ed7aeff786b2 100644
--- a/arch/x86/boot/video-vga.c
+++ b/arch/x86/boot/video-vga.c
@@ -42,22 +42,15 @@ static u8 vga_set_basic_mode(void)
42{ 42{
43 struct biosregs ireg, oreg; 43 struct biosregs ireg, oreg;
44 u16 ax; 44 u16 ax;
45 u8 rows;
46 u8 mode; 45 u8 mode;
47 46
48 initregs(&ireg); 47 initregs(&ireg);
49 48
49 /* Query current mode */
50 ax = 0x0f00; 50 ax = 0x0f00;
51 intcall(0x10, &ireg, &oreg); 51 intcall(0x10, &ireg, &oreg);
52 mode = oreg.al; 52 mode = oreg.al;
53 53
54 set_fs(0);
55 rows = rdfs8(0x484); /* rows minus one */
56
57 if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) &&
58 (rows == 0 || rows == 24))
59 return mode;
60
61 if (mode != 3 && mode != 7) 54 if (mode != 3 && mode != 7)
62 mode = 3; 55 mode = 3;
63 56
diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c
index d42da3802499..43eda284d27f 100644
--- a/arch/x86/boot/video.c
+++ b/arch/x86/boot/video.c
@@ -27,6 +27,12 @@ static void store_cursor_position(void)
27 27
28 boot_params.screen_info.orig_x = oreg.dl; 28 boot_params.screen_info.orig_x = oreg.dl;
29 boot_params.screen_info.orig_y = oreg.dh; 29 boot_params.screen_info.orig_y = oreg.dh;
30
31 if (oreg.ch & 0x20)
32 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
33
34 if ((oreg.ch & 0x1f) > (oreg.cl & 0x1f))
35 boot_params.screen_info.flags |= VIDEO_FLAGS_NOCURSOR;
30} 36}
31 37
32static void store_video_mode(void) 38static void store_video_mode(void)
@@ -292,11 +298,18 @@ static void restore_screen(void)
292 } 298 }
293 299
294 /* Restore cursor position */ 300 /* Restore cursor position */
301 if (saved.curx >= xs)
302 saved.curx = xs-1;
303 if (saved.cury >= ys)
304 saved.cury = ys-1;
305
295 initregs(&ireg); 306 initregs(&ireg);
296 ireg.ah = 0x02; /* Set cursor position */ 307 ireg.ah = 0x02; /* Set cursor position */
297 ireg.dh = saved.cury; 308 ireg.dh = saved.cury;
298 ireg.dl = saved.curx; 309 ireg.dl = saved.curx;
299 intcall(0x10, &ireg, NULL); 310 intcall(0x10, &ireg, NULL);
311
312 store_cursor_position();
300} 313}
301 314
302void set_video(void) 315void set_video(void)
diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile
index cfb0010fa940..1a58ad89fdf7 100644
--- a/arch/x86/crypto/Makefile
+++ b/arch/x86/crypto/Makefile
@@ -12,6 +12,7 @@ obj-$(CONFIG_CRYPTO_AES_X86_64) += aes-x86_64.o
12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o 12obj-$(CONFIG_CRYPTO_TWOFISH_X86_64) += twofish-x86_64.o
13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o 13obj-$(CONFIG_CRYPTO_SALSA20_X86_64) += salsa20-x86_64.o
14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o 14obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o
15obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o
15 16
16obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o 17obj-$(CONFIG_CRYPTO_CRC32C_INTEL) += crc32c-intel.o
17 18
@@ -24,3 +25,5 @@ twofish-x86_64-y := twofish-x86_64-asm_64.o twofish_glue.o
24salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o 25salsa20-x86_64-y := salsa20-x86_64-asm_64.o salsa20_glue.o
25 26
26aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o 27aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o
28
29ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o
diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index eb0566e83319..20bb0e1ac681 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -16,6 +16,7 @@
16 */ 16 */
17 17
18#include <linux/linkage.h> 18#include <linux/linkage.h>
19#include <asm/inst.h>
19 20
20.text 21.text
21 22
@@ -122,103 +123,72 @@ ENTRY(aesni_set_key)
122 movups 0x10(%rsi), %xmm2 # other user key 123 movups 0x10(%rsi), %xmm2 # other user key
123 movaps %xmm2, (%rcx) 124 movaps %xmm2, (%rcx)
124 add $0x10, %rcx 125 add $0x10, %rcx
125 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 126 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
126 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
127 call _key_expansion_256a 127 call _key_expansion_256a
128 # aeskeygenassist $0x1, %xmm0, %xmm1 128 AESKEYGENASSIST 0x1 %xmm0 %xmm1
129 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
130 call _key_expansion_256b 129 call _key_expansion_256b
131 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 130 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
132 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
133 call _key_expansion_256a 131 call _key_expansion_256a
134 # aeskeygenassist $0x2, %xmm0, %xmm1 132 AESKEYGENASSIST 0x2 %xmm0 %xmm1
135 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
136 call _key_expansion_256b 133 call _key_expansion_256b
137 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 134 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
138 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
139 call _key_expansion_256a 135 call _key_expansion_256a
140 # aeskeygenassist $0x4, %xmm0, %xmm1 136 AESKEYGENASSIST 0x4 %xmm0 %xmm1
141 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
142 call _key_expansion_256b 137 call _key_expansion_256b
143 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 138 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
144 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
145 call _key_expansion_256a 139 call _key_expansion_256a
146 # aeskeygenassist $0x8, %xmm0, %xmm1 140 AESKEYGENASSIST 0x8 %xmm0 %xmm1
147 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
148 call _key_expansion_256b 141 call _key_expansion_256b
149 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 142 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
150 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
151 call _key_expansion_256a 143 call _key_expansion_256a
152 # aeskeygenassist $0x10, %xmm0, %xmm1 144 AESKEYGENASSIST 0x10 %xmm0 %xmm1
153 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
154 call _key_expansion_256b 145 call _key_expansion_256b
155 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 146 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
156 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
157 call _key_expansion_256a 147 call _key_expansion_256a
158 # aeskeygenassist $0x20, %xmm0, %xmm1 148 AESKEYGENASSIST 0x20 %xmm0 %xmm1
159 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
160 call _key_expansion_256b 149 call _key_expansion_256b
161 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 150 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
162 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
163 call _key_expansion_256a 151 call _key_expansion_256a
164 jmp .Ldec_key 152 jmp .Ldec_key
165.Lenc_key192: 153.Lenc_key192:
166 movq 0x10(%rsi), %xmm2 # other user key 154 movq 0x10(%rsi), %xmm2 # other user key
167 # aeskeygenassist $0x1, %xmm2, %xmm1 # round 1 155 AESKEYGENASSIST 0x1 %xmm2 %xmm1 # round 1
168 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x01
169 call _key_expansion_192a 156 call _key_expansion_192a
170 # aeskeygenassist $0x2, %xmm2, %xmm1 # round 2 157 AESKEYGENASSIST 0x2 %xmm2 %xmm1 # round 2
171 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x02
172 call _key_expansion_192b 158 call _key_expansion_192b
173 # aeskeygenassist $0x4, %xmm2, %xmm1 # round 3 159 AESKEYGENASSIST 0x4 %xmm2 %xmm1 # round 3
174 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x04
175 call _key_expansion_192a 160 call _key_expansion_192a
176 # aeskeygenassist $0x8, %xmm2, %xmm1 # round 4 161 AESKEYGENASSIST 0x8 %xmm2 %xmm1 # round 4
177 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x08
178 call _key_expansion_192b 162 call _key_expansion_192b
179 # aeskeygenassist $0x10, %xmm2, %xmm1 # round 5 163 AESKEYGENASSIST 0x10 %xmm2 %xmm1 # round 5
180 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x10
181 call _key_expansion_192a 164 call _key_expansion_192a
182 # aeskeygenassist $0x20, %xmm2, %xmm1 # round 6 165 AESKEYGENASSIST 0x20 %xmm2 %xmm1 # round 6
183 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x20
184 call _key_expansion_192b 166 call _key_expansion_192b
185 # aeskeygenassist $0x40, %xmm2, %xmm1 # round 7 167 AESKEYGENASSIST 0x40 %xmm2 %xmm1 # round 7
186 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x40
187 call _key_expansion_192a 168 call _key_expansion_192a
188 # aeskeygenassist $0x80, %xmm2, %xmm1 # round 8 169 AESKEYGENASSIST 0x80 %xmm2 %xmm1 # round 8
189 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xca, 0x80
190 call _key_expansion_192b 170 call _key_expansion_192b
191 jmp .Ldec_key 171 jmp .Ldec_key
192.Lenc_key128: 172.Lenc_key128:
193 # aeskeygenassist $0x1, %xmm0, %xmm1 # round 1 173 AESKEYGENASSIST 0x1 %xmm0 %xmm1 # round 1
194 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x01
195 call _key_expansion_128 174 call _key_expansion_128
196 # aeskeygenassist $0x2, %xmm0, %xmm1 # round 2 175 AESKEYGENASSIST 0x2 %xmm0 %xmm1 # round 2
197 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x02
198 call _key_expansion_128 176 call _key_expansion_128
199 # aeskeygenassist $0x4, %xmm0, %xmm1 # round 3 177 AESKEYGENASSIST 0x4 %xmm0 %xmm1 # round 3
200 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x04
201 call _key_expansion_128 178 call _key_expansion_128
202 # aeskeygenassist $0x8, %xmm0, %xmm1 # round 4 179 AESKEYGENASSIST 0x8 %xmm0 %xmm1 # round 4
203 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x08
204 call _key_expansion_128 180 call _key_expansion_128
205 # aeskeygenassist $0x10, %xmm0, %xmm1 # round 5 181 AESKEYGENASSIST 0x10 %xmm0 %xmm1 # round 5
206 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x10
207 call _key_expansion_128 182 call _key_expansion_128
208 # aeskeygenassist $0x20, %xmm0, %xmm1 # round 6 183 AESKEYGENASSIST 0x20 %xmm0 %xmm1 # round 6
209 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x20
210 call _key_expansion_128 184 call _key_expansion_128
211 # aeskeygenassist $0x40, %xmm0, %xmm1 # round 7 185 AESKEYGENASSIST 0x40 %xmm0 %xmm1 # round 7
212 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x40
213 call _key_expansion_128 186 call _key_expansion_128
214 # aeskeygenassist $0x80, %xmm0, %xmm1 # round 8 187 AESKEYGENASSIST 0x80 %xmm0 %xmm1 # round 8
215 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x80
216 call _key_expansion_128 188 call _key_expansion_128
217 # aeskeygenassist $0x1b, %xmm0, %xmm1 # round 9 189 AESKEYGENASSIST 0x1b %xmm0 %xmm1 # round 9
218 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x1b
219 call _key_expansion_128 190 call _key_expansion_128
220 # aeskeygenassist $0x36, %xmm0, %xmm1 # round 10 191 AESKEYGENASSIST 0x36 %xmm0 %xmm1 # round 10
221 .byte 0x66, 0x0f, 0x3a, 0xdf, 0xc8, 0x36
222 call _key_expansion_128 192 call _key_expansion_128
223.Ldec_key: 193.Ldec_key:
224 sub $0x10, %rcx 194 sub $0x10, %rcx
@@ -231,8 +201,7 @@ ENTRY(aesni_set_key)
231.align 4 201.align 4
232.Ldec_key_loop: 202.Ldec_key_loop:
233 movaps (%rdi), %xmm0 203 movaps (%rdi), %xmm0
234 # aesimc %xmm0, %xmm1 204 AESIMC %xmm0 %xmm1
235 .byte 0x66, 0x0f, 0x38, 0xdb, 0xc8
236 movaps %xmm1, (%rsi) 205 movaps %xmm1, (%rsi)
237 add $0x10, %rdi 206 add $0x10, %rdi
238 sub $0x10, %rsi 207 sub $0x10, %rsi
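
The hunks in this file replace hand-emitted .byte opcode sequences with mnemonic macros (AESKEYGENASSIST, AESENC, AESENCLAST, AESDEC, AESDECLAST, AESIMC), presumably supplied by the same <asm/inst.h> helper header that the new GHASH assembly further down includes; that keeps the code assembling with binutils versions that do not yet know the AES-NI mnemonics. A reduced sketch of how such a byte-emitting macro can be written in gas (hypothetical and trimmed to a register-to-register AESENC for three registers; the real inst.h is more general):

	/* Sketch only -- not the actual <asm/inst.h>.  Map an %xmmN operand
	 * to its register number, then emit the AESENC opcode (66 0f 38 dc)
	 * followed by a mod=11 ModRM byte (reg = destination, rm = source). */
	.macro XMM_NUM opd xmm
	.ifc \xmm,%xmm0
	\opd = 0
	.endif
	.ifc \xmm,%xmm1
	\opd = 1
	.endif
	.ifc \xmm,%xmm2
	\opd = 2
	.endif
	.endm

	.macro MODRM mod opd1 opd2
	.byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
	.endm

	/* usage: AESENC src dst, i.e. aesenc src, dst in AT&T syntax */
	.macro AESENC xmm1 xmm2
	XMM_NUM aesenc_opd1 \xmm1
	XMM_NUM aesenc_opd2 \xmm2
	.byte 0x66, 0x0f, 0x38, 0xdc
	MODRM 0xc0 aesenc_opd1 aesenc_opd2
	.endm

With KEY = %xmm2 and STATE = %xmm0, "AESENC KEY STATE" emits 66 0f 38 dc c2, the same bytes the old code hard-coded, so the generated object code is unchanged.
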
@@ -274,51 +243,37 @@ _aesni_enc1:
274 je .Lenc192 243 je .Lenc192
275 add $0x20, TKEYP 244 add $0x20, TKEYP
276 movaps -0x60(TKEYP), KEY 245 movaps -0x60(TKEYP), KEY
277 # aesenc KEY, STATE 246 AESENC KEY STATE
278 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
279 movaps -0x50(TKEYP), KEY 247 movaps -0x50(TKEYP), KEY
280 # aesenc KEY, STATE 248 AESENC KEY STATE
281 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
282.align 4 249.align 4
283.Lenc192: 250.Lenc192:
284 movaps -0x40(TKEYP), KEY 251 movaps -0x40(TKEYP), KEY
285 # aesenc KEY, STATE 252 AESENC KEY STATE
286 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
287 movaps -0x30(TKEYP), KEY 253 movaps -0x30(TKEYP), KEY
288 # aesenc KEY, STATE 254 AESENC KEY STATE
289 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
290.align 4 255.align 4
291.Lenc128: 256.Lenc128:
292 movaps -0x20(TKEYP), KEY 257 movaps -0x20(TKEYP), KEY
293 # aesenc KEY, STATE 258 AESENC KEY STATE
294 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
295 movaps -0x10(TKEYP), KEY 259 movaps -0x10(TKEYP), KEY
296 # aesenc KEY, STATE 260 AESENC KEY STATE
297 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
298 movaps (TKEYP), KEY 261 movaps (TKEYP), KEY
299 # aesenc KEY, STATE 262 AESENC KEY STATE
300 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
301 movaps 0x10(TKEYP), KEY 263 movaps 0x10(TKEYP), KEY
302 # aesenc KEY, STATE 264 AESENC KEY STATE
303 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
304 movaps 0x20(TKEYP), KEY 265 movaps 0x20(TKEYP), KEY
305 # aesenc KEY, STATE 266 AESENC KEY STATE
306 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
307 movaps 0x30(TKEYP), KEY 267 movaps 0x30(TKEYP), KEY
308 # aesenc KEY, STATE 268 AESENC KEY STATE
309 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
310 movaps 0x40(TKEYP), KEY 269 movaps 0x40(TKEYP), KEY
311 # aesenc KEY, STATE 270 AESENC KEY STATE
312 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
313 movaps 0x50(TKEYP), KEY 271 movaps 0x50(TKEYP), KEY
314 # aesenc KEY, STATE 272 AESENC KEY STATE
315 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
316 movaps 0x60(TKEYP), KEY 273 movaps 0x60(TKEYP), KEY
317 # aesenc KEY, STATE 274 AESENC KEY STATE
318 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2
319 movaps 0x70(TKEYP), KEY 275 movaps 0x70(TKEYP), KEY
320 # aesenclast KEY, STATE # last round 276 AESENCLAST KEY STATE
321 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2
322 ret 277 ret
323 278
324/* 279/*
@@ -353,135 +308,79 @@ _aesni_enc4:
353 je .L4enc192 308 je .L4enc192
354 add $0x20, TKEYP 309 add $0x20, TKEYP
355 movaps -0x60(TKEYP), KEY 310 movaps -0x60(TKEYP), KEY
356 # aesenc KEY, STATE1 311 AESENC KEY STATE1
357 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 312 AESENC KEY STATE2
358 # aesenc KEY, STATE2 313 AESENC KEY STATE3
359 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 314 AESENC KEY STATE4
360 # aesenc KEY, STATE3
361 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
362 # aesenc KEY, STATE4
363 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
364 movaps -0x50(TKEYP), KEY 315 movaps -0x50(TKEYP), KEY
365 # aesenc KEY, STATE1 316 AESENC KEY STATE1
366 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 317 AESENC KEY STATE2
367 # aesenc KEY, STATE2 318 AESENC KEY STATE3
368 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 319 AESENC KEY STATE4
369 # aesenc KEY, STATE3
370 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
371 # aesenc KEY, STATE4
372 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
373#.align 4 320#.align 4
374.L4enc192: 321.L4enc192:
375 movaps -0x40(TKEYP), KEY 322 movaps -0x40(TKEYP), KEY
376 # aesenc KEY, STATE1 323 AESENC KEY STATE1
377 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 324 AESENC KEY STATE2
378 # aesenc KEY, STATE2 325 AESENC KEY STATE3
379 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 326 AESENC KEY STATE4
380 # aesenc KEY, STATE3
381 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
382 # aesenc KEY, STATE4
383 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
384 movaps -0x30(TKEYP), KEY 327 movaps -0x30(TKEYP), KEY
385 # aesenc KEY, STATE1 328 AESENC KEY STATE1
386 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 329 AESENC KEY STATE2
387 # aesenc KEY, STATE2 330 AESENC KEY STATE3
388 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 331 AESENC KEY STATE4
389 # aesenc KEY, STATE3
390 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
391 # aesenc KEY, STATE4
392 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
393#.align 4 332#.align 4
394.L4enc128: 333.L4enc128:
395 movaps -0x20(TKEYP), KEY 334 movaps -0x20(TKEYP), KEY
396 # aesenc KEY, STATE1 335 AESENC KEY STATE1
397 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 336 AESENC KEY STATE2
398 # aesenc KEY, STATE2 337 AESENC KEY STATE3
399 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 338 AESENC KEY STATE4
400 # aesenc KEY, STATE3
401 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
402 # aesenc KEY, STATE4
403 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
404 movaps -0x10(TKEYP), KEY 339 movaps -0x10(TKEYP), KEY
405 # aesenc KEY, STATE1 340 AESENC KEY STATE1
406 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 341 AESENC KEY STATE2
407 # aesenc KEY, STATE2 342 AESENC KEY STATE3
408 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 343 AESENC KEY STATE4
409 # aesenc KEY, STATE3
410 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
411 # aesenc KEY, STATE4
412 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
413 movaps (TKEYP), KEY 344 movaps (TKEYP), KEY
414 # aesenc KEY, STATE1 345 AESENC KEY STATE1
415 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 346 AESENC KEY STATE2
416 # aesenc KEY, STATE2 347 AESENC KEY STATE3
417 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 348 AESENC KEY STATE4
418 # aesenc KEY, STATE3
419 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
420 # aesenc KEY, STATE4
421 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
422 movaps 0x10(TKEYP), KEY 349 movaps 0x10(TKEYP), KEY
423 # aesenc KEY, STATE1 350 AESENC KEY STATE1
424 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 351 AESENC KEY STATE2
425 # aesenc KEY, STATE2 352 AESENC KEY STATE3
426 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 353 AESENC KEY STATE4
427 # aesenc KEY, STATE3
428 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
429 # aesenc KEY, STATE4
430 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
431 movaps 0x20(TKEYP), KEY 354 movaps 0x20(TKEYP), KEY
432 # aesenc KEY, STATE1 355 AESENC KEY STATE1
433 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 356 AESENC KEY STATE2
434 # aesenc KEY, STATE2 357 AESENC KEY STATE3
435 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 358 AESENC KEY STATE4
436 # aesenc KEY, STATE3
437 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
438 # aesenc KEY, STATE4
439 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
440 movaps 0x30(TKEYP), KEY 359 movaps 0x30(TKEYP), KEY
441 # aesenc KEY, STATE1 360 AESENC KEY STATE1
442 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 361 AESENC KEY STATE2
443 # aesenc KEY, STATE2 362 AESENC KEY STATE3
444 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 363 AESENC KEY STATE4
445 # aesenc KEY, STATE3
446 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
447 # aesenc KEY, STATE4
448 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
449 movaps 0x40(TKEYP), KEY 364 movaps 0x40(TKEYP), KEY
450 # aesenc KEY, STATE1 365 AESENC KEY STATE1
451 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 366 AESENC KEY STATE2
452 # aesenc KEY, STATE2 367 AESENC KEY STATE3
453 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 368 AESENC KEY STATE4
454 # aesenc KEY, STATE3
455 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
456 # aesenc KEY, STATE4
457 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
458 movaps 0x50(TKEYP), KEY 369 movaps 0x50(TKEYP), KEY
459 # aesenc KEY, STATE1 370 AESENC KEY STATE1
460 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 371 AESENC KEY STATE2
461 # aesenc KEY, STATE2 372 AESENC KEY STATE3
462 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 373 AESENC KEY STATE4
463 # aesenc KEY, STATE3
464 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
465 # aesenc KEY, STATE4
466 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
467 movaps 0x60(TKEYP), KEY 374 movaps 0x60(TKEYP), KEY
468 # aesenc KEY, STATE1 375 AESENC KEY STATE1
469 .byte 0x66, 0x0f, 0x38, 0xdc, 0xc2 376 AESENC KEY STATE2
470 # aesenc KEY, STATE2 377 AESENC KEY STATE3
471 .byte 0x66, 0x0f, 0x38, 0xdc, 0xe2 378 AESENC KEY STATE4
472 # aesenc KEY, STATE3
473 .byte 0x66, 0x0f, 0x38, 0xdc, 0xea
474 # aesenc KEY, STATE4
475 .byte 0x66, 0x0f, 0x38, 0xdc, 0xf2
476 movaps 0x70(TKEYP), KEY 379 movaps 0x70(TKEYP), KEY
477 # aesenclast KEY, STATE1 # last round 380 AESENCLAST KEY STATE1 # last round
478 .byte 0x66, 0x0f, 0x38, 0xdd, 0xc2 381 AESENCLAST KEY STATE2
479 # aesenclast KEY, STATE2 382 AESENCLAST KEY STATE3
480 .byte 0x66, 0x0f, 0x38, 0xdd, 0xe2 383 AESENCLAST KEY STATE4
481 # aesenclast KEY, STATE3
482 .byte 0x66, 0x0f, 0x38, 0xdd, 0xea
483 # aesenclast KEY, STATE4
484 .byte 0x66, 0x0f, 0x38, 0xdd, 0xf2
485 ret 384 ret
486 385
487/* 386/*
@@ -518,51 +417,37 @@ _aesni_dec1:
518 je .Ldec192 417 je .Ldec192
519 add $0x20, TKEYP 418 add $0x20, TKEYP
520 movaps -0x60(TKEYP), KEY 419 movaps -0x60(TKEYP), KEY
521 # aesdec KEY, STATE 420 AESDEC KEY STATE
522 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
523 movaps -0x50(TKEYP), KEY 421 movaps -0x50(TKEYP), KEY
524 # aesdec KEY, STATE 422 AESDEC KEY STATE
525 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
526.align 4 423.align 4
527.Ldec192: 424.Ldec192:
528 movaps -0x40(TKEYP), KEY 425 movaps -0x40(TKEYP), KEY
529 # aesdec KEY, STATE 426 AESDEC KEY STATE
530 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
531 movaps -0x30(TKEYP), KEY 427 movaps -0x30(TKEYP), KEY
532 # aesdec KEY, STATE 428 AESDEC KEY STATE
533 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
534.align 4 429.align 4
535.Ldec128: 430.Ldec128:
536 movaps -0x20(TKEYP), KEY 431 movaps -0x20(TKEYP), KEY
537 # aesdec KEY, STATE 432 AESDEC KEY STATE
538 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
539 movaps -0x10(TKEYP), KEY 433 movaps -0x10(TKEYP), KEY
540 # aesdec KEY, STATE 434 AESDEC KEY STATE
541 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
542 movaps (TKEYP), KEY 435 movaps (TKEYP), KEY
543 # aesdec KEY, STATE 436 AESDEC KEY STATE
544 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
545 movaps 0x10(TKEYP), KEY 437 movaps 0x10(TKEYP), KEY
546 # aesdec KEY, STATE 438 AESDEC KEY STATE
547 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
548 movaps 0x20(TKEYP), KEY 439 movaps 0x20(TKEYP), KEY
549 # aesdec KEY, STATE 440 AESDEC KEY STATE
550 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
551 movaps 0x30(TKEYP), KEY 441 movaps 0x30(TKEYP), KEY
552 # aesdec KEY, STATE 442 AESDEC KEY STATE
553 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
554 movaps 0x40(TKEYP), KEY 443 movaps 0x40(TKEYP), KEY
555 # aesdec KEY, STATE 444 AESDEC KEY STATE
556 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
557 movaps 0x50(TKEYP), KEY 445 movaps 0x50(TKEYP), KEY
558 # aesdec KEY, STATE 446 AESDEC KEY STATE
559 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
560 movaps 0x60(TKEYP), KEY 447 movaps 0x60(TKEYP), KEY
561 # aesdec KEY, STATE 448 AESDEC KEY STATE
562 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2
563 movaps 0x70(TKEYP), KEY 449 movaps 0x70(TKEYP), KEY
564 # aesdeclast KEY, STATE # last round 450 AESDECLAST KEY STATE
565 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2
566 ret 451 ret
567 452
568/* 453/*
@@ -597,135 +482,79 @@ _aesni_dec4:
597 je .L4dec192 482 je .L4dec192
598 add $0x20, TKEYP 483 add $0x20, TKEYP
599 movaps -0x60(TKEYP), KEY 484 movaps -0x60(TKEYP), KEY
600 # aesdec KEY, STATE1 485 AESDEC KEY STATE1
601 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 486 AESDEC KEY STATE2
602 # aesdec KEY, STATE2 487 AESDEC KEY STATE3
603 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 488 AESDEC KEY STATE4
604 # aesdec KEY, STATE3
605 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
606 # aesdec KEY, STATE4
607 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
608 movaps -0x50(TKEYP), KEY 489 movaps -0x50(TKEYP), KEY
609 # aesdec KEY, STATE1 490 AESDEC KEY STATE1
610 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 491 AESDEC KEY STATE2
611 # aesdec KEY, STATE2 492 AESDEC KEY STATE3
612 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 493 AESDEC KEY STATE4
613 # aesdec KEY, STATE3
614 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
615 # aesdec KEY, STATE4
616 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
617.align 4 494.align 4
618.L4dec192: 495.L4dec192:
619 movaps -0x40(TKEYP), KEY 496 movaps -0x40(TKEYP), KEY
620 # aesdec KEY, STATE1 497 AESDEC KEY STATE1
621 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 498 AESDEC KEY STATE2
622 # aesdec KEY, STATE2 499 AESDEC KEY STATE3
623 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 500 AESDEC KEY STATE4
624 # aesdec KEY, STATE3
625 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
626 # aesdec KEY, STATE4
627 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
628 movaps -0x30(TKEYP), KEY 501 movaps -0x30(TKEYP), KEY
629 # aesdec KEY, STATE1 502 AESDEC KEY STATE1
630 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 503 AESDEC KEY STATE2
631 # aesdec KEY, STATE2 504 AESDEC KEY STATE3
632 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 505 AESDEC KEY STATE4
633 # aesdec KEY, STATE3
634 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
635 # aesdec KEY, STATE4
636 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
637.align 4 506.align 4
638.L4dec128: 507.L4dec128:
639 movaps -0x20(TKEYP), KEY 508 movaps -0x20(TKEYP), KEY
640 # aesdec KEY, STATE1 509 AESDEC KEY STATE1
641 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 510 AESDEC KEY STATE2
642 # aesdec KEY, STATE2 511 AESDEC KEY STATE3
643 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 512 AESDEC KEY STATE4
644 # aesdec KEY, STATE3
645 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
646 # aesdec KEY, STATE4
647 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
648 movaps -0x10(TKEYP), KEY 513 movaps -0x10(TKEYP), KEY
649 # aesdec KEY, STATE1 514 AESDEC KEY STATE1
650 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 515 AESDEC KEY STATE2
651 # aesdec KEY, STATE2 516 AESDEC KEY STATE3
652 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 517 AESDEC KEY STATE4
653 # aesdec KEY, STATE3
654 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
655 # aesdec KEY, STATE4
656 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
657 movaps (TKEYP), KEY 518 movaps (TKEYP), KEY
658 # aesdec KEY, STATE1 519 AESDEC KEY STATE1
659 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 520 AESDEC KEY STATE2
660 # aesdec KEY, STATE2 521 AESDEC KEY STATE3
661 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 522 AESDEC KEY STATE4
662 # aesdec KEY, STATE3
663 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
664 # aesdec KEY, STATE4
665 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
666 movaps 0x10(TKEYP), KEY 523 movaps 0x10(TKEYP), KEY
667 # aesdec KEY, STATE1 524 AESDEC KEY STATE1
668 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 525 AESDEC KEY STATE2
669 # aesdec KEY, STATE2 526 AESDEC KEY STATE3
670 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 527 AESDEC KEY STATE4
671 # aesdec KEY, STATE3
672 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
673 # aesdec KEY, STATE4
674 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
675 movaps 0x20(TKEYP), KEY 528 movaps 0x20(TKEYP), KEY
676 # aesdec KEY, STATE1 529 AESDEC KEY STATE1
677 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 530 AESDEC KEY STATE2
678 # aesdec KEY, STATE2 531 AESDEC KEY STATE3
679 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 532 AESDEC KEY STATE4
680 # aesdec KEY, STATE3
681 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
682 # aesdec KEY, STATE4
683 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
684 movaps 0x30(TKEYP), KEY 533 movaps 0x30(TKEYP), KEY
685 # aesdec KEY, STATE1 534 AESDEC KEY STATE1
686 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 535 AESDEC KEY STATE2
687 # aesdec KEY, STATE2 536 AESDEC KEY STATE3
688 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 537 AESDEC KEY STATE4
689 # aesdec KEY, STATE3
690 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
691 # aesdec KEY, STATE4
692 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
693 movaps 0x40(TKEYP), KEY 538 movaps 0x40(TKEYP), KEY
694 # aesdec KEY, STATE1 539 AESDEC KEY STATE1
695 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 540 AESDEC KEY STATE2
696 # aesdec KEY, STATE2 541 AESDEC KEY STATE3
697 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 542 AESDEC KEY STATE4
698 # aesdec KEY, STATE3
699 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
700 # aesdec KEY, STATE4
701 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
702 movaps 0x50(TKEYP), KEY 543 movaps 0x50(TKEYP), KEY
703 # aesdec KEY, STATE1 544 AESDEC KEY STATE1
704 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 545 AESDEC KEY STATE2
705 # aesdec KEY, STATE2 546 AESDEC KEY STATE3
706 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 547 AESDEC KEY STATE4
707 # aesdec KEY, STATE3
708 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
709 # aesdec KEY, STATE4
710 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
711 movaps 0x60(TKEYP), KEY 548 movaps 0x60(TKEYP), KEY
712 # aesdec KEY, STATE1 549 AESDEC KEY STATE1
713 .byte 0x66, 0x0f, 0x38, 0xde, 0xc2 550 AESDEC KEY STATE2
714 # aesdec KEY, STATE2 551 AESDEC KEY STATE3
715 .byte 0x66, 0x0f, 0x38, 0xde, 0xe2 552 AESDEC KEY STATE4
716 # aesdec KEY, STATE3
717 .byte 0x66, 0x0f, 0x38, 0xde, 0xea
718 # aesdec KEY, STATE4
719 .byte 0x66, 0x0f, 0x38, 0xde, 0xf2
720 movaps 0x70(TKEYP), KEY 553 movaps 0x70(TKEYP), KEY
721 # aesdeclast KEY, STATE1 # last round 554 AESDECLAST KEY STATE1 # last round
722 .byte 0x66, 0x0f, 0x38, 0xdf, 0xc2 555 AESDECLAST KEY STATE2
723 # aesdeclast KEY, STATE2 556 AESDECLAST KEY STATE3
724 .byte 0x66, 0x0f, 0x38, 0xdf, 0xe2 557 AESDECLAST KEY STATE4
725 # aesdeclast KEY, STATE3
726 .byte 0x66, 0x0f, 0x38, 0xdf, 0xea
727 # aesdeclast KEY, STATE4
728 .byte 0x66, 0x0f, 0x38, 0xdf, 0xf2
729 ret 558 ret
730 559
731/* 560/*
diff --git a/arch/x86/crypto/fpu.c b/arch/x86/crypto/fpu.c
index daef6cd2b45d..1a8f8649c035 100644
--- a/arch/x86/crypto/fpu.c
+++ b/arch/x86/crypto/fpu.c
@@ -16,6 +16,7 @@
16#include <linux/init.h> 16#include <linux/init.h>
17#include <linux/kernel.h> 17#include <linux/kernel.h>
18#include <linux/module.h> 18#include <linux/module.h>
19#include <linux/slab.h>
19#include <asm/i387.h> 20#include <asm/i387.h>
20 21
21struct crypto_fpu_ctx { 22struct crypto_fpu_ctx {
diff --git a/arch/x86/crypto/ghash-clmulni-intel_asm.S b/arch/x86/crypto/ghash-clmulni-intel_asm.S
new file mode 100644
index 000000000000..1eb7f90cb7b9
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_asm.S
@@ -0,0 +1,157 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains accelerated part of ghash
4 * implementation. More information about PCLMULQDQ can be found at:
5 *
6 * http://software.intel.com/en-us/articles/carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/
7 *
8 * Copyright (c) 2009 Intel Corp.
9 * Author: Huang Ying <ying.huang@intel.com>
10 * Vinodh Gopal
11 * Erdinc Ozturk
12 * Deniz Karakoyunlu
13 *
14 * This program is free software; you can redistribute it and/or modify it
15 * under the terms of the GNU General Public License version 2 as published
16 * by the Free Software Foundation.
17 */
18
19#include <linux/linkage.h>
20#include <asm/inst.h>
21
22.data
23
24.align 16
25.Lbswap_mask:
26 .octa 0x000102030405060708090a0b0c0d0e0f
27.Lpoly:
28 .octa 0xc2000000000000000000000000000001
29.Ltwo_one:
30 .octa 0x00000001000000000000000000000001
31
32#define DATA %xmm0
33#define SHASH %xmm1
34#define T1 %xmm2
35#define T2 %xmm3
36#define T3 %xmm4
37#define BSWAP %xmm5
38#define IN1 %xmm6
39
40.text
41
42/*
43 * __clmul_gf128mul_ble: internal ABI
44 * input:
45 * DATA: operand1
46 * SHASH: operand2, hash_key << 1 mod poly
47 * output:
48 * DATA: operand1 * operand2 mod poly
49 * changed:
50 * T1
51 * T2
52 * T3
53 */
54__clmul_gf128mul_ble:
55 movaps DATA, T1
56 pshufd $0b01001110, DATA, T2
57 pshufd $0b01001110, SHASH, T3
58 pxor DATA, T2
59 pxor SHASH, T3
60
61 PCLMULQDQ 0x00 SHASH DATA # DATA = a0 * b0
62 PCLMULQDQ 0x11 SHASH T1 # T1 = a1 * b1
63 PCLMULQDQ 0x00 T3 T2 # T2 = (a1 + a0) * (b1 + b0)
64 pxor DATA, T2
65 pxor T1, T2 # T2 = a0 * b1 + a1 * b0
66
67 movaps T2, T3
68 pslldq $8, T3
69 psrldq $8, T2
70 pxor T3, DATA
71 pxor T2, T1 # <T1:DATA> is result of
72 # carry-less multiplication
73
74 # first phase of the reduction
75 movaps DATA, T3
76 psllq $1, T3
77 pxor DATA, T3
78 psllq $5, T3
79 pxor DATA, T3
80 psllq $57, T3
81 movaps T3, T2
82 pslldq $8, T2
83 psrldq $8, T3
84 pxor T2, DATA
85 pxor T3, T1
86
87 # second phase of the reduction
88 movaps DATA, T2
89 psrlq $5, T2
90 pxor DATA, T2
91 psrlq $1, T2
92 pxor DATA, T2
93 psrlq $1, T2
94 pxor T2, T1
95 pxor T1, DATA
96 ret
97
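
The comments in __clmul_gf128mul_ble describe a Karatsuba-style carry-less multiplication followed by a two-phase reduction; in equation form (a restatement of the algebra for orientation, with the operands viewed as polynomials over GF(2) split into 64-bit halves):

	a(x) = a_1 x^{64} + a_0, \qquad b(x) = b_1 x^{64} + b_0
	a \cdot b = a_1 b_1 \, x^{128}
	          + \bigl[(a_1 + a_0)(b_1 + b_0) + a_1 b_1 + a_0 b_0\bigr] x^{64}
	          + a_0 b_0

Three PCLMULQDQ invocations (a_0 b_0, a_1 b_1 and (a_1 + a_0)(b_1 + b_0)) therefore cover the whole 256-bit product held in <T1:DATA>. The "first phase" and "second phase" blocks then reduce that product modulo the GHASH polynomial x^{128} + x^7 + x^2 + x + 1; because the code works in the bit-reflected representation GHASH uses, the reduction shows up as a fixed shift-and-XOR pattern (left shifts by 57/62/63, right shifts by 1/2/7) rather than as further carry-less multiplies.
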
98/* void clmul_ghash_mul(char *dst, const be128 *shash) */
99ENTRY(clmul_ghash_mul)
100 movups (%rdi), DATA
101 movups (%rsi), SHASH
102 movaps .Lbswap_mask, BSWAP
103 PSHUFB_XMM BSWAP DATA
104 call __clmul_gf128mul_ble
105 PSHUFB_XMM BSWAP DATA
106 movups DATA, (%rdi)
107 ret
108
109/*
110 * void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
111 * const be128 *shash);
112 */
113ENTRY(clmul_ghash_update)
114 cmp $16, %rdx
115 jb .Lupdate_just_ret # check length
116 movaps .Lbswap_mask, BSWAP
117 movups (%rdi), DATA
118 movups (%rcx), SHASH
119 PSHUFB_XMM BSWAP DATA
120.align 4
121.Lupdate_loop:
122 movups (%rsi), IN1
123 PSHUFB_XMM BSWAP IN1
124 pxor IN1, DATA
125 call __clmul_gf128mul_ble
126 sub $16, %rdx
127 add $16, %rsi
128 cmp $16, %rdx
129 jge .Lupdate_loop
130 PSHUFB_XMM BSWAP DATA
131 movups DATA, (%rdi)
132.Lupdate_just_ret:
133 ret
134
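
clmul_ghash_update is the streaming form of GHASH: with Y the running digest at (%rdi) and X_1, X_2, ... the 16-byte input blocks, each loop iteration computes

	Y \leftarrow (Y \oplus X_i) \cdot H \bmod (x^{128} + x^7 + x^2 + x + 1)

which is exactly the pxor IN1, DATA followed by the call to __clmul_gf128mul_ble; the PSHUFB_XMM byte swaps on entry and exit translate between the in-memory byte order and the reflected ordering the multiplier expects (an informal reading of the code, not wording taken from the patch).
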
135/*
136 * void clmul_ghash_setkey(be128 *shash, const u8 *key);
137 *
138 * Calculate hash_key << 1 mod poly
139 */
140ENTRY(clmul_ghash_setkey)
141 movaps .Lbswap_mask, BSWAP
142 movups (%rsi), %xmm0
143 PSHUFB_XMM BSWAP %xmm0
144 movaps %xmm0, %xmm1
145 psllq $1, %xmm0
146 psrlq $63, %xmm1
147 movaps %xmm1, %xmm2
148 pslldq $8, %xmm1
149 psrldq $8, %xmm2
150 por %xmm1, %xmm0
151 # reduction
152 pshufd $0b00100100, %xmm2, %xmm1
153 pcmpeqd .Ltwo_one, %xmm1
154 pand .Lpoly, %xmm1
155 pxor %xmm1, %xmm0
156 movups %xmm0, (%rdi)
157 ret
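
clmul_ghash_setkey implements the "hash_key << 1 mod poly" calculation promised in the comment above it. After the byte swap, the raw key H becomes

	H' = (H \ll 1) \oplus (\mathrm{msb}(H) \cdot P), \qquad P = \texttt{0xc2000000000000000000000000000001}

where msb(H) is the bit shifted out at the top and P is the constant at .Lpoly. The psllq/psrlq/pslldq/psrldq/por sequence builds the full 128-bit left shift out of two 64-bit lanes, and the pshufd/pcmpeqd against .Ltwo_one followed by pand with .Lpoly selects either the whole polynomial constant or zero depending on that carried-out bit, so the final pxor applies the conditional reduction without a branch (again an informal reading of the code).
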
diff --git a/arch/x86/crypto/ghash-clmulni-intel_glue.c b/arch/x86/crypto/ghash-clmulni-intel_glue.c
new file mode 100644
index 000000000000..cbcc8d8ea93a
--- /dev/null
+++ b/arch/x86/crypto/ghash-clmulni-intel_glue.c
@@ -0,0 +1,333 @@
1/*
2 * Accelerated GHASH implementation with Intel PCLMULQDQ-NI
3 * instructions. This file contains glue code.
4 *
5 * Copyright (c) 2009 Intel Corp.
6 * Author: Huang Ying <ying.huang@intel.com>
7 *
8 * This program is free software; you can redistribute it and/or modify it
9 * under the terms of the GNU General Public License version 2 as published
10 * by the Free Software Foundation.
11 */
12
13#include <linux/module.h>
14#include <linux/init.h>
15#include <linux/kernel.h>
16#include <linux/crypto.h>
17#include <crypto/algapi.h>
18#include <crypto/cryptd.h>
19#include <crypto/gf128mul.h>
20#include <crypto/internal/hash.h>
21#include <asm/i387.h>
22
23#define GHASH_BLOCK_SIZE 16
24#define GHASH_DIGEST_SIZE 16
25
26void clmul_ghash_mul(char *dst, const be128 *shash);
27
28void clmul_ghash_update(char *dst, const char *src, unsigned int srclen,
29 const be128 *shash);
30
31void clmul_ghash_setkey(be128 *shash, const u8 *key);
32
33struct ghash_async_ctx {
34 struct cryptd_ahash *cryptd_tfm;
35};
36
37struct ghash_ctx {
38 be128 shash;
39};
40
41struct ghash_desc_ctx {
42 u8 buffer[GHASH_BLOCK_SIZE];
43 u32 bytes;
44};
45
46static int ghash_init(struct shash_desc *desc)
47{
48 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
49
50 memset(dctx, 0, sizeof(*dctx));
51
52 return 0;
53}
54
55static int ghash_setkey(struct crypto_shash *tfm,
56 const u8 *key, unsigned int keylen)
57{
58 struct ghash_ctx *ctx = crypto_shash_ctx(tfm);
59
60 if (keylen != GHASH_BLOCK_SIZE) {
61 crypto_shash_set_flags(tfm, CRYPTO_TFM_RES_BAD_KEY_LEN);
62 return -EINVAL;
63 }
64
65 clmul_ghash_setkey(&ctx->shash, key);
66
67 return 0;
68}
69
70static int ghash_update(struct shash_desc *desc,
71 const u8 *src, unsigned int srclen)
72{
73 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
74 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
75 u8 *dst = dctx->buffer;
76
77 kernel_fpu_begin();
78 if (dctx->bytes) {
79 int n = min(srclen, dctx->bytes);
80 u8 *pos = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
81
82 dctx->bytes -= n;
83 srclen -= n;
84
85 while (n--)
86 *pos++ ^= *src++;
87
88 if (!dctx->bytes)
89 clmul_ghash_mul(dst, &ctx->shash);
90 }
91
92 clmul_ghash_update(dst, src, srclen, &ctx->shash);
93 kernel_fpu_end();
94
95 if (srclen & 0xf) {
96 src += srclen - (srclen & 0xf);
97 srclen &= 0xf;
98 dctx->bytes = GHASH_BLOCK_SIZE - srclen;
99 while (srclen--)
100 *dst++ ^= *src++;
101 }
102
103 return 0;
104}
105
106static void ghash_flush(struct ghash_ctx *ctx, struct ghash_desc_ctx *dctx)
107{
108 u8 *dst = dctx->buffer;
109
110 if (dctx->bytes) {
111 u8 *tmp = dst + (GHASH_BLOCK_SIZE - dctx->bytes);
112
113 while (dctx->bytes--)
114 *tmp++ ^= 0;
115
116 kernel_fpu_begin();
117 clmul_ghash_mul(dst, &ctx->shash);
118 kernel_fpu_end();
119 }
120
121 dctx->bytes = 0;
122}
123
124static int ghash_final(struct shash_desc *desc, u8 *dst)
125{
126 struct ghash_desc_ctx *dctx = shash_desc_ctx(desc);
127 struct ghash_ctx *ctx = crypto_shash_ctx(desc->tfm);
128 u8 *buf = dctx->buffer;
129
130 ghash_flush(ctx, dctx);
131 memcpy(dst, buf, GHASH_BLOCK_SIZE);
132
133 return 0;
134}
135
136static struct shash_alg ghash_alg = {
137 .digestsize = GHASH_DIGEST_SIZE,
138 .init = ghash_init,
139 .update = ghash_update,
140 .final = ghash_final,
141 .setkey = ghash_setkey,
142 .descsize = sizeof(struct ghash_desc_ctx),
143 .base = {
144 .cra_name = "__ghash",
145 .cra_driver_name = "__ghash-pclmulqdqni",
146 .cra_priority = 0,
147 .cra_flags = CRYPTO_ALG_TYPE_SHASH,
148 .cra_blocksize = GHASH_BLOCK_SIZE,
149 .cra_ctxsize = sizeof(struct ghash_ctx),
150 .cra_module = THIS_MODULE,
151 .cra_list = LIST_HEAD_INIT(ghash_alg.base.cra_list),
152 },
153};
154
155static int ghash_async_init(struct ahash_request *req)
156{
157 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
158 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
159 struct ahash_request *cryptd_req = ahash_request_ctx(req);
160 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
161
162 if (!irq_fpu_usable()) {
163 memcpy(cryptd_req, req, sizeof(*req));
164 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
165 return crypto_ahash_init(cryptd_req);
166 } else {
167 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
168 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
169
170 desc->tfm = child;
171 desc->flags = req->base.flags;
172 return crypto_shash_init(desc);
173 }
174}
175
176static int ghash_async_update(struct ahash_request *req)
177{
178 struct ahash_request *cryptd_req = ahash_request_ctx(req);
179
180 if (!irq_fpu_usable()) {
181 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
182 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
183 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
184
185 memcpy(cryptd_req, req, sizeof(*req));
186 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
187 return crypto_ahash_update(cryptd_req);
188 } else {
189 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
190 return shash_ahash_update(req, desc);
191 }
192}
193
194static int ghash_async_final(struct ahash_request *req)
195{
196 struct ahash_request *cryptd_req = ahash_request_ctx(req);
197
198 if (!irq_fpu_usable()) {
199 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
200 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
201 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
202
203 memcpy(cryptd_req, req, sizeof(*req));
204 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
205 return crypto_ahash_final(cryptd_req);
206 } else {
207 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
208 return crypto_shash_final(desc, req->result);
209 }
210}
211
212static int ghash_async_digest(struct ahash_request *req)
213{
214 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req);
215 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
216 struct ahash_request *cryptd_req = ahash_request_ctx(req);
217 struct cryptd_ahash *cryptd_tfm = ctx->cryptd_tfm;
218
219 if (!irq_fpu_usable()) {
220 memcpy(cryptd_req, req, sizeof(*req));
221 ahash_request_set_tfm(cryptd_req, &cryptd_tfm->base);
222 return crypto_ahash_digest(cryptd_req);
223 } else {
224 struct shash_desc *desc = cryptd_shash_desc(cryptd_req);
225 struct crypto_shash *child = cryptd_ahash_child(cryptd_tfm);
226
227 desc->tfm = child;
228 desc->flags = req->base.flags;
229 return shash_ahash_digest(req, desc);
230 }
231}
232
233static int ghash_async_setkey(struct crypto_ahash *tfm, const u8 *key,
234 unsigned int keylen)
235{
236 struct ghash_async_ctx *ctx = crypto_ahash_ctx(tfm);
237 struct crypto_ahash *child = &ctx->cryptd_tfm->base;
238 int err;
239
240 crypto_ahash_clear_flags(child, CRYPTO_TFM_REQ_MASK);
241 crypto_ahash_set_flags(child, crypto_ahash_get_flags(tfm)
242 & CRYPTO_TFM_REQ_MASK);
243 err = crypto_ahash_setkey(child, key, keylen);
244 crypto_ahash_set_flags(tfm, crypto_ahash_get_flags(child)
245 & CRYPTO_TFM_RES_MASK);
246
247 return 0;
248}
249
250static int ghash_async_init_tfm(struct crypto_tfm *tfm)
251{
252 struct cryptd_ahash *cryptd_tfm;
253 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
254
255 cryptd_tfm = cryptd_alloc_ahash("__ghash-pclmulqdqni", 0, 0);
256 if (IS_ERR(cryptd_tfm))
257 return PTR_ERR(cryptd_tfm);
258 ctx->cryptd_tfm = cryptd_tfm;
259 crypto_ahash_set_reqsize(__crypto_ahash_cast(tfm),
260 sizeof(struct ahash_request) +
261 crypto_ahash_reqsize(&cryptd_tfm->base));
262
263 return 0;
264}
265
266static void ghash_async_exit_tfm(struct crypto_tfm *tfm)
267{
268 struct ghash_async_ctx *ctx = crypto_tfm_ctx(tfm);
269
270 cryptd_free_ahash(ctx->cryptd_tfm);
271}
272
273static struct ahash_alg ghash_async_alg = {
274 .init = ghash_async_init,
275 .update = ghash_async_update,
276 .final = ghash_async_final,
277 .setkey = ghash_async_setkey,
278 .digest = ghash_async_digest,
279 .halg = {
280 .digestsize = GHASH_DIGEST_SIZE,
281 .base = {
282 .cra_name = "ghash",
283 .cra_driver_name = "ghash-clmulni",
284 .cra_priority = 400,
285 .cra_flags = CRYPTO_ALG_TYPE_AHASH | CRYPTO_ALG_ASYNC,
286 .cra_blocksize = GHASH_BLOCK_SIZE,
287 .cra_type = &crypto_ahash_type,
288 .cra_module = THIS_MODULE,
289 .cra_list = LIST_HEAD_INIT(ghash_async_alg.halg.base.cra_list),
290 .cra_init = ghash_async_init_tfm,
291 .cra_exit = ghash_async_exit_tfm,
292 },
293 },
294};
295
296static int __init ghash_pclmulqdqni_mod_init(void)
297{
298 int err;
299
300 if (!cpu_has_pclmulqdq) {
301 printk(KERN_INFO "Intel PCLMULQDQ-NI instructions are not"
302 " detected.\n");
303 return -ENODEV;
304 }
305
306 err = crypto_register_shash(&ghash_alg);
307 if (err)
308 goto err_out;
309 err = crypto_register_ahash(&ghash_async_alg);
310 if (err)
311 goto err_shash;
312
313 return 0;
314
315err_shash:
316 crypto_unregister_shash(&ghash_alg);
317err_out:
318 return err;
319}
320
321static void __exit ghash_pclmulqdqni_mod_exit(void)
322{
323 crypto_unregister_ahash(&ghash_async_alg);
324 crypto_unregister_shash(&ghash_alg);
325}
326
327module_init(ghash_pclmulqdqni_mod_init);
328module_exit(ghash_pclmulqdqni_mod_exit);
329
330MODULE_LICENSE("GPL");
331MODULE_DESCRIPTION("GHASH Message Digest Algorithm, "
332 "acclerated by PCLMULQDQ-NI");
333MODULE_ALIAS("ghash");
diff --git a/arch/x86/crypto/twofish-i586-asm_32.S b/arch/x86/crypto/twofish-i586-asm_32.S
index 39b98ed2c1b9..575331cb2a8a 100644
--- a/arch/x86/crypto/twofish-i586-asm_32.S
+++ b/arch/x86/crypto/twofish-i586-asm_32.S
@@ -22,7 +22,7 @@
22 22
23#include <asm/asm-offsets.h> 23#include <asm/asm-offsets.h>
24 24
25/* return adress at 0 */ 25/* return address at 0 */
26 26
27#define in_blk 12 /* input byte array address parameter*/ 27#define in_blk 12 /* input byte array address parameter*/
28#define out_blk 8 /* output byte array address parameter*/ 28#define out_blk 8 /* output byte array address parameter*/
@@ -230,8 +230,8 @@ twofish_enc_blk:
230 push %edi 230 push %edi
231 231
232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 232 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
233 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 233 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
234 mov in_blk+16(%esp),%edi /* input adress in edi */ 234 mov in_blk+16(%esp),%edi /* input address in edi */
235 235
236 mov (%edi), %eax 236 mov (%edi), %eax
237 mov b_offset(%edi), %ebx 237 mov b_offset(%edi), %ebx
@@ -286,8 +286,8 @@ twofish_dec_blk:
286 286
287 287
288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */ 288 mov tfm + 16(%esp), %ebp /* abuse the base pointer: set new base bointer to the crypto tfm */
289 add $crypto_tfm_ctx_offset, %ebp /* ctx adress */ 289 add $crypto_tfm_ctx_offset, %ebp /* ctx address */
290 mov in_blk+16(%esp),%edi /* input adress in edi */ 290 mov in_blk+16(%esp),%edi /* input address in edi */
291 291
292 mov (%edi), %eax 292 mov (%edi), %eax
293 mov b_offset(%edi), %ebx 293 mov b_offset(%edi), %ebx
diff --git a/arch/x86/crypto/twofish-x86_64-asm_64.S b/arch/x86/crypto/twofish-x86_64-asm_64.S
index 35974a586615..573aa102542e 100644
--- a/arch/x86/crypto/twofish-x86_64-asm_64.S
+++ b/arch/x86/crypto/twofish-x86_64-asm_64.S
@@ -221,11 +221,11 @@
221twofish_enc_blk: 221twofish_enc_blk:
222 pushq R1 222 pushq R1
223 223
224 /* %rdi contains the crypto tfm adress */ 224 /* %rdi contains the crypto tfm address */
225 /* %rsi contains the output adress */ 225 /* %rsi contains the output address */
226 /* %rdx contains the input adress */ 226 /* %rdx contains the input address */
227 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 227 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
228 /* ctx adress is moved to free one non-rex register 228 /* ctx address is moved to free one non-rex register
229 as target for the 8bit high operations */ 229 as target for the 8bit high operations */
230 mov %rdi, %r11 230 mov %rdi, %r11
231 231
@@ -274,11 +274,11 @@ twofish_enc_blk:
274twofish_dec_blk: 274twofish_dec_blk:
275 pushq R1 275 pushq R1
276 276
277 /* %rdi contains the crypto tfm adress */ 277 /* %rdi contains the crypto tfm address */
278 /* %rsi contains the output adress */ 278 /* %rsi contains the output address */
279 /* %rdx contains the input adress */ 279 /* %rdx contains the input address */
280 add $crypto_tfm_ctx_offset, %rdi /* set ctx adress */ 280 add $crypto_tfm_ctx_offset, %rdi /* set ctx address */
281 /* ctx adress is moved to free one non-rex register 281 /* ctx address is moved to free one non-rex register
282 as target for the 8bit high operations */ 282 as target for the 8bit high operations */
283 mov %rdi, %r11 283 mov %rdi, %r11
284 284
diff --git a/arch/x86/ia32/ia32_aout.c b/arch/x86/ia32/ia32_aout.c
index 2a4d073d2cf1..0350311906ae 100644
--- a/arch/x86/ia32/ia32_aout.c
+++ b/arch/x86/ia32/ia32_aout.c
@@ -21,7 +21,6 @@
21#include <linux/fcntl.h> 21#include <linux/fcntl.h>
22#include <linux/ptrace.h> 22#include <linux/ptrace.h>
23#include <linux/user.h> 23#include <linux/user.h>
24#include <linux/slab.h>
25#include <linux/binfmts.h> 24#include <linux/binfmts.h>
26#include <linux/personality.h> 25#include <linux/personality.h>
27#include <linux/init.h> 26#include <linux/init.h>
@@ -297,7 +296,7 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
297 * size limits imposed on them by creating programs with large 296 * size limits imposed on them by creating programs with large
298 * arrays in the data or bss. 297 * arrays in the data or bss.
299 */ 298 */
300 rlim = current->signal->rlim[RLIMIT_DATA].rlim_cur; 299 rlim = rlimit(RLIMIT_DATA);
301 if (rlim >= RLIM_INFINITY) 300 if (rlim >= RLIM_INFINITY)
302 rlim = ~0; 301 rlim = ~0;
303 if (ex.a_data + ex.a_bss > rlim) 302 if (ex.a_data + ex.a_bss > rlim)
@@ -308,14 +307,15 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
308 if (retval) 307 if (retval)
309 return retval; 308 return retval;
310 309
311 regs->cs = __USER32_CS;
312 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
313 regs->r13 = regs->r14 = regs->r15 = 0;
314
315 /* OK, This is the point of no return */ 310 /* OK, This is the point of no return */
316 set_personality(PER_LINUX); 311 set_personality(PER_LINUX);
317 set_thread_flag(TIF_IA32); 312 set_thread_flag(TIF_IA32);
318 clear_thread_flag(TIF_ABI_PENDING); 313
314 setup_new_exec(bprm);
315
316 regs->cs = __USER32_CS;
317 regs->r8 = regs->r9 = regs->r10 = regs->r11 = regs->r12 =
318 regs->r13 = regs->r14 = regs->r15 = 0;
319 319
320 current->mm->end_code = ex.a_text + 320 current->mm->end_code = ex.a_text +
321 (current->mm->start_code = N_TXTADDR(ex)); 321 (current->mm->start_code = N_TXTADDR(ex));
@@ -326,7 +326,6 @@ static int load_aout_binary(struct linux_binprm *bprm, struct pt_regs *regs)
326 current->mm->free_area_cache = TASK_UNMAPPED_BASE; 326 current->mm->free_area_cache = TASK_UNMAPPED_BASE;
327 current->mm->cached_hole_size = 0; 327 current->mm->cached_hole_size = 0;
328 328
329 current->mm->mmap = NULL;
330 install_exec_creds(bprm); 329 install_exec_creds(bprm);
331 current->flags &= ~PF_FORKNOEXEC; 330 current->flags &= ~PF_FORKNOEXEC;
332 331
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 581b0568fe19..e790bc1fbfa3 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -563,7 +563,7 @@ ia32_sys_call_table:
563 .quad quiet_ni_syscall /* old mpx syscall holder */ 563 .quad quiet_ni_syscall /* old mpx syscall holder */
564 .quad sys_setpgid 564 .quad sys_setpgid
565 .quad quiet_ni_syscall /* old ulimit syscall holder */ 565 .quad quiet_ni_syscall /* old ulimit syscall holder */
566 .quad sys32_olduname 566 .quad sys_olduname
567 .quad sys_umask /* 60 */ 567 .quad sys_umask /* 60 */
568 .quad sys_chroot 568 .quad sys_chroot
569 .quad compat_sys_ustat 569 .quad compat_sys_ustat
@@ -586,7 +586,7 @@ ia32_sys_call_table:
586 .quad compat_sys_settimeofday 586 .quad compat_sys_settimeofday
587 .quad sys_getgroups16 /* 80 */ 587 .quad sys_getgroups16 /* 80 */
588 .quad sys_setgroups16 588 .quad sys_setgroups16
589 .quad sys32_old_select 589 .quad compat_sys_old_select
590 .quad sys_symlink 590 .quad sys_symlink
591 .quad sys_lstat 591 .quad sys_lstat
592 .quad sys_readlink /* 85 */ 592 .quad sys_readlink /* 85 */
@@ -613,7 +613,7 @@ ia32_sys_call_table:
613 .quad compat_sys_newstat 613 .quad compat_sys_newstat
614 .quad compat_sys_newlstat 614 .quad compat_sys_newlstat
615 .quad compat_sys_newfstat 615 .quad compat_sys_newfstat
616 .quad sys32_uname 616 .quad sys_uname
617 .quad stub32_iopl /* 110 */ 617 .quad stub32_iopl /* 110 */
618 .quad sys_vhangup 618 .quad sys_vhangup
619 .quad quiet_ni_syscall /* old "idle" system call */ 619 .quad quiet_ni_syscall /* old "idle" system call */
@@ -626,7 +626,7 @@ ia32_sys_call_table:
626 .quad stub32_sigreturn 626 .quad stub32_sigreturn
627 .quad stub32_clone /* 120 */ 627 .quad stub32_clone /* 120 */
628 .quad sys_setdomainname 628 .quad sys_setdomainname
629 .quad sys_uname 629 .quad sys_newuname
630 .quad sys_modify_ldt 630 .quad sys_modify_ldt
631 .quad compat_sys_adjtimex 631 .quad compat_sys_adjtimex
632 .quad sys32_mprotect /* 125 */ 632 .quad sys32_mprotect /* 125 */
@@ -653,7 +653,7 @@ ia32_sys_call_table:
653 .quad compat_sys_writev 653 .quad compat_sys_writev
654 .quad sys_getsid 654 .quad sys_getsid
655 .quad sys_fdatasync 655 .quad sys_fdatasync
656 .quad sys32_sysctl /* sysctl */ 656 .quad compat_sys_sysctl /* sysctl */
657 .quad sys_mlock /* 150 */ 657 .quad sys_mlock /* 150 */
658 .quad sys_munlock 658 .quad sys_munlock
659 .quad sys_mlockall 659 .quad sys_mlockall
@@ -696,7 +696,7 @@ ia32_sys_call_table:
696 .quad quiet_ni_syscall /* streams2 */ 696 .quad quiet_ni_syscall /* streams2 */
697 .quad stub32_vfork /* 190 */ 697 .quad stub32_vfork /* 190 */
698 .quad compat_sys_getrlimit 698 .quad compat_sys_getrlimit
699 .quad sys32_mmap2 699 .quad sys_mmap_pgoff
700 .quad sys32_truncate64 700 .quad sys32_truncate64
701 .quad sys32_ftruncate64 701 .quad sys32_ftruncate64
702 .quad sys32_stat64 /* 195 */ 702 .quad sys32_stat64 /* 195 */
@@ -841,4 +841,5 @@ ia32_sys_call_table:
841 .quad compat_sys_pwritev 841 .quad compat_sys_pwritev
842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */ 842 .quad compat_sys_rt_tgsigqueueinfo /* 335 */
843 .quad sys_perf_event_open 843 .quad sys_perf_event_open
844 .quad compat_sys_recvmmsg
844ia32_syscall_end: 845ia32_syscall_end:
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 9f5527198825..626be156d88d 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -40,6 +40,7 @@
40#include <linux/ptrace.h> 40#include <linux/ptrace.h>
41#include <linux/highuid.h> 41#include <linux/highuid.h>
42#include <linux/sysctl.h> 42#include <linux/sysctl.h>
43#include <linux/slab.h>
43#include <asm/mman.h> 44#include <asm/mman.h>
44#include <asm/types.h> 45#include <asm/types.h>
45#include <asm/uaccess.h> 46#include <asm/uaccess.h>
@@ -143,7 +144,7 @@ asmlinkage long sys32_fstatat(unsigned int dfd, char __user *filename,
143 * block for parameter passing.. 144 * block for parameter passing..
144 */ 145 */
145 146
146struct mmap_arg_struct { 147struct mmap_arg_struct32 {
147 unsigned int addr; 148 unsigned int addr;
148 unsigned int len; 149 unsigned int len;
149 unsigned int prot; 150 unsigned int prot;
@@ -152,12 +153,9 @@ struct mmap_arg_struct {
152 unsigned int offset; 153 unsigned int offset;
153}; 154};
154 155
155asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg) 156asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *arg)
156{ 157{
157 struct mmap_arg_struct a; 158 struct mmap_arg_struct32 a;
158 struct file *file = NULL;
159 unsigned long retval;
160 struct mm_struct *mm ;
161 159
162 if (copy_from_user(&a, arg, sizeof(a))) 160 if (copy_from_user(&a, arg, sizeof(a)))
163 return -EFAULT; 161 return -EFAULT;
@@ -165,22 +163,8 @@ asmlinkage long sys32_mmap(struct mmap_arg_struct __user *arg)
165 if (a.offset & ~PAGE_MASK) 163 if (a.offset & ~PAGE_MASK)
166 return -EINVAL; 164 return -EINVAL;
167 165
168 if (!(a.flags & MAP_ANONYMOUS)) { 166 return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags, a.fd,
169 file = fget(a.fd);
170 if (!file)
171 return -EBADF;
172 }
173
174 mm = current->mm;
175 down_write(&mm->mmap_sem);
176 retval = do_mmap_pgoff(file, a.addr, a.len, a.prot, a.flags,
177 a.offset>>PAGE_SHIFT); 167 a.offset>>PAGE_SHIFT);
178 if (file)
179 fput(file);
180
181 up_write(&mm->mmap_sem);
182
183 return retval;
184} 168}
185 169
186asmlinkage long sys32_mprotect(unsigned long start, size_t len, 170asmlinkage long sys32_mprotect(unsigned long start, size_t len,
@@ -349,24 +333,6 @@ asmlinkage long sys32_alarm(unsigned int seconds)
349 return alarm_setitimer(seconds); 333 return alarm_setitimer(seconds);
350} 334}
351 335
352struct sel_arg_struct {
353 unsigned int n;
354 unsigned int inp;
355 unsigned int outp;
356 unsigned int exp;
357 unsigned int tvp;
358};
359
360asmlinkage long sys32_old_select(struct sel_arg_struct __user *arg)
361{
362 struct sel_arg_struct a;
363
364 if (copy_from_user(&a, arg, sizeof(a)))
365 return -EFAULT;
366 return compat_sys_select(a.n, compat_ptr(a.inp), compat_ptr(a.outp),
367 compat_ptr(a.exp), compat_ptr(a.tvp));
368}
369
370asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr, 336asmlinkage long sys32_waitpid(compat_pid_t pid, unsigned int *stat_addr,
371 int options) 337 int options)
372{ 338{
@@ -434,62 +400,6 @@ asmlinkage long sys32_rt_sigqueueinfo(int pid, int sig,
434 return ret; 400 return ret;
435} 401}
436 402
437#ifdef CONFIG_SYSCTL_SYSCALL
438struct sysctl_ia32 {
439 unsigned int name;
440 int nlen;
441 unsigned int oldval;
442 unsigned int oldlenp;
443 unsigned int newval;
444 unsigned int newlen;
445 unsigned int __unused[4];
446};
447
448
449asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *args32)
450{
451 struct sysctl_ia32 a32;
452 mm_segment_t old_fs = get_fs();
453 void __user *oldvalp, *newvalp;
454 size_t oldlen;
455 int __user *namep;
456 long ret;
457
458 if (copy_from_user(&a32, args32, sizeof(a32)))
459 return -EFAULT;
460
461 /*
462 * We need to pre-validate these because we have to disable
463 * address checking before calling do_sysctl() because of
464 * OLDLEN but we can't run the risk of the user specifying bad
465 * addresses here. Well, since we're dealing with 32 bit
466 * addresses, we KNOW that access_ok() will always succeed, so
467 * this is an expensive NOP, but so what...
468 */
469 namep = compat_ptr(a32.name);
470 oldvalp = compat_ptr(a32.oldval);
471 newvalp = compat_ptr(a32.newval);
472
473 if ((oldvalp && get_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
474 || !access_ok(VERIFY_WRITE, namep, 0)
475 || !access_ok(VERIFY_WRITE, oldvalp, 0)
476 || !access_ok(VERIFY_WRITE, newvalp, 0))
477 return -EFAULT;
478
479 set_fs(KERNEL_DS);
480 lock_kernel();
481 ret = do_sysctl(namep, a32.nlen, oldvalp, (size_t __user *)&oldlen,
482 newvalp, (size_t) a32.newlen);
483 unlock_kernel();
484 set_fs(old_fs);
485
486 if (oldvalp && put_user(oldlen, (int __user *)compat_ptr(a32.oldlenp)))
487 return -EFAULT;
488
489 return ret;
490}
491#endif
492
493/* warning: next two assume little endian */ 403/* warning: next two assume little endian */
494asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count, 404asmlinkage long sys32_pread(unsigned int fd, char __user *ubuf, u32 count,
495 u32 poslo, u32 poshi) 405 u32 poslo, u32 poshi)
@@ -539,82 +449,6 @@ asmlinkage long sys32_sendfile(int out_fd, int in_fd,
539 return ret; 449 return ret;
540} 450}
541 451
542asmlinkage long sys32_mmap2(unsigned long addr, unsigned long len,
543 unsigned long prot, unsigned long flags,
544 unsigned long fd, unsigned long pgoff)
545{
546 struct mm_struct *mm = current->mm;
547 unsigned long error;
548 struct file *file = NULL;
549
550 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
551 if (!(flags & MAP_ANONYMOUS)) {
552 file = fget(fd);
553 if (!file)
554 return -EBADF;
555 }
556
557 down_write(&mm->mmap_sem);
558 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
559 up_write(&mm->mmap_sem);
560
561 if (file)
562 fput(file);
563 return error;
564}
565
566asmlinkage long sys32_olduname(struct oldold_utsname __user *name)
567{
568 char *arch = "x86_64";
569 int err;
570
571 if (!name)
572 return -EFAULT;
573 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
574 return -EFAULT;
575
576 down_read(&uts_sem);
577
578 err = __copy_to_user(&name->sysname, &utsname()->sysname,
579 __OLD_UTS_LEN);
580 err |= __put_user(0, name->sysname+__OLD_UTS_LEN);
581 err |= __copy_to_user(&name->nodename, &utsname()->nodename,
582 __OLD_UTS_LEN);
583 err |= __put_user(0, name->nodename+__OLD_UTS_LEN);
584 err |= __copy_to_user(&name->release, &utsname()->release,
585 __OLD_UTS_LEN);
586 err |= __put_user(0, name->release+__OLD_UTS_LEN);
587 err |= __copy_to_user(&name->version, &utsname()->version,
588 __OLD_UTS_LEN);
589 err |= __put_user(0, name->version+__OLD_UTS_LEN);
590
591 if (personality(current->personality) == PER_LINUX32)
592 arch = "i686";
593
594 err |= __copy_to_user(&name->machine, arch, strlen(arch) + 1);
595
596 up_read(&uts_sem);
597
598 err = err ? -EFAULT : 0;
599
600 return err;
601}
602
603long sys32_uname(struct old_utsname __user *name)
604{
605 int err;
606
607 if (!name)
608 return -EFAULT;
609 down_read(&uts_sem);
610 err = copy_to_user(name, utsname(), sizeof(*name));
611 up_read(&uts_sem);
612 if (personality(current->personality) == PER_LINUX32)
613 err |= copy_to_user(&name->machine, "i686", 5);
614
615 return err ? -EFAULT : 0;
616}
617
618asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv, 452asmlinkage long sys32_execve(char __user *name, compat_uptr_t __user *argv,
619 compat_uptr_t __user *envp, struct pt_regs *regs) 453 compat_uptr_t __user *envp, struct pt_regs *regs)
620{ 454{
diff --git a/arch/x86/include/asm/Kbuild b/arch/x86/include/asm/Kbuild
index 4a8e80cdcfa5..493092efaa3b 100644
--- a/arch/x86/include/asm/Kbuild
+++ b/arch/x86/include/asm/Kbuild
@@ -10,6 +10,8 @@ header-y += ptrace-abi.h
10header-y += sigcontext32.h 10header-y += sigcontext32.h
11header-y += ucontext.h 11header-y += ucontext.h
12header-y += processor-flags.h 12header-y += processor-flags.h
13header-y += hw_breakpoint.h
14header-y += hyperv.h
13 15
14unifdef-y += e820.h 16unifdef-y += e820.h
15unifdef-y += ist.h 17unifdef-y += ist.h
diff --git a/arch/x86/include/asm/a.out-core.h b/arch/x86/include/asm/a.out-core.h
index bb70e397aa84..7a15588e45d4 100644
--- a/arch/x86/include/asm/a.out-core.h
+++ b/arch/x86/include/asm/a.out-core.h
@@ -17,6 +17,7 @@
17 17
18#include <linux/user.h> 18#include <linux/user.h>
19#include <linux/elfcore.h> 19#include <linux/elfcore.h>
20#include <asm/debugreg.h>
20 21
21/* 22/*
22 * fill in the user structure for an a.out core dump 23 * fill in the user structure for an a.out core dump
@@ -32,14 +33,7 @@ static inline void aout_dump_thread(struct pt_regs *regs, struct user *dump)
32 >> PAGE_SHIFT; 33 >> PAGE_SHIFT;
33 dump->u_dsize -= dump->u_tsize; 34 dump->u_dsize -= dump->u_tsize;
34 dump->u_ssize = 0; 35 dump->u_ssize = 0;
35 dump->u_debugreg[0] = current->thread.debugreg0; 36 aout_dump_debugregs(dump);
36 dump->u_debugreg[1] = current->thread.debugreg1;
37 dump->u_debugreg[2] = current->thread.debugreg2;
38 dump->u_debugreg[3] = current->thread.debugreg3;
39 dump->u_debugreg[4] = 0;
40 dump->u_debugreg[5] = 0;
41 dump->u_debugreg[6] = current->thread.debugreg6;
42 dump->u_debugreg[7] = current->thread.debugreg7;
43 37
44 if (dump->start_stack < TASK_SIZE) 38 if (dump->start_stack < TASK_SIZE)
45 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack)) 39 dump->u_ssize = ((unsigned long)(TASK_SIZE - dump->start_stack))
diff --git a/arch/x86/include/asm/acpi.h b/arch/x86/include/asm/acpi.h
index 4518dc500903..56f462cf22d2 100644
--- a/arch/x86/include/asm/acpi.h
+++ b/arch/x86/include/asm/acpi.h
@@ -118,7 +118,7 @@ extern void acpi_restore_state_mem(void);
118extern unsigned long acpi_wakeup_address; 118extern unsigned long acpi_wakeup_address;
119 119
120/* early initialization routine */ 120/* early initialization routine */
121extern void acpi_reserve_bootmem(void); 121extern void acpi_reserve_wakeup_memory(void);
122 122
123/* 123/*
124 * Check if the CPU can handle C2 and deeper 124 * Check if the CPU can handle C2 and deeper
@@ -142,6 +142,32 @@ static inline unsigned int acpi_processor_cstate_check(unsigned int max_cstate)
142 return max_cstate; 142 return max_cstate;
143} 143}
144 144
145static inline bool arch_has_acpi_pdc(void)
146{
147 struct cpuinfo_x86 *c = &cpu_data(0);
148 return (c->x86_vendor == X86_VENDOR_INTEL ||
149 c->x86_vendor == X86_VENDOR_CENTAUR);
150}
151
152static inline void arch_acpi_set_pdc_bits(u32 *buf)
153{
154 struct cpuinfo_x86 *c = &cpu_data(0);
155
156 buf[2] |= ACPI_PDC_C_CAPABILITY_SMP;
157
158 if (cpu_has(c, X86_FEATURE_EST))
159 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
160
161 if (cpu_has(c, X86_FEATURE_ACPI))
162 buf[2] |= ACPI_PDC_T_FFH;
163
164 /*
165 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
166 */
167 if (!cpu_has(c, X86_FEATURE_MWAIT))
168 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
169}
170
145#else /* !CONFIG_ACPI */ 171#else /* !CONFIG_ACPI */
146 172
147#define acpi_lapic 0 173#define acpi_lapic 0
@@ -158,6 +184,7 @@ struct bootnode;
158 184
159#ifdef CONFIG_ACPI_NUMA 185#ifdef CONFIG_ACPI_NUMA
160extern int acpi_numa; 186extern int acpi_numa;
187extern int acpi_get_nodes(struct bootnode *physnodes);
161extern int acpi_scan_nodes(unsigned long start, unsigned long end); 188extern int acpi_scan_nodes(unsigned long start, unsigned long end);
162#define NR_NODE_MEMBLKS (MAX_NUMNODES*2) 189#define NR_NODE_MEMBLKS (MAX_NUMNODES*2)
163extern void acpi_fake_nodes(const struct bootnode *fake_nodes, 190extern void acpi_fake_nodes(const struct bootnode *fake_nodes,
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index e2077d343c33..b97f786a48d5 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -1,17 +1,13 @@
1#ifdef __ASSEMBLY__ 1#ifdef __ASSEMBLY__
2 2
3#ifdef CONFIG_X86_32 3#include <asm/asm.h>
4# define X86_ALIGN .long
5#else
6# define X86_ALIGN .quad
7#endif
8 4
9#ifdef CONFIG_SMP 5#ifdef CONFIG_SMP
10 .macro LOCK_PREFIX 6 .macro LOCK_PREFIX
111: lock 71: lock
12 .section .smp_locks,"a" 8 .section .smp_locks,"a"
13 .align 4 9 _ASM_ALIGN
14 X86_ALIGN 1b 10 _ASM_PTR 1b
15 .previous 11 .previous
16 .endm 12 .endm
17#else 13#else
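
The fixup addresses that LOCK_PREFIX records in .smp_locks have to be pointer-sized, so the local X86_ALIGN definition goes away in favour of the generic _ASM_ALIGN/_ASM_PTR helpers from <asm/asm.h>. Roughly, those helpers come down to something like the following (an illustrative sketch, not the actual asm/asm.h definitions):

	#ifdef CONFIG_X86_32
	# define _ASM_ALIGN	.balign 4
	# define _ASM_PTR	.long
	#else
	# define _ASM_ALIGN	.balign 8
	# define _ASM_PTR	.quad
	#endif

Either way the .smp_locks table ends up as an array of naturally aligned pointers to the patched lock prefixes.
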
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index c240efc74e00..b09ec55650b3 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -65,12 +65,17 @@ extern void alternatives_smp_module_add(struct module *mod, char *name,
65 void *text, void *text_end); 65 void *text, void *text_end);
66extern void alternatives_smp_module_del(struct module *mod); 66extern void alternatives_smp_module_del(struct module *mod);
67extern void alternatives_smp_switch(int smp); 67extern void alternatives_smp_switch(int smp);
68extern int alternatives_text_reserved(void *start, void *end);
68#else 69#else
69static inline void alternatives_smp_module_add(struct module *mod, char *name, 70static inline void alternatives_smp_module_add(struct module *mod, char *name,
70 void *locks, void *locks_end, 71 void *locks, void *locks_end,
71 void *text, void *text_end) {} 72 void *text, void *text_end) {}
72static inline void alternatives_smp_module_del(struct module *mod) {} 73static inline void alternatives_smp_module_del(struct module *mod) {}
73static inline void alternatives_smp_switch(int smp) {} 74static inline void alternatives_smp_switch(int smp) {}
75static inline int alternatives_text_reserved(void *start, void *end)
76{
77 return 0;
78}
74#endif /* CONFIG_SMP */ 79#endif /* CONFIG_SMP */
75 80
76/* alternative assembly primitive: */ 81/* alternative assembly primitive: */
@@ -84,6 +89,7 @@ static inline void alternatives_smp_switch(int smp) {}
84 " .byte " __stringify(feature) "\n" /* feature bit */ \ 89 " .byte " __stringify(feature) "\n" /* feature bit */ \
85 " .byte 662b-661b\n" /* sourcelen */ \ 90 " .byte 662b-661b\n" /* sourcelen */ \
86 " .byte 664f-663f\n" /* replacementlen */ \ 91 " .byte 664f-663f\n" /* replacementlen */ \
92 " .byte 0xff + (664f-663f) - (662b-661b)\n" /* rlen <= slen */ \
87 ".previous\n" \ 93 ".previous\n" \
88 ".section .altinstr_replacement, \"ax\"\n" \ 94 ".section .altinstr_replacement, \"ax\"\n" \
89 "663:\n\t" newinstr "\n664:\n" /* replacement */ \ 95 "663:\n\t" newinstr "\n664:\n" /* replacement */ \
@@ -124,11 +130,16 @@ static inline void alternatives_smp_switch(int smp) {}
124 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ 130 asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \
125 : output : "i" (0), ## input) 131 : output : "i" (0), ## input)
126 132
133/* Like alternative_io, but for replacing a direct call with another one. */
134#define alternative_call(oldfunc, newfunc, feature, output, input...) \
135 asm volatile (ALTERNATIVE("call %P[old]", "call %P[new]", feature) \
136 : output : [old] "i" (oldfunc), [new] "i" (newfunc), ## input)
137
127/* 138/*
128 * use this macro(s) if you need more than one output parameter 139 * use this macro(s) if you need more than one output parameter
129 * in alternative_io 140 * in alternative_io
130 */ 141 */
131#define ASM_OUTPUT2(a, b) a, b 142#define ASM_OUTPUT2(a...) a
132 143
133struct paravirt_patch_site; 144struct paravirt_patch_site;
134#ifdef CONFIG_PARAVIRT 145#ifdef CONFIG_PARAVIRT
@@ -154,10 +165,12 @@ static inline void apply_paravirt(struct paravirt_patch_site *start,
154 * invalid instruction possible) or if the instructions are changed from a 165 * invalid instruction possible) or if the instructions are changed from a
155 * consistent state to another consistent state atomically. 166 * consistent state to another consistent state atomically.
156 * More care must be taken when modifying code in the SMP case because of 167 * More care must be taken when modifying code in the SMP case because of
 157 * Intel's errata. 168 * Intel's errata. text_poke_smp() takes care of that errata, but still
 169 * doesn't support modifying NMI/MCE handler code.
158 * On the local CPU you need to be protected again NMI or MCE handlers seeing an 170 * On the local CPU you need to be protected again NMI or MCE handlers seeing an
159 * inconsistent instruction while you patch. 171 * inconsistent instruction while you patch.
160 */ 172 */
161extern void *text_poke(void *addr, const void *opcode, size_t len); 173extern void *text_poke(void *addr, const void *opcode, size_t len);
174extern void *text_poke_smp(void *addr, const void *opcode, size_t len);
162 175
163#endif /* _ASM_X86_ALTERNATIVE_H */ 176#endif /* _ASM_X86_ALTERNATIVE_H */
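For illustration only (not part of this patch), a minimal use of the new text_poke_smp() helper, which is safe against other CPUs executing the patched bytes but, per the comment above, not against NMI/MCE handlers:

static const unsigned char int3_insn[] = { 0xcc };      /* one-byte breakpoint */

static void plant_breakpoint(void *addr)                /* hypothetical caller */
{
        text_poke_smp(addr, int3_insn, sizeof(int3_insn));
}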
diff --git a/arch/x86/include/asm/amd_iommu.h b/arch/x86/include/asm/amd_iommu.h
index 4b180897e6b5..5af2982133b5 100644
--- a/arch/x86/include/asm/amd_iommu.h
+++ b/arch/x86/include/asm/amd_iommu.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -23,19 +23,13 @@
23#include <linux/irqreturn.h> 23#include <linux/irqreturn.h>
24 24
25#ifdef CONFIG_AMD_IOMMU 25#ifdef CONFIG_AMD_IOMMU
26extern int amd_iommu_init(void); 26
27extern int amd_iommu_init_dma_ops(void);
28extern int amd_iommu_init_passthrough(void);
29extern void amd_iommu_detect(void); 27extern void amd_iommu_detect(void);
30extern irqreturn_t amd_iommu_int_handler(int irq, void *data); 28
31extern void amd_iommu_flush_all_domains(void);
32extern void amd_iommu_flush_all_devices(void);
33extern void amd_iommu_shutdown(void);
34extern void amd_iommu_apply_erratum_63(u16 devid);
35#else 29#else
36static inline int amd_iommu_init(void) { return -ENODEV; } 30
37static inline void amd_iommu_detect(void) { } 31static inline void amd_iommu_detect(void) { }
38static inline void amd_iommu_shutdown(void) { } 32
39#endif 33#endif
40 34
41#endif /* _ASM_X86_AMD_IOMMU_H */ 35#endif /* _ASM_X86_AMD_IOMMU_H */
diff --git a/arch/x86/include/asm/amd_iommu_proto.h b/arch/x86/include/asm/amd_iommu_proto.h
new file mode 100644
index 000000000000..d2544f1d705d
--- /dev/null
+++ b/arch/x86/include/asm/amd_iommu_proto.h
@@ -0,0 +1,41 @@
1/*
2 * Copyright (C) 2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 as published
7 * by the Free Software Foundation.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 */
18
19#ifndef _ASM_X86_AMD_IOMMU_PROTO_H
20#define _ASM_X86_AMD_IOMMU_PROTO_H
21
22struct amd_iommu;
23
24extern int amd_iommu_init_dma_ops(void);
25extern int amd_iommu_init_passthrough(void);
26extern irqreturn_t amd_iommu_int_handler(int irq, void *data);
27extern void amd_iommu_flush_all_domains(void);
28extern void amd_iommu_flush_all_devices(void);
29extern void amd_iommu_apply_erratum_63(u16 devid);
30extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
31extern int amd_iommu_init_devices(void);
32extern void amd_iommu_uninit_devices(void);
33extern void amd_iommu_init_notifier(void);
34extern void amd_iommu_init_api(void);
35#ifndef CONFIG_AMD_IOMMU_STATS
36
37static inline void amd_iommu_stats_init(void) { }
38
39#endif /* !CONFIG_AMD_IOMMU_STATS */
40
41#endif /* _ASM_X86_AMD_IOMMU_PROTO_H */
diff --git a/arch/x86/include/asm/amd_iommu_types.h b/arch/x86/include/asm/amd_iommu_types.h
index 2a2cc7a78a81..86a0ff0aeac7 100644
--- a/arch/x86/include/asm/amd_iommu_types.h
+++ b/arch/x86/include/asm/amd_iommu_types.h
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -21,10 +21,16 @@
21#define _ASM_X86_AMD_IOMMU_TYPES_H 21#define _ASM_X86_AMD_IOMMU_TYPES_H
22 22
23#include <linux/types.h> 23#include <linux/types.h>
24#include <linux/mutex.h>
24#include <linux/list.h> 25#include <linux/list.h>
25#include <linux/spinlock.h> 26#include <linux/spinlock.h>
26 27
27/* 28/*
29 * Maximum number of IOMMUs supported
30 */
31#define MAX_IOMMUS 32
32
33/*
28 * some size calculation constants 34 * some size calculation constants
29 */ 35 */
30#define DEV_TABLE_ENTRY_SIZE 32 36#define DEV_TABLE_ENTRY_SIZE 32
@@ -135,6 +141,7 @@
135 141
136/* constants to configure the command buffer */ 142/* constants to configure the command buffer */
137#define CMD_BUFFER_SIZE 8192 143#define CMD_BUFFER_SIZE 8192
144#define CMD_BUFFER_UNINITIALIZED 1
138#define CMD_BUFFER_ENTRIES 512 145#define CMD_BUFFER_ENTRIES 512
139#define MMIO_CMD_SIZE_SHIFT 56 146#define MMIO_CMD_SIZE_SHIFT 56
140#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT) 147#define MMIO_CMD_SIZE_512 (0x9ULL << MMIO_CMD_SIZE_SHIFT)
@@ -206,6 +213,9 @@ extern bool amd_iommu_dump;
206 printk(KERN_INFO "AMD-Vi: " format, ## arg); \ 213 printk(KERN_INFO "AMD-Vi: " format, ## arg); \
207 } while(0); 214 } while(0);
208 215
216/* global flag if IOMMUs cache non-present entries */
217extern bool amd_iommu_np_cache;
218
209/* 219/*
210 * Make iterating over all IOMMUs easier 220 * Make iterating over all IOMMUs easier
211 */ 221 */
@@ -226,14 +236,30 @@ extern bool amd_iommu_dump;
226 * independent of their use. 236 * independent of their use.
227 */ 237 */
228struct protection_domain { 238struct protection_domain {
239 struct list_head list; /* for list of all protection domains */
240 struct list_head dev_list; /* List of all devices in this domain */
229 spinlock_t lock; /* mostly used to lock the page table*/ 241 spinlock_t lock; /* mostly used to lock the page table*/
242 struct mutex api_lock; /* protect page tables in the iommu-api path */
230 u16 id; /* the domain id written to the device table */ 243 u16 id; /* the domain id written to the device table */
231 int mode; /* paging mode (0-6 levels) */ 244 int mode; /* paging mode (0-6 levels) */
232 u64 *pt_root; /* page table root pointer */ 245 u64 *pt_root; /* page table root pointer */
233 unsigned long flags; /* flags to find out type of domain */ 246 unsigned long flags; /* flags to find out type of domain */
234 bool updated; /* complete domain flush required */ 247 bool updated; /* complete domain flush required */
235 unsigned dev_cnt; /* devices assigned to this domain */ 248 unsigned dev_cnt; /* devices assigned to this domain */
249 unsigned dev_iommu[MAX_IOMMUS]; /* per-IOMMU reference count */
236 void *priv; /* private data */ 250 void *priv; /* private data */
251
252};
253
254/*
255 * This struct contains device specific data for the IOMMU
256 */
257struct iommu_dev_data {
258 struct list_head list; /* For domain->dev_list */
259 struct device *dev; /* Device this data belongs to */
260 struct device *alias; /* The Alias Device */
261 struct protection_domain *domain; /* Domain the device is bound to */
262 atomic_t bind; /* Domain attach reference count */
237}; 263};
238 264
239/* 265/*
@@ -291,6 +317,9 @@ struct dma_ops_domain {
291struct amd_iommu { 317struct amd_iommu {
292 struct list_head list; 318 struct list_head list;
293 319
320 /* Index within the IOMMU array */
321 int index;
322
294 /* locks the accesses to the hardware */ 323 /* locks the accesses to the hardware */
295 spinlock_t lock; 324 spinlock_t lock;
296 325
@@ -357,6 +386,21 @@ struct amd_iommu {
357extern struct list_head amd_iommu_list; 386extern struct list_head amd_iommu_list;
358 387
359/* 388/*
389 * Array with pointers to each IOMMU struct
390 * The indices are referenced in the protection domains
391 */
392extern struct amd_iommu *amd_iommus[MAX_IOMMUS];
393
394/* Number of IOMMUs present in the system */
395extern int amd_iommus_present;
396
397/*
398 * Declarations for the global list of all protection domains
399 */
400extern spinlock_t amd_iommu_pd_lock;
401extern struct list_head amd_iommu_pd_list;
402
403/*
360 * Structure defining one entry in the device table 404 * Structure defining one entry in the device table
361 */ 405 */
362struct dev_table_entry { 406struct dev_table_entry {
@@ -416,15 +460,9 @@ extern unsigned amd_iommu_aperture_order;
416/* largest PCI device id we expect translation requests for */ 460/* largest PCI device id we expect translation requests for */
417extern u16 amd_iommu_last_bdf; 461extern u16 amd_iommu_last_bdf;
418 462
419/* data structures for protection domain handling */
420extern struct protection_domain **amd_iommu_pd_table;
421
422/* allocation bitmap for domain ids */ 463/* allocation bitmap for domain ids */
423extern unsigned long *amd_iommu_pd_alloc_bitmap; 464extern unsigned long *amd_iommu_pd_alloc_bitmap;
424 465
425/* will be 1 if device isolation is enabled */
426extern bool amd_iommu_isolate;
427
428/* 466/*
429 * If true, the addresses will be flushed on unmap time, not when 467 * If true, the addresses will be flushed on unmap time, not when
430 * they are reused 468 * they are reused
@@ -462,11 +500,6 @@ struct __iommu_counter {
462#define ADD_STATS_COUNTER(name, x) 500#define ADD_STATS_COUNTER(name, x)
463#define SUB_STATS_COUNTER(name, x) 501#define SUB_STATS_COUNTER(name, x)
464 502
465static inline void amd_iommu_stats_init(void) { }
466
467#endif /* CONFIG_AMD_IOMMU_STATS */ 503#endif /* CONFIG_AMD_IOMMU_STATS */
468 504
469/* some function prototypes */
470extern void amd_iommu_reset_cmd_buffer(struct amd_iommu *iommu);
471
472#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */ 505#endif /* _ASM_X86_AMD_IOMMU_TYPES_H */
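With the new per-domain dev_list and iommu_dev_data linkage replacing the old global amd_iommu_pd_table, a per-domain device walk can be sketched roughly as below (the helper is hypothetical; the field names are the ones declared above):

static void report_domain_devices(struct protection_domain *domain)
{
        struct iommu_dev_data *dev_data;
        unsigned long flags;

        spin_lock_irqsave(&domain->lock, flags);
        list_for_each_entry(dev_data, &domain->dev_list, list)
                dev_info(dev_data->dev, "bound to protection domain %u\n",
                         domain->id);
        spin_unlock_irqrestore(&domain->lock, flags);
}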
diff --git a/arch/x86/include/asm/apb_timer.h b/arch/x86/include/asm/apb_timer.h
new file mode 100644
index 000000000000..c74a2eebe570
--- /dev/null
+++ b/arch/x86/include/asm/apb_timer.h
@@ -0,0 +1,70 @@
1/*
2 * apb_timer.h: Driver for Langwell APB timer based on Synopsys DesignWare
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 */
14
15#ifndef ASM_X86_APBT_H
16#define ASM_X86_APBT_H
17#include <linux/sfi.h>
18
19#ifdef CONFIG_APB_TIMER
20
21/* Langwell DW APB timer registers */
22#define APBTMR_N_LOAD_COUNT 0x00
23#define APBTMR_N_CURRENT_VALUE 0x04
24#define APBTMR_N_CONTROL 0x08
25#define APBTMR_N_EOI 0x0c
26#define APBTMR_N_INT_STATUS 0x10
27
28#define APBTMRS_INT_STATUS 0xa0
29#define APBTMRS_EOI 0xa4
30#define APBTMRS_RAW_INT_STATUS 0xa8
31#define APBTMRS_COMP_VERSION 0xac
32#define APBTMRS_REG_SIZE 0x14
33
34/* register bits */
35#define APBTMR_CONTROL_ENABLE (1<<0)
36#define APBTMR_CONTROL_MODE_PERIODIC (1<<1) /* 1: periodic, 0: free running */
37#define APBTMR_CONTROL_INT (1<<2)
38
39/* default memory mapped register base */
40#define LNW_SCU_ADDR 0xFF100000
41#define LNW_EXT_TIMER_OFFSET 0x1B800
42#define APBT_DEFAULT_BASE (LNW_SCU_ADDR+LNW_EXT_TIMER_OFFSET)
43#define LNW_EXT_TIMER_PGOFFSET 0x800
44
45/* APBT clock speed range from PCLK to fabric base, 25-100MHz */
46#define APBT_MAX_FREQ 50
47#define APBT_MIN_FREQ 1
48#define APBT_MMAP_SIZE 1024
49
50#define APBT_DEV_USED 1
51
52extern void apbt_time_init(void);
53extern struct clock_event_device *global_clock_event;
54extern unsigned long apbt_quick_calibrate(void);
55extern int arch_setup_apbt_irqs(int irq, int trigger, int mask, int cpu);
56extern void apbt_setup_secondary_clock(void);
57extern unsigned int boot_cpu_id;
58extern int disable_apbt_percpu;
59
60extern struct sfi_timer_table_entry *sfi_get_mtmr(int hint);
61extern void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr);
62extern int sfi_mtimer_num;
63
64#else /* CONFIG_APB_TIMER */
65
66static inline unsigned long apbt_quick_calibrate(void) {return 0; }
67static inline void apbt_time_init(void) { }
68
69#endif
70#endif /* ASM_X86_APBT_H */
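The per-timer registers above are laid out APBTMRS_REG_SIZE apart from a single ioremapped block; a hedged sketch of a register accessor (the base pointer and its setup are assumptions, not part of this header):

static void __iomem *apbt_virt_base;    /* e.g. ioremap(APBT_DEFAULT_BASE, APBT_MMAP_SIZE) */

static inline u32 apbt_readl(int timer_n, unsigned long offs)
{
        return readl(apbt_virt_base + timer_n * APBTMRS_REG_SIZE + offs);
}

/* e.g. apbt_readl(0, APBTMR_N_CURRENT_VALUE) reads timer 0's down-counter */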
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index 474d80d3e6cc..b4ac2cdcb64f 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -297,20 +297,20 @@ struct apic {
297 int disable_esr; 297 int disable_esr;
298 298
299 int dest_logical; 299 int dest_logical;
300 unsigned long (*check_apicid_used)(physid_mask_t bitmap, int apicid); 300 unsigned long (*check_apicid_used)(physid_mask_t *map, int apicid);
301 unsigned long (*check_apicid_present)(int apicid); 301 unsigned long (*check_apicid_present)(int apicid);
302 302
303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask); 303 void (*vector_allocation_domain)(int cpu, struct cpumask *retmask);
304 void (*init_apic_ldr)(void); 304 void (*init_apic_ldr)(void);
305 305
306 physid_mask_t (*ioapic_phys_id_map)(physid_mask_t map); 306 void (*ioapic_phys_id_map)(physid_mask_t *phys_map, physid_mask_t *retmap);
307 307
308 void (*setup_apic_routing)(void); 308 void (*setup_apic_routing)(void);
309 int (*multi_timer_check)(int apic, int irq); 309 int (*multi_timer_check)(int apic, int irq);
310 int (*apicid_to_node)(int logical_apicid); 310 int (*apicid_to_node)(int logical_apicid);
311 int (*cpu_to_logical_apicid)(int cpu); 311 int (*cpu_to_logical_apicid)(int cpu);
312 int (*cpu_present_to_apicid)(int mps_cpu); 312 int (*cpu_present_to_apicid)(int mps_cpu);
313 physid_mask_t (*apicid_to_cpu_present)(int phys_apicid); 313 void (*apicid_to_cpu_present)(int phys_apicid, physid_mask_t *retmap);
314 void (*setup_portio_remap)(void); 314 void (*setup_portio_remap)(void);
315 int (*check_phys_apicid_present)(int phys_apicid); 315 int (*check_phys_apicid_present)(int phys_apicid);
316 void (*enable_apic_mode)(void); 316 void (*enable_apic_mode)(void);
@@ -488,6 +488,8 @@ static inline unsigned int read_apic_id(void)
488 488
489extern void default_setup_apic_routing(void); 489extern void default_setup_apic_routing(void);
490 490
491extern struct apic apic_noop;
492
491#ifdef CONFIG_X86_32 493#ifdef CONFIG_X86_32
492 494
493extern struct apic apic_default; 495extern struct apic apic_default;
@@ -532,9 +534,9 @@ default_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
532 return (unsigned int)(mask1 & mask2 & mask3); 534 return (unsigned int)(mask1 & mask2 & mask3);
533} 535}
534 536
535static inline unsigned long default_check_apicid_used(physid_mask_t bitmap, int apicid) 537static inline unsigned long default_check_apicid_used(physid_mask_t *map, int apicid)
536{ 538{
537 return physid_isset(apicid, bitmap); 539 return physid_isset(apicid, *map);
538} 540}
539 541
540static inline unsigned long default_check_apicid_present(int bit) 542static inline unsigned long default_check_apicid_present(int bit)
@@ -542,9 +544,9 @@ static inline unsigned long default_check_apicid_present(int bit)
542 return physid_isset(bit, phys_cpu_present_map); 544 return physid_isset(bit, phys_cpu_present_map);
543} 545}
544 546
545static inline physid_mask_t default_ioapic_phys_id_map(physid_mask_t phys_map) 547static inline void default_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
546{ 548{
547 return phys_map; 549 *retmap = *phys_map;
548} 550}
549 551
550/* Mapping from cpu number to logical apicid */ 552/* Mapping from cpu number to logical apicid */
@@ -583,11 +585,6 @@ extern int default_cpu_present_to_apicid(int mps_cpu);
583extern int default_check_phys_apicid_present(int phys_apicid); 585extern int default_check_phys_apicid_present(int phys_apicid);
584#endif 586#endif
585 587
586static inline physid_mask_t default_apicid_to_cpu_present(int phys_apicid)
587{
588 return physid_mask_of_physid(phys_apicid);
589}
590
591#endif /* CONFIG_X86_LOCAL_APIC */ 588#endif /* CONFIG_X86_LOCAL_APIC */
592 589
593#ifdef CONFIG_X86_32 590#ifdef CONFIG_X86_32
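The reworked callbacks fill a caller-supplied physid_mask_t instead of returning the (large) mask by value; an implementation under the new apicid_to_cpu_present signature might look like the sketch below (the function name is hypothetical, mirroring what the removed default_apicid_to_cpu_present() did):

static void example_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
{
        physids_clear(*retmap);
        physid_set(phys_apicid, *retmap);
}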
diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 3b62da926de9..7fe3b3060f08 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -11,6 +11,12 @@
11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000 11#define IO_APIC_DEFAULT_PHYS_BASE 0xfec00000
12#define APIC_DEFAULT_PHYS_BASE 0xfee00000 12#define APIC_DEFAULT_PHYS_BASE 0xfee00000
13 13
14/*
15 * This is the IO-APIC register space as specified
16 * by Intel docs:
17 */
18#define IO_APIC_SLOT_SIZE 1024
19
14#define APIC_ID 0x20 20#define APIC_ID 0x20
15 21
16#define APIC_LVR 0x30 22#define APIC_LVR 0x30
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
deleted file mode 100644
index 82f613c607ce..000000000000
--- a/arch/x86/include/asm/apicnum.h
+++ /dev/null
@@ -1,12 +0,0 @@
1#ifndef _ASM_X86_APICNUM_H
2#define _ASM_X86_APICNUM_H
3
4/* define MAX_IO_APICS */
5#ifdef CONFIG_X86_32
6# define MAX_IO_APICS 64
7#else
8# define MAX_IO_APICS 128
9# define MAX_LOCAL_APIC 32768
10#endif
11
12#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/asm-offsets.h b/arch/x86/include/asm/asm-offsets.h
new file mode 100644
index 000000000000..d370ee36a182
--- /dev/null
+++ b/arch/x86/include/asm/asm-offsets.h
@@ -0,0 +1 @@
#include <generated/asm-offsets.h>
diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index 4e1b8873c474..8f8217b9bdac 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -1,5 +1,300 @@
1#ifndef _ASM_X86_ATOMIC_H
2#define _ASM_X86_ATOMIC_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/alternative.h>
8#include <asm/cmpxchg.h>
9
10/*
11 * Atomic operations that C can't guarantee us. Useful for
12 * resource counting etc..
13 */
14
15#define ATOMIC_INIT(i) { (i) }
16
17/**
18 * atomic_read - read atomic variable
19 * @v: pointer of type atomic_t
20 *
21 * Atomically reads the value of @v.
22 */
23static inline int atomic_read(const atomic_t *v)
24{
25 return v->counter;
26}
27
28/**
29 * atomic_set - set atomic variable
30 * @v: pointer of type atomic_t
31 * @i: required value
32 *
33 * Atomically sets the value of @v to @i.
34 */
35static inline void atomic_set(atomic_t *v, int i)
36{
37 v->counter = i;
38}
39
40/**
41 * atomic_add - add integer to atomic variable
42 * @i: integer value to add
43 * @v: pointer of type atomic_t
44 *
45 * Atomically adds @i to @v.
46 */
47static inline void atomic_add(int i, atomic_t *v)
48{
49 asm volatile(LOCK_PREFIX "addl %1,%0"
50 : "+m" (v->counter)
51 : "ir" (i));
52}
53
54/**
55 * atomic_sub - subtract integer from atomic variable
56 * @i: integer value to subtract
57 * @v: pointer of type atomic_t
58 *
59 * Atomically subtracts @i from @v.
60 */
61static inline void atomic_sub(int i, atomic_t *v)
62{
63 asm volatile(LOCK_PREFIX "subl %1,%0"
64 : "+m" (v->counter)
65 : "ir" (i));
66}
67
68/**
69 * atomic_sub_and_test - subtract value from variable and test result
70 * @i: integer value to subtract
71 * @v: pointer of type atomic_t
72 *
73 * Atomically subtracts @i from @v and returns
74 * true if the result is zero, or false for all
75 * other cases.
76 */
77static inline int atomic_sub_and_test(int i, atomic_t *v)
78{
79 unsigned char c;
80
81 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
82 : "+m" (v->counter), "=qm" (c)
83 : "ir" (i) : "memory");
84 return c;
85}
86
87/**
88 * atomic_inc - increment atomic variable
89 * @v: pointer of type atomic_t
90 *
91 * Atomically increments @v by 1.
92 */
93static inline void atomic_inc(atomic_t *v)
94{
95 asm volatile(LOCK_PREFIX "incl %0"
96 : "+m" (v->counter));
97}
98
99/**
100 * atomic_dec - decrement atomic variable
101 * @v: pointer of type atomic_t
102 *
103 * Atomically decrements @v by 1.
104 */
105static inline void atomic_dec(atomic_t *v)
106{
107 asm volatile(LOCK_PREFIX "decl %0"
108 : "+m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "+m" (v->counter), "=qm" (c)
125 : : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "+m" (v->counter), "=qm" (c)
143 : : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "+m" (v->counter), "=qm" (c)
162 : "ir" (i) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add integer and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i;
176#ifdef CONFIG_M386
177 unsigned long flags;
178 if (unlikely(boot_cpu_data.x86 <= 3))
179 goto no_xadd;
180#endif
181 /* Modern 486+ processor */
182 __i = i;
183 asm volatile(LOCK_PREFIX "xaddl %0, %1"
184 : "+r" (i), "+m" (v->counter)
185 : : "memory");
186 return i + __i;
187
188#ifdef CONFIG_M386
189no_xadd: /* Legacy 386 processor */
190 raw_local_irq_save(flags);
191 __i = atomic_read(v);
192 atomic_set(v, i + __i);
193 raw_local_irq_restore(flags);
194 return i + __i;
195#endif
196}
197
198/**
199 * atomic_sub_return - subtract integer and return
200 * @v: pointer of type atomic_t
201 * @i: integer value to subtract
202 *
203 * Atomically subtracts @i from @v and returns @v - @i
204 */
205static inline int atomic_sub_return(int i, atomic_t *v)
206{
207 return atomic_add_return(-i, v);
208}
209
210#define atomic_inc_return(v) (atomic_add_return(1, v))
211#define atomic_dec_return(v) (atomic_sub_return(1, v))
212
213static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
214{
215 return cmpxchg(&v->counter, old, new);
216}
217
218static inline int atomic_xchg(atomic_t *v, int new)
219{
220 return xchg(&v->counter, new);
221}
222
223/**
224 * atomic_add_unless - add unless the number is already a given value
225 * @v: pointer of type atomic_t
226 * @a: the amount to add to v...
227 * @u: ...unless v is equal to u.
228 *
229 * Atomically adds @a to @v, so long as @v was not already @u.
230 * Returns non-zero if @v was not @u, and zero otherwise.
231 */
232static inline int atomic_add_unless(atomic_t *v, int a, int u)
233{
234 int c, old;
235 c = atomic_read(v);
236 for (;;) {
237 if (unlikely(c == (u)))
238 break;
239 old = atomic_cmpxchg((v), c, c + (a));
240 if (likely(old == c))
241 break;
242 c = old;
243 }
244 return c != (u);
245}
246
247#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
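atomic_add_unless()/atomic_inc_not_zero() are what make the usual "take a reference only while the object is still live" pattern work; a hedged sketch (the struct and helper names are hypothetical):

struct obj {
        atomic_t refcount;              /* reaching 0 starts teardown */
};

static int obj_tryget(struct obj *o)
{
        /* Returns 0 once the count has already dropped to zero. */
        return atomic_inc_not_zero(&o->refcount);
}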
248
249/**
250 * atomic_inc_short - increment of a short integer
251 * @v: pointer to type short int
252 *
253 * Atomically adds 1 to @v
254 * Returns the new value of @v
255 */
256static inline short int atomic_inc_short(short int *v)
257{
258 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
259 return *v;
260}
261
262#ifdef CONFIG_X86_64
263/**
264 * atomic_or_long - OR of two long integers
265 * @v1: pointer to type unsigned long
266 * @v2: pointer to type unsigned long
267 *
268 * Atomically ORs @v1 and @v2
269 * Returns the result of the OR
270 */
271static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
272{
273 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
274}
275#endif
276
277/* These are x86-specific, used by some header files */
278#define atomic_clear_mask(mask, addr) \
279 asm volatile(LOCK_PREFIX "andl %0,%1" \
280 : : "r" (~(mask)), "m" (*(addr)) : "memory")
281
282#define atomic_set_mask(mask, addr) \
283 asm volatile(LOCK_PREFIX "orl %0,%1" \
284 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
285 : "memory")
286
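atomic_set_mask()/atomic_clear_mask() operate on a plain 32-bit word through a pointer, not on an atomic_t; a hedged sketch of typical use (the flag word and bit layout are made up for illustration):

static unsigned int irq_pending;        /* hypothetical status word */

static inline void mark_pending(unsigned int bit)
{
        atomic_set_mask(1U << bit, &irq_pending);       /* LOCK orl */
}

static inline void clear_pending(unsigned int bit)
{
        atomic_clear_mask(1U << bit, &irq_pending);     /* LOCK andl with ~mask */
}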
287/* Atomic operations are already serializing on x86 */
288#define smp_mb__before_atomic_dec() barrier()
289#define smp_mb__after_atomic_dec() barrier()
290#define smp_mb__before_atomic_inc() barrier()
291#define smp_mb__after_atomic_inc() barrier()
292
1#ifdef CONFIG_X86_32 293#ifdef CONFIG_X86_32
2# include "atomic_32.h" 294# include "atomic64_32.h"
3#else 295#else
4# include "atomic_64.h" 296# include "atomic64_64.h"
5#endif 297#endif
298
299#include <asm-generic/atomic-long.h>
300#endif /* _ASM_X86_ATOMIC_H */
diff --git a/arch/x86/include/asm/atomic64_32.h b/arch/x86/include/asm/atomic64_32.h
new file mode 100644
index 000000000000..03027bf28de5
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_32.h
@@ -0,0 +1,160 @@
1#ifndef _ASM_X86_ATOMIC64_32_H
2#define _ASM_X86_ATOMIC64_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7//#include <asm/cmpxchg.h>
8
9/* A 64-bit atomic type */
10
11typedef struct {
12 u64 __aligned(8) counter;
13} atomic64_t;
14
15#define ATOMIC64_INIT(val) { (val) }
16
17extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
18
19/**
20 * atomic64_xchg - xchg atomic64 variable
21 * @ptr: pointer to type atomic64_t
22 * @new_val: value to assign
23 *
24 * Atomically xchgs the value of @ptr to @new_val and returns
25 * the old value.
26 */
27extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
28
29/**
30 * atomic64_set - set atomic64 variable
31 * @ptr: pointer to type atomic64_t
32 * @new_val: value to assign
33 *
34 * Atomically sets the value of @ptr to @new_val.
35 */
36extern void atomic64_set(atomic64_t *ptr, u64 new_val);
37
38/**
39 * atomic64_read - read atomic64 variable
40 * @ptr: pointer to type atomic64_t
41 *
42 * Atomically reads the value of @ptr and returns it.
43 */
44static inline u64 atomic64_read(atomic64_t *ptr)
45{
46 u64 res;
47
48 /*
49 * Note, we inline this atomic64_t primitive because
50 * it only clobbers EAX/EDX and leaves the others
51 * untouched. We also (somewhat subtly) rely on the
52 * fact that cmpxchg8b returns the current 64-bit value
53 * of the memory location we are touching:
54 */
55 asm volatile(
56 "mov %%ebx, %%eax\n\t"
57 "mov %%ecx, %%edx\n\t"
58 LOCK_PREFIX "cmpxchg8b %1\n"
59 : "=&A" (res)
60 : "m" (*ptr)
61 );
62
63 return res;
64}
65
66extern u64 atomic64_read(atomic64_t *ptr);
67
68/**
69 * atomic64_add_return - add and return
70 * @delta: integer value to add
71 * @ptr: pointer to type atomic64_t
72 *
73 * Atomically adds @delta to @ptr and returns @delta + *@ptr
74 */
75extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
76
77/*
78 * Other variants with different arithmetic operators:
79 */
80extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
81extern u64 atomic64_inc_return(atomic64_t *ptr);
82extern u64 atomic64_dec_return(atomic64_t *ptr);
83
84/**
85 * atomic64_add - add integer to atomic64 variable
86 * @delta: integer value to add
87 * @ptr: pointer to type atomic64_t
88 *
89 * Atomically adds @delta to @ptr.
90 */
91extern void atomic64_add(u64 delta, atomic64_t *ptr);
92
93/**
94 * atomic64_sub - subtract the atomic64 variable
95 * @delta: integer value to subtract
96 * @ptr: pointer to type atomic64_t
97 *
98 * Atomically subtracts @delta from @ptr.
99 */
100extern void atomic64_sub(u64 delta, atomic64_t *ptr);
101
102/**
103 * atomic64_sub_and_test - subtract value from variable and test result
104 * @delta: integer value to subtract
105 * @ptr: pointer to type atomic64_t
106 *
107 * Atomically subtracts @delta from @ptr and returns
108 * true if the result is zero, or false for all
109 * other cases.
110 */
111extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
112
113/**
114 * atomic64_inc - increment atomic64 variable
115 * @ptr: pointer to type atomic64_t
116 *
117 * Atomically increments @ptr by 1.
118 */
119extern void atomic64_inc(atomic64_t *ptr);
120
121/**
122 * atomic64_dec - decrement atomic64 variable
123 * @ptr: pointer to type atomic64_t
124 *
125 * Atomically decrements @ptr by 1.
126 */
127extern void atomic64_dec(atomic64_t *ptr);
128
129/**
130 * atomic64_dec_and_test - decrement and test
131 * @ptr: pointer to type atomic64_t
132 *
133 * Atomically decrements @ptr by 1 and
134 * returns true if the result is 0, or false for all other
135 * cases.
136 */
137extern int atomic64_dec_and_test(atomic64_t *ptr);
138
139/**
140 * atomic64_inc_and_test - increment and test
141 * @ptr: pointer to type atomic64_t
142 *
143 * Atomically increments @ptr by 1
144 * and returns true if the result is zero, or false for all
145 * other cases.
146 */
147extern int atomic64_inc_and_test(atomic64_t *ptr);
148
149/**
150 * atomic64_add_negative - add and test if negative
151 * @delta: integer value to add
152 * @ptr: pointer to type atomic64_t
153 *
154 * Atomically adds @delta to @ptr and returns true
155 * if the result is negative, or false when
156 * result is greater than or equal to zero.
157 */
158extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
159
160#endif /* _ASM_X86_ATOMIC64_32_H */
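Apart from atomic64_read(), these 32-bit atomic64_t operations are out of line (built on cmpxchg8b), but callers use them exactly like the native 64-bit versions; a small hedged sketch of a wrap-free event counter:

static atomic64_t nr_events = ATOMIC64_INIT(0);

static inline void note_event(void)
{
        atomic64_inc(&nr_events);
}

static inline u64 events_so_far(void)
{
        return atomic64_read(&nr_events);
}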
diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h
new file mode 100644
index 000000000000..51c5b4056929
--- /dev/null
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -0,0 +1,224 @@
1#ifndef _ASM_X86_ATOMIC64_64_H
2#define _ASM_X86_ATOMIC64_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/* The 64-bit atomic type */
9
10#define ATOMIC64_INIT(i) { (i) }
11
12/**
13 * atomic64_read - read atomic64 variable
14 * @v: pointer of type atomic64_t
15 *
16 * Atomically reads the value of @v.
17 * Doesn't imply a read memory barrier.
18 */
19static inline long atomic64_read(const atomic64_t *v)
20{
21 return v->counter;
22}
23
24/**
25 * atomic64_set - set atomic64 variable
26 * @v: pointer to type atomic64_t
27 * @i: required value
28 *
29 * Atomically sets the value of @v to @i.
30 */
31static inline void atomic64_set(atomic64_t *v, long i)
32{
33 v->counter = i;
34}
35
36/**
37 * atomic64_add - add integer to atomic64 variable
38 * @i: integer value to add
39 * @v: pointer to type atomic64_t
40 *
41 * Atomically adds @i to @v.
42 */
43static inline void atomic64_add(long i, atomic64_t *v)
44{
45 asm volatile(LOCK_PREFIX "addq %1,%0"
46 : "=m" (v->counter)
47 : "er" (i), "m" (v->counter));
48}
49
50/**
51 * atomic64_sub - subtract the atomic64 variable
52 * @i: integer value to subtract
53 * @v: pointer to type atomic64_t
54 *
55 * Atomically subtracts @i from @v.
56 */
57static inline void atomic64_sub(long i, atomic64_t *v)
58{
59 asm volatile(LOCK_PREFIX "subq %1,%0"
60 : "=m" (v->counter)
61 : "er" (i), "m" (v->counter));
62}
63
64/**
65 * atomic64_sub_and_test - subtract value from variable and test result
66 * @i: integer value to subtract
67 * @v: pointer to type atomic64_t
68 *
69 * Atomically subtracts @i from @v and returns
70 * true if the result is zero, or false for all
71 * other cases.
72 */
73static inline int atomic64_sub_and_test(long i, atomic64_t *v)
74{
75 unsigned char c;
76
77 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
78 : "=m" (v->counter), "=qm" (c)
79 : "er" (i), "m" (v->counter) : "memory");
80 return c;
81}
82
83/**
84 * atomic64_inc - increment atomic64 variable
85 * @v: pointer to type atomic64_t
86 *
87 * Atomically increments @v by 1.
88 */
89static inline void atomic64_inc(atomic64_t *v)
90{
91 asm volatile(LOCK_PREFIX "incq %0"
92 : "=m" (v->counter)
93 : "m" (v->counter));
94}
95
96/**
97 * atomic64_dec - decrement atomic64 variable
98 * @v: pointer to type atomic64_t
99 *
100 * Atomically decrements @v by 1.
101 */
102static inline void atomic64_dec(atomic64_t *v)
103{
104 asm volatile(LOCK_PREFIX "decq %0"
105 : "=m" (v->counter)
106 : "m" (v->counter));
107}
108
109/**
110 * atomic64_dec_and_test - decrement and test
111 * @v: pointer to type atomic64_t
112 *
113 * Atomically decrements @v by 1 and
114 * returns true if the result is 0, or false for all other
115 * cases.
116 */
117static inline int atomic64_dec_and_test(atomic64_t *v)
118{
119 unsigned char c;
120
121 asm volatile(LOCK_PREFIX "decq %0; sete %1"
122 : "=m" (v->counter), "=qm" (c)
123 : "m" (v->counter) : "memory");
124 return c != 0;
125}
126
127/**
128 * atomic64_inc_and_test - increment and test
129 * @v: pointer to type atomic64_t
130 *
131 * Atomically increments @v by 1
132 * and returns true if the result is zero, or false for all
133 * other cases.
134 */
135static inline int atomic64_inc_and_test(atomic64_t *v)
136{
137 unsigned char c;
138
139 asm volatile(LOCK_PREFIX "incq %0; sete %1"
140 : "=m" (v->counter), "=qm" (c)
141 : "m" (v->counter) : "memory");
142 return c != 0;
143}
144
145/**
146 * atomic64_add_negative - add and test if negative
147 * @i: integer value to add
148 * @v: pointer to type atomic64_t
149 *
150 * Atomically adds @i to @v and returns true
151 * if the result is negative, or false when
152 * result is greater than or equal to zero.
153 */
154static inline int atomic64_add_negative(long i, atomic64_t *v)
155{
156 unsigned char c;
157
158 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
159 : "=m" (v->counter), "=qm" (c)
160 : "er" (i), "m" (v->counter) : "memory");
161 return c;
162}
163
164/**
165 * atomic64_add_return - add and return
166 * @i: integer value to add
167 * @v: pointer to type atomic64_t
168 *
169 * Atomically adds @i to @v and returns @i + @v
170 */
171static inline long atomic64_add_return(long i, atomic64_t *v)
172{
173 long __i = i;
174 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
175 : "+r" (i), "+m" (v->counter)
176 : : "memory");
177 return i + __i;
178}
179
180static inline long atomic64_sub_return(long i, atomic64_t *v)
181{
182 return atomic64_add_return(-i, v);
183}
184
185#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
186#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
187
188static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
189{
190 return cmpxchg(&v->counter, old, new);
191}
192
193static inline long atomic64_xchg(atomic64_t *v, long new)
194{
195 return xchg(&v->counter, new);
196}
197
198/**
199 * atomic64_add_unless - add unless the number is a given value
200 * @v: pointer of type atomic64_t
201 * @a: the amount to add to v...
202 * @u: ...unless v is equal to u.
203 *
204 * Atomically adds @a to @v, so long as it was not @u.
205 * Returns non-zero if @v was not @u, and zero otherwise.
206 */
207static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
208{
209 long c, old;
210 c = atomic64_read(v);
211 for (;;) {
212 if (unlikely(c == (u)))
213 break;
214 old = atomic64_cmpxchg((v), c, c + (a));
215 if (likely(old == c))
216 break;
217 c = old;
218 }
219 return c != (u);
220}
221
222#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
223
224#endif /* _ASM_X86_ATOMIC64_64_H */
diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h
deleted file mode 100644
index dc5a667ff791..000000000000
--- a/arch/x86/include/asm/atomic_32.h
+++ /dev/null
@@ -1,415 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_32_H
2#define _ASM_X86_ATOMIC_32_H
3
4#include <linux/compiler.h>
5#include <linux/types.h>
6#include <asm/processor.h>
7#include <asm/cmpxchg.h>
8
9/*
10 * Atomic operations that C can't guarantee us. Useful for
11 * resource counting etc..
12 */
13
14#define ATOMIC_INIT(i) { (i) }
15
16/**
17 * atomic_read - read atomic variable
18 * @v: pointer of type atomic_t
19 *
20 * Atomically reads the value of @v.
21 */
22static inline int atomic_read(const atomic_t *v)
23{
24 return v->counter;
25}
26
27/**
28 * atomic_set - set atomic variable
29 * @v: pointer of type atomic_t
30 * @i: required value
31 *
32 * Atomically sets the value of @v to @i.
33 */
34static inline void atomic_set(atomic_t *v, int i)
35{
36 v->counter = i;
37}
38
39/**
40 * atomic_add - add integer to atomic variable
41 * @i: integer value to add
42 * @v: pointer of type atomic_t
43 *
44 * Atomically adds @i to @v.
45 */
46static inline void atomic_add(int i, atomic_t *v)
47{
48 asm volatile(LOCK_PREFIX "addl %1,%0"
49 : "+m" (v->counter)
50 : "ir" (i));
51}
52
53/**
54 * atomic_sub - subtract integer from atomic variable
55 * @i: integer value to subtract
56 * @v: pointer of type atomic_t
57 *
58 * Atomically subtracts @i from @v.
59 */
60static inline void atomic_sub(int i, atomic_t *v)
61{
62 asm volatile(LOCK_PREFIX "subl %1,%0"
63 : "+m" (v->counter)
64 : "ir" (i));
65}
66
67/**
68 * atomic_sub_and_test - subtract value from variable and test result
69 * @i: integer value to subtract
70 * @v: pointer of type atomic_t
71 *
72 * Atomically subtracts @i from @v and returns
73 * true if the result is zero, or false for all
74 * other cases.
75 */
76static inline int atomic_sub_and_test(int i, atomic_t *v)
77{
78 unsigned char c;
79
80 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
81 : "+m" (v->counter), "=qm" (c)
82 : "ir" (i) : "memory");
83 return c;
84}
85
86/**
87 * atomic_inc - increment atomic variable
88 * @v: pointer of type atomic_t
89 *
90 * Atomically increments @v by 1.
91 */
92static inline void atomic_inc(atomic_t *v)
93{
94 asm volatile(LOCK_PREFIX "incl %0"
95 : "+m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "+m" (v->counter));
108}
109
110/**
111 * atomic_dec_and_test - decrement and test
112 * @v: pointer of type atomic_t
113 *
114 * Atomically decrements @v by 1 and
115 * returns true if the result is 0, or false for all other
116 * cases.
117 */
118static inline int atomic_dec_and_test(atomic_t *v)
119{
120 unsigned char c;
121
122 asm volatile(LOCK_PREFIX "decl %0; sete %1"
123 : "+m" (v->counter), "=qm" (c)
124 : : "memory");
125 return c != 0;
126}
127
128/**
129 * atomic_inc_and_test - increment and test
130 * @v: pointer of type atomic_t
131 *
132 * Atomically increments @v by 1
133 * and returns true if the result is zero, or false for all
134 * other cases.
135 */
136static inline int atomic_inc_and_test(atomic_t *v)
137{
138 unsigned char c;
139
140 asm volatile(LOCK_PREFIX "incl %0; sete %1"
141 : "+m" (v->counter), "=qm" (c)
142 : : "memory");
143 return c != 0;
144}
145
146/**
147 * atomic_add_negative - add and test if negative
148 * @v: pointer of type atomic_t
149 * @i: integer value to add
150 *
151 * Atomically adds @i to @v and returns true
152 * if the result is negative, or false when
153 * result is greater than or equal to zero.
154 */
155static inline int atomic_add_negative(int i, atomic_t *v)
156{
157 unsigned char c;
158
159 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
160 : "+m" (v->counter), "=qm" (c)
161 : "ir" (i) : "memory");
162 return c;
163}
164
165/**
166 * atomic_add_return - add integer and return
167 * @v: pointer of type atomic_t
168 * @i: integer value to add
169 *
170 * Atomically adds @i to @v and returns @i + @v
171 */
172static inline int atomic_add_return(int i, atomic_t *v)
173{
174 int __i;
175#ifdef CONFIG_M386
176 unsigned long flags;
177 if (unlikely(boot_cpu_data.x86 <= 3))
178 goto no_xadd;
179#endif
180 /* Modern 486+ processor */
181 __i = i;
182 asm volatile(LOCK_PREFIX "xaddl %0, %1"
183 : "+r" (i), "+m" (v->counter)
184 : : "memory");
185 return i + __i;
186
187#ifdef CONFIG_M386
188no_xadd: /* Legacy 386 processor */
189 local_irq_save(flags);
190 __i = atomic_read(v);
191 atomic_set(v, i + __i);
192 local_irq_restore(flags);
193 return i + __i;
194#endif
195}
196
197/**
198 * atomic_sub_return - subtract integer and return
199 * @v: pointer of type atomic_t
200 * @i: integer value to subtract
201 *
202 * Atomically subtracts @i from @v and returns @v - @i
203 */
204static inline int atomic_sub_return(int i, atomic_t *v)
205{
206 return atomic_add_return(-i, v);
207}
208
209static inline int atomic_cmpxchg(atomic_t *v, int old, int new)
210{
211 return cmpxchg(&v->counter, old, new);
212}
213
214static inline int atomic_xchg(atomic_t *v, int new)
215{
216 return xchg(&v->counter, new);
217}
218
219/**
220 * atomic_add_unless - add unless the number is already a given value
221 * @v: pointer of type atomic_t
222 * @a: the amount to add to v...
223 * @u: ...unless v is equal to u.
224 *
225 * Atomically adds @a to @v, so long as @v was not already @u.
226 * Returns non-zero if @v was not @u, and zero otherwise.
227 */
228static inline int atomic_add_unless(atomic_t *v, int a, int u)
229{
230 int c, old;
231 c = atomic_read(v);
232 for (;;) {
233 if (unlikely(c == (u)))
234 break;
235 old = atomic_cmpxchg((v), c, c + (a));
236 if (likely(old == c))
237 break;
238 c = old;
239 }
240 return c != (u);
241}
242
243#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
244
245#define atomic_inc_return(v) (atomic_add_return(1, v))
246#define atomic_dec_return(v) (atomic_sub_return(1, v))
247
248/* These are x86-specific, used by some header files */
249#define atomic_clear_mask(mask, addr) \
250 asm volatile(LOCK_PREFIX "andl %0,%1" \
251 : : "r" (~(mask)), "m" (*(addr)) : "memory")
252
253#define atomic_set_mask(mask, addr) \
254 asm volatile(LOCK_PREFIX "orl %0,%1" \
255 : : "r" (mask), "m" (*(addr)) : "memory")
256
257/* Atomic operations are already serializing on x86 */
258#define smp_mb__before_atomic_dec() barrier()
259#define smp_mb__after_atomic_dec() barrier()
260#define smp_mb__before_atomic_inc() barrier()
261#define smp_mb__after_atomic_inc() barrier()
262
263/* An 64bit atomic type */
264
265typedef struct {
266 u64 __aligned(8) counter;
267} atomic64_t;
268
269#define ATOMIC64_INIT(val) { (val) }
270
271extern u64 atomic64_cmpxchg(atomic64_t *ptr, u64 old_val, u64 new_val);
272
273/**
274 * atomic64_xchg - xchg atomic64 variable
275 * @ptr: pointer to type atomic64_t
276 * @new_val: value to assign
277 *
278 * Atomically xchgs the value of @ptr to @new_val and returns
279 * the old value.
280 */
281extern u64 atomic64_xchg(atomic64_t *ptr, u64 new_val);
282
283/**
284 * atomic64_set - set atomic64 variable
285 * @ptr: pointer to type atomic64_t
286 * @new_val: value to assign
287 *
288 * Atomically sets the value of @ptr to @new_val.
289 */
290extern void atomic64_set(atomic64_t *ptr, u64 new_val);
291
292/**
293 * atomic64_read - read atomic64 variable
294 * @ptr: pointer to type atomic64_t
295 *
296 * Atomically reads the value of @ptr and returns it.
297 */
298static inline u64 atomic64_read(atomic64_t *ptr)
299{
300 u64 res;
301
302 /*
303 * Note, we inline this atomic64_t primitive because
304 * it only clobbers EAX/EDX and leaves the others
305 * untouched. We also (somewhat subtly) rely on the
306 * fact that cmpxchg8b returns the current 64-bit value
307 * of the memory location we are touching:
308 */
309 asm volatile(
310 "mov %%ebx, %%eax\n\t"
311 "mov %%ecx, %%edx\n\t"
312 LOCK_PREFIX "cmpxchg8b %1\n"
313 : "=&A" (res)
314 : "m" (*ptr)
315 );
316
317 return res;
318}
319
320extern u64 atomic64_read(atomic64_t *ptr);
321
322/**
323 * atomic64_add_return - add and return
324 * @delta: integer value to add
325 * @ptr: pointer to type atomic64_t
326 *
327 * Atomically adds @delta to @ptr and returns @delta + *@ptr
328 */
329extern u64 atomic64_add_return(u64 delta, atomic64_t *ptr);
330
331/*
332 * Other variants with different arithmetic operators:
333 */
334extern u64 atomic64_sub_return(u64 delta, atomic64_t *ptr);
335extern u64 atomic64_inc_return(atomic64_t *ptr);
336extern u64 atomic64_dec_return(atomic64_t *ptr);
337
338/**
339 * atomic64_add - add integer to atomic64 variable
340 * @delta: integer value to add
341 * @ptr: pointer to type atomic64_t
342 *
343 * Atomically adds @delta to @ptr.
344 */
345extern void atomic64_add(u64 delta, atomic64_t *ptr);
346
347/**
348 * atomic64_sub - subtract the atomic64 variable
349 * @delta: integer value to subtract
350 * @ptr: pointer to type atomic64_t
351 *
352 * Atomically subtracts @delta from @ptr.
353 */
354extern void atomic64_sub(u64 delta, atomic64_t *ptr);
355
356/**
357 * atomic64_sub_and_test - subtract value from variable and test result
358 * @delta: integer value to subtract
359 * @ptr: pointer to type atomic64_t
360 *
361 * Atomically subtracts @delta from @ptr and returns
362 * true if the result is zero, or false for all
363 * other cases.
364 */
365extern int atomic64_sub_and_test(u64 delta, atomic64_t *ptr);
366
367/**
368 * atomic64_inc - increment atomic64 variable
369 * @ptr: pointer to type atomic64_t
370 *
371 * Atomically increments @ptr by 1.
372 */
373extern void atomic64_inc(atomic64_t *ptr);
374
375/**
376 * atomic64_dec - decrement atomic64 variable
377 * @ptr: pointer to type atomic64_t
378 *
379 * Atomically decrements @ptr by 1.
380 */
381extern void atomic64_dec(atomic64_t *ptr);
382
383/**
384 * atomic64_dec_and_test - decrement and test
385 * @ptr: pointer to type atomic64_t
386 *
387 * Atomically decrements @ptr by 1 and
388 * returns true if the result is 0, or false for all other
389 * cases.
390 */
391extern int atomic64_dec_and_test(atomic64_t *ptr);
392
393/**
394 * atomic64_inc_and_test - increment and test
395 * @ptr: pointer to type atomic64_t
396 *
397 * Atomically increments @ptr by 1
398 * and returns true if the result is zero, or false for all
399 * other cases.
400 */
401extern int atomic64_inc_and_test(atomic64_t *ptr);
402
403/**
404 * atomic64_add_negative - add and test if negative
405 * @delta: integer value to add
406 * @ptr: pointer to type atomic64_t
407 *
408 * Atomically adds @delta to @ptr and returns true
409 * if the result is negative, or false when
410 * result is greater than or equal to zero.
411 */
412extern int atomic64_add_negative(u64 delta, atomic64_t *ptr);
413
414#include <asm-generic/atomic-long.h>
415#endif /* _ASM_X86_ATOMIC_32_H */
diff --git a/arch/x86/include/asm/atomic_64.h b/arch/x86/include/asm/atomic_64.h
deleted file mode 100644
index d605dc268e79..000000000000
--- a/arch/x86/include/asm/atomic_64.h
+++ /dev/null
@@ -1,485 +0,0 @@
1#ifndef _ASM_X86_ATOMIC_64_H
2#define _ASM_X86_ATOMIC_64_H
3
4#include <linux/types.h>
5#include <asm/alternative.h>
6#include <asm/cmpxchg.h>
7
8/*
9 * Atomic operations that C can't guarantee us. Useful for
10 * resource counting etc..
11 */
12
13#define ATOMIC_INIT(i) { (i) }
14
15/**
16 * atomic_read - read atomic variable
17 * @v: pointer of type atomic_t
18 *
19 * Atomically reads the value of @v.
20 */
21static inline int atomic_read(const atomic_t *v)
22{
23 return v->counter;
24}
25
26/**
27 * atomic_set - set atomic variable
28 * @v: pointer of type atomic_t
29 * @i: required value
30 *
31 * Atomically sets the value of @v to @i.
32 */
33static inline void atomic_set(atomic_t *v, int i)
34{
35 v->counter = i;
36}
37
38/**
39 * atomic_add - add integer to atomic variable
40 * @i: integer value to add
41 * @v: pointer of type atomic_t
42 *
43 * Atomically adds @i to @v.
44 */
45static inline void atomic_add(int i, atomic_t *v)
46{
47 asm volatile(LOCK_PREFIX "addl %1,%0"
48 : "=m" (v->counter)
49 : "ir" (i), "m" (v->counter));
50}
51
52/**
53 * atomic_sub - subtract the atomic variable
54 * @i: integer value to subtract
55 * @v: pointer of type atomic_t
56 *
57 * Atomically subtracts @i from @v.
58 */
59static inline void atomic_sub(int i, atomic_t *v)
60{
61 asm volatile(LOCK_PREFIX "subl %1,%0"
62 : "=m" (v->counter)
63 : "ir" (i), "m" (v->counter));
64}
65
66/**
67 * atomic_sub_and_test - subtract value from variable and test result
68 * @i: integer value to subtract
69 * @v: pointer of type atomic_t
70 *
71 * Atomically subtracts @i from @v and returns
72 * true if the result is zero, or false for all
73 * other cases.
74 */
75static inline int atomic_sub_and_test(int i, atomic_t *v)
76{
77 unsigned char c;
78
79 asm volatile(LOCK_PREFIX "subl %2,%0; sete %1"
80 : "=m" (v->counter), "=qm" (c)
81 : "ir" (i), "m" (v->counter) : "memory");
82 return c;
83}
84
85/**
86 * atomic_inc - increment atomic variable
87 * @v: pointer of type atomic_t
88 *
89 * Atomically increments @v by 1.
90 */
91static inline void atomic_inc(atomic_t *v)
92{
93 asm volatile(LOCK_PREFIX "incl %0"
94 : "=m" (v->counter)
95 : "m" (v->counter));
96}
97
98/**
99 * atomic_dec - decrement atomic variable
100 * @v: pointer of type atomic_t
101 *
102 * Atomically decrements @v by 1.
103 */
104static inline void atomic_dec(atomic_t *v)
105{
106 asm volatile(LOCK_PREFIX "decl %0"
107 : "=m" (v->counter)
108 : "m" (v->counter));
109}
110
111/**
112 * atomic_dec_and_test - decrement and test
113 * @v: pointer of type atomic_t
114 *
115 * Atomically decrements @v by 1 and
116 * returns true if the result is 0, or false for all other
117 * cases.
118 */
119static inline int atomic_dec_and_test(atomic_t *v)
120{
121 unsigned char c;
122
123 asm volatile(LOCK_PREFIX "decl %0; sete %1"
124 : "=m" (v->counter), "=qm" (c)
125 : "m" (v->counter) : "memory");
126 return c != 0;
127}
128
129/**
130 * atomic_inc_and_test - increment and test
131 * @v: pointer of type atomic_t
132 *
133 * Atomically increments @v by 1
134 * and returns true if the result is zero, or false for all
135 * other cases.
136 */
137static inline int atomic_inc_and_test(atomic_t *v)
138{
139 unsigned char c;
140
141 asm volatile(LOCK_PREFIX "incl %0; sete %1"
142 : "=m" (v->counter), "=qm" (c)
143 : "m" (v->counter) : "memory");
144 return c != 0;
145}
146
147/**
148 * atomic_add_negative - add and test if negative
149 * @i: integer value to add
150 * @v: pointer of type atomic_t
151 *
152 * Atomically adds @i to @v and returns true
153 * if the result is negative, or false when
154 * result is greater than or equal to zero.
155 */
156static inline int atomic_add_negative(int i, atomic_t *v)
157{
158 unsigned char c;
159
160 asm volatile(LOCK_PREFIX "addl %2,%0; sets %1"
161 : "=m" (v->counter), "=qm" (c)
162 : "ir" (i), "m" (v->counter) : "memory");
163 return c;
164}
165
166/**
167 * atomic_add_return - add and return
168 * @i: integer value to add
169 * @v: pointer of type atomic_t
170 *
171 * Atomically adds @i to @v and returns @i + @v
172 */
173static inline int atomic_add_return(int i, atomic_t *v)
174{
175 int __i = i;
176 asm volatile(LOCK_PREFIX "xaddl %0, %1"
177 : "+r" (i), "+m" (v->counter)
178 : : "memory");
179 return i + __i;
180}
181
182static inline int atomic_sub_return(int i, atomic_t *v)
183{
184 return atomic_add_return(-i, v);
185}
186
187#define atomic_inc_return(v) (atomic_add_return(1, v))
188#define atomic_dec_return(v) (atomic_sub_return(1, v))
189
190/* The 64-bit atomic type */
191
192#define ATOMIC64_INIT(i) { (i) }
193
194/**
195 * atomic64_read - read atomic64 variable
196 * @v: pointer of type atomic64_t
197 *
198 * Atomically reads the value of @v.
199 * Doesn't imply a read memory barrier.
200 */
201static inline long atomic64_read(const atomic64_t *v)
202{
203 return v->counter;
204}
205
206/**
207 * atomic64_set - set atomic64 variable
208 * @v: pointer to type atomic64_t
209 * @i: required value
210 *
211 * Atomically sets the value of @v to @i.
212 */
213static inline void atomic64_set(atomic64_t *v, long i)
214{
215 v->counter = i;
216}
217
218/**
219 * atomic64_add - add integer to atomic64 variable
220 * @i: integer value to add
221 * @v: pointer to type atomic64_t
222 *
223 * Atomically adds @i to @v.
224 */
225static inline void atomic64_add(long i, atomic64_t *v)
226{
227 asm volatile(LOCK_PREFIX "addq %1,%0"
228 : "=m" (v->counter)
229 : "er" (i), "m" (v->counter));
230}
231
232/**
233 * atomic64_sub - subtract the atomic64 variable
234 * @i: integer value to subtract
235 * @v: pointer to type atomic64_t
236 *
237 * Atomically subtracts @i from @v.
238 */
239static inline void atomic64_sub(long i, atomic64_t *v)
240{
241 asm volatile(LOCK_PREFIX "subq %1,%0"
242 : "=m" (v->counter)
243 : "er" (i), "m" (v->counter));
244}
245
246/**
247 * atomic64_sub_and_test - subtract value from variable and test result
248 * @i: integer value to subtract
249 * @v: pointer to type atomic64_t
250 *
251 * Atomically subtracts @i from @v and returns
252 * true if the result is zero, or false for all
253 * other cases.
254 */
255static inline int atomic64_sub_and_test(long i, atomic64_t *v)
256{
257 unsigned char c;
258
259 asm volatile(LOCK_PREFIX "subq %2,%0; sete %1"
260 : "=m" (v->counter), "=qm" (c)
261 : "er" (i), "m" (v->counter) : "memory");
262 return c;
263}
264
265/**
266 * atomic64_inc - increment atomic64 variable
267 * @v: pointer to type atomic64_t
268 *
269 * Atomically increments @v by 1.
270 */
271static inline void atomic64_inc(atomic64_t *v)
272{
273 asm volatile(LOCK_PREFIX "incq %0"
274 : "=m" (v->counter)
275 : "m" (v->counter));
276}
277
278/**
279 * atomic64_dec - decrement atomic64 variable
280 * @v: pointer to type atomic64_t
281 *
282 * Atomically decrements @v by 1.
283 */
284static inline void atomic64_dec(atomic64_t *v)
285{
286 asm volatile(LOCK_PREFIX "decq %0"
287 : "=m" (v->counter)
288 : "m" (v->counter));
289}
290
291/**
292 * atomic64_dec_and_test - decrement and test
293 * @v: pointer to type atomic64_t
294 *
295 * Atomically decrements @v by 1 and
296 * returns true if the result is 0, or false for all other
297 * cases.
298 */
299static inline int atomic64_dec_and_test(atomic64_t *v)
300{
301 unsigned char c;
302
303 asm volatile(LOCK_PREFIX "decq %0; sete %1"
304 : "=m" (v->counter), "=qm" (c)
305 : "m" (v->counter) : "memory");
306 return c != 0;
307}
308
309/**
310 * atomic64_inc_and_test - increment and test
311 * @v: pointer to type atomic64_t
312 *
313 * Atomically increments @v by 1
314 * and returns true if the result is zero, or false for all
315 * other cases.
316 */
317static inline int atomic64_inc_and_test(atomic64_t *v)
318{
319 unsigned char c;
320
321 asm volatile(LOCK_PREFIX "incq %0; sete %1"
322 : "=m" (v->counter), "=qm" (c)
323 : "m" (v->counter) : "memory");
324 return c != 0;
325}
326
327/**
328 * atomic64_add_negative - add and test if negative
329 * @i: integer value to add
330 * @v: pointer to type atomic64_t
331 *
332 * Atomically adds @i to @v and returns true
333 * if the result is negative, or false when
334 * result is greater than or equal to zero.
335 */
336static inline int atomic64_add_negative(long i, atomic64_t *v)
337{
338 unsigned char c;
339
340 asm volatile(LOCK_PREFIX "addq %2,%0; sets %1"
341 : "=m" (v->counter), "=qm" (c)
342 : "er" (i), "m" (v->counter) : "memory");
343 return c;
344}
345
346/**
347 * atomic64_add_return - add and return
348 * @i: integer value to add
349 * @v: pointer to type atomic64_t
350 *
351 * Atomically adds @i to @v and returns @i + @v
352 */
353static inline long atomic64_add_return(long i, atomic64_t *v)
354{
355 long __i = i;
356 asm volatile(LOCK_PREFIX "xaddq %0, %1;"
357 : "+r" (i), "+m" (v->counter)
358 : : "memory");
359 return i + __i;
360}
361
362static inline long atomic64_sub_return(long i, atomic64_t *v)
363{
364 return atomic64_add_return(-i, v);
365}
366
367#define atomic64_inc_return(v) (atomic64_add_return(1, (v)))
368#define atomic64_dec_return(v) (atomic64_sub_return(1, (v)))
369
370static inline long atomic64_cmpxchg(atomic64_t *v, long old, long new)
371{
372 return cmpxchg(&v->counter, old, new);
373}
374
375static inline long atomic64_xchg(atomic64_t *v, long new)
376{
377 return xchg(&v->counter, new);
378}
379
380static inline long atomic_cmpxchg(atomic_t *v, int old, int new)
381{
382 return cmpxchg(&v->counter, old, new);
383}
384
385static inline long atomic_xchg(atomic_t *v, int new)
386{
387 return xchg(&v->counter, new);
388}
389
390/**
391 * atomic_add_unless - add unless the number is a given value
392 * @v: pointer of type atomic_t
393 * @a: the amount to add to v...
394 * @u: ...unless v is equal to u.
395 *
396 * Atomically adds @a to @v, so long as it was not @u.
397 * Returns non-zero if @v was not @u, and zero otherwise.
398 */
399static inline int atomic_add_unless(atomic_t *v, int a, int u)
400{
401 int c, old;
402 c = atomic_read(v);
403 for (;;) {
404 if (unlikely(c == (u)))
405 break;
406 old = atomic_cmpxchg((v), c, c + (a));
407 if (likely(old == c))
408 break;
409 c = old;
410 }
411 return c != (u);
412}
413
414#define atomic_inc_not_zero(v) atomic_add_unless((v), 1, 0)
415
416/**
417 * atomic64_add_unless - add unless the number is a given value
418 * @v: pointer of type atomic64_t
419 * @a: the amount to add to v...
420 * @u: ...unless v is equal to u.
421 *
422 * Atomically adds @a to @v, so long as it was not @u.
423 * Returns non-zero if @v was not @u, and zero otherwise.
424 */
425static inline int atomic64_add_unless(atomic64_t *v, long a, long u)
426{
427 long c, old;
428 c = atomic64_read(v);
429 for (;;) {
430 if (unlikely(c == (u)))
431 break;
432 old = atomic64_cmpxchg((v), c, c + (a));
433 if (likely(old == c))
434 break;
435 c = old;
436 }
437 return c != (u);
438}
439
440/**
441 * atomic_inc_short - increment of a short integer
442 * @v: pointer to type short int
443 *
444 * Atomically adds 1 to @v.
445 * Returns the new value of @v.
446 */
447static inline short int atomic_inc_short(short int *v)
448{
449 asm(LOCK_PREFIX "addw $1, %0" : "+m" (*v));
450 return *v;
451}
452
453/**
454 * atomic_or_long - OR of two long integers
455 * @v1: pointer to type unsigned long
456 * @v2: value to OR into *@v1
457 *
458 * Atomically ORs @v2 into *@v1.
459 * The result is stored in *@v1; nothing is returned.
460 */
461static inline void atomic_or_long(unsigned long *v1, unsigned long v2)
462{
463 asm(LOCK_PREFIX "orq %1, %0" : "+m" (*v1) : "r" (v2));
464}
465
466#define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0)
467
468/* These are x86-specific, used by some header files */
469#define atomic_clear_mask(mask, addr) \
470 asm volatile(LOCK_PREFIX "andl %0,%1" \
471 : : "r" (~(mask)), "m" (*(addr)) : "memory")
472
473#define atomic_set_mask(mask, addr) \
474 asm volatile(LOCK_PREFIX "orl %0,%1" \
475 : : "r" ((unsigned)(mask)), "m" (*(addr)) \
476 : "memory")
477
478/* Atomic operations are already serializing on x86 */
479#define smp_mb__before_atomic_dec() barrier()
480#define smp_mb__after_atomic_dec() barrier()
481#define smp_mb__before_atomic_inc() barrier()
482#define smp_mb__after_atomic_inc() barrier()
483
484#include <asm-generic/atomic-long.h>
485#endif /* _ASM_X86_ATOMIC_64_H */
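
The add-unless helpers above are the classic compare-and-swap retry loop: read the counter, give up if it already equals @u, otherwise try to publish c + a and retry if another CPU got there first. A minimal user-space sketch of the same loop, using GCC's __sync_val_compare_and_swap builtin as a stand-in for atomic64_cmpxchg() (the add_unless() name and the plain long counter are illustrative assumptions, not the kernel API):

#include <stdio.h>

/* User-space stand-in for atomic64_add_unless(): add 'a' to *v unless *v == u. */
static int add_unless(long *v, long a, long u)
{
        long c = __atomic_load_n(v, __ATOMIC_RELAXED);

        for (;;) {
                long old;

                if (c == u)             /* counter already holds the forbidden value */
                        break;
                /* try to install c + a; returns the value that was actually there */
                old = __sync_val_compare_and_swap(v, c, c + a);
                if (old == c)           /* nobody raced us, the add took effect */
                        break;
                c = old;                /* lost the race, retry against the new value */
        }
        return c != u;                  /* non-zero iff the add happened */
}

int main(void)
{
        long refcount = 1;

        /* the inc_not_zero pattern: only take a reference while the object is live */
        if (add_unless(&refcount, 1, 0))
                printf("got reference, refcount=%ld\n", refcount);
        return 0;
}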
diff --git a/arch/x86/include/asm/bug.h b/arch/x86/include/asm/bug.h
index d9cf1cd156d2..f654d1bb17fb 100644
--- a/arch/x86/include/asm/bug.h
+++ b/arch/x86/include/asm/bug.h
@@ -22,14 +22,14 @@ do { \
22 ".popsection" \ 22 ".popsection" \
23 : : "i" (__FILE__), "i" (__LINE__), \ 23 : : "i" (__FILE__), "i" (__LINE__), \
24 "i" (sizeof(struct bug_entry))); \ 24 "i" (sizeof(struct bug_entry))); \
25 for (;;) ; \ 25 unreachable(); \
26} while (0) 26} while (0)
27 27
28#else 28#else
29#define BUG() \ 29#define BUG() \
30do { \ 30do { \
31 asm volatile("ud2"); \ 31 asm volatile("ud2"); \
32 for (;;) ; \ 32 unreachable(); \
33} while (0) 33} while (0)
34#endif 34#endif
35 35
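
Swapping the for (;;) spin after the trap for unreachable() tells the compiler that BUG() never returns, so it can drop the dead code behind it instead of emitting an endless loop. A stand-alone sketch of the same idea with GCC's __builtin_unreachable() (the my_bug() name is made up for the example):

#include <stdio.h>

/* Minimal imitation of the new BUG(): trap, then mark the spot as unreachable. */
#define my_bug()                                \
do {                                            \
        asm volatile("ud2");                    \
        __builtin_unreachable();                \
} while (0)

static int must_be_positive(int x)
{
        if (x > 0)
                return x;
        my_bug();       /* compiler knows we never fall out of here: no return needed */
}

int main(void)
{
        printf("%d\n", must_be_positive(3));
        return 0;
}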
diff --git a/arch/x86/include/asm/cache.h b/arch/x86/include/asm/cache.h
index 549860d3be8f..2f9047cfaaca 100644
--- a/arch/x86/include/asm/cache.h
+++ b/arch/x86/include/asm/cache.h
@@ -9,12 +9,13 @@
9 9
10#define __read_mostly __attribute__((__section__(".data.read_mostly"))) 10#define __read_mostly __attribute__((__section__(".data.read_mostly")))
11 11
12#define INTERNODE_CACHE_SHIFT CONFIG_X86_INTERNODE_CACHE_SHIFT
13#define INTERNODE_CACHE_BYTES (1 << INTERNODE_CACHE_SHIFT)
14
12#ifdef CONFIG_X86_VSMP 15#ifdef CONFIG_X86_VSMP
13/* vSMP Internode cacheline shift */
14#define INTERNODE_CACHE_SHIFT (12)
15#ifdef CONFIG_SMP 16#ifdef CONFIG_SMP
16#define __cacheline_aligned_in_smp \ 17#define __cacheline_aligned_in_smp \
17 __attribute__((__aligned__(1 << (INTERNODE_CACHE_SHIFT)))) \ 18 __attribute__((__aligned__(INTERNODE_CACHE_BYTES))) \
18 __page_aligned_data 19 __page_aligned_data
19#endif 20#endif
20#endif 21#endif
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index b54f6afe7ec4..634c40a739a6 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -12,6 +12,7 @@ static inline void flush_cache_range(struct vm_area_struct *vma,
12 unsigned long start, unsigned long end) { } 12 unsigned long start, unsigned long end) { }
13static inline void flush_cache_page(struct vm_area_struct *vma, 13static inline void flush_cache_page(struct vm_area_struct *vma,
14 unsigned long vmaddr, unsigned long pfn) { } 14 unsigned long vmaddr, unsigned long pfn) { }
15#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0
15static inline void flush_dcache_page(struct page *page) { } 16static inline void flush_dcache_page(struct page *page) { }
16static inline void flush_dcache_mmap_lock(struct address_space *mapping) { } 17static inline void flush_dcache_mmap_lock(struct address_space *mapping) { }
17static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { } 18static inline void flush_dcache_mmap_unlock(struct address_space *mapping) { }
@@ -176,6 +177,7 @@ void clflush_cache_range(void *addr, unsigned int size);
176#ifdef CONFIG_DEBUG_RODATA 177#ifdef CONFIG_DEBUG_RODATA
177void mark_rodata_ro(void); 178void mark_rodata_ro(void);
178extern const int rodata_test_data; 179extern const int rodata_test_data;
180extern int kernel_set_to_readonly;
179void set_kernel_text_rw(void); 181void set_kernel_text_rw(void);
180void set_kernel_text_ro(void); 182void set_kernel_text_ro(void);
181#else 183#else
diff --git a/arch/x86/include/asm/calgary.h b/arch/x86/include/asm/calgary.h
index b03bedb62aa7..0918654305af 100644
--- a/arch/x86/include/asm/calgary.h
+++ b/arch/x86/include/asm/calgary.h
@@ -62,10 +62,8 @@ struct cal_chipset_ops {
62extern int use_calgary; 62extern int use_calgary;
63 63
64#ifdef CONFIG_CALGARY_IOMMU 64#ifdef CONFIG_CALGARY_IOMMU
65extern int calgary_iommu_init(void);
66extern void detect_calgary(void); 65extern void detect_calgary(void);
67#else 66#else
68static inline int calgary_iommu_init(void) { return 1; }
69static inline void detect_calgary(void) { return; } 67static inline void detect_calgary(void) { return; }
70#endif 68#endif
71 69
diff --git a/arch/x86/include/asm/cmpxchg_32.h b/arch/x86/include/asm/cmpxchg_32.h
index ee1931be6593..ffb9bb6b6c37 100644
--- a/arch/x86/include/asm/cmpxchg_32.h
+++ b/arch/x86/include/asm/cmpxchg_32.h
@@ -8,14 +8,50 @@
8 * you need to test for the feature in boot_cpu_data. 8 * you need to test for the feature in boot_cpu_data.
9 */ 9 */
10 10
11#define xchg(ptr, v) \ 11extern void __xchg_wrong_size(void);
12 ((__typeof__(*(ptr)))__xchg((unsigned long)(v), (ptr), sizeof(*(ptr)))) 12
13/*
14 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
15 * Note 2: xchg has side effect, so that attribute volatile is necessary,
16 * but generally the primitive is invalid, *ptr is output argument. --ANK
17 */
13 18
14struct __xchg_dummy { 19struct __xchg_dummy {
15 unsigned long a[100]; 20 unsigned long a[100];
16}; 21};
17#define __xg(x) ((struct __xchg_dummy *)(x)) 22#define __xg(x) ((struct __xchg_dummy *)(x))
18 23
24#define __xchg(x, ptr, size) \
25({ \
26 __typeof(*(ptr)) __x = (x); \
27 switch (size) { \
28 case 1: \
29 asm volatile("xchgb %b0,%1" \
30 : "=q" (__x) \
31 : "m" (*__xg(ptr)), "0" (__x) \
32 : "memory"); \
33 break; \
34 case 2: \
35 asm volatile("xchgw %w0,%1" \
36 : "=r" (__x) \
37 : "m" (*__xg(ptr)), "0" (__x) \
38 : "memory"); \
39 break; \
40 case 4: \
41 asm volatile("xchgl %0,%1" \
42 : "=r" (__x) \
43 : "m" (*__xg(ptr)), "0" (__x) \
44 : "memory"); \
45 break; \
46 default: \
47 __xchg_wrong_size(); \
48 } \
49 __x; \
50})
51
52#define xchg(ptr, v) \
53 __xchg((v), (ptr), sizeof(*ptr))
54
19/* 55/*
20 * The semantics of XCHGCMP8B are a bit strange, this is why 56 * The semantics of XCHGCMP8B are a bit strange, this is why
21 * there is a loop and the loading of %%eax and %%edx has to 57 * there is a loop and the loading of %%eax and %%edx has to
@@ -71,57 +107,63 @@ static inline void __set_64bit_var(unsigned long long *ptr,
71 (unsigned int)((value) >> 32)) \ 107 (unsigned int)((value) >> 32)) \
72 : __set_64bit(ptr, ll_low((value)), ll_high((value)))) 108 : __set_64bit(ptr, ll_low((value)), ll_high((value))))
73 109
74/* 110extern void __cmpxchg_wrong_size(void);
75 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
76 * Note 2: xchg has side effect, so that attribute volatile is necessary,
77 * but generally the primitive is invalid, *ptr is output argument. --ANK
78 */
79static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
80 int size)
81{
82 switch (size) {
83 case 1:
84 asm volatile("xchgb %b0,%1"
85 : "=q" (x)
86 : "m" (*__xg(ptr)), "0" (x)
87 : "memory");
88 break;
89 case 2:
90 asm volatile("xchgw %w0,%1"
91 : "=r" (x)
92 : "m" (*__xg(ptr)), "0" (x)
93 : "memory");
94 break;
95 case 4:
96 asm volatile("xchgl %0,%1"
97 : "=r" (x)
98 : "m" (*__xg(ptr)), "0" (x)
99 : "memory");
100 break;
101 }
102 return x;
103}
104 111
105/* 112/*
106 * Atomic compare and exchange. Compare OLD with MEM, if identical, 113 * Atomic compare and exchange. Compare OLD with MEM, if identical,
107 * store NEW in MEM. Return the initial value in MEM. Success is 114 * store NEW in MEM. Return the initial value in MEM. Success is
108 * indicated by comparing RETURN with OLD. 115 * indicated by comparing RETURN with OLD.
109 */ 116 */
117#define __raw_cmpxchg(ptr, old, new, size, lock) \
118({ \
119 __typeof__(*(ptr)) __ret; \
120 __typeof__(*(ptr)) __old = (old); \
121 __typeof__(*(ptr)) __new = (new); \
122 switch (size) { \
123 case 1: \
124 asm volatile(lock "cmpxchgb %b1,%2" \
125 : "=a"(__ret) \
126 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
127 : "memory"); \
128 break; \
129 case 2: \
130 asm volatile(lock "cmpxchgw %w1,%2" \
131 : "=a"(__ret) \
132 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
133 : "memory"); \
134 break; \
135 case 4: \
136 asm volatile(lock "cmpxchgl %1,%2" \
137 : "=a"(__ret) \
138 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
139 : "memory"); \
140 break; \
141 default: \
142 __cmpxchg_wrong_size(); \
143 } \
144 __ret; \
145})
146
147#define __cmpxchg(ptr, old, new, size) \
148 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
149
150#define __sync_cmpxchg(ptr, old, new, size) \
151 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
152
153#define __cmpxchg_local(ptr, old, new, size) \
154 __raw_cmpxchg((ptr), (old), (new), (size), "")
110 155
111#ifdef CONFIG_X86_CMPXCHG 156#ifdef CONFIG_X86_CMPXCHG
112#define __HAVE_ARCH_CMPXCHG 1 157#define __HAVE_ARCH_CMPXCHG 1
113#define cmpxchg(ptr, o, n) \ 158
114 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \ 159#define cmpxchg(ptr, old, new) \
115 (unsigned long)(n), \ 160 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
116 sizeof(*(ptr)))) 161
117#define sync_cmpxchg(ptr, o, n) \ 162#define sync_cmpxchg(ptr, old, new) \
118 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \ 163 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
119 (unsigned long)(n), \ 164
120 sizeof(*(ptr)))) 165#define cmpxchg_local(ptr, old, new) \
121#define cmpxchg_local(ptr, o, n) \ 166 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
122 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
123 (unsigned long)(n), \
124 sizeof(*(ptr))))
125#endif 167#endif
126 168
127#ifdef CONFIG_X86_CMPXCHG64 169#ifdef CONFIG_X86_CMPXCHG64
@@ -133,94 +175,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr,
133 (unsigned long long)(n))) 175 (unsigned long long)(n)))
134#endif 176#endif
135 177
136static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
137 unsigned long new, int size)
138{
139 unsigned long prev;
140 switch (size) {
141 case 1:
142 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
143 : "=a"(prev)
144 : "q"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 2:
148 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 4:
154 asm volatile(LOCK_PREFIX "cmpxchgl %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162
163/*
164 * Always use locked operations when touching memory shared with a
165 * hypervisor, since the system may be SMP even if the guest kernel
166 * isn't.
167 */
168static inline unsigned long __sync_cmpxchg(volatile void *ptr,
169 unsigned long old,
170 unsigned long new, int size)
171{
172 unsigned long prev;
173 switch (size) {
174 case 1:
175 asm volatile("lock; cmpxchgb %b1,%2"
176 : "=a"(prev)
177 : "q"(new), "m"(*__xg(ptr)), "0"(old)
178 : "memory");
179 return prev;
180 case 2:
181 asm volatile("lock; cmpxchgw %w1,%2"
182 : "=a"(prev)
183 : "r"(new), "m"(*__xg(ptr)), "0"(old)
184 : "memory");
185 return prev;
186 case 4:
187 asm volatile("lock; cmpxchgl %1,%2"
188 : "=a"(prev)
189 : "r"(new), "m"(*__xg(ptr)), "0"(old)
190 : "memory");
191 return prev;
192 }
193 return old;
194}
195
196static inline unsigned long __cmpxchg_local(volatile void *ptr,
197 unsigned long old,
198 unsigned long new, int size)
199{
200 unsigned long prev;
201 switch (size) {
202 case 1:
203 asm volatile("cmpxchgb %b1,%2"
204 : "=a"(prev)
205 : "q"(new), "m"(*__xg(ptr)), "0"(old)
206 : "memory");
207 return prev;
208 case 2:
209 asm volatile("cmpxchgw %w1,%2"
210 : "=a"(prev)
211 : "r"(new), "m"(*__xg(ptr)), "0"(old)
212 : "memory");
213 return prev;
214 case 4:
215 asm volatile("cmpxchgl %1,%2"
216 : "=a"(prev)
217 : "r"(new), "m"(*__xg(ptr)), "0"(old)
218 : "memory");
219 return prev;
220 }
221 return old;
222}
223
224static inline unsigned long long __cmpxchg64(volatile void *ptr, 178static inline unsigned long long __cmpxchg64(volatile void *ptr,
225 unsigned long long old, 179 unsigned long long old,
226 unsigned long long new) 180 unsigned long long new)
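
The rewrite turns the old unsigned long-typed helpers into size-dispatching macros, and the default: arm calls __xchg_wrong_size()/__cmpxchg_wrong_size(), extern functions that are deliberately never defined: an xchg() or cmpxchg() on an unsupported operand size now fails at link time instead of being silently truncated to unsigned long. A hedged stand-alone sketch of that link-time size check, using a __sync builtin in place of the per-size asm (the my_* names are illustrative, and like the kernel build it relies on the optimizer dropping the unreachable default arm, so compile with -O2):

#include <stdio.h>

/* Never defined anywhere: referencing it turns a bad size into a link error. */
extern void my_xchg_wrong_size(void);

#define my_xchg(ptr, v)                                                 \
({                                                                      \
        __typeof__(*(ptr)) __new = (v);                                 \
        __typeof__(*(ptr)) __old = __new;                               \
        switch (sizeof(*(ptr))) {                                       \
        case 1:                                                         \
        case 2:                                                         \
        case 4:                                                         \
        case 8:                                                         \
                /* stand-in for the per-size xchgb/w/l asm */           \
                __old = __sync_lock_test_and_set((ptr), __new);         \
                break;                                                  \
        default:                                                        \
                my_xchg_wrong_size();   /* unresolved symbol */         \
        }                                                               \
        __old;                                                          \
})

int main(void)
{
        int x = 1;
        int prev = my_xchg(&x, 5);      /* sizeof == 4: compiles, links, runs */

        printf("prev=%d now=%d\n", prev, x);
        return 0;
}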
diff --git a/arch/x86/include/asm/cmpxchg_64.h b/arch/x86/include/asm/cmpxchg_64.h
index 52de72e0de8c..485ae415faec 100644
--- a/arch/x86/include/asm/cmpxchg_64.h
+++ b/arch/x86/include/asm/cmpxchg_64.h
@@ -3,9 +3,6 @@
3 3
4#include <asm/alternative.h> /* Provides LOCK_PREFIX */ 4#include <asm/alternative.h> /* Provides LOCK_PREFIX */
5 5
6#define xchg(ptr, v) ((__typeof__(*(ptr)))__xchg((unsigned long)(v), \
7 (ptr), sizeof(*(ptr))))
8
9#define __xg(x) ((volatile long *)(x)) 6#define __xg(x) ((volatile long *)(x))
10 7
11static inline void set_64bit(volatile unsigned long *ptr, unsigned long val) 8static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
@@ -15,167 +12,118 @@ static inline void set_64bit(volatile unsigned long *ptr, unsigned long val)
15 12
16#define _set_64bit set_64bit 13#define _set_64bit set_64bit
17 14
15extern void __xchg_wrong_size(void);
16extern void __cmpxchg_wrong_size(void);
17
18/* 18/*
19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway 19 * Note: no "lock" prefix even on SMP: xchg always implies lock anyway
20 * Note 2: xchg has side effect, so that attribute volatile is necessary, 20 * Note 2: xchg has side effect, so that attribute volatile is necessary,
21 * but generally the primitive is invalid, *ptr is output argument. --ANK 21 * but generally the primitive is invalid, *ptr is output argument. --ANK
22 */ 22 */
23static inline unsigned long __xchg(unsigned long x, volatile void *ptr, 23#define __xchg(x, ptr, size) \
24 int size) 24({ \
25{ 25 __typeof(*(ptr)) __x = (x); \
26 switch (size) { 26 switch (size) { \
27 case 1: 27 case 1: \
28 asm volatile("xchgb %b0,%1" 28 asm volatile("xchgb %b0,%1" \
29 : "=q" (x) 29 : "=q" (__x) \
30 : "m" (*__xg(ptr)), "0" (x) 30 : "m" (*__xg(ptr)), "0" (__x) \
31 : "memory"); 31 : "memory"); \
32 break; 32 break; \
33 case 2: 33 case 2: \
34 asm volatile("xchgw %w0,%1" 34 asm volatile("xchgw %w0,%1" \
35 : "=r" (x) 35 : "=r" (__x) \
36 : "m" (*__xg(ptr)), "0" (x) 36 : "m" (*__xg(ptr)), "0" (__x) \
37 : "memory"); 37 : "memory"); \
38 break; 38 break; \
39 case 4: 39 case 4: \
40 asm volatile("xchgl %k0,%1" 40 asm volatile("xchgl %k0,%1" \
41 : "=r" (x) 41 : "=r" (__x) \
42 : "m" (*__xg(ptr)), "0" (x) 42 : "m" (*__xg(ptr)), "0" (__x) \
43 : "memory"); 43 : "memory"); \
44 break; 44 break; \
45 case 8: 45 case 8: \
46 asm volatile("xchgq %0,%1" 46 asm volatile("xchgq %0,%1" \
47 : "=r" (x) 47 : "=r" (__x) \
48 : "m" (*__xg(ptr)), "0" (x) 48 : "m" (*__xg(ptr)), "0" (__x) \
49 : "memory"); 49 : "memory"); \
50 break; 50 break; \
51 } 51 default: \
52 return x; 52 __xchg_wrong_size(); \
53} 53 } \
54 __x; \
55})
56
57#define xchg(ptr, v) \
58 __xchg((v), (ptr), sizeof(*ptr))
59
60#define __HAVE_ARCH_CMPXCHG 1
54 61
55/* 62/*
56 * Atomic compare and exchange. Compare OLD with MEM, if identical, 63 * Atomic compare and exchange. Compare OLD with MEM, if identical,
57 * store NEW in MEM. Return the initial value in MEM. Success is 64 * store NEW in MEM. Return the initial value in MEM. Success is
58 * indicated by comparing RETURN with OLD. 65 * indicated by comparing RETURN with OLD.
59 */ 66 */
67#define __raw_cmpxchg(ptr, old, new, size, lock) \
68({ \
69 __typeof__(*(ptr)) __ret; \
70 __typeof__(*(ptr)) __old = (old); \
71 __typeof__(*(ptr)) __new = (new); \
72 switch (size) { \
73 case 1: \
74 asm volatile(lock "cmpxchgb %b1,%2" \
75 : "=a"(__ret) \
76 : "q"(__new), "m"(*__xg(ptr)), "0"(__old) \
77 : "memory"); \
78 break; \
79 case 2: \
80 asm volatile(lock "cmpxchgw %w1,%2" \
81 : "=a"(__ret) \
82 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
83 : "memory"); \
84 break; \
85 case 4: \
86 asm volatile(lock "cmpxchgl %k1,%2" \
87 : "=a"(__ret) \
88 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
89 : "memory"); \
90 break; \
91 case 8: \
92 asm volatile(lock "cmpxchgq %1,%2" \
93 : "=a"(__ret) \
94 : "r"(__new), "m"(*__xg(ptr)), "0"(__old) \
95 : "memory"); \
96 break; \
97 default: \
98 __cmpxchg_wrong_size(); \
99 } \
100 __ret; \
101})
60 102
61#define __HAVE_ARCH_CMPXCHG 1 103#define __cmpxchg(ptr, old, new, size) \
104 __raw_cmpxchg((ptr), (old), (new), (size), LOCK_PREFIX)
62 105
63static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, 106#define __sync_cmpxchg(ptr, old, new, size) \
64 unsigned long new, int size) 107 __raw_cmpxchg((ptr), (old), (new), (size), "lock; ")
65{
66 unsigned long prev;
67 switch (size) {
68 case 1:
69 asm volatile(LOCK_PREFIX "cmpxchgb %b1,%2"
70 : "=a"(prev)
71 : "q"(new), "m"(*__xg(ptr)), "0"(old)
72 : "memory");
73 return prev;
74 case 2:
75 asm volatile(LOCK_PREFIX "cmpxchgw %w1,%2"
76 : "=a"(prev)
77 : "r"(new), "m"(*__xg(ptr)), "0"(old)
78 : "memory");
79 return prev;
80 case 4:
81 asm volatile(LOCK_PREFIX "cmpxchgl %k1,%2"
82 : "=a"(prev)
83 : "r"(new), "m"(*__xg(ptr)), "0"(old)
84 : "memory");
85 return prev;
86 case 8:
87 asm volatile(LOCK_PREFIX "cmpxchgq %1,%2"
88 : "=a"(prev)
89 : "r"(new), "m"(*__xg(ptr)), "0"(old)
90 : "memory");
91 return prev;
92 }
93 return old;
94}
95 108
96/* 109#define __cmpxchg_local(ptr, old, new, size) \
97 * Always use locked operations when touching memory shared with a 110 __raw_cmpxchg((ptr), (old), (new), (size), "")
98 * hypervisor, since the system may be SMP even if the guest kernel
99 * isn't.
100 */
101static inline unsigned long __sync_cmpxchg(volatile void *ptr,
102 unsigned long old,
103 unsigned long new, int size)
104{
105 unsigned long prev;
106 switch (size) {
107 case 1:
108 asm volatile("lock; cmpxchgb %b1,%2"
109 : "=a"(prev)
110 : "q"(new), "m"(*__xg(ptr)), "0"(old)
111 : "memory");
112 return prev;
113 case 2:
114 asm volatile("lock; cmpxchgw %w1,%2"
115 : "=a"(prev)
116 : "r"(new), "m"(*__xg(ptr)), "0"(old)
117 : "memory");
118 return prev;
119 case 4:
120 asm volatile("lock; cmpxchgl %1,%2"
121 : "=a"(prev)
122 : "r"(new), "m"(*__xg(ptr)), "0"(old)
123 : "memory");
124 return prev;
125 }
126 return old;
127}
128 111
129static inline unsigned long __cmpxchg_local(volatile void *ptr, 112#define cmpxchg(ptr, old, new) \
130 unsigned long old, 113 __cmpxchg((ptr), (old), (new), sizeof(*ptr))
131 unsigned long new, int size) 114
132{ 115#define sync_cmpxchg(ptr, old, new) \
133 unsigned long prev; 116 __sync_cmpxchg((ptr), (old), (new), sizeof(*ptr))
134 switch (size) { 117
135 case 1: 118#define cmpxchg_local(ptr, old, new) \
136 asm volatile("cmpxchgb %b1,%2" 119 __cmpxchg_local((ptr), (old), (new), sizeof(*ptr))
137 : "=a"(prev)
138 : "q"(new), "m"(*__xg(ptr)), "0"(old)
139 : "memory");
140 return prev;
141 case 2:
142 asm volatile("cmpxchgw %w1,%2"
143 : "=a"(prev)
144 : "r"(new), "m"(*__xg(ptr)), "0"(old)
145 : "memory");
146 return prev;
147 case 4:
148 asm volatile("cmpxchgl %k1,%2"
149 : "=a"(prev)
150 : "r"(new), "m"(*__xg(ptr)), "0"(old)
151 : "memory");
152 return prev;
153 case 8:
154 asm volatile("cmpxchgq %1,%2"
155 : "=a"(prev)
156 : "r"(new), "m"(*__xg(ptr)), "0"(old)
157 : "memory");
158 return prev;
159 }
160 return old;
161}
162 120
163#define cmpxchg(ptr, o, n) \
164 ((__typeof__(*(ptr)))__cmpxchg((ptr), (unsigned long)(o), \
165 (unsigned long)(n), sizeof(*(ptr))))
166#define cmpxchg64(ptr, o, n) \ 121#define cmpxchg64(ptr, o, n) \
167({ \ 122({ \
168 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 123 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
169 cmpxchg((ptr), (o), (n)); \ 124 cmpxchg((ptr), (o), (n)); \
170}) 125})
171#define cmpxchg_local(ptr, o, n) \ 126
172 ((__typeof__(*(ptr)))__cmpxchg_local((ptr), (unsigned long)(o), \
173 (unsigned long)(n), \
174 sizeof(*(ptr))))
175#define sync_cmpxchg(ptr, o, n) \
176 ((__typeof__(*(ptr)))__sync_cmpxchg((ptr), (unsigned long)(o), \
177 (unsigned long)(n), \
178 sizeof(*(ptr))))
179#define cmpxchg64_local(ptr, o, n) \ 127#define cmpxchg64_local(ptr, o, n) \
180({ \ 128({ \
181 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \ 129 BUILD_BUG_ON(sizeof(*(ptr)) != 8); \
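
cmpxchg64() and cmpxchg64_local() keep their BUILD_BUG_ON(sizeof(*(ptr)) != 8) guard, so passing anything but an 8-byte object is a compile-time error rather than a runtime surprise. A stand-alone sketch of the same compile-time size assertion (the minimal BUILD_BUG_ON below is a common form assumed for the example, not copied from this patch):

#include <stdint.h>
#include <stdio.h>

/* Minimal BUILD_BUG_ON: a negative array size makes the compiler reject the build. */
#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))

#define checked_load64(ptr)                             \
({                                                      \
        BUILD_BUG_ON(sizeof(*(ptr)) != 8);              \
        *(volatile uint64_t *)(ptr);                    \
})

int main(void)
{
        uint64_t v = 42;

        printf("%llu\n", (unsigned long long)checked_load64(&v));
        /* uint32_t w; checked_load64(&w); -- would not compile: wrong size */
        return 0;
}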
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9a9c7bdc923d..306160e58b48 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -8,7 +8,8 @@
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <asm/user32.h> 9#include <asm/user32.h>
10 10
11#define COMPAT_USER_HZ 100 11#define COMPAT_USER_HZ 100
12#define COMPAT_UTS_MACHINE "i686\0\0"
12 13
13typedef u32 compat_size_t; 14typedef u32 compat_size_t;
14typedef s32 compat_ssize_t; 15typedef s32 compat_ssize_t;
diff --git a/arch/x86/include/asm/cpu_debug.h b/arch/x86/include/asm/cpu_debug.h
deleted file mode 100644
index d96c1ee3a95c..000000000000
--- a/arch/x86/include/asm/cpu_debug.h
+++ /dev/null
@@ -1,127 +0,0 @@
1#ifndef _ASM_X86_CPU_DEBUG_H
2#define _ASM_X86_CPU_DEBUG_H
3
4/*
5 * CPU x86 architecture debug
6 *
7 * Copyright(C) 2009 Jaswinder Singh Rajput
8 */
9
10/* Register flags */
11enum cpu_debug_bit {
12/* Model Specific Registers (MSRs) */
13 CPU_MC_BIT, /* Machine Check */
14 CPU_MONITOR_BIT, /* Monitor */
15 CPU_TIME_BIT, /* Time */
16 CPU_PMC_BIT, /* Performance Monitor */
17 CPU_PLATFORM_BIT, /* Platform */
18 CPU_APIC_BIT, /* APIC */
19 CPU_POWERON_BIT, /* Power-on */
20 CPU_CONTROL_BIT, /* Control */
21 CPU_FEATURES_BIT, /* Features control */
22 CPU_LBRANCH_BIT, /* Last Branch */
23 CPU_BIOS_BIT, /* BIOS */
24 CPU_FREQ_BIT, /* Frequency */
25 CPU_MTTR_BIT, /* MTRR */
26 CPU_PERF_BIT, /* Performance */
27 CPU_CACHE_BIT, /* Cache */
28 CPU_SYSENTER_BIT, /* Sysenter */
29 CPU_THERM_BIT, /* Thermal */
30 CPU_MISC_BIT, /* Miscellaneous */
31 CPU_DEBUG_BIT, /* Debug */
32 CPU_PAT_BIT, /* PAT */
33 CPU_VMX_BIT, /* VMX */
34 CPU_CALL_BIT, /* System Call */
35 CPU_BASE_BIT, /* BASE Address */
36 CPU_VER_BIT, /* Version ID */
37 CPU_CONF_BIT, /* Configuration */
38 CPU_SMM_BIT, /* System mgmt mode */
39 CPU_SVM_BIT, /*Secure Virtual Machine*/
40 CPU_OSVM_BIT, /* OS-Visible Workaround*/
41/* Standard Registers */
42 CPU_TSS_BIT, /* Task Stack Segment */
43 CPU_CR_BIT, /* Control Registers */
44 CPU_DT_BIT, /* Descriptor Table */
45/* End of Registers flags */
46 CPU_REG_ALL_BIT, /* Select all Registers */
47};
48
49#define CPU_REG_ALL (~0) /* Select all Registers */
50
51#define CPU_MC (1 << CPU_MC_BIT)
52#define CPU_MONITOR (1 << CPU_MONITOR_BIT)
53#define CPU_TIME (1 << CPU_TIME_BIT)
54#define CPU_PMC (1 << CPU_PMC_BIT)
55#define CPU_PLATFORM (1 << CPU_PLATFORM_BIT)
56#define CPU_APIC (1 << CPU_APIC_BIT)
57#define CPU_POWERON (1 << CPU_POWERON_BIT)
58#define CPU_CONTROL (1 << CPU_CONTROL_BIT)
59#define CPU_FEATURES (1 << CPU_FEATURES_BIT)
60#define CPU_LBRANCH (1 << CPU_LBRANCH_BIT)
61#define CPU_BIOS (1 << CPU_BIOS_BIT)
62#define CPU_FREQ (1 << CPU_FREQ_BIT)
63#define CPU_MTRR (1 << CPU_MTTR_BIT)
64#define CPU_PERF (1 << CPU_PERF_BIT)
65#define CPU_CACHE (1 << CPU_CACHE_BIT)
66#define CPU_SYSENTER (1 << CPU_SYSENTER_BIT)
67#define CPU_THERM (1 << CPU_THERM_BIT)
68#define CPU_MISC (1 << CPU_MISC_BIT)
69#define CPU_DEBUG (1 << CPU_DEBUG_BIT)
70#define CPU_PAT (1 << CPU_PAT_BIT)
71#define CPU_VMX (1 << CPU_VMX_BIT)
72#define CPU_CALL (1 << CPU_CALL_BIT)
73#define CPU_BASE (1 << CPU_BASE_BIT)
74#define CPU_VER (1 << CPU_VER_BIT)
75#define CPU_CONF (1 << CPU_CONF_BIT)
76#define CPU_SMM (1 << CPU_SMM_BIT)
77#define CPU_SVM (1 << CPU_SVM_BIT)
78#define CPU_OSVM (1 << CPU_OSVM_BIT)
79#define CPU_TSS (1 << CPU_TSS_BIT)
80#define CPU_CR (1 << CPU_CR_BIT)
81#define CPU_DT (1 << CPU_DT_BIT)
82
83/* Register file flags */
84enum cpu_file_bit {
85 CPU_INDEX_BIT, /* index */
86 CPU_VALUE_BIT, /* value */
87};
88
89#define CPU_FILE_VALUE (1 << CPU_VALUE_BIT)
90
91#define MAX_CPU_FILES 512
92
93struct cpu_private {
94 unsigned cpu;
95 unsigned type;
96 unsigned reg;
97 unsigned file;
98};
99
100struct cpu_debug_base {
101 char *name; /* Register name */
102 unsigned flag; /* Register flag */
103 unsigned write; /* Register write flag */
104};
105
106/*
107 * Currently it looks similar to cpu_debug_base but once we add more files
108 * cpu_file_base will go in different direction
109 */
110struct cpu_file_base {
111 char *name; /* Register file name */
112 unsigned flag; /* Register file flag */
113 unsigned write; /* Register write flag */
114};
115
116struct cpu_cpuX_base {
117 struct dentry *dentry; /* Register dentry */
118 int init; /* Register index file */
119};
120
121struct cpu_debug_range {
122 unsigned min; /* Register range min */
123 unsigned max; /* Register range max */
124 unsigned flag; /* Supported flags */
125};
126
127#endif /* _ASM_X86_CPU_DEBUG_H */
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 9cfc88b97742..0cd82d068613 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -153,6 +153,7 @@
153#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */ 153#define X86_FEATURE_SSE5 (6*32+11) /* SSE-5 */
154#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */ 154#define X86_FEATURE_SKINIT (6*32+12) /* SKINIT/STGI instructions */
155#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */ 155#define X86_FEATURE_WDT (6*32+13) /* Watchdog timer */
156#define X86_FEATURE_NODEID_MSR (6*32+19) /* NodeId MSR */
156 157
157/* 158/*
158 * Auxiliary flags: Linux defined - For features scattered in various 159 * Auxiliary flags: Linux defined - For features scattered in various
@@ -167,6 +168,10 @@
167#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */ 168#define X86_FEATURE_FLEXPRIORITY (8*32+ 2) /* Intel FlexPriority */
168#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */ 169#define X86_FEATURE_EPT (8*32+ 3) /* Intel Extended Page Table */
169#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */ 170#define X86_FEATURE_VPID (8*32+ 4) /* Intel Virtual Processor ID */
171#define X86_FEATURE_NPT (8*32+5) /* AMD Nested Page Table support */
172#define X86_FEATURE_LBRV (8*32+6) /* AMD LBR Virtualization support */
173#define X86_FEATURE_SVML (8*32+7) /* "svm_lock" AMD SVM locking MSR */
174#define X86_FEATURE_NRIPS (8*32+8) /* "nrip_save" AMD SVM next_rip save */
170 175
171#if defined(__KERNEL__) && !defined(__ASSEMBLY__) 176#if defined(__KERNEL__) && !defined(__ASSEMBLY__)
172 177
@@ -248,6 +253,7 @@ extern const char * const x86_power_flags[32];
248#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC) 253#define cpu_has_x2apic boot_cpu_has(X86_FEATURE_X2APIC)
249#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE) 254#define cpu_has_xsave boot_cpu_has(X86_FEATURE_XSAVE)
250#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR) 255#define cpu_has_hypervisor boot_cpu_has(X86_FEATURE_HYPERVISOR)
256#define cpu_has_pclmulqdq boot_cpu_has(X86_FEATURE_PCLMULQDQ)
251 257
252#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64) 258#if defined(CONFIG_X86_INVLPG) || defined(CONFIG_X86_64)
253# define cpu_has_invlpg 1 259# define cpu_has_invlpg 1
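
Feature flags such as X86_FEATURE_NODEID_MSR (6*32+19) pack a capability-word index and a bit index into one number: word = flag / 32, bit = flag % 32, which is how boot_cpu_has() locates the bit. A minimal stand-alone sketch of that lookup (the NCAPINTS value and the array contents are assumptions for the example):

#include <stdio.h>

#define NCAPINTS                 9          /* number of 32-bit capability words (assumed) */
#define X86_FEATURE_NODEID_MSR   (6*32+19)  /* capability word 6, bit 19 */

static unsigned int caps[NCAPINTS];

/* word = feature / 32, bit = feature % 32 -- the same split boot_cpu_has() uses */
static int test_cpu_cap(unsigned int feature)
{
        return (caps[feature / 32] >> (feature % 32)) & 1;
}

int main(void)
{
        caps[6] |= 1u << 19;    /* pretend the CPU advertised the NodeId MSR */
        printf("NODEID_MSR: %d\n", test_cpu_cap(X86_FEATURE_NODEID_MSR));
        return 0;
}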
diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h
index 3ea6f37be9e2..b81002f23614 100644
--- a/arch/x86/include/asm/debugreg.h
+++ b/arch/x86/include/asm/debugreg.h
@@ -14,10 +14,14 @@
14 which debugging register was responsible for the trap. The other bits 14 which debugging register was responsible for the trap. The other bits
15 are either reserved or not of interest to us. */ 15 are either reserved or not of interest to us. */
16 16
17/* Define reserved bits in DR6 which are always set to 1 */
18#define DR6_RESERVED (0xFFFF0FF0)
19
17#define DR_TRAP0 (0x1) /* db0 */ 20#define DR_TRAP0 (0x1) /* db0 */
18#define DR_TRAP1 (0x2) /* db1 */ 21#define DR_TRAP1 (0x2) /* db1 */
19#define DR_TRAP2 (0x4) /* db2 */ 22#define DR_TRAP2 (0x4) /* db2 */
20#define DR_TRAP3 (0x8) /* db3 */ 23#define DR_TRAP3 (0x8) /* db3 */
24#define DR_TRAP_BITS (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)
21 25
22#define DR_STEP (0x4000) /* single-step */ 26#define DR_STEP (0x4000) /* single-step */
23#define DR_SWITCH (0x8000) /* task switch */ 27#define DR_SWITCH (0x8000) /* task switch */
@@ -49,6 +53,8 @@
49 53
50#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */ 54#define DR_LOCAL_ENABLE_SHIFT 0 /* Extra shift to the local enable bit */
51#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */ 55#define DR_GLOBAL_ENABLE_SHIFT 1 /* Extra shift to the global enable bit */
56#define DR_LOCAL_ENABLE (0x1) /* Local enable for reg 0 */
57#define DR_GLOBAL_ENABLE (0x2) /* Global enable for reg 0 */
52#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */ 58#define DR_ENABLE_SIZE 2 /* 2 enable bits per register */
53 59
54#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */ 60#define DR_LOCAL_ENABLE_MASK (0x55) /* Set local bits for all 4 regs */
@@ -67,4 +73,34 @@
67#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */ 73#define DR_LOCAL_SLOWDOWN (0x100) /* Local slow the pipeline */
68#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */ 74#define DR_GLOBAL_SLOWDOWN (0x200) /* Global slow the pipeline */
69 75
76/*
77 * HW breakpoint additions
78 */
79#ifdef __KERNEL__
80
81DECLARE_PER_CPU(unsigned long, cpu_dr7);
82
83static inline void hw_breakpoint_disable(void)
84{
85 /* Zero the control register for HW Breakpoint */
86 set_debugreg(0UL, 7);
87
88 /* Zero-out the individual HW breakpoint address registers */
89 set_debugreg(0UL, 0);
90 set_debugreg(0UL, 1);
91 set_debugreg(0UL, 2);
92 set_debugreg(0UL, 3);
93}
94
95static inline int hw_breakpoint_active(void)
96{
97 return __get_cpu_var(cpu_dr7) & DR_GLOBAL_ENABLE_MASK;
98}
99
100extern void aout_dump_debugregs(struct user *dump);
101
102extern void hw_breakpoint_restore(void);
103
104#endif /* __KERNEL__ */
105
70#endif /* _ASM_X86_DEBUGREG_H */ 106#endif /* _ASM_X86_DEBUGREG_H */
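
The new per-slot DR_LOCAL_ENABLE/DR_GLOBAL_ENABLE values reflect DR7's layout: slot N's enable bits sit at bit N * DR_ENABLE_SIZE, and its type/length nibble sits another DR_CONTROL_SIZE bits per slot above bit 16, which is what lets hw_breakpoint_active() answer with a single mask test. A stand-alone sketch that builds a DR7 value for one slot (DR_CONTROL_SHIFT = 16 and DR_CONTROL_SIZE = 4 are assumed here to match the hardware layout; they are not shown in this hunk):

#include <stdio.h>

#define DR_GLOBAL_ENABLE        0x2     /* global enable bit for slot 0 */
#define DR_ENABLE_SIZE          2       /* 2 enable bits per slot */
#define DR_CONTROL_SHIFT        16      /* type/len fields start at bit 16 (assumed) */
#define DR_CONTROL_SIZE         4       /* 2 type bits + 2 len bits per slot (assumed) */

/* Build a DR7 value with a single global breakpoint in the given slot. */
static unsigned long dr7_for_slot(int slot, unsigned int type, unsigned int len)
{
        unsigned long dr7 = 0;

        dr7 |= (unsigned long)DR_GLOBAL_ENABLE << (slot * DR_ENABLE_SIZE);
        dr7 |= (unsigned long)((len << 2) | type)
                        << (DR_CONTROL_SHIFT + slot * DR_CONTROL_SIZE);
        return dr7;
}

int main(void)
{
        /* slot 1, type 01b (data write), len 11b (4 bytes) -> 0xd00008 */
        printf("dr7 = %#lx\n", dr7_for_slot(1, 0x1, 0x3));
        return 0;
}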
diff --git a/arch/x86/include/asm/desc_defs.h b/arch/x86/include/asm/desc_defs.h
index 9d6684849fd9..278441f39856 100644
--- a/arch/x86/include/asm/desc_defs.h
+++ b/arch/x86/include/asm/desc_defs.h
@@ -12,9 +12,9 @@
12#include <linux/types.h> 12#include <linux/types.h>
13 13
14/* 14/*
15 * FIXME: Acessing the desc_struct through its fields is more elegant, 15 * FIXME: Accessing the desc_struct through its fields is more elegant,
16 * and should be the one valid thing to do. However, a lot of open code 16 * and should be the one valid thing to do. However, a lot of open code
17 * still touches the a and b acessors, and doing this allow us to do it 17 * still touches the a and b accessors, and doing this allow us to do it
18 * incrementally. We keep the signature as a struct, rather than an union, 18 * incrementally. We keep the signature as a struct, rather than an union,
19 * so we can get rid of it transparently in the future -- glommer 19 * so we can get rid of it transparently in the future -- glommer
20 */ 20 */
diff --git a/arch/x86/include/asm/device.h b/arch/x86/include/asm/device.h
index cee34e9ca45b..029f230ab637 100644
--- a/arch/x86/include/asm/device.h
+++ b/arch/x86/include/asm/device.h
@@ -8,7 +8,7 @@ struct dev_archdata {
8#ifdef CONFIG_X86_64 8#ifdef CONFIG_X86_64
9struct dma_map_ops *dma_ops; 9struct dma_map_ops *dma_ops;
10#endif 10#endif
11#ifdef CONFIG_DMAR 11#if defined(CONFIG_DMAR) || defined(CONFIG_AMD_IOMMU)
12 void *iommu; /* hook for IOMMU specific extension */ 12 void *iommu; /* hook for IOMMU specific extension */
13#endif 13#endif
14}; 14};
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 6a25d5d42836..ac91eed21061 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -20,7 +20,8 @@
20# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32) 20# define ISA_DMA_BIT_MASK DMA_BIT_MASK(32)
21#endif 21#endif
22 22
23extern dma_addr_t bad_dma_address; 23#define DMA_ERROR_CODE 0
24
24extern int iommu_merge; 25extern int iommu_merge;
25extern struct device x86_dma_fallback_dev; 26extern struct device x86_dma_fallback_dev;
26extern int panic_on_overflow; 27extern int panic_on_overflow;
@@ -48,7 +49,7 @@ static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
48 if (ops->mapping_error) 49 if (ops->mapping_error)
49 return ops->mapping_error(dev, dma_addr); 50 return ops->mapping_error(dev, dma_addr);
50 51
51 return (dma_addr == bad_dma_address); 52 return (dma_addr == DMA_ERROR_CODE);
52} 53}
53 54
54#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f) 55#define dma_alloc_noncoherent(d, s, h, f) dma_alloc_coherent(d, s, h, f)
@@ -66,7 +67,7 @@ static inline bool dma_capable(struct device *dev, dma_addr_t addr, size_t size)
66 if (!dev->dma_mask) 67 if (!dev->dma_mask)
67 return 0; 68 return 0;
68 69
69 return addr + size <= *dev->dma_mask; 70 return addr + size - 1 <= *dev->dma_mask;
70} 71}
71 72
72static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr) 73static inline dma_addr_t phys_to_dma(struct device *dev, phys_addr_t paddr)
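
The dma_capable() change is an inclusive-bound fix: a buffer is addressable when its last byte, addr + size - 1, is covered by the mask; the old addr + size <= mask test wrongly rejected a buffer ending exactly at the mask boundary. A small stand-alone illustration (the mask and addresses are example values only):

#include <stdint.h>
#include <stdio.h>

static int old_capable(uint64_t addr, uint64_t size, uint64_t mask)
{
        return addr + size <= mask;             /* off by one at the boundary */
}

static int new_capable(uint64_t addr, uint64_t size, uint64_t mask)
{
        return addr + size - 1 <= mask;         /* last byte of the buffer */
}

int main(void)
{
        uint64_t mask = 0xffffffffULL;          /* 32-bit DMA mask */
        uint64_t addr = 0xfffff000ULL;          /* 4 KiB buffer ending exactly at 4 GiB */
        uint64_t size = 0x1000;

        printf("old: %d  new: %d\n",
               old_capable(addr, size, mask),   /* 0: falsely rejected */
               new_capable(addr, size, mask));  /* 1: last byte 0xffffffff is reachable */
        return 0;
}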
diff --git a/arch/x86/include/asm/e820.h b/arch/x86/include/asm/e820.h
index 40b4e614fe71..0e22296790d3 100644
--- a/arch/x86/include/asm/e820.h
+++ b/arch/x86/include/asm/e820.h
@@ -61,6 +61,12 @@ struct e820map {
61 struct e820entry map[E820_X_MAX]; 61 struct e820entry map[E820_X_MAX];
62}; 62};
63 63
64#define ISA_START_ADDRESS 0xa0000
65#define ISA_END_ADDRESS 0x100000
66
67#define BIOS_BEGIN 0x000a0000
68#define BIOS_END 0x00100000
69
64#ifdef __KERNEL__ 70#ifdef __KERNEL__
65/* see comment in arch/x86/kernel/e820.c */ 71/* see comment in arch/x86/kernel/e820.c */
66extern struct e820map e820; 72extern struct e820map e820;
@@ -105,11 +111,8 @@ extern unsigned long end_user_pfn;
105 111
106extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align); 112extern u64 find_e820_area(u64 start, u64 end, u64 size, u64 align);
107extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align); 113extern u64 find_e820_area_size(u64 start, u64 *sizep, u64 align);
108extern void reserve_early(u64 start, u64 end, char *name);
109extern void reserve_early_overlap_ok(u64 start, u64 end, char *name);
110extern void free_early(u64 start, u64 end);
111extern void early_res_to_bootmem(u64 start, u64 end);
112extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align); 114extern u64 early_reserve_e820(u64 startt, u64 sizet, u64 align);
115#include <linux/early_res.h>
113 116
114extern unsigned long e820_end_of_ram_pfn(void); 117extern unsigned long e820_end_of_ram_pfn(void);
115extern unsigned long e820_end_of_low_ram_pfn(void); 118extern unsigned long e820_end_of_low_ram_pfn(void);
@@ -126,15 +129,18 @@ extern void e820_reserve_resources(void);
126extern void e820_reserve_resources_late(void); 129extern void e820_reserve_resources_late(void);
127extern void setup_memory_map(void); 130extern void setup_memory_map(void);
128extern char *default_machine_specific_memory_setup(void); 131extern char *default_machine_specific_memory_setup(void);
129#endif /* __KERNEL__ */
130#endif /* __ASSEMBLY__ */
131 132
132#define ISA_START_ADDRESS 0xa0000 133/*
133#define ISA_END_ADDRESS 0x100000 134 * Returns true iff the specified range [s,e) is completely contained inside
134#define is_ISA_range(s, e) ((s) >= ISA_START_ADDRESS && (e) < ISA_END_ADDRESS) 135 * the ISA region.
136 */
137static inline bool is_ISA_range(u64 s, u64 e)
138{
139 return s >= ISA_START_ADDRESS && e <= ISA_END_ADDRESS;
140}
135 141
136#define BIOS_BEGIN 0x000a0000 142#endif /* __KERNEL__ */
137#define BIOS_END 0x00100000 143#endif /* __ASSEMBLY__ */
138 144
139#ifdef __KERNEL__ 145#ifdef __KERNEL__
140#include <linux/ioport.h> 146#include <linux/ioport.h>
diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h
index 456a304b8172..f2ad2163109d 100644
--- a/arch/x86/include/asm/elf.h
+++ b/arch/x86/include/asm/elf.h
@@ -157,19 +157,6 @@ do { \
157 157
158#define compat_elf_check_arch(x) elf_check_arch_ia32(x) 158#define compat_elf_check_arch(x) elf_check_arch_ia32(x)
159 159
160static inline void start_ia32_thread(struct pt_regs *regs, u32 ip, u32 sp)
161{
162 loadsegment(fs, 0);
163 loadsegment(ds, __USER32_DS);
164 loadsegment(es, __USER32_DS);
165 load_gs_index(0);
166 regs->ip = ip;
167 regs->sp = sp;
168 regs->flags = X86_EFLAGS_IF;
169 regs->cs = __USER32_CS;
170 regs->ss = __USER32_DS;
171}
172
173static inline void elf_common_init(struct thread_struct *t, 160static inline void elf_common_init(struct thread_struct *t,
174 struct pt_regs *regs, const u16 ds) 161 struct pt_regs *regs, const u16 ds)
175{ 162{
@@ -183,28 +170,16 @@ static inline void elf_common_init(struct thread_struct *t,
183} 170}
184 171
185#define ELF_PLAT_INIT(_r, load_addr) \ 172#define ELF_PLAT_INIT(_r, load_addr) \
186do { \ 173 elf_common_init(&current->thread, _r, 0)
187 elf_common_init(&current->thread, _r, 0); \
188 clear_thread_flag(TIF_IA32); \
189} while (0)
190 174
191#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \ 175#define COMPAT_ELF_PLAT_INIT(regs, load_addr) \
192 elf_common_init(&current->thread, regs, __USER_DS) 176 elf_common_init(&current->thread, regs, __USER_DS)
193 177
194#define compat_start_thread(regs, ip, sp) \ 178void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp);
195do { \ 179#define compat_start_thread start_thread_ia32
196 start_ia32_thread(regs, ip, sp); \
197 set_fs(USER_DS); \
198} while (0)
199 180
200#define COMPAT_SET_PERSONALITY(ex) \ 181void set_personality_ia32(void);
201do { \ 182#define COMPAT_SET_PERSONALITY(ex) set_personality_ia32()
202 if (test_thread_flag(TIF_IA32)) \
203 clear_thread_flag(TIF_ABI_PENDING); \
204 else \
205 set_thread_flag(TIF_ABI_PENDING); \
206 current->personality |= force_personality32; \
207} while (0)
208 183
209#define COMPAT_ELF_PLATFORM ("i686") 184#define COMPAT_ELF_PLATFORM ("i686")
210 185
@@ -255,7 +230,6 @@ extern int force_personality32;
255#endif /* !CONFIG_X86_32 */ 230#endif /* !CONFIG_X86_32 */
256 231
257#define CORE_DUMP_USE_REGSET 232#define CORE_DUMP_USE_REGSET
258#define USE_ELF_CORE_DUMP
259#define ELF_EXEC_PAGESIZE 4096 233#define ELF_EXEC_PAGESIZE 4096
260 234
261/* This is the location that an ET_DYN program is loaded if exec'ed. Typical 235/* This is the location that an ET_DYN program is loaded if exec'ed. Typical
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 19e22e3784d0..5d07dea2ebb8 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -35,7 +35,7 @@ BUILD_INTERRUPT3(invalidate_interrupt7,INVALIDATE_TLB_VECTOR_START+7,
35 smp_invalidate_interrupt) 35 smp_invalidate_interrupt)
36#endif 36#endif
37 37
38BUILD_INTERRUPT(generic_interrupt, GENERIC_INTERRUPT_VECTOR) 38BUILD_INTERRUPT(x86_platform_ipi, X86_PLATFORM_IPI_VECTOR)
39 39
40/* 40/*
41 * every pentium local APIC has two 'local interrupts', with a 41 * every pentium local APIC has two 'local interrupts', with a
diff --git a/arch/x86/include/asm/fb.h b/arch/x86/include/asm/fb.h
index 53018464aea6..2519d0679d99 100644
--- a/arch/x86/include/asm/fb.h
+++ b/arch/x86/include/asm/fb.h
@@ -12,10 +12,6 @@ static inline void fb_pgprotect(struct file *file, struct vm_area_struct *vma,
12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD; 12 pgprot_val(vma->vm_page_prot) |= _PAGE_PCD;
13} 13}
14 14
15#ifdef CONFIG_X86_32
16extern int fb_is_primary_device(struct fb_info *info); 15extern int fb_is_primary_device(struct fb_info *info);
17#else
18static inline int fb_is_primary_device(struct fb_info *info) { return 0; }
19#endif
20 16
21#endif /* _ASM_X86_FB_H */ 17#endif /* _ASM_X86_FB_H */
diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h
index 14f9890eb495..d07b44f7d1dc 100644
--- a/arch/x86/include/asm/fixmap.h
+++ b/arch/x86/include/asm/fixmap.h
@@ -82,6 +82,9 @@ enum fixed_addresses {
82#endif 82#endif
83 FIX_DBGP_BASE, 83 FIX_DBGP_BASE,
84 FIX_EARLYCON_MEM_BASE, 84 FIX_EARLYCON_MEM_BASE,
85#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT
86 FIX_OHCI1394_BASE,
87#endif
85#ifdef CONFIG_X86_LOCAL_APIC 88#ifdef CONFIG_X86_LOCAL_APIC
86 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */ 89 FIX_APIC_BASE, /* local (CPU) APIC) -- required for SMP or not */
87#endif 90#endif
@@ -118,17 +121,20 @@ enum fixed_addresses {
118 * 256 temporary boot-time mappings, used by early_ioremap(), 121 * 256 temporary boot-time mappings, used by early_ioremap(),
119 * before ioremap() is functional. 122 * before ioremap() is functional.
120 * 123 *
121 * We round it up to the next 256 pages boundary so that we 124 * If necessary we round it up to the next 256 pages boundary so
122 * can have a single pgd entry and a single pte table: 125 * that we can have a single pgd entry and a single pte table:
123 */ 126 */
124#define NR_FIX_BTMAPS 64 127#define NR_FIX_BTMAPS 64
125#define FIX_BTMAPS_SLOTS 4 128#define FIX_BTMAPS_SLOTS 4
126 FIX_BTMAP_END = __end_of_permanent_fixed_addresses + 256 - 129#define TOTAL_FIX_BTMAPS (NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS)
127 (__end_of_permanent_fixed_addresses & 255), 130 FIX_BTMAP_END =
128 FIX_BTMAP_BEGIN = FIX_BTMAP_END + NR_FIX_BTMAPS*FIX_BTMAPS_SLOTS - 1, 131 (__end_of_permanent_fixed_addresses ^
129#ifdef CONFIG_PROVIDE_OHCI1394_DMA_INIT 132 (__end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS - 1)) &
130 FIX_OHCI1394_BASE, 133 -PTRS_PER_PTE
131#endif 134 ? __end_of_permanent_fixed_addresses + TOTAL_FIX_BTMAPS -
135 (__end_of_permanent_fixed_addresses & (TOTAL_FIX_BTMAPS - 1))
136 : __end_of_permanent_fixed_addresses,
137 FIX_BTMAP_BEGIN = FIX_BTMAP_END + TOTAL_FIX_BTMAPS - 1,
132#ifdef CONFIG_X86_32 138#ifdef CONFIG_X86_32
133 FIX_WP_TEST, 139 FIX_WP_TEST,
134#endif 140#endif
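
The new FIX_BTMAP_END expression only rounds up when it must: (a ^ (a + N - 1)) & -PTRS_PER_PTE is non-zero exactly when indices a and a + N - 1 land in different PTRS_PER_PTE-sized blocks, i.e. when the TOTAL_FIX_BTMAPS boot-time slots would straddle a pte-table boundary. A stand-alone demonstration of that crossing test and the conditional round-up (PTRS_PER_PTE = 512 is the 64-bit value, assumed here for the example):

#include <stdio.h>

#define PTRS_PER_PTE            512     /* slots per pte table (x86-64 value, assumed) */
#define TOTAL_FIX_BTMAPS        256     /* NR_FIX_BTMAPS * FIX_BTMAPS_SLOTS */

/* Non-zero when [start, start + TOTAL_FIX_BTMAPS) crosses a pte-table boundary. */
static unsigned long crosses_pte_table(unsigned long start)
{
        return (start ^ (start + TOTAL_FIX_BTMAPS - 1)) & -PTRS_PER_PTE;
}

/* Mirror of the FIX_BTMAP_END expression: round up only when a crossing would occur. */
static unsigned long btmap_end(unsigned long end_of_permanent)
{
        if (crosses_pte_table(end_of_permanent))
                return end_of_permanent + TOTAL_FIX_BTMAPS -
                       (end_of_permanent & (TOTAL_FIX_BTMAPS - 1));
        return end_of_permanent;
}

int main(void)
{
        printf("end=100 -> FIX_BTMAP_END=%lu (no crossing, kept as is)\n", btmap_end(100));
        printf("end=400 -> FIX_BTMAP_END=%lu (would cross, rounded up)\n", btmap_end(400));
        return 0;
}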
diff --git a/arch/x86/include/asm/gart.h b/arch/x86/include/asm/gart.h
index 6cfdafa409d8..4ac5b0f33fc1 100644
--- a/arch/x86/include/asm/gart.h
+++ b/arch/x86/include/asm/gart.h
@@ -35,8 +35,7 @@ extern int gart_iommu_aperture_allowed;
35extern int gart_iommu_aperture_disabled; 35extern int gart_iommu_aperture_disabled;
36 36
37extern void early_gart_iommu_check(void); 37extern void early_gart_iommu_check(void);
38extern void gart_iommu_init(void); 38extern int gart_iommu_init(void);
39extern void gart_iommu_shutdown(void);
40extern void __init gart_parse_options(char *); 39extern void __init gart_parse_options(char *);
41extern void gart_iommu_hole_init(void); 40extern void gart_iommu_hole_init(void);
42 41
@@ -48,12 +47,6 @@ extern void gart_iommu_hole_init(void);
48static inline void early_gart_iommu_check(void) 47static inline void early_gart_iommu_check(void)
49{ 48{
50} 49}
51static inline void gart_iommu_init(void)
52{
53}
54static inline void gart_iommu_shutdown(void)
55{
56}
57static inline void gart_parse_options(char *options) 50static inline void gart_parse_options(char *options)
58{ 51{
59} 52}
diff --git a/arch/x86/include/asm/geode.h b/arch/x86/include/asm/geode.h
index ad3c2ed75481..7cd73552a4e8 100644
--- a/arch/x86/include/asm/geode.h
+++ b/arch/x86/include/asm/geode.h
@@ -12,160 +12,7 @@
12 12
13#include <asm/processor.h> 13#include <asm/processor.h>
14#include <linux/io.h> 14#include <linux/io.h>
15 15#include <linux/cs5535.h>
16/* Generic southbridge functions */
17
18#define GEODE_DEV_PMS 0
19#define GEODE_DEV_ACPI 1
20#define GEODE_DEV_GPIO 2
21#define GEODE_DEV_MFGPT 3
22
23extern int geode_get_dev_base(unsigned int dev);
24
25/* Useful macros */
26#define geode_pms_base() geode_get_dev_base(GEODE_DEV_PMS)
27#define geode_acpi_base() geode_get_dev_base(GEODE_DEV_ACPI)
28#define geode_gpio_base() geode_get_dev_base(GEODE_DEV_GPIO)
29#define geode_mfgpt_base() geode_get_dev_base(GEODE_DEV_MFGPT)
30
31/* MSRS */
32
33#define MSR_GLIU_P2D_RO0 0x10000029
34
35#define MSR_LX_GLD_MSR_CONFIG 0x48002001
36#define MSR_LX_MSR_PADSEL 0x48002011 /* NOT 0x48000011; the data
37 * sheet has the wrong value */
38#define MSR_GLCP_SYS_RSTPLL 0x4C000014
39#define MSR_GLCP_DOTPLL 0x4C000015
40
41#define MSR_LBAR_SMB 0x5140000B
42#define MSR_LBAR_GPIO 0x5140000C
43#define MSR_LBAR_MFGPT 0x5140000D
44#define MSR_LBAR_ACPI 0x5140000E
45#define MSR_LBAR_PMS 0x5140000F
46
47#define MSR_DIVIL_SOFT_RESET 0x51400017
48
49#define MSR_PIC_YSEL_LOW 0x51400020
50#define MSR_PIC_YSEL_HIGH 0x51400021
51#define MSR_PIC_ZSEL_LOW 0x51400022
52#define MSR_PIC_ZSEL_HIGH 0x51400023
53#define MSR_PIC_IRQM_LPC 0x51400025
54
55#define MSR_MFGPT_IRQ 0x51400028
56#define MSR_MFGPT_NR 0x51400029
57#define MSR_MFGPT_SETUP 0x5140002B
58
59#define MSR_LX_SPARE_MSR 0x80000011 /* DC-specific */
60
61#define MSR_GX_GLD_MSR_CONFIG 0xC0002001
62#define MSR_GX_MSR_PADSEL 0xC0002011
63
64/* Resource Sizes */
65
66#define LBAR_GPIO_SIZE 0xFF
67#define LBAR_MFGPT_SIZE 0x40
68#define LBAR_ACPI_SIZE 0x40
69#define LBAR_PMS_SIZE 0x80
70
71/* ACPI registers (PMS block) */
72
73/*
74 * PM1_EN is only valid when VSA is enabled for 16 bit reads.
75 * When VSA is not enabled, *always* read both PM1_STS and PM1_EN
76 * with a 32 bit read at offset 0x0
77 */
78
79#define PM1_STS 0x00
80#define PM1_EN 0x02
81#define PM1_CNT 0x08
82#define PM2_CNT 0x0C
83#define PM_TMR 0x10
84#define PM_GPE0_STS 0x18
85#define PM_GPE0_EN 0x1C
86
87/* PMC registers (PMS block) */
88
89#define PM_SSD 0x00
90#define PM_SCXA 0x04
91#define PM_SCYA 0x08
92#define PM_OUT_SLPCTL 0x0C
93#define PM_SCLK 0x10
94#define PM_SED 0x1
95#define PM_SCXD 0x18
96#define PM_SCYD 0x1C
97#define PM_IN_SLPCTL 0x20
98#define PM_WKD 0x30
99#define PM_WKXD 0x34
100#define PM_RD 0x38
101#define PM_WKXA 0x3C
102#define PM_FSD 0x40
103#define PM_TSD 0x44
104#define PM_PSD 0x48
105#define PM_NWKD 0x4C
106#define PM_AWKD 0x50
107#define PM_SSC 0x54
108
109/* VSA2 magic values */
110
111#define VSA_VRC_INDEX 0xAC1C
112#define VSA_VRC_DATA 0xAC1E
113#define VSA_VR_UNLOCK 0xFC53 /* unlock virtual register */
114#define VSA_VR_SIGNATURE 0x0003
115#define VSA_VR_MEM_SIZE 0x0200
116#define AMD_VSA_SIG 0x4132 /* signature is ascii 'VSA2' */
117#define GSW_VSA_SIG 0x534d /* General Software signature */
118/* GPIO */
119
120#define GPIO_OUTPUT_VAL 0x00
121#define GPIO_OUTPUT_ENABLE 0x04
122#define GPIO_OUTPUT_OPEN_DRAIN 0x08
123#define GPIO_OUTPUT_INVERT 0x0C
124#define GPIO_OUTPUT_AUX1 0x10
125#define GPIO_OUTPUT_AUX2 0x14
126#define GPIO_PULL_UP 0x18
127#define GPIO_PULL_DOWN 0x1C
128#define GPIO_INPUT_ENABLE 0x20
129#define GPIO_INPUT_INVERT 0x24
130#define GPIO_INPUT_FILTER 0x28
131#define GPIO_INPUT_EVENT_COUNT 0x2C
132#define GPIO_READ_BACK 0x30
133#define GPIO_INPUT_AUX1 0x34
134#define GPIO_EVENTS_ENABLE 0x38
135#define GPIO_LOCK_ENABLE 0x3C
136#define GPIO_POSITIVE_EDGE_EN 0x40
137#define GPIO_NEGATIVE_EDGE_EN 0x44
138#define GPIO_POSITIVE_EDGE_STS 0x48
139#define GPIO_NEGATIVE_EDGE_STS 0x4C
140
141#define GPIO_MAP_X 0xE0
142#define GPIO_MAP_Y 0xE4
143#define GPIO_MAP_Z 0xE8
144#define GPIO_MAP_W 0xEC
145
146static inline u32 geode_gpio(unsigned int nr)
147{
148 BUG_ON(nr > 28);
149 return 1 << nr;
150}
151
152extern void geode_gpio_set(u32, unsigned int);
153extern void geode_gpio_clear(u32, unsigned int);
154extern int geode_gpio_isset(u32, unsigned int);
155extern void geode_gpio_setup_event(unsigned int, int, int);
156extern void geode_gpio_set_irq(unsigned int, unsigned int);
157
158static inline void geode_gpio_event_irq(unsigned int gpio, int pair)
159{
160 geode_gpio_setup_event(gpio, pair, 0);
161}
162
163static inline void geode_gpio_event_pme(unsigned int gpio, int pair)
164{
165 geode_gpio_setup_event(gpio, pair, 1);
166}
167
168/* Specific geode tests */
169 16
170static inline int is_geode_gx(void) 17static inline int is_geode_gx(void)
171{ 18{
@@ -186,68 +33,4 @@ static inline int is_geode(void)
186 return (is_geode_gx() || is_geode_lx()); 33 return (is_geode_gx() || is_geode_lx());
187} 34}
188 35
189#ifdef CONFIG_MGEODE_LX
190extern int geode_has_vsa2(void);
191#else
192static inline int geode_has_vsa2(void)
193{
194 return 0;
195}
196#endif
197
198/* MFGPTs */
199
200#define MFGPT_MAX_TIMERS 8
201#define MFGPT_TIMER_ANY (-1)
202
203#define MFGPT_DOMAIN_WORKING 1
204#define MFGPT_DOMAIN_STANDBY 2
205#define MFGPT_DOMAIN_ANY (MFGPT_DOMAIN_WORKING | MFGPT_DOMAIN_STANDBY)
206
207#define MFGPT_CMP1 0
208#define MFGPT_CMP2 1
209
210#define MFGPT_EVENT_IRQ 0
211#define MFGPT_EVENT_NMI 1
212#define MFGPT_EVENT_RESET 3
213
214#define MFGPT_REG_CMP1 0
215#define MFGPT_REG_CMP2 2
216#define MFGPT_REG_COUNTER 4
217#define MFGPT_REG_SETUP 6
218
219#define MFGPT_SETUP_CNTEN (1 << 15)
220#define MFGPT_SETUP_CMP2 (1 << 14)
221#define MFGPT_SETUP_CMP1 (1 << 13)
222#define MFGPT_SETUP_SETUP (1 << 12)
223#define MFGPT_SETUP_STOPEN (1 << 11)
224#define MFGPT_SETUP_EXTEN (1 << 10)
225#define MFGPT_SETUP_REVEN (1 << 5)
226#define MFGPT_SETUP_CLKSEL (1 << 4)
227
228static inline void geode_mfgpt_write(int timer, u16 reg, u16 value)
229{
230 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
231 outw(value, base + reg + (timer * 8));
232}
233
234static inline u16 geode_mfgpt_read(int timer, u16 reg)
235{
236 u32 base = geode_get_dev_base(GEODE_DEV_MFGPT);
237 return inw(base + reg + (timer * 8));
238}
239
240extern int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable);
241extern int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable);
242extern int geode_mfgpt_alloc_timer(int timer, int domain);
243
244#define geode_mfgpt_setup_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 1)
245#define geode_mfgpt_release_irq(t, c, i) geode_mfgpt_set_irq((t), (c), (i), 0)
246
247#ifdef CONFIG_GEODE_MFGPT_TIMER
248extern int __init mfgpt_timer_setup(void);
249#else
250static inline int mfgpt_timer_setup(void) { return 0; }
251#endif
252
253#endif /* _ASM_X86_GEODE_H */ 36#endif /* _ASM_X86_GEODE_H */
diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h
index 82e3e8f01043..0f8576427cfe 100644
--- a/arch/x86/include/asm/hardirq.h
+++ b/arch/x86/include/asm/hardirq.h
@@ -12,7 +12,7 @@ typedef struct {
12 unsigned int apic_timer_irqs; /* arch dependent */ 12 unsigned int apic_timer_irqs; /* arch dependent */
13 unsigned int irq_spurious_count; 13 unsigned int irq_spurious_count;
14#endif 14#endif
15 unsigned int generic_irqs; /* arch dependent */ 15 unsigned int x86_platform_ipis; /* arch dependent */
16 unsigned int apic_perf_irqs; 16 unsigned int apic_perf_irqs;
17 unsigned int apic_pending_irqs; 17 unsigned int apic_pending_irqs;
18#ifdef CONFIG_SMP 18#ifdef CONFIG_SMP
@@ -20,11 +20,11 @@ typedef struct {
20 unsigned int irq_call_count; 20 unsigned int irq_call_count;
21 unsigned int irq_tlb_count; 21 unsigned int irq_tlb_count;
22#endif 22#endif
23#ifdef CONFIG_X86_MCE 23#ifdef CONFIG_X86_THERMAL_VECTOR
24 unsigned int irq_thermal_count; 24 unsigned int irq_thermal_count;
25# ifdef CONFIG_X86_MCE_THRESHOLD 25#endif
26#ifdef CONFIG_X86_MCE_THRESHOLD
26 unsigned int irq_threshold_count; 27 unsigned int irq_threshold_count;
27# endif
28#endif 28#endif
29} ____cacheline_aligned irq_cpustat_t; 29} ____cacheline_aligned irq_cpustat_t;
30 30
diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h
index 014c2b85ae45..a726650fc80f 100644
--- a/arch/x86/include/asm/highmem.h
+++ b/arch/x86/include/asm/highmem.h
@@ -66,10 +66,6 @@ void *kmap_atomic_pfn(unsigned long pfn, enum km_type type);
66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); 66void *kmap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot);
67struct page *kmap_atomic_to_page(void *ptr); 67struct page *kmap_atomic_to_page(void *ptr);
68 68
69#ifndef CONFIG_PARAVIRT
70#define kmap_atomic_pte(page, type) kmap_atomic(page, type)
71#endif
72
73#define flush_cache_kmaps() do { } while (0) 69#define flush_cache_kmaps() do { } while (0)
74 70
75extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, 71extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn,
diff --git a/arch/x86/include/asm/hpet.h b/arch/x86/include/asm/hpet.h
index 1c22cb05ad6a..1d5c08a1bdfd 100644
--- a/arch/x86/include/asm/hpet.h
+++ b/arch/x86/include/asm/hpet.h
@@ -65,11 +65,13 @@
65/* hpet memory map physical address */ 65/* hpet memory map physical address */
66extern unsigned long hpet_address; 66extern unsigned long hpet_address;
67extern unsigned long force_hpet_address; 67extern unsigned long force_hpet_address;
68extern u8 hpet_blockid;
68extern int hpet_force_user; 69extern int hpet_force_user;
70extern u8 hpet_msi_disable;
69extern int is_hpet_enabled(void); 71extern int is_hpet_enabled(void);
70extern int hpet_enable(void); 72extern int hpet_enable(void);
71extern void hpet_disable(void); 73extern void hpet_disable(void);
72extern unsigned long hpet_readl(unsigned long a); 74extern unsigned int hpet_readl(unsigned int a);
73extern void force_hpet_resume(void); 75extern void force_hpet_resume(void);
74 76
75extern void hpet_msi_unmask(unsigned int irq); 77extern void hpet_msi_unmask(unsigned int irq);
@@ -78,9 +80,9 @@ extern void hpet_msi_write(unsigned int irq, struct msi_msg *msg);
78extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg); 80extern void hpet_msi_read(unsigned int irq, struct msi_msg *msg);
79 81
80#ifdef CONFIG_PCI_MSI 82#ifdef CONFIG_PCI_MSI
81extern int arch_setup_hpet_msi(unsigned int irq); 83extern int arch_setup_hpet_msi(unsigned int irq, unsigned int id);
82#else 84#else
83static inline int arch_setup_hpet_msi(unsigned int irq) 85static inline int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
84{ 86{
85 return -EINVAL; 87 return -EINVAL;
86} 88}
diff --git a/arch/x86/include/asm/hw_breakpoint.h b/arch/x86/include/asm/hw_breakpoint.h
new file mode 100644
index 000000000000..2a1bd8f4f23a
--- /dev/null
+++ b/arch/x86/include/asm/hw_breakpoint.h
@@ -0,0 +1,72 @@
1#ifndef _I386_HW_BREAKPOINT_H
2#define _I386_HW_BREAKPOINT_H
3
4#ifdef __KERNEL__
5#define __ARCH_HW_BREAKPOINT_H
6
7/*
8 * The name should probably be something dealt in
9 * a higher level. While dealing with the user
10 * (display/resolving)
11 */
12struct arch_hw_breakpoint {
13 unsigned long address;
14 u8 len;
15 u8 type;
16};
17
18#include <linux/kdebug.h>
19#include <linux/percpu.h>
20#include <linux/list.h>
21
22/* Available HW breakpoint length encodings */
23#define X86_BREAKPOINT_LEN_1 0x40
24#define X86_BREAKPOINT_LEN_2 0x44
25#define X86_BREAKPOINT_LEN_4 0x4c
26#define X86_BREAKPOINT_LEN_EXECUTE 0x40
27
28#ifdef CONFIG_X86_64
29#define X86_BREAKPOINT_LEN_8 0x48
30#endif
31
32/* Available HW breakpoint type encodings */
33
34/* trigger on instruction execute */
35#define X86_BREAKPOINT_EXECUTE 0x80
36/* trigger on memory write */
37#define X86_BREAKPOINT_WRITE 0x81
38/* trigger on memory read or write */
39#define X86_BREAKPOINT_RW 0x83
40
41/* Total number of available HW breakpoint registers */
42#define HBP_NUM 4
43
44struct perf_event;
45struct pmu;
46
47extern int arch_check_va_in_userspace(unsigned long va, u8 hbp_len);
48extern int arch_validate_hwbkpt_settings(struct perf_event *bp,
49 struct task_struct *tsk);
50extern int hw_breakpoint_exceptions_notify(struct notifier_block *unused,
51 unsigned long val, void *data);
52
53
54int arch_install_hw_breakpoint(struct perf_event *bp);
55void arch_uninstall_hw_breakpoint(struct perf_event *bp);
56void hw_breakpoint_pmu_read(struct perf_event *bp);
57void hw_breakpoint_pmu_unthrottle(struct perf_event *bp);
58
59extern void
60arch_fill_perf_breakpoint(struct perf_event *bp);
61
62unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type);
63int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type);
64
65extern int arch_bp_generic_fields(int x86_len, int x86_type,
66 int *gen_len, int *gen_type);
67
68extern struct pmu perf_ops_bp;
69
70#endif /* __KERNEL__ */
71#endif /* _I386_HW_BREAKPOINT_H */
72
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index 68900e7dada8..c17411503f28 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -27,7 +27,7 @@
27 27
28/* Interrupt handlers registered during init_IRQ */ 28/* Interrupt handlers registered during init_IRQ */
29extern void apic_timer_interrupt(void); 29extern void apic_timer_interrupt(void);
30extern void generic_interrupt(void); 30extern void x86_platform_ipi(void);
31extern void error_interrupt(void); 31extern void error_interrupt(void);
32extern void perf_pending_interrupt(void); 32extern void perf_pending_interrupt(void);
33 33
@@ -55,13 +55,6 @@ extern void call_function_single_interrupt(void);
55 55
56extern void pull_timers_interrupt(void); 56extern void pull_timers_interrupt(void);
57 57
58/* PIC specific functions */
59extern void disable_8259A_irq(unsigned int irq);
60extern void enable_8259A_irq(unsigned int irq);
61extern int i8259A_irq_pending(unsigned int irq);
62extern void make_8259A_irq(unsigned int irq);
63extern void init_8259A(int aeoi);
64
65/* IOAPIC */ 58/* IOAPIC */
66#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs)) 59#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
67extern unsigned long io_apic_irqs; 60extern unsigned long io_apic_irqs;
@@ -81,14 +74,33 @@ static inline void set_io_apic_irq_attr(struct io_apic_irq_attr *irq_attr,
81 int ioapic, int ioapic_pin, 74 int ioapic, int ioapic_pin,
82 int trigger, int polarity) 75 int trigger, int polarity)
83{ 76{
84 irq_attr->ioapic = ioapic; 77 irq_attr->ioapic = ioapic;
85 irq_attr->ioapic_pin = ioapic_pin; 78 irq_attr->ioapic_pin = ioapic_pin;
86 irq_attr->trigger = trigger; 79 irq_attr->trigger = trigger;
87 irq_attr->polarity = polarity; 80 irq_attr->polarity = polarity;
88} 81}
89 82
90extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, 83/*
91 struct io_apic_irq_attr *irq_attr); 84 * This is performance-critical, we want to do it O(1)
85 *
86 * Most irqs are mapped 1:1 with pins.
87 */
88struct irq_cfg {
89 struct irq_pin_list *irq_2_pin;
90 cpumask_var_t domain;
91 cpumask_var_t old_domain;
92 u8 vector;
93 u8 move_in_progress : 1;
94};
95
96extern struct irq_cfg *irq_cfg(unsigned int);
97extern int assign_irq_vector(int, struct irq_cfg *, const struct cpumask *);
98extern void send_cleanup_vector(struct irq_cfg *);
99
100struct irq_desc;
101extern unsigned int set_desc_affinity(struct irq_desc *, const struct cpumask *,
102 unsigned int *dest_id);
103extern int IO_APIC_get_PCI_irq_vector(int bus, int devfn, int pin, struct io_apic_irq_attr *irq_attr);
92extern void setup_ioapic_dest(void); 104extern void setup_ioapic_dest(void);
93 105
94extern void enable_IO_APIC(void); 106extern void enable_IO_APIC(void);
@@ -103,7 +115,7 @@ extern void eisa_set_level_irq(unsigned int irq);
103/* SMP */ 115/* SMP */
104extern void smp_apic_timer_interrupt(struct pt_regs *); 116extern void smp_apic_timer_interrupt(struct pt_regs *);
105extern void smp_spurious_interrupt(struct pt_regs *); 117extern void smp_spurious_interrupt(struct pt_regs *);
106extern void smp_generic_interrupt(struct pt_regs *); 118extern void smp_x86_platform_ipi(struct pt_regs *);
107extern void smp_error_interrupt(struct pt_regs *); 119extern void smp_error_interrupt(struct pt_regs *);
108#ifdef CONFIG_X86_IO_APIC 120#ifdef CONFIG_X86_IO_APIC
109extern asmlinkage void smp_irq_move_cleanup_interrupt(void); 121extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
@@ -124,6 +136,7 @@ extern void (*__initconst interrupt[NR_VECTORS-FIRST_EXTERNAL_VECTOR])(void);
124 136
125typedef int vector_irq_t[NR_VECTORS]; 137typedef int vector_irq_t[NR_VECTORS];
126DECLARE_PER_CPU(vector_irq_t, vector_irq); 138DECLARE_PER_CPU(vector_irq_t, vector_irq);
139extern void setup_vector_irq(int cpu);
127 140
128#ifdef CONFIG_X86_IO_APIC 141#ifdef CONFIG_X86_IO_APIC
129extern void lock_vector_lock(void); 142extern void lock_vector_lock(void);
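
A rough sketch of how the newly exported irq_cfg()/assign_irq_vector() pair is meant to be driven (locking and error paths omitted; illustrative only, not code from this patch):

#include <linux/cpumask.h>
#include <linux/errno.h>
#include <asm/hw_irq.h>

static int example_setup_vector(unsigned int irq)
{
	struct irq_cfg *cfg = irq_cfg(irq);

	if (!cfg)
		return -EINVAL;

	/* pick a vector that can be targeted at any online CPU */
	return assign_irq_vector(irq, cfg, cpu_online_mask);
}
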
diff --git a/arch/x86/include/asm/hyperv.h b/arch/x86/include/asm/hyperv.h
new file mode 100644
index 000000000000..e153a2b3889a
--- /dev/null
+++ b/arch/x86/include/asm/hyperv.h
@@ -0,0 +1,186 @@
1#ifndef _ASM_X86_KVM_HYPERV_H
2#define _ASM_X86_KVM_HYPERV_H
3
4#include <linux/types.h>
5
6/*
7 * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent
8 * is set by CPUID(HvCpuIdFunctionVersionAndFeatures).
9 */
10#define HYPERV_CPUID_VENDOR_AND_MAX_FUNCTIONS 0x40000000
11#define HYPERV_CPUID_INTERFACE 0x40000001
12#define HYPERV_CPUID_VERSION 0x40000002
13#define HYPERV_CPUID_FEATURES 0x40000003
14#define HYPERV_CPUID_ENLIGHTMENT_INFO 0x40000004
15#define HYPERV_CPUID_IMPLEMENT_LIMITS 0x40000005
16
17/*
18 * Feature identification. EAX indicates which features are available
19 * to the partition based upon the current partition privileges.
20 */
21
22/* VP Runtime (HV_X64_MSR_VP_RUNTIME) available */
23#define HV_X64_MSR_VP_RUNTIME_AVAILABLE (1 << 0)
24/* Partition Reference Counter (HV_X64_MSR_TIME_REF_COUNT) available*/
25#define HV_X64_MSR_TIME_REF_COUNT_AVAILABLE (1 << 1)
26/*
27 * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM
28 * and HV_X64_MSR_SINT0 through HV_X64_MSR_SINT15) available
29 */
30#define HV_X64_MSR_SYNIC_AVAILABLE (1 << 2)
31/*
32 * Synthetic Timer MSRs (HV_X64_MSR_STIMER0_CONFIG through
33 * HV_X64_MSR_STIMER3_COUNT) available
34 */
35#define HV_X64_MSR_SYNTIMER_AVAILABLE (1 << 3)
36/*
37 * APIC access MSRs (HV_X64_MSR_EOI, HV_X64_MSR_ICR and HV_X64_MSR_TPR)
38 * are available
39 */
40#define HV_X64_MSR_APIC_ACCESS_AVAILABLE (1 << 4)
41/* Hypercall MSRs (HV_X64_MSR_GUEST_OS_ID and HV_X64_MSR_HYPERCALL) available*/
42#define HV_X64_MSR_HYPERCALL_AVAILABLE (1 << 5)
43/* Access virtual processor index MSR (HV_X64_MSR_VP_INDEX) available*/
44#define HV_X64_MSR_VP_INDEX_AVAILABLE (1 << 6)
45/* Virtual system reset MSR (HV_X64_MSR_RESET) is available*/
46#define HV_X64_MSR_RESET_AVAILABLE (1 << 7)
47 /*
48 * Access statistics pages MSRs (HV_X64_MSR_STATS_PARTITION_RETAIL_PAGE,
49 * HV_X64_MSR_STATS_PARTITION_INTERNAL_PAGE, HV_X64_MSR_STATS_VP_RETAIL_PAGE,
50 * HV_X64_MSR_STATS_VP_INTERNAL_PAGE) available
51 */
52#define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8)
53
54/*
55 * Feature identification: EBX indicates which flags were specified at
56 * partition creation. The format is the same as the partition creation
57 * flag structure defined in section Partition Creation Flags.
58 */
59#define HV_X64_CREATE_PARTITIONS (1 << 0)
60#define HV_X64_ACCESS_PARTITION_ID (1 << 1)
61#define HV_X64_ACCESS_MEMORY_POOL (1 << 2)
62#define HV_X64_ADJUST_MESSAGE_BUFFERS (1 << 3)
63#define HV_X64_POST_MESSAGES (1 << 4)
64#define HV_X64_SIGNAL_EVENTS (1 << 5)
65#define HV_X64_CREATE_PORT (1 << 6)
66#define HV_X64_CONNECT_PORT (1 << 7)
67#define HV_X64_ACCESS_STATS (1 << 8)
68#define HV_X64_DEBUGGING (1 << 11)
69#define HV_X64_CPU_POWER_MANAGEMENT (1 << 12)
70#define HV_X64_CONFIGURE_PROFILER (1 << 13)
71
72/*
73 * Feature identification. EDX indicates which miscellaneous features
74 * are available to the partition.
75 */
76/* The MWAIT instruction is available (per section MONITOR / MWAIT) */
77#define HV_X64_MWAIT_AVAILABLE (1 << 0)
78/* Guest debugging support is available */
79#define HV_X64_GUEST_DEBUGGING_AVAILABLE (1 << 1)
80/* Performance Monitor support is available*/
81#define HV_X64_PERF_MONITOR_AVAILABLE (1 << 2)
82/* Support for physical CPU dynamic partitioning events is available*/
83#define HV_X64_CPU_DYNAMIC_PARTITIONING_AVAILABLE (1 << 3)
84/*
85 * Support for passing hypercall input parameter block via XMM
86 * registers is available
87 */
88#define HV_X64_HYPERCALL_PARAMS_XMM_AVAILABLE (1 << 4)
89/* Support for a virtual guest idle state is available */
90#define HV_X64_GUEST_IDLE_STATE_AVAILABLE (1 << 5)
91
92/*
93 * Implementation recommendations. Indicates which behaviors the hypervisor
94 * recommends the OS implement for optimal performance.
95 */
96 /*
97 * Recommend using hypercall for address space switches rather
98 * than MOV to CR3 instruction
99 */
100#define HV_X64_MWAIT_RECOMMENDED (1 << 0)
101/* Recommend using hypercall for local TLB flushes rather
102 * than INVLPG or MOV to CR3 instructions */
103#define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1)
104/*
105 * Recommend using hypercall for remote TLB flushes rather
106 * than inter-processor interrupts
107 */
108#define HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED (1 << 2)
109/*
110 * Recommend using MSRs for accessing APIC registers
111 * EOI, ICR and TPR rather than their memory-mapped counterparts
112 */
113#define HV_X64_APIC_ACCESS_RECOMMENDED (1 << 3)
114/* Recommend using the hypervisor-provided MSR to initiate a system RESET */
115#define HV_X64_SYSTEM_RESET_RECOMMENDED (1 << 4)
116/*
117 * Recommend using relaxed timing for this partition. If used,
118 * the VM should disable any watchdog timeouts that rely on the
119 * timely delivery of external interrupts
120 */
121#define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5)
122
123/* MSR used to identify the guest OS. */
124#define HV_X64_MSR_GUEST_OS_ID 0x40000000
125
126/* MSR used to setup pages used to communicate with the hypervisor. */
127#define HV_X64_MSR_HYPERCALL 0x40000001
128
129/* MSR used to provide vcpu index */
130#define HV_X64_MSR_VP_INDEX 0x40000002
131
132/* Define the virtual APIC registers */
133#define HV_X64_MSR_EOI 0x40000070
134#define HV_X64_MSR_ICR 0x40000071
135#define HV_X64_MSR_TPR 0x40000072
136#define HV_X64_MSR_APIC_ASSIST_PAGE 0x40000073
137
138/* Define synthetic interrupt controller model specific registers. */
139#define HV_X64_MSR_SCONTROL 0x40000080
140#define HV_X64_MSR_SVERSION 0x40000081
141#define HV_X64_MSR_SIEFP 0x40000082
142#define HV_X64_MSR_SIMP 0x40000083
143#define HV_X64_MSR_EOM 0x40000084
144#define HV_X64_MSR_SINT0 0x40000090
145#define HV_X64_MSR_SINT1 0x40000091
146#define HV_X64_MSR_SINT2 0x40000092
147#define HV_X64_MSR_SINT3 0x40000093
148#define HV_X64_MSR_SINT4 0x40000094
149#define HV_X64_MSR_SINT5 0x40000095
150#define HV_X64_MSR_SINT6 0x40000096
151#define HV_X64_MSR_SINT7 0x40000097
152#define HV_X64_MSR_SINT8 0x40000098
153#define HV_X64_MSR_SINT9 0x40000099
154#define HV_X64_MSR_SINT10 0x4000009A
155#define HV_X64_MSR_SINT11 0x4000009B
156#define HV_X64_MSR_SINT12 0x4000009C
157#define HV_X64_MSR_SINT13 0x4000009D
158#define HV_X64_MSR_SINT14 0x4000009E
159#define HV_X64_MSR_SINT15 0x4000009F
160
161
162#define HV_X64_MSR_HYPERCALL_ENABLE 0x00000001
163#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT 12
164#define HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_MASK \
165 (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1))
166
167/* Declare the various hypercall operations. */
168#define HV_X64_HV_NOTIFY_LONG_SPIN_WAIT 0x0008
169
170#define HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE 0x00000001
171#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT 12
172#define HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_MASK \
173 (~((1ull << HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT) - 1))
174
175#define HV_PROCESSOR_POWER_STATE_C0 0
176#define HV_PROCESSOR_POWER_STATE_C1 1
177#define HV_PROCESSOR_POWER_STATE_C2 2
178#define HV_PROCESSOR_POWER_STATE_C3 3
179
180/* hypercall status code */
181#define HV_STATUS_SUCCESS 0
182#define HV_STATUS_INVALID_HYPERCALL_CODE 2
183#define HV_STATUS_INVALID_HYPERCALL_INPUT 3
184#define HV_STATUS_INVALID_ALIGNMENT 4
185
186#endif
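
As a hedged usage sketch (not part of this patch), the feature bits above are tested against the EAX output of the HYPERV_CPUID_FEATURES leaf, after the guest has already established that the hypervisor is Hyper-V; hv_hypercall_msrs_present() is a hypothetical helper name:

#include <linux/types.h>
#include <asm/processor.h>	/* cpuid() */
#include <asm/hyperv.h>

static bool hv_hypercall_msrs_present(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid(HYPERV_CPUID_FEATURES, &eax, &ebx, &ecx, &edx);

	/* EAX carries the MSR-availability bits defined above */
	return (eax & HV_X64_MSR_HYPERCALL_AVAILABLE) &&
	       (eax & HV_X64_MSR_VP_INDEX_AVAILABLE);
}
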
diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h
index 0b20bbb758f2..da2930924501 100644
--- a/arch/x86/include/asm/i387.h
+++ b/arch/x86/include/asm/i387.h
@@ -10,6 +10,8 @@
10#ifndef _ASM_X86_I387_H 10#ifndef _ASM_X86_I387_H
11#define _ASM_X86_I387_H 11#define _ASM_X86_I387_H
12 12
13#ifndef __ASSEMBLY__
14
13#include <linux/sched.h> 15#include <linux/sched.h>
14#include <linux/kernel_stat.h> 16#include <linux/kernel_stat.h>
15#include <linux/regset.h> 17#include <linux/regset.h>
@@ -31,8 +33,16 @@ extern void init_thread_xstate(void);
31extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); 33extern int dump_fpu(struct pt_regs *, struct user_i387_struct *);
32 34
33extern user_regset_active_fn fpregs_active, xfpregs_active; 35extern user_regset_active_fn fpregs_active, xfpregs_active;
34extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get; 36extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get,
35extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set; 37 xstateregs_get;
38extern user_regset_set_fn fpregs_set, xfpregs_set, fpregs_soft_set,
39 xstateregs_set;
40
41/*
42 * xstateregs_active == fpregs_active. Please refer to the comment
43 * at the definition of fpregs_active.
44 */
45#define xstateregs_active fpregs_active
36 46
37extern struct _fpx_sw_bytes fx_sw_reserved; 47extern struct _fpx_sw_bytes fx_sw_reserved;
38#ifdef CONFIG_IA32_EMULATION 48#ifdef CONFIG_IA32_EMULATION
@@ -411,4 +421,9 @@ static inline unsigned short get_fpu_mxcsr(struct task_struct *tsk)
411 } 421 }
412} 422}
413 423
424#endif /* __ASSEMBLY__ */
425
426#define PSHUFB_XMM5_XMM0 .byte 0x66, 0x0f, 0x38, 0x00, 0xc5
427#define PSHUFB_XMM5_XMM6 .byte 0x66, 0x0f, 0x38, 0x00, 0xf5
428
414#endif /* _ASM_X86_I387_H */ 429#endif /* _ASM_X86_I387_H */
diff --git a/arch/x86/include/asm/i8259.h b/arch/x86/include/asm/i8259.h
index 58d7091eeb1f..1655147646aa 100644
--- a/arch/x86/include/asm/i8259.h
+++ b/arch/x86/include/asm/i8259.h
@@ -24,12 +24,7 @@ extern unsigned int cached_irq_mask;
24#define SLAVE_ICW4_DEFAULT 0x01 24#define SLAVE_ICW4_DEFAULT 0x01
25#define PIC_ICW4_AEOI 2 25#define PIC_ICW4_AEOI 2
26 26
27extern spinlock_t i8259A_lock; 27extern raw_spinlock_t i8259A_lock;
28
29extern void init_8259A(int auto_eoi);
30extern void enable_8259A_irq(unsigned int irq);
31extern void disable_8259A_irq(unsigned int irq);
32extern unsigned int startup_8259A_irq(unsigned int irq);
33 28
34/* the PIC may need a careful delay on some platforms, hence specific calls */ 29/* the PIC may need a careful delay on some platforms, hence specific calls */
35static inline unsigned char inb_pic(unsigned int port) 30static inline unsigned char inb_pic(unsigned int port)
@@ -57,7 +52,17 @@ static inline void outb_pic(unsigned char value, unsigned int port)
57 52
58extern struct irq_chip i8259A_chip; 53extern struct irq_chip i8259A_chip;
59 54
60extern void mask_8259A(void); 55struct legacy_pic {
61extern void unmask_8259A(void); 56 int nr_legacy_irqs;
57 struct irq_chip *chip;
58 void (*mask_all)(void);
59 void (*restore_mask)(void);
60 void (*init)(int auto_eoi);
61 int (*irq_pending)(unsigned int irq);
62 void (*make_irq)(unsigned int irq);
63};
64
65extern struct legacy_pic *legacy_pic;
66extern struct legacy_pic null_legacy_pic;
62 67
63#endif /* _ASM_X86_I8259_H */ 68#endif /* _ASM_X86_I8259_H */
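
With the new legacy_pic abstraction, callers go through the ops table instead of the removed 8259A helpers; a minimal sketch of the intended calling convention (illustrative only):

#include <asm/i8259.h>

static void example_init_legacy_pic(void)
{
	/* instead of the removed direct init_8259A(0) call */
	if (legacy_pic->nr_legacy_irqs)
		legacy_pic->init(0);	/* initialize the PIC, no auto-EOI */
}
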
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
new file mode 100644
index 000000000000..205b063e3e32
--- /dev/null
+++ b/arch/x86/include/asm/inat.h
@@ -0,0 +1,220 @@
1#ifndef _ASM_X86_INAT_H
2#define _ASM_X86_INAT_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23#include <asm/inat_types.h>
24
25/*
26 * Internal bits. Don't use bitmasks directly, because these bits are
27 * unstable. You should use checking functions.
28 */
29
30#define INAT_OPCODE_TABLE_SIZE 256
31#define INAT_GROUP_TABLE_SIZE 8
32
33/* Legacy last prefixes */
34#define INAT_PFX_OPNDSZ 1 /* 0x66 */ /* LPFX1 */
35#define INAT_PFX_REPE 2 /* 0xF3 */ /* LPFX2 */
36#define INAT_PFX_REPNE 3 /* 0xF2 */ /* LPFX3 */
37/* Other Legacy prefixes */
38#define INAT_PFX_LOCK 4 /* 0xF0 */
39#define INAT_PFX_CS 5 /* 0x2E */
40#define INAT_PFX_DS 6 /* 0x3E */
41#define INAT_PFX_ES 7 /* 0x26 */
42#define INAT_PFX_FS 8 /* 0x64 */
43#define INAT_PFX_GS 9 /* 0x65 */
44#define INAT_PFX_SS 10 /* 0x36 */
45#define INAT_PFX_ADDRSZ 11 /* 0x67 */
46/* x86-64 REX prefix */
47#define INAT_PFX_REX 12 /* 0x4X */
48/* AVX VEX prefixes */
49#define INAT_PFX_VEX2 13 /* 2-bytes VEX prefix */
50#define INAT_PFX_VEX3 14 /* 3-bytes VEX prefix */
51
52#define INAT_LSTPFX_MAX 3
53#define INAT_LGCPFX_MAX 11
54
55/* Immediate size */
56#define INAT_IMM_BYTE 1
57#define INAT_IMM_WORD 2
58#define INAT_IMM_DWORD 3
59#define INAT_IMM_QWORD 4
60#define INAT_IMM_PTR 5
61#define INAT_IMM_VWORD32 6
62#define INAT_IMM_VWORD 7
63
64/* Legacy prefix */
65#define INAT_PFX_OFFS 0
66#define INAT_PFX_BITS 4
67#define INAT_PFX_MAX ((1 << INAT_PFX_BITS) - 1)
68#define INAT_PFX_MASK (INAT_PFX_MAX << INAT_PFX_OFFS)
69/* Escape opcodes */
70#define INAT_ESC_OFFS (INAT_PFX_OFFS + INAT_PFX_BITS)
71#define INAT_ESC_BITS 2
72#define INAT_ESC_MAX ((1 << INAT_ESC_BITS) - 1)
73#define INAT_ESC_MASK (INAT_ESC_MAX << INAT_ESC_OFFS)
74/* Group opcodes (1-16) */
75#define INAT_GRP_OFFS (INAT_ESC_OFFS + INAT_ESC_BITS)
76#define INAT_GRP_BITS 5
77#define INAT_GRP_MAX ((1 << INAT_GRP_BITS) - 1)
78#define INAT_GRP_MASK (INAT_GRP_MAX << INAT_GRP_OFFS)
79/* Immediates */
80#define INAT_IMM_OFFS (INAT_GRP_OFFS + INAT_GRP_BITS)
81#define INAT_IMM_BITS 3
82#define INAT_IMM_MASK (((1 << INAT_IMM_BITS) - 1) << INAT_IMM_OFFS)
83/* Flags */
84#define INAT_FLAG_OFFS (INAT_IMM_OFFS + INAT_IMM_BITS)
85#define INAT_MODRM (1 << (INAT_FLAG_OFFS))
86#define INAT_FORCE64 (1 << (INAT_FLAG_OFFS + 1))
87#define INAT_SCNDIMM (1 << (INAT_FLAG_OFFS + 2))
88#define INAT_MOFFSET (1 << (INAT_FLAG_OFFS + 3))
89#define INAT_VARIANT (1 << (INAT_FLAG_OFFS + 4))
90#define INAT_VEXOK (1 << (INAT_FLAG_OFFS + 5))
91#define INAT_VEXONLY (1 << (INAT_FLAG_OFFS + 6))
92/* Attribute making macros for attribute tables */
93#define INAT_MAKE_PREFIX(pfx) (pfx << INAT_PFX_OFFS)
94#define INAT_MAKE_ESCAPE(esc) (esc << INAT_ESC_OFFS)
95#define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM)
96#define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS)
97
98/* Attribute search APIs */
99extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
100extern insn_attr_t inat_get_escape_attribute(insn_byte_t opcode,
101 insn_byte_t last_pfx,
102 insn_attr_t esc_attr);
103extern insn_attr_t inat_get_group_attribute(insn_byte_t modrm,
104 insn_byte_t last_pfx,
105 insn_attr_t esc_attr);
106extern insn_attr_t inat_get_avx_attribute(insn_byte_t opcode,
107 insn_byte_t vex_m,
108 insn_byte_t vex_pp);
109
110/* Attribute checking functions */
111static inline int inat_is_legacy_prefix(insn_attr_t attr)
112{
113 attr &= INAT_PFX_MASK;
114 return attr && attr <= INAT_LGCPFX_MAX;
115}
116
117static inline int inat_is_address_size_prefix(insn_attr_t attr)
118{
119 return (attr & INAT_PFX_MASK) == INAT_PFX_ADDRSZ;
120}
121
122static inline int inat_is_operand_size_prefix(insn_attr_t attr)
123{
124 return (attr & INAT_PFX_MASK) == INAT_PFX_OPNDSZ;
125}
126
127static inline int inat_is_rex_prefix(insn_attr_t attr)
128{
129 return (attr & INAT_PFX_MASK) == INAT_PFX_REX;
130}
131
132static inline int inat_last_prefix_id(insn_attr_t attr)
133{
134 if ((attr & INAT_PFX_MASK) > INAT_LSTPFX_MAX)
135 return 0;
136 else
137 return attr & INAT_PFX_MASK;
138}
139
140static inline int inat_is_vex_prefix(insn_attr_t attr)
141{
142 attr &= INAT_PFX_MASK;
143 return attr == INAT_PFX_VEX2 || attr == INAT_PFX_VEX3;
144}
145
146static inline int inat_is_vex3_prefix(insn_attr_t attr)
147{
148 return (attr & INAT_PFX_MASK) == INAT_PFX_VEX3;
149}
150
151static inline int inat_is_escape(insn_attr_t attr)
152{
153 return attr & INAT_ESC_MASK;
154}
155
156static inline int inat_escape_id(insn_attr_t attr)
157{
158 return (attr & INAT_ESC_MASK) >> INAT_ESC_OFFS;
159}
160
161static inline int inat_is_group(insn_attr_t attr)
162{
163 return attr & INAT_GRP_MASK;
164}
165
166static inline int inat_group_id(insn_attr_t attr)
167{
168 return (attr & INAT_GRP_MASK) >> INAT_GRP_OFFS;
169}
170
171static inline int inat_group_common_attribute(insn_attr_t attr)
172{
173 return attr & ~INAT_GRP_MASK;
174}
175
176static inline int inat_has_immediate(insn_attr_t attr)
177{
178 return attr & INAT_IMM_MASK;
179}
180
181static inline int inat_immediate_size(insn_attr_t attr)
182{
183 return (attr & INAT_IMM_MASK) >> INAT_IMM_OFFS;
184}
185
186static inline int inat_has_modrm(insn_attr_t attr)
187{
188 return attr & INAT_MODRM;
189}
190
191static inline int inat_is_force64(insn_attr_t attr)
192{
193 return attr & INAT_FORCE64;
194}
195
196static inline int inat_has_second_immediate(insn_attr_t attr)
197{
198 return attr & INAT_SCNDIMM;
199}
200
201static inline int inat_has_moffset(insn_attr_t attr)
202{
203 return attr & INAT_MOFFSET;
204}
205
206static inline int inat_has_variant(insn_attr_t attr)
207{
208 return attr & INAT_VARIANT;
209}
210
211static inline int inat_accept_vex(insn_attr_t attr)
212{
213 return attr & INAT_VEXOK;
214}
215
216static inline int inat_must_vex(insn_attr_t attr)
217{
218 return attr & INAT_VEXONLY;
219}
220#endif
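
The packing macros and accessor helpers above are designed to round-trip; a small sketch of the arithmetic with hypothetical values (not taken from an actual attribute table):

#include <asm/inat.h>

static void example_attr_roundtrip(void)
{
	insn_attr_t attr = INAT_MAKE_GROUP(5) | INAT_MAKE_IMM(INAT_IMM_BYTE);

	/*
	 * INAT_MAKE_GROUP() also sets INAT_MODRM, so all of these hold:
	 *   inat_has_modrm(attr)      != 0
	 *   inat_group_id(attr)       == 5
	 *   inat_immediate_size(attr) == INAT_IMM_BYTE
	 */
	(void)attr;
}
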
diff --git a/arch/x86/include/asm/inat_types.h b/arch/x86/include/asm/inat_types.h
new file mode 100644
index 000000000000..cb3c20ce39cf
--- /dev/null
+++ b/arch/x86/include/asm/inat_types.h
@@ -0,0 +1,29 @@
1#ifndef _ASM_X86_INAT_TYPES_H
2#define _ASM_X86_INAT_TYPES_H
3/*
4 * x86 instruction attributes
5 *
6 * Written by Masami Hiramatsu <mhiramat@redhat.com>
7 *
8 * This program is free software; you can redistribute it and/or modify
9 * it under the terms of the GNU General Public License as published by
10 * the Free Software Foundation; either version 2 of the License, or
11 * (at your option) any later version.
12 *
13 * This program is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 * GNU General Public License for more details.
17 *
18 * You should have received a copy of the GNU General Public License
19 * along with this program; if not, write to the Free Software
20 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 *
22 */
23
24/* Instruction attributes */
25typedef unsigned int insn_attr_t;
26typedef unsigned char insn_byte_t;
27typedef signed int insn_value_t;
28
29#endif
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
new file mode 100644
index 000000000000..96c2e0ad04ca
--- /dev/null
+++ b/arch/x86/include/asm/insn.h
@@ -0,0 +1,184 @@
1#ifndef _ASM_X86_INSN_H
2#define _ASM_X86_INSN_H
3/*
4 * x86 instruction analysis
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 * Copyright (C) IBM Corporation, 2009
21 */
22
23/* insn_attr_t is defined in inat.h */
24#include <asm/inat.h>
25
26struct insn_field {
27 union {
28 insn_value_t value;
29 insn_byte_t bytes[4];
30 };
31 /* !0 if we've run insn_get_xxx() for this field */
32 unsigned char got;
33 unsigned char nbytes;
34};
35
36struct insn {
37 struct insn_field prefixes; /*
38 * Prefixes
39 * prefixes.bytes[3]: last prefix
40 */
41 struct insn_field rex_prefix; /* REX prefix */
42 struct insn_field vex_prefix; /* VEX prefix */
43 struct insn_field opcode; /*
44 * opcode.bytes[0]: opcode1
45 * opcode.bytes[1]: opcode2
46 * opcode.bytes[2]: opcode3
47 */
48 struct insn_field modrm;
49 struct insn_field sib;
50 struct insn_field displacement;
51 union {
52 struct insn_field immediate;
53 struct insn_field moffset1; /* for 64bit MOV */
54 struct insn_field immediate1; /* for 64bit imm or off16/32 */
55 };
56 union {
57 struct insn_field moffset2; /* for 64bit MOV */
58 struct insn_field immediate2; /* for 64bit imm or seg16 */
59 };
60
61 insn_attr_t attr;
62 unsigned char opnd_bytes;
63 unsigned char addr_bytes;
64 unsigned char length;
65 unsigned char x86_64;
66
67 const insn_byte_t *kaddr; /* kernel address of insn to analyze */
68 const insn_byte_t *next_byte;
69};
70
71#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
72#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
73#define X86_MODRM_RM(modrm) ((modrm) & 0x07)
74
75#define X86_SIB_SCALE(sib) (((sib) & 0xc0) >> 6)
76#define X86_SIB_INDEX(sib) (((sib) & 0x38) >> 3)
77#define X86_SIB_BASE(sib) ((sib) & 0x07)
78
79#define X86_REX_W(rex) ((rex) & 8)
80#define X86_REX_R(rex) ((rex) & 4)
81#define X86_REX_X(rex) ((rex) & 2)
82#define X86_REX_B(rex) ((rex) & 1)
83
84/* VEX bit flags */
85#define X86_VEX_W(vex) ((vex) & 0x80) /* VEX3 Byte2 */
86#define X86_VEX_R(vex) ((vex) & 0x80) /* VEX2/3 Byte1 */
87#define X86_VEX_X(vex) ((vex) & 0x40) /* VEX3 Byte1 */
88#define X86_VEX_B(vex) ((vex) & 0x20) /* VEX3 Byte1 */
89#define X86_VEX_L(vex) ((vex) & 0x04) /* VEX3 Byte2, VEX2 Byte1 */
90/* VEX bit fields */
91#define X86_VEX3_M(vex) ((vex) & 0x1f) /* VEX3 Byte1 */
92#define X86_VEX2_M 1 /* VEX2.M always 1 */
93#define X86_VEX_V(vex) (((vex) & 0x78) >> 3) /* VEX3 Byte2, VEX2 Byte1 */
94#define X86_VEX_P(vex) ((vex) & 0x03) /* VEX3 Byte2, VEX2 Byte1 */
95#define X86_VEX_M_MAX 0x1f /* VEX3.M Maximum value */
96
97/* The last prefix is needed for two-byte and three-byte opcodes */
98static inline insn_byte_t insn_last_prefix(struct insn *insn)
99{
100 return insn->prefixes.bytes[3];
101}
102
103extern void insn_init(struct insn *insn, const void *kaddr, int x86_64);
104extern void insn_get_prefixes(struct insn *insn);
105extern void insn_get_opcode(struct insn *insn);
106extern void insn_get_modrm(struct insn *insn);
107extern void insn_get_sib(struct insn *insn);
108extern void insn_get_displacement(struct insn *insn);
109extern void insn_get_immediate(struct insn *insn);
110extern void insn_get_length(struct insn *insn);
111
112/* Attribute will be determined after getting ModRM (for opcode groups) */
113static inline void insn_get_attribute(struct insn *insn)
114{
115 insn_get_modrm(insn);
116}
117
118/* Instruction uses RIP-relative addressing */
119extern int insn_rip_relative(struct insn *insn);
120
121/* Init insn for kernel text */
122static inline void kernel_insn_init(struct insn *insn, const void *kaddr)
123{
124#ifdef CONFIG_X86_64
125 insn_init(insn, kaddr, 1);
126#else /* CONFIG_X86_32 */
127 insn_init(insn, kaddr, 0);
128#endif
129}
130
131static inline int insn_is_avx(struct insn *insn)
132{
133 if (!insn->prefixes.got)
134 insn_get_prefixes(insn);
135 return (insn->vex_prefix.value != 0);
136}
137
138static inline insn_byte_t insn_vex_m_bits(struct insn *insn)
139{
140 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
141 return X86_VEX2_M;
142 else
143 return X86_VEX3_M(insn->vex_prefix.bytes[1]);
144}
145
146static inline insn_byte_t insn_vex_p_bits(struct insn *insn)
147{
148 if (insn->vex_prefix.nbytes == 2) /* 2 bytes VEX */
149 return X86_VEX_P(insn->vex_prefix.bytes[1]);
150 else
151 return X86_VEX_P(insn->vex_prefix.bytes[2]);
152}
153
154/* Offset of each field from kaddr */
155static inline int insn_offset_rex_prefix(struct insn *insn)
156{
157 return insn->prefixes.nbytes;
158}
159static inline int insn_offset_vex_prefix(struct insn *insn)
160{
161 return insn_offset_rex_prefix(insn) + insn->rex_prefix.nbytes;
162}
163static inline int insn_offset_opcode(struct insn *insn)
164{
165 return insn_offset_vex_prefix(insn) + insn->vex_prefix.nbytes;
166}
167static inline int insn_offset_modrm(struct insn *insn)
168{
169 return insn_offset_opcode(insn) + insn->opcode.nbytes;
170}
171static inline int insn_offset_sib(struct insn *insn)
172{
173 return insn_offset_modrm(insn) + insn->modrm.nbytes;
174}
175static inline int insn_offset_displacement(struct insn *insn)
176{
177 return insn_offset_sib(insn) + insn->sib.nbytes;
178}
179static inline int insn_offset_immediate(struct insn *insn)
180{
181 return insn_offset_displacement(insn) + insn->displacement.nbytes;
182}
183
184#endif /* _ASM_X86_INSN_H */
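
A short, hedged sketch of how the decoder declared above is typically driven; 'addr' is a hypothetical pointer into kernel text and error handling is omitted:

#include <linux/kernel.h>
#include <asm/insn.h>

static void example_decode(const void *addr)
{
	struct insn insn;

	kernel_insn_init(&insn, addr);
	insn_get_length(&insn);		/* decodes all preceding fields too */

	pr_debug("%d byte insn, opcode 0x%02x, %s\n",
		 insn.length, insn.opcode.bytes[0],
		 insn_rip_relative(&insn) ? "RIP-relative" : "not RIP-relative");
}
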
diff --git a/arch/x86/include/asm/inst.h b/arch/x86/include/asm/inst.h
new file mode 100644
index 000000000000..14cf526091f9
--- /dev/null
+++ b/arch/x86/include/asm/inst.h
@@ -0,0 +1,150 @@
1/*
2 * Generate .byte code for some instructions not supported by old
3 * binutils.
4 */
5#ifndef X86_ASM_INST_H
6#define X86_ASM_INST_H
7
8#ifdef __ASSEMBLY__
9
10 .macro XMM_NUM opd xmm
11 .ifc \xmm,%xmm0
12 \opd = 0
13 .endif
14 .ifc \xmm,%xmm1
15 \opd = 1
16 .endif
17 .ifc \xmm,%xmm2
18 \opd = 2
19 .endif
20 .ifc \xmm,%xmm3
21 \opd = 3
22 .endif
23 .ifc \xmm,%xmm4
24 \opd = 4
25 .endif
26 .ifc \xmm,%xmm5
27 \opd = 5
28 .endif
29 .ifc \xmm,%xmm6
30 \opd = 6
31 .endif
32 .ifc \xmm,%xmm7
33 \opd = 7
34 .endif
35 .ifc \xmm,%xmm8
36 \opd = 8
37 .endif
38 .ifc \xmm,%xmm9
39 \opd = 9
40 .endif
41 .ifc \xmm,%xmm10
42 \opd = 10
43 .endif
44 .ifc \xmm,%xmm11
45 \opd = 11
46 .endif
47 .ifc \xmm,%xmm12
48 \opd = 12
49 .endif
50 .ifc \xmm,%xmm13
51 \opd = 13
52 .endif
53 .ifc \xmm,%xmm14
54 \opd = 14
55 .endif
56 .ifc \xmm,%xmm15
57 \opd = 15
58 .endif
59 .endm
60
61 .macro PFX_OPD_SIZE
62 .byte 0x66
63 .endm
64
65 .macro PFX_REX opd1 opd2
66 .if (\opd1 | \opd2) & 8
67 .byte 0x40 | ((\opd1 & 8) >> 3) | ((\opd2 & 8) >> 1)
68 .endif
69 .endm
70
71 .macro MODRM mod opd1 opd2
72 .byte \mod | (\opd1 & 7) | ((\opd2 & 7) << 3)
73 .endm
74
75 .macro PSHUFB_XMM xmm1 xmm2
76 XMM_NUM pshufb_opd1 \xmm1
77 XMM_NUM pshufb_opd2 \xmm2
78 PFX_OPD_SIZE
79 PFX_REX pshufb_opd1 pshufb_opd2
80 .byte 0x0f, 0x38, 0x00
81 MODRM 0xc0 pshufb_opd1 pshufb_opd2
82 .endm
83
84 .macro PCLMULQDQ imm8 xmm1 xmm2
85 XMM_NUM clmul_opd1 \xmm1
86 XMM_NUM clmul_opd2 \xmm2
87 PFX_OPD_SIZE
88 PFX_REX clmul_opd1 clmul_opd2
89 .byte 0x0f, 0x3a, 0x44
90 MODRM 0xc0 clmul_opd1 clmul_opd2
91 .byte \imm8
92 .endm
93
94 .macro AESKEYGENASSIST rcon xmm1 xmm2
95 XMM_NUM aeskeygen_opd1 \xmm1
96 XMM_NUM aeskeygen_opd2 \xmm2
97 PFX_OPD_SIZE
98 PFX_REX aeskeygen_opd1 aeskeygen_opd2
99 .byte 0x0f, 0x3a, 0xdf
100 MODRM 0xc0 aeskeygen_opd1 aeskeygen_opd2
101 .byte \rcon
102 .endm
103
104 .macro AESIMC xmm1 xmm2
105 XMM_NUM aesimc_opd1 \xmm1
106 XMM_NUM aesimc_opd2 \xmm2
107 PFX_OPD_SIZE
108 PFX_REX aesimc_opd1 aesimc_opd2
109 .byte 0x0f, 0x38, 0xdb
110 MODRM 0xc0 aesimc_opd1 aesimc_opd2
111 .endm
112
113 .macro AESENC xmm1 xmm2
114 XMM_NUM aesenc_opd1 \xmm1
115 XMM_NUM aesenc_opd2 \xmm2
116 PFX_OPD_SIZE
117 PFX_REX aesenc_opd1 aesenc_opd2
118 .byte 0x0f, 0x38, 0xdc
119 MODRM 0xc0 aesenc_opd1 aesenc_opd2
120 .endm
121
122 .macro AESENCLAST xmm1 xmm2
123 XMM_NUM aesenclast_opd1 \xmm1
124 XMM_NUM aesenclast_opd2 \xmm2
125 PFX_OPD_SIZE
126 PFX_REX aesenclast_opd1 aesenclast_opd2
127 .byte 0x0f, 0x38, 0xdd
128 MODRM 0xc0 aesenclast_opd1 aesenclast_opd2
129 .endm
130
131 .macro AESDEC xmm1 xmm2
132 XMM_NUM aesdec_opd1 \xmm1
133 XMM_NUM aesdec_opd2 \xmm2
134 PFX_OPD_SIZE
135 PFX_REX aesdec_opd1 aesdec_opd2
136 .byte 0x0f, 0x38, 0xde
137 MODRM 0xc0 aesdec_opd1 aesdec_opd2
138 .endm
139
140 .macro AESDECLAST xmm1 xmm2
141 XMM_NUM aesdeclast_opd1 \xmm1
142 XMM_NUM aesdeclast_opd2 \xmm2
143 PFX_OPD_SIZE
144 PFX_REX aesdeclast_opd1 aesdeclast_opd2
145 .byte 0x0f, 0x38, 0xdf
146 MODRM 0xc0 aesdeclast_opd1 aesdeclast_opd2
147 .endm
148#endif
149
150#endif
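
The MODRM macro above computes mod | (opd1 & 7) | ((opd2 & 7) << 3); as a quick standalone cross-check in plain C (not part of the patch), the PSHUFB byte sequences hard-coded in i387.h end in exactly the values this formula produces:

#include <assert.h>

int main(void)
{
	/* MODRM 0xc0 opd1 opd2  ==  0xc0 | (opd1 & 7) | ((opd2 & 7) << 3) */
	assert((0xc0 | (5 & 7) | ((0 & 7) << 3)) == 0xc5);	/* PSHUFB_XMM5_XMM0 */
	assert((0xc0 | (5 & 7) | ((6 & 7) << 3)) == 0xf5);	/* PSHUFB_XMM5_XMM6 */
	return 0;
}
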
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 73739322b6d0..30a3e9776123 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -1,8 +1,42 @@
1#ifndef _ASM_X86_IO_H 1#ifndef _ASM_X86_IO_H
2#define _ASM_X86_IO_H 2#define _ASM_X86_IO_H
3 3
4/*
5 * This file contains the definitions for the x86 IO instructions
6 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
7 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
8 * versions of the single-IO instructions (inb_p/inw_p/..).
9 *
10 * This file is not meant to be obfuscating: it's just complicated
11 * to (a) handle it all in a way that makes gcc able to optimize it
12 * as well as possible and (b) trying to avoid writing the same thing
13 * over and over again with slight variations and possibly making a
14 * mistake somewhere.
15 */
16
17/*
18 * Thanks to James van Artsdalen for a better timing-fix than
19 * the two short jumps: using outb's to a nonexistent port seems
20 * to guarantee better timings even on fast machines.
21 *
22 * On the other hand, I'd like to be sure of a non-existent port:
23 * I feel a bit unsafe about using 0x80 (should be safe, though)
24 *
25 * Linus
26 */
27
28 /*
29 * Bit simplified and optimized by Jan Hubicka
30 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
31 *
32 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
33 * isa_read[wl] and isa_write[wl] fixed
34 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
35 */
36
4#define ARCH_HAS_IOREMAP_WC 37#define ARCH_HAS_IOREMAP_WC
5 38
39#include <linux/string.h>
6#include <linux/compiler.h> 40#include <linux/compiler.h>
7#include <asm-generic/int-ll64.h> 41#include <asm-generic/int-ll64.h>
8#include <asm/page.h> 42#include <asm/page.h>
@@ -173,11 +207,126 @@ static inline void __iomem *ioremap(resource_size_t offset, unsigned long size)
173extern void iounmap(volatile void __iomem *addr); 207extern void iounmap(volatile void __iomem *addr);
174 208
175 209
176#ifdef CONFIG_X86_32 210#ifdef __KERNEL__
177# include "io_32.h" 211
212#include <asm-generic/iomap.h>
213
214#include <linux/vmalloc.h>
215
216/*
217 * Convert a virtual cached pointer to an uncached pointer
218 */
219#define xlate_dev_kmem_ptr(p) p
220
221static inline void
222memset_io(volatile void __iomem *addr, unsigned char val, size_t count)
223{
224 memset((void __force *)addr, val, count);
225}
226
227static inline void
228memcpy_fromio(void *dst, const volatile void __iomem *src, size_t count)
229{
230 memcpy(dst, (const void __force *)src, count);
231}
232
233static inline void
234memcpy_toio(volatile void __iomem *dst, const void *src, size_t count)
235{
236 memcpy((void __force *)dst, src, count);
237}
238
239/*
240 * ISA space is 'always mapped' on a typical x86 system, no need to
241 * explicitly ioremap() it. The fact that the ISA IO space is mapped
242 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
243 * are physical addresses. The following constant pointer can be
244 * used as the IO-area pointer (it can be iounmapped as well, so the
245 * analogy with PCI is quite large):
246 */
247#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
248
249/*
250 * Cache management
251 *
 252 * This is needed for two cases:
253 * 1. Out of order aware processors
254 * 2. Accidentally out of order processors (PPro errata #51)
255 */
256
257static inline void flush_write_buffers(void)
258{
259#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
260 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
261#endif
262}
263
264#endif /* __KERNEL__ */
265
266extern void native_io_delay(void);
267
268extern int io_delay_type;
269extern void io_delay_init(void);
270
271#if defined(CONFIG_PARAVIRT)
272#include <asm/paravirt.h>
178#else 273#else
179# include "io_64.h" 274
275static inline void slow_down_io(void)
276{
277 native_io_delay();
278#ifdef REALLY_SLOW_IO
279 native_io_delay();
280 native_io_delay();
281 native_io_delay();
180#endif 282#endif
283}
284
285#endif
286
287#define BUILDIO(bwl, bw, type) \
288static inline void out##bwl(unsigned type value, int port) \
289{ \
290 asm volatile("out" #bwl " %" #bw "0, %w1" \
291 : : "a"(value), "Nd"(port)); \
292} \
293 \
294static inline unsigned type in##bwl(int port) \
295{ \
296 unsigned type value; \
297 asm volatile("in" #bwl " %w1, %" #bw "0" \
298 : "=a"(value) : "Nd"(port)); \
299 return value; \
300} \
301 \
302static inline void out##bwl##_p(unsigned type value, int port) \
303{ \
304 out##bwl(value, port); \
305 slow_down_io(); \
306} \
307 \
308static inline unsigned type in##bwl##_p(int port) \
309{ \
310 unsigned type value = in##bwl(port); \
311 slow_down_io(); \
312 return value; \
313} \
314 \
315static inline void outs##bwl(int port, const void *addr, unsigned long count) \
316{ \
317 asm volatile("rep; outs" #bwl \
318 : "+S"(addr), "+c"(count) : "d"(port)); \
319} \
320 \
321static inline void ins##bwl(int port, void *addr, unsigned long count) \
322{ \
323 asm volatile("rep; ins" #bwl \
324 : "+D"(addr), "+c"(count) : "d"(port)); \
325}
326
327BUILDIO(b, b, char)
328BUILDIO(w, w, short)
329BUILDIO(l, , int)
181 330
182extern void *xlate_dev_mem_ptr(unsigned long phys); 331extern void *xlate_dev_mem_ptr(unsigned long phys);
183extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr); 332extern void unxlate_dev_mem_ptr(unsigned long phys, void *addr);
@@ -198,6 +347,7 @@ extern void __iomem *early_ioremap(resource_size_t phys_addr,
198extern void __iomem *early_memremap(resource_size_t phys_addr, 347extern void __iomem *early_memremap(resource_size_t phys_addr,
199 unsigned long size); 348 unsigned long size);
200extern void early_iounmap(void __iomem *addr, unsigned long size); 349extern void early_iounmap(void __iomem *addr, unsigned long size);
350extern void fixup_early_ioremap(void);
201 351
202#define IO_SPACE_LIMIT 0xffff 352#define IO_SPACE_LIMIT 0xffff
203 353
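
The BUILDIO() expansion above is what now provides inb()/outb() and friends for both 32-bit and 64-bit builds, with io_32.h and io_64.h gone; a classic (purely illustrative) use is reading a CMOS/RTC register through the 0x70/0x71 index/data pair:

#include <asm/io.h>

static unsigned char example_read_rtc_seconds(void)
{
	outb(0x00, 0x70);	/* select CMOS register 0 (RTC seconds) */
	return inb(0x71);	/* read it back through the data port */
}
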
diff --git a/arch/x86/include/asm/io_32.h b/arch/x86/include/asm/io_32.h
deleted file mode 100644
index a299900f5920..000000000000
--- a/arch/x86/include/asm/io_32.h
+++ /dev/null
@@ -1,196 +0,0 @@
1#ifndef _ASM_X86_IO_32_H
2#define _ASM_X86_IO_32_H
3
4#include <linux/string.h>
5#include <linux/compiler.h>
6
7/*
8 * This file contains the definitions for the x86 IO instructions
9 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
10 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
11 * versions of the single-IO instructions (inb_p/inw_p/..).
12 *
13 * This file is not meant to be obfuscating: it's just complicated
14 * to (a) handle it all in a way that makes gcc able to optimize it
15 * as well as possible and (b) trying to avoid writing the same thing
16 * over and over again with slight variations and possibly making a
17 * mistake somewhere.
18 */
19
20/*
21 * Thanks to James van Artsdalen for a better timing-fix than
22 * the two short jumps: using outb's to a nonexistent port seems
23 * to guarantee better timings even on fast machines.
24 *
25 * On the other hand, I'd like to be sure of a non-existent port:
26 * I feel a bit unsafe about using 0x80 (should be safe, though)
27 *
28 * Linus
29 */
30
31 /*
32 * Bit simplified and optimized by Jan Hubicka
33 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
34 *
35 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
36 * isa_read[wl] and isa_write[wl] fixed
37 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
38 */
39
40#define XQUAD_PORTIO_BASE 0xfe400000
41#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
42
43#ifdef __KERNEL__
44
45#include <asm-generic/iomap.h>
46
47#include <linux/vmalloc.h>
48
49/*
50 * Convert a virtual cached pointer to an uncached pointer
51 */
52#define xlate_dev_kmem_ptr(p) p
53
54static inline void
55memset_io(volatile void __iomem *addr, unsigned char val, int count)
56{
57 memset((void __force *)addr, val, count);
58}
59
60static inline void
61memcpy_fromio(void *dst, const volatile void __iomem *src, int count)
62{
63 __memcpy(dst, (const void __force *)src, count);
64}
65
66static inline void
67memcpy_toio(volatile void __iomem *dst, const void *src, int count)
68{
69 __memcpy((void __force *)dst, src, count);
70}
71
72/*
73 * ISA space is 'always mapped' on a typical x86 system, no need to
74 * explicitly ioremap() it. The fact that the ISA IO space is mapped
75 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
76 * are physical addresses. The following constant pointer can be
77 * used as the IO-area pointer (it can be iounmapped as well, so the
78 * analogy with PCI is quite large):
79 */
80#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
81
82/*
83 * Cache management
84 *
85 * This needed for two cases
86 * 1. Out of order aware processors
87 * 2. Accidentally out of order processors (PPro errata #51)
88 */
89
90#if defined(CONFIG_X86_OOSTORE) || defined(CONFIG_X86_PPRO_FENCE)
91
92static inline void flush_write_buffers(void)
93{
94 asm volatile("lock; addl $0,0(%%esp)": : :"memory");
95}
96
97#else
98
99#define flush_write_buffers() do { } while (0)
100
101#endif
102
103#endif /* __KERNEL__ */
104
105extern void native_io_delay(void);
106
107extern int io_delay_type;
108extern void io_delay_init(void);
109
110#if defined(CONFIG_PARAVIRT)
111#include <asm/paravirt.h>
112#else
113
114static inline void slow_down_io(void)
115{
116 native_io_delay();
117#ifdef REALLY_SLOW_IO
118 native_io_delay();
119 native_io_delay();
120 native_io_delay();
121#endif
122}
123
124#endif
125
126#define __BUILDIO(bwl, bw, type) \
127static inline void out##bwl(unsigned type value, int port) \
128{ \
129 out##bwl##_local(value, port); \
130} \
131 \
132static inline unsigned type in##bwl(int port) \
133{ \
134 return in##bwl##_local(port); \
135}
136
137#define BUILDIO(bwl, bw, type) \
138static inline void out##bwl##_local(unsigned type value, int port) \
139{ \
140 asm volatile("out" #bwl " %" #bw "0, %w1" \
141 : : "a"(value), "Nd"(port)); \
142} \
143 \
144static inline unsigned type in##bwl##_local(int port) \
145{ \
146 unsigned type value; \
147 asm volatile("in" #bwl " %w1, %" #bw "0" \
148 : "=a"(value) : "Nd"(port)); \
149 return value; \
150} \
151 \
152static inline void out##bwl##_local_p(unsigned type value, int port) \
153{ \
154 out##bwl##_local(value, port); \
155 slow_down_io(); \
156} \
157 \
158static inline unsigned type in##bwl##_local_p(int port) \
159{ \
160 unsigned type value = in##bwl##_local(port); \
161 slow_down_io(); \
162 return value; \
163} \
164 \
165__BUILDIO(bwl, bw, type) \
166 \
167static inline void out##bwl##_p(unsigned type value, int port) \
168{ \
169 out##bwl(value, port); \
170 slow_down_io(); \
171} \
172 \
173static inline unsigned type in##bwl##_p(int port) \
174{ \
175 unsigned type value = in##bwl(port); \
176 slow_down_io(); \
177 return value; \
178} \
179 \
180static inline void outs##bwl(int port, const void *addr, unsigned long count) \
181{ \
182 asm volatile("rep; outs" #bwl \
183 : "+S"(addr), "+c"(count) : "d"(port)); \
184} \
185 \
186static inline void ins##bwl(int port, void *addr, unsigned long count) \
187{ \
188 asm volatile("rep; ins" #bwl \
189 : "+D"(addr), "+c"(count) : "d"(port)); \
190}
191
192BUILDIO(b, b, char)
193BUILDIO(w, w, short)
194BUILDIO(l, , int)
195
196#endif /* _ASM_X86_IO_32_H */
diff --git a/arch/x86/include/asm/io_64.h b/arch/x86/include/asm/io_64.h
deleted file mode 100644
index 244067893af4..000000000000
--- a/arch/x86/include/asm/io_64.h
+++ /dev/null
@@ -1,181 +0,0 @@
1#ifndef _ASM_X86_IO_64_H
2#define _ASM_X86_IO_64_H
3
4
5/*
6 * This file contains the definitions for the x86 IO instructions
7 * inb/inw/inl/outb/outw/outl and the "string versions" of the same
8 * (insb/insw/insl/outsb/outsw/outsl). You can also use "pausing"
9 * versions of the single-IO instructions (inb_p/inw_p/..).
10 *
11 * This file is not meant to be obfuscating: it's just complicated
12 * to (a) handle it all in a way that makes gcc able to optimize it
13 * as well as possible and (b) trying to avoid writing the same thing
14 * over and over again with slight variations and possibly making a
15 * mistake somewhere.
16 */
17
18/*
19 * Thanks to James van Artsdalen for a better timing-fix than
20 * the two short jumps: using outb's to a nonexistent port seems
21 * to guarantee better timings even on fast machines.
22 *
23 * On the other hand, I'd like to be sure of a non-existent port:
24 * I feel a bit unsafe about using 0x80 (should be safe, though)
25 *
26 * Linus
27 */
28
29 /*
30 * Bit simplified and optimized by Jan Hubicka
31 * Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999.
32 *
33 * isa_memset_io, isa_memcpy_fromio, isa_memcpy_toio added,
34 * isa_read[wl] and isa_write[wl] fixed
35 * - Arnaldo Carvalho de Melo <acme@conectiva.com.br>
36 */
37
38extern void native_io_delay(void);
39
40extern int io_delay_type;
41extern void io_delay_init(void);
42
43#if defined(CONFIG_PARAVIRT)
44#include <asm/paravirt.h>
45#else
46
47static inline void slow_down_io(void)
48{
49 native_io_delay();
50#ifdef REALLY_SLOW_IO
51 native_io_delay();
52 native_io_delay();
53 native_io_delay();
54#endif
55}
56#endif
57
58/*
59 * Talk about misusing macros..
60 */
61#define __OUT1(s, x) \
62static inline void out##s(unsigned x value, unsigned short port) {
63
64#define __OUT2(s, s1, s2) \
65asm volatile ("out" #s " %" s1 "0,%" s2 "1"
66
67#ifndef REALLY_SLOW_IO
68#define REALLY_SLOW_IO
69#define UNSET_REALLY_SLOW_IO
70#endif
71
72#define __OUT(s, s1, x) \
73 __OUT1(s, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
74 } \
75 __OUT1(s##_p, x) __OUT2(s, s1, "w") : : "a" (value), "Nd" (port)); \
76 slow_down_io(); \
77}
78
79#define __IN1(s) \
80static inline RETURN_TYPE in##s(unsigned short port) \
81{ \
82 RETURN_TYPE _v;
83
84#define __IN2(s, s1, s2) \
85 asm volatile ("in" #s " %" s2 "1,%" s1 "0"
86
87#define __IN(s, s1, i...) \
88 __IN1(s) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
89 return _v; \
90 } \
91 __IN1(s##_p) __IN2(s, s1, "w") : "=a" (_v) : "Nd" (port), ##i); \
92 slow_down_io(); \
93 return _v; }
94
95#ifdef UNSET_REALLY_SLOW_IO
96#undef REALLY_SLOW_IO
97#endif
98
99#define __INS(s) \
100static inline void ins##s(unsigned short port, void *addr, \
101 unsigned long count) \
102{ \
103 asm volatile ("rep ; ins" #s \
104 : "=D" (addr), "=c" (count) \
105 : "d" (port), "0" (addr), "1" (count)); \
106}
107
108#define __OUTS(s) \
109static inline void outs##s(unsigned short port, const void *addr, \
110 unsigned long count) \
111{ \
112 asm volatile ("rep ; outs" #s \
113 : "=S" (addr), "=c" (count) \
114 : "d" (port), "0" (addr), "1" (count)); \
115}
116
117#define RETURN_TYPE unsigned char
118__IN(b, "")
119#undef RETURN_TYPE
120#define RETURN_TYPE unsigned short
121__IN(w, "")
122#undef RETURN_TYPE
123#define RETURN_TYPE unsigned int
124__IN(l, "")
125#undef RETURN_TYPE
126
127__OUT(b, "b", char)
128__OUT(w, "w", short)
129__OUT(l, , int)
130
131__INS(b)
132__INS(w)
133__INS(l)
134
135__OUTS(b)
136__OUTS(w)
137__OUTS(l)
138
139#if defined(__KERNEL__) && defined(__x86_64__)
140
141#include <linux/vmalloc.h>
142
143#include <asm-generic/iomap.h>
144
145void __memcpy_fromio(void *, unsigned long, unsigned);
146void __memcpy_toio(unsigned long, const void *, unsigned);
147
148static inline void memcpy_fromio(void *to, const volatile void __iomem *from,
149 unsigned len)
150{
151 __memcpy_fromio(to, (unsigned long)from, len);
152}
153
154static inline void memcpy_toio(volatile void __iomem *to, const void *from,
155 unsigned len)
156{
157 __memcpy_toio((unsigned long)to, from, len);
158}
159
160void memset_io(volatile void __iomem *a, int b, size_t c);
161
162/*
163 * ISA space is 'always mapped' on a typical x86 system, no need to
164 * explicitly ioremap() it. The fact that the ISA IO space is mapped
165 * to PAGE_OFFSET is pure coincidence - it does not mean ISA values
166 * are physical addresses. The following constant pointer can be
167 * used as the IO-area pointer (it can be iounmapped as well, so the
168 * analogy with PCI is quite large):
169 */
170#define __ISA_IO_base ((char __iomem *)(PAGE_OFFSET))
171
172#define flush_write_buffers()
173
174/*
175 * Convert a virtual cached pointer to an uncached pointer
176 */
177#define xlate_dev_kmem_ptr(p) p
178
179#endif /* __KERNEL__ */
180
181#endif /* _ASM_X86_IO_64_H */
diff --git a/arch/x86/include/asm/io_apic.h b/arch/x86/include/asm/io_apic.h
index 7c7c16cde1f8..35832a03a515 100644
--- a/arch/x86/include/asm/io_apic.h
+++ b/arch/x86/include/asm/io_apic.h
@@ -143,8 +143,6 @@ extern int noioapicreroute;
143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */ 143/* 1 if the timer IRQ uses the '8259A Virtual Wire' mode */
144extern int timer_through_8259; 144extern int timer_through_8259;
145 145
146extern void io_apic_disable_legacy(void);
147
148/* 146/*
149 * If we use the IO-APIC for IRQ routing, disable automatic 147 * If we use the IO-APIC for IRQ routing, disable automatic
150 * assignment of PCI IRQ's. 148 * assignment of PCI IRQ's.
@@ -160,6 +158,7 @@ extern int io_apic_get_redir_entries(int ioapic);
160struct io_apic_irq_attr; 158struct io_apic_irq_attr;
161extern int io_apic_set_pci_routing(struct device *dev, int irq, 159extern int io_apic_set_pci_routing(struct device *dev, int irq,
162 struct io_apic_irq_attr *irq_attr); 160 struct io_apic_irq_attr *irq_attr);
161void setup_IO_APIC_irq_extra(u32 gsi);
163extern int (*ioapic_renumber_irq)(int ioapic, int irq); 162extern int (*ioapic_renumber_irq)(int ioapic, int irq);
164extern void ioapic_init_mappings(void); 163extern void ioapic_init_mappings(void);
165extern void ioapic_insert_resources(void); 164extern void ioapic_insert_resources(void);
@@ -188,6 +187,7 @@ extern struct mp_ioapic_gsi mp_gsi_routing[];
188int mp_find_ioapic(int gsi); 187int mp_find_ioapic(int gsi);
189int mp_find_ioapic_pin(int ioapic, int gsi); 188int mp_find_ioapic_pin(int ioapic, int gsi);
190void __init mp_register_ioapic(int id, u32 address, u32 gsi_base); 189void __init mp_register_ioapic(int id, u32 address, u32 gsi_base);
190extern void __init pre_init_apic_IRQ0(void);
191 191
192#else /* !CONFIG_X86_IO_APIC */ 192#else /* !CONFIG_X86_IO_APIC */
193 193
@@ -197,7 +197,11 @@ static const int timer_through_8259 = 0;
197static inline void ioapic_init_mappings(void) { } 197static inline void ioapic_init_mappings(void) { }
198static inline void ioapic_insert_resources(void) { } 198static inline void ioapic_insert_resources(void) { }
199static inline void probe_nr_irqs_gsi(void) { } 199static inline void probe_nr_irqs_gsi(void) { }
200static inline int mp_find_ioapic(int gsi) { return 0; }
200 201
202struct io_apic_irq_attr;
203static inline int io_apic_set_pci_routing(struct device *dev, int irq,
204 struct io_apic_irq_attr *irq_attr) { return 0; }
201#endif 205#endif
202 206
203#endif /* _ASM_X86_IO_APIC_H */ 207#endif /* _ASM_X86_IO_APIC_H */
diff --git a/arch/x86/include/asm/iommu.h b/arch/x86/include/asm/iommu.h
index fd6d21bbee6c..345c99cef152 100644
--- a/arch/x86/include/asm/iommu.h
+++ b/arch/x86/include/asm/iommu.h
@@ -1,8 +1,6 @@
1#ifndef _ASM_X86_IOMMU_H 1#ifndef _ASM_X86_IOMMU_H
2#define _ASM_X86_IOMMU_H 2#define _ASM_X86_IOMMU_H
3 3
4extern void pci_iommu_shutdown(void);
5extern void no_iommu_init(void);
6extern struct dma_map_ops nommu_dma_ops; 4extern struct dma_map_ops nommu_dma_ops;
7extern int force_iommu, no_iommu; 5extern int force_iommu, no_iommu;
8extern int iommu_detected; 6extern int iommu_detected;
diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h
index ddda6cbed6f4..5458380b6ef8 100644
--- a/arch/x86/include/asm/irq.h
+++ b/arch/x86/include/asm/irq.h
@@ -34,9 +34,10 @@ static inline int irq_canonicalize(int irq)
34#ifdef CONFIG_HOTPLUG_CPU 34#ifdef CONFIG_HOTPLUG_CPU
35#include <linux/cpumask.h> 35#include <linux/cpumask.h>
36extern void fixup_irqs(void); 36extern void fixup_irqs(void);
37extern void irq_force_complete_move(int);
37#endif 38#endif
38 39
39extern void (*generic_interrupt_extension)(void); 40extern void (*x86_platform_ipi_callback)(void);
40extern void native_init_IRQ(void); 41extern void native_init_IRQ(void);
41extern bool handle_irq(unsigned irq, struct pt_regs *regs); 42extern bool handle_irq(unsigned irq, struct pt_regs *regs);
42 43
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 28c3bf3f4c84..bb5318bbe0e4 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -28,28 +28,33 @@
28#define MCE_VECTOR 0x12 28#define MCE_VECTOR 0x12
29 29
30/* 30/*
31 * IDT vectors usable for external interrupt sources start 31 * IDT vectors usable for external interrupt sources start at 0x20.
32 * at 0x20: 32 * (0x80 is the syscall vector, 0x30-0x3f are for ISA)
33 */ 33 */
34#define FIRST_EXTERNAL_VECTOR 0x20 34#define FIRST_EXTERNAL_VECTOR 0x20
35 35/*
36#ifdef CONFIG_X86_32 36 * We start allocating at 0x21 to spread out vectors evenly between
37# define SYSCALL_VECTOR 0x80 37 * priority levels. (0x80 is the syscall vector)
38# define IA32_SYSCALL_VECTOR 0x80 38 */
39#else 39#define VECTOR_OFFSET_START 1
40# define IA32_SYSCALL_VECTOR 0x80
41#endif
42 40
43/* 41/*
44 * Reserve the lowest usable priority level 0x20 - 0x2f for triggering 42 * Reserve the lowest usable vector (and hence lowest priority) 0x20 for
45 * cleanup after irq migration. 43 * triggering cleanup after irq migration. 0x21-0x2f will still be used
44 * for device interrupts.
46 */ 45 */
47#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR 46#define IRQ_MOVE_CLEANUP_VECTOR FIRST_EXTERNAL_VECTOR
48 47
48#define IA32_SYSCALL_VECTOR 0x80
49#ifdef CONFIG_X86_32
50# define SYSCALL_VECTOR 0x80
51#endif
52
49/* 53/*
50 * Vectors 0x30-0x3f are used for ISA interrupts. 54 * Vectors 0x30-0x3f are used for ISA interrupts.
55 * round up to the next 16-vector boundary
51 */ 56 */
52#define IRQ0_VECTOR (FIRST_EXTERNAL_VECTOR + 0x10) 57#define IRQ0_VECTOR ((FIRST_EXTERNAL_VECTOR + 16) & ~15)
53 58
54#define IRQ1_VECTOR (IRQ0_VECTOR + 1) 59#define IRQ1_VECTOR (IRQ0_VECTOR + 1)
55#define IRQ2_VECTOR (IRQ0_VECTOR + 2) 60#define IRQ2_VECTOR (IRQ0_VECTOR + 2)
@@ -111,27 +116,20 @@
111/* 116/*
112 * Generic system vector for platform specific use 117 * Generic system vector for platform specific use
113 */ 118 */
114#define GENERIC_INTERRUPT_VECTOR 0xed 119#define X86_PLATFORM_IPI_VECTOR 0xed
115 120
116/* 121/*
117 * Performance monitoring pending work vector: 122 * Performance monitoring pending work vector:
118 */ 123 */
119#define LOCAL_PENDING_VECTOR 0xec 124#define LOCAL_PENDING_VECTOR 0xec
120 125
121#define UV_BAU_MESSAGE 0xec 126#define UV_BAU_MESSAGE 0xea
122 127
123/* 128/*
124 * Self IPI vector for machine checks 129 * Self IPI vector for machine checks
125 */ 130 */
126#define MCE_SELF_VECTOR 0xeb 131#define MCE_SELF_VECTOR 0xeb
127 132
128/*
129 * First APIC vector available to drivers: (vectors 0x30-0xee) we
130 * start at 0x31(0x41) to spread out vectors evenly between priority
131 * levels. (0x80 is the syscall vector)
132 */
133#define FIRST_DEVICE_VECTOR (IRQ15_VECTOR + 2)
134
135#define NR_VECTORS 256 133#define NR_VECTORS 256
136 134
137#define FPU_IRQ 13 135#define FPU_IRQ 13
@@ -159,21 +157,21 @@ static inline int invalid_vm86_irq(int irq)
159 157
160#define NR_IRQS_LEGACY 16 158#define NR_IRQS_LEGACY 16
161 159
162#define CPU_VECTOR_LIMIT ( 8 * NR_CPUS )
163#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS ) 160#define IO_APIC_VECTOR_LIMIT ( 32 * MAX_IO_APICS )
164 161
165#ifdef CONFIG_X86_IO_APIC 162#ifdef CONFIG_X86_IO_APIC
166# ifdef CONFIG_SPARSE_IRQ 163# ifdef CONFIG_SPARSE_IRQ
164# define CPU_VECTOR_LIMIT (64 * NR_CPUS)
167# define NR_IRQS \ 165# define NR_IRQS \
168 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \ 166 (CPU_VECTOR_LIMIT > IO_APIC_VECTOR_LIMIT ? \
169 (NR_VECTORS + CPU_VECTOR_LIMIT) : \ 167 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
170 (NR_VECTORS + IO_APIC_VECTOR_LIMIT)) 168 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
171# else 169# else
172# if NR_CPUS < MAX_IO_APICS 170# define CPU_VECTOR_LIMIT (32 * NR_CPUS)
173# define NR_IRQS (NR_VECTORS + 4*CPU_VECTOR_LIMIT) 171# define NR_IRQS \
174# else 172 (CPU_VECTOR_LIMIT < IO_APIC_VECTOR_LIMIT ? \
175# define NR_IRQS (NR_VECTORS + IO_APIC_VECTOR_LIMIT) 173 (NR_VECTORS + CPU_VECTOR_LIMIT) : \
176# endif 174 (NR_VECTORS + IO_APIC_VECTOR_LIMIT))
177# endif 175# endif
178#else /* !CONFIG_X86_IO_APIC: */ 176#else /* !CONFIG_X86_IO_APIC: */
179# define NR_IRQS NR_IRQS_LEGACY 177# define NR_IRQS NR_IRQS_LEGACY
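
The new IRQ0_VECTOR definition rounds up to a 16-vector boundary instead of hard-coding an offset; with FIRST_EXTERNAL_VECTOR at 0x20 the result is still 0x30, which a trivial standalone check confirms (illustrative only):

#include <assert.h>

int main(void)
{
	int first_external_vector = 0x20;

	/* IRQ0_VECTOR == ((FIRST_EXTERNAL_VECTOR + 16) & ~15) */
	assert(((first_external_vector + 16) & ~15) == 0x30);
	return 0;
}
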
diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h
index c2d1f3b58e5f..af00bd1d2089 100644
--- a/arch/x86/include/asm/k8.h
+++ b/arch/x86/include/asm/k8.h
@@ -4,20 +4,28 @@
4#include <linux/pci.h> 4#include <linux/pci.h>
5 5
6extern struct pci_device_id k8_nb_ids[]; 6extern struct pci_device_id k8_nb_ids[];
7struct bootnode;
7 8
8extern int early_is_k8_nb(u32 value); 9extern int early_is_k8_nb(u32 value);
9extern struct pci_dev **k8_northbridges; 10extern struct pci_dev **k8_northbridges;
10extern int num_k8_northbridges; 11extern int num_k8_northbridges;
11extern int cache_k8_northbridges(void); 12extern int cache_k8_northbridges(void);
12extern void k8_flush_garts(void); 13extern void k8_flush_garts(void);
13extern int k8_scan_nodes(unsigned long start, unsigned long end); 14extern int k8_get_nodes(struct bootnode *nodes);
15extern int k8_numa_init(unsigned long start_pfn, unsigned long end_pfn);
16extern int k8_scan_nodes(void);
14 17
15#ifdef CONFIG_K8_NB 18#ifdef CONFIG_K8_NB
19extern int num_k8_northbridges;
20
16static inline struct pci_dev *node_to_k8_nb_misc(int node) 21static inline struct pci_dev *node_to_k8_nb_misc(int node)
17{ 22{
18 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; 23 return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL;
19} 24}
25
20#else 26#else
27#define num_k8_northbridges 0
28
21static inline struct pci_dev *node_to_k8_nb_misc(int node) 29static inline struct pci_dev *node_to_k8_nb_misc(int node)
22{ 30{
23 return NULL; 31 return NULL;
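
A small host-side illustration of why the !CONFIG_K8_NB stub above defines num_k8_northbridges to 0: the bounds check in node_to_k8_nb_misc() then fails for every node, so callers get NULL without needing their own #ifdefs. The types are simplified so the sketch compiles on its own.

#include <stdio.h>

#define num_k8_northbridges 0           /* the !CONFIG_K8_NB stub */

static int fake_nb;                     /* stand-in for a pci_dev */

static void *node_to_k8_nb_misc(int node)
{
        return (node < num_k8_northbridges) ? (void *)&fake_nb : NULL;
}

int main(void)
{
        printf("node 0 -> %p\n", node_to_k8_nb_misc(0));    /* (nil) */
        return 0;
}
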
diff --git a/arch/x86/include/asm/kprobes.h b/arch/x86/include/asm/kprobes.h
index 4fe681de1e76..4ffa345a8ccb 100644
--- a/arch/x86/include/asm/kprobes.h
+++ b/arch/x86/include/asm/kprobes.h
@@ -32,7 +32,10 @@ struct kprobe;
32 32
33typedef u8 kprobe_opcode_t; 33typedef u8 kprobe_opcode_t;
34#define BREAKPOINT_INSTRUCTION 0xcc 34#define BREAKPOINT_INSTRUCTION 0xcc
35#define RELATIVEJUMP_INSTRUCTION 0xe9 35#define RELATIVEJUMP_OPCODE 0xe9
36#define RELATIVEJUMP_SIZE 5
37#define RELATIVECALL_OPCODE 0xe8
38#define RELATIVE_ADDR_SIZE 4
36#define MAX_INSN_SIZE 16 39#define MAX_INSN_SIZE 16
37#define MAX_STACK_SIZE 64 40#define MAX_STACK_SIZE 64
38#define MIN_STACK_SIZE(ADDR) \ 41#define MIN_STACK_SIZE(ADDR) \
@@ -44,6 +47,17 @@ typedef u8 kprobe_opcode_t;
44 47
45#define flush_insn_slot(p) do { } while (0) 48#define flush_insn_slot(p) do { } while (0)
46 49
50/* optinsn template addresses */
51extern kprobe_opcode_t optprobe_template_entry;
52extern kprobe_opcode_t optprobe_template_val;
53extern kprobe_opcode_t optprobe_template_call;
54extern kprobe_opcode_t optprobe_template_end;
55#define MAX_OPTIMIZED_LENGTH (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
56#define MAX_OPTINSN_SIZE \
57 (((unsigned long)&optprobe_template_end - \
58 (unsigned long)&optprobe_template_entry) + \
59 MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)
60
47extern const int kretprobe_blacklist_size; 61extern const int kretprobe_blacklist_size;
48 62
49void arch_remove_kprobe(struct kprobe *p); 63void arch_remove_kprobe(struct kprobe *p);
@@ -64,6 +78,21 @@ struct arch_specific_insn {
64 int boostable; 78 int boostable;
65}; 79};
66 80
81struct arch_optimized_insn {
82 /* copy of the original instructions */
83 kprobe_opcode_t copied_insn[RELATIVE_ADDR_SIZE];
84 /* detour code buffer */
85 kprobe_opcode_t *insn;
86 /* the size of instructions copied to detour code buffer */
87 size_t size;
88};
89
90/* Return true (!0) if optinsn is prepared for optimization. */
91static inline int arch_prepared_optinsn(struct arch_optimized_insn *optinsn)
92{
93 return optinsn->size;
94}
95
67struct prev_kprobe { 96struct prev_kprobe {
68 struct kprobe *kp; 97 struct kprobe *kp;
69 unsigned long status; 98 unsigned long status;
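
The detour buffer sized by MAX_OPTINSN_SIZE above holds the assembly template, the instructions copied from the probed site, and a jump back to the original code. The sketch below only reproduces the arithmetic; TEMPLATE_SIZE is a stand-in for whatever the optprobe_template_entry..optprobe_template_end stubs actually occupy.

#include <stdio.h>

#define MAX_INSN_SIZE           16
#define RELATIVE_ADDR_SIZE      4
#define RELATIVEJUMP_SIZE       5
#define TEMPLATE_SIZE           96      /* assumed stand-in value */

#define MAX_OPTIMIZED_LENGTH    (MAX_INSN_SIZE + RELATIVE_ADDR_SIZE)
#define MAX_OPTINSN_SIZE        \
        (TEMPLATE_SIZE + MAX_OPTIMIZED_LENGTH + RELATIVEJUMP_SIZE)

int main(void)
{
        /* template stub | copied original insns | jmp back */
        printf("max detour buffer: %d + %d + %d = %d bytes\n",
               TEMPLATE_SIZE, MAX_OPTIMIZED_LENGTH, RELATIVEJUMP_SIZE,
               MAX_OPTINSN_SIZE);
        return 0;
}
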
diff --git a/arch/x86/include/asm/kvm.h b/arch/x86/include/asm/kvm.h
index 4a5fe914dc59..f46b79f6c16c 100644
--- a/arch/x86/include/asm/kvm.h
+++ b/arch/x86/include/asm/kvm.h
@@ -19,6 +19,8 @@
19#define __KVM_HAVE_MSIX 19#define __KVM_HAVE_MSIX
20#define __KVM_HAVE_MCE 20#define __KVM_HAVE_MCE
21#define __KVM_HAVE_PIT_STATE2 21#define __KVM_HAVE_PIT_STATE2
22#define __KVM_HAVE_XEN_HVM
23#define __KVM_HAVE_VCPU_EVENTS
22 24
23/* Architectural interrupt line count. */ 25/* Architectural interrupt line count. */
24#define KVM_NR_INTERRUPTS 256 26#define KVM_NR_INTERRUPTS 256
@@ -79,6 +81,7 @@ struct kvm_ioapic_state {
79#define KVM_IRQCHIP_PIC_MASTER 0 81#define KVM_IRQCHIP_PIC_MASTER 0
80#define KVM_IRQCHIP_PIC_SLAVE 1 82#define KVM_IRQCHIP_PIC_SLAVE 1
81#define KVM_IRQCHIP_IOAPIC 2 83#define KVM_IRQCHIP_IOAPIC 2
84#define KVM_NR_IRQCHIPS 3
82 85
83/* for KVM_GET_REGS and KVM_SET_REGS */ 86/* for KVM_GET_REGS and KVM_SET_REGS */
84struct kvm_regs { 87struct kvm_regs {
@@ -250,4 +253,35 @@ struct kvm_reinject_control {
250 __u8 pit_reinject; 253 __u8 pit_reinject;
251 __u8 reserved[31]; 254 __u8 reserved[31];
252}; 255};
256
257/* When set in flags, include corresponding fields on KVM_SET_VCPU_EVENTS */
258#define KVM_VCPUEVENT_VALID_NMI_PENDING 0x00000001
259#define KVM_VCPUEVENT_VALID_SIPI_VECTOR 0x00000002
260
261/* for KVM_GET/SET_VCPU_EVENTS */
262struct kvm_vcpu_events {
263 struct {
264 __u8 injected;
265 __u8 nr;
266 __u8 has_error_code;
267 __u8 pad;
268 __u32 error_code;
269 } exception;
270 struct {
271 __u8 injected;
272 __u8 nr;
273 __u8 soft;
274 __u8 pad;
275 } interrupt;
276 struct {
277 __u8 injected;
278 __u8 pending;
279 __u8 masked;
280 __u8 pad;
281 } nmi;
282 __u32 sipi_vector;
283 __u32 flags;
284 __u32 reserved[10];
285};
286
253#endif /* _ASM_X86_KVM_H */ 287#endif /* _ASM_X86_KVM_H */
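
A userspace sketch of how the new vcpu-events ABI is meant to be driven: read the current state, change one block, and write it back with only the matching validity flag set. The KVM_GET/SET_VCPU_EVENTS ioctl numbers and the vcpu file descriptor are assumed to come from <linux/kvm.h> and the usual KVM setup; error handling is trimmed.

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Clear a pending NMI on one vcpu; returns 0 on success, -1 on error. */
static int clear_pending_nmi(int vcpu_fd)
{
        struct kvm_vcpu_events events;

        if (ioctl(vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
                return -1;

        events.nmi.pending = 0;
        /* Only transfer the NMI block back to the kernel. */
        events.flags = KVM_VCPUEVENT_VALID_NMI_PENDING;

        return ioctl(vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
}
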
diff --git a/arch/x86/include/asm/kvm_emulate.h b/arch/x86/include/asm/kvm_emulate.h
index b7ed2c423116..7a6f54fa13ba 100644
--- a/arch/x86/include/asm/kvm_emulate.h
+++ b/arch/x86/include/asm/kvm_emulate.h
@@ -54,13 +54,23 @@ struct x86_emulate_ctxt;
54struct x86_emulate_ops { 54struct x86_emulate_ops {
55 /* 55 /*
56 * read_std: Read bytes of standard (non-emulated/special) memory. 56 * read_std: Read bytes of standard (non-emulated/special) memory.
57 * Used for instruction fetch, stack operations, and others. 57 * Used for descriptor reading.
58 * @addr: [IN ] Linear address from which to read. 58 * @addr: [IN ] Linear address from which to read.
59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'. 59 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
60 * @bytes: [IN ] Number of bytes to read from memory. 60 * @bytes: [IN ] Number of bytes to read from memory.
61 */ 61 */
62 int (*read_std)(unsigned long addr, void *val, 62 int (*read_std)(unsigned long addr, void *val,
63 unsigned int bytes, struct kvm_vcpu *vcpu); 63 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64
65 /*
66 * fetch: Read bytes of standard (non-emulated/special) memory.
67 * Used for instruction fetch.
68 * @addr: [IN ] Linear address from which to read.
69 * @val: [OUT] Value read from memory, zero-extended to 'u_long'.
70 * @bytes: [IN ] Number of bytes to read from memory.
71 */
72 int (*fetch)(unsigned long addr, void *val,
73 unsigned int bytes, struct kvm_vcpu *vcpu, u32 *error);
64 74
65 /* 75 /*
66 * read_emulated: Read bytes from emulated/special memory area. 76 * read_emulated: Read bytes from emulated/special memory area.
@@ -74,7 +84,7 @@ struct x86_emulate_ops {
74 struct kvm_vcpu *vcpu); 84 struct kvm_vcpu *vcpu);
75 85
76 /* 86 /*
77 * write_emulated: Read bytes from emulated/special memory area. 87 * write_emulated: Write bytes to emulated/special memory area.
78 * @addr: [IN ] Linear address to which to write. 88 * @addr: [IN ] Linear address to which to write.
79 * @val: [IN ] Value to write to memory (low-order bytes used as 89 * @val: [IN ] Value to write to memory (low-order bytes used as
80 * required). 90 * required).
@@ -129,7 +139,7 @@ struct decode_cache {
129 u8 seg_override; 139 u8 seg_override;
130 unsigned int d; 140 unsigned int d;
131 unsigned long regs[NR_VCPU_REGS]; 141 unsigned long regs[NR_VCPU_REGS];
132 unsigned long eip; 142 unsigned long eip, eip_orig;
133 /* modrm */ 143 /* modrm */
134 u8 modrm; 144 u8 modrm;
135 u8 modrm_mod; 145 u8 modrm_mod;
@@ -168,6 +178,7 @@ struct x86_emulate_ctxt {
168 178
169/* Execution mode, passed to the emulator. */ 179/* Execution mode, passed to the emulator. */
170#define X86EMUL_MODE_REAL 0 /* Real mode. */ 180#define X86EMUL_MODE_REAL 0 /* Real mode. */
181#define X86EMUL_MODE_VM86 1 /* Virtual 8086 mode. */
171#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */ 182#define X86EMUL_MODE_PROT16 2 /* 16-bit protected mode. */
172#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */ 183#define X86EMUL_MODE_PROT32 4 /* 32-bit protected mode. */
173#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */ 184#define X86EMUL_MODE_PROT64 8 /* 64-bit (long) mode. */
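
The read_std/fetch split above lets a backend translate data reads and instruction fetches with different access rights and report the fault code through the new *error argument. The callbacks below are only a shape sketch: copy_data_from_guest() and copy_insn_from_guest() are hypothetical helpers, not functions from this patch.

static int emulator_read_std(unsigned long addr, void *val,
                             unsigned int bytes, struct kvm_vcpu *vcpu,
                             u32 *error)
{
        /* descriptor/data access: translate with read permissions */
        return copy_data_from_guest(vcpu, addr, val, bytes, error);
}

static int emulator_fetch(unsigned long addr, void *val,
                          unsigned int bytes, struct kvm_vcpu *vcpu,
                          u32 *error)
{
        /* instruction fetch: translate with execute permissions */
        return copy_insn_from_guest(vcpu, addr, val, bytes, error);
}

/* wired up as .read_std = emulator_read_std, .fetch = emulator_fetch */
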
diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index d83892226f73..06d9e79ca37d 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -25,7 +25,7 @@
25#include <asm/mtrr.h> 25#include <asm/mtrr.h>
26#include <asm/msr-index.h> 26#include <asm/msr-index.h>
27 27
28#define KVM_MAX_VCPUS 16 28#define KVM_MAX_VCPUS 64
29#define KVM_MEMORY_SLOTS 32 29#define KVM_MEMORY_SLOTS 32
30/* memory slots that are not exposed to userspace */ 30/* memory slots that are not exposed to userspace */
31#define KVM_PRIVATE_MEM_SLOTS 4 31#define KVM_PRIVATE_MEM_SLOTS 4
@@ -38,19 +38,6 @@
38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \ 38#define CR3_L_MODE_RESERVED_BITS (CR3_NONPAE_RESERVED_BITS | \
39 0xFFFFFF0000000000ULL) 39 0xFFFFFF0000000000ULL)
40 40
41#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
42 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
43#define KVM_GUEST_CR0_MASK \
44 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
45#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
46 (X86_CR0_WP | X86_CR0_NE | X86_CR0_TS | X86_CR0_MP)
47#define KVM_VM_CR0_ALWAYS_ON \
48 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
49#define KVM_GUEST_CR4_MASK \
50 (X86_CR4_VME | X86_CR4_PSE | X86_CR4_PAE | X86_CR4_PGE | X86_CR4_VMXE)
51#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
52#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
53
54#define INVALID_PAGE (~(hpa_t)0) 41#define INVALID_PAGE (~(hpa_t)0)
55#define UNMAPPED_GVA (~(gpa_t)0) 42#define UNMAPPED_GVA (~(gpa_t)0)
56 43
@@ -256,7 +243,8 @@ struct kvm_mmu {
256 void (*new_cr3)(struct kvm_vcpu *vcpu); 243 void (*new_cr3)(struct kvm_vcpu *vcpu);
257 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err); 244 int (*page_fault)(struct kvm_vcpu *vcpu, gva_t gva, u32 err);
258 void (*free)(struct kvm_vcpu *vcpu); 245 void (*free)(struct kvm_vcpu *vcpu);
259 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva); 246 gpa_t (*gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t gva, u32 access,
247 u32 *error);
260 void (*prefetch_page)(struct kvm_vcpu *vcpu, 248 void (*prefetch_page)(struct kvm_vcpu *vcpu,
261 struct kvm_mmu_page *page); 249 struct kvm_mmu_page *page);
262 int (*sync_page)(struct kvm_vcpu *vcpu, 250 int (*sync_page)(struct kvm_vcpu *vcpu,
@@ -282,13 +270,15 @@ struct kvm_vcpu_arch {
282 u32 regs_dirty; 270 u32 regs_dirty;
283 271
284 unsigned long cr0; 272 unsigned long cr0;
273 unsigned long cr0_guest_owned_bits;
285 unsigned long cr2; 274 unsigned long cr2;
286 unsigned long cr3; 275 unsigned long cr3;
287 unsigned long cr4; 276 unsigned long cr4;
277 unsigned long cr4_guest_owned_bits;
288 unsigned long cr8; 278 unsigned long cr8;
289 u32 hflags; 279 u32 hflags;
290 u64 pdptrs[4]; /* pae */ 280 u64 pdptrs[4]; /* pae */
291 u64 shadow_efer; 281 u64 efer;
292 u64 apic_base; 282 u64 apic_base;
293 struct kvm_lapic *apic; /* kernel irqchip context */ 283 struct kvm_lapic *apic; /* kernel irqchip context */
294 int32_t apic_arb_prio; 284 int32_t apic_arb_prio;
@@ -354,7 +344,6 @@ struct kvm_vcpu_arch {
354 unsigned int time_offset; 344 unsigned int time_offset;
355 struct page *time_page; 345 struct page *time_page;
356 346
357 bool singlestep; /* guest is single stepped by KVM */
358 bool nmi_pending; 347 bool nmi_pending;
359 bool nmi_injected; 348 bool nmi_injected;
360 349
@@ -371,17 +360,31 @@ struct kvm_vcpu_arch {
371 u64 mcg_status; 360 u64 mcg_status;
372 u64 mcg_ctl; 361 u64 mcg_ctl;
373 u64 *mce_banks; 362 u64 *mce_banks;
363
364 /* used for guest single stepping over the given code position */
365 u16 singlestep_cs;
366 unsigned long singlestep_rip;
367 /* fields used by HYPER-V emulation */
368 u64 hv_vapic;
374}; 369};
375 370
376struct kvm_mem_alias { 371struct kvm_mem_alias {
377 gfn_t base_gfn; 372 gfn_t base_gfn;
378 unsigned long npages; 373 unsigned long npages;
379 gfn_t target_gfn; 374 gfn_t target_gfn;
375#define KVM_ALIAS_INVALID 1UL
376 unsigned long flags;
380}; 377};
381 378
382struct kvm_arch{ 379#define KVM_ARCH_HAS_UNALIAS_INSTANTIATION
383 int naliases; 380
381struct kvm_mem_aliases {
384 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS]; 382 struct kvm_mem_alias aliases[KVM_ALIAS_SLOTS];
383 int naliases;
384};
385
386struct kvm_arch {
387 struct kvm_mem_aliases *aliases;
385 388
386 unsigned int n_free_mmu_pages; 389 unsigned int n_free_mmu_pages;
387 unsigned int n_requested_mmu_pages; 390 unsigned int n_requested_mmu_pages;
@@ -397,7 +400,6 @@ struct kvm_arch{
397 struct kvm_pic *vpic; 400 struct kvm_pic *vpic;
398 struct kvm_ioapic *vioapic; 401 struct kvm_ioapic *vioapic;
399 struct kvm_pit *vpit; 402 struct kvm_pit *vpit;
400 struct hlist_head irq_ack_notifier_list;
401 int vapics_in_nmi_mode; 403 int vapics_in_nmi_mode;
402 404
403 unsigned int tss_addr; 405 unsigned int tss_addr;
@@ -410,8 +412,14 @@ struct kvm_arch{
410 gpa_t ept_identity_map_addr; 412 gpa_t ept_identity_map_addr;
411 413
412 unsigned long irq_sources_bitmap; 414 unsigned long irq_sources_bitmap;
413 unsigned long irq_states[KVM_IOAPIC_NUM_PINS];
414 u64 vm_init_tsc; 415 u64 vm_init_tsc;
416 s64 kvmclock_offset;
417
418 struct kvm_xen_hvm_config xen_hvm_config;
419
420 /* fields used by HYPER-V emulation */
421 u64 hv_guest_os_id;
422 u64 hv_hypercall;
415}; 423};
416 424
417struct kvm_vm_stat { 425struct kvm_vm_stat {
@@ -461,12 +469,13 @@ struct descriptor_table {
461struct kvm_x86_ops { 469struct kvm_x86_ops {
462 int (*cpu_has_kvm_support)(void); /* __init */ 470 int (*cpu_has_kvm_support)(void); /* __init */
463 int (*disabled_by_bios)(void); /* __init */ 471 int (*disabled_by_bios)(void); /* __init */
464 void (*hardware_enable)(void *dummy); /* __init */ 472 int (*hardware_enable)(void *dummy);
465 void (*hardware_disable)(void *dummy); 473 void (*hardware_disable)(void *dummy);
466 void (*check_processor_compatibility)(void *rtn); 474 void (*check_processor_compatibility)(void *rtn);
467 int (*hardware_setup)(void); /* __init */ 475 int (*hardware_setup)(void); /* __init */
468 void (*hardware_unsetup)(void); /* __exit */ 476 void (*hardware_unsetup)(void); /* __exit */
469 bool (*cpu_has_accelerated_tpr)(void); 477 bool (*cpu_has_accelerated_tpr)(void);
478 void (*cpuid_update)(struct kvm_vcpu *vcpu);
470 479
471 /* Create, but do not attach this VCPU */ 480 /* Create, but do not attach this VCPU */
472 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id); 481 struct kvm_vcpu *(*vcpu_create)(struct kvm *kvm, unsigned id);
@@ -477,8 +486,8 @@ struct kvm_x86_ops {
477 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu); 486 void (*vcpu_load)(struct kvm_vcpu *vcpu, int cpu);
478 void (*vcpu_put)(struct kvm_vcpu *vcpu); 487 void (*vcpu_put)(struct kvm_vcpu *vcpu);
479 488
480 int (*set_guest_debug)(struct kvm_vcpu *vcpu, 489 void (*set_guest_debug)(struct kvm_vcpu *vcpu,
481 struct kvm_guest_debug *dbg); 490 struct kvm_guest_debug *dbg);
482 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata); 491 int (*get_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata);
483 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data); 492 int (*set_msr)(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
484 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg); 493 u64 (*get_segment_base)(struct kvm_vcpu *vcpu, int seg);
@@ -488,6 +497,7 @@ struct kvm_x86_ops {
488 void (*set_segment)(struct kvm_vcpu *vcpu, 497 void (*set_segment)(struct kvm_vcpu *vcpu,
489 struct kvm_segment *var, int seg); 498 struct kvm_segment *var, int seg);
490 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l); 499 void (*get_cs_db_l_bits)(struct kvm_vcpu *vcpu, int *db, int *l);
500 void (*decache_cr0_guest_bits)(struct kvm_vcpu *vcpu);
491 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu); 501 void (*decache_cr4_guest_bits)(struct kvm_vcpu *vcpu);
492 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0); 502 void (*set_cr0)(struct kvm_vcpu *vcpu, unsigned long cr0);
493 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3); 503 void (*set_cr3)(struct kvm_vcpu *vcpu, unsigned long cr3);
@@ -497,17 +507,18 @@ struct kvm_x86_ops {
497 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 507 void (*set_idt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
498 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 508 void (*get_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
499 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt); 509 void (*set_gdt)(struct kvm_vcpu *vcpu, struct descriptor_table *dt);
500 unsigned long (*get_dr)(struct kvm_vcpu *vcpu, int dr); 510 int (*get_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long *dest);
501 void (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value, 511 int (*set_dr)(struct kvm_vcpu *vcpu, int dr, unsigned long value);
502 int *exception);
503 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg); 512 void (*cache_reg)(struct kvm_vcpu *vcpu, enum kvm_reg reg);
504 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu); 513 unsigned long (*get_rflags)(struct kvm_vcpu *vcpu);
505 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags); 514 void (*set_rflags)(struct kvm_vcpu *vcpu, unsigned long rflags);
515 void (*fpu_activate)(struct kvm_vcpu *vcpu);
516 void (*fpu_deactivate)(struct kvm_vcpu *vcpu);
506 517
507 void (*tlb_flush)(struct kvm_vcpu *vcpu); 518 void (*tlb_flush)(struct kvm_vcpu *vcpu);
508 519
509 void (*run)(struct kvm_vcpu *vcpu, struct kvm_run *run); 520 void (*run)(struct kvm_vcpu *vcpu);
510 int (*handle_exit)(struct kvm_run *run, struct kvm_vcpu *vcpu); 521 int (*handle_exit)(struct kvm_vcpu *vcpu);
511 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu); 522 void (*skip_emulated_instruction)(struct kvm_vcpu *vcpu);
512 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 523 void (*set_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
513 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask); 524 u32 (*get_interrupt_shadow)(struct kvm_vcpu *vcpu, int mask);
@@ -519,13 +530,16 @@ struct kvm_x86_ops {
519 bool has_error_code, u32 error_code); 530 bool has_error_code, u32 error_code);
520 int (*interrupt_allowed)(struct kvm_vcpu *vcpu); 531 int (*interrupt_allowed)(struct kvm_vcpu *vcpu);
521 int (*nmi_allowed)(struct kvm_vcpu *vcpu); 532 int (*nmi_allowed)(struct kvm_vcpu *vcpu);
533 bool (*get_nmi_mask)(struct kvm_vcpu *vcpu);
534 void (*set_nmi_mask)(struct kvm_vcpu *vcpu, bool masked);
522 void (*enable_nmi_window)(struct kvm_vcpu *vcpu); 535 void (*enable_nmi_window)(struct kvm_vcpu *vcpu);
523 void (*enable_irq_window)(struct kvm_vcpu *vcpu); 536 void (*enable_irq_window)(struct kvm_vcpu *vcpu);
524 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); 537 void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr);
525 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr); 538 int (*set_tss_addr)(struct kvm *kvm, unsigned int addr);
526 int (*get_tdp_level)(void); 539 int (*get_tdp_level)(void);
527 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio); 540 u64 (*get_mt_mask)(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio);
528 bool (*gb_page_enable)(void); 541 int (*get_lpage_level)(void);
542 bool (*rdtscp_supported)(void);
529 543
530 const struct trace_print_flags *exit_reasons_str; 544 const struct trace_print_flags *exit_reasons_str;
531}; 545};
@@ -568,7 +582,7 @@ enum emulation_result {
568#define EMULTYPE_NO_DECODE (1 << 0) 582#define EMULTYPE_NO_DECODE (1 << 0)
569#define EMULTYPE_TRAP_UD (1 << 1) 583#define EMULTYPE_TRAP_UD (1 << 1)
570#define EMULTYPE_SKIP (1 << 2) 584#define EMULTYPE_SKIP (1 << 2)
571int emulate_instruction(struct kvm_vcpu *vcpu, struct kvm_run *run, 585int emulate_instruction(struct kvm_vcpu *vcpu,
572 unsigned long cr2, u16 error_code, int emulation_type); 586 unsigned long cr2, u16 error_code, int emulation_type);
573void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context); 587void kvm_report_emulation_failure(struct kvm_vcpu *cvpu, const char *context);
574void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address); 588void realmode_lgdt(struct kvm_vcpu *vcpu, u16 size, unsigned long address);
@@ -585,9 +599,9 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data);
585 599
586struct x86_emulate_ctxt; 600struct x86_emulate_ctxt;
587 601
588int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 602int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in,
589 int size, unsigned port); 603 int size, unsigned port);
590int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 604int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
591 int size, unsigned long count, int down, 605 int size, unsigned long count, int down,
592 gva_t address, int rep, unsigned port); 606 gva_t address, int rep, unsigned port);
593void kvm_emulate_cpuid(struct kvm_vcpu *vcpu); 607void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
@@ -600,8 +614,7 @@ int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr,
600 unsigned long value); 614 unsigned long value);
601 615
602void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); 616void kvm_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg);
603int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 617int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg);
604 int type_bits, int seg);
605 618
606int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason); 619int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason);
607 620
@@ -616,6 +629,9 @@ void kvm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l);
616int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata); 629int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata);
617int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data); 630int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data);
618 631
632unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu);
633void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
634
619void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr); 635void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr);
620void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); 636void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code);
621void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2, 637void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long cr2,
@@ -644,6 +660,10 @@ void __kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu);
644int kvm_mmu_load(struct kvm_vcpu *vcpu); 660int kvm_mmu_load(struct kvm_vcpu *vcpu);
645void kvm_mmu_unload(struct kvm_vcpu *vcpu); 661void kvm_mmu_unload(struct kvm_vcpu *vcpu);
646void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu); 662void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu);
663gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
664gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
665gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
666gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error);
647 667
648int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); 668int kvm_emulate_hypercall(struct kvm_vcpu *vcpu);
649 669
@@ -657,6 +677,7 @@ void kvm_disable_tdp(void);
657 677
658int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3); 678int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3);
659int complete_pio(struct kvm_vcpu *vcpu); 679int complete_pio(struct kvm_vcpu *vcpu);
680bool kvm_check_iopl(struct kvm_vcpu *vcpu);
660 681
661struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn); 682struct kvm_memory_slot *gfn_to_memslot_unaliased(struct kvm *kvm, gfn_t gfn);
662 683
@@ -802,4 +823,7 @@ int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
802int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu); 823int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
803int kvm_cpu_get_interrupt(struct kvm_vcpu *v); 824int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
804 825
826void kvm_define_shared_msr(unsigned index, u32 msr);
827void kvm_set_shared_msr(unsigned index, u64 val, u64 mask);
828
805#endif /* _ASM_X86_KVM_HOST_H */ 829#endif /* _ASM_X86_KVM_HOST_H */
diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h
index c584076a47f4..ffae1420e7d7 100644
--- a/arch/x86/include/asm/kvm_para.h
+++ b/arch/x86/include/asm/kvm_para.h
@@ -2,6 +2,7 @@
2#define _ASM_X86_KVM_PARA_H 2#define _ASM_X86_KVM_PARA_H
3 3
4#include <linux/types.h> 4#include <linux/types.h>
5#include <asm/hyperv.h>
5 6
6/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It 7/* This CPUID returns the signature 'KVMKVMKVM' in ebx, ecx, and edx. It
7 * should be used to determine that a VM is running under KVM. 8 * should be used to determine that a VM is running under KVM.
diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h
index ba0eed8aa1a6..b60f2924c413 100644
--- a/arch/x86/include/asm/lguest_hcall.h
+++ b/arch/x86/include/asm/lguest_hcall.h
@@ -28,22 +28,39 @@
28 28
29#ifndef __ASSEMBLY__ 29#ifndef __ASSEMBLY__
30#include <asm/hw_irq.h> 30#include <asm/hw_irq.h>
31#include <asm/kvm_para.h>
32 31
33/*G:030 32/*G:030
34 * But first, how does our Guest contact the Host to ask for privileged 33 * But first, how does our Guest contact the Host to ask for privileged
35 * operations? There are two ways: the direct way is to make a "hypercall", 34 * operations? There are two ways: the direct way is to make a "hypercall",
36 * to make requests of the Host Itself. 35 * to make requests of the Host Itself.
37 * 36 *
38 * We use the KVM hypercall mechanism, though completely different hypercall 37 * Our hypercall mechanism uses the highest unused trap code (traps 32 and
39 * numbers. Seventeen hypercalls are available: the hypercall number is put in 38 * above are used by real hardware interrupts). Seventeen hypercalls are
40 * the %eax register, and the arguments (when required) are placed in %ebx, 39 * available: the hypercall number is put in the %eax register, and the
41 * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. 40 * arguments (when required) are placed in %ebx, %ecx, %edx and %esi.
41 * If a return value makes sense, it's returned in %eax.
42 * 42 *
43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful 43 * Grossly invalid calls result in Sudden Death at the hands of the vengeful
44 * Host, rather than returning failure. This reflects Winston Churchill's 44 * Host, rather than returning failure. This reflects Winston Churchill's
45 * definition of a gentleman: "someone who is only rude intentionally". 45 * definition of a gentleman: "someone who is only rude intentionally".
46:*/ 46 */
47static inline unsigned long
48hcall(unsigned long call,
49 unsigned long arg1, unsigned long arg2, unsigned long arg3,
50 unsigned long arg4)
51{
52 /* "int" is the Intel instruction to trigger a trap. */
53 asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
54 /* The call in %eax (aka "a") might be overwritten */
55 : "=a"(call)
56 /* The arguments are in %eax, %ebx, %ecx, %edx & %esi */
57 : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
58 /* "memory" means this might write somewhere in memory.
59 * This isn't true for all calls, but it's safe to tell
60 * gcc that it might happen so it doesn't get clever. */
61 : "memory");
62 return call;
63}
47 64
48/* Can't use our min() macro here: needs to be a constant */ 65/* Can't use our min() macro here: needs to be a constant */
49#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) 66#define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32)
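
Guest-side use of the hcall() wrapper above is just a function call, with unused argument slots passed as zero. LHCALL_FLUSH_ASYNC is one of the hypercall numbers defined earlier in this header and is used here only as an example.

static void example_flush_async(void)
{
        /* Trap into the Host with no arguments beyond the call number. */
        hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
}
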
diff --git a/arch/x86/include/asm/local.h b/arch/x86/include/asm/local.h
index 47b9b6f19057..2e9972468a5d 100644
--- a/arch/x86/include/asm/local.h
+++ b/arch/x86/include/asm/local.h
@@ -195,41 +195,4 @@ static inline long local_sub_return(long i, local_t *l)
195#define __local_add(i, l) local_add((i), (l)) 195#define __local_add(i, l) local_add((i), (l))
196#define __local_sub(i, l) local_sub((i), (l)) 196#define __local_sub(i, l) local_sub((i), (l))
197 197
198/* Use these for per-cpu local_t variables: on some archs they are
199 * much more efficient than these naive implementations. Note they take
200 * a variable, not an address.
201 *
202 * X86_64: This could be done better if we moved the per cpu data directly
203 * after GS.
204 */
205
206/* Need to disable preemption for the cpu local counters otherwise we could
207 still access a variable of a previous CPU in a non atomic way. */
208#define cpu_local_wrap_v(l) \
209({ \
210 local_t res__; \
211 preempt_disable(); \
212 res__ = (l); \
213 preempt_enable(); \
214 res__; \
215})
216#define cpu_local_wrap(l) \
217({ \
218 preempt_disable(); \
219 (l); \
220 preempt_enable(); \
221}) \
222
223#define cpu_local_read(l) cpu_local_wrap_v(local_read(&__get_cpu_var((l))))
224#define cpu_local_set(l, i) cpu_local_wrap(local_set(&__get_cpu_var((l)), (i)))
225#define cpu_local_inc(l) cpu_local_wrap(local_inc(&__get_cpu_var((l))))
226#define cpu_local_dec(l) cpu_local_wrap(local_dec(&__get_cpu_var((l))))
227#define cpu_local_add(i, l) cpu_local_wrap(local_add((i), &__get_cpu_var((l))))
228#define cpu_local_sub(i, l) cpu_local_wrap(local_sub((i), &__get_cpu_var((l))))
229
230#define __cpu_local_inc(l) cpu_local_inc((l))
231#define __cpu_local_dec(l) cpu_local_dec((l))
232#define __cpu_local_add(i, l) cpu_local_add((i), (l))
233#define __cpu_local_sub(i, l) cpu_local_sub((i), (l))
234
235#endif /* _ASM_X86_LOCAL_H */ 198#endif /* _ASM_X86_LOCAL_H */
diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h
index f1363b72364f..6c3fdd631ed3 100644
--- a/arch/x86/include/asm/mce.h
+++ b/arch/x86/include/asm/mce.h
@@ -108,8 +108,11 @@ struct mce_log {
108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9) 108#define K8_MCE_THRESHOLD_BANK_5 (MCE_THRESHOLD_BASE + 5 * 9)
109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0) 109#define K8_MCE_THRESHOLD_DRAM_ECC (MCE_THRESHOLD_BANK_4 + 0)
110 110
111
111#ifdef __KERNEL__ 112#ifdef __KERNEL__
112 113
114extern struct atomic_notifier_head x86_mce_decoder_chain;
115
113#include <linux/percpu.h> 116#include <linux/percpu.h>
114#include <linux/init.h> 117#include <linux/init.h>
115#include <asm/atomic.h> 118#include <asm/atomic.h>
@@ -118,9 +121,11 @@ extern int mce_disabled;
118extern int mce_p5_enabled; 121extern int mce_p5_enabled;
119 122
120#ifdef CONFIG_X86_MCE 123#ifdef CONFIG_X86_MCE
121void mcheck_init(struct cpuinfo_x86 *c); 124int mcheck_init(void);
125void mcheck_cpu_init(struct cpuinfo_x86 *c);
122#else 126#else
123static inline void mcheck_init(struct cpuinfo_x86 *c) {} 127static inline int mcheck_init(void) { return 0; }
128static inline void mcheck_cpu_init(struct cpuinfo_x86 *c) {}
124#endif 129#endif
125 130
126#ifdef CONFIG_X86_ANCIENT_MCE 131#ifdef CONFIG_X86_ANCIENT_MCE
@@ -214,5 +219,11 @@ void intel_init_thermal(struct cpuinfo_x86 *c);
214 219
215void mce_log_therm_throt_event(__u64 status); 220void mce_log_therm_throt_event(__u64 status);
216 221
222#ifdef CONFIG_X86_THERMAL_VECTOR
223extern void mcheck_intel_therm_init(void);
224#else
225static inline void mcheck_intel_therm_init(void) { }
226#endif
227
217#endif /* __KERNEL__ */ 228#endif /* __KERNEL__ */
218#endif /* _ASM_X86_MCE_H */ 229#endif /* _ASM_X86_MCE_H */
diff --git a/arch/x86/include/asm/mmzone_32.h b/arch/x86/include/asm/mmzone_32.h
index ede6998bd92c..91df7c51806c 100644
--- a/arch/x86/include/asm/mmzone_32.h
+++ b/arch/x86/include/asm/mmzone_32.h
@@ -47,7 +47,7 @@ static inline void resume_map_numa_kva(pgd_t *pgd) {}
47/* 47/*
48 * generic node memory support, the following assumptions apply: 48 * generic node memory support, the following assumptions apply:
49 * 49 *
50 * 1) memory comes in 64Mb contigious chunks which are either present or not 50 * 1) memory comes in 64Mb contiguous chunks which are either present or not
51 * 2) we will not have more than 64Gb in total 51 * 2) we will not have more than 64Gb in total
52 * 52 *
53 * for now assume that 64Gb is max amount of RAM for whole system 53 * for now assume that 64Gb is max amount of RAM for whole system
diff --git a/arch/x86/include/asm/mmzone_64.h b/arch/x86/include/asm/mmzone_64.h
index a29f48c2a322..288b96f815a6 100644
--- a/arch/x86/include/asm/mmzone_64.h
+++ b/arch/x86/include/asm/mmzone_64.h
@@ -39,11 +39,5 @@ static inline __attribute__((pure)) int phys_to_nid(unsigned long addr)
39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn) 39#define node_start_pfn(nid) (NODE_DATA(nid)->node_start_pfn)
40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \ 40#define node_end_pfn(nid) (NODE_DATA(nid)->node_start_pfn + \
41 NODE_DATA(nid)->node_spanned_pages) 41 NODE_DATA(nid)->node_spanned_pages)
42
43#ifdef CONFIG_NUMA_EMU
44#define FAKE_NODE_MIN_SIZE (64 * 1024 * 1024)
45#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
46#endif
47
48#endif 42#endif
49#endif /* _ASM_X86_MMZONE_64_H */ 43#endif /* _ASM_X86_MMZONE_64_H */
diff --git a/arch/x86/include/asm/mpspec.h b/arch/x86/include/asm/mpspec.h
index 79c94500c0bb..d8bf23a88d05 100644
--- a/arch/x86/include/asm/mpspec.h
+++ b/arch/x86/include/asm/mpspec.h
@@ -71,12 +71,7 @@ static inline void early_get_smp_config(void)
71 71
72static inline void find_smp_config(void) 72static inline void find_smp_config(void)
73{ 73{
74 x86_init.mpparse.find_smp_config(1); 74 x86_init.mpparse.find_smp_config();
75}
76
77static inline void early_find_smp_config(void)
78{
79 x86_init.mpparse.find_smp_config(0);
80} 75}
81 76
82#ifdef CONFIG_X86_MPPARSE 77#ifdef CONFIG_X86_MPPARSE
@@ -89,7 +84,7 @@ extern void default_mpc_oem_bus_info(struct mpc_bus *m, char *str);
89# else 84# else
90# define default_mpc_oem_bus_info NULL 85# define default_mpc_oem_bus_info NULL
91# endif 86# endif
92extern void default_find_smp_config(unsigned int reserve); 87extern void default_find_smp_config(void);
93extern void default_get_smp_config(unsigned int early); 88extern void default_get_smp_config(unsigned int early);
94#else 89#else
95static inline void early_reserve_e820_mpc_new(void) { } 90static inline void early_reserve_e820_mpc_new(void) { }
@@ -97,7 +92,7 @@ static inline void early_reserve_e820_mpc_new(void) { }
97#define default_mpc_apic_id NULL 92#define default_mpc_apic_id NULL
98#define default_smp_read_mpc_oem NULL 93#define default_smp_read_mpc_oem NULL
99#define default_mpc_oem_bus_info NULL 94#define default_mpc_oem_bus_info NULL
100#define default_find_smp_config x86_init_uint_noop 95#define default_find_smp_config x86_init_noop
101#define default_get_smp_config x86_init_uint_noop 96#define default_get_smp_config x86_init_uint_noop
102#endif 97#endif
103 98
@@ -163,14 +158,16 @@ typedef struct physid_mask physid_mask_t;
163#define physids_shift_left(d, s, n) \ 158#define physids_shift_left(d, s, n) \
164 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS) 159 bitmap_shift_left((d).mask, (s).mask, n, MAX_APICS)
165 160
166#define physids_coerce(map) ((map).mask[0]) 161static inline unsigned long physids_coerce(physid_mask_t *map)
162{
163 return map->mask[0];
164}
167 165
168#define physids_promote(physids) \ 166static inline void physids_promote(unsigned long physids, physid_mask_t *map)
169 ({ \ 167{
170 physid_mask_t __physid_mask = PHYSID_MASK_NONE; \ 168 physids_clear(*map);
171 __physid_mask.mask[0] = physids; \ 169 map->mask[0] = physids;
172 __physid_mask; \ 170}
173 })
174 171
175/* Note: will create very large stack frames if physid_mask_t is big */ 172/* Note: will create very large stack frames if physid_mask_t is big */
176#define physid_mask_of_physid(physid) \ 173#define physid_mask_of_physid(physid) \
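
A host-side sketch of the macro-to-inline conversion above: the new helpers take the physid mask by pointer instead of building and returning the whole struct by value. MAX_APICS is an assumed size chosen only to make the sketch self-contained.

#include <stdio.h>
#include <string.h>

#define MAX_APICS       256             /* assumed for the sketch */
#define NBITS           (8 * sizeof(unsigned long))

typedef struct physid_mask {
        unsigned long mask[(MAX_APICS + NBITS - 1) / NBITS];
} physid_mask_t;

static inline unsigned long physids_coerce(physid_mask_t *map)
{
        return map->mask[0];
}

static inline void physids_promote(unsigned long physids, physid_mask_t *map)
{
        memset(map, 0, sizeof(*map));   /* plays the role of physids_clear() */
        map->mask[0] = physids;
}

int main(void)
{
        physid_mask_t map;

        physids_promote(0x0fUL, &map);  /* APIC IDs 0-3 present */
        printf("low word: %#lx\n", physids_coerce(&map));
        return 0;
}
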
diff --git a/arch/x86/include/asm/mrst.h b/arch/x86/include/asm/mrst.h
new file mode 100644
index 000000000000..451d30e7f62d
--- /dev/null
+++ b/arch/x86/include/asm/mrst.h
@@ -0,0 +1,19 @@
1/*
2 * mrst.h: Intel Moorestown platform specific setup code
3 *
4 * (C) Copyright 2009 Intel Corporation
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; version 2
9 * of the License.
10 */
11#ifndef _ASM_X86_MRST_H
12#define _ASM_X86_MRST_H
13extern int pci_mrst_init(void);
14int __init sfi_parse_mrtc(struct sfi_table_header *table);
15
16#define SFI_MTMR_MAX_NUM 8
17#define SFI_MRTC_MAX 8
18
19#endif /* _ASM_X86_MRST_H */
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index 4ffe09b2ad75..4604e6a54d36 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -12,6 +12,7 @@
12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */ 12#define MSR_FS_BASE 0xc0000100 /* 64bit FS base */
13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */ 13#define MSR_GS_BASE 0xc0000101 /* 64bit GS base */
14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */ 14#define MSR_KERNEL_GS_BASE 0xc0000102 /* SwapGS GS shadow */
15#define MSR_TSC_AUX 0xc0000103 /* Auxiliary TSC */
15 16
16/* EFER bits: */ 17/* EFER bits: */
17#define _EFER_SCE 0 /* SYSCALL/SYSRET */ 18#define _EFER_SCE 0 /* SYSCALL/SYSRET */
@@ -104,6 +105,8 @@
104#define MSR_AMD64_PATCH_LEVEL 0x0000008b 105#define MSR_AMD64_PATCH_LEVEL 0x0000008b
105#define MSR_AMD64_NB_CFG 0xc001001f 106#define MSR_AMD64_NB_CFG 0xc001001f
106#define MSR_AMD64_PATCH_LOADER 0xc0010020 107#define MSR_AMD64_PATCH_LOADER 0xc0010020
108#define MSR_AMD64_OSVW_ID_LENGTH 0xc0010140
109#define MSR_AMD64_OSVW_STATUS 0xc0010141
107#define MSR_AMD64_IBSFETCHCTL 0xc0011030 110#define MSR_AMD64_IBSFETCHCTL 0xc0011030
108#define MSR_AMD64_IBSFETCHLINAD 0xc0011031 111#define MSR_AMD64_IBSFETCHLINAD 0xc0011031
109#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032 112#define MSR_AMD64_IBSFETCHPHYSAD 0xc0011032
@@ -123,6 +126,7 @@
123#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2 126#define FAM10H_MMIO_CONF_BUSRANGE_SHIFT 2
124#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff 127#define FAM10H_MMIO_CONF_BASE_MASK 0xfffffff
125#define FAM10H_MMIO_CONF_BASE_SHIFT 20 128#define FAM10H_MMIO_CONF_BASE_SHIFT 20
129#define MSR_FAM10H_NODE_ID 0xc001100c
126 130
127/* K8 MSRs */ 131/* K8 MSRs */
128#define MSR_K8_TOP_MEM1 0xc001001a 132#define MSR_K8_TOP_MEM1 0xc001001a
diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h
index 7e2b6ba962ff..c5bc4c2d33f5 100644
--- a/arch/x86/include/asm/msr.h
+++ b/arch/x86/include/asm/msr.h
@@ -27,6 +27,18 @@ struct msr {
27 }; 27 };
28}; 28};
29 29
30struct msr_info {
31 u32 msr_no;
32 struct msr reg;
33 struct msr *msrs;
34 int err;
35};
36
37struct msr_regs_info {
38 u32 *regs;
39 int err;
40};
41
30static inline unsigned long long native_read_tscp(unsigned int *aux) 42static inline unsigned long long native_read_tscp(unsigned int *aux)
31{ 43{
32 unsigned long low, high; 44 unsigned long low, high;
@@ -240,15 +252,18 @@ do { \
240#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \ 252#define checking_wrmsrl(msr, val) wrmsr_safe((msr), (u32)(val), \
241 (u32)((val) >> 32)) 253 (u32)((val) >> 32))
242 254
243#define write_tsc(val1, val2) wrmsr(0x10, (val1), (val2)) 255#define write_tsc(val1, val2) wrmsr(MSR_IA32_TSC, (val1), (val2))
256
257#define write_rdtscp_aux(val) wrmsr(MSR_TSC_AUX, (val), 0)
244 258
245#define write_rdtscp_aux(val) wrmsr(0xc0000103, (val), 0) 259struct msr *msrs_alloc(void);
260void msrs_free(struct msr *msrs);
246 261
247#ifdef CONFIG_SMP 262#ifdef CONFIG_SMP
248int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 263int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
249int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 264int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
250void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 265void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
251void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs); 266void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs);
252int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); 267int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h);
253int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); 268int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h);
254int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); 269int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]);
@@ -264,12 +279,12 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
264 wrmsr(msr_no, l, h); 279 wrmsr(msr_no, l, h);
265 return 0; 280 return 0;
266} 281}
267static inline void rdmsr_on_cpus(const cpumask_t *m, u32 msr_no, 282static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no,
268 struct msr *msrs) 283 struct msr *msrs)
269{ 284{
270 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h)); 285 rdmsr_on_cpu(0, msr_no, &(msrs[0].l), &(msrs[0].h));
271} 286}
272static inline void wrmsr_on_cpus(const cpumask_t *m, u32 msr_no, 287static inline void wrmsr_on_cpus(const struct cpumask *m, u32 msr_no,
273 struct msr *msrs) 288 struct msr *msrs)
274{ 289{
275 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h); 290 wrmsr_on_cpu(0, msr_no, msrs[0].l, msrs[0].h);
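
A kernel-side sketch of the new bulk-MSR helpers declared above: msrs_alloc() hands back per-CPU struct msr storage, rdmsr_on_cpus() fills it for every CPU in the mask, and msrs_free() releases it. MSR_K7_HWCR is only an example MSR number, and the per-CPU access pattern is indicated in a comment rather than spelled out.

static void sample_hwcr_all_cpus(void)
{
        struct msr *msrs = msrs_alloc();

        if (!msrs)
                return;

        rdmsr_on_cpus(cpu_online_mask, MSR_K7_HWCR, msrs);
        /* inspect the per-CPU results, e.g. via per_cpu_ptr(msrs, cpu) */
        msrs_free(msrs);
}
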
diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h
index 139d4c1a33a7..93da9c3f3341 100644
--- a/arch/x86/include/asm/nmi.h
+++ b/arch/x86/include/asm/nmi.h
@@ -19,7 +19,6 @@ extern void die_nmi(char *str, struct pt_regs *regs, int do_panic);
19extern int check_nmi_watchdog(void); 19extern int check_nmi_watchdog(void);
20extern int nmi_watchdog_enabled; 20extern int nmi_watchdog_enabled;
21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int); 21extern int avail_to_resrv_perfctr_nmi_bit(unsigned int);
22extern int avail_to_resrv_perfctr_nmi(unsigned int);
23extern int reserve_perfctr_nmi(unsigned int); 22extern int reserve_perfctr_nmi(unsigned int);
24extern void release_perfctr_nmi(unsigned int); 23extern void release_perfctr_nmi(unsigned int);
25extern int reserve_evntsel_nmi(unsigned int); 24extern int reserve_evntsel_nmi(unsigned int);
diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h
index c4ae822e415f..823e070e7c26 100644
--- a/arch/x86/include/asm/numa_64.h
+++ b/arch/x86/include/asm/numa_64.h
@@ -36,6 +36,11 @@ extern void __cpuinit numa_set_node(int cpu, int node);
36extern void __cpuinit numa_clear_node(int cpu); 36extern void __cpuinit numa_clear_node(int cpu);
37extern void __cpuinit numa_add_cpu(int cpu); 37extern void __cpuinit numa_add_cpu(int cpu);
38extern void __cpuinit numa_remove_cpu(int cpu); 38extern void __cpuinit numa_remove_cpu(int cpu);
39
40#ifdef CONFIG_NUMA_EMU
41#define FAKE_NODE_MIN_SIZE ((u64)64 << 20)
42#define FAKE_NODE_MIN_HASH_MASK (~(FAKE_NODE_MIN_SIZE - 1UL))
43#endif /* CONFIG_NUMA_EMU */
39#else 44#else
40static inline void init_cpu_to_node(void) { } 45static inline void init_cpu_to_node(void) { }
41static inline void numa_set_node(int cpu, int node) { } 46static inline void numa_set_node(int cpu, int node) { }
diff --git a/arch/x86/include/asm/numaq.h b/arch/x86/include/asm/numaq.h
index 9f0a5f5d29ec..37c516545ec8 100644
--- a/arch/x86/include/asm/numaq.h
+++ b/arch/x86/include/asm/numaq.h
@@ -30,9 +30,14 @@
30 30
31extern int found_numaq; 31extern int found_numaq;
32extern int get_memcfg_numaq(void); 32extern int get_memcfg_numaq(void);
33extern int pci_numaq_init(void);
33 34
34extern void *xquad_portio; 35extern void *xquad_portio;
35 36
37#define XQUAD_PORTIO_BASE 0xfe400000
38#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
39#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
40
36/* 41/*
37 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the 42 * SYS_CFG_DATA_PRIV_ADDR, struct eachquadmem, and struct sys_cfg_data are the
38 */ 43 */
diff --git a/arch/x86/include/asm/olpc.h b/arch/x86/include/asm/olpc.h
index 834a30295fab..101229b0d8ed 100644
--- a/arch/x86/include/asm/olpc.h
+++ b/arch/x86/include/asm/olpc.h
@@ -13,7 +13,6 @@ struct olpc_platform_t {
13 13
14#define OLPC_F_PRESENT 0x01 14#define OLPC_F_PRESENT 0x01
15#define OLPC_F_DCON 0x02 15#define OLPC_F_DCON 0x02
16#define OLPC_F_VSA 0x04
17 16
18#ifdef CONFIG_OLPC 17#ifdef CONFIG_OLPC
19 18
@@ -51,18 +50,6 @@ static inline int olpc_has_dcon(void)
51} 50}
52 51
53/* 52/*
54 * The VSA is software from AMD that typical Geode bioses will include.
55 * It is used to emulate the PCI bus, VGA, etc. OLPC's Open Firmware does
56 * not include the VSA; instead, PCI is emulated by the kernel.
57 *
58 * The VSA is described further in arch/x86/pci/olpc.c.
59 */
60static inline int olpc_has_vsa(void)
61{
62 return (olpc_platform_info.flags & OLPC_F_VSA) ? 1 : 0;
63}
64
65/*
66 * The "Mass Production" version of OLPC's XO is identified as being model 53 * The "Mass Production" version of OLPC's XO is identified as being model
67 * C2. During the prototype phase, the following models (in chronological 54 * C2. During the prototype phase, the following models (in chronological
68 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models 55 * order) were created: A1, B1, B2, B3, B4, C1. The A1 through B2 models
@@ -87,13 +74,10 @@ static inline int olpc_has_dcon(void)
87 return 0; 74 return 0;
88} 75}
89 76
90static inline int olpc_has_vsa(void)
91{
92 return 0;
93}
94
95#endif 77#endif
96 78
79extern int pci_olpc_init(void);
80
97/* EC related functions */ 81/* EC related functions */
98 82
99extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen, 83extern int olpc_ec_cmd(unsigned char cmd, unsigned char *inbuf, size_t inlen,
@@ -120,7 +104,7 @@ extern int olpc_ec_mask_unset(uint8_t bits);
120 104
121/* GPIO assignments */ 105/* GPIO assignments */
122 106
123#define OLPC_GPIO_MIC_AC geode_gpio(1) 107#define OLPC_GPIO_MIC_AC 1
124#define OLPC_GPIO_DCON_IRQ geode_gpio(7) 108#define OLPC_GPIO_DCON_IRQ geode_gpio(7)
125#define OLPC_GPIO_THRM_ALRM geode_gpio(10) 109#define OLPC_GPIO_THRM_ALRM geode_gpio(10)
126#define OLPC_GPIO_SMB_CLK geode_gpio(14) 110#define OLPC_GPIO_SMB_CLK geode_gpio(14)
diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h
index 6473f5ccff85..a667f24c7254 100644
--- a/arch/x86/include/asm/page_types.h
+++ b/arch/x86/include/asm/page_types.h
@@ -40,7 +40,6 @@
40 40
41#ifndef __ASSEMBLY__ 41#ifndef __ASSEMBLY__
42 42
43extern int page_is_ram(unsigned long pagenr);
44extern int devmem_is_allowed(unsigned long pagenr); 43extern int devmem_is_allowed(unsigned long pagenr);
45 44
46extern unsigned long max_low_pfn_mapped; 45extern unsigned long max_low_pfn_mapped;
@@ -49,7 +48,8 @@ extern unsigned long max_pfn_mapped;
49extern unsigned long init_memory_mapping(unsigned long start, 48extern unsigned long init_memory_mapping(unsigned long start,
50 unsigned long end); 49 unsigned long end);
51 50
52extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); 51extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn,
52 int acpi, int k8);
53extern void free_initmem(void); 53extern void free_initmem(void);
54 54
55#endif /* !__ASSEMBLY__ */ 55#endif /* !__ASSEMBLY__ */
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index efb38994859c..5653f43d90e5 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -435,15 +435,6 @@ static inline void paravirt_release_pud(unsigned long pfn)
435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn); 435 PVOP_VCALL1(pv_mmu_ops.release_pud, pfn);
436} 436}
437 437
438#ifdef CONFIG_HIGHPTE
439static inline void *kmap_atomic_pte(struct page *page, enum km_type type)
440{
441 unsigned long ret;
442 ret = PVOP_CALL2(unsigned long, pv_mmu_ops.kmap_atomic_pte, page, type);
443 return (void *)ret;
444}
445#endif
446
447static inline void pte_update(struct mm_struct *mm, unsigned long addr, 438static inline void pte_update(struct mm_struct *mm, unsigned long addr,
448 pte_t *ptep) 439 pte_t *ptep)
449{ 440{
@@ -731,34 +722,34 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx,
731 722
732#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) 723#if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS)
733 724
734static inline int __raw_spin_is_locked(struct raw_spinlock *lock) 725static inline int arch_spin_is_locked(struct arch_spinlock *lock)
735{ 726{
736 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock); 727 return PVOP_CALL1(int, pv_lock_ops.spin_is_locked, lock);
737} 728}
738 729
739static inline int __raw_spin_is_contended(struct raw_spinlock *lock) 730static inline int arch_spin_is_contended(struct arch_spinlock *lock)
740{ 731{
741 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock); 732 return PVOP_CALL1(int, pv_lock_ops.spin_is_contended, lock);
742} 733}
743#define __raw_spin_is_contended __raw_spin_is_contended 734#define arch_spin_is_contended arch_spin_is_contended
744 735
745static __always_inline void __raw_spin_lock(struct raw_spinlock *lock) 736static __always_inline void arch_spin_lock(struct arch_spinlock *lock)
746{ 737{
747 PVOP_VCALL1(pv_lock_ops.spin_lock, lock); 738 PVOP_VCALL1(pv_lock_ops.spin_lock, lock);
748} 739}
749 740
750static __always_inline void __raw_spin_lock_flags(struct raw_spinlock *lock, 741static __always_inline void arch_spin_lock_flags(struct arch_spinlock *lock,
751 unsigned long flags) 742 unsigned long flags)
752{ 743{
753 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags); 744 PVOP_VCALL2(pv_lock_ops.spin_lock_flags, lock, flags);
754} 745}
755 746
756static __always_inline int __raw_spin_trylock(struct raw_spinlock *lock) 747static __always_inline int arch_spin_trylock(struct arch_spinlock *lock)
757{ 748{
758 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock); 749 return PVOP_CALL1(int, pv_lock_ops.spin_trylock, lock);
759} 750}
760 751
761static __always_inline void __raw_spin_unlock(struct raw_spinlock *lock) 752static __always_inline void arch_spin_unlock(struct arch_spinlock *lock)
762{ 753{
763 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock); 754 PVOP_VCALL1(pv_lock_ops.spin_unlock, lock);
764} 755}
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 9357473c8da0..db9ef5532341 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -304,10 +304,6 @@ struct pv_mmu_ops {
304#endif /* PAGETABLE_LEVELS == 4 */ 304#endif /* PAGETABLE_LEVELS == 4 */
305#endif /* PAGETABLE_LEVELS >= 3 */ 305#endif /* PAGETABLE_LEVELS >= 3 */
306 306
307#ifdef CONFIG_HIGHPTE
308 void *(*kmap_atomic_pte)(struct page *page, enum km_type type);
309#endif
310
311 struct pv_lazy_ops lazy_mode; 307 struct pv_lazy_ops lazy_mode;
312 308
313 /* dom0 ops */ 309 /* dom0 ops */
@@ -318,14 +314,14 @@ struct pv_mmu_ops {
318 phys_addr_t phys, pgprot_t flags); 314 phys_addr_t phys, pgprot_t flags);
319}; 315};
320 316
321struct raw_spinlock; 317struct arch_spinlock;
322struct pv_lock_ops { 318struct pv_lock_ops {
323 int (*spin_is_locked)(struct raw_spinlock *lock); 319 int (*spin_is_locked)(struct arch_spinlock *lock);
324 int (*spin_is_contended)(struct raw_spinlock *lock); 320 int (*spin_is_contended)(struct arch_spinlock *lock);
325 void (*spin_lock)(struct raw_spinlock *lock); 321 void (*spin_lock)(struct arch_spinlock *lock);
326 void (*spin_lock_flags)(struct raw_spinlock *lock, unsigned long flags); 322 void (*spin_lock_flags)(struct arch_spinlock *lock, unsigned long flags);
327 int (*spin_trylock)(struct raw_spinlock *lock); 323 int (*spin_trylock)(struct arch_spinlock *lock);
328 void (*spin_unlock)(struct raw_spinlock *lock); 324 void (*spin_unlock)(struct arch_spinlock *lock);
329}; 325};
330 326
331/* This contains all the paravirt structures: we get a convenient 327/* This contains all the paravirt structures: we get a convenient
diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h
index ada8c201d513..404a880ea325 100644
--- a/arch/x86/include/asm/pci.h
+++ b/arch/x86/include/asm/pci.h
@@ -45,8 +45,15 @@ static inline int pci_proc_domain(struct pci_bus *bus)
45 45
46#ifdef CONFIG_PCI 46#ifdef CONFIG_PCI
47extern unsigned int pcibios_assign_all_busses(void); 47extern unsigned int pcibios_assign_all_busses(void);
48extern int pci_legacy_init(void);
49# ifdef CONFIG_ACPI
50# define x86_default_pci_init pci_acpi_init
51# else
52# define x86_default_pci_init pci_legacy_init
53# endif
48#else 54#else
49#define pcibios_assign_all_busses() 0 55# define pcibios_assign_all_busses() 0
56# define x86_default_pci_init NULL
50#endif 57#endif
51 58
52extern unsigned long pci_mem_start; 59extern unsigned long pci_mem_start;
@@ -90,40 +97,14 @@ extern void pci_iommu_alloc(void);
90 97
91#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys) 98#define PCI_DMA_BUS_IS_PHYS (dma_ops->is_phys)
92 99
93#if defined(CONFIG_X86_64) || defined(CONFIG_DMAR) || defined(CONFIG_DMA_API_DEBUG)
94
95#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) \
96 dma_addr_t ADDR_NAME;
97#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) \
98 __u32 LEN_NAME;
99#define pci_unmap_addr(PTR, ADDR_NAME) \
100 ((PTR)->ADDR_NAME)
101#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
102 (((PTR)->ADDR_NAME) = (VAL))
103#define pci_unmap_len(PTR, LEN_NAME) \
104 ((PTR)->LEN_NAME)
105#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
106 (((PTR)->LEN_NAME) = (VAL))
107
108#else
109
110#define DECLARE_PCI_UNMAP_ADDR(ADDR_NAME) dma_addr_t ADDR_NAME[0];
111#define DECLARE_PCI_UNMAP_LEN(LEN_NAME) unsigned LEN_NAME[0];
112#define pci_unmap_addr(PTR, ADDR_NAME) sizeof((PTR)->ADDR_NAME)
113#define pci_unmap_addr_set(PTR, ADDR_NAME, VAL) \
114 do { break; } while (pci_unmap_addr(PTR, ADDR_NAME))
115#define pci_unmap_len(PTR, LEN_NAME) sizeof((PTR)->LEN_NAME)
116#define pci_unmap_len_set(PTR, LEN_NAME, VAL) \
117 do { break; } while (pci_unmap_len(PTR, LEN_NAME))
118
119#endif
120
121#endif /* __KERNEL__ */ 100#endif /* __KERNEL__ */
122 101
123#ifdef CONFIG_X86_64 102#ifdef CONFIG_X86_64
124#include "pci_64.h" 103#include "pci_64.h"
125#endif 104#endif
126 105
106void dma32_reserve_bootmem(void);
107
127/* implement the pci_ DMA API in terms of the generic device dma_ one */ 108/* implement the pci_ DMA API in terms of the generic device dma_ one */
128#include <asm-generic/pci-dma-compat.h> 109#include <asm-generic/pci-dma-compat.h>
129 110
diff --git a/arch/x86/include/asm/pci_64.h b/arch/x86/include/asm/pci_64.h
index ae5e40f67daf..fe15cfb21b9b 100644
--- a/arch/x86/include/asm/pci_64.h
+++ b/arch/x86/include/asm/pci_64.h
@@ -22,8 +22,6 @@ extern int (*pci_config_read)(int seg, int bus, int dev, int fn,
22extern int (*pci_config_write)(int seg, int bus, int dev, int fn, 22extern int (*pci_config_write)(int seg, int bus, int dev, int fn,
23 int reg, int len, u32 value); 23 int reg, int len, u32 value);
24 24
25extern void dma32_reserve_bootmem(void);
26
27#endif /* __KERNEL__ */ 25#endif /* __KERNEL__ */
28 26
29#endif /* _ASM_X86_PCI_64_H */ 27#endif /* _ASM_X86_PCI_64_H */
diff --git a/arch/x86/include/asm/pci_x86.h b/arch/x86/include/asm/pci_x86.h
index b399988eee3a..1a0422348d6d 100644
--- a/arch/x86/include/asm/pci_x86.h
+++ b/arch/x86/include/asm/pci_x86.h
@@ -29,6 +29,7 @@
29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000 29#define PCI_CHECK_ENABLE_AMD_MMCONF 0x20000
30#define PCI_HAS_IO_ECS 0x40000 30#define PCI_HAS_IO_ECS 0x40000
31#define PCI_NOASSIGN_ROMS 0x80000 31#define PCI_NOASSIGN_ROMS 0x80000
32#define PCI_ROOT_NO_CRS 0x100000
32 33
33extern unsigned int pci_probe; 34extern unsigned int pci_probe;
34extern unsigned long pirq_table_addr; 35extern unsigned long pirq_table_addr;
@@ -82,7 +83,6 @@ struct irq_routing_table {
82 83
83extern unsigned int pcibios_irq_mask; 84extern unsigned int pcibios_irq_mask;
84 85
85extern int pcibios_scanned;
86extern spinlock_t pci_config_lock; 86extern spinlock_t pci_config_lock;
87 87
88extern int (*pcibios_enable_irq)(struct pci_dev *dev); 88extern int (*pcibios_enable_irq)(struct pci_dev *dev);
@@ -105,24 +105,39 @@ extern bool port_cf9_safe;
105extern int pci_direct_probe(void); 105extern int pci_direct_probe(void);
106extern void pci_direct_init(int type); 106extern void pci_direct_init(int type);
107extern void pci_pcbios_init(void); 107extern void pci_pcbios_init(void);
108extern int pci_olpc_init(void);
109extern void __init dmi_check_pciprobe(void); 108extern void __init dmi_check_pciprobe(void);
110extern void __init dmi_check_skip_isa_align(void); 109extern void __init dmi_check_skip_isa_align(void);
111 110
112/* some commonly used subsys_initcalls */ 111/* some commonly used subsys_initcalls */
113extern int __init pci_acpi_init(void); 112extern int __init pci_acpi_init(void);
114extern int __init pcibios_irq_init(void); 113extern void __init pcibios_irq_init(void);
115extern int __init pci_visws_init(void);
116extern int __init pci_numaq_init(void);
117extern int __init pcibios_init(void); 114extern int __init pcibios_init(void);
115extern int pci_legacy_init(void);
116extern void pcibios_fixup_irqs(void);
118 117
119/* pci-mmconfig.c */ 118/* pci-mmconfig.c */
120 119
120/* "PCI MMCONFIG %04x [bus %02x-%02x]" */
121#define PCI_MMCFG_RESOURCE_NAME_LEN (22 + 4 + 2 + 2)
122
123struct pci_mmcfg_region {
124 struct list_head list;
125 struct resource res;
126 u64 address;
127 char __iomem *virt;
128 u16 segment;
129 u8 start_bus;
130 u8 end_bus;
131 char name[PCI_MMCFG_RESOURCE_NAME_LEN];
132};
133
121extern int __init pci_mmcfg_arch_init(void); 134extern int __init pci_mmcfg_arch_init(void);
122extern void __init pci_mmcfg_arch_free(void); 135extern void __init pci_mmcfg_arch_free(void);
136extern struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus);
137
138extern struct list_head pci_mmcfg_list;
123 139
124extern struct acpi_mcfg_allocation *pci_mmcfg_config; 140#define PCI_MMCFG_BUS_OFFSET(bus) ((bus) << 20)
125extern int pci_mmcfg_config_num;
126 141
127/* 142/*
128 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space 143 * AMD Fam10h CPUs are buggy, and cannot access MMIO config space
@@ -166,3 +181,17 @@ static inline void mmio_config_writel(void __iomem *pos, u32 val)
166{ 181{
167 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory"); 182 asm volatile("movl %%eax,(%1)" : : "a" (val), "r" (pos) : "memory");
168} 183}
184
185#ifdef CONFIG_PCI
186# ifdef CONFIG_ACPI
187# define x86_default_pci_init pci_acpi_init
188# else
189# define x86_default_pci_init pci_legacy_init
190# endif
191# define x86_default_pci_init_irq pcibios_irq_init
192# define x86_default_pci_fixup_irqs pcibios_fixup_irqs
193#else
194# define x86_default_pci_init NULL
195# define x86_default_pci_init_irq NULL
196# define x86_default_pci_fixup_irqs NULL
197#endif
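For illustration only (not part of the patch): a user-space sketch of how a config-space address could be derived from a region shaped like the new struct pci_mmcfg_region, assuming the region's address is the ECAM base for its start_bus and the conventional bus<<20 | devfn<<12 | reg layout; mmcfg_region and mmcfg_addr are illustrative names, not kernel APIs.

#include <stdio.h>
#include <stdint.h>

#define PCI_MMCFG_BUS_OFFSET(bus)	((bus) << 20)

struct mmcfg_region {			/* simplified stand-in for struct pci_mmcfg_region */
	uint64_t address;		/* physical base of the MMCONFIG window */
	uint16_t segment;
	uint8_t  start_bus, end_bus;
};

/* ECAM-style address: bus in bits 27:20, dev/fn in 19:12, register in 11:0 */
static uint64_t mmcfg_addr(const struct mmcfg_region *r,
			   unsigned bus, unsigned devfn, unsigned reg)
{
	return r->address + PCI_MMCFG_BUS_OFFSET(bus - r->start_bus)
			  + (devfn << 12) + reg;
}

int main(void)
{
	struct mmcfg_region r = { .address = 0xe0000000ull, .start_bus = 0, .end_bus = 255 };

	/* bus 0, device 2, function 0, register 0x10 (BAR0) */
	printf("0x%llx\n", (unsigned long long)mmcfg_addr(&r, 0, (2 << 3) | 0, 0x10));
	return 0;
}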
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index b65a36defeb7..66a272dfd8b8 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -25,19 +25,18 @@
25 */ 25 */
26#ifdef CONFIG_SMP 26#ifdef CONFIG_SMP
27#define PER_CPU(var, reg) \ 27#define PER_CPU(var, reg) \
28 __percpu_mov_op %__percpu_seg:per_cpu__this_cpu_off, reg; \ 28 __percpu_mov_op %__percpu_seg:this_cpu_off, reg; \
29 lea per_cpu__##var(reg), reg 29 lea var(reg), reg
30#define PER_CPU_VAR(var) %__percpu_seg:per_cpu__##var 30#define PER_CPU_VAR(var) %__percpu_seg:var
31#else /* ! SMP */ 31#else /* ! SMP */
32#define PER_CPU(var, reg) \ 32#define PER_CPU(var, reg) __percpu_mov_op $var, reg
33 __percpu_mov_op $per_cpu__##var, reg 33#define PER_CPU_VAR(var) var
34#define PER_CPU_VAR(var) per_cpu__##var
35#endif /* SMP */ 34#endif /* SMP */
36 35
37#ifdef CONFIG_X86_64_SMP 36#ifdef CONFIG_X86_64_SMP
38#define INIT_PER_CPU_VAR(var) init_per_cpu__##var 37#define INIT_PER_CPU_VAR(var) init_per_cpu__##var
39#else 38#else
40#define INIT_PER_CPU_VAR(var) per_cpu__##var 39#define INIT_PER_CPU_VAR(var) var
41#endif 40#endif
42 41
43#else /* ...!ASSEMBLY */ 42#else /* ...!ASSEMBLY */
@@ -60,12 +59,12 @@
60 * There also must be an entry in vmlinux_64.lds.S 59 * There also must be an entry in vmlinux_64.lds.S
61 */ 60 */
62#define DECLARE_INIT_PER_CPU(var) \ 61#define DECLARE_INIT_PER_CPU(var) \
63 extern typeof(per_cpu_var(var)) init_per_cpu_var(var) 62 extern typeof(var) init_per_cpu_var(var)
64 63
65#ifdef CONFIG_X86_64_SMP 64#ifdef CONFIG_X86_64_SMP
66#define init_per_cpu_var(var) init_per_cpu__##var 65#define init_per_cpu_var(var) init_per_cpu__##var
67#else 66#else
68#define init_per_cpu_var(var) per_cpu_var(var) 67#define init_per_cpu_var(var) var
69#endif 68#endif
70 69
71/* For arch-specific code, we can use direct single-insn ops (they 70/* For arch-specific code, we can use direct single-insn ops (they
@@ -74,63 +73,121 @@ extern void __bad_percpu_size(void);
74 73
75#define percpu_to_op(op, var, val) \ 74#define percpu_to_op(op, var, val) \
76do { \ 75do { \
77 typedef typeof(var) T__; \ 76 typedef typeof(var) pto_T__; \
78 if (0) { \ 77 if (0) { \
79 T__ tmp__; \ 78 pto_T__ pto_tmp__; \
80 tmp__ = (val); \ 79 pto_tmp__ = (val); \
81 } \ 80 } \
82 switch (sizeof(var)) { \ 81 switch (sizeof(var)) { \
83 case 1: \ 82 case 1: \
84 asm(op "b %1,"__percpu_arg(0) \ 83 asm(op "b %1,"__percpu_arg(0) \
85 : "+m" (var) \ 84 : "+m" (var) \
86 : "qi" ((T__)(val))); \ 85 : "qi" ((pto_T__)(val))); \
87 break; \ 86 break; \
88 case 2: \ 87 case 2: \
89 asm(op "w %1,"__percpu_arg(0) \ 88 asm(op "w %1,"__percpu_arg(0) \
90 : "+m" (var) \ 89 : "+m" (var) \
91 : "ri" ((T__)(val))); \ 90 : "ri" ((pto_T__)(val))); \
92 break; \ 91 break; \
93 case 4: \ 92 case 4: \
94 asm(op "l %1,"__percpu_arg(0) \ 93 asm(op "l %1,"__percpu_arg(0) \
95 : "+m" (var) \ 94 : "+m" (var) \
96 : "ri" ((T__)(val))); \ 95 : "ri" ((pto_T__)(val))); \
97 break; \ 96 break; \
98 case 8: \ 97 case 8: \
99 asm(op "q %1,"__percpu_arg(0) \ 98 asm(op "q %1,"__percpu_arg(0) \
100 : "+m" (var) \ 99 : "+m" (var) \
101 : "re" ((T__)(val))); \ 100 : "re" ((pto_T__)(val))); \
102 break; \ 101 break; \
103 default: __bad_percpu_size(); \ 102 default: __bad_percpu_size(); \
104 } \ 103 } \
105} while (0) 104} while (0)
106 105
106/*
107 * Generate a percpu add to memory instruction and optimize code
108 * if one is added or subtracted.
109 */
110#define percpu_add_op(var, val) \
111do { \
112 typedef typeof(var) pao_T__; \
113 const int pao_ID__ = (__builtin_constant_p(val) && \
114 ((val) == 1 || (val) == -1)) ? (val) : 0; \
115 if (0) { \
116 pao_T__ pao_tmp__; \
117 pao_tmp__ = (val); \
118 } \
119 switch (sizeof(var)) { \
120 case 1: \
121 if (pao_ID__ == 1) \
122 asm("incb "__percpu_arg(0) : "+m" (var)); \
123 else if (pao_ID__ == -1) \
124 asm("decb "__percpu_arg(0) : "+m" (var)); \
125 else \
126 asm("addb %1, "__percpu_arg(0) \
127 : "+m" (var) \
128 : "qi" ((pao_T__)(val))); \
129 break; \
130 case 2: \
131 if (pao_ID__ == 1) \
132 asm("incw "__percpu_arg(0) : "+m" (var)); \
133 else if (pao_ID__ == -1) \
134 asm("decw "__percpu_arg(0) : "+m" (var)); \
135 else \
136 asm("addw %1, "__percpu_arg(0) \
137 : "+m" (var) \
138 : "ri" ((pao_T__)(val))); \
139 break; \
140 case 4: \
141 if (pao_ID__ == 1) \
142 asm("incl "__percpu_arg(0) : "+m" (var)); \
143 else if (pao_ID__ == -1) \
144 asm("decl "__percpu_arg(0) : "+m" (var)); \
145 else \
146 asm("addl %1, "__percpu_arg(0) \
147 : "+m" (var) \
148 : "ri" ((pao_T__)(val))); \
149 break; \
150 case 8: \
151 if (pao_ID__ == 1) \
152 asm("incq "__percpu_arg(0) : "+m" (var)); \
153 else if (pao_ID__ == -1) \
154 asm("decq "__percpu_arg(0) : "+m" (var)); \
155 else \
156 asm("addq %1, "__percpu_arg(0) \
157 : "+m" (var) \
158 : "re" ((pao_T__)(val))); \
159 break; \
160 default: __bad_percpu_size(); \
161 } \
162} while (0)
163
107#define percpu_from_op(op, var, constraint) \ 164#define percpu_from_op(op, var, constraint) \
108({ \ 165({ \
109 typeof(var) ret__; \ 166 typeof(var) pfo_ret__; \
110 switch (sizeof(var)) { \ 167 switch (sizeof(var)) { \
111 case 1: \ 168 case 1: \
112 asm(op "b "__percpu_arg(1)",%0" \ 169 asm(op "b "__percpu_arg(1)",%0" \
113 : "=q" (ret__) \ 170 : "=q" (pfo_ret__) \
114 : constraint); \ 171 : constraint); \
115 break; \ 172 break; \
116 case 2: \ 173 case 2: \
117 asm(op "w "__percpu_arg(1)",%0" \ 174 asm(op "w "__percpu_arg(1)",%0" \
118 : "=r" (ret__) \ 175 : "=r" (pfo_ret__) \
119 : constraint); \ 176 : constraint); \
120 break; \ 177 break; \
121 case 4: \ 178 case 4: \
122 asm(op "l "__percpu_arg(1)",%0" \ 179 asm(op "l "__percpu_arg(1)",%0" \
123 : "=r" (ret__) \ 180 : "=r" (pfo_ret__) \
124 : constraint); \ 181 : constraint); \
125 break; \ 182 break; \
126 case 8: \ 183 case 8: \
127 asm(op "q "__percpu_arg(1)",%0" \ 184 asm(op "q "__percpu_arg(1)",%0" \
128 : "=r" (ret__) \ 185 : "=r" (pfo_ret__) \
129 : constraint); \ 186 : constraint); \
130 break; \ 187 break; \
131 default: __bad_percpu_size(); \ 188 default: __bad_percpu_size(); \
132 } \ 189 } \
133 ret__; \ 190 pfo_ret__; \
134}) 191})
135 192
136/* 193/*
@@ -142,23 +199,99 @@ do { \
142 * per-thread variables implemented as per-cpu variables and thus 199 * per-thread variables implemented as per-cpu variables and thus
143 * stable for the duration of the respective task. 200 * stable for the duration of the respective task.
144 */ 201 */
145#define percpu_read(var) percpu_from_op("mov", per_cpu__##var, \ 202#define percpu_read(var) percpu_from_op("mov", var, "m" (var))
146 "m" (per_cpu__##var)) 203#define percpu_read_stable(var) percpu_from_op("mov", var, "p" (&(var)))
147#define percpu_read_stable(var) percpu_from_op("mov", per_cpu__##var, \ 204#define percpu_write(var, val) percpu_to_op("mov", var, val)
148 "p" (&per_cpu__##var)) 205#define percpu_add(var, val) percpu_add_op(var, val)
149#define percpu_write(var, val) percpu_to_op("mov", per_cpu__##var, val) 206#define percpu_sub(var, val) percpu_add_op(var, -(val))
150#define percpu_add(var, val) percpu_to_op("add", per_cpu__##var, val) 207#define percpu_and(var, val) percpu_to_op("and", var, val)
151#define percpu_sub(var, val) percpu_to_op("sub", per_cpu__##var, val) 208#define percpu_or(var, val) percpu_to_op("or", var, val)
152#define percpu_and(var, val) percpu_to_op("and", per_cpu__##var, val) 209#define percpu_xor(var, val) percpu_to_op("xor", var, val)
153#define percpu_or(var, val) percpu_to_op("or", per_cpu__##var, val) 210
154#define percpu_xor(var, val) percpu_to_op("xor", per_cpu__##var, val) 211#define __this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
212#define __this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
213#define __this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
214
215#define __this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
216#define __this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
217#define __this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
218#define __this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
219#define __this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
220#define __this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
221#define __this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
222#define __this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
223#define __this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
224#define __this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
225#define __this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
226#define __this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
227#define __this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
228#define __this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
229#define __this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
230
231#define this_cpu_read_1(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
232#define this_cpu_read_2(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
233#define this_cpu_read_4(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
234#define this_cpu_write_1(pcp, val) percpu_to_op("mov", (pcp), val)
235#define this_cpu_write_2(pcp, val) percpu_to_op("mov", (pcp), val)
236#define this_cpu_write_4(pcp, val) percpu_to_op("mov", (pcp), val)
237#define this_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
238#define this_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
239#define this_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
240#define this_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
241#define this_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
242#define this_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
243#define this_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
244#define this_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
245#define this_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
246#define this_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
247#define this_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
248#define this_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
249
250#define irqsafe_cpu_add_1(pcp, val) percpu_add_op((pcp), val)
251#define irqsafe_cpu_add_2(pcp, val) percpu_add_op((pcp), val)
252#define irqsafe_cpu_add_4(pcp, val) percpu_add_op((pcp), val)
253#define irqsafe_cpu_and_1(pcp, val) percpu_to_op("and", (pcp), val)
254#define irqsafe_cpu_and_2(pcp, val) percpu_to_op("and", (pcp), val)
255#define irqsafe_cpu_and_4(pcp, val) percpu_to_op("and", (pcp), val)
256#define irqsafe_cpu_or_1(pcp, val) percpu_to_op("or", (pcp), val)
257#define irqsafe_cpu_or_2(pcp, val) percpu_to_op("or", (pcp), val)
258#define irqsafe_cpu_or_4(pcp, val) percpu_to_op("or", (pcp), val)
259#define irqsafe_cpu_xor_1(pcp, val) percpu_to_op("xor", (pcp), val)
260#define irqsafe_cpu_xor_2(pcp, val) percpu_to_op("xor", (pcp), val)
261#define irqsafe_cpu_xor_4(pcp, val) percpu_to_op("xor", (pcp), val)
262
263/*
264 * Per-cpu atomic 64-bit operations are only available on 64-bit kernels;
265 * 32-bit must fall back to the generic operations.
266 */
267#ifdef CONFIG_X86_64
268#define __this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
269#define __this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
270#define __this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
271#define __this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
272#define __this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
273#define __this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
274
275#define this_cpu_read_8(pcp) percpu_from_op("mov", (pcp), "m"(pcp))
276#define this_cpu_write_8(pcp, val) percpu_to_op("mov", (pcp), val)
277#define this_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
278#define this_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
279#define this_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
280#define this_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
281
282#define irqsafe_cpu_add_8(pcp, val) percpu_add_op((pcp), val)
283#define irqsafe_cpu_and_8(pcp, val) percpu_to_op("and", (pcp), val)
284#define irqsafe_cpu_or_8(pcp, val) percpu_to_op("or", (pcp), val)
285#define irqsafe_cpu_xor_8(pcp, val) percpu_to_op("xor", (pcp), val)
286
287#endif
155 288
156/* This is not atomic against other CPUs -- CPU preemption needs to be off */ 289/* This is not atomic against other CPUs -- CPU preemption needs to be off */
157#define x86_test_and_clear_bit_percpu(bit, var) \ 290#define x86_test_and_clear_bit_percpu(bit, var) \
158({ \ 291({ \
159 int old__; \ 292 int old__; \
160 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \ 293 asm volatile("btr %2,"__percpu_arg(1)"\n\tsbbl %0,%0" \
161 : "=r" (old__), "+m" (per_cpu__##var) \ 294 : "=r" (old__), "+m" (var) \
162 : "dIr" (bit)); \ 295 : "dIr" (bit)); \
163 old__; \ 296 old__; \
164}) 297})
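For illustration only (not part of the patch): the new percpu_add_op() switches to inc/dec only when the addend is a compile-time constant +1 or -1, otherwise it emits a plain add. A user-space sketch of that selection logic (the ADD_PATH macro is made up for the demo):

#include <stdio.h>

/* Mirrors the pao_ID__ trick: a compile-time constant +1/-1 selects the
 * inc/dec path, anything else falls through to the generic add path. */
#define ADD_PATH(val)							\
	(__builtin_constant_p(val) && ((val) == 1 || (val) == -1)	\
		? ((val) == 1 ? "inc" : "dec")				\
		: "add")

int main(void)
{
	int n = 5;

	printf("%s\n", ADD_PATH(1));	/* inc */
	printf("%s\n", ADD_PATH(-1));	/* dec */
	printf("%s\n", ADD_PATH(2));	/* add */
	printf("%s\n", ADD_PATH(n));	/* add: not a compile-time constant */
	return 0;
}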
diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h
index ad7ce3fd5065..db6109a885a7 100644
--- a/arch/x86/include/asm/perf_event.h
+++ b/arch/x86/include/asm/perf_event.h
@@ -18,7 +18,8 @@
18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 18#define MSR_ARCH_PERFMON_EVENTSEL0 0x186
19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 19#define MSR_ARCH_PERFMON_EVENTSEL1 0x187
20 20
21#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) 21#define ARCH_PERFMON_EVENTSEL_ENABLE (1 << 22)
22#define ARCH_PERFMON_EVENTSEL_ANY (1 << 21)
22#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) 23#define ARCH_PERFMON_EVENTSEL_INT (1 << 20)
23#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) 24#define ARCH_PERFMON_EVENTSEL_OS (1 << 17)
24#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) 25#define ARCH_PERFMON_EVENTSEL_USR (1 << 16)
@@ -26,11 +27,34 @@
26/* 27/*
27 * Includes eventsel and unit mask as well: 28 * Includes eventsel and unit mask as well:
28 */ 29 */
29#define ARCH_PERFMON_EVENT_MASK 0xffff 30
31
32#define INTEL_ARCH_EVTSEL_MASK 0x000000FFULL
33#define INTEL_ARCH_UNIT_MASK 0x0000FF00ULL
34#define INTEL_ARCH_EDGE_MASK 0x00040000ULL
35#define INTEL_ARCH_INV_MASK 0x00800000ULL
36#define INTEL_ARCH_CNT_MASK 0xFF000000ULL
37#define INTEL_ARCH_EVENT_MASK (INTEL_ARCH_UNIT_MASK|INTEL_ARCH_EVTSEL_MASK)
38
39/*
40 * filter mask to validate fixed counter events.
41 * the following filters disqualify for fixed counters:
42 * - inv
43 * - edge
44 * - cnt-mask
45 * The other filters are supported by fixed counters.
46 * The any-thread option is supported starting with v3.
47 */
48#define INTEL_ARCH_FIXED_MASK \
49 (INTEL_ARCH_CNT_MASK| \
50 INTEL_ARCH_INV_MASK| \
51 INTEL_ARCH_EDGE_MASK|\
52 INTEL_ARCH_UNIT_MASK|\
53 INTEL_ARCH_EVENT_MASK)
30 54
31#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c 55#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c
32#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) 56#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8)
33#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 57#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0
34#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ 58#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \
35 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) 59 (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX))
36 60
@@ -93,6 +117,18 @@ union cpuid10_edx {
93 */ 117 */
94#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16) 118#define X86_PMC_IDX_FIXED_BTS (X86_PMC_IDX_FIXED + 16)
95 119
120/* IbsFetchCtl bits/masks */
121#define IBS_FETCH_RAND_EN (1ULL<<57)
122#define IBS_FETCH_VAL (1ULL<<49)
123#define IBS_FETCH_ENABLE (1ULL<<48)
124#define IBS_FETCH_CNT 0xFFFF0000ULL
125#define IBS_FETCH_MAX_CNT 0x0000FFFFULL
126
127/* IbsOpCtl bits */
128#define IBS_OP_CNT_CTL (1ULL<<19)
129#define IBS_OP_VAL (1ULL<<18)
130#define IBS_OP_ENABLE (1ULL<<17)
131#define IBS_OP_MAX_CNT 0x0000FFFFULL
96 132
97#ifdef CONFIG_PERF_EVENTS 133#ifdef CONFIG_PERF_EVENTS
98extern void init_hw_perf_events(void); 134extern void init_hw_perf_events(void);
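For illustration only (not part of the patch): a small sketch of how the new INTEL_ARCH_* masks compose, and why an edge-filtered event cannot pass a fixed-counter match. This is a simplified reading of the comment above, not the kernel's exact validation code.

#include <stdio.h>
#include <stdint.h>

#define INTEL_ARCH_EVTSEL_MASK	0x000000FFULL
#define INTEL_ARCH_UNIT_MASK	0x0000FF00ULL
#define INTEL_ARCH_EDGE_MASK	0x00040000ULL
#define INTEL_ARCH_INV_MASK	0x00800000ULL
#define INTEL_ARCH_CNT_MASK	0xFF000000ULL
#define INTEL_ARCH_EVENT_MASK	(INTEL_ARCH_UNIT_MASK | INTEL_ARCH_EVTSEL_MASK)

#define INTEL_ARCH_FIXED_MASK						\
	(INTEL_ARCH_CNT_MASK  | INTEL_ARCH_INV_MASK |			\
	 INTEL_ARCH_EDGE_MASK | INTEL_ARCH_UNIT_MASK |			\
	 INTEL_ARCH_EVENT_MASK)

int main(void)
{
	uint64_t cycles      = 0x003c;			/* UNHALTED_CORE_CYCLES: event 0x3c, umask 0 */
	uint64_t cycles_edge = cycles | INTEL_ARCH_EDGE_MASK;

	printf("fixed filter mask: 0x%llx\n",
	       (unsigned long long)INTEL_ARCH_FIXED_MASK);
	/* after applying the filter mask, the config must still equal the
	 * plain event encoding; edge/inv/cnt-mask bits survive and break it */
	printf("plain cycles matches fixed encoding: %d\n",
	       (cycles & INTEL_ARCH_FIXED_MASK) == cycles);
	printf("edge-filtered cycles matches:        %d\n",
	       (cycles_edge & INTEL_ARCH_FIXED_MASK) == cycles);
	return 0;
}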
diff --git a/arch/x86/include/asm/pgalloc.h b/arch/x86/include/asm/pgalloc.h
index 0e8c2a0fd922..271de94c3810 100644
--- a/arch/x86/include/asm/pgalloc.h
+++ b/arch/x86/include/asm/pgalloc.h
@@ -23,6 +23,11 @@ static inline void paravirt_release_pud(unsigned long pfn) {}
23#endif 23#endif
24 24
25/* 25/*
26 * Flags to use when allocating a user page table page.
27 */
28extern gfp_t __userpte_alloc_gfp;
29
30/*
26 * Allocate and free page tables. 31 * Allocate and free page tables.
27 */ 32 */
28extern pgd_t *pgd_alloc(struct mm_struct *); 33extern pgd_t *pgd_alloc(struct mm_struct *);
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index af6fd360ab35..a34c785c5a63 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -16,6 +16,8 @@
16 16
17#ifndef __ASSEMBLY__ 17#ifndef __ASSEMBLY__
18 18
19#include <asm/x86_init.h>
20
19/* 21/*
20 * ZERO_PAGE is a global shared page that is always zero: used 22 * ZERO_PAGE is a global shared page that is always zero: used
21 * for zero-mapped memory areas etc.. 23 * for zero-mapped memory areas etc..
@@ -270,9 +272,9 @@ static inline int is_new_memtype_allowed(u64 paddr, unsigned long size,
270 unsigned long new_flags) 272 unsigned long new_flags)
271{ 273{
272 /* 274 /*
273 * PAT type is always WB for ISA. So no need to check. 275 * PAT type is always WB for untracked ranges, so no need to check.
274 */ 276 */
275 if (is_ISA_range(paddr, paddr + size - 1)) 277 if (x86_platform.is_untracked_pat_range(paddr, paddr + size))
276 return 1; 278 return 1;
277 279
278 /* 280 /*
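For illustration only (not part of the patch): the hardcoded is_ISA_range() check above becomes a platform callback. A rough user-space sketch of what a default "untracked PAT range" callback presumably does, treating the legacy ISA hole as always-WB; the function name and the exclusive-end convention are assumptions made for the demo.

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

#define ISA_START_ADDRESS	0xa0000
#define ISA_END_ADDRESS		0x100000

/* Default callback: a range is "untracked" if it lies entirely inside the
 * legacy ISA hole, which PAT always treats as WB. */
static bool default_is_untracked_pat_range(uint64_t start, uint64_t end)
{
	return start >= ISA_START_ADDRESS && end <= ISA_END_ADDRESS;
}

int main(void)
{
	printf("VGA hole untracked:  %d\n",
	       default_is_untracked_pat_range(0xa0000, 0xc0000));
	printf("RAM at 1MiB tracked: %d\n",
	       !default_is_untracked_pat_range(0x100000, 0x200000));
	return 0;
}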
diff --git a/arch/x86/include/asm/pgtable_32.h b/arch/x86/include/asm/pgtable_32.h
index 01fd9461d323..2984a25ff383 100644
--- a/arch/x86/include/asm/pgtable_32.h
+++ b/arch/x86/include/asm/pgtable_32.h
@@ -19,7 +19,6 @@
19#include <asm/paravirt.h> 19#include <asm/paravirt.h>
20 20
21#include <linux/bitops.h> 21#include <linux/bitops.h>
22#include <linux/slab.h>
23#include <linux/list.h> 22#include <linux/list.h>
24#include <linux/spinlock.h> 23#include <linux/spinlock.h>
25 24
@@ -54,10 +53,10 @@ extern void set_pmd_pfn(unsigned long, unsigned long, pgprot_t);
54 in_irq() ? KM_IRQ_PTE : \ 53 in_irq() ? KM_IRQ_PTE : \
55 KM_PTE0) 54 KM_PTE0)
56#define pte_offset_map(dir, address) \ 55#define pte_offset_map(dir, address) \
57 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), __KM_PTE) + \ 56 ((pte_t *)kmap_atomic(pmd_page(*(dir)), __KM_PTE) + \
58 pte_index((address))) 57 pte_index((address)))
59#define pte_offset_map_nested(dir, address) \ 58#define pte_offset_map_nested(dir, address) \
60 ((pte_t *)kmap_atomic_pte(pmd_page(*(dir)), KM_PTE1) + \ 59 ((pte_t *)kmap_atomic(pmd_page(*(dir)), KM_PTE1) + \
61 pte_index((address))) 60 pte_index((address)))
62#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE) 61#define pte_unmap(pte) kunmap_atomic((pte), __KM_PTE)
63#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1) 62#define pte_unmap_nested(pte) kunmap_atomic((pte), KM_PTE1)
@@ -80,7 +79,7 @@ do { \
80 * The i386 doesn't have any external MMU info: the kernel page 79 * The i386 doesn't have any external MMU info: the kernel page
81 * tables contain all the necessary information. 80 * tables contain all the necessary information.
82 */ 81 */
83#define update_mmu_cache(vma, address, pte) do { } while (0) 82#define update_mmu_cache(vma, address, ptep) do { } while (0)
84 83
85#endif /* !__ASSEMBLY__ */ 84#endif /* !__ASSEMBLY__ */
86 85
diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h
index c57a30117149..181be528c612 100644
--- a/arch/x86/include/asm/pgtable_64.h
+++ b/arch/x86/include/asm/pgtable_64.h
@@ -129,7 +129,7 @@ static inline int pgd_large(pgd_t pgd) { return 0; }
129#define pte_unmap(pte) /* NOP */ 129#define pte_unmap(pte) /* NOP */
130#define pte_unmap_nested(pte) /* NOP */ 130#define pte_unmap_nested(pte) /* NOP */
131 131
132#define update_mmu_cache(vma, address, pte) do { } while (0) 132#define update_mmu_cache(vma, address, ptep) do { } while (0)
133 133
134/* Encode and de-code a swap entry */ 134/* Encode and de-code a swap entry */
135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE 135#if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e75daac64962..91d323f47364 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -30,6 +30,7 @@ struct mm_struct;
30#include <linux/math64.h> 30#include <linux/math64.h>
31#include <linux/init.h> 31#include <linux/init.h>
32 32
33#define HBP_NUM 4
33/* 34/*
34 * Default implementation of macro that returns current 35 * Default implementation of macro that returns current
35 * instruction pointer ("program counter"). 36 * instruction pointer ("program counter").
@@ -182,7 +183,7 @@ static inline void native_cpuid(unsigned int *eax, unsigned int *ebx,
182 unsigned int *ecx, unsigned int *edx) 183 unsigned int *ecx, unsigned int *edx)
183{ 184{
184 /* ecx is often an input as well as an output. */ 185 /* ecx is often an input as well as an output. */
185 asm("cpuid" 186 asm volatile("cpuid"
186 : "=a" (*eax), 187 : "=a" (*eax),
187 "=b" (*ebx), 188 "=b" (*ebx),
188 "=c" (*ecx), 189 "=c" (*ecx),
@@ -424,6 +425,8 @@ extern unsigned int xstate_size;
424extern void free_thread_xstate(struct task_struct *); 425extern void free_thread_xstate(struct task_struct *);
425extern struct kmem_cache *task_xstate_cachep; 426extern struct kmem_cache *task_xstate_cachep;
426 427
428struct perf_event;
429
427struct thread_struct { 430struct thread_struct {
428 /* Cached TLS descriptors: */ 431 /* Cached TLS descriptors: */
429 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; 432 struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -445,13 +448,12 @@ struct thread_struct {
445 unsigned long fs; 448 unsigned long fs;
446#endif 449#endif
447 unsigned long gs; 450 unsigned long gs;
448 /* Hardware debugging registers: */ 451 /* Save middle states of ptrace breakpoints */
449 unsigned long debugreg0; 452 struct perf_event *ptrace_bps[HBP_NUM];
450 unsigned long debugreg1; 453 /* Debug status used for traps, single steps, etc... */
451 unsigned long debugreg2; 454 unsigned long debugreg6;
452 unsigned long debugreg3; 455 /* Keep track of the exact dr7 value set by the user */
453 unsigned long debugreg6; 456 unsigned long ptrace_dr7;
454 unsigned long debugreg7;
455 /* Fault info: */ 457 /* Fault info: */
456 unsigned long cr2; 458 unsigned long cr2;
457 unsigned long trap_no; 459 unsigned long trap_no;
diff --git a/arch/x86/include/asm/proto.h b/arch/x86/include/asm/proto.h
index 621f56d73121..6f414ed88620 100644
--- a/arch/x86/include/asm/proto.h
+++ b/arch/x86/include/asm/proto.h
@@ -5,31 +5,22 @@
5 5
6/* misc architecture specific prototypes */ 6/* misc architecture specific prototypes */
7 7
8extern void early_idt_handler(void); 8void early_idt_handler(void);
9 9
10extern void system_call(void); 10void system_call(void);
11extern void syscall_init(void); 11void syscall_init(void);
12 12
13extern void ia32_syscall(void); 13void ia32_syscall(void);
14extern void ia32_cstar_target(void); 14void ia32_cstar_target(void);
15extern void ia32_sysenter_target(void); 15void ia32_sysenter_target(void);
16 16
17extern void syscall32_cpu_init(void); 17void syscall32_cpu_init(void);
18 18
19extern void check_efer(void); 19void x86_configure_nx(void);
20void x86_report_nx(void);
20 21
21extern int reboot_force; 22extern int reboot_force;
22 23
23long do_arch_prctl(struct task_struct *task, int code, unsigned long addr); 24long do_arch_prctl(struct task_struct *task, int code, unsigned long addr);
24 25
25/*
26 * This looks more complex than it should be. But we need to
27 * get the type for the ~ right in round_down (it needs to be
28 * as wide as the result!), and we want to evaluate the macro
29 * arguments just once each.
30 */
31#define __round_mask(x,y) ((__typeof__(x))((y)-1))
32#define round_up(x,y) ((((x)-1) | __round_mask(x,y))+1)
33#define round_down(x,y) ((x) & ~__round_mask(x,y))
34
35#endif /* _ASM_X86_PROTO_H */ 26#endif /* _ASM_X86_PROTO_H */
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 0f0d908349aa..69a686a7dff0 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -7,6 +7,7 @@
7 7
8#ifdef __KERNEL__ 8#ifdef __KERNEL__
9#include <asm/segment.h> 9#include <asm/segment.h>
10#include <asm/page_types.h>
10#endif 11#endif
11 12
12#ifndef __ASSEMBLY__ 13#ifndef __ASSEMBLY__
@@ -216,20 +217,72 @@ static inline unsigned long user_stack_pointer(struct pt_regs *regs)
216 return regs->sp; 217 return regs->sp;
217} 218}
218 219
219/* 220/* Query offset/name of register from its name/offset */
220 * These are defined as per linux/ptrace.h, which see. 221extern int regs_query_register_offset(const char *name);
222extern const char *regs_query_register_name(unsigned int offset);
223#define MAX_REG_OFFSET (offsetof(struct pt_regs, ss))
224
225/**
226 * regs_get_register() - get register value from its offset
227 * @regs: pt_regs from which register value is gotten.
228 * @offset: offset number of the register.
229 *
230 * regs_get_register returns the value of a register. The @offset is the
231 * offset of the register within the struct pt_regs pointed to by @regs.
232 * If @offset is bigger than MAX_REG_OFFSET, this returns 0.
221 */ 233 */
222#define arch_has_single_step() (1) 234static inline unsigned long regs_get_register(struct pt_regs *regs,
223extern void user_enable_single_step(struct task_struct *); 235 unsigned int offset)
224extern void user_disable_single_step(struct task_struct *); 236{
237 if (unlikely(offset > MAX_REG_OFFSET))
238 return 0;
239 return *(unsigned long *)((unsigned long)regs + offset);
240}
241
242/**
243 * regs_within_kernel_stack() - check the address in the stack
244 * @regs: pt_regs which contains kernel stack pointer.
245 * @addr: address which is checked.
246 *
247 * regs_within_kernel_stack() checks whether @addr is within the kernel stack page(s).
248 * If @addr is within the kernel stack, it returns true. If not, returns false.
249 */
250static inline int regs_within_kernel_stack(struct pt_regs *regs,
251 unsigned long addr)
252{
253 return ((addr & ~(THREAD_SIZE - 1)) ==
254 (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1)));
255}
225 256
226extern void user_enable_block_step(struct task_struct *); 257/**
258 * regs_get_kernel_stack_nth() - get Nth entry of the stack
259 * @regs: pt_regs which contains kernel stack pointer.
260 * @n: stack entry number.
261 *
262 * regs_get_kernel_stack_nth() returns the @n-th entry of the kernel stack
263 * specified by @regs. If the @n-th entry is NOT in the kernel stack,
264 * this returns 0.
265 */
266static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
267 unsigned int n)
268{
269 unsigned long *addr = (unsigned long *)kernel_stack_pointer(regs);
270 addr += n;
271 if (regs_within_kernel_stack(regs, (unsigned long)addr))
272 return *addr;
273 else
274 return 0;
275}
276
277#define arch_has_single_step() (1)
227#ifdef CONFIG_X86_DEBUGCTLMSR 278#ifdef CONFIG_X86_DEBUGCTLMSR
228#define arch_has_block_step() (1) 279#define arch_has_block_step() (1)
229#else 280#else
230#define arch_has_block_step() (boot_cpu_data.x86 >= 6) 281#define arch_has_block_step() (boot_cpu_data.x86 >= 6)
231#endif 282#endif
232 283
284#define ARCH_HAS_USER_SINGLE_STEP_INFO
285
233struct user_desc; 286struct user_desc;
234extern int do_get_thread_area(struct task_struct *p, int idx, 287extern int do_get_thread_area(struct task_struct *p, int idx,
235 struct user_desc __user *info); 288 struct user_desc __user *info);
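For illustration only (not part of the patch): the new regs_within_kernel_stack() boils down to a THREAD_SIZE alignment-mask comparison. A standalone sketch of that arithmetic, assuming an 8 KiB stack for the demo:

#include <stdio.h>
#include <stdbool.h>

#define THREAD_SIZE	8192UL		/* assumed stack size for the sketch */

/* Two addresses lie in the same THREAD_SIZE-aligned stack iff masking off
 * the low bits yields the same base. */
static bool within_stack(unsigned long sp, unsigned long addr)
{
	return (addr & ~(THREAD_SIZE - 1)) == (sp & ~(THREAD_SIZE - 1));
}

int main(void)
{
	unsigned long sp = 0xc0345e70UL;	/* pretend kernel stack pointer */

	printf("%d\n", within_stack(sp, sp + 64));		/* 1: same stack      */
	printf("%d\n", within_stack(sp, sp + 2 * THREAD_SIZE));	/* 0: different stack */
	return 0;
}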
diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h
index ca7517d33776..606ede126972 100644
--- a/arch/x86/include/asm/rwsem.h
+++ b/arch/x86/include/asm/rwsem.h
@@ -41,6 +41,7 @@
41#include <linux/list.h> 41#include <linux/list.h>
42#include <linux/spinlock.h> 42#include <linux/spinlock.h>
43#include <linux/lockdep.h> 43#include <linux/lockdep.h>
44#include <asm/asm.h>
44 45
45struct rwsem_waiter; 46struct rwsem_waiter;
46 47
@@ -55,17 +56,28 @@ extern asmregparm struct rw_semaphore *
55 56
56/* 57/*
57 * the semaphore definition 58 * the semaphore definition
59 *
60 * The bias values and the counter type limit the number of
61 * potential readers/writers to 32767 for 32 bits and 2147483647
62 * for 64 bits.
58 */ 63 */
59 64
60#define RWSEM_UNLOCKED_VALUE 0x00000000 65#ifdef CONFIG_X86_64
61#define RWSEM_ACTIVE_BIAS 0x00000001 66# define RWSEM_ACTIVE_MASK 0xffffffffL
62#define RWSEM_ACTIVE_MASK 0x0000ffff 67#else
63#define RWSEM_WAITING_BIAS (-0x00010000) 68# define RWSEM_ACTIVE_MASK 0x0000ffffL
69#endif
70
71#define RWSEM_UNLOCKED_VALUE 0x00000000L
72#define RWSEM_ACTIVE_BIAS 0x00000001L
73#define RWSEM_WAITING_BIAS (-RWSEM_ACTIVE_MASK-1)
64#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS 74#define RWSEM_ACTIVE_READ_BIAS RWSEM_ACTIVE_BIAS
65#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS) 75#define RWSEM_ACTIVE_WRITE_BIAS (RWSEM_WAITING_BIAS + RWSEM_ACTIVE_BIAS)
66 76
77typedef signed long rwsem_count_t;
78
67struct rw_semaphore { 79struct rw_semaphore {
68 signed long count; 80 rwsem_count_t count;
69 spinlock_t wait_lock; 81 spinlock_t wait_lock;
70 struct list_head wait_list; 82 struct list_head wait_list;
71#ifdef CONFIG_DEBUG_LOCK_ALLOC 83#ifdef CONFIG_DEBUG_LOCK_ALLOC
@@ -105,7 +117,7 @@ do { \
105static inline void __down_read(struct rw_semaphore *sem) 117static inline void __down_read(struct rw_semaphore *sem)
106{ 118{
107 asm volatile("# beginning down_read\n\t" 119 asm volatile("# beginning down_read\n\t"
108 LOCK_PREFIX " incl (%%eax)\n\t" 120 LOCK_PREFIX _ASM_INC "(%1)\n\t"
109 /* adds 0x00000001, returns the old value */ 121 /* adds 0x00000001, returns the old value */
110 " jns 1f\n" 122 " jns 1f\n"
111 " call call_rwsem_down_read_failed\n" 123 " call call_rwsem_down_read_failed\n"
@@ -121,14 +133,14 @@ static inline void __down_read(struct rw_semaphore *sem)
121 */ 133 */
122static inline int __down_read_trylock(struct rw_semaphore *sem) 134static inline int __down_read_trylock(struct rw_semaphore *sem)
123{ 135{
124 __s32 result, tmp; 136 rwsem_count_t result, tmp;
125 asm volatile("# beginning __down_read_trylock\n\t" 137 asm volatile("# beginning __down_read_trylock\n\t"
126 " movl %0,%1\n\t" 138 " mov %0,%1\n\t"
127 "1:\n\t" 139 "1:\n\t"
128 " movl %1,%2\n\t" 140 " mov %1,%2\n\t"
129 " addl %3,%2\n\t" 141 " add %3,%2\n\t"
130 " jle 2f\n\t" 142 " jle 2f\n\t"
131 LOCK_PREFIX " cmpxchgl %2,%0\n\t" 143 LOCK_PREFIX " cmpxchg %2,%0\n\t"
132 " jnz 1b\n\t" 144 " jnz 1b\n\t"
133 "2:\n\t" 145 "2:\n\t"
134 "# ending __down_read_trylock\n\t" 146 "# ending __down_read_trylock\n\t"
@@ -143,13 +155,13 @@ static inline int __down_read_trylock(struct rw_semaphore *sem)
143 */ 155 */
144static inline void __down_write_nested(struct rw_semaphore *sem, int subclass) 156static inline void __down_write_nested(struct rw_semaphore *sem, int subclass)
145{ 157{
146 int tmp; 158 rwsem_count_t tmp;
147 159
148 tmp = RWSEM_ACTIVE_WRITE_BIAS; 160 tmp = RWSEM_ACTIVE_WRITE_BIAS;
149 asm volatile("# beginning down_write\n\t" 161 asm volatile("# beginning down_write\n\t"
150 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 162 LOCK_PREFIX " xadd %1,(%2)\n\t"
151 /* subtract 0x0000ffff, returns the old value */ 163 /* subtract 0x0000ffff, returns the old value */
152 " testl %%edx,%%edx\n\t" 164 " test %1,%1\n\t"
153 /* was the count 0 before? */ 165 /* was the count 0 before? */
154 " jz 1f\n" 166 " jz 1f\n"
155 " call call_rwsem_down_write_failed\n" 167 " call call_rwsem_down_write_failed\n"
@@ -170,9 +182,9 @@ static inline void __down_write(struct rw_semaphore *sem)
170 */ 182 */
171static inline int __down_write_trylock(struct rw_semaphore *sem) 183static inline int __down_write_trylock(struct rw_semaphore *sem)
172{ 184{
173 signed long ret = cmpxchg(&sem->count, 185 rwsem_count_t ret = cmpxchg(&sem->count,
174 RWSEM_UNLOCKED_VALUE, 186 RWSEM_UNLOCKED_VALUE,
175 RWSEM_ACTIVE_WRITE_BIAS); 187 RWSEM_ACTIVE_WRITE_BIAS);
176 if (ret == RWSEM_UNLOCKED_VALUE) 188 if (ret == RWSEM_UNLOCKED_VALUE)
177 return 1; 189 return 1;
178 return 0; 190 return 0;
@@ -183,9 +195,9 @@ static inline int __down_write_trylock(struct rw_semaphore *sem)
183 */ 195 */
184static inline void __up_read(struct rw_semaphore *sem) 196static inline void __up_read(struct rw_semaphore *sem)
185{ 197{
186 __s32 tmp = -RWSEM_ACTIVE_READ_BIAS; 198 rwsem_count_t tmp = -RWSEM_ACTIVE_READ_BIAS;
187 asm volatile("# beginning __up_read\n\t" 199 asm volatile("# beginning __up_read\n\t"
188 LOCK_PREFIX " xadd %%edx,(%%eax)\n\t" 200 LOCK_PREFIX " xadd %1,(%2)\n\t"
189 /* subtracts 1, returns the old value */ 201 /* subtracts 1, returns the old value */
190 " jns 1f\n\t" 202 " jns 1f\n\t"
191 " call call_rwsem_wake\n" 203 " call call_rwsem_wake\n"
@@ -201,18 +213,18 @@ static inline void __up_read(struct rw_semaphore *sem)
201 */ 213 */
202static inline void __up_write(struct rw_semaphore *sem) 214static inline void __up_write(struct rw_semaphore *sem)
203{ 215{
216 rwsem_count_t tmp;
204 asm volatile("# beginning __up_write\n\t" 217 asm volatile("# beginning __up_write\n\t"
205 " movl %2,%%edx\n\t" 218 LOCK_PREFIX " xadd %1,(%2)\n\t"
206 LOCK_PREFIX " xaddl %%edx,(%%eax)\n\t"
207 /* tries to transition 219 /* tries to transition
208 0xffff0001 -> 0x00000000 */ 220 0xffff0001 -> 0x00000000 */
209 " jz 1f\n" 221 " jz 1f\n"
210 " call call_rwsem_wake\n" 222 " call call_rwsem_wake\n"
211 "1:\n\t" 223 "1:\n\t"
212 "# ending __up_write\n" 224 "# ending __up_write\n"
213 : "+m" (sem->count) 225 : "+m" (sem->count), "=d" (tmp)
214 : "a" (sem), "i" (-RWSEM_ACTIVE_WRITE_BIAS) 226 : "a" (sem), "1" (-RWSEM_ACTIVE_WRITE_BIAS)
215 : "memory", "cc", "edx"); 227 : "memory", "cc");
216} 228}
217 229
218/* 230/*
@@ -221,33 +233,38 @@ static inline void __up_write(struct rw_semaphore *sem)
221static inline void __downgrade_write(struct rw_semaphore *sem) 233static inline void __downgrade_write(struct rw_semaphore *sem)
222{ 234{
223 asm volatile("# beginning __downgrade_write\n\t" 235 asm volatile("# beginning __downgrade_write\n\t"
224 LOCK_PREFIX " addl %2,(%%eax)\n\t" 236 LOCK_PREFIX _ASM_ADD "%2,(%1)\n\t"
225 /* transitions 0xZZZZ0001 -> 0xYYYY0001 */ 237 /*
238 * transitions 0xZZZZ0001 -> 0xYYYY0001 (i386)
239 * 0xZZZZZZZZ00000001 -> 0xYYYYYYYY00000001 (x86_64)
240 */
226 " jns 1f\n\t" 241 " jns 1f\n\t"
227 " call call_rwsem_downgrade_wake\n" 242 " call call_rwsem_downgrade_wake\n"
228 "1:\n\t" 243 "1:\n\t"
229 "# ending __downgrade_write\n" 244 "# ending __downgrade_write\n"
230 : "+m" (sem->count) 245 : "+m" (sem->count)
231 : "a" (sem), "i" (-RWSEM_WAITING_BIAS) 246 : "a" (sem), "er" (-RWSEM_WAITING_BIAS)
232 : "memory", "cc"); 247 : "memory", "cc");
233} 248}
234 249
235/* 250/*
236 * implement atomic add functionality 251 * implement atomic add functionality
237 */ 252 */
238static inline void rwsem_atomic_add(int delta, struct rw_semaphore *sem) 253static inline void rwsem_atomic_add(rwsem_count_t delta,
254 struct rw_semaphore *sem)
239{ 255{
240 asm volatile(LOCK_PREFIX "addl %1,%0" 256 asm volatile(LOCK_PREFIX _ASM_ADD "%1,%0"
241 : "+m" (sem->count) 257 : "+m" (sem->count)
242 : "ir" (delta)); 258 : "er" (delta));
243} 259}
244 260
245/* 261/*
246 * implement exchange and add functionality 262 * implement exchange and add functionality
247 */ 263 */
248static inline int rwsem_atomic_update(int delta, struct rw_semaphore *sem) 264static inline rwsem_count_t rwsem_atomic_update(rwsem_count_t delta,
265 struct rw_semaphore *sem)
249{ 266{
250 int tmp = delta; 267 rwsem_count_t tmp = delta;
251 268
252 asm volatile(LOCK_PREFIX "xadd %0,%1" 269 asm volatile(LOCK_PREFIX "xadd %0,%1"
253 : "+r" (tmp), "+m" (sem->count) 270 : "+r" (tmp), "+m" (sem->count)
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 1b7ee5d673c2..0a5242428659 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,13 @@
2#define _ASM_X86_SECTIONS_H 2#define _ASM_X86_SECTIONS_H
3 3
4#include <asm-generic/sections.h> 4#include <asm-generic/sections.h>
5#include <asm/uaccess.h>
5 6
6extern char __brk_base[], __brk_limit[]; 7extern char __brk_base[], __brk_limit[];
8extern struct exception_table_entry __stop___ex_table[];
9
10#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
11extern char __end_rodata_hpage_align[];
12#endif
7 13
8#endif /* _ASM_X86_SECTIONS_H */ 14#endif /* _ASM_X86_SECTIONS_H */
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 18e496c98ff0..86b1506f4179 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -37,10 +37,8 @@ void setup_bios_corruption_check(void);
37 37
38#ifdef CONFIG_X86_VISWS 38#ifdef CONFIG_X86_VISWS
39extern void visws_early_detect(void); 39extern void visws_early_detect(void);
40extern int is_visws_box(void);
41#else 40#else
42static inline void visws_early_detect(void) { } 41static inline void visws_early_detect(void) { }
43static inline int is_visws_box(void) { return 0; }
44#endif 42#endif
45 43
46extern unsigned long saved_video_mode; 44extern unsigned long saved_video_mode;
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 72e5a4491661..04459d25e66e 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -124,7 +124,7 @@ struct sigcontext {
124 * fpstate is really (struct _fpstate *) or (struct _xstate *) 124 * fpstate is really (struct _fpstate *) or (struct _xstate *)
125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 125 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 126 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
127 * of extended memory layout. See comments at the defintion of 127 * of extended memory layout. See comments at the definition of
128 * (struct _fpx_sw_bytes) 128 * (struct _fpx_sw_bytes)
129 */ 129 */
130 void __user *fpstate; /* zero when no FPU/extended context */ 130 void __user *fpstate; /* zero when no FPU/extended context */
@@ -219,7 +219,7 @@ struct sigcontext {
219 * fpstate is really (struct _fpstate *) or (struct _xstate *) 219 * fpstate is really (struct _fpstate *) or (struct _xstate *)
220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved 220 * depending on the FP_XSTATE_MAGIC1 encoded in the SW reserved
221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end 221 * bytes of (struct _fpstate) and FP_XSTATE_MAGIC2 present at the end
222 * of extended memory layout. See comments at the defintion of 222 * of extended memory layout. See comments at the definition of
223 * (struct _fpx_sw_bytes) 223 * (struct _fpx_sw_bytes)
224 */ 224 */
225 void __user *fpstate; /* zero when no FPU/extended context */ 225 void __user *fpstate; /* zero when no FPU/extended context */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 1e796782cd7b..4cfc90824068 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -135,6 +135,8 @@ int native_cpu_disable(void);
135void native_cpu_die(unsigned int cpu); 135void native_cpu_die(unsigned int cpu);
136void native_play_dead(void); 136void native_play_dead(void);
137void play_dead_common(void); 137void play_dead_common(void);
138void wbinvd_on_cpu(int cpu);
139int wbinvd_on_all_cpus(void);
138 140
139void native_send_call_func_ipi(const struct cpumask *mask); 141void native_send_call_func_ipi(const struct cpumask *mask);
140void native_send_call_func_single_ipi(int cpu); 142void native_send_call_func_single_ipi(int cpu);
@@ -147,6 +149,13 @@ static inline int num_booting_cpus(void)
147{ 149{
148 return cpumask_weight(cpu_callout_mask); 150 return cpumask_weight(cpu_callout_mask);
149} 151}
152#else /* !CONFIG_SMP */
153#define wbinvd_on_cpu(cpu) wbinvd()
154static inline int wbinvd_on_all_cpus(void)
155{
156 wbinvd();
157 return 0;
158}
150#endif /* CONFIG_SMP */ 159#endif /* CONFIG_SMP */
151 160
152extern unsigned disabled_cpus __cpuinitdata; 161extern unsigned disabled_cpus __cpuinitdata;
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 4e77853321db..3089f70c0c52 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -58,7 +58,7 @@
58#if (NR_CPUS < 256) 58#if (NR_CPUS < 256)
59#define TICKET_SHIFT 8 59#define TICKET_SHIFT 8
60 60
61static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 61static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
62{ 62{
63 short inc = 0x0100; 63 short inc = 0x0100;
64 64
@@ -77,7 +77,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
77 : "memory", "cc"); 77 : "memory", "cc");
78} 78}
79 79
80static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 80static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
81{ 81{
82 int tmp, new; 82 int tmp, new;
83 83
@@ -96,7 +96,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
96 return tmp; 96 return tmp;
97} 97}
98 98
99static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) 99static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
100{ 100{
101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0" 101 asm volatile(UNLOCK_LOCK_PREFIX "incb %0"
102 : "+m" (lock->slock) 102 : "+m" (lock->slock)
@@ -106,7 +106,7 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
106#else 106#else
107#define TICKET_SHIFT 16 107#define TICKET_SHIFT 16
108 108
109static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock) 109static __always_inline void __ticket_spin_lock(arch_spinlock_t *lock)
110{ 110{
111 int inc = 0x00010000; 111 int inc = 0x00010000;
112 int tmp; 112 int tmp;
@@ -127,7 +127,7 @@ static __always_inline void __ticket_spin_lock(raw_spinlock_t *lock)
127 : "memory", "cc"); 127 : "memory", "cc");
128} 128}
129 129
130static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock) 130static __always_inline int __ticket_spin_trylock(arch_spinlock_t *lock)
131{ 131{
132 int tmp; 132 int tmp;
133 int new; 133 int new;
@@ -149,7 +149,7 @@ static __always_inline int __ticket_spin_trylock(raw_spinlock_t *lock)
149 return tmp; 149 return tmp;
150} 150}
151 151
152static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock) 152static __always_inline void __ticket_spin_unlock(arch_spinlock_t *lock)
153{ 153{
154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0" 154 asm volatile(UNLOCK_LOCK_PREFIX "incw %0"
155 : "+m" (lock->slock) 155 : "+m" (lock->slock)
@@ -158,14 +158,14 @@ static __always_inline void __ticket_spin_unlock(raw_spinlock_t *lock)
158} 158}
159#endif 159#endif
160 160
161static inline int __ticket_spin_is_locked(raw_spinlock_t *lock) 161static inline int __ticket_spin_is_locked(arch_spinlock_t *lock)
162{ 162{
163 int tmp = ACCESS_ONCE(lock->slock); 163 int tmp = ACCESS_ONCE(lock->slock);
164 164
165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1)); 165 return !!(((tmp >> TICKET_SHIFT) ^ tmp) & ((1 << TICKET_SHIFT) - 1));
166} 166}
167 167
168static inline int __ticket_spin_is_contended(raw_spinlock_t *lock) 168static inline int __ticket_spin_is_contended(arch_spinlock_t *lock)
169{ 169{
170 int tmp = ACCESS_ONCE(lock->slock); 170 int tmp = ACCESS_ONCE(lock->slock);
171 171
@@ -174,43 +174,43 @@ static inline int __ticket_spin_is_contended(raw_spinlock_t *lock)
174 174
175#ifndef CONFIG_PARAVIRT_SPINLOCKS 175#ifndef CONFIG_PARAVIRT_SPINLOCKS
176 176
177static inline int __raw_spin_is_locked(raw_spinlock_t *lock) 177static inline int arch_spin_is_locked(arch_spinlock_t *lock)
178{ 178{
179 return __ticket_spin_is_locked(lock); 179 return __ticket_spin_is_locked(lock);
180} 180}
181 181
182static inline int __raw_spin_is_contended(raw_spinlock_t *lock) 182static inline int arch_spin_is_contended(arch_spinlock_t *lock)
183{ 183{
184 return __ticket_spin_is_contended(lock); 184 return __ticket_spin_is_contended(lock);
185} 185}
186#define __raw_spin_is_contended __raw_spin_is_contended 186#define arch_spin_is_contended arch_spin_is_contended
187 187
188static __always_inline void __raw_spin_lock(raw_spinlock_t *lock) 188static __always_inline void arch_spin_lock(arch_spinlock_t *lock)
189{ 189{
190 __ticket_spin_lock(lock); 190 __ticket_spin_lock(lock);
191} 191}
192 192
193static __always_inline int __raw_spin_trylock(raw_spinlock_t *lock) 193static __always_inline int arch_spin_trylock(arch_spinlock_t *lock)
194{ 194{
195 return __ticket_spin_trylock(lock); 195 return __ticket_spin_trylock(lock);
196} 196}
197 197
198static __always_inline void __raw_spin_unlock(raw_spinlock_t *lock) 198static __always_inline void arch_spin_unlock(arch_spinlock_t *lock)
199{ 199{
200 __ticket_spin_unlock(lock); 200 __ticket_spin_unlock(lock);
201} 201}
202 202
203static __always_inline void __raw_spin_lock_flags(raw_spinlock_t *lock, 203static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock,
204 unsigned long flags) 204 unsigned long flags)
205{ 205{
206 __raw_spin_lock(lock); 206 arch_spin_lock(lock);
207} 207}
208 208
209#endif /* CONFIG_PARAVIRT_SPINLOCKS */ 209#endif /* CONFIG_PARAVIRT_SPINLOCKS */
210 210
211static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock) 211static inline void arch_spin_unlock_wait(arch_spinlock_t *lock)
212{ 212{
213 while (__raw_spin_is_locked(lock)) 213 while (arch_spin_is_locked(lock))
214 cpu_relax(); 214 cpu_relax();
215} 215}
216 216
@@ -232,7 +232,7 @@ static inline void __raw_spin_unlock_wait(raw_spinlock_t *lock)
232 * read_can_lock - would read_trylock() succeed? 232 * read_can_lock - would read_trylock() succeed?
233 * @lock: the rwlock in question. 233 * @lock: the rwlock in question.
234 */ 234 */
235static inline int __raw_read_can_lock(raw_rwlock_t *lock) 235static inline int arch_read_can_lock(arch_rwlock_t *lock)
236{ 236{
237 return (int)(lock)->lock > 0; 237 return (int)(lock)->lock > 0;
238} 238}
@@ -241,12 +241,12 @@ static inline int __raw_read_can_lock(raw_rwlock_t *lock)
241 * write_can_lock - would write_trylock() succeed? 241 * write_can_lock - would write_trylock() succeed?
242 * @lock: the rwlock in question. 242 * @lock: the rwlock in question.
243 */ 243 */
244static inline int __raw_write_can_lock(raw_rwlock_t *lock) 244static inline int arch_write_can_lock(arch_rwlock_t *lock)
245{ 245{
246 return (lock)->lock == RW_LOCK_BIAS; 246 return (lock)->lock == RW_LOCK_BIAS;
247} 247}
248 248
249static inline void __raw_read_lock(raw_rwlock_t *rw) 249static inline void arch_read_lock(arch_rwlock_t *rw)
250{ 250{
251 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t" 251 asm volatile(LOCK_PREFIX " subl $1,(%0)\n\t"
252 "jns 1f\n" 252 "jns 1f\n"
@@ -255,7 +255,7 @@ static inline void __raw_read_lock(raw_rwlock_t *rw)
255 ::LOCK_PTR_REG (rw) : "memory"); 255 ::LOCK_PTR_REG (rw) : "memory");
256} 256}
257 257
258static inline void __raw_write_lock(raw_rwlock_t *rw) 258static inline void arch_write_lock(arch_rwlock_t *rw)
259{ 259{
260 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t" 260 asm volatile(LOCK_PREFIX " subl %1,(%0)\n\t"
261 "jz 1f\n" 261 "jz 1f\n"
@@ -264,7 +264,7 @@ static inline void __raw_write_lock(raw_rwlock_t *rw)
264 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory"); 264 ::LOCK_PTR_REG (rw), "i" (RW_LOCK_BIAS) : "memory");
265} 265}
266 266
267static inline int __raw_read_trylock(raw_rwlock_t *lock) 267static inline int arch_read_trylock(arch_rwlock_t *lock)
268{ 268{
269 atomic_t *count = (atomic_t *)lock; 269 atomic_t *count = (atomic_t *)lock;
270 270
@@ -274,7 +274,7 @@ static inline int __raw_read_trylock(raw_rwlock_t *lock)
274 return 0; 274 return 0;
275} 275}
276 276
277static inline int __raw_write_trylock(raw_rwlock_t *lock) 277static inline int arch_write_trylock(arch_rwlock_t *lock)
278{ 278{
279 atomic_t *count = (atomic_t *)lock; 279 atomic_t *count = (atomic_t *)lock;
280 280
@@ -284,23 +284,23 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
284 return 0; 284 return 0;
285} 285}
286 286
287static inline void __raw_read_unlock(raw_rwlock_t *rw) 287static inline void arch_read_unlock(arch_rwlock_t *rw)
288{ 288{
289 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory"); 289 asm volatile(LOCK_PREFIX "incl %0" :"+m" (rw->lock) : : "memory");
290} 290}
291 291
292static inline void __raw_write_unlock(raw_rwlock_t *rw) 292static inline void arch_write_unlock(arch_rwlock_t *rw)
293{ 293{
294 asm volatile(LOCK_PREFIX "addl %1, %0" 294 asm volatile(LOCK_PREFIX "addl %1, %0"
295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory"); 295 : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
296} 296}
297 297
298#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock) 298#define arch_read_lock_flags(lock, flags) arch_read_lock(lock)
299#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock) 299#define arch_write_lock_flags(lock, flags) arch_write_lock(lock)
300 300
301#define _raw_spin_relax(lock) cpu_relax() 301#define arch_spin_relax(lock) cpu_relax()
302#define _raw_read_relax(lock) cpu_relax() 302#define arch_read_relax(lock) cpu_relax()
303#define _raw_write_relax(lock) cpu_relax() 303#define arch_write_relax(lock) cpu_relax()
304 304
305/* The {read|write|spin}_lock() on x86 are full memory barriers. */ 305/* The {read|write|spin}_lock() on x86 are full memory barriers. */
306static inline void smp_mb__after_lock(void) { } 306static inline void smp_mb__after_lock(void) { }
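For illustration only (not part of the patch): a user-space C11 sketch of the ticket scheme behind __ticket_spin_lock() for the NR_CPUS < 256 case — next ticket in the high byte, current owner in the low byte — without the kernel's paravirt hooks or pause hints.

#include <stdio.h>
#include <stdbool.h>
#include <stdatomic.h>

#define TICKET_SHIFT	8

struct ticket_lock { _Atomic unsigned short slock; };

static void ticket_lock(struct ticket_lock *l)
{
	/* grab a ticket: the old value tells us which ticket we got */
	unsigned short old = atomic_fetch_add(&l->slock, 1 << TICKET_SHIFT);
	unsigned char  my  = old >> TICKET_SHIFT;

	while ((unsigned char)atomic_load(&l->slock) != my)
		;	/* spin until the owner byte reaches our ticket */
}

static void ticket_unlock(struct ticket_lock *l)
{
	atomic_fetch_add(&l->slock, 1);		/* "incb": pass the lock on */
}

static bool ticket_is_locked(struct ticket_lock *l)
{
	unsigned short v = atomic_load(&l->slock);

	return ((v >> TICKET_SHIFT) ^ v) & ((1 << TICKET_SHIFT) - 1);
}

int main(void)
{
	struct ticket_lock l;

	atomic_init(&l.slock, 0);
	ticket_lock(&l);
	printf("locked: %d\n", ticket_is_locked(&l));
	ticket_unlock(&l);
	printf("locked: %d\n", ticket_is_locked(&l));
	return 0;
}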
diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h
index 845f81c87091..dcb48b2edc11 100644
--- a/arch/x86/include/asm/spinlock_types.h
+++ b/arch/x86/include/asm/spinlock_types.h
@@ -5,16 +5,16 @@
5# error "please don't include this file directly" 5# error "please don't include this file directly"
6#endif 6#endif
7 7
8typedef struct raw_spinlock { 8typedef struct arch_spinlock {
9 unsigned int slock; 9 unsigned int slock;
10} raw_spinlock_t; 10} arch_spinlock_t;
11 11
12#define __RAW_SPIN_LOCK_UNLOCKED { 0 } 12#define __ARCH_SPIN_LOCK_UNLOCKED { 0 }
13 13
14typedef struct { 14typedef struct {
15 unsigned int lock; 15 unsigned int lock;
16} raw_rwlock_t; 16} arch_rwlock_t;
17 17
18#define __RAW_RW_LOCK_UNLOCKED { RW_LOCK_BIAS } 18#define __ARCH_RW_LOCK_UNLOCKED { RW_LOCK_BIAS }
19 19
20#endif /* _ASM_X86_SPINLOCK_TYPES_H */ 20#endif /* _ASM_X86_SPINLOCK_TYPES_H */
diff --git a/arch/x86/include/asm/stacktrace.h b/arch/x86/include/asm/stacktrace.h
index cf86a5e73815..4dab78edbad9 100644
--- a/arch/x86/include/asm/stacktrace.h
+++ b/arch/x86/include/asm/stacktrace.h
@@ -3,7 +3,28 @@
3 3
4extern int kstack_depth_to_print; 4extern int kstack_depth_to_print;
5 5
6int x86_is_stack_id(int id, char *name); 6struct thread_info;
7struct stacktrace_ops;
8
9typedef unsigned long (*walk_stack_t)(struct thread_info *tinfo,
10 unsigned long *stack,
11 unsigned long bp,
12 const struct stacktrace_ops *ops,
13 void *data,
14 unsigned long *end,
15 int *graph);
16
17extern unsigned long
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22
23extern unsigned long
24print_context_stack_bp(struct thread_info *tinfo,
25 unsigned long *stack, unsigned long bp,
26 const struct stacktrace_ops *ops, void *data,
27 unsigned long *end, int *graph);
7 28
8/* Generic stack tracer with callbacks */ 29/* Generic stack tracer with callbacks */
9 30
@@ -14,6 +35,7 @@ struct stacktrace_ops {
14 void (*address)(void *data, unsigned long address, int reliable); 35 void (*address)(void *data, unsigned long address, int reliable);
15 /* On negative return stop dumping */ 36 /* On negative return stop dumping */
16 int (*stack)(void *data, char *name); 37 int (*stack)(void *data, char *name);
38 walk_stack_t walk_stack;
17}; 39};
18 40
19void dump_trace(struct task_struct *tsk, struct pt_regs *regs, 41void dump_trace(struct task_struct *tsk, struct pt_regs *regs,
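For illustration only (not part of the patch): the point of the new walk_stack_t member is that the walking strategy travels inside the ops table, so callers can swap "print every word" for "follow frame pointers". A user-space analog with made-up names:

#include <stdio.h>

struct ops;
typedef unsigned long (*walk_stack_t)(unsigned long *stack, unsigned long bp,
				      const struct ops *ops, void *data);

struct ops {
	void (*address)(void *data, unsigned long addr);
	walk_stack_t walk_stack;	/* pluggable walker, as in stacktrace_ops */
};

static unsigned long walk_all_words(unsigned long *stack, unsigned long bp,
				    const struct ops *ops, void *data)
{
	for (int i = 0; i < 4; i++)	/* pretend 4 words are on the stack */
		ops->address(data, stack[i]);
	return bp;
}

static void print_addr(void *data, unsigned long addr)
{
	printf("  [<%#lx>]\n", addr);
}

int main(void)
{
	unsigned long fake_stack[4] = { 0x1000, 0x2000, 0x3000, 0x4000 };
	struct ops ops = { .address = print_addr, .walk_stack = walk_all_words };

	ops.walk_stack(fake_stack, 0, &ops, NULL);
	return 0;
}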
diff --git a/arch/x86/include/asm/string_32.h b/arch/x86/include/asm/string_32.h
index ae907e617181..3d3e8353ee5c 100644
--- a/arch/x86/include/asm/string_32.h
+++ b/arch/x86/include/asm/string_32.h
@@ -177,10 +177,15 @@ static inline void *__memcpy3d(void *to, const void *from, size_t len)
177 */ 177 */
178 178
179#ifndef CONFIG_KMEMCHECK 179#ifndef CONFIG_KMEMCHECK
180
181#if (__GNUC__ >= 4)
182#define memcpy(t, f, n) __builtin_memcpy(t, f, n)
183#else
180#define memcpy(t, f, n) \ 184#define memcpy(t, f, n) \
181 (__builtin_constant_p((n)) \ 185 (__builtin_constant_p((n)) \
182 ? __constant_memcpy((t), (f), (n)) \ 186 ? __constant_memcpy((t), (f), (n)) \
183 : __memcpy((t), (f), (n))) 187 : __memcpy((t), (f), (n)))
188#endif
184#else 189#else
185/* 190/*
186 * kmemcheck becomes very happy if we use the REP instructions unconditionally, 191 * kmemcheck becomes very happy if we use the REP instructions unconditionally,
@@ -316,11 +321,15 @@ void *__constant_c_and_count_memset(void *s, unsigned long pattern,
316 : __memset_generic((s), (c), (count))) 321 : __memset_generic((s), (c), (count)))
317 322
318#define __HAVE_ARCH_MEMSET 323#define __HAVE_ARCH_MEMSET
324#if (__GNUC__ >= 4)
325#define memset(s, c, count) __builtin_memset(s, c, count)
326#else
319#define memset(s, c, count) \ 327#define memset(s, c, count) \
320 (__builtin_constant_p(c) \ 328 (__builtin_constant_p(c) \
321 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \ 329 ? __constant_c_x_memset((s), (0x01010101UL * (unsigned char)(c)), \
322 (count)) \ 330 (count)) \
323 : __memset((s), (c), (count))) 331 : __memset((s), (c), (count)))
332#endif
324 333
325/* 334/*
326 * find the first occurrence of byte 'c', or 1 past the area if none 335 * find the first occurrence of byte 'c', or 1 past the area if none
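For illustration only (not part of the patch): with GCC >= 4 the header simply forwards to the compiler builtins, which are free to expand small constant-size operations inline. A trivial sketch:

#include <stdio.h>

int main(void)
{
	char src[16] = "builtin memcpy";
	char dst[16];

	/* constant sizes let the compiler expand these inline instead of
	 * calling out to the library routines */
	__builtin_memset(dst, 0, sizeof(dst));
	__builtin_memcpy(dst, src, sizeof(dst));

	printf("%s\n", dst);
	return 0;
}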
diff --git a/arch/x86/include/asm/svm.h b/arch/x86/include/asm/svm.h
index 85574b7c1bc1..38638cd2fa4c 100644
--- a/arch/x86/include/asm/svm.h
+++ b/arch/x86/include/asm/svm.h
@@ -57,7 +57,8 @@ struct __attribute__ ((__packed__)) vmcb_control_area {
57 u16 intercept_dr_write; 57 u16 intercept_dr_write;
58 u32 intercept_exceptions; 58 u32 intercept_exceptions;
59 u64 intercept; 59 u64 intercept;
60 u8 reserved_1[44]; 60 u8 reserved_1[42];
61 u16 pause_filter_count;
61 u64 iopm_base_pa; 62 u64 iopm_base_pa;
62 u64 msrpm_base_pa; 63 u64 msrpm_base_pa;
63 u64 tsc_offset; 64 u64 tsc_offset;
@@ -312,7 +313,7 @@ struct __attribute__ ((__packed__)) vmcb {
312 313
313#define SVM_EXIT_ERR -1 314#define SVM_EXIT_ERR -1
314 315
315#define SVM_CR0_SELECTIVE_MASK (1 << 3 | 1) /* TS and MP */ 316#define SVM_CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)
316 317
317#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda" 318#define SVM_VMLOAD ".byte 0x0f, 0x01, 0xda"
318#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8" 319#define SVM_VMRUN ".byte 0x0f, 0x01, 0xd8"
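The SVM_CR0_SELECTIVE_MASK change above is more than cosmetic: the old literal (1 << 3 | 1) is 0x9, i.e. CR0.TS plus CR0.PE, even though the comment claimed "TS and MP", while the named constants give 0xa, the TS|MP value that was actually intended. The standalone check below uses the architectural CR0 bit positions (PE = bit 0, MP = bit 1, TS = bit 3); the X86_CR0_* values are restated locally so it compiles outside the kernel.

#include <stdio.h>

#define X86_CR0_PE 0x00000001UL	/* Protection Enable, bit 0 */
#define X86_CR0_MP 0x00000002UL	/* Monitor Coprocessor, bit 1 */
#define X86_CR0_TS 0x00000008UL	/* Task Switched, bit 3 */

int main(void)
{
	unsigned long old_mask = (1 << 3 | 1);			/* 0x9 = TS | PE */
	unsigned long new_mask = X86_CR0_TS | X86_CR0_MP;	/* 0xa = TS | MP */

	printf("old mask %#lx, new mask %#lx\n", old_mask, new_mask);
	return 0;
}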
diff --git a/arch/x86/include/asm/swiotlb.h b/arch/x86/include/asm/swiotlb.h
index b9e4e20174fb..8085277e1b8b 100644
--- a/arch/x86/include/asm/swiotlb.h
+++ b/arch/x86/include/asm/swiotlb.h
@@ -3,15 +3,16 @@
3 3
4#include <linux/swiotlb.h> 4#include <linux/swiotlb.h>
5 5
6/* SWIOTLB interface */
7
8extern int swiotlb_force;
9
10#ifdef CONFIG_SWIOTLB 6#ifdef CONFIG_SWIOTLB
11extern int swiotlb; 7extern int swiotlb;
12extern void pci_swiotlb_init(void); 8extern int __init pci_swiotlb_detect(void);
9extern void __init pci_swiotlb_init(void);
13#else 10#else
14#define swiotlb 0 11#define swiotlb 0
12static inline int pci_swiotlb_detect(void)
13{
14 return 0;
15}
15static inline void pci_swiotlb_init(void) 16static inline void pci_swiotlb_init(void)
16{ 17{
17} 18}
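The swiotlb hunk splits the old single entry point into a detect step and an init step, with static-inline stubs when CONFIG_SWIOTLB is off so callers need no #ifdefs. A hedged sketch of the resulting call pattern follows; the real call sites live in this tree's DMA/IOMMU setup code, and the wrapper function here is purely illustrative.

#include <linux/init.h>
#include <asm/swiotlb.h>

static int __init example_dma_setup(void)
{
	/* pci_swiotlb_detect() reports whether bounce buffering is needed;
	 * with CONFIG_SWIOTLB=n the stubs above make both calls compile away */
	if (pci_swiotlb_detect())
		pci_swiotlb_init();
	return 0;
}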
diff --git a/arch/x86/include/asm/sys_ia32.h b/arch/x86/include/asm/sys_ia32.h
index 72a6dcd1299b..3ad421784ae7 100644
--- a/arch/x86/include/asm/sys_ia32.h
+++ b/arch/x86/include/asm/sys_ia32.h
@@ -26,11 +26,10 @@ asmlinkage long sys32_lstat64(char __user *, struct stat64 __user *);
26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *); 26asmlinkage long sys32_fstat64(unsigned int, struct stat64 __user *);
27asmlinkage long sys32_fstatat(unsigned int, char __user *, 27asmlinkage long sys32_fstatat(unsigned int, char __user *,
28 struct stat64 __user *, int); 28 struct stat64 __user *, int);
29struct mmap_arg_struct; 29struct mmap_arg_struct32;
30asmlinkage long sys32_mmap(struct mmap_arg_struct __user *); 30asmlinkage long sys32_mmap(struct mmap_arg_struct32 __user *);
31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long); 31asmlinkage long sys32_mprotect(unsigned long, size_t, unsigned long);
32 32
33asmlinkage long sys32_pipe(int __user *);
34struct sigaction32; 33struct sigaction32;
35struct old_sigaction32; 34struct old_sigaction32;
36asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *, 35asmlinkage long sys32_rt_sigaction(int, struct sigaction32 __user *,
@@ -41,8 +40,6 @@ asmlinkage long sys32_rt_sigprocmask(int, compat_sigset_t __user *,
41 compat_sigset_t __user *, unsigned int); 40 compat_sigset_t __user *, unsigned int);
42asmlinkage long sys32_alarm(unsigned int); 41asmlinkage long sys32_alarm(unsigned int);
43 42
44struct sel_arg_struct;
45asmlinkage long sys32_old_select(struct sel_arg_struct __user *);
46asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int); 43asmlinkage long sys32_waitpid(compat_pid_t, unsigned int *, int);
47asmlinkage long sys32_sysfs(int, u32, u32); 44asmlinkage long sys32_sysfs(int, u32, u32);
48 45
@@ -51,25 +48,12 @@ asmlinkage long sys32_sched_rr_get_interval(compat_pid_t,
51asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t); 48asmlinkage long sys32_rt_sigpending(compat_sigset_t __user *, compat_size_t);
52asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *); 49asmlinkage long sys32_rt_sigqueueinfo(int, int, compat_siginfo_t __user *);
53 50
54#ifdef CONFIG_SYSCTL_SYSCALL
55struct sysctl_ia32;
56asmlinkage long sys32_sysctl(struct sysctl_ia32 __user *);
57#endif
58
59asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32); 51asmlinkage long sys32_pread(unsigned int, char __user *, u32, u32, u32);
60asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32); 52asmlinkage long sys32_pwrite(unsigned int, char __user *, u32, u32, u32);
61 53
62asmlinkage long sys32_personality(unsigned long); 54asmlinkage long sys32_personality(unsigned long);
63asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32); 55asmlinkage long sys32_sendfile(int, int, compat_off_t __user *, s32);
64 56
65asmlinkage long sys32_mmap2(unsigned long, unsigned long, unsigned long,
66 unsigned long, unsigned long, unsigned long);
67
68struct oldold_utsname;
69struct old_utsname;
70asmlinkage long sys32_olduname(struct oldold_utsname __user *);
71long sys32_uname(struct old_utsname __user *);
72
73asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *, 57asmlinkage long sys32_execve(char __user *, compat_uptr_t __user *,
74 compat_uptr_t __user *, struct pt_regs *); 58 compat_uptr_t __user *, struct pt_regs *);
75asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *); 59asmlinkage long sys32_clone(unsigned int, unsigned int, struct pt_regs *);
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index 8d33bc5462d1..c4a348f7bd43 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -16,6 +16,8 @@
16#include <linux/sched.h> 16#include <linux/sched.h>
17#include <linux/err.h> 17#include <linux/err.h>
18 18
19extern const unsigned long sys_call_table[];
20
19/* 21/*
20 * Only the low 32 bits of orig_ax are meaningful, so we return int. 22 * Only the low 32 bits of orig_ax are meaningful, so we return int.
21 * This importantly ignores the high bits on 64-bit, so comparisons 23 * This importantly ignores the high bits on 64-bit, so comparisons
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 372b76edd63f..5c044b43e9a7 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -18,16 +18,24 @@
18/* Common in X86_32 and X86_64 */ 18/* Common in X86_32 and X86_64 */
19/* kernel/ioport.c */ 19/* kernel/ioport.c */
20asmlinkage long sys_ioperm(unsigned long, unsigned long, int); 20asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
21long sys_iopl(unsigned int, struct pt_regs *);
21 22
22/* kernel/process.c */ 23/* kernel/process.c */
23int sys_fork(struct pt_regs *); 24int sys_fork(struct pt_regs *);
24int sys_vfork(struct pt_regs *); 25int sys_vfork(struct pt_regs *);
26long sys_execve(char __user *, char __user * __user *,
27 char __user * __user *, struct pt_regs *);
28long sys_clone(unsigned long, unsigned long, void __user *,
29 void __user *, struct pt_regs *);
25 30
26/* kernel/ldt.c */ 31/* kernel/ldt.c */
27asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); 32asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
28 33
29/* kernel/signal.c */ 34/* kernel/signal.c */
30long sys_rt_sigreturn(struct pt_regs *); 35long sys_rt_sigreturn(struct pt_regs *);
36long sys_sigaltstack(const stack_t __user *, stack_t __user *,
37 struct pt_regs *);
38
31 39
32/* kernel/tls.c */ 40/* kernel/tls.c */
33asmlinkage int sys_set_thread_area(struct user_desc __user *); 41asmlinkage int sys_set_thread_area(struct user_desc __user *);
@@ -35,63 +43,26 @@ asmlinkage int sys_get_thread_area(struct user_desc __user *);
35 43
36/* X86_32 only */ 44/* X86_32 only */
37#ifdef CONFIG_X86_32 45#ifdef CONFIG_X86_32
38/* kernel/ioport.c */
39long sys_iopl(struct pt_regs *);
40
41/* kernel/process_32.c */
42int sys_clone(struct pt_regs *);
43int sys_execve(struct pt_regs *);
44 46
45/* kernel/signal.c */ 47/* kernel/signal.c */
46asmlinkage int sys_sigsuspend(int, int, old_sigset_t); 48asmlinkage int sys_sigsuspend(int, int, old_sigset_t);
47asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, 49asmlinkage int sys_sigaction(int, const struct old_sigaction __user *,
48 struct old_sigaction __user *); 50 struct old_sigaction __user *);
49int sys_sigaltstack(struct pt_regs *);
50unsigned long sys_sigreturn(struct pt_regs *); 51unsigned long sys_sigreturn(struct pt_regs *);
51 52
52/* kernel/sys_i386_32.c */
53struct mmap_arg_struct;
54struct sel_arg_struct;
55struct oldold_utsname;
56struct old_utsname;
57
58asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long,
59 unsigned long, unsigned long, unsigned long);
60asmlinkage int old_mmap(struct mmap_arg_struct __user *);
61asmlinkage int old_select(struct sel_arg_struct __user *);
62asmlinkage int sys_ipc(uint, int, int, int, void __user *, long);
63asmlinkage int sys_uname(struct old_utsname __user *);
64asmlinkage int sys_olduname(struct oldold_utsname __user *);
65
66/* kernel/vm86_32.c */ 53/* kernel/vm86_32.c */
67int sys_vm86old(struct pt_regs *); 54int sys_vm86old(struct vm86_struct __user *, struct pt_regs *);
68int sys_vm86(struct pt_regs *); 55int sys_vm86(unsigned long, unsigned long, struct pt_regs *);
69 56
70#else /* CONFIG_X86_32 */ 57#else /* CONFIG_X86_32 */
71 58
72/* X86_64 only */ 59/* X86_64 only */
73/* kernel/ioport.c */
74asmlinkage long sys_iopl(unsigned int, struct pt_regs *);
75
76/* kernel/process_64.c */ 60/* kernel/process_64.c */
77asmlinkage long sys_clone(unsigned long, unsigned long,
78 void __user *, void __user *,
79 struct pt_regs *);
80asmlinkage long sys_execve(char __user *, char __user * __user *,
81 char __user * __user *,
82 struct pt_regs *);
83long sys_arch_prctl(int, unsigned long); 61long sys_arch_prctl(int, unsigned long);
84 62
85/* kernel/signal.c */
86asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *,
87 struct pt_regs *);
88
89/* kernel/sys_x86_64.c */ 63/* kernel/sys_x86_64.c */
90struct new_utsname;
91
92asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, 64asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long,
93 unsigned long, unsigned long, unsigned long); 65 unsigned long, unsigned long, unsigned long);
94asmlinkage long sys_uname(struct new_utsname __user *);
95 66
96#endif /* CONFIG_X86_32 */ 67#endif /* CONFIG_X86_32 */
97#endif /* _ASM_X86_SYSCALLS_H */ 68#endif /* _ASM_X86_SYSCALLS_H */
diff --git a/arch/x86/include/asm/system.h b/arch/x86/include/asm/system.h
index f08f97374892..b8fe48ee2ed9 100644
--- a/arch/x86/include/asm/system.h
+++ b/arch/x86/include/asm/system.h
@@ -11,9 +11,9 @@
11#include <linux/irqflags.h> 11#include <linux/irqflags.h>
12 12
13/* entries in ARCH_DLINFO: */ 13/* entries in ARCH_DLINFO: */
14#ifdef CONFIG_IA32_EMULATION 14#if defined(CONFIG_IA32_EMULATION) || !defined(CONFIG_X86_64)
15# define AT_VECTOR_SIZE_ARCH 2 15# define AT_VECTOR_SIZE_ARCH 2
16#else 16#else /* else it's non-compat x86-64 */
17# define AT_VECTOR_SIZE_ARCH 1 17# define AT_VECTOR_SIZE_ARCH 1
18#endif 18#endif
19 19
@@ -23,6 +23,7 @@ struct task_struct *__switch_to(struct task_struct *prev,
23struct tss_struct; 23struct tss_struct;
24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p, 24void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
25 struct tss_struct *tss); 25 struct tss_struct *tss);
26extern void show_regs_common(void);
26 27
27#ifdef CONFIG_X86_32 28#ifdef CONFIG_X86_32
28 29
@@ -31,7 +32,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
31 "movl %P[task_canary](%[next]), %%ebx\n\t" \ 32 "movl %P[task_canary](%[next]), %%ebx\n\t" \
32 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t" 33 "movl %%ebx, "__percpu_arg([stack_canary])"\n\t"
33#define __switch_canary_oparam \ 34#define __switch_canary_oparam \
34 , [stack_canary] "=m" (per_cpu_var(stack_canary.canary)) 35 , [stack_canary] "=m" (stack_canary.canary)
35#define __switch_canary_iparam \ 36#define __switch_canary_iparam \
36 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 37 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
37#else /* CC_STACKPROTECTOR */ 38#else /* CC_STACKPROTECTOR */
@@ -113,7 +114,7 @@ do { \
113 "movq %P[task_canary](%%rsi),%%r8\n\t" \ 114 "movq %P[task_canary](%%rsi),%%r8\n\t" \
114 "movq %%r8,"__percpu_arg([gs_canary])"\n\t" 115 "movq %%r8,"__percpu_arg([gs_canary])"\n\t"
115#define __switch_canary_oparam \ 116#define __switch_canary_oparam \
116 , [gs_canary] "=m" (per_cpu_var(irq_stack_union.stack_canary)) 117 , [gs_canary] "=m" (irq_stack_union.stack_canary)
117#define __switch_canary_iparam \ 118#define __switch_canary_iparam \
118 , [task_canary] "i" (offsetof(struct task_struct, stack_canary)) 119 , [task_canary] "i" (offsetof(struct task_struct, stack_canary))
119#else /* CC_STACKPROTECTOR */ 120#else /* CC_STACKPROTECTOR */
@@ -128,13 +129,11 @@ do { \
128 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ 129 "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \
129 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ 130 "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \
130 "call __switch_to\n\t" \ 131 "call __switch_to\n\t" \
131 ".globl thread_return\n" \
132 "thread_return:\n\t" \
133 "movq "__percpu_arg([current_task])",%%rsi\n\t" \ 132 "movq "__percpu_arg([current_task])",%%rsi\n\t" \
134 __switch_canary \ 133 __switch_canary \
135 "movq %P[thread_info](%%rsi),%%r8\n\t" \ 134 "movq %P[thread_info](%%rsi),%%r8\n\t" \
136 "movq %%rax,%%rdi\n\t" \ 135 "movq %%rax,%%rdi\n\t" \
137 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \ 136 "testl %[_tif_fork],%P[ti_flags](%%r8)\n\t" \
138 "jnz ret_from_fork\n\t" \ 137 "jnz ret_from_fork\n\t" \
139 RESTORE_CONTEXT \ 138 RESTORE_CONTEXT \
140 : "=a" (last) \ 139 : "=a" (last) \
@@ -144,7 +143,7 @@ do { \
144 [ti_flags] "i" (offsetof(struct thread_info, flags)), \ 143 [ti_flags] "i" (offsetof(struct thread_info, flags)), \
145 [_tif_fork] "i" (_TIF_FORK), \ 144 [_tif_fork] "i" (_TIF_FORK), \
146 [thread_info] "i" (offsetof(struct task_struct, stack)), \ 145 [thread_info] "i" (offsetof(struct task_struct, stack)), \
147 [current_task] "m" (per_cpu_var(current_task)) \ 146 [current_task] "m" (current_task) \
148 __switch_canary_iparam \ 147 __switch_canary_iparam \
149 : "memory", "cc" __EXTRA_CLOBBER) 148 : "memory", "cc" __EXTRA_CLOBBER)
150#endif 149#endif
@@ -157,19 +156,22 @@ extern void native_load_gs_index(unsigned);
157 * Load a segment. Fall back on loading the zero 156 * Load a segment. Fall back on loading the zero
158 * segment if something goes wrong.. 157 * segment if something goes wrong..
159 */ 158 */
160#define loadsegment(seg, value) \ 159#define loadsegment(seg, value) \
161 asm volatile("\n" \ 160do { \
162 "1:\t" \ 161 unsigned short __val = (value); \
163 "movl %k0,%%" #seg "\n" \ 162 \
164 "2:\n" \ 163 asm volatile(" \n" \
165 ".section .fixup,\"ax\"\n" \ 164 "1: movl %k0,%%" #seg " \n" \
166 "3:\t" \ 165 \
167 "movl %k1, %%" #seg "\n\t" \ 166 ".section .fixup,\"ax\" \n" \
168 "jmp 2b\n" \ 167 "2: xorl %k0,%k0 \n" \
169 ".previous\n" \ 168 " jmp 1b \n" \
170 _ASM_EXTABLE(1b,3b) \ 169 ".previous \n" \
171 : :"r" (value), "r" (0) : "memory") 170 \
172 171 _ASM_EXTABLE(1b, 2b) \
172 \
173 : "+r" (__val) : : "memory"); \
174} while (0)
173 175
174/* 176/*
175 * Save a segment register away 177 * Save a segment register away
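The loadsegment() rewrite above drops the spare zero operand: the value is copied into __val, and if the mov into the segment register faults, the exception table entry sends control to the .fixup stub, which zeroes that same register ("xorl %k0,%k0") and jumps back, so the segment ends up holding the null selector. Making the operand "+r" is what lets the fixup reuse the register instead of needing a second "0" input. Caller usage is unchanged; an illustrative caller:

static inline void example_restore_fs(unsigned short sel)
{
	/* loads %fs with sel, or with the null selector if sel faults */
	loadsegment(fs, sel);
}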
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index d27d0a2fec4c..e0d28901e969 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -83,10 +83,10 @@ struct thread_info {
83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ 83#define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */
84#define TIF_SECCOMP 8 /* secure computing */ 84#define TIF_SECCOMP 8 /* secure computing */
85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ 85#define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */
86#define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */
86#define TIF_NOTSC 16 /* TSC is not accessible in userland */ 87#define TIF_NOTSC 16 /* TSC is not accessible in userland */
87#define TIF_IA32 17 /* 32bit process */ 88#define TIF_IA32 17 /* 32bit process */
88#define TIF_FORK 18 /* ret_from_fork */ 89#define TIF_FORK 18 /* ret_from_fork */
89#define TIF_ABI_PENDING 19
90#define TIF_MEMDIE 20 90#define TIF_MEMDIE 20
91#define TIF_DEBUG 21 /* uses debug registers */ 91#define TIF_DEBUG 21 /* uses debug registers */
92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */ 92#define TIF_IO_BITMAP 22 /* uses I/O bitmap */
@@ -107,10 +107,10 @@ struct thread_info {
107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) 107#define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT)
108#define _TIF_SECCOMP (1 << TIF_SECCOMP) 108#define _TIF_SECCOMP (1 << TIF_SECCOMP)
109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) 109#define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY)
110#define _TIF_USER_RETURN_NOTIFY (1 << TIF_USER_RETURN_NOTIFY)
110#define _TIF_NOTSC (1 << TIF_NOTSC) 111#define _TIF_NOTSC (1 << TIF_NOTSC)
111#define _TIF_IA32 (1 << TIF_IA32) 112#define _TIF_IA32 (1 << TIF_IA32)
112#define _TIF_FORK (1 << TIF_FORK) 113#define _TIF_FORK (1 << TIF_FORK)
113#define _TIF_ABI_PENDING (1 << TIF_ABI_PENDING)
114#define _TIF_DEBUG (1 << TIF_DEBUG) 114#define _TIF_DEBUG (1 << TIF_DEBUG)
115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) 115#define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP)
116#define _TIF_FREEZE (1 << TIF_FREEZE) 116#define _TIF_FREEZE (1 << TIF_FREEZE)
@@ -142,13 +142,14 @@ struct thread_info {
142 142
143/* Only used for 64 bit */ 143/* Only used for 64 bit */
144#define _TIF_DO_NOTIFY_MASK \ 144#define _TIF_DO_NOTIFY_MASK \
145 (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) 145 (_TIF_SIGPENDING | _TIF_MCE_NOTIFY | _TIF_NOTIFY_RESUME | \
146 _TIF_USER_RETURN_NOTIFY)
146 147
147/* flags to check in __switch_to() */ 148/* flags to check in __switch_to() */
148#define _TIF_WORK_CTXSW \ 149#define _TIF_WORK_CTXSW \
149 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC) 150 (_TIF_IO_BITMAP|_TIF_DEBUGCTLMSR|_TIF_DS_AREA_MSR|_TIF_NOTSC)
150 151
151#define _TIF_WORK_CTXSW_PREV _TIF_WORK_CTXSW 152#define _TIF_WORK_CTXSW_PREV (_TIF_WORK_CTXSW|_TIF_USER_RETURN_NOTIFY)
152#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG) 153#define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW|_TIF_DEBUG)
153 154
154#define PREEMPT_ACTIVE 0x10000000 155#define PREEMPT_ACTIVE 0x10000000
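TIF_USER_RETURN_NOTIFY (bit 11) is added to both _TIF_DO_NOTIFY_MASK and _TIF_WORK_CTXSW_PREV above, so the flag is honoured both on return to user space and when the flagged task is switched out. The sketch below is illustrative only; the real consumers are the entry_*.S exit paths and __switch_to_xtra().

#include <linux/sched.h>

static inline bool example_has_exit_work(struct thread_info *ti)
{
	return ti->flags & _TIF_DO_NOTIFY_MASK;		/* includes the new flag */
}

static inline bool example_prev_needs_xtra_work(struct thread_info *ti)
{
	return ti->flags & _TIF_WORK_CTXSW_PREV;	/* checked for the outgoing task */
}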
diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h
index 40e37b10c6c0..c5087d796587 100644
--- a/arch/x86/include/asm/topology.h
+++ b/arch/x86/include/asm/topology.h
@@ -35,11 +35,16 @@
35# endif 35# endif
36#endif 36#endif
37 37
38/* Node not present */ 38/*
39#define NUMA_NO_NODE (-1) 39 * to preserve the visibility of NUMA_NO_NODE definition,
40 * moved to there from here. May be used independent of
41 * CONFIG_NUMA.
42 */
43#include <linux/numa.h>
40 44
41#ifdef CONFIG_NUMA 45#ifdef CONFIG_NUMA
42#include <linux/cpumask.h> 46#include <linux/cpumask.h>
47
43#include <asm/mpspec.h> 48#include <asm/mpspec.h>
44 49
45#ifdef CONFIG_X86_32 50#ifdef CONFIG_X86_32
diff --git a/arch/x86/include/asm/trampoline.h b/arch/x86/include/asm/trampoline.h
index 90f06c25221d..cb507bb05d79 100644
--- a/arch/x86/include/asm/trampoline.h
+++ b/arch/x86/include/asm/trampoline.h
@@ -16,7 +16,6 @@ extern unsigned long initial_code;
16extern unsigned long initial_gs; 16extern unsigned long initial_gs;
17 17
18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE) 18#define TRAMPOLINE_SIZE roundup(trampoline_end - trampoline_data, PAGE_SIZE)
19#define TRAMPOLINE_BASE 0x6000
20 19
21extern unsigned long setup_trampoline(void); 20extern unsigned long setup_trampoline(void);
22extern void __init reserve_trampoline_memory(void); 21extern void __init reserve_trampoline_memory(void);
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index d2c6c930b491..abd3e0ea762a 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -570,7 +570,6 @@ extern struct movsl_mask {
570#ifdef CONFIG_X86_32 570#ifdef CONFIG_X86_32
571# include "uaccess_32.h" 571# include "uaccess_32.h"
572#else 572#else
573# define ARCH_HAS_SEARCH_EXTABLE
574# include "uaccess_64.h" 573# include "uaccess_64.h"
575#endif 574#endif
576 575
diff --git a/arch/x86/include/asm/uaccess_32.h b/arch/x86/include/asm/uaccess_32.h
index 632fb44b4cb5..088d09fb1615 100644
--- a/arch/x86/include/asm/uaccess_32.h
+++ b/arch/x86/include/asm/uaccess_32.h
@@ -187,9 +187,33 @@ __copy_from_user_inatomic_nocache(void *to, const void __user *from,
187 187
188unsigned long __must_check copy_to_user(void __user *to, 188unsigned long __must_check copy_to_user(void __user *to,
189 const void *from, unsigned long n); 189 const void *from, unsigned long n);
190unsigned long __must_check copy_from_user(void *to, 190unsigned long __must_check _copy_from_user(void *to,
191 const void __user *from, 191 const void __user *from,
192 unsigned long n); 192 unsigned long n);
193
194
195extern void copy_from_user_overflow(void)
196#ifdef CONFIG_DEBUG_STRICT_USER_COPY_CHECKS
197 __compiletime_error("copy_from_user() buffer size is not provably correct")
198#else
199 __compiletime_warning("copy_from_user() buffer size is not provably correct")
200#endif
201;
202
203static inline unsigned long __must_check copy_from_user(void *to,
204 const void __user *from,
205 unsigned long n)
206{
207 int sz = __compiletime_object_size(to);
208
209 if (likely(sz == -1 || sz >= n))
210 n = _copy_from_user(to, from, n);
211 else
212 copy_from_user_overflow();
213
214 return n;
215}
216
193long __must_check strncpy_from_user(char *dst, const char __user *src, 217long __must_check strncpy_from_user(char *dst, const char __user *src,
194 long count); 218 long count);
195long __must_check __strncpy_from_user(char *dst, 219long __must_check __strncpy_from_user(char *dst,
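copy_from_user() on 32-bit is now a wrapper that feeds the destination's compile-time object size into a cheap bounds check: when gcc can prove the requested length exceeds the destination, the reference to copy_from_user_overflow() survives constant folding and its __compiletime_warning / __compiletime_error attribute flags the build. A hedged sketch of the kind of caller this catches (whether it fires also depends on optimization level and inlining):

#include <linux/uaccess.h>

long example_read_config(void __user *ubuf)
{
	char kbuf[16];

	/* __compiletime_object_size(kbuf) == 16 but n == 32, so the
	 * overflow branch is selected at compile time and reported */
	return copy_from_user(kbuf, ubuf, 32);
}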
diff --git a/arch/x86/include/asm/uaccess_64.h b/arch/x86/include/asm/uaccess_64.h
index db24b215fc50..316708d5af92 100644
--- a/arch/x86/include/asm/uaccess_64.h
+++ b/arch/x86/include/asm/uaccess_64.h
@@ -8,6 +8,8 @@
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/prefetch.h> 9#include <linux/prefetch.h>
10#include <linux/lockdep.h> 10#include <linux/lockdep.h>
11#include <asm/alternative.h>
12#include <asm/cpufeature.h>
11#include <asm/page.h> 13#include <asm/page.h>
12 14
13/* 15/*
@@ -16,15 +18,56 @@
16 18
17/* Handles exceptions in both to and from, but doesn't do access_ok */ 19/* Handles exceptions in both to and from, but doesn't do access_ok */
18__must_check unsigned long 20__must_check unsigned long
19copy_user_generic(void *to, const void *from, unsigned len); 21copy_user_generic_string(void *to, const void *from, unsigned len);
22__must_check unsigned long
23copy_user_generic_unrolled(void *to, const void *from, unsigned len);
24
25static __always_inline __must_check unsigned long
26copy_user_generic(void *to, const void *from, unsigned len)
27{
28 unsigned ret;
29
30 alternative_call(copy_user_generic_unrolled,
31 copy_user_generic_string,
32 X86_FEATURE_REP_GOOD,
33 ASM_OUTPUT2("=a" (ret), "=D" (to), "=S" (from),
34 "=d" (len)),
35 "1" (to), "2" (from), "3" (len)
36 : "memory", "rcx", "r8", "r9", "r10", "r11");
37 return ret;
38}
20 39
21__must_check unsigned long 40__must_check unsigned long
22copy_to_user(void __user *to, const void *from, unsigned len); 41_copy_to_user(void __user *to, const void *from, unsigned len);
23__must_check unsigned long 42__must_check unsigned long
24copy_from_user(void *to, const void __user *from, unsigned len); 43_copy_from_user(void *to, const void __user *from, unsigned len);
25__must_check unsigned long 44__must_check unsigned long
26copy_in_user(void __user *to, const void __user *from, unsigned len); 45copy_in_user(void __user *to, const void __user *from, unsigned len);
27 46
47static inline unsigned long __must_check copy_from_user(void *to,
48 const void __user *from,
49 unsigned long n)
50{
51 int sz = __compiletime_object_size(to);
52
53 might_fault();
54 if (likely(sz == -1 || sz >= n))
55 n = _copy_from_user(to, from, n);
56#ifdef CONFIG_DEBUG_VM
57 else
58 WARN(1, "Buffer overflow detected!\n");
59#endif
60 return n;
61}
62
63static __always_inline __must_check
64int copy_to_user(void __user *dst, const void *src, unsigned size)
65{
66 might_fault();
67
68 return _copy_to_user(dst, src, size);
69}
70
28static __always_inline __must_check 71static __always_inline __must_check
29int __copy_from_user(void *dst, const void __user *src, unsigned size) 72int __copy_from_user(void *dst, const void __user *src, unsigned size)
30{ 73{
@@ -176,8 +219,11 @@ __must_check long strlen_user(const char __user *str);
176__must_check unsigned long clear_user(void __user *mem, unsigned long len); 219__must_check unsigned long clear_user(void __user *mem, unsigned long len);
177__must_check unsigned long __clear_user(void __user *mem, unsigned long len); 220__must_check unsigned long __clear_user(void __user *mem, unsigned long len);
178 221
179__must_check long __copy_from_user_inatomic(void *dst, const void __user *src, 222static __must_check __always_inline int
180 unsigned size); 223__copy_from_user_inatomic(void *dst, const void __user *src, unsigned size)
224{
225 return copy_user_generic(dst, (__force const void *)src, size);
226}
181 227
182static __must_check __always_inline int 228static __must_check __always_inline int
183__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size) 229__copy_to_user_inatomic(void __user *dst, const void *src, unsigned size)
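On 64-bit, copy_user_generic() becomes an inline dispatcher built on alternative_call(): at boot the call target is patched to the "rep movs" string variant on CPUs advertising X86_FEATURE_REP_GOOD and to the unrolled copy otherwise, with the output operands and clobber list describing the registers either implementation may modify. Callers see no difference: copy_from_user()/copy_to_user() still return the number of bytes left uncopied, and copy_from_user() now also gains the same object-size check as the 32-bit side. Illustrative caller:

#include <linux/uaccess.h>
#include <linux/errno.h>

static long example_fetch(void *dst, const void __user *src, unsigned long len)
{
	if (copy_from_user(dst, src, len))
		return -EFAULT;		/* some bytes could not be copied */
	return 0;
}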
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index f9b507f30d65..4f61e8b0715a 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -342,14 +342,15 @@
342#define __NR_pwritev 334 342#define __NR_pwritev 334
343#define __NR_rt_tgsigqueueinfo 335 343#define __NR_rt_tgsigqueueinfo 335
344#define __NR_perf_event_open 336 344#define __NR_perf_event_open 336
345#define __NR_recvmmsg 337
345 346
346#define __NR_LITMUS 337 347#define __NR_LITMUS 338
347 348
348#include "litmus/unistd_32.h" 349#include "litmus/unistd_32.h"
349 350
350#ifdef __KERNEL__ 351#ifdef __KERNEL__
351 352
352#define NR_syscalls 336 + NR_litmus_syscalls 353#define NR_syscalls 339 + NR_litmus_syscalls
353 354
354#define __ARCH_WANT_IPC_PARSE_VERSION 355#define __ARCH_WANT_IPC_PARSE_VERSION
355#define __ARCH_WANT_OLD_READDIR 356#define __ARCH_WANT_OLD_READDIR
@@ -357,6 +358,7 @@
357#define __ARCH_WANT_STAT64 358#define __ARCH_WANT_STAT64
358#define __ARCH_WANT_SYS_ALARM 359#define __ARCH_WANT_SYS_ALARM
359#define __ARCH_WANT_SYS_GETHOSTNAME 360#define __ARCH_WANT_SYS_GETHOSTNAME
361#define __ARCH_WANT_SYS_IPC
360#define __ARCH_WANT_SYS_PAUSE 362#define __ARCH_WANT_SYS_PAUSE
361#define __ARCH_WANT_SYS_SGETMASK 363#define __ARCH_WANT_SYS_SGETMASK
362#define __ARCH_WANT_SYS_SIGNAL 364#define __ARCH_WANT_SYS_SIGNAL
@@ -369,6 +371,9 @@
369#define __ARCH_WANT_SYS_LLSEEK 371#define __ARCH_WANT_SYS_LLSEEK
370#define __ARCH_WANT_SYS_NICE 372#define __ARCH_WANT_SYS_NICE
371#define __ARCH_WANT_SYS_OLD_GETRLIMIT 373#define __ARCH_WANT_SYS_OLD_GETRLIMIT
374#define __ARCH_WANT_SYS_OLD_UNAME
375#define __ARCH_WANT_SYS_OLD_MMAP
376#define __ARCH_WANT_SYS_OLD_SELECT
372#define __ARCH_WANT_SYS_OLDUMOUNT 377#define __ARCH_WANT_SYS_OLDUMOUNT
373#define __ARCH_WANT_SYS_SIGPENDING 378#define __ARCH_WANT_SYS_SIGPENDING
374#define __ARCH_WANT_SYS_SIGPROCMASK 379#define __ARCH_WANT_SYS_SIGPROCMASK
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 33b2003c0450..b21c3b269aac 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -146,7 +146,7 @@ __SYSCALL(__NR_wait4, sys_wait4)
146#define __NR_kill 62 146#define __NR_kill 62
147__SYSCALL(__NR_kill, sys_kill) 147__SYSCALL(__NR_kill, sys_kill)
148#define __NR_uname 63 148#define __NR_uname 63
149__SYSCALL(__NR_uname, sys_uname) 149__SYSCALL(__NR_uname, sys_newuname)
150 150
151#define __NR_semget 64 151#define __NR_semget 64
152__SYSCALL(__NR_semget, sys_semget) 152__SYSCALL(__NR_semget, sys_semget)
@@ -661,6 +661,8 @@ __SYSCALL(__NR_pwritev, sys_pwritev)
661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) 661__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo)
662#define __NR_perf_event_open 298 662#define __NR_perf_event_open 298
663__SYSCALL(__NR_perf_event_open, sys_perf_event_open) 663__SYSCALL(__NR_perf_event_open, sys_perf_event_open)
664#define __NR_recvmmsg 299
665__SYSCALL(__NR_recvmmsg, sys_recvmmsg)
664 666
665#define __NR_LITMUS 299 667#define __NR_LITMUS 299
666 668
@@ -682,6 +684,7 @@ __SYSCALL(__NR_perf_event_open, sys_perf_event_open)
682#define __ARCH_WANT_SYS_LLSEEK 684#define __ARCH_WANT_SYS_LLSEEK
683#define __ARCH_WANT_SYS_NICE 685#define __ARCH_WANT_SYS_NICE
684#define __ARCH_WANT_SYS_OLD_GETRLIMIT 686#define __ARCH_WANT_SYS_OLD_GETRLIMIT
687#define __ARCH_WANT_SYS_OLD_UNAME
685#define __ARCH_WANT_SYS_OLDUMOUNT 688#define __ARCH_WANT_SYS_OLDUMOUNT
686#define __ARCH_WANT_SYS_SIGPENDING 689#define __ARCH_WANT_SYS_SIGPENDING
687#define __ARCH_WANT_SYS_SIGPROCMASK 690#define __ARCH_WANT_SYS_SIGPROCMASK
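The __SYSCALL(nr, sym) lines in this header are expanded twice to build the table that <asm/syscall.h> now declares. The sketch below mirrors arch/x86/kernel/syscall_64.c of this era and should be read as an assumption-laden illustration: __NR_syscall_max comes from asm-offsets, sys_ni_syscall is the -ENOSYS stub, and the include guard has to be undefined by hand so the header expands on each pass.

#include <linux/linkage.h>
#include <asm/asm-offsets.h>		/* assumed source of __NR_syscall_max */

#define __SYSCALL(nr, sym) extern asmlinkage void sym(void);
#undef _ASM_X86_UNISTD_64_H		/* defeat the include guard */
#include <asm/unistd_64.h>		/* first pass: prototypes */

#undef __SYSCALL
#define __SYSCALL(nr, sym) [nr] = sym,
#undef _ASM_X86_UNISTD_64_H

typedef void (*sys_call_ptr_t)(void);
extern void sys_ni_syscall(void);

const sys_call_ptr_t example_sys_call_table[__NR_syscall_max + 1] = {
	[0 ... __NR_syscall_max] = &sys_ni_syscall,	/* default: -ENOSYS */
#include <asm/unistd_64.h>		/* second pass: table initializers */
};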
diff --git a/arch/x86/include/asm/user.h b/arch/x86/include/asm/user.h
index 999873b22e7f..24532c7da3d6 100644
--- a/arch/x86/include/asm/user.h
+++ b/arch/x86/include/asm/user.h
@@ -1,5 +1,63 @@
1#ifndef _ASM_X86_USER_H
2#define _ASM_X86_USER_H
3
1#ifdef CONFIG_X86_32 4#ifdef CONFIG_X86_32
2# include "user_32.h" 5# include "user_32.h"
3#else 6#else
4# include "user_64.h" 7# include "user_64.h"
5#endif 8#endif
9
10#include <asm/types.h>
11
12struct user_ymmh_regs {
13 /* 16 * 16 bytes for each YMMH-reg */
14 __u32 ymmh_space[64];
15};
16
17struct user_xsave_hdr {
18 __u64 xstate_bv;
19 __u64 reserved1[2];
20 __u64 reserved2[5];
21};
22
23/*
24 * The structure layout of user_xstateregs, used for exporting the
25 * extended register state through ptrace and core-dump (NT_X86_XSTATE note)
26 * interfaces will be same as the memory layout of xsave used by the processor
27 * (except for the bytes 464..511, which can be used by the software) and hence
28 * the size of this structure varies depending on the features supported by the
29 * processor and OS. The size of the structure that users need to use can be
30 * obtained by doing:
31 * cpuid_count(0xd, 0, &eax, &ptrace_xstateregs_struct_size, &ecx, &edx);
32 * i.e., cpuid.(eax=0xd,ecx=0).ebx will be the size that user (debuggers, etc.)
33 * need to use.
34 *
35 * For now, only the first 8 bytes of the software usable bytes[464..471] will
36 * be used and will be set to OS enabled xstate mask (which is same as the
37 * 64bit mask returned by the xgetbv's xCR0). Users (analyzing core dump
38 * remotely, etc.) can use this mask as well as the mask saved in the
39 * xstate_hdr bytes and interpret what states the processor/OS supports
40 * and what states are in modified/initialized conditions for the
41 * particular process/thread.
42 *
43 * Also when the user modifies certain state FP/SSE/etc through the
44 * ptrace interface, they must ensure that the xsave_hdr.xstate_bv
45 * bytes[512..519] of the memory layout are updated correspondingly.
46 * i.e., for example when FP state is modified to a non-init state,
47 * xsave_hdr.xstate_bv's bit 0 must be set to '1', when SSE is modified to
48 * non-init state, xsave_hdr.xstate_bv's bit 1 must to be set to '1', etc.
49 */
50#define USER_XSTATE_FX_SW_WORDS 6
51#define USER_XSTATE_XCR0_WORD 0
52
53struct user_xstateregs {
54 struct {
55 __u64 fpx_space[58];
56 __u64 xstate_fx_sw[USER_XSTATE_FX_SW_WORDS];
57 } i387;
58 struct user_xsave_hdr xsave_hdr;
59 struct user_ymmh_regs ymmh;
60 /* further processor state extensions go here */
61};
62
63#endif /* _ASM_X86_USER_H */
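The comment above spells out how a ptrace/core-dump consumer is supposed to size struct user_xstateregs: CPUID leaf 0xD, sub-leaf 0, returns in EBX the xsave area size for the currently enabled xstate features. A userspace-side sketch for a debugger or core-dump analyzer (it assumes the CPU actually reports leaf 0xD):

#include <stdio.h>

static void cpuid_count(unsigned int leaf, unsigned int subleaf,
			unsigned int *a, unsigned int *b,
			unsigned int *c, unsigned int *d)
{
	asm volatile("cpuid"
		     : "=a" (*a), "=b" (*b), "=c" (*c), "=d" (*d)
		     : "0" (leaf), "2" (subleaf));
}

int main(void)
{
	unsigned int eax, ebx, ecx, edx;

	cpuid_count(0x0d, 0, &eax, &ebx, &ecx, &edx);
	printf("NT_X86_XSTATE regset size: %u bytes\n", ebx);
	return 0;
}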
diff --git a/arch/x86/include/asm/uv/bios.h b/arch/x86/include/asm/uv/bios.h
index 7ed17ff502b9..71605c7d5c5c 100644
--- a/arch/x86/include/asm/uv/bios.h
+++ b/arch/x86/include/asm/uv/bios.h
@@ -18,8 +18,8 @@
18 * along with this program; if not, write to the Free Software 18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 * 20 *
21 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 21 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
22 * Copyright (c) Russ Anderson 22 * Copyright (c) Russ Anderson <rja@sgi.com>
23 */ 23 */
24 24
25#include <linux/rtc.h> 25#include <linux/rtc.h>
@@ -36,7 +36,8 @@ enum uv_bios_cmd {
36 UV_BIOS_WATCHLIST_ALLOC, 36 UV_BIOS_WATCHLIST_ALLOC,
37 UV_BIOS_WATCHLIST_FREE, 37 UV_BIOS_WATCHLIST_FREE,
38 UV_BIOS_MEMPROTECT, 38 UV_BIOS_MEMPROTECT,
39 UV_BIOS_GET_PARTITION_ADDR 39 UV_BIOS_GET_PARTITION_ADDR,
40 UV_BIOS_SET_LEGACY_VGA_TARGET
40}; 41};
41 42
42/* 43/*
@@ -76,15 +77,6 @@ union partition_info_u {
76 }; 77 };
77}; 78};
78 79
79union uv_watchlist_u {
80 u64 val;
81 struct {
82 u64 blade : 16,
83 size : 32,
84 filler : 16;
85 };
86};
87
88enum uv_memprotect { 80enum uv_memprotect {
89 UV_MEMPROT_RESTRICT_ACCESS, 81 UV_MEMPROT_RESTRICT_ACCESS,
90 UV_MEMPROT_ALLOW_AMO, 82 UV_MEMPROT_ALLOW_AMO,
@@ -98,13 +90,14 @@ extern s64 uv_bios_call(enum uv_bios_cmd, u64, u64, u64, u64, u64);
98extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64); 90extern s64 uv_bios_call_irqsave(enum uv_bios_cmd, u64, u64, u64, u64, u64);
99extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64); 91extern s64 uv_bios_call_reentrant(enum uv_bios_cmd, u64, u64, u64, u64, u64);
100 92
101extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *); 93extern s64 uv_bios_get_sn_info(int, int *, long *, long *, long *, long *);
102extern s64 uv_bios_freq_base(u64, u64 *); 94extern s64 uv_bios_freq_base(u64, u64 *);
103extern int uv_bios_mq_watchlist_alloc(int, unsigned long, unsigned int, 95extern int uv_bios_mq_watchlist_alloc(unsigned long, unsigned int,
104 unsigned long *); 96 unsigned long *);
105extern int uv_bios_mq_watchlist_free(int, int); 97extern int uv_bios_mq_watchlist_free(int, int);
106extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect); 98extern s64 uv_bios_change_memprotect(u64, u64, enum uv_memprotect);
107extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *); 99extern s64 uv_bios_reserved_page_pa(u64, u64 *, u64 *, u64 *);
100extern int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus);
108 101
109extern void uv_bios_init(void); 102extern void uv_bios_init(void);
110 103
@@ -113,6 +106,7 @@ extern int uv_type;
113extern long sn_partition_id; 106extern long sn_partition_id;
114extern long sn_coherency_id; 107extern long sn_coherency_id;
115extern long sn_region_size; 108extern long sn_region_size;
109extern long system_serial_number;
116#define partition_coherence_id() (sn_coherency_id) 110#define partition_coherence_id() (sn_coherency_id)
117 111
118extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */ 112extern struct kobject *sgi_uv_kobj; /* /sys/firmware/sgi_uv */
diff --git a/arch/x86/include/asm/uv/uv.h b/arch/x86/include/asm/uv/uv.h
index c0a01b5d985b..3bb9491b7659 100644
--- a/arch/x86/include/asm/uv/uv.h
+++ b/arch/x86/include/asm/uv/uv.h
@@ -11,6 +11,7 @@ struct mm_struct;
11extern enum uv_system_type get_uv_system_type(void); 11extern enum uv_system_type get_uv_system_type(void);
12extern int is_uv_system(void); 12extern int is_uv_system(void);
13extern void uv_cpu_init(void); 13extern void uv_cpu_init(void);
14extern void uv_nmi_init(void);
14extern void uv_system_init(void); 15extern void uv_system_init(void);
15extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, 16extern const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
16 struct mm_struct *mm, 17 struct mm_struct *mm,
diff --git a/arch/x86/include/asm/uv/uv_bau.h b/arch/x86/include/asm/uv/uv_bau.h
index 80e2984f521c..b414d2b401f6 100644
--- a/arch/x86/include/asm/uv/uv_bau.h
+++ b/arch/x86/include/asm/uv/uv_bau.h
@@ -55,7 +55,7 @@
55#define DESC_STATUS_SOURCE_TIMEOUT 3 55#define DESC_STATUS_SOURCE_TIMEOUT 3
56 56
57/* 57/*
58 * source side threshholds at which message retries print a warning 58 * source side thresholds at which message retries print a warning
59 */ 59 */
60#define SOURCE_TIMEOUT_LIMIT 20 60#define SOURCE_TIMEOUT_LIMIT 20
61#define DESTINATION_TIMEOUT_LIMIT 20 61#define DESTINATION_TIMEOUT_LIMIT 20
diff --git a/arch/x86/include/asm/uv/uv_hub.h b/arch/x86/include/asm/uv/uv_hub.h
index d1414af98559..14cc74ba5d23 100644
--- a/arch/x86/include/asm/uv/uv_hub.h
+++ b/arch/x86/include/asm/uv/uv_hub.h
@@ -31,20 +31,20 @@
31 * contiguous (although various IO spaces may punch holes in 31 * contiguous (although various IO spaces may punch holes in
32 * it).. 32 * it)..
33 * 33 *
34 * N - Number of bits in the node portion of a socket physical 34 * N - Number of bits in the node portion of a socket physical
35 * address. 35 * address.
36 * 36 *
37 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of 37 * NASID - network ID of a router, Mbrick or Cbrick. Nasid values of
38 * routers always have low bit of 1, C/MBricks have low bit 38 * routers always have low bit of 1, C/MBricks have low bit
39 * equal to 0. Most addressing macros that target UV hub chips 39 * equal to 0. Most addressing macros that target UV hub chips
40 * right shift the NASID by 1 to exclude the always-zero bit. 40 * right shift the NASID by 1 to exclude the always-zero bit.
41 * NASIDs contain up to 15 bits. 41 * NASIDs contain up to 15 bits.
42 * 42 *
43 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead 43 * GNODE - NASID right shifted by 1 bit. Most mmrs contain gnodes instead
44 * of nasids. 44 * of nasids.
45 * 45 *
46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant 46 * PNODE - the low N bits of the GNODE. The PNODE is the most useful variant
47 * of the nasid for socket usage. 47 * of the nasid for socket usage.
48 * 48 *
49 * 49 *
50 * NumaLink Global Physical Address Format: 50 * NumaLink Global Physical Address Format:
@@ -71,12 +71,12 @@
71 * 71 *
72 * 72 *
73 * APICID format 73 * APICID format
74 * NOTE!!!!!! This is the current format of the APICID. However, code 74 * NOTE!!!!!! This is the current format of the APICID. However, code
75 * should assume that this will change in the future. Use functions 75 * should assume that this will change in the future. Use functions
76 * in this file for all APICID bit manipulations and conversion. 76 * in this file for all APICID bit manipulations and conversion.
77 * 77 *
78 * 1111110000000000 78 * 1111110000000000
79 * 5432109876543210 79 * 5432109876543210
80 * pppppppppplc0cch 80 * pppppppppplc0cch
81 * sssssssssss 81 * sssssssssss
82 * 82 *
@@ -89,9 +89,9 @@
89 * Note: Processor only supports 12 bits in the APICID register. The ACPI 89 * Note: Processor only supports 12 bits in the APICID register. The ACPI
90 * tables hold all 16 bits. Software needs to be aware of this. 90 * tables hold all 16 bits. Software needs to be aware of this.
91 * 91 *
92 * Unless otherwise specified, all references to APICID refer to 92 * Unless otherwise specified, all references to APICID refer to
93 * the FULL value contained in ACPI tables, not the subset in the 93 * the FULL value contained in ACPI tables, not the subset in the
94 * processor APICID register. 94 * processor APICID register.
95 */ 95 */
96 96
97 97
@@ -151,16 +151,16 @@ struct uv_hub_info_s {
151}; 151};
152 152
153DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info); 153DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
154#define uv_hub_info (&__get_cpu_var(__uv_hub_info)) 154#define uv_hub_info (&__get_cpu_var(__uv_hub_info))
155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu)) 155#define uv_cpu_hub_info(cpu) (&per_cpu(__uv_hub_info, cpu))
156 156
157/* 157/*
158 * Local & Global MMR space macros. 158 * Local & Global MMR space macros.
159 * Note: macros are intended to be used ONLY by inline functions 159 * Note: macros are intended to be used ONLY by inline functions
160 * in this file - not by other kernel code. 160 * in this file - not by other kernel code.
161 * n - NASID (full 15-bit global nasid) 161 * n - NASID (full 15-bit global nasid)
162 * g - GNODE (full 15-bit global nasid, right shifted 1) 162 * g - GNODE (full 15-bit global nasid, right shifted 1)
163 * p - PNODE (local part of nsids, right shifted 1) 163 * p - PNODE (local part of nsids, right shifted 1)
164 */ 164 */
165#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask) 165#define UV_NASID_TO_PNODE(n) (((n) >> 1) & uv_hub_info->pnode_mask)
166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra) 166#define UV_PNODE_TO_GNODE(p) ((p) |uv_hub_info->gnode_extra)
@@ -172,6 +172,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
172#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024) 172#define UV_LOCAL_MMR_SIZE (64UL * 1024 * 1024)
173#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024) 173#define UV_GLOBAL_MMR32_SIZE (64UL * 1024 * 1024)
174 174
175#define UV_GLOBAL_GRU_MMR_BASE 0x4000000
176
175#define UV_GLOBAL_MMR32_PNODE_SHIFT 15 177#define UV_GLOBAL_MMR32_PNODE_SHIFT 15
176#define UV_GLOBAL_MMR64_PNODE_SHIFT 26 178#define UV_GLOBAL_MMR64_PNODE_SHIFT 26
177 179
@@ -213,8 +215,8 @@ DECLARE_PER_CPU(struct uv_hub_info_s, __uv_hub_info);
213/* 215/*
214 * Macros for converting between kernel virtual addresses, socket local physical 216 * Macros for converting between kernel virtual addresses, socket local physical
215 * addresses, and UV global physical addresses. 217 * addresses, and UV global physical addresses.
216 * Note: use the standard __pa() & __va() macros for converting 218 * Note: use the standard __pa() & __va() macros for converting
217 * between socket virtual and socket physical addresses. 219 * between socket virtual and socket physical addresses.
218 */ 220 */
219 221
220/* socket phys RAM --> UV global physical address */ 222/* socket phys RAM --> UV global physical address */
@@ -232,6 +234,26 @@ static inline unsigned long uv_gpa(void *v)
232 return uv_soc_phys_ram_to_gpa(__pa(v)); 234 return uv_soc_phys_ram_to_gpa(__pa(v));
233} 235}
234 236
237/* Top two bits indicate the requested address is in MMR space. */
238static inline int
239uv_gpa_in_mmr_space(unsigned long gpa)
240{
241 return (gpa >> 62) == 0x3UL;
242}
243
244/* UV global physical address --> socket phys RAM */
245static inline unsigned long uv_gpa_to_soc_phys_ram(unsigned long gpa)
246{
247 unsigned long paddr = gpa & uv_hub_info->gpa_mask;
248 unsigned long remap_base = uv_hub_info->lowmem_remap_base;
249 unsigned long remap_top = uv_hub_info->lowmem_remap_top;
250
251 if (paddr >= remap_base && paddr < remap_base + remap_top)
252 paddr -= remap_base;
253 return paddr;
254}
255
256
235/* gnode -> pnode */ 257/* gnode -> pnode */
236static inline unsigned long uv_gpa_to_gnode(unsigned long gpa) 258static inline unsigned long uv_gpa_to_gnode(unsigned long gpa)
237{ 259{
@@ -265,21 +287,18 @@ static inline int uv_apicid_to_pnode(int apicid)
265 * Access global MMRs using the low memory MMR32 space. This region supports 287 * Access global MMRs using the low memory MMR32 space. This region supports
266 * faster MMR access but not all MMRs are accessible in this space. 288 * faster MMR access but not all MMRs are accessible in this space.
267 */ 289 */
268static inline unsigned long *uv_global_mmr32_address(int pnode, 290static inline unsigned long *uv_global_mmr32_address(int pnode, unsigned long offset)
269 unsigned long offset)
270{ 291{
271 return __va(UV_GLOBAL_MMR32_BASE | 292 return __va(UV_GLOBAL_MMR32_BASE |
272 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset); 293 UV_GLOBAL_MMR32_PNODE_BITS(pnode) | offset);
273} 294}
274 295
275static inline void uv_write_global_mmr32(int pnode, unsigned long offset, 296static inline void uv_write_global_mmr32(int pnode, unsigned long offset, unsigned long val)
276 unsigned long val)
277{ 297{
278 writeq(val, uv_global_mmr32_address(pnode, offset)); 298 writeq(val, uv_global_mmr32_address(pnode, offset));
279} 299}
280 300
281static inline unsigned long uv_read_global_mmr32(int pnode, 301static inline unsigned long uv_read_global_mmr32(int pnode, unsigned long offset)
282 unsigned long offset)
283{ 302{
284 return readq(uv_global_mmr32_address(pnode, offset)); 303 return readq(uv_global_mmr32_address(pnode, offset));
285} 304}
@@ -288,26 +307,43 @@ static inline unsigned long uv_read_global_mmr32(int pnode,
288 * Access Global MMR space using the MMR space located at the top of physical 307 * Access Global MMR space using the MMR space located at the top of physical
289 * memory. 308 * memory.
290 */ 309 */
291static inline unsigned long *uv_global_mmr64_address(int pnode, 310static inline unsigned long *uv_global_mmr64_address(int pnode, unsigned long offset)
292 unsigned long offset)
293{ 311{
294 return __va(UV_GLOBAL_MMR64_BASE | 312 return __va(UV_GLOBAL_MMR64_BASE |
295 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset); 313 UV_GLOBAL_MMR64_PNODE_BITS(pnode) | offset);
296} 314}
297 315
298static inline void uv_write_global_mmr64(int pnode, unsigned long offset, 316static inline void uv_write_global_mmr64(int pnode, unsigned long offset, unsigned long val)
299 unsigned long val)
300{ 317{
301 writeq(val, uv_global_mmr64_address(pnode, offset)); 318 writeq(val, uv_global_mmr64_address(pnode, offset));
302} 319}
303 320
304static inline unsigned long uv_read_global_mmr64(int pnode, 321static inline unsigned long uv_read_global_mmr64(int pnode, unsigned long offset)
305 unsigned long offset)
306{ 322{
307 return readq(uv_global_mmr64_address(pnode, offset)); 323 return readq(uv_global_mmr64_address(pnode, offset));
308} 324}
309 325
310/* 326/*
327 * Global MMR space addresses when referenced by the GRU. (GRU does
328 * NOT use socket addressing).
329 */
330static inline unsigned long uv_global_gru_mmr_address(int pnode, unsigned long offset)
331{
332 return UV_GLOBAL_GRU_MMR_BASE | offset |
333 ((unsigned long)pnode << uv_hub_info->m_val);
334}
335
336static inline void uv_write_global_mmr8(int pnode, unsigned long offset, unsigned char val)
337{
338 writeb(val, uv_global_mmr64_address(pnode, offset));
339}
340
341static inline unsigned char uv_read_global_mmr8(int pnode, unsigned long offset)
342{
343 return readb(uv_global_mmr64_address(pnode, offset));
344}
345
346/*
311 * Access hub local MMRs. Faster than using global space but only local MMRs 347 * Access hub local MMRs. Faster than using global space but only local MMRs
312 * are accessible. 348 * are accessible.
313 */ 349 */
@@ -426,14 +462,28 @@ static inline void uv_set_scir_bits(unsigned char value)
426 } 462 }
427} 463}
428 464
465static inline unsigned long uv_scir_offset(int apicid)
466{
467 return SCIR_LOCAL_MMR_BASE | (apicid & 0x3f);
468}
469
429static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value) 470static inline void uv_set_cpu_scir_bits(int cpu, unsigned char value)
430{ 471{
431 if (uv_cpu_hub_info(cpu)->scir.state != value) { 472 if (uv_cpu_hub_info(cpu)->scir.state != value) {
473 uv_write_global_mmr8(uv_cpu_to_pnode(cpu),
474 uv_cpu_hub_info(cpu)->scir.offset, value);
432 uv_cpu_hub_info(cpu)->scir.state = value; 475 uv_cpu_hub_info(cpu)->scir.state = value;
433 uv_write_local_mmr8(uv_cpu_hub_info(cpu)->scir.offset, value);
434 } 476 }
435} 477}
436 478
479static unsigned long uv_hub_ipi_value(int apicid, int vector, int mode)
480{
481 return (1UL << UVH_IPI_INT_SEND_SHFT) |
482 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
483 (mode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
484 (vector << UVH_IPI_INT_VECTOR_SHFT);
485}
486
437static inline void uv_hub_send_ipi(int pnode, int apicid, int vector) 487static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
438{ 488{
439 unsigned long val; 489 unsigned long val;
@@ -442,12 +492,21 @@ static inline void uv_hub_send_ipi(int pnode, int apicid, int vector)
442 if (vector == NMI_VECTOR) 492 if (vector == NMI_VECTOR)
443 dmode = dest_NMI; 493 dmode = dest_NMI;
444 494
445 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 495 val = uv_hub_ipi_value(apicid, vector, dmode);
446 ((apicid) << UVH_IPI_INT_APIC_ID_SHFT) |
447 (dmode << UVH_IPI_INT_DELIVERY_MODE_SHFT) |
448 (vector << UVH_IPI_INT_VECTOR_SHFT);
449 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 496 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
450} 497}
451 498
499/*
500 * Get the minimum revision number of the hub chips within the partition.
501 * 1 - initial rev 1.0 silicon
502 * 2 - rev 2.0 production silicon
503 */
504static inline int uv_get_min_hub_revision_id(void)
505{
506 extern int uv_min_hub_revision_id;
507
508 return uv_min_hub_revision_id;
509}
510
452#endif /* CONFIG_X86_64 */ 511#endif /* CONFIG_X86_64 */
453#endif /* _ASM_X86_UV_UV_HUB_H */ 512#endif /* _ASM_X86_UV_UV_HUB_H */
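uv_hub_ipi_value() above factors the IPI encoding out of uv_hub_send_ipi(): the value depends only on apicid, vector and delivery mode, so it can be composed once and written to any number of pnodes. A hedged, illustrative helper (the in-tree user remains the single-target uv_hub_send_ipi() shown above):

static inline void example_uv_ipi_pnodes(const int *pnode, int count,
					 int apicid, int vector, int mode)
{
	unsigned long val = uv_hub_ipi_value(apicid, vector, mode);
	int i;

	for (i = 0; i < count; i++)
		uv_write_global_mmr64(pnode[i], UVH_IPI_INT, val);
}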
diff --git a/arch/x86/include/asm/uv/uv_irq.h b/arch/x86/include/asm/uv/uv_irq.h
index 9613c8c0b647..d6b17c760622 100644
--- a/arch/x86/include/asm/uv/uv_irq.h
+++ b/arch/x86/include/asm/uv/uv_irq.h
@@ -25,12 +25,14 @@ struct uv_IO_APIC_route_entry {
25 dest : 32; 25 dest : 32;
26}; 26};
27 27
28extern struct irq_chip uv_irq_chip; 28enum {
29 29 UV_AFFINITY_ALL,
30extern int arch_enable_uv_irq(char *, unsigned int, int, int, unsigned long); 30 UV_AFFINITY_NODE,
31extern void arch_disable_uv_irq(int, unsigned long); 31 UV_AFFINITY_CPU
32};
32 33
33extern int uv_setup_irq(char *, int, int, unsigned long); 34extern int uv_irq_2_mmr_info(int, unsigned long *, int *);
34extern void uv_teardown_irq(unsigned int, int, unsigned long); 35extern int uv_setup_irq(char *, int, int, unsigned long, int);
36extern void uv_teardown_irq(unsigned int);
35 37
36#endif /* _ASM_X86_UV_UV_IRQ_H */ 38#endif /* _ASM_X86_UV_UV_IRQ_H */
diff --git a/arch/x86/include/asm/visws/cobalt.h b/arch/x86/include/asm/visws/cobalt.h
index 166adf61e770..2edb37637ead 100644
--- a/arch/x86/include/asm/visws/cobalt.h
+++ b/arch/x86/include/asm/visws/cobalt.h
@@ -122,4 +122,6 @@ extern char visws_board_type;
122 122
123extern char visws_board_rev; 123extern char visws_board_rev;
124 124
125extern int pci_visws_init(void);
126
125#endif /* _ASM_X86_VISWS_COBALT_H */ 127#endif /* _ASM_X86_VISWS_COBALT_H */
diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index 272514c2d456..fb9a080740ec 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -53,9 +53,11 @@
53 */ 53 */
54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001 54#define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002 55#define SECONDARY_EXEC_ENABLE_EPT 0x00000002
56#define SECONDARY_EXEC_RDTSCP 0x00000008
56#define SECONDARY_EXEC_ENABLE_VPID 0x00000020 57#define SECONDARY_EXEC_ENABLE_VPID 0x00000020
57#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040 58#define SECONDARY_EXEC_WBINVD_EXITING 0x00000040
58#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080 59#define SECONDARY_EXEC_UNRESTRICTED_GUEST 0x00000080
60#define SECONDARY_EXEC_PAUSE_LOOP_EXITING 0x00000400
59 61
60 62
61#define PIN_BASED_EXT_INTR_MASK 0x00000001 63#define PIN_BASED_EXT_INTR_MASK 0x00000001
@@ -144,6 +146,8 @@ enum vmcs_field {
144 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a, 146 VM_ENTRY_INSTRUCTION_LEN = 0x0000401a,
145 TPR_THRESHOLD = 0x0000401c, 147 TPR_THRESHOLD = 0x0000401c,
146 SECONDARY_VM_EXEC_CONTROL = 0x0000401e, 148 SECONDARY_VM_EXEC_CONTROL = 0x0000401e,
149 PLE_GAP = 0x00004020,
150 PLE_WINDOW = 0x00004022,
147 VM_INSTRUCTION_ERROR = 0x00004400, 151 VM_INSTRUCTION_ERROR = 0x00004400,
148 VM_EXIT_REASON = 0x00004402, 152 VM_EXIT_REASON = 0x00004402,
149 VM_EXIT_INTR_INFO = 0x00004404, 153 VM_EXIT_INTR_INFO = 0x00004404,
@@ -248,6 +252,8 @@ enum vmcs_field {
248#define EXIT_REASON_MSR_READ 31 252#define EXIT_REASON_MSR_READ 31
249#define EXIT_REASON_MSR_WRITE 32 253#define EXIT_REASON_MSR_WRITE 32
250#define EXIT_REASON_MWAIT_INSTRUCTION 36 254#define EXIT_REASON_MWAIT_INSTRUCTION 36
255#define EXIT_REASON_MONITOR_INSTRUCTION 39
256#define EXIT_REASON_PAUSE_INSTRUCTION 40
251#define EXIT_REASON_MCE_DURING_VMENTRY 41 257#define EXIT_REASON_MCE_DURING_VMENTRY 41
252#define EXIT_REASON_TPR_BELOW_THRESHOLD 43 258#define EXIT_REASON_TPR_BELOW_THRESHOLD 43
253#define EXIT_REASON_APIC_ACCESS 44 259#define EXIT_REASON_APIC_ACCESS 44
@@ -358,6 +364,7 @@ enum vmcs_field {
358#define VMX_EPTP_UC_BIT (1ull << 8) 364#define VMX_EPTP_UC_BIT (1ull << 8)
359#define VMX_EPTP_WB_BIT (1ull << 14) 365#define VMX_EPTP_WB_BIT (1ull << 14)
360#define VMX_EPT_2MB_PAGE_BIT (1ull << 16) 366#define VMX_EPT_2MB_PAGE_BIT (1ull << 16)
367#define VMX_EPT_1GB_PAGE_BIT (1ull << 17)
361#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24) 368#define VMX_EPT_EXTENT_INDIVIDUAL_BIT (1ull << 24)
362#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25) 369#define VMX_EPT_EXTENT_CONTEXT_BIT (1ull << 25)
363#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26) 370#define VMX_EPT_EXTENT_GLOBAL_BIT (1ull << 26)
@@ -370,7 +377,7 @@ enum vmcs_field {
370#define VMX_EPT_READABLE_MASK 0x1ull 377#define VMX_EPT_READABLE_MASK 0x1ull
371#define VMX_EPT_WRITABLE_MASK 0x2ull 378#define VMX_EPT_WRITABLE_MASK 0x2ull
372#define VMX_EPT_EXECUTABLE_MASK 0x4ull 379#define VMX_EPT_EXECUTABLE_MASK 0x4ull
373#define VMX_EPT_IGMT_BIT (1ull << 6) 380#define VMX_EPT_IPAT_BIT (1ull << 6)
374 381
375#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul 382#define VMX_EPT_IDENTITY_PAGETABLE_ADDR 0xfffbc000ul
376 383
diff --git a/arch/x86/include/asm/x86_init.h b/arch/x86/include/asm/x86_init.h
index 2c756fd4ab0e..519b54327d75 100644
--- a/arch/x86/include/asm/x86_init.h
+++ b/arch/x86/include/asm/x86_init.h
@@ -26,7 +26,7 @@ struct x86_init_mpparse {
26 void (*smp_read_mpc_oem)(struct mpc_table *mpc); 26 void (*smp_read_mpc_oem)(struct mpc_table *mpc);
27 void (*mpc_oem_pci_bus)(struct mpc_bus *m); 27 void (*mpc_oem_pci_bus)(struct mpc_bus *m);
28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name); 28 void (*mpc_oem_bus_info)(struct mpc_bus *m, char *name);
29 void (*find_smp_config)(unsigned int reserve); 29 void (*find_smp_config)(void);
30 void (*get_smp_config)(unsigned int early); 30 void (*get_smp_config)(unsigned int early);
31}; 31};
32 32
@@ -91,6 +91,28 @@ struct x86_init_timers {
91}; 91};
92 92
93/** 93/**
94 * struct x86_init_iommu - platform specific iommu setup
95 * @iommu_init: platform specific iommu setup
96 */
97struct x86_init_iommu {
98 int (*iommu_init)(void);
99};
100
101/**
102 * struct x86_init_pci - platform specific pci init functions
103 * @arch_init: platform specific pci arch init call
104 * @init: platform specific pci subsystem init
105 * @init_irq: platform specific pci irq init
106 * @fixup_irqs: platform specific pci irq fixup
107 */
108struct x86_init_pci {
109 int (*arch_init)(void);
110 int (*init)(void);
111 void (*init_irq)(void);
112 void (*fixup_irqs)(void);
113};
114
115/**
94 * struct x86_init_ops - functions for platform specific setup 116 * struct x86_init_ops - functions for platform specific setup
95 * 117 *
96 */ 118 */
@@ -101,6 +123,8 @@ struct x86_init_ops {
101 struct x86_init_oem oem; 123 struct x86_init_oem oem;
102 struct x86_init_paging paging; 124 struct x86_init_paging paging;
103 struct x86_init_timers timers; 125 struct x86_init_timers timers;
126 struct x86_init_iommu iommu;
127 struct x86_init_pci pci;
104}; 128};
105 129
106/** 130/**
@@ -116,11 +140,16 @@ struct x86_cpuinit_ops {
116 * @calibrate_tsc: calibrate TSC 140 * @calibrate_tsc: calibrate TSC
117 * @get_wallclock: get time from HW clock like RTC etc. 141 * @get_wallclock: get time from HW clock like RTC etc.
118 * @set_wallclock: set time back to HW clock 142 * @set_wallclock: set time back to HW clock
143 * @is_untracked_pat_range exclude from PAT logic
144 * @nmi_init enable NMI on cpus
119 */ 145 */
120struct x86_platform_ops { 146struct x86_platform_ops {
121 unsigned long (*calibrate_tsc)(void); 147 unsigned long (*calibrate_tsc)(void);
122 unsigned long (*get_wallclock)(void); 148 unsigned long (*get_wallclock)(void);
123 int (*set_wallclock)(unsigned long nowtime); 149 int (*set_wallclock)(unsigned long nowtime);
150 void (*iommu_shutdown)(void);
151 bool (*is_untracked_pat_range)(u64 start, u64 end);
152 void (*nmi_init)(void);
124}; 153};
125 154
126extern struct x86_init_ops x86_init; 155extern struct x86_init_ops x86_init;
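The new x86_init.iommu and x86_init.pci tables, together with the extra x86_platform_ops hooks, let a platform replace IOMMU and PCI setup without #ifdef forests. The sketch below only illustrates the shape of an override; the member names come from the structs above, while the functions and the place a platform would install them are assumptions, not this tree's actual code.

#include <linux/init.h>
#include <asm/x86_init.h>

static int __init example_pci_arch_init(void)
{
	/* platform-specific PCI root-bus and IRQ-routing setup */
	return 0;
}

static int __init example_iommu_init(void)
{
	/* detect and bring up the platform IOMMU; 0 on success */
	return 0;
}

static void __init example_platform_setup(void)
{
	x86_init.pci.arch_init		= example_pci_arch_init;
	x86_init.iommu.iommu_init	= example_iommu_init;
}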
diff --git a/arch/x86/include/asm/xen/hypervisor.h b/arch/x86/include/asm/xen/hypervisor.h
index d5b7e90c0edf..396ff4cc8ed4 100644
--- a/arch/x86/include/asm/xen/hypervisor.h
+++ b/arch/x86/include/asm/xen/hypervisor.h
@@ -37,31 +37,4 @@
37extern struct shared_info *HYPERVISOR_shared_info; 37extern struct shared_info *HYPERVISOR_shared_info;
38extern struct start_info *xen_start_info; 38extern struct start_info *xen_start_info;
39 39
40enum xen_domain_type {
41 XEN_NATIVE, /* running on bare hardware */
42 XEN_PV_DOMAIN, /* running in a PV domain */
43 XEN_HVM_DOMAIN, /* running in a Xen hvm domain */
44};
45
46#ifdef CONFIG_XEN
47extern enum xen_domain_type xen_domain_type;
48#else
49#define xen_domain_type XEN_NATIVE
50#endif
51
52#define xen_domain() (xen_domain_type != XEN_NATIVE)
53#define xen_pv_domain() (xen_domain() && \
54 xen_domain_type == XEN_PV_DOMAIN)
55#define xen_hvm_domain() (xen_domain() && \
56 xen_domain_type == XEN_HVM_DOMAIN)
57
58#ifdef CONFIG_XEN_DOM0
59#include <xen/interface/xen.h>
60
61#define xen_initial_domain() (xen_pv_domain() && \
62 xen_start_info->flags & SIF_INITDOMAIN)
63#else /* !CONFIG_XEN_DOM0 */
64#define xen_initial_domain() (0)
65#endif /* CONFIG_XEN_DOM0 */
66
67#endif /* _ASM_X86_XEN_HYPERVISOR_H */ 40#endif /* _ASM_X86_XEN_HYPERVISOR_H */
diff --git a/arch/x86/include/asm/xsave.h b/arch/x86/include/asm/xsave.h
index 727acc152344..ddc04ccad03b 100644
--- a/arch/x86/include/asm/xsave.h
+++ b/arch/x86/include/asm/xsave.h
@@ -27,9 +27,11 @@
27extern unsigned int xstate_size; 27extern unsigned int xstate_size;
28extern u64 pcntxt_mask; 28extern u64 pcntxt_mask;
29extern struct xsave_struct *init_xstate_buf; 29extern struct xsave_struct *init_xstate_buf;
30extern u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
30 31
31extern void xsave_cntxt_init(void); 32extern void xsave_cntxt_init(void);
32extern void xsave_init(void); 33extern void xsave_init(void);
34extern void update_regset_xstate_info(unsigned int size, u64 xstate_mask);
33extern int init_fpu(struct task_struct *child); 35extern int init_fpu(struct task_struct *child);
34extern int check_for_xstate(struct i387_fxsave_struct __user *buf, 36extern int check_for_xstate(struct i387_fxsave_struct __user *buf,
35 void __user *fpstate, 37 void __user *fpstate,
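Both xstate_size and pcntxt_mask are declared in this same header, so the new update_regset_xstate_info() helper is presumably invoked once those values have been computed during xsave setup. A one-line sketch of such a call (illustrative, not taken from the patch):

        update_regset_xstate_info(xstate_size, pcntxt_mask);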
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index a99b34d1b3b8..d09934e22ca5 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -40,7 +40,7 @@ obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o 40obj-$(CONFIG_X86_64) += syscall_64.o vsyscall_64.o
41obj-y += bootflag.o e820.o 41obj-y += bootflag.o e820.o
42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o 42obj-y += pci-dma.o quirks.o i8237.o topology.o kdebugfs.o
43obj-y += alternative.o i8253.o pci-nommu.o 43obj-y += alternative.o i8253.o pci-nommu.o hw_breakpoint.o
44obj-y += tsc.o io_delay.o rtc.o 44obj-y += tsc.o io_delay.o rtc.o
45 45
46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o 46obj-$(CONFIG_X86_TRAMPOLINE) += trampoline.o
@@ -87,9 +87,9 @@ obj-$(CONFIG_VM86) += vm86_32.o
87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o 87obj-$(CONFIG_EARLY_PRINTK) += early_printk.o
88 88
89obj-$(CONFIG_HPET_TIMER) += hpet.o 89obj-$(CONFIG_HPET_TIMER) += hpet.o
90obj-$(CONFIG_APB_TIMER) += apb_timer.o
90 91
91obj-$(CONFIG_K8_NB) += k8.o 92obj-$(CONFIG_K8_NB) += k8.o
92obj-$(CONFIG_MGEODE_LX) += geode_32.o mfgpt_32.o
93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o 93obj-$(CONFIG_DEBUG_RODATA_TEST) += test_rodata.o
94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o 94obj-$(CONFIG_DEBUG_NX_TEST) += test_nx.o
95 95
diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile
index fd5ca97a2ad5..6f35260bb3ef 100644
--- a/arch/x86/kernel/acpi/Makefile
+++ b/arch/x86/kernel/acpi/Makefile
@@ -4,7 +4,7 @@ obj-$(CONFIG_ACPI) += boot.o
4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o 4obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_rm.o wakeup_$(BITS).o
5 5
6ifneq ($(CONFIG_ACPI_PROCESSOR),) 6ifneq ($(CONFIG_ACPI_PROCESSOR),)
7obj-y += cstate.o processor.o 7obj-y += cstate.o
8endif 8endif
9 9
10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin 10$(obj)/wakeup_rm.o: $(obj)/realmode/wakeup.bin
diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c
index 67e929b89875..cd40aba6aa95 100644
--- a/arch/x86/kernel/acpi/boot.c
+++ b/arch/x86/kernel/acpi/boot.c
@@ -31,10 +31,12 @@
31#include <linux/module.h> 31#include <linux/module.h>
32#include <linux/dmi.h> 32#include <linux/dmi.h>
33#include <linux/irq.h> 33#include <linux/irq.h>
34#include <linux/slab.h>
34#include <linux/bootmem.h> 35#include <linux/bootmem.h>
35#include <linux/ioport.h> 36#include <linux/ioport.h>
36#include <linux/pci.h> 37#include <linux/pci.h>
37 38
39#include <asm/pci_x86.h>
38#include <asm/pgtable.h> 40#include <asm/pgtable.h>
39#include <asm/io_apic.h> 41#include <asm/io_apic.h>
40#include <asm/apic.h> 42#include <asm/apic.h>
@@ -49,6 +51,7 @@ EXPORT_SYMBOL(acpi_disabled);
49 51
50#ifdef CONFIG_X86_64 52#ifdef CONFIG_X86_64
51# include <asm/proto.h> 53# include <asm/proto.h>
54# include <asm/numa_64.h>
52#endif /* X86 */ 55#endif /* X86 */
53 56
54#define BAD_MADT_ENTRY(entry, end) ( \ 57#define BAD_MADT_ENTRY(entry, end) ( \
@@ -446,6 +449,12 @@ void __init acpi_pic_sci_set_trigger(unsigned int irq, u16 trigger)
446int acpi_gsi_to_irq(u32 gsi, unsigned int *irq) 449int acpi_gsi_to_irq(u32 gsi, unsigned int *irq)
447{ 450{
448 *irq = gsi; 451 *irq = gsi;
452
453#ifdef CONFIG_X86_IO_APIC
454 if (acpi_irq_model == ACPI_IRQ_MODEL_IOAPIC)
455 setup_IO_APIC_irq_extra(gsi);
456#endif
457
449 return 0; 458 return 0;
450} 459}
451 460
@@ -473,7 +482,8 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
473 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity); 482 plat_gsi = mp_register_gsi(dev, gsi, trigger, polarity);
474 } 483 }
475#endif 484#endif
476 acpi_gsi_to_irq(plat_gsi, &irq); 485 irq = plat_gsi;
486
477 return irq; 487 return irq;
478} 488}
479 489
@@ -481,6 +491,26 @@ int acpi_register_gsi(struct device *dev, u32 gsi, int trigger, int polarity)
481 * ACPI based hotplug support for CPU 491 * ACPI based hotplug support for CPU
482 */ 492 */
483#ifdef CONFIG_ACPI_HOTPLUG_CPU 493#ifdef CONFIG_ACPI_HOTPLUG_CPU
494#include <acpi/processor.h>
495
496static void acpi_map_cpu2node(acpi_handle handle, int cpu, int physid)
497{
498#ifdef CONFIG_ACPI_NUMA
499 int nid;
500
501 nid = acpi_get_node(handle);
502 if (nid == -1 || !node_online(nid))
503 return;
504#ifdef CONFIG_X86_64
505 apicid_to_node[physid] = nid;
506 numa_set_node(cpu, nid);
507#else /* CONFIG_X86_32 */
508 apicid_2_node[physid] = nid;
509 cpu_to_node_map[cpu] = nid;
510#endif
511
512#endif
513}
484 514
485static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu) 515static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
486{ 516{
@@ -539,7 +569,10 @@ static int __cpuinit _acpi_map_lsapic(acpi_handle handle, int *pcpu)
539 goto free_new_map; 569 goto free_new_map;
540 } 570 }
541 571
572 acpi_processor_set_pdc(handle);
573
542 cpu = cpumask_first(new_map); 574 cpu = cpumask_first(new_map);
575 acpi_map_cpu2node(handle, cpu, physid);
543 576
544 *pcpu = cpu; 577 *pcpu = cpu;
545 retval = 0; 578 retval = 0;
@@ -624,6 +657,7 @@ static int __init acpi_parse_hpet(struct acpi_table_header *table)
624 } 657 }
625 658
626 hpet_address = hpet_tbl->address.address; 659 hpet_address = hpet_tbl->address.address;
660 hpet_blockid = hpet_tbl->sequence;
627 661
628 /* 662 /*
629 * Some broken BIOSes advertise HPET at 0x0. We really do not 663 * Some broken BIOSes advertise HPET at 0x0. We really do not
@@ -1122,7 +1156,7 @@ static int __init acpi_parse_madt_ioapic_entries(void)
1122 if (!acpi_sci_override_gsi) 1156 if (!acpi_sci_override_gsi)
1123 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0); 1157 acpi_sci_ioapic_setup(acpi_gbl_FADT.sci_interrupt, 0, 0);
1124 1158
1125 /* Fill in identity legacy mapings where no override */ 1159 /* Fill in identity legacy mappings where no override */
1126 mp_config_acpi_legacy_irqs(); 1160 mp_config_acpi_legacy_irqs();
1127 1161
1128 count = 1162 count =
@@ -1184,9 +1218,6 @@ static void __init acpi_process_madt(void)
1184 if (!error) { 1218 if (!error) {
1185 acpi_lapic = 1; 1219 acpi_lapic = 1;
1186 1220
1187#ifdef CONFIG_X86_BIGSMP
1188 generic_bigsmp_probe();
1189#endif
1190 /* 1221 /*
1191 * Parse MADT IO-APIC entries 1222 * Parse MADT IO-APIC entries
1192 */ 1223 */
@@ -1196,8 +1227,6 @@ static void __init acpi_process_madt(void)
1196 acpi_ioapic = 1; 1227 acpi_ioapic = 1;
1197 1228
1198 smp_found_config = 1; 1229 smp_found_config = 1;
1199 if (apic->setup_apic_routing)
1200 apic->setup_apic_routing();
1201 } 1230 }
1202 } 1231 }
1203 if (error == -EINVAL) { 1232 if (error == -EINVAL) {
@@ -1268,23 +1297,6 @@ static int __init dmi_disable_acpi(const struct dmi_system_id *d)
1268} 1297}
1269 1298
1270/* 1299/*
1271 * Limit ACPI to CPU enumeration for HT
1272 */
1273static int __init force_acpi_ht(const struct dmi_system_id *d)
1274{
1275 if (!acpi_force) {
1276 printk(KERN_NOTICE "%s detected: force use of acpi=ht\n",
1277 d->ident);
1278 disable_acpi();
1279 acpi_ht = 1;
1280 } else {
1281 printk(KERN_NOTICE
1282 "Warning: acpi=force overrules DMI blacklist: acpi=ht\n");
1283 }
1284 return 0;
1285}
1286
1287/*
1288 * Force ignoring BIOS IRQ0 pin2 override 1300 * Force ignoring BIOS IRQ0 pin2 override
1289 */ 1301 */
1290static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d) 1302static int __init dmi_ignore_irq0_timer_override(const struct dmi_system_id *d)
@@ -1320,90 +1332,6 @@ static struct dmi_system_id __initdata acpi_dmi_table[] = {
1320 }, 1332 },
1321 1333
1322 /* 1334 /*
1323 * Boxes that need acpi=ht
1324 */
1325 {
1326 .callback = force_acpi_ht,
1327 .ident = "FSC Primergy T850",
1328 .matches = {
1329 DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"),
1330 DMI_MATCH(DMI_PRODUCT_NAME, "PRIMERGY T850"),
1331 },
1332 },
1333 {
1334 .callback = force_acpi_ht,
1335 .ident = "HP VISUALIZE NT Workstation",
1336 .matches = {
1337 DMI_MATCH(DMI_BOARD_VENDOR, "Hewlett-Packard"),
1338 DMI_MATCH(DMI_PRODUCT_NAME, "HP VISUALIZE NT Workstation"),
1339 },
1340 },
1341 {
1342 .callback = force_acpi_ht,
1343 .ident = "Compaq Workstation W8000",
1344 .matches = {
1345 DMI_MATCH(DMI_SYS_VENDOR, "Compaq"),
1346 DMI_MATCH(DMI_PRODUCT_NAME, "Workstation W8000"),
1347 },
1348 },
1349 {
1350 .callback = force_acpi_ht,
1351 .ident = "ASUS P2B-DS",
1352 .matches = {
1353 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1354 DMI_MATCH(DMI_BOARD_NAME, "P2B-DS"),
1355 },
1356 },
1357 {
1358 .callback = force_acpi_ht,
1359 .ident = "ASUS CUR-DLS",
1360 .matches = {
1361 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
1362 DMI_MATCH(DMI_BOARD_NAME, "CUR-DLS"),
1363 },
1364 },
1365 {
1366 .callback = force_acpi_ht,
1367 .ident = "ABIT i440BX-W83977",
1368 .matches = {
1369 DMI_MATCH(DMI_BOARD_VENDOR, "ABIT <http://www.abit.com>"),
1370 DMI_MATCH(DMI_BOARD_NAME, "i440BX-W83977 (BP6)"),
1371 },
1372 },
1373 {
1374 .callback = force_acpi_ht,
1375 .ident = "IBM Bladecenter",
1376 .matches = {
1377 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1378 DMI_MATCH(DMI_BOARD_NAME, "IBM eServer BladeCenter HS20"),
1379 },
1380 },
1381 {
1382 .callback = force_acpi_ht,
1383 .ident = "IBM eServer xSeries 360",
1384 .matches = {
1385 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1386 DMI_MATCH(DMI_BOARD_NAME, "eServer xSeries 360"),
1387 },
1388 },
1389 {
1390 .callback = force_acpi_ht,
1391 .ident = "IBM eserver xSeries 330",
1392 .matches = {
1393 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1394 DMI_MATCH(DMI_BOARD_NAME, "eserver xSeries 330"),
1395 },
1396 },
1397 {
1398 .callback = force_acpi_ht,
1399 .ident = "IBM eserver xSeries 440",
1400 .matches = {
1401 DMI_MATCH(DMI_BOARD_VENDOR, "IBM"),
1402 DMI_MATCH(DMI_PRODUCT_NAME, "eserver xSeries 440"),
1403 },
1404 },
1405
1406 /*
1407 * Boxes that need ACPI PCI IRQ routing disabled 1335 * Boxes that need ACPI PCI IRQ routing disabled
1408 */ 1336 */
1409 { 1337 {
@@ -1528,16 +1456,10 @@ static struct dmi_system_id __initdata acpi_dmi_table_late[] = {
1528 * if acpi_blacklisted() acpi_disabled = 1; 1456 * if acpi_blacklisted() acpi_disabled = 1;
1529 * acpi_irq_model=... 1457 * acpi_irq_model=...
1530 * ... 1458 * ...
1531 *
1532 * return value: (currently ignored)
1533 * 0: success
1534 * !0: failure
1535 */ 1459 */
1536 1460
1537int __init acpi_boot_table_init(void) 1461void __init acpi_boot_table_init(void)
1538{ 1462{
1539 int error;
1540
1541 dmi_check_system(acpi_dmi_table); 1463 dmi_check_system(acpi_dmi_table);
1542 1464
1543 /* 1465 /*
@@ -1545,15 +1467,14 @@ int __init acpi_boot_table_init(void)
1545 * One exception: acpi=ht continues far enough to enumerate LAPICs 1467 * One exception: acpi=ht continues far enough to enumerate LAPICs
1546 */ 1468 */
1547 if (acpi_disabled && !acpi_ht) 1469 if (acpi_disabled && !acpi_ht)
1548 return 1; 1470 return;
1549 1471
1550 /* 1472 /*
1551 * Initialize the ACPI boot-time table parser. 1473 * Initialize the ACPI boot-time table parser.
1552 */ 1474 */
1553 error = acpi_table_init(); 1475 if (acpi_table_init()) {
1554 if (error) {
1555 disable_acpi(); 1476 disable_acpi();
1556 return error; 1477 return;
1557 } 1478 }
1558 1479
1559 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf); 1480 acpi_table_parse(ACPI_SIG_BOOT, acpi_parse_sbf);
@@ -1561,18 +1482,15 @@ int __init acpi_boot_table_init(void)
1561 /* 1482 /*
1562 * blacklist may disable ACPI entirely 1483 * blacklist may disable ACPI entirely
1563 */ 1484 */
1564 error = acpi_blacklisted(); 1485 if (acpi_blacklisted()) {
1565 if (error) {
1566 if (acpi_force) { 1486 if (acpi_force) {
1567 printk(KERN_WARNING PREFIX "acpi=force override\n"); 1487 printk(KERN_WARNING PREFIX "acpi=force override\n");
1568 } else { 1488 } else {
1569 printk(KERN_WARNING PREFIX "Disabling ACPI support\n"); 1489 printk(KERN_WARNING PREFIX "Disabling ACPI support\n");
1570 disable_acpi(); 1490 disable_acpi();
1571 return error; 1491 return;
1572 } 1492 }
1573 } 1493 }
1574
1575 return 0;
1576} 1494}
1577 1495
1578int __init early_acpi_boot_init(void) 1496int __init early_acpi_boot_init(void)
@@ -1618,6 +1536,9 @@ int __init acpi_boot_init(void)
1618 1536
1619 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet); 1537 acpi_table_parse(ACPI_SIG_HPET, acpi_parse_hpet);
1620 1538
1539 if (!acpi_noirq)
1540 x86_init.pci.init = pci_acpi_init;
1541
1621 return 0; 1542 return 0;
1622} 1543}
1623 1544
@@ -1642,8 +1563,10 @@ static int __init parse_acpi(char *arg)
1642 } 1563 }
1643 /* Limit ACPI just to boot-time to enable HT */ 1564 /* Limit ACPI just to boot-time to enable HT */
1644 else if (strcmp(arg, "ht") == 0) { 1565 else if (strcmp(arg, "ht") == 0) {
1645 if (!acpi_force) 1566 if (!acpi_force) {
1567 printk(KERN_WARNING "acpi=ht will be removed in Linux-2.6.35\n");
1646 disable_acpi(); 1568 disable_acpi();
1569 }
1647 acpi_ht = 1; 1570 acpi_ht = 1;
1648 } 1571 }
1649 /* acpi=rsdt use RSDT instead of XSDT */ 1572 /* acpi=rsdt use RSDT instead of XSDT */
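The acpi_gsi_to_irq() hunk above now programs extra IO-APIC routing via setup_IO_APIC_irq_extra() before the IRQ number is handed back, and acpi_register_gsi() simply returns plat_gsi. For context, a driver consuming a GSI from the ACPI tables would typically convert it and then request the resulting IRQ from its probe path; a hedged sketch, where gsi comes from the device's ACPI resources and my_handler/my_dev are hypothetical:

        unsigned int irq;
        int ret = -ENODEV;

        if (acpi_gsi_to_irq(gsi, &irq) == 0)
                ret = request_irq(irq, my_handler, IRQF_SHARED, "my_dev", my_dev);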
diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c
index 59cdfa4686b2..2e837f5080fe 100644
--- a/arch/x86/kernel/acpi/cstate.c
+++ b/arch/x86/kernel/acpi/cstate.c
@@ -48,7 +48,7 @@ void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags,
48 * P4, Core and beyond CPUs 48 * P4, Core and beyond CPUs
49 */ 49 */
50 if (c->x86_vendor == X86_VENDOR_INTEL && 50 if (c->x86_vendor == X86_VENDOR_INTEL &&
51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 14))) 51 (c->x86 > 0xf || (c->x86 == 6 && c->x86_model >= 0x0f)))
52 flags->bm_control = 0; 52 flags->bm_control = 0;
53} 53}
54EXPORT_SYMBOL(acpi_processor_power_init_bm_check); 54EXPORT_SYMBOL(acpi_processor_power_init_bm_check);
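For reference on the constant change: decimal 14 is 0x0e, so switching the comparison to 0x0f raises the cutoff by one model. The bm_control shortcut for family 6 parts now starts at model 0x0f rather than model 0x0e, while the behaviour for family > 0xf is unchanged.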
diff --git a/arch/x86/kernel/acpi/processor.c b/arch/x86/kernel/acpi/processor.c
deleted file mode 100644
index d85d1b2432ba..000000000000
--- a/arch/x86/kernel/acpi/processor.c
+++ /dev/null
@@ -1,101 +0,0 @@
1/*
2 * Copyright (C) 2005 Intel Corporation
3 * Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
4 * - Added _PDC for platforms with Intel CPUs
5 */
6
7#include <linux/kernel.h>
8#include <linux/module.h>
9#include <linux/init.h>
10#include <linux/acpi.h>
11
12#include <acpi/processor.h>
13#include <asm/acpi.h>
14
15static void init_intel_pdc(struct acpi_processor *pr, struct cpuinfo_x86 *c)
16{
17 struct acpi_object_list *obj_list;
18 union acpi_object *obj;
19 u32 *buf;
20
21 /* allocate and initialize pdc. It will be used later. */
22 obj_list = kmalloc(sizeof(struct acpi_object_list), GFP_KERNEL);
23 if (!obj_list) {
24 printk(KERN_ERR "Memory allocation error\n");
25 return;
26 }
27
28 obj = kmalloc(sizeof(union acpi_object), GFP_KERNEL);
29 if (!obj) {
30 printk(KERN_ERR "Memory allocation error\n");
31 kfree(obj_list);
32 return;
33 }
34
35 buf = kmalloc(12, GFP_KERNEL);
36 if (!buf) {
37 printk(KERN_ERR "Memory allocation error\n");
38 kfree(obj);
39 kfree(obj_list);
40 return;
41 }
42
43 buf[0] = ACPI_PDC_REVISION_ID;
44 buf[1] = 1;
45 buf[2] = ACPI_PDC_C_CAPABILITY_SMP;
46
47 /*
48 * The default of PDC_SMP_T_SWCOORD bit is set for intel x86 cpu so
49 * that OSPM is capable of native ACPI throttling software
50 * coordination using BIOS supplied _TSD info.
51 */
52 buf[2] |= ACPI_PDC_SMP_T_SWCOORD;
53 if (cpu_has(c, X86_FEATURE_EST))
54 buf[2] |= ACPI_PDC_EST_CAPABILITY_SWSMP;
55
56 if (cpu_has(c, X86_FEATURE_ACPI))
57 buf[2] |= ACPI_PDC_T_FFH;
58
59 /*
60 * If mwait/monitor is unsupported, C2/C3_FFH will be disabled
61 */
62 if (!cpu_has(c, X86_FEATURE_MWAIT))
63 buf[2] &= ~(ACPI_PDC_C_C2C3_FFH);
64
65 obj->type = ACPI_TYPE_BUFFER;
66 obj->buffer.length = 12;
67 obj->buffer.pointer = (u8 *) buf;
68 obj_list->count = 1;
69 obj_list->pointer = obj;
70 pr->pdc = obj_list;
71
72 return;
73}
74
75
76/* Initialize _PDC data based on the CPU vendor */
77void arch_acpi_processor_init_pdc(struct acpi_processor *pr)
78{
79 struct cpuinfo_x86 *c = &cpu_data(pr->id);
80
81 pr->pdc = NULL;
82 if (c->x86_vendor == X86_VENDOR_INTEL ||
83 c->x86_vendor == X86_VENDOR_CENTAUR)
84 init_intel_pdc(pr, c);
85
86 return;
87}
88
89EXPORT_SYMBOL(arch_acpi_processor_init_pdc);
90
91void arch_acpi_processor_cleanup_pdc(struct acpi_processor *pr)
92{
93 if (pr->pdc) {
94 kfree(pr->pdc->pointer->buffer.pointer);
95 kfree(pr->pdc->pointer);
96 kfree(pr->pdc);
97 pr->pdc = NULL;
98 }
99}
100
101EXPORT_SYMBOL(arch_acpi_processor_cleanup_pdc);
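With this arch-specific _PDC construction gone, the equivalent setup is expected to come from the generic ACPI processor code. The boot.c hunk earlier in this patch shows the replacement at the CPU hotplug site, which reduces to a single call (sketch of the call site only):

        acpi_processor_set_pdc(handle);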
diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c
index ca93638ba430..f9961034e557 100644
--- a/arch/x86/kernel/acpi/sleep.c
+++ b/arch/x86/kernel/acpi/sleep.c
@@ -78,12 +78,9 @@ int acpi_save_state_mem(void)
78#ifndef CONFIG_64BIT 78#ifndef CONFIG_64BIT
79 store_gdt((struct desc_ptr *)&header->pmode_gdt); 79 store_gdt((struct desc_ptr *)&header->pmode_gdt);
80 80
81 header->pmode_efer_low = nx_enabled; 81 if (rdmsr_safe(MSR_EFER, &header->pmode_efer_low,
82 if (header->pmode_efer_low & 1) { 82 &header->pmode_efer_high))
83 /* This is strange, why not save efer, always? */ 83 header->pmode_efer_low = header->pmode_efer_high = 0;
84 rdmsr(MSR_EFER, header->pmode_efer_low,
85 header->pmode_efer_high);
86 }
87#endif /* !CONFIG_64BIT */ 84#endif /* !CONFIG_64BIT */
88 85
89 header->pmode_cr0 = read_cr0(); 86 header->pmode_cr0 = read_cr0();
@@ -119,29 +116,32 @@ void acpi_restore_state_mem(void)
119 116
120 117
121/** 118/**
122 * acpi_reserve_bootmem - do _very_ early ACPI initialisation 119 * acpi_reserve_wakeup_memory - do _very_ early ACPI initialisation
123 * 120 *
124 * We allocate a page from the first 1MB of memory for the wakeup 121 * We allocate a page from the first 1MB of memory for the wakeup
125 * routine for when we come back from a sleep state. The 122 * routine for when we come back from a sleep state. The
126 * runtime allocator allows specification of <16MB pages, but not 123 * runtime allocator allows specification of <16MB pages, but not
127 * <1MB pages. 124 * <1MB pages.
128 */ 125 */
129void __init acpi_reserve_bootmem(void) 126void __init acpi_reserve_wakeup_memory(void)
130{ 127{
128 unsigned long mem;
129
131 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) { 130 if ((&wakeup_code_end - &wakeup_code_start) > WAKEUP_SIZE) {
132 printk(KERN_ERR 131 printk(KERN_ERR
133 "ACPI: Wakeup code way too big, S3 disabled.\n"); 132 "ACPI: Wakeup code way too big, S3 disabled.\n");
134 return; 133 return;
135 } 134 }
136 135
137 acpi_realmode = (unsigned long)alloc_bootmem_low(WAKEUP_SIZE); 136 mem = find_e820_area(0, 1<<20, WAKEUP_SIZE, PAGE_SIZE);
138 137
139 if (!acpi_realmode) { 138 if (mem == -1L) {
140 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n"); 139 printk(KERN_ERR "ACPI: Cannot allocate lowmem, S3 disabled.\n");
141 return; 140 return;
142 } 141 }
143 142 acpi_realmode = (unsigned long) phys_to_virt(mem);
144 acpi_wakeup_address = virt_to_phys((void *)acpi_realmode); 143 acpi_wakeup_address = mem;
144 reserve_early(mem, mem + WAKEUP_SIZE, "ACPI WAKEUP");
145} 145}
146 146
147 147
@@ -162,6 +162,8 @@ static int __init acpi_sleep_setup(char *str)
162#endif 162#endif
163 if (strncmp(str, "old_ordering", 12) == 0) 163 if (strncmp(str, "old_ordering", 12) == 0)
164 acpi_old_suspend_ordering(); 164 acpi_old_suspend_ordering();
165 if (strncmp(str, "sci_force_enable", 16) == 0)
166 acpi_set_sci_en_on_resume();
165 str = strchr(str, ','); 167 str = strchr(str, ',');
166 if (str != NULL) 168 if (str != NULL)
167 str += strspn(str, ", \t"); 169 str += strspn(str, ", \t");
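The rewritten acpi_reserve_wakeup_memory() above reserves the wakeup trampoline straight from the e820 map instead of going through the bootmem allocator: find_e820_area() looks for a free, page-aligned WAKEUP_SIZE block below 1MB and returns -1UL when nothing suitable exists (hence the mem == -1L test), and reserve_early() then records the range so that later early allocations keep clear of it.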
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index de7353c0ce9c..1a160d5d44d0 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -7,6 +7,8 @@
7#include <linux/mm.h> 7#include <linux/mm.h>
8#include <linux/vmalloc.h> 8#include <linux/vmalloc.h>
9#include <linux/memory.h> 9#include <linux/memory.h>
10#include <linux/stop_machine.h>
11#include <linux/slab.h>
10#include <asm/alternative.h> 12#include <asm/alternative.h>
11#include <asm/sections.h> 13#include <asm/sections.h>
12#include <asm/pgtable.h> 14#include <asm/pgtable.h>
@@ -205,7 +207,7 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
205 struct alt_instr *end) 207 struct alt_instr *end)
206{ 208{
207 struct alt_instr *a; 209 struct alt_instr *a;
208 char insnbuf[MAX_PATCH_LEN]; 210 u8 insnbuf[MAX_PATCH_LEN];
209 211
210 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); 212 DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
211 for (a = start; a < end; a++) { 213 for (a = start; a < end; a++) {
@@ -223,6 +225,8 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
223 } 225 }
224#endif 226#endif
225 memcpy(insnbuf, a->replacement, a->replacementlen); 227 memcpy(insnbuf, a->replacement, a->replacementlen);
228 if (*insnbuf == 0xe8 && a->replacementlen == 5)
229 *(s32 *)(insnbuf + 1) += a->replacement - a->instr;
226 add_nops(insnbuf + a->replacementlen, 230 add_nops(insnbuf + a->replacementlen,
227 a->instrlen - a->replacementlen); 231 a->instrlen - a->replacementlen);
228 text_poke_early(instr, insnbuf, a->instrlen); 232 text_poke_early(instr, insnbuf, a->instrlen);
@@ -390,6 +394,24 @@ void alternatives_smp_switch(int smp)
390 mutex_unlock(&smp_alt); 394 mutex_unlock(&smp_alt);
391} 395}
392 396
397/* Return 1 if the address range is reserved for smp-alternatives */
398int alternatives_text_reserved(void *start, void *end)
399{
400 struct smp_alt_module *mod;
401 u8 **ptr;
402 u8 *text_start = start;
403 u8 *text_end = end;
404
405 list_for_each_entry(mod, &smp_alt_modules, next) {
406 if (mod->text > text_end || mod->text_end < text_start)
407 continue;
408 for (ptr = mod->locks; ptr < mod->locks_end; ptr++)
409 if (text_start <= *ptr && text_end >= *ptr)
410 return 1;
411 }
412
413 return 0;
414}
393#endif 415#endif
394 416
395#ifdef CONFIG_PARAVIRT 417#ifdef CONFIG_PARAVIRT
@@ -552,3 +574,62 @@ void *__kprobes text_poke(void *addr, const void *opcode, size_t len)
552 local_irq_restore(flags); 574 local_irq_restore(flags);
553 return addr; 575 return addr;
554} 576}
577
578/*
579 * Cross-modifying kernel text with stop_machine().
580 * This code originally comes from immediate value.
581 */
582static atomic_t stop_machine_first;
583static int wrote_text;
584
585struct text_poke_params {
586 void *addr;
587 const void *opcode;
588 size_t len;
589};
590
591static int __kprobes stop_machine_text_poke(void *data)
592{
593 struct text_poke_params *tpp = data;
594
595 if (atomic_dec_and_test(&stop_machine_first)) {
596 text_poke(tpp->addr, tpp->opcode, tpp->len);
597 smp_wmb(); /* Make sure other cpus see that this has run */
598 wrote_text = 1;
599 } else {
600 while (!wrote_text)
601 cpu_relax();
602 smp_mb(); /* Load wrote_text before following execution */
603 }
604
605 flush_icache_range((unsigned long)tpp->addr,
606 (unsigned long)tpp->addr + tpp->len);
607 return 0;
608}
609
610/**
611 * text_poke_smp - Update instructions on a live kernel on SMP
612 * @addr: address to modify
613 * @opcode: source of the copy
614 * @len: length to copy
615 *
616 * Modify multi-byte instruction by using stop_machine() on SMP. This allows
617 * user to poke/set multi-byte text on SMP. Only non-NMI/MCE code modifying
618 * should be allowed, since stop_machine() does _not_ protect code against
619 * NMI and MCE.
620 *
621 * Note: Must be called under get_online_cpus() and text_mutex.
622 */
623void *__kprobes text_poke_smp(void *addr, const void *opcode, size_t len)
624{
625 struct text_poke_params tpp;
626
627 tpp.addr = addr;
628 tpp.opcode = opcode;
629 tpp.len = len;
630 atomic_set(&stop_machine_first, 1);
631 wrote_text = 0;
632 stop_machine(stop_machine_text_poke, (void *)&tpp, NULL);
633 return addr;
634}
635
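The text_poke_smp() kernel-doc above requires callers to hold get_online_cpus() and text_mutex. A minimal sketch of a caller patching a 5-byte near jump, where addr and the relative offset bytes are illustrative only:

        /* Sketch only: patch a 5-byte 'jmp rel32' at addr. */
        unsigned char insn[5] = { 0xe9, 0x00, 0x00, 0x00, 0x00 };

        get_online_cpus();
        mutex_lock(&text_mutex);
        text_poke_smp(addr, insn, sizeof(insn));
        mutex_unlock(&text_mutex);
        put_online_cpus();

Because the poke runs under stop_machine(), every online CPU spins in stop_machine_text_poke() until the write and the icache flush have completed, which is why the comment excludes NMI and MCE code paths.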
diff --git a/arch/x86/kernel/amd_iommu.c b/arch/x86/kernel/amd_iommu.c
index 0285521e0a99..f854d89b7edf 100644
--- a/arch/x86/kernel/amd_iommu.c
+++ b/arch/x86/kernel/amd_iommu.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -18,8 +18,8 @@
18 */ 18 */
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/gfp.h> 21#include <linux/bitmap.h>
22#include <linux/bitops.h> 22#include <linux/slab.h>
23#include <linux/debugfs.h> 23#include <linux/debugfs.h>
24#include <linux/scatterlist.h> 24#include <linux/scatterlist.h>
25#include <linux/dma-mapping.h> 25#include <linux/dma-mapping.h>
@@ -28,6 +28,7 @@
28#include <asm/proto.h> 28#include <asm/proto.h>
29#include <asm/iommu.h> 29#include <asm/iommu.h>
30#include <asm/gart.h> 30#include <asm/gart.h>
31#include <asm/amd_iommu_proto.h>
31#include <asm/amd_iommu_types.h> 32#include <asm/amd_iommu_types.h>
32#include <asm/amd_iommu.h> 33#include <asm/amd_iommu.h>
33 34
@@ -56,20 +57,152 @@ struct iommu_cmd {
56 u32 data[4]; 57 u32 data[4];
57}; 58};
58 59
59static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
60 struct unity_map_entry *e);
61static struct dma_ops_domain *find_protection_domain(u16 devid);
62static u64 *alloc_pte(struct protection_domain *domain,
63 unsigned long address, int end_lvl,
64 u64 **pte_page, gfp_t gfp);
65static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
66 unsigned long start_page,
67 unsigned int pages);
68static void reset_iommu_command_buffer(struct amd_iommu *iommu); 60static void reset_iommu_command_buffer(struct amd_iommu *iommu);
69static u64 *fetch_pte(struct protection_domain *domain,
70 unsigned long address, int map_size);
71static void update_domain(struct protection_domain *domain); 61static void update_domain(struct protection_domain *domain);
72 62
63/****************************************************************************
64 *
65 * Helper functions
66 *
67 ****************************************************************************/
68
69static inline u16 get_device_id(struct device *dev)
70{
71 struct pci_dev *pdev = to_pci_dev(dev);
72
73 return calc_devid(pdev->bus->number, pdev->devfn);
74}
75
76static struct iommu_dev_data *get_dev_data(struct device *dev)
77{
78 return dev->archdata.iommu;
79}
80
81/*
82 * In this function the list of preallocated protection domains is traversed to
83 * find the domain for a specific device
84 */
85static struct dma_ops_domain *find_protection_domain(u16 devid)
86{
87 struct dma_ops_domain *entry, *ret = NULL;
88 unsigned long flags;
89 u16 alias = amd_iommu_alias_table[devid];
90
91 if (list_empty(&iommu_pd_list))
92 return NULL;
93
94 spin_lock_irqsave(&iommu_pd_list_lock, flags);
95
96 list_for_each_entry(entry, &iommu_pd_list, list) {
97 if (entry->target_dev == devid ||
98 entry->target_dev == alias) {
99 ret = entry;
100 break;
101 }
102 }
103
104 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
105
106 return ret;
107}
108
109/*
110 * This function checks if the driver got a valid device from the caller to
111 * avoid dereferencing invalid pointers.
112 */
113static bool check_device(struct device *dev)
114{
115 u16 devid;
116
117 if (!dev || !dev->dma_mask)
118 return false;
119
120 /* No device or no PCI device */
121 if (dev->bus != &pci_bus_type)
122 return false;
123
124 devid = get_device_id(dev);
125
126 /* Out of our scope? */
127 if (devid > amd_iommu_last_bdf)
128 return false;
129
130 if (amd_iommu_rlookup_table[devid] == NULL)
131 return false;
132
133 return true;
134}
135
136static int iommu_init_device(struct device *dev)
137{
138 struct iommu_dev_data *dev_data;
139 struct pci_dev *pdev;
140 u16 devid, alias;
141
142 if (dev->archdata.iommu)
143 return 0;
144
145 dev_data = kzalloc(sizeof(*dev_data), GFP_KERNEL);
146 if (!dev_data)
147 return -ENOMEM;
148
149 dev_data->dev = dev;
150
151 devid = get_device_id(dev);
152 alias = amd_iommu_alias_table[devid];
153 pdev = pci_get_bus_and_slot(PCI_BUS(alias), alias & 0xff);
154 if (pdev)
155 dev_data->alias = &pdev->dev;
156
157 atomic_set(&dev_data->bind, 0);
158
159 dev->archdata.iommu = dev_data;
160
161
162 return 0;
163}
164
165static void iommu_uninit_device(struct device *dev)
166{
167 kfree(dev->archdata.iommu);
168}
169
170void __init amd_iommu_uninit_devices(void)
171{
172 struct pci_dev *pdev = NULL;
173
174 for_each_pci_dev(pdev) {
175
176 if (!check_device(&pdev->dev))
177 continue;
178
179 iommu_uninit_device(&pdev->dev);
180 }
181}
182
183int __init amd_iommu_init_devices(void)
184{
185 struct pci_dev *pdev = NULL;
186 int ret = 0;
187
188 for_each_pci_dev(pdev) {
189
190 if (!check_device(&pdev->dev))
191 continue;
192
193 ret = iommu_init_device(&pdev->dev);
194 if (ret)
195 goto out_free;
196 }
197
198 return 0;
199
200out_free:
201
202 amd_iommu_uninit_devices();
203
204 return ret;
205}
73#ifdef CONFIG_AMD_IOMMU_STATS 206#ifdef CONFIG_AMD_IOMMU_STATS
74 207
75/* 208/*
@@ -90,7 +223,6 @@ DECLARE_STATS_COUNTER(alloced_io_mem);
90DECLARE_STATS_COUNTER(total_map_requests); 223DECLARE_STATS_COUNTER(total_map_requests);
91 224
92static struct dentry *stats_dir; 225static struct dentry *stats_dir;
93static struct dentry *de_isolate;
94static struct dentry *de_fflush; 226static struct dentry *de_fflush;
95 227
96static void amd_iommu_stats_add(struct __iommu_counter *cnt) 228static void amd_iommu_stats_add(struct __iommu_counter *cnt)
@@ -108,9 +240,6 @@ static void amd_iommu_stats_init(void)
108 if (stats_dir == NULL) 240 if (stats_dir == NULL)
109 return; 241 return;
110 242
111 de_isolate = debugfs_create_bool("isolation", 0444, stats_dir,
112 (u32 *)&amd_iommu_isolate);
113
114 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir, 243 de_fflush = debugfs_create_bool("fullflush", 0444, stats_dir,
115 (u32 *)&amd_iommu_unmap_flush); 244 (u32 *)&amd_iommu_unmap_flush);
116 245
@@ -130,12 +259,6 @@ static void amd_iommu_stats_init(void)
130 259
131#endif 260#endif
132 261
133/* returns !0 if the IOMMU is caching non-present entries in its TLB */
134static int iommu_has_npcache(struct amd_iommu *iommu)
135{
136 return iommu->cap & (1UL << IOMMU_CAP_NPCACHE);
137}
138
139/**************************************************************************** 262/****************************************************************************
140 * 263 *
141 * Interrupt handling functions 264 * Interrupt handling functions
@@ -199,6 +322,7 @@ static void iommu_print_event(struct amd_iommu *iommu, void *__evt)
199 break; 322 break;
200 case EVENT_TYPE_ILL_CMD: 323 case EVENT_TYPE_ILL_CMD:
201 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address); 324 printk("ILLEGAL_COMMAND_ERROR address=0x%016llx]\n", address);
325 iommu->reset_in_progress = true;
202 reset_iommu_command_buffer(iommu); 326 reset_iommu_command_buffer(iommu);
203 dump_command(address); 327 dump_command(address);
204 break; 328 break;
@@ -268,6 +392,7 @@ static int __iommu_queue_command(struct amd_iommu *iommu, struct iommu_cmd *cmd)
268 u32 tail, head; 392 u32 tail, head;
269 u8 *target; 393 u8 *target;
270 394
395 WARN_ON(iommu->cmd_buf_size & CMD_BUFFER_UNINITIALIZED);
271 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET); 396 tail = readl(iommu->mmio_base + MMIO_CMD_TAIL_OFFSET);
272 target = iommu->cmd_buf + tail; 397 target = iommu->cmd_buf + tail;
273 memcpy_toio(target, cmd, sizeof(*cmd)); 398 memcpy_toio(target, cmd, sizeof(*cmd));
@@ -321,11 +446,8 @@ static void __iommu_wait_for_completion(struct amd_iommu *iommu)
321 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK; 446 status &= ~MMIO_STATUS_COM_WAIT_INT_MASK;
322 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET); 447 writel(status, iommu->mmio_base + MMIO_STATUS_OFFSET);
323 448
324 if (unlikely(i == EXIT_LOOP_COUNT)) { 449 if (unlikely(i == EXIT_LOOP_COUNT))
325 spin_unlock(&iommu->lock); 450 iommu->reset_in_progress = true;
326 reset_iommu_command_buffer(iommu);
327 spin_lock(&iommu->lock);
328 }
329} 451}
330 452
331/* 453/*
@@ -372,26 +494,46 @@ static int iommu_completion_wait(struct amd_iommu *iommu)
372out: 494out:
373 spin_unlock_irqrestore(&iommu->lock, flags); 495 spin_unlock_irqrestore(&iommu->lock, flags);
374 496
497 if (iommu->reset_in_progress)
498 reset_iommu_command_buffer(iommu);
499
375 return 0; 500 return 0;
376} 501}
377 502
503static void iommu_flush_complete(struct protection_domain *domain)
504{
505 int i;
506
507 for (i = 0; i < amd_iommus_present; ++i) {
508 if (!domain->dev_iommu[i])
509 continue;
510
511 /*
512 * Devices of this domain are behind this IOMMU
513 * We need to wait for completion of all commands.
514 */
515 iommu_completion_wait(amd_iommus[i]);
516 }
517}
518
378/* 519/*
379 * Command send function for invalidating a device table entry 520 * Command send function for invalidating a device table entry
380 */ 521 */
381static int iommu_queue_inv_dev_entry(struct amd_iommu *iommu, u16 devid) 522static int iommu_flush_device(struct device *dev)
382{ 523{
524 struct amd_iommu *iommu;
383 struct iommu_cmd cmd; 525 struct iommu_cmd cmd;
384 int ret; 526 u16 devid;
385 527
386 BUG_ON(iommu == NULL); 528 devid = get_device_id(dev);
529 iommu = amd_iommu_rlookup_table[devid];
387 530
531 /* Build command */
388 memset(&cmd, 0, sizeof(cmd)); 532 memset(&cmd, 0, sizeof(cmd));
389 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY); 533 CMD_SET_TYPE(&cmd, CMD_INV_DEV_ENTRY);
390 cmd.data[0] = devid; 534 cmd.data[0] = devid;
391 535
392 ret = iommu_queue_command(iommu, &cmd); 536 return iommu_queue_command(iommu, &cmd);
393
394 return ret;
395} 537}
396 538
397static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address, 539static void __iommu_build_inv_iommu_pages(struct iommu_cmd *cmd, u64 address,
@@ -430,11 +572,11 @@ static int iommu_queue_inv_iommu_pages(struct amd_iommu *iommu,
430 * It invalidates a single PTE if the range to flush is within a single 572 * It invalidates a single PTE if the range to flush is within a single
431 * page. Otherwise it flushes the whole TLB of the IOMMU. 573 * page. Otherwise it flushes the whole TLB of the IOMMU.
432 */ 574 */
433static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid, 575static void __iommu_flush_pages(struct protection_domain *domain,
434 u64 address, size_t size) 576 u64 address, size_t size, int pde)
435{ 577{
436 int s = 0; 578 int s = 0, i;
437 unsigned pages = iommu_num_pages(address, size, PAGE_SIZE); 579 unsigned long pages = iommu_num_pages(address, size, PAGE_SIZE);
438 580
439 address &= PAGE_MASK; 581 address &= PAGE_MASK;
440 582
@@ -447,142 +589,212 @@ static int iommu_flush_pages(struct amd_iommu *iommu, u16 domid,
447 s = 1; 589 s = 1;
448 } 590 }
449 591
450 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, s);
451 592
452 return 0; 593 for (i = 0; i < amd_iommus_present; ++i) {
594 if (!domain->dev_iommu[i])
595 continue;
596
597 /*
598 * Devices of this domain are behind this IOMMU
599 * We need a TLB flush
600 */
601 iommu_queue_inv_iommu_pages(amd_iommus[i], address,
602 domain->id, pde, s);
603 }
604
605 return;
453} 606}
454 607
455/* Flush the whole IO/TLB for a given protection domain */ 608static void iommu_flush_pages(struct protection_domain *domain,
456static void iommu_flush_tlb(struct amd_iommu *iommu, u16 domid) 609 u64 address, size_t size)
457{ 610{
458 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 611 __iommu_flush_pages(domain, address, size, 0);
459 612}
460 INC_STATS_COUNTER(domain_flush_single);
461 613
462 iommu_queue_inv_iommu_pages(iommu, address, domid, 0, 1); 614/* Flush the whole IO/TLB for a given protection domain */
615static void iommu_flush_tlb(struct protection_domain *domain)
616{
617 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 0);
463} 618}
464 619
465/* Flush the whole IO/TLB for a given protection domain - including PDE */ 620/* Flush the whole IO/TLB for a given protection domain - including PDE */
466static void iommu_flush_tlb_pde(struct amd_iommu *iommu, u16 domid) 621static void iommu_flush_tlb_pde(struct protection_domain *domain)
467{ 622{
468 u64 address = CMD_INV_IOMMU_ALL_PAGES_ADDRESS; 623 __iommu_flush_pages(domain, 0, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 1);
469
470 INC_STATS_COUNTER(domain_flush_single);
471
472 iommu_queue_inv_iommu_pages(iommu, address, domid, 1, 1);
473} 624}
474 625
626
475/* 627/*
476 * This function flushes one domain on one IOMMU 628 * This function flushes the DTEs for all devices in domain
477 */ 629 */
478static void flush_domain_on_iommu(struct amd_iommu *iommu, u16 domid) 630static void iommu_flush_domain_devices(struct protection_domain *domain)
479{ 631{
480 struct iommu_cmd cmd; 632 struct iommu_dev_data *dev_data;
481 unsigned long flags; 633 unsigned long flags;
482 634
483 __iommu_build_inv_iommu_pages(&cmd, CMD_INV_IOMMU_ALL_PAGES_ADDRESS, 635 spin_lock_irqsave(&domain->lock, flags);
484 domid, 1, 1);
485 636
486 spin_lock_irqsave(&iommu->lock, flags); 637 list_for_each_entry(dev_data, &domain->dev_list, list)
487 __iommu_queue_command(iommu, &cmd); 638 iommu_flush_device(dev_data->dev);
488 __iommu_completion_wait(iommu); 639
489 __iommu_wait_for_completion(iommu); 640 spin_unlock_irqrestore(&domain->lock, flags);
490 spin_unlock_irqrestore(&iommu->lock, flags);
491} 641}
492 642
493static void flush_all_domains_on_iommu(struct amd_iommu *iommu) 643static void iommu_flush_all_domain_devices(void)
494{ 644{
495 int i; 645 struct protection_domain *domain;
646 unsigned long flags;
496 647
497 for (i = 1; i < MAX_DOMAIN_ID; ++i) { 648 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
498 if (!test_bit(i, amd_iommu_pd_alloc_bitmap)) 649
499 continue; 650 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
500 flush_domain_on_iommu(iommu, i); 651 iommu_flush_domain_devices(domain);
652 iommu_flush_complete(domain);
501 } 653 }
502 654
655 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
656}
657
658void amd_iommu_flush_all_devices(void)
659{
660 iommu_flush_all_domain_devices();
503} 661}
504 662
505/* 663/*
506 * This function is used to flush the IO/TLB for a given protection domain 664 * This function uses heavy locking and may disable irqs for some time. But
507 * on every IOMMU in the system 665 * this is no issue because it is only called during resume.
508 */ 666 */
509static void iommu_flush_domain(u16 domid) 667void amd_iommu_flush_all_domains(void)
510{ 668{
511 struct amd_iommu *iommu; 669 struct protection_domain *domain;
670 unsigned long flags;
512 671
513 INC_STATS_COUNTER(domain_flush_all); 672 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
514 673
515 for_each_iommu(iommu) 674 list_for_each_entry(domain, &amd_iommu_pd_list, list) {
516 flush_domain_on_iommu(iommu, domid); 675 spin_lock(&domain->lock);
676 iommu_flush_tlb_pde(domain);
677 iommu_flush_complete(domain);
678 spin_unlock(&domain->lock);
679 }
680
681 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
517} 682}
518 683
519void amd_iommu_flush_all_domains(void) 684static void reset_iommu_command_buffer(struct amd_iommu *iommu)
520{ 685{
521 struct amd_iommu *iommu; 686 pr_err("AMD-Vi: Resetting IOMMU command buffer\n");
522 687
523 for_each_iommu(iommu) 688 if (iommu->reset_in_progress)
524 flush_all_domains_on_iommu(iommu); 689 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n");
690
691 amd_iommu_reset_cmd_buffer(iommu);
692 amd_iommu_flush_all_devices();
693 amd_iommu_flush_all_domains();
694
695 iommu->reset_in_progress = false;
525} 696}
526 697
527static void flush_all_devices_for_iommu(struct amd_iommu *iommu) 698/****************************************************************************
699 *
700 * The functions below are used to create the page table mappings for
701 * unity mapped regions.
702 *
703 ****************************************************************************/
704
705/*
706 * This function is used to add another level to an IO page table. Adding
707 * another level increases the size of the address space by 9 bits to a size up
708 * to 64 bits.
709 */
710static bool increase_address_space(struct protection_domain *domain,
711 gfp_t gfp)
528{ 712{
529 int i; 713 u64 *pte;
530 714
531 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 715 if (domain->mode == PAGE_MODE_6_LEVEL)
532 if (iommu != amd_iommu_rlookup_table[i]) 716 /* address space already 64 bit large */
533 continue; 717 return false;
534 718
535 iommu_queue_inv_dev_entry(iommu, i); 719 pte = (void *)get_zeroed_page(gfp);
536 iommu_completion_wait(iommu); 720 if (!pte)
537 } 721 return false;
722
723 *pte = PM_LEVEL_PDE(domain->mode,
724 virt_to_phys(domain->pt_root));
725 domain->pt_root = pte;
726 domain->mode += 1;
727 domain->updated = true;
728
729 return true;
538} 730}
539 731
540static void flush_devices_by_domain(struct protection_domain *domain) 732static u64 *alloc_pte(struct protection_domain *domain,
733 unsigned long address,
734 int end_lvl,
735 u64 **pte_page,
736 gfp_t gfp)
541{ 737{
542 struct amd_iommu *iommu; 738 u64 *pte, *page;
543 int i; 739 int level;
544 740
545 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 741 while (address > PM_LEVEL_SIZE(domain->mode))
546 if ((domain == NULL && amd_iommu_pd_table[i] == NULL) || 742 increase_address_space(domain, gfp);
547 (amd_iommu_pd_table[i] != domain))
548 continue;
549 743
550 iommu = amd_iommu_rlookup_table[i]; 744 level = domain->mode - 1;
551 if (!iommu) 745 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
552 continue;
553 746
554 iommu_queue_inv_dev_entry(iommu, i); 747 while (level > end_lvl) {
555 iommu_completion_wait(iommu); 748 if (!IOMMU_PTE_PRESENT(*pte)) {
749 page = (u64 *)get_zeroed_page(gfp);
750 if (!page)
751 return NULL;
752 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
753 }
754
755 level -= 1;
756
757 pte = IOMMU_PTE_PAGE(*pte);
758
759 if (pte_page && level == end_lvl)
760 *pte_page = pte;
761
762 pte = &pte[PM_LEVEL_INDEX(level, address)];
556 } 763 }
764
765 return pte;
557} 766}
558 767
559static void reset_iommu_command_buffer(struct amd_iommu *iommu) 768/*
769 * This function checks if there is a PTE for a given dma address. If
770 * there is one, it returns the pointer to it.
771 */
772static u64 *fetch_pte(struct protection_domain *domain,
773 unsigned long address, int map_size)
560{ 774{
561 pr_err("AMD-Vi: Resetting IOMMU command buffer\n"); 775 int level;
776 u64 *pte;
562 777
563 if (iommu->reset_in_progress) 778 level = domain->mode - 1;
564 panic("AMD-Vi: ILLEGAL_COMMAND_ERROR while resetting command buffer\n"); 779 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
565 780
566 iommu->reset_in_progress = true; 781 while (level > map_size) {
782 if (!IOMMU_PTE_PRESENT(*pte))
783 return NULL;
567 784
568 amd_iommu_reset_cmd_buffer(iommu); 785 level -= 1;
569 flush_all_devices_for_iommu(iommu);
570 flush_all_domains_on_iommu(iommu);
571 786
572 iommu->reset_in_progress = false; 787 pte = IOMMU_PTE_PAGE(*pte);
573} 788 pte = &pte[PM_LEVEL_INDEX(level, address)];
574 789
575void amd_iommu_flush_all_devices(void) 790 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) {
576{ 791 pte = NULL;
577 flush_devices_by_domain(NULL); 792 break;
578} 793 }
794 }
579 795
580/**************************************************************************** 796 return pte;
581 * 797}
582 * The functions below are used the create the page table mappings for
583 * unity mapped regions.
584 *
585 ****************************************************************************/
586 798
587/* 799/*
588 * Generic mapping functions. It maps a physical address into a DMA 800 * Generic mapping functions. It maps a physical address into a DMA
@@ -654,28 +866,6 @@ static int iommu_for_unity_map(struct amd_iommu *iommu,
654} 866}
655 867
656/* 868/*
657 * Init the unity mappings for a specific IOMMU in the system
658 *
659 * Basically iterates over all unity mapping entries and applies them to
660 * the default domain DMA of that IOMMU if necessary.
661 */
662static int iommu_init_unity_mappings(struct amd_iommu *iommu)
663{
664 struct unity_map_entry *entry;
665 int ret;
666
667 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
668 if (!iommu_for_unity_map(iommu, entry))
669 continue;
670 ret = dma_ops_unity_map(iommu->default_dom, entry);
671 if (ret)
672 return ret;
673 }
674
675 return 0;
676}
677
678/*
679 * This function actually applies the mapping to the page table of the 869 * This function actually applies the mapping to the page table of the
680 * dma_ops domain. 870 * dma_ops domain.
681 */ 871 */
@@ -704,6 +894,28 @@ static int dma_ops_unity_map(struct dma_ops_domain *dma_dom,
704} 894}
705 895
706/* 896/*
897 * Init the unity mappings for a specific IOMMU in the system
898 *
899 * Basically iterates over all unity mapping entries and applies them to
900 * the default domain DMA of that IOMMU if necessary.
901 */
902static int iommu_init_unity_mappings(struct amd_iommu *iommu)
903{
904 struct unity_map_entry *entry;
905 int ret;
906
907 list_for_each_entry(entry, &amd_iommu_unity_map, list) {
908 if (!iommu_for_unity_map(iommu, entry))
909 continue;
910 ret = dma_ops_unity_map(iommu->default_dom, entry);
911 if (ret)
912 return ret;
913 }
914
915 return 0;
916}
917
918/*
707 * Inits the unity mappings required for a specific device 919 * Inits the unity mappings required for a specific device
708 */ 920 */
709static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom, 921static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
@@ -740,34 +952,23 @@ static int init_unity_mappings_for_device(struct dma_ops_domain *dma_dom,
740 */ 952 */
741 953
742/* 954/*
743 * This function checks if there is a PTE for a given dma address. If 955 * Used to reserve address ranges in the aperture (e.g. for exclusion
744 * there is one, it returns the pointer to it. 956 * ranges.
745 */ 957 */
746static u64 *fetch_pte(struct protection_domain *domain, 958static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
747 unsigned long address, int map_size) 959 unsigned long start_page,
960 unsigned int pages)
748{ 961{
749 int level; 962 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
750 u64 *pte;
751
752 level = domain->mode - 1;
753 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
754
755 while (level > map_size) {
756 if (!IOMMU_PTE_PRESENT(*pte))
757 return NULL;
758
759 level -= 1;
760 963
761 pte = IOMMU_PTE_PAGE(*pte); 964 if (start_page + pages > last_page)
762 pte = &pte[PM_LEVEL_INDEX(level, address)]; 965 pages = last_page - start_page;
763 966
764 if ((PM_PTE_LEVEL(*pte) == 0) && level != map_size) { 967 for (i = start_page; i < start_page + pages; ++i) {
765 pte = NULL; 968 int index = i / APERTURE_RANGE_PAGES;
766 break; 969 int page = i % APERTURE_RANGE_PAGES;
767 } 970 __set_bit(page, dom->aperture[index]->bitmap);
768 } 971 }
769
770 return pte;
771} 972}
772 973
773/* 974/*
@@ -775,12 +976,12 @@ static u64 *fetch_pte(struct protection_domain *domain,
775 * aperture in case of dma_ops domain allocation or address allocation 976 * aperture in case of dma_ops domain allocation or address allocation
776 * failure. 977 * failure.
777 */ 978 */
778static int alloc_new_range(struct amd_iommu *iommu, 979static int alloc_new_range(struct dma_ops_domain *dma_dom,
779 struct dma_ops_domain *dma_dom,
780 bool populate, gfp_t gfp) 980 bool populate, gfp_t gfp)
781{ 981{
782 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT; 982 int index = dma_dom->aperture_size >> APERTURE_RANGE_SHIFT;
783 int i; 983 struct amd_iommu *iommu;
984 unsigned long i;
784 985
785#ifdef CONFIG_IOMMU_STRESS 986#ifdef CONFIG_IOMMU_STRESS
786 populate = false; 987 populate = false;
@@ -819,14 +1020,17 @@ static int alloc_new_range(struct amd_iommu *iommu,
819 dma_dom->aperture_size += APERTURE_RANGE_SIZE; 1020 dma_dom->aperture_size += APERTURE_RANGE_SIZE;
820 1021
821 /* Intialize the exclusion range if necessary */ 1022 /* Intialize the exclusion range if necessary */
822 if (iommu->exclusion_start && 1023 for_each_iommu(iommu) {
823 iommu->exclusion_start >= dma_dom->aperture[index]->offset && 1024 if (iommu->exclusion_start &&
824 iommu->exclusion_start < dma_dom->aperture_size) { 1025 iommu->exclusion_start >= dma_dom->aperture[index]->offset
825 unsigned long startpage = iommu->exclusion_start >> PAGE_SHIFT; 1026 && iommu->exclusion_start < dma_dom->aperture_size) {
826 int pages = iommu_num_pages(iommu->exclusion_start, 1027 unsigned long startpage;
827 iommu->exclusion_length, 1028 int pages = iommu_num_pages(iommu->exclusion_start,
828 PAGE_SIZE); 1029 iommu->exclusion_length,
829 dma_ops_reserve_addresses(dma_dom, startpage, pages); 1030 PAGE_SIZE);
1031 startpage = iommu->exclusion_start >> PAGE_SHIFT;
1032 dma_ops_reserve_addresses(dma_dom, startpage, pages);
1033 }
830 } 1034 }
831 1035
832 /* 1036 /*
@@ -928,7 +1132,7 @@ static unsigned long dma_ops_alloc_addresses(struct device *dev,
928 } 1132 }
929 1133
930 if (unlikely(address == -1)) 1134 if (unlikely(address == -1))
931 address = bad_dma_address; 1135 address = DMA_ERROR_CODE;
932 1136
933 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size); 1137 WARN_ON((address + (PAGE_SIZE*pages)) > dom->aperture_size);
934 1138
@@ -959,7 +1163,7 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
959 1163
960 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT; 1164 address = (address % APERTURE_RANGE_SIZE) >> PAGE_SHIFT;
961 1165
962 iommu_area_free(range->bitmap, address, pages); 1166 bitmap_clear(range->bitmap, address, pages);
963 1167
964} 1168}
965 1169
@@ -973,6 +1177,31 @@ static void dma_ops_free_addresses(struct dma_ops_domain *dom,
973 * 1177 *
974 ****************************************************************************/ 1178 ****************************************************************************/
975 1179
1180/*
1181 * This function adds a protection domain to the global protection domain list
1182 */
1183static void add_domain_to_list(struct protection_domain *domain)
1184{
1185 unsigned long flags;
1186
1187 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1188 list_add(&domain->list, &amd_iommu_pd_list);
1189 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1190}
1191
1192/*
1193 * This function removes a protection domain from the global
1194 * protection domain list
1195 */
1196static void del_domain_from_list(struct protection_domain *domain)
1197{
1198 unsigned long flags;
1199
1200 spin_lock_irqsave(&amd_iommu_pd_lock, flags);
1201 list_del(&domain->list);
1202 spin_unlock_irqrestore(&amd_iommu_pd_lock, flags);
1203}
1204
976static u16 domain_id_alloc(void) 1205static u16 domain_id_alloc(void)
977{ 1206{
978 unsigned long flags; 1207 unsigned long flags;
@@ -1000,26 +1229,6 @@ static void domain_id_free(int id)
1000 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1229 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1001} 1230}
1002 1231
1003/*
1004 * Used to reserve address ranges in the aperture (e.g. for exclusion
1005 * ranges.
1006 */
1007static void dma_ops_reserve_addresses(struct dma_ops_domain *dom,
1008 unsigned long start_page,
1009 unsigned int pages)
1010{
1011 unsigned int i, last_page = dom->aperture_size >> PAGE_SHIFT;
1012
1013 if (start_page + pages > last_page)
1014 pages = last_page - start_page;
1015
1016 for (i = start_page; i < start_page + pages; ++i) {
1017 int index = i / APERTURE_RANGE_PAGES;
1018 int page = i % APERTURE_RANGE_PAGES;
1019 __set_bit(page, dom->aperture[index]->bitmap);
1020 }
1021}
1022
1023static void free_pagetable(struct protection_domain *domain) 1232static void free_pagetable(struct protection_domain *domain)
1024{ 1233{
1025 int i, j; 1234 int i, j;
@@ -1061,6 +1270,8 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1061 if (!dom) 1270 if (!dom)
1062 return; 1271 return;
1063 1272
1273 del_domain_from_list(&dom->domain);
1274
1064 free_pagetable(&dom->domain); 1275 free_pagetable(&dom->domain);
1065 1276
1066 for (i = 0; i < APERTURE_MAX_RANGES; ++i) { 1277 for (i = 0; i < APERTURE_MAX_RANGES; ++i) {
@@ -1078,7 +1289,7 @@ static void dma_ops_domain_free(struct dma_ops_domain *dom)
1078 * It also intializes the page table and the address allocator data 1289 * It also intializes the page table and the address allocator data
1079 * structures required for the dma_ops interface 1290 * structures required for the dma_ops interface
1080 */ 1291 */
1081static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu) 1292static struct dma_ops_domain *dma_ops_domain_alloc(void)
1082{ 1293{
1083 struct dma_ops_domain *dma_dom; 1294 struct dma_ops_domain *dma_dom;
1084 1295
@@ -1091,6 +1302,7 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1091 dma_dom->domain.id = domain_id_alloc(); 1302 dma_dom->domain.id = domain_id_alloc();
1092 if (dma_dom->domain.id == 0) 1303 if (dma_dom->domain.id == 0)
1093 goto free_dma_dom; 1304 goto free_dma_dom;
1305 INIT_LIST_HEAD(&dma_dom->domain.dev_list);
1094 dma_dom->domain.mode = PAGE_MODE_2_LEVEL; 1306 dma_dom->domain.mode = PAGE_MODE_2_LEVEL;
1095 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL); 1307 dma_dom->domain.pt_root = (void *)get_zeroed_page(GFP_KERNEL);
1096 dma_dom->domain.flags = PD_DMA_OPS_MASK; 1308 dma_dom->domain.flags = PD_DMA_OPS_MASK;
@@ -1101,7 +1313,9 @@ static struct dma_ops_domain *dma_ops_domain_alloc(struct amd_iommu *iommu)
1101 dma_dom->need_flush = false; 1313 dma_dom->need_flush = false;
1102 dma_dom->target_dev = 0xffff; 1314 dma_dom->target_dev = 0xffff;
1103 1315
1104 if (alloc_new_range(iommu, dma_dom, true, GFP_KERNEL)) 1316 add_domain_to_list(&dma_dom->domain);
1317
1318 if (alloc_new_range(dma_dom, true, GFP_KERNEL))
1105 goto free_dma_dom; 1319 goto free_dma_dom;
1106 1320
1107 /* 1321 /*
@@ -1129,22 +1343,6 @@ static bool dma_ops_domain(struct protection_domain *domain)
1129 return domain->flags & PD_DMA_OPS_MASK; 1343 return domain->flags & PD_DMA_OPS_MASK;
1130} 1344}
1131 1345
1132/*
1133 * Find out the protection domain structure for a given PCI device. This
1134 * will give us the pointer to the page table root for example.
1135 */
1136static struct protection_domain *domain_for_device(u16 devid)
1137{
1138 struct protection_domain *dom;
1139 unsigned long flags;
1140
1141 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1142 dom = amd_iommu_pd_table[devid];
1143 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1144
1145 return dom;
1146}
1147
1148static void set_dte_entry(u16 devid, struct protection_domain *domain) 1346static void set_dte_entry(u16 devid, struct protection_domain *domain)
1149{ 1347{
1150 u64 pte_root = virt_to_phys(domain->pt_root); 1348 u64 pte_root = virt_to_phys(domain->pt_root);
@@ -1156,42 +1354,123 @@ static void set_dte_entry(u16 devid, struct protection_domain *domain)
1156 amd_iommu_dev_table[devid].data[2] = domain->id; 1354 amd_iommu_dev_table[devid].data[2] = domain->id;
1157 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root); 1355 amd_iommu_dev_table[devid].data[1] = upper_32_bits(pte_root);
1158 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root); 1356 amd_iommu_dev_table[devid].data[0] = lower_32_bits(pte_root);
1357}
1358
1359static void clear_dte_entry(u16 devid)
1360{
1361 /* remove entry from the device table seen by the hardware */
1362 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1363 amd_iommu_dev_table[devid].data[1] = 0;
1364 amd_iommu_dev_table[devid].data[2] = 0;
1365
1366 amd_iommu_apply_erratum_63(devid);
1367}
1368
1369static void do_attach(struct device *dev, struct protection_domain *domain)
1370{
1371 struct iommu_dev_data *dev_data;
1372 struct amd_iommu *iommu;
1373 u16 devid;
1374
1375 devid = get_device_id(dev);
1376 iommu = amd_iommu_rlookup_table[devid];
1377 dev_data = get_dev_data(dev);
1378
1379 /* Update data structures */
1380 dev_data->domain = domain;
1381 list_add(&dev_data->list, &domain->dev_list);
1382 set_dte_entry(devid, domain);
1383
1384 /* Do reference counting */
1385 domain->dev_iommu[iommu->index] += 1;
1386 domain->dev_cnt += 1;
1159 1387
1160 amd_iommu_pd_table[devid] = domain; 1388 /* Flush the DTE entry */
1389 iommu_flush_device(dev);
1390}
1391
1392static void do_detach(struct device *dev)
1393{
1394 struct iommu_dev_data *dev_data;
1395 struct amd_iommu *iommu;
1396 u16 devid;
1397
1398 devid = get_device_id(dev);
1399 iommu = amd_iommu_rlookup_table[devid];
1400 dev_data = get_dev_data(dev);
1401
1402 /* decrease reference counters */
1403 dev_data->domain->dev_iommu[iommu->index] -= 1;
1404 dev_data->domain->dev_cnt -= 1;
1405
1406 /* Update data structures */
1407 dev_data->domain = NULL;
1408 list_del(&dev_data->list);
1409 clear_dte_entry(devid);
1410
1411 /* Flush the DTE entry */
1412 iommu_flush_device(dev);
1161} 1413}
1162 1414
1163/* 1415/*
1164 * If a device is not yet associated with a domain, this function 1416 * If a device is not yet associated with a domain, this function
1165 * assigns it to the domain and makes this visible to the hardware 1417 * assigns it to the domain and makes this visible to the hardware
1166 */ 1418 */
1167static void __attach_device(struct amd_iommu *iommu, 1419static int __attach_device(struct device *dev,
1168 struct protection_domain *domain, 1420 struct protection_domain *domain)
1169 u16 devid)
1170{ 1421{
1422 struct iommu_dev_data *dev_data, *alias_data;
1423
1424 dev_data = get_dev_data(dev);
1425 alias_data = get_dev_data(dev_data->alias);
1426
1427 if (!alias_data)
1428 return -EINVAL;
1429
1171 /* lock domain */ 1430 /* lock domain */
1172 spin_lock(&domain->lock); 1431 spin_lock(&domain->lock);
1173 1432
1174 /* update DTE entry */ 1433 /* Some sanity checks */
1175 set_dte_entry(devid, domain); 1434 if (alias_data->domain != NULL &&
1435 alias_data->domain != domain)
1436 return -EBUSY;
1176 1437
1177 domain->dev_cnt += 1; 1438 if (dev_data->domain != NULL &&
1439 dev_data->domain != domain)
1440 return -EBUSY;
1441
1442 /* Do real assignment */
1443 if (dev_data->alias != dev) {
1444 alias_data = get_dev_data(dev_data->alias);
1445 if (alias_data->domain == NULL)
1446 do_attach(dev_data->alias, domain);
1447
1448 atomic_inc(&alias_data->bind);
1449 }
1450
1451 if (dev_data->domain == NULL)
1452 do_attach(dev, domain);
1453
1454 atomic_inc(&dev_data->bind);
1178 1455
1179 /* ready */ 1456 /* ready */
1180 spin_unlock(&domain->lock); 1457 spin_unlock(&domain->lock);
1458
1459 return 0;
1181} 1460}
1182 1461
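The attach path above (__attach_device together with do_attach/do_detach) binds both a device and its PCI alias to the domain and tracks each bind with an atomic counter, so a later detach only tears the binding down when the last user drops away. Below is a minimal, stand-alone user-space sketch of that reference-counting scheme; the types and function names are hypothetical simplifications (they are not the driver's own), and the DTE/IOMMU side effects are reduced to pointer updates.

#include <stdatomic.h>
#include <stdio.h>

struct domain { int id; };

struct dev_data {
        struct dev_data *alias;   /* points to itself if the device has no alias */
        struct domain   *domain;  /* domain the device is currently bound to     */
        atomic_int       bind;    /* number of outstanding attach calls          */
};

/* In the driver these would also write/clear the device table entry. */
static void do_attach(struct dev_data *d, struct domain *dom) { d->domain = dom; }
static void do_detach(struct dev_data *d)                     { d->domain = NULL; }

static void attach(struct dev_data *d, struct domain *dom)
{
        if (d->alias != d) {
                if (d->alias->domain == NULL)
                        do_attach(d->alias, dom);
                atomic_fetch_add(&d->alias->bind, 1);
        }
        if (d->domain == NULL)
                do_attach(d, dom);
        atomic_fetch_add(&d->bind, 1);
}

static void detach(struct dev_data *d)
{
        /* fetch_sub returns the old value: "== 1" means the count hit zero */
        if (d->alias != d && atomic_fetch_sub(&d->alias->bind, 1) == 1)
                do_detach(d->alias);
        if (atomic_fetch_sub(&d->bind, 1) == 1)
                do_detach(d);
}

int main(void)
{
        struct domain dom = { .id = 1 };
        struct dev_data alias = { .alias = &alias };
        struct dev_data dev   = { .alias = &alias };

        attach(&dev, &dom);
        printf("dev -> domain %d, alias -> domain %d\n",
               dev.domain->id, alias.domain->id);

        detach(&dev);
        printf("after detach: dev %s, alias %s\n",
               dev.domain ? "bound" : "free", alias.domain ? "bound" : "free");
        return 0;
}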
1183/* 1462/*
1184 * If a device is not yet associated with a domain, this function 1463 * If a device is not yet associated with a domain, this function
1185 * assigns it to the domain and makes this visible to the hardware 1464 * assigns it to the domain and makes this visible to the hardware
1186 */ 1465 */
1187static void attach_device(struct amd_iommu *iommu, 1466static int attach_device(struct device *dev,
1188 struct protection_domain *domain, 1467 struct protection_domain *domain)
1189 u16 devid)
1190{ 1468{
1191 unsigned long flags; 1469 unsigned long flags;
1470 int ret;
1192 1471
1193 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1472 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1194 __attach_device(iommu, domain, devid); 1473 ret = __attach_device(dev, domain);
1195 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1474 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1196 1475
1197 /* 1476 /*
@@ -1199,98 +1478,130 @@ static void attach_device(struct amd_iommu *iommu,
1199 * left the caches in the IOMMU dirty. So we have to flush 1478 * left the caches in the IOMMU dirty. So we have to flush
1200 * here to evict all dirty stuff. 1479 * here to evict all dirty stuff.
1201 */ 1480 */
1202 iommu_queue_inv_dev_entry(iommu, devid); 1481 iommu_flush_tlb_pde(domain);
1203 iommu_flush_tlb_pde(iommu, domain->id); 1482
1483 return ret;
1204} 1484}
1205 1485
1206/* 1486/*
1207 * Removes a device from a protection domain (unlocked) 1487 * Removes a device from a protection domain (unlocked)
1208 */ 1488 */
1209static void __detach_device(struct protection_domain *domain, u16 devid) 1489static void __detach_device(struct device *dev)
1210{ 1490{
1491 struct iommu_dev_data *dev_data = get_dev_data(dev);
1492 struct iommu_dev_data *alias_data;
1493 struct protection_domain *domain;
1494 unsigned long flags;
1211 1495
1212 /* lock domain */ 1496 BUG_ON(!dev_data->domain);
1213 spin_lock(&domain->lock);
1214 1497
1215 /* remove domain from the lookup table */ 1498 domain = dev_data->domain;
1216 amd_iommu_pd_table[devid] = NULL;
1217 1499
1218 /* remove entry from the device table seen by the hardware */ 1500 spin_lock_irqsave(&domain->lock, flags);
1219 amd_iommu_dev_table[devid].data[0] = IOMMU_PTE_P | IOMMU_PTE_TV;
1220 amd_iommu_dev_table[devid].data[1] = 0;
1221 amd_iommu_dev_table[devid].data[2] = 0;
1222 1501
1223 amd_iommu_apply_erratum_63(devid); 1502 if (dev_data->alias != dev) {
1503 alias_data = get_dev_data(dev_data->alias);
1504 if (atomic_dec_and_test(&alias_data->bind))
1505 do_detach(dev_data->alias);
1506 }
1224 1507
1225 /* decrease reference counter */ 1508 if (atomic_dec_and_test(&dev_data->bind))
1226 domain->dev_cnt -= 1; 1509 do_detach(dev);
1227 1510
1228 /* ready */ 1511 spin_unlock_irqrestore(&domain->lock, flags);
1229 spin_unlock(&domain->lock);
1230 1512
1231 /* 1513 /*
1232 * If we run in passthrough mode the device must be assigned to the 1514 * If we run in passthrough mode the device must be assigned to the
1233 * passthrough domain if it is detached from any other domain 1515 * passthrough domain if it is detached from any other domain.
1516 * Make sure we can deassign from the pt_domain itself.
1234 */ 1517 */
1235 if (iommu_pass_through) { 1518 if (iommu_pass_through &&
1236 struct amd_iommu *iommu = amd_iommu_rlookup_table[devid]; 1519 (dev_data->domain == NULL && domain != pt_domain))
1237 __attach_device(iommu, pt_domain, devid); 1520 __attach_device(dev, pt_domain);
1238 }
1239} 1521}
1240 1522
1241/* 1523/*
1242 * Removes a device from a protection domain (with devtable_lock held) 1524 * Removes a device from a protection domain (with devtable_lock held)
1243 */ 1525 */
1244static void detach_device(struct protection_domain *domain, u16 devid) 1526static void detach_device(struct device *dev)
1245{ 1527{
1246 unsigned long flags; 1528 unsigned long flags;
1247 1529
1248 /* lock device table */ 1530 /* lock device table */
1249 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 1531 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1250 __detach_device(domain, devid); 1532 __detach_device(dev);
1251 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 1533 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1252} 1534}
1253 1535
1536/*
1537 * Find out the protection domain structure for a given PCI device. This
1538 * will give us the pointer to the page table root for example.
1539 */
1540static struct protection_domain *domain_for_device(struct device *dev)
1541{
1542 struct protection_domain *dom;
1543 struct iommu_dev_data *dev_data, *alias_data;
1544 unsigned long flags;
1545 u16 devid, alias;
1546
1547 devid = get_device_id(dev);
1548 alias = amd_iommu_alias_table[devid];
1549 dev_data = get_dev_data(dev);
1550 alias_data = get_dev_data(dev_data->alias);
1551 if (!alias_data)
1552 return NULL;
1553
1554 read_lock_irqsave(&amd_iommu_devtable_lock, flags);
1555 dom = dev_data->domain;
1556 if (dom == NULL &&
1557 alias_data->domain != NULL) {
1558 __attach_device(dev, alias_data->domain);
1559 dom = alias_data->domain;
1560 }
1561
1562 read_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1563
1564 return dom;
1565}
1566
1254static int device_change_notifier(struct notifier_block *nb, 1567static int device_change_notifier(struct notifier_block *nb,
1255 unsigned long action, void *data) 1568 unsigned long action, void *data)
1256{ 1569{
1257 struct device *dev = data; 1570 struct device *dev = data;
1258 struct pci_dev *pdev = to_pci_dev(dev); 1571 u16 devid;
1259 u16 devid = calc_devid(pdev->bus->number, pdev->devfn);
1260 struct protection_domain *domain; 1572 struct protection_domain *domain;
1261 struct dma_ops_domain *dma_domain; 1573 struct dma_ops_domain *dma_domain;
1262 struct amd_iommu *iommu; 1574 struct amd_iommu *iommu;
1263 unsigned long flags; 1575 unsigned long flags;
1264 1576
1265 if (devid > amd_iommu_last_bdf) 1577 if (!check_device(dev))
1266 goto out; 1578 return 0;
1267
1268 devid = amd_iommu_alias_table[devid];
1269
1270 iommu = amd_iommu_rlookup_table[devid];
1271 if (iommu == NULL)
1272 goto out;
1273
1274 domain = domain_for_device(devid);
1275 1579
1276 if (domain && !dma_ops_domain(domain)) 1580 devid = get_device_id(dev);
1277 WARN_ONCE(1, "AMD IOMMU WARNING: device %s already bound " 1581 iommu = amd_iommu_rlookup_table[devid];
1278 "to a non-dma-ops domain\n", dev_name(dev));
1279 1582
1280 switch (action) { 1583 switch (action) {
1281 case BUS_NOTIFY_UNBOUND_DRIVER: 1584 case BUS_NOTIFY_UNBOUND_DRIVER:
1585
1586 domain = domain_for_device(dev);
1587
1282 if (!domain) 1588 if (!domain)
1283 goto out; 1589 goto out;
1284 if (iommu_pass_through) 1590 if (iommu_pass_through)
1285 break; 1591 break;
1286 detach_device(domain, devid); 1592 detach_device(dev);
1287 break; 1593 break;
1288 case BUS_NOTIFY_ADD_DEVICE: 1594 case BUS_NOTIFY_ADD_DEVICE:
1595
1596 iommu_init_device(dev);
1597
1598 domain = domain_for_device(dev);
1599
1289 /* allocate a protection domain if a device is added */ 1600 /* allocate a protection domain if a device is added */
1290 dma_domain = find_protection_domain(devid); 1601 dma_domain = find_protection_domain(devid);
1291 if (dma_domain) 1602 if (dma_domain)
1292 goto out; 1603 goto out;
1293 dma_domain = dma_ops_domain_alloc(iommu); 1604 dma_domain = dma_ops_domain_alloc();
1294 if (!dma_domain) 1605 if (!dma_domain)
1295 goto out; 1606 goto out;
1296 dma_domain->target_dev = devid; 1607 dma_domain->target_dev = devid;
@@ -1300,11 +1611,15 @@ static int device_change_notifier(struct notifier_block *nb,
1300 spin_unlock_irqrestore(&iommu_pd_list_lock, flags); 1611 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1301 1612
1302 break; 1613 break;
1614 case BUS_NOTIFY_DEL_DEVICE:
1615
1616 iommu_uninit_device(dev);
1617
1303 default: 1618 default:
1304 goto out; 1619 goto out;
1305 } 1620 }
1306 1621
1307 iommu_queue_inv_dev_entry(iommu, devid); 1622 iommu_flush_device(dev);
1308 iommu_completion_wait(iommu); 1623 iommu_completion_wait(iommu);
1309 1624
1310out: 1625out:
@@ -1315,6 +1630,11 @@ static struct notifier_block device_nb = {
1315 .notifier_call = device_change_notifier, 1630 .notifier_call = device_change_notifier,
1316}; 1631};
1317 1632
1633void amd_iommu_init_notifier(void)
1634{
1635 bus_register_notifier(&pci_bus_type, &device_nb);
1636}
1637
1318/***************************************************************************** 1638/*****************************************************************************
1319 * 1639 *
1320 * The next functions belong to the dma_ops mapping/unmapping code. 1640 * The next functions belong to the dma_ops mapping/unmapping code.
@@ -1322,106 +1642,46 @@ static struct notifier_block device_nb = {
1322 *****************************************************************************/ 1642 *****************************************************************************/
1323 1643
1324/* 1644/*
1325 * This function checks if the driver got a valid device from the caller to
1326 * avoid dereferencing invalid pointers.
1327 */
1328static bool check_device(struct device *dev)
1329{
1330 if (!dev || !dev->dma_mask)
1331 return false;
1332
1333 return true;
1334}
1335
1336/*
1337 * In this function the list of preallocated protection domains is traversed to
1338 * find the domain for a specific device
1339 */
1340static struct dma_ops_domain *find_protection_domain(u16 devid)
1341{
1342 struct dma_ops_domain *entry, *ret = NULL;
1343 unsigned long flags;
1344
1345 if (list_empty(&iommu_pd_list))
1346 return NULL;
1347
1348 spin_lock_irqsave(&iommu_pd_list_lock, flags);
1349
1350 list_for_each_entry(entry, &iommu_pd_list, list) {
1351 if (entry->target_dev == devid) {
1352 ret = entry;
1353 break;
1354 }
1355 }
1356
1357 spin_unlock_irqrestore(&iommu_pd_list_lock, flags);
1358
1359 return ret;
1360}
1361
1362/*
1363 * In the dma_ops path we only have the struct device. This function 1645 * In the dma_ops path we only have the struct device. This function
1364 * finds the corresponding IOMMU, the protection domain and the 1646 * finds the corresponding IOMMU, the protection domain and the
1365 * requestor id for a given device. 1647 * requestor id for a given device.
1366 * If the device is not yet associated with a domain this is also done 1648 * If the device is not yet associated with a domain this is also done
1367 * in this function. 1649 * in this function.
1368 */ 1650 */
1369static int get_device_resources(struct device *dev, 1651static struct protection_domain *get_domain(struct device *dev)
1370 struct amd_iommu **iommu,
1371 struct protection_domain **domain,
1372 u16 *bdf)
1373{ 1652{
1653 struct protection_domain *domain;
1374 struct dma_ops_domain *dma_dom; 1654 struct dma_ops_domain *dma_dom;
1375 struct pci_dev *pcidev; 1655 u16 devid = get_device_id(dev);
1376 u16 _bdf;
1377
1378 *iommu = NULL;
1379 *domain = NULL;
1380 *bdf = 0xffff;
1381
1382 if (dev->bus != &pci_bus_type)
1383 return 0;
1384
1385 pcidev = to_pci_dev(dev);
1386 _bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
1387 1656
1388 /* device not translated by any IOMMU in the system? */ 1657 if (!check_device(dev))
1389 if (_bdf > amd_iommu_last_bdf) 1658 return ERR_PTR(-EINVAL);
1390 return 0;
1391 1659
1392 *bdf = amd_iommu_alias_table[_bdf]; 1660 domain = domain_for_device(dev);
1661 if (domain != NULL && !dma_ops_domain(domain))
1662 return ERR_PTR(-EBUSY);
1393 1663
1394 *iommu = amd_iommu_rlookup_table[*bdf]; 1664 if (domain != NULL)
1395 if (*iommu == NULL) 1665 return domain;
1396 return 0;
1397 *domain = domain_for_device(*bdf);
1398 if (*domain == NULL) {
1399 dma_dom = find_protection_domain(*bdf);
1400 if (!dma_dom)
1401 dma_dom = (*iommu)->default_dom;
1402 *domain = &dma_dom->domain;
1403 attach_device(*iommu, *domain, *bdf);
1404 DUMP_printk("Using protection domain %d for device %s\n",
1405 (*domain)->id, dev_name(dev));
1406 }
1407 1666
1408 if (domain_for_device(_bdf) == NULL) 1667 /* Device not bound yet - bind it */
1409 attach_device(*iommu, *domain, _bdf); 1668 dma_dom = find_protection_domain(devid);
1669 if (!dma_dom)
1670 dma_dom = amd_iommu_rlookup_table[devid]->default_dom;
1671 attach_device(dev, &dma_dom->domain);
1672 DUMP_printk("Using protection domain %d for device %s\n",
1673 dma_dom->domain.id, dev_name(dev));
1410 1674
1411 return 1; 1675 return &dma_dom->domain;
1412} 1676}
1413 1677
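get_domain() above folds its error reporting into the returned pointer: callers later distinguish "no IOMMU handles this device" (-EINVAL, fall back to a direct mapping) from other failures by decoding the pointer with PTR_ERR()/IS_ERR(). The kernel's own versions of these helpers live in err.h; the following is only a small, self-contained sketch of the idiom, with an invented lookup function for illustration.

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095  /* errno values fit into the top page of the address space */

static inline void *ERR_PTR(long error)      { return (void *)error; }
static inline long  PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int   IS_ERR(const void *ptr)
{
        return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

/* Hypothetical lookup: devid 0 is "not translated", odd devids are "busy". */
static void *get_domain(int devid)
{
        static int the_domain = 42;

        if (devid == 0)
                return ERR_PTR(-EINVAL);
        if (devid & 1)
                return ERR_PTR(-EBUSY);
        return &the_domain;
}

int main(void)
{
        for (int devid = 0; devid < 3; devid++) {
                void *dom = get_domain(devid);

                if (PTR_ERR(dom) == -EINVAL)
                        printf("devid %d: not handled, use direct mapping\n", devid);
                else if (IS_ERR(dom))
                        printf("devid %d: error %ld\n", devid, PTR_ERR(dom));
                else
                        printf("devid %d: domain %d\n", devid, *(int *)dom);
        }
        return 0;
}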
1414static void update_device_table(struct protection_domain *domain) 1678static void update_device_table(struct protection_domain *domain)
1415{ 1679{
1416 unsigned long flags; 1680 struct iommu_dev_data *dev_data;
1417 int i;
1418 1681
1419 for (i = 0; i <= amd_iommu_last_bdf; ++i) { 1682 list_for_each_entry(dev_data, &domain->dev_list, list) {
1420 if (amd_iommu_pd_table[i] != domain) 1683 u16 devid = get_device_id(dev_data->dev);
1421 continue; 1684 set_dte_entry(devid, domain);
1422 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
1423 set_dte_entry(i, domain);
1424 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
1425 } 1685 }
1426} 1686}
1427 1687
@@ -1431,76 +1691,13 @@ static void update_domain(struct protection_domain *domain)
1431 return; 1691 return;
1432 1692
1433 update_device_table(domain); 1693 update_device_table(domain);
1434 flush_devices_by_domain(domain); 1694 iommu_flush_domain_devices(domain);
1435 iommu_flush_domain(domain->id); 1695 iommu_flush_tlb_pde(domain);
1436 1696
1437 domain->updated = false; 1697 domain->updated = false;
1438} 1698}
1439 1699
1440/* 1700/*
1441 * This function is used to add another level to an IO page table. Adding
1442 * another level increases the size of the address space by 9 bits to a size up
1443 * to 64 bits.
1444 */
1445static bool increase_address_space(struct protection_domain *domain,
1446 gfp_t gfp)
1447{
1448 u64 *pte;
1449
1450 if (domain->mode == PAGE_MODE_6_LEVEL)
1451 /* address space already 64 bit large */
1452 return false;
1453
1454 pte = (void *)get_zeroed_page(gfp);
1455 if (!pte)
1456 return false;
1457
1458 *pte = PM_LEVEL_PDE(domain->mode,
1459 virt_to_phys(domain->pt_root));
1460 domain->pt_root = pte;
1461 domain->mode += 1;
1462 domain->updated = true;
1463
1464 return true;
1465}
1466
1467static u64 *alloc_pte(struct protection_domain *domain,
1468 unsigned long address,
1469 int end_lvl,
1470 u64 **pte_page,
1471 gfp_t gfp)
1472{
1473 u64 *pte, *page;
1474 int level;
1475
1476 while (address > PM_LEVEL_SIZE(domain->mode))
1477 increase_address_space(domain, gfp);
1478
1479 level = domain->mode - 1;
1480 pte = &domain->pt_root[PM_LEVEL_INDEX(level, address)];
1481
1482 while (level > end_lvl) {
1483 if (!IOMMU_PTE_PRESENT(*pte)) {
1484 page = (u64 *)get_zeroed_page(gfp);
1485 if (!page)
1486 return NULL;
1487 *pte = PM_LEVEL_PDE(level, virt_to_phys(page));
1488 }
1489
1490 level -= 1;
1491
1492 pte = IOMMU_PTE_PAGE(*pte);
1493
1494 if (pte_page && level == end_lvl)
1495 *pte_page = pte;
1496
1497 pte = &pte[PM_LEVEL_INDEX(level, address)];
1498 }
1499
1500 return pte;
1501}
1502
1503/*
1504 * This function fetches the PTE for a given address in the aperture 1701 * This function fetches the PTE for a given address in the aperture
1505 */ 1702 */
1506static u64* dma_ops_get_pte(struct dma_ops_domain *dom, 1703static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
@@ -1530,8 +1727,7 @@ static u64* dma_ops_get_pte(struct dma_ops_domain *dom,
1530 * This is the generic map function. It maps one 4kb page at paddr to 1727 * This is the generic map function. It maps one 4kb page at paddr to
1531 * the given address in the DMA address space for the domain. 1728 * the given address in the DMA address space for the domain.
1532 */ 1729 */
1533static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu, 1730static dma_addr_t dma_ops_domain_map(struct dma_ops_domain *dom,
1534 struct dma_ops_domain *dom,
1535 unsigned long address, 1731 unsigned long address,
1536 phys_addr_t paddr, 1732 phys_addr_t paddr,
1537 int direction) 1733 int direction)
@@ -1544,7 +1740,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1544 1740
1545 pte = dma_ops_get_pte(dom, address); 1741 pte = dma_ops_get_pte(dom, address);
1546 if (!pte) 1742 if (!pte)
1547 return bad_dma_address; 1743 return DMA_ERROR_CODE;
1548 1744
1549 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC; 1745 __pte = paddr | IOMMU_PTE_P | IOMMU_PTE_FC;
1550 1746
@@ -1565,8 +1761,7 @@ static dma_addr_t dma_ops_domain_map(struct amd_iommu *iommu,
1565/* 1761/*
1566 * The generic unmapping function for one page in the DMA address space. 1762 * The generic unmapping function for one page in the DMA address space.
1567 */ 1763 */
1568static void dma_ops_domain_unmap(struct amd_iommu *iommu, 1764static void dma_ops_domain_unmap(struct dma_ops_domain *dom,
1569 struct dma_ops_domain *dom,
1570 unsigned long address) 1765 unsigned long address)
1571{ 1766{
1572 struct aperture_range *aperture; 1767 struct aperture_range *aperture;
@@ -1597,7 +1792,6 @@ static void dma_ops_domain_unmap(struct amd_iommu *iommu,
1597 * Must be called with the domain lock held. 1792 * Must be called with the domain lock held.
1598 */ 1793 */
1599static dma_addr_t __map_single(struct device *dev, 1794static dma_addr_t __map_single(struct device *dev,
1600 struct amd_iommu *iommu,
1601 struct dma_ops_domain *dma_dom, 1795 struct dma_ops_domain *dma_dom,
1602 phys_addr_t paddr, 1796 phys_addr_t paddr,
1603 size_t size, 1797 size_t size,
@@ -1625,7 +1819,7 @@ static dma_addr_t __map_single(struct device *dev,
1625retry: 1819retry:
1626 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask, 1820 address = dma_ops_alloc_addresses(dev, dma_dom, pages, align_mask,
1627 dma_mask); 1821 dma_mask);
1628 if (unlikely(address == bad_dma_address)) { 1822 if (unlikely(address == DMA_ERROR_CODE)) {
1629 /* 1823 /*
1630 * setting next_address here will let the address 1824 * setting next_address here will let the address
1631 * allocator only scan the new allocated range in the 1825 * allocator only scan the new allocated range in the
@@ -1633,11 +1827,11 @@ retry:
1633 */ 1827 */
1634 dma_dom->next_address = dma_dom->aperture_size; 1828 dma_dom->next_address = dma_dom->aperture_size;
1635 1829
1636 if (alloc_new_range(iommu, dma_dom, false, GFP_ATOMIC)) 1830 if (alloc_new_range(dma_dom, false, GFP_ATOMIC))
1637 goto out; 1831 goto out;
1638 1832
1639 /* 1833 /*
1640 * aperture was sucessfully enlarged by 128 MB, try 1834 * aperture was successfully enlarged by 128 MB, try
1641 * allocation again 1835 * allocation again
1642 */ 1836 */
1643 goto retry; 1837 goto retry;
@@ -1645,8 +1839,8 @@ retry:
1645 1839
1646 start = address; 1840 start = address;
1647 for (i = 0; i < pages; ++i) { 1841 for (i = 0; i < pages; ++i) {
1648 ret = dma_ops_domain_map(iommu, dma_dom, start, paddr, dir); 1842 ret = dma_ops_domain_map(dma_dom, start, paddr, dir);
1649 if (ret == bad_dma_address) 1843 if (ret == DMA_ERROR_CODE)
1650 goto out_unmap; 1844 goto out_unmap;
1651 1845
1652 paddr += PAGE_SIZE; 1846 paddr += PAGE_SIZE;
@@ -1657,10 +1851,10 @@ retry:
1657 ADD_STATS_COUNTER(alloced_io_mem, size); 1851 ADD_STATS_COUNTER(alloced_io_mem, size);
1658 1852
1659 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) { 1853 if (unlikely(dma_dom->need_flush && !amd_iommu_unmap_flush)) {
1660 iommu_flush_tlb(iommu, dma_dom->domain.id); 1854 iommu_flush_tlb(&dma_dom->domain);
1661 dma_dom->need_flush = false; 1855 dma_dom->need_flush = false;
1662 } else if (unlikely(iommu_has_npcache(iommu))) 1856 } else if (unlikely(amd_iommu_np_cache))
1663 iommu_flush_pages(iommu, dma_dom->domain.id, address, size); 1857 iommu_flush_pages(&dma_dom->domain, address, size);
1664 1858
1665out: 1859out:
1666 return address; 1860 return address;
@@ -1669,20 +1863,19 @@ out_unmap:
1669 1863
1670 for (--i; i >= 0; --i) { 1864 for (--i; i >= 0; --i) {
1671 start -= PAGE_SIZE; 1865 start -= PAGE_SIZE;
1672 dma_ops_domain_unmap(iommu, dma_dom, start); 1866 dma_ops_domain_unmap(dma_dom, start);
1673 } 1867 }
1674 1868
1675 dma_ops_free_addresses(dma_dom, address, pages); 1869 dma_ops_free_addresses(dma_dom, address, pages);
1676 1870
1677 return bad_dma_address; 1871 return DMA_ERROR_CODE;
1678} 1872}
1679 1873
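__map_single() above does not fail on the first allocation miss: it asks alloc_new_range() to grow the aperture by another range and then retries the scan. A toy version of that grow-and-retry loop over a plain bump allocator is sketched below; the sizes, the growth step and the -1 error value are made up for illustration (they stand in for the 128 MB aperture ranges and DMA_ERROR_CODE).

#include <stdio.h>
#include <stdbool.h>

#define APERTURE_STEP 128   /* arbitrary units */

struct aperture {
        unsigned long size;  /* currently usable address space      */
        unsigned long next;  /* next free address (bump allocation) */
};

/* Pretend to add another range; refuse after an arbitrary limit. */
static bool alloc_new_range(struct aperture *ap)
{
        if (ap->size >= 4 * APERTURE_STEP)
                return false;
        ap->size += APERTURE_STEP;
        return true;
}

static long alloc_addresses(struct aperture *ap, unsigned long pages)
{
        if (ap->next + pages > ap->size)
                return -1;
        long addr = (long)ap->next;
        ap->next += pages;
        return addr;
}

static long map_single(struct aperture *ap, unsigned long pages)
{
        long addr;

retry:
        addr = alloc_addresses(ap, pages);
        if (addr < 0) {
                /* aperture exhausted - try to enlarge it, then scan again */
                if (!alloc_new_range(ap))
                        return -1;
                goto retry;
        }
        return addr;
}

int main(void)
{
        struct aperture ap = { .size = APERTURE_STEP, .next = 0 };

        for (int i = 0; i < 5; i++)
                printf("mapping %d -> %ld (aperture now %lu)\n",
                       i, map_single(&ap, 100), ap.size);
        return 0;
}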
1680/* 1874/*
1681 * Does the reverse of the __map_single function. Must be called with 1875 * Does the reverse of the __map_single function. Must be called with
1682 * the domain lock held too 1876 * the domain lock held too
1683 */ 1877 */
1684static void __unmap_single(struct amd_iommu *iommu, 1878static void __unmap_single(struct dma_ops_domain *dma_dom,
1685 struct dma_ops_domain *dma_dom,
1686 dma_addr_t dma_addr, 1879 dma_addr_t dma_addr,
1687 size_t size, 1880 size_t size,
1688 int dir) 1881 int dir)
@@ -1690,7 +1883,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1690 dma_addr_t i, start; 1883 dma_addr_t i, start;
1691 unsigned int pages; 1884 unsigned int pages;
1692 1885
1693 if ((dma_addr == bad_dma_address) || 1886 if ((dma_addr == DMA_ERROR_CODE) ||
1694 (dma_addr + size > dma_dom->aperture_size)) 1887 (dma_addr + size > dma_dom->aperture_size))
1695 return; 1888 return;
1696 1889
@@ -1699,7 +1892,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1699 start = dma_addr; 1892 start = dma_addr;
1700 1893
1701 for (i = 0; i < pages; ++i) { 1894 for (i = 0; i < pages; ++i) {
1702 dma_ops_domain_unmap(iommu, dma_dom, start); 1895 dma_ops_domain_unmap(dma_dom, start);
1703 start += PAGE_SIZE; 1896 start += PAGE_SIZE;
1704 } 1897 }
1705 1898
@@ -1708,7 +1901,7 @@ static void __unmap_single(struct amd_iommu *iommu,
1708 dma_ops_free_addresses(dma_dom, dma_addr, pages); 1901 dma_ops_free_addresses(dma_dom, dma_addr, pages);
1709 1902
1710 if (amd_iommu_unmap_flush || dma_dom->need_flush) { 1903 if (amd_iommu_unmap_flush || dma_dom->need_flush) {
1711 iommu_flush_pages(iommu, dma_dom->domain.id, dma_addr, size); 1904 iommu_flush_pages(&dma_dom->domain, dma_addr, size);
1712 dma_dom->need_flush = false; 1905 dma_dom->need_flush = false;
1713 } 1906 }
1714} 1907}
@@ -1722,36 +1915,29 @@ static dma_addr_t map_page(struct device *dev, struct page *page,
1722 struct dma_attrs *attrs) 1915 struct dma_attrs *attrs)
1723{ 1916{
1724 unsigned long flags; 1917 unsigned long flags;
1725 struct amd_iommu *iommu;
1726 struct protection_domain *domain; 1918 struct protection_domain *domain;
1727 u16 devid;
1728 dma_addr_t addr; 1919 dma_addr_t addr;
1729 u64 dma_mask; 1920 u64 dma_mask;
1730 phys_addr_t paddr = page_to_phys(page) + offset; 1921 phys_addr_t paddr = page_to_phys(page) + offset;
1731 1922
1732 INC_STATS_COUNTER(cnt_map_single); 1923 INC_STATS_COUNTER(cnt_map_single);
1733 1924
1734 if (!check_device(dev)) 1925 domain = get_domain(dev);
1735 return bad_dma_address; 1926 if (PTR_ERR(domain) == -EINVAL)
1736
1737 dma_mask = *dev->dma_mask;
1738
1739 get_device_resources(dev, &iommu, &domain, &devid);
1740
1741 if (iommu == NULL || domain == NULL)
1742 /* device not handled by any AMD IOMMU */
1743 return (dma_addr_t)paddr; 1927 return (dma_addr_t)paddr;
1928 else if (IS_ERR(domain))
1929 return DMA_ERROR_CODE;
1744 1930
1745 if (!dma_ops_domain(domain)) 1931 dma_mask = *dev->dma_mask;
1746 return bad_dma_address;
1747 1932
1748 spin_lock_irqsave(&domain->lock, flags); 1933 spin_lock_irqsave(&domain->lock, flags);
1749 addr = __map_single(dev, iommu, domain->priv, paddr, size, dir, false, 1934
1935 addr = __map_single(dev, domain->priv, paddr, size, dir, false,
1750 dma_mask); 1936 dma_mask);
1751 if (addr == bad_dma_address) 1937 if (addr == DMA_ERROR_CODE)
1752 goto out; 1938 goto out;
1753 1939
1754 iommu_completion_wait(iommu); 1940 iommu_flush_complete(domain);
1755 1941
1756out: 1942out:
1757 spin_unlock_irqrestore(&domain->lock, flags); 1943 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1766,25 +1952,19 @@ static void unmap_page(struct device *dev, dma_addr_t dma_addr, size_t size,
1766 enum dma_data_direction dir, struct dma_attrs *attrs) 1952 enum dma_data_direction dir, struct dma_attrs *attrs)
1767{ 1953{
1768 unsigned long flags; 1954 unsigned long flags;
1769 struct amd_iommu *iommu;
1770 struct protection_domain *domain; 1955 struct protection_domain *domain;
1771 u16 devid;
1772 1956
1773 INC_STATS_COUNTER(cnt_unmap_single); 1957 INC_STATS_COUNTER(cnt_unmap_single);
1774 1958
1775 if (!check_device(dev) || 1959 domain = get_domain(dev);
1776 !get_device_resources(dev, &iommu, &domain, &devid)) 1960 if (IS_ERR(domain))
1777 /* device not handled by any AMD IOMMU */
1778 return;
1779
1780 if (!dma_ops_domain(domain))
1781 return; 1961 return;
1782 1962
1783 spin_lock_irqsave(&domain->lock, flags); 1963 spin_lock_irqsave(&domain->lock, flags);
1784 1964
1785 __unmap_single(iommu, domain->priv, dma_addr, size, dir); 1965 __unmap_single(domain->priv, dma_addr, size, dir);
1786 1966
1787 iommu_completion_wait(iommu); 1967 iommu_flush_complete(domain);
1788 1968
1789 spin_unlock_irqrestore(&domain->lock, flags); 1969 spin_unlock_irqrestore(&domain->lock, flags);
1790} 1970}
@@ -1816,9 +1996,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1816 struct dma_attrs *attrs) 1996 struct dma_attrs *attrs)
1817{ 1997{
1818 unsigned long flags; 1998 unsigned long flags;
1819 struct amd_iommu *iommu;
1820 struct protection_domain *domain; 1999 struct protection_domain *domain;
1821 u16 devid;
1822 int i; 2000 int i;
1823 struct scatterlist *s; 2001 struct scatterlist *s;
1824 phys_addr_t paddr; 2002 phys_addr_t paddr;
@@ -1827,25 +2005,20 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1827 2005
1828 INC_STATS_COUNTER(cnt_map_sg); 2006 INC_STATS_COUNTER(cnt_map_sg);
1829 2007
1830 if (!check_device(dev)) 2008 domain = get_domain(dev);
2009 if (PTR_ERR(domain) == -EINVAL)
2010 return map_sg_no_iommu(dev, sglist, nelems, dir);
2011 else if (IS_ERR(domain))
1831 return 0; 2012 return 0;
1832 2013
1833 dma_mask = *dev->dma_mask; 2014 dma_mask = *dev->dma_mask;
1834 2015
1835 get_device_resources(dev, &iommu, &domain, &devid);
1836
1837 if (!iommu || !domain)
1838 return map_sg_no_iommu(dev, sglist, nelems, dir);
1839
1840 if (!dma_ops_domain(domain))
1841 return 0;
1842
1843 spin_lock_irqsave(&domain->lock, flags); 2016 spin_lock_irqsave(&domain->lock, flags);
1844 2017
1845 for_each_sg(sglist, s, nelems, i) { 2018 for_each_sg(sglist, s, nelems, i) {
1846 paddr = sg_phys(s); 2019 paddr = sg_phys(s);
1847 2020
1848 s->dma_address = __map_single(dev, iommu, domain->priv, 2021 s->dma_address = __map_single(dev, domain->priv,
1849 paddr, s->length, dir, false, 2022 paddr, s->length, dir, false,
1850 dma_mask); 2023 dma_mask);
1851 2024
@@ -1856,7 +2029,7 @@ static int map_sg(struct device *dev, struct scatterlist *sglist,
1856 goto unmap; 2029 goto unmap;
1857 } 2030 }
1858 2031
1859 iommu_completion_wait(iommu); 2032 iommu_flush_complete(domain);
1860 2033
1861out: 2034out:
1862 spin_unlock_irqrestore(&domain->lock, flags); 2035 spin_unlock_irqrestore(&domain->lock, flags);
@@ -1865,7 +2038,7 @@ out:
1865unmap: 2038unmap:
1866 for_each_sg(sglist, s, mapped_elems, i) { 2039 for_each_sg(sglist, s, mapped_elems, i) {
1867 if (s->dma_address) 2040 if (s->dma_address)
1868 __unmap_single(iommu, domain->priv, s->dma_address, 2041 __unmap_single(domain->priv, s->dma_address,
1869 s->dma_length, dir); 2042 s->dma_length, dir);
1870 s->dma_address = s->dma_length = 0; 2043 s->dma_address = s->dma_length = 0;
1871 } 2044 }
@@ -1884,30 +2057,25 @@ static void unmap_sg(struct device *dev, struct scatterlist *sglist,
1884 struct dma_attrs *attrs) 2057 struct dma_attrs *attrs)
1885{ 2058{
1886 unsigned long flags; 2059 unsigned long flags;
1887 struct amd_iommu *iommu;
1888 struct protection_domain *domain; 2060 struct protection_domain *domain;
1889 struct scatterlist *s; 2061 struct scatterlist *s;
1890 u16 devid;
1891 int i; 2062 int i;
1892 2063
1893 INC_STATS_COUNTER(cnt_unmap_sg); 2064 INC_STATS_COUNTER(cnt_unmap_sg);
1894 2065
1895 if (!check_device(dev) || 2066 domain = get_domain(dev);
1896 !get_device_resources(dev, &iommu, &domain, &devid)) 2067 if (IS_ERR(domain))
1897 return;
1898
1899 if (!dma_ops_domain(domain))
1900 return; 2068 return;
1901 2069
1902 spin_lock_irqsave(&domain->lock, flags); 2070 spin_lock_irqsave(&domain->lock, flags);
1903 2071
1904 for_each_sg(sglist, s, nelems, i) { 2072 for_each_sg(sglist, s, nelems, i) {
1905 __unmap_single(iommu, domain->priv, s->dma_address, 2073 __unmap_single(domain->priv, s->dma_address,
1906 s->dma_length, dir); 2074 s->dma_length, dir);
1907 s->dma_address = s->dma_length = 0; 2075 s->dma_address = s->dma_length = 0;
1908 } 2076 }
1909 2077
1910 iommu_completion_wait(iommu); 2078 iommu_flush_complete(domain);
1911 2079
1912 spin_unlock_irqrestore(&domain->lock, flags); 2080 spin_unlock_irqrestore(&domain->lock, flags);
1913} 2081}
@@ -1920,49 +2088,44 @@ static void *alloc_coherent(struct device *dev, size_t size,
1920{ 2088{
1921 unsigned long flags; 2089 unsigned long flags;
1922 void *virt_addr; 2090 void *virt_addr;
1923 struct amd_iommu *iommu;
1924 struct protection_domain *domain; 2091 struct protection_domain *domain;
1925 u16 devid;
1926 phys_addr_t paddr; 2092 phys_addr_t paddr;
1927 u64 dma_mask = dev->coherent_dma_mask; 2093 u64 dma_mask = dev->coherent_dma_mask;
1928 2094
1929 INC_STATS_COUNTER(cnt_alloc_coherent); 2095 INC_STATS_COUNTER(cnt_alloc_coherent);
1930 2096
1931 if (!check_device(dev)) 2097 domain = get_domain(dev);
2098 if (PTR_ERR(domain) == -EINVAL) {
2099 virt_addr = (void *)__get_free_pages(flag, get_order(size));
2100 *dma_addr = __pa(virt_addr);
2101 return virt_addr;
2102 } else if (IS_ERR(domain))
1932 return NULL; 2103 return NULL;
1933 2104
1934 if (!get_device_resources(dev, &iommu, &domain, &devid)) 2105 dma_mask = dev->coherent_dma_mask;
1935 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32); 2106 flag &= ~(__GFP_DMA | __GFP_HIGHMEM | __GFP_DMA32);
2107 flag |= __GFP_ZERO;
1936 2108
1937 flag |= __GFP_ZERO;
1938 virt_addr = (void *)__get_free_pages(flag, get_order(size)); 2109 virt_addr = (void *)__get_free_pages(flag, get_order(size));
1939 if (!virt_addr) 2110 if (!virt_addr)
1940 return NULL; 2111 return NULL;
1941 2112
1942 paddr = virt_to_phys(virt_addr); 2113 paddr = virt_to_phys(virt_addr);
1943 2114
1944 if (!iommu || !domain) {
1945 *dma_addr = (dma_addr_t)paddr;
1946 return virt_addr;
1947 }
1948
1949 if (!dma_ops_domain(domain))
1950 goto out_free;
1951
1952 if (!dma_mask) 2115 if (!dma_mask)
1953 dma_mask = *dev->dma_mask; 2116 dma_mask = *dev->dma_mask;
1954 2117
1955 spin_lock_irqsave(&domain->lock, flags); 2118 spin_lock_irqsave(&domain->lock, flags);
1956 2119
1957 *dma_addr = __map_single(dev, iommu, domain->priv, paddr, 2120 *dma_addr = __map_single(dev, domain->priv, paddr,
1958 size, DMA_BIDIRECTIONAL, true, dma_mask); 2121 size, DMA_BIDIRECTIONAL, true, dma_mask);
1959 2122
1960 if (*dma_addr == bad_dma_address) { 2123 if (*dma_addr == DMA_ERROR_CODE) {
1961 spin_unlock_irqrestore(&domain->lock, flags); 2124 spin_unlock_irqrestore(&domain->lock, flags);
1962 goto out_free; 2125 goto out_free;
1963 } 2126 }
1964 2127
1965 iommu_completion_wait(iommu); 2128 iommu_flush_complete(domain);
1966 2129
1967 spin_unlock_irqrestore(&domain->lock, flags); 2130 spin_unlock_irqrestore(&domain->lock, flags);
1968 2131
@@ -1982,28 +2145,19 @@ static void free_coherent(struct device *dev, size_t size,
1982 void *virt_addr, dma_addr_t dma_addr) 2145 void *virt_addr, dma_addr_t dma_addr)
1983{ 2146{
1984 unsigned long flags; 2147 unsigned long flags;
1985 struct amd_iommu *iommu;
1986 struct protection_domain *domain; 2148 struct protection_domain *domain;
1987 u16 devid;
1988 2149
1989 INC_STATS_COUNTER(cnt_free_coherent); 2150 INC_STATS_COUNTER(cnt_free_coherent);
1990 2151
1991 if (!check_device(dev)) 2152 domain = get_domain(dev);
1992 return; 2153 if (IS_ERR(domain))
1993
1994 get_device_resources(dev, &iommu, &domain, &devid);
1995
1996 if (!iommu || !domain)
1997 goto free_mem;
1998
1999 if (!dma_ops_domain(domain))
2000 goto free_mem; 2154 goto free_mem;
2001 2155
2002 spin_lock_irqsave(&domain->lock, flags); 2156 spin_lock_irqsave(&domain->lock, flags);
2003 2157
2004 __unmap_single(iommu, domain->priv, dma_addr, size, DMA_BIDIRECTIONAL); 2158 __unmap_single(domain->priv, dma_addr, size, DMA_BIDIRECTIONAL);
2005 2159
2006 iommu_completion_wait(iommu); 2160 iommu_flush_complete(domain);
2007 2161
2008 spin_unlock_irqrestore(&domain->lock, flags); 2162 spin_unlock_irqrestore(&domain->lock, flags);
2009 2163
@@ -2017,22 +2171,7 @@ free_mem:
2017 */ 2171 */
2018static int amd_iommu_dma_supported(struct device *dev, u64 mask) 2172static int amd_iommu_dma_supported(struct device *dev, u64 mask)
2019{ 2173{
2020 u16 bdf; 2174 return check_device(dev);
2021 struct pci_dev *pcidev;
2022
2023 /* No device or no PCI device */
2024 if (!dev || dev->bus != &pci_bus_type)
2025 return 0;
2026
2027 pcidev = to_pci_dev(dev);
2028
2029 bdf = calc_devid(pcidev->bus->number, pcidev->devfn);
2030
2031 /* Out of our scope? */
2032 if (bdf > amd_iommu_last_bdf)
2033 return 0;
2034
2035 return 1;
2036} 2175}
2037 2176
2038/* 2177/*
@@ -2046,25 +2185,28 @@ static void prealloc_protection_domains(void)
2046{ 2185{
2047 struct pci_dev *dev = NULL; 2186 struct pci_dev *dev = NULL;
2048 struct dma_ops_domain *dma_dom; 2187 struct dma_ops_domain *dma_dom;
2049 struct amd_iommu *iommu;
2050 u16 devid; 2188 u16 devid;
2051 2189
2052 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2190 for_each_pci_dev(dev) {
2053 devid = calc_devid(dev->bus->number, dev->devfn); 2191
2054 if (devid > amd_iommu_last_bdf) 2192 /* Do we handle this device? */
2055 continue; 2193 if (!check_device(&dev->dev))
2056 devid = amd_iommu_alias_table[devid];
2057 if (domain_for_device(devid))
2058 continue; 2194 continue;
2059 iommu = amd_iommu_rlookup_table[devid]; 2195
2060 if (!iommu) 2196 /* Is there already any domain for it? */
2197 if (domain_for_device(&dev->dev))
2061 continue; 2198 continue;
2062 dma_dom = dma_ops_domain_alloc(iommu); 2199
2200 devid = get_device_id(&dev->dev);
2201
2202 dma_dom = dma_ops_domain_alloc();
2063 if (!dma_dom) 2203 if (!dma_dom)
2064 continue; 2204 continue;
2065 init_unity_mappings_for_device(dma_dom, devid); 2205 init_unity_mappings_for_device(dma_dom, devid);
2066 dma_dom->target_dev = devid; 2206 dma_dom->target_dev = devid;
2067 2207
2208 attach_device(&dev->dev, &dma_dom->domain);
2209
2068 list_add_tail(&dma_dom->list, &iommu_pd_list); 2210 list_add_tail(&dma_dom->list, &iommu_pd_list);
2069 } 2211 }
2070} 2212}
@@ -2082,6 +2224,12 @@ static struct dma_map_ops amd_iommu_dma_ops = {
2082/* 2224/*
2083 * The function which clues the AMD IOMMU driver into dma_ops. 2225 * The function which clues the AMD IOMMU driver into dma_ops.
2084 */ 2226 */
2227
2228void __init amd_iommu_init_api(void)
2229{
2230 register_iommu(&amd_iommu_ops);
2231}
2232
2085int __init amd_iommu_init_dma_ops(void) 2233int __init amd_iommu_init_dma_ops(void)
2086{ 2234{
2087 struct amd_iommu *iommu; 2235 struct amd_iommu *iommu;
@@ -2093,7 +2241,7 @@ int __init amd_iommu_init_dma_ops(void)
2093 * protection domain will be assigned to the default one. 2241 * protection domain will be assigned to the default one.
2094 */ 2242 */
2095 for_each_iommu(iommu) { 2243 for_each_iommu(iommu) {
2096 iommu->default_dom = dma_ops_domain_alloc(iommu); 2244 iommu->default_dom = dma_ops_domain_alloc();
2097 if (iommu->default_dom == NULL) 2245 if (iommu->default_dom == NULL)
2098 return -ENOMEM; 2246 return -ENOMEM;
2099 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK; 2247 iommu->default_dom->domain.flags |= PD_DEFAULT_MASK;
@@ -2103,15 +2251,12 @@ int __init amd_iommu_init_dma_ops(void)
2103 } 2251 }
2104 2252
2105 /* 2253 /*
2106 * If device isolation is enabled, pre-allocate the protection 2254 * Pre-allocate the protection domains for each device.
2107 * domains for each device.
2108 */ 2255 */
2109 if (amd_iommu_isolate) 2256 prealloc_protection_domains();
2110 prealloc_protection_domains();
2111 2257
2112 iommu_detected = 1; 2258 iommu_detected = 1;
2113 force_iommu = 1; 2259 swiotlb = 0;
2114 bad_dma_address = 0;
2115#ifdef CONFIG_GART_IOMMU 2260#ifdef CONFIG_GART_IOMMU
2116 gart_iommu_aperture_disabled = 1; 2261 gart_iommu_aperture_disabled = 1;
2117 gart_iommu_aperture = 0; 2262 gart_iommu_aperture = 0;
@@ -2120,10 +2265,6 @@ int __init amd_iommu_init_dma_ops(void)
2120 /* Make the driver finally visible to the drivers */ 2265 /* Make the driver finally visible to the drivers */
2121 dma_ops = &amd_iommu_dma_ops; 2266 dma_ops = &amd_iommu_dma_ops;
2122 2267
2123 register_iommu(&amd_iommu_ops);
2124
2125 bus_register_notifier(&pci_bus_type, &device_nb);
2126
2127 amd_iommu_stats_init(); 2268 amd_iommu_stats_init();
2128 2269
2129 return 0; 2270 return 0;
@@ -2150,14 +2291,17 @@ free_domains:
2150 2291
2151static void cleanup_domain(struct protection_domain *domain) 2292static void cleanup_domain(struct protection_domain *domain)
2152{ 2293{
2294 struct iommu_dev_data *dev_data, *next;
2153 unsigned long flags; 2295 unsigned long flags;
2154 u16 devid;
2155 2296
2156 write_lock_irqsave(&amd_iommu_devtable_lock, flags); 2297 write_lock_irqsave(&amd_iommu_devtable_lock, flags);
2157 2298
2158 for (devid = 0; devid <= amd_iommu_last_bdf; ++devid) 2299 list_for_each_entry_safe(dev_data, next, &domain->dev_list, list) {
2159 if (amd_iommu_pd_table[devid] == domain) 2300 struct device *dev = dev_data->dev;
2160 __detach_device(domain, devid); 2301
2302 __detach_device(dev);
2303 atomic_set(&dev_data->bind, 0);
2304 }
2161 2305
2162 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags); 2306 write_unlock_irqrestore(&amd_iommu_devtable_lock, flags);
2163} 2307}
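cleanup_domain() above has to use list_for_each_entry_safe() because __detach_device() unlinks the current entry from the very list being walked; the _safe variant caches the next pointer before the body can rewrite the links. The following compact, self-contained example shows the same cached-next idiom with a hypothetical singly-linked list rather than the kernel's list_head.

#include <stdio.h>
#include <stddef.h>

struct node {
        int          devid;
        struct node *next;
};

/* Unlinks n from the list headed by *head (the "detach" step). */
static void detach(struct node **head, struct node *n)
{
        for (struct node **p = head; *p; p = &(*p)->next) {
                if (*p == n) {
                        *p = n->next;
                        n->next = NULL;
                        return;
                }
        }
}

int main(void)
{
        struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
        struct node *head = &a;

        /* Safe traversal: remember "next" before detach() rewrites the links. */
        for (struct node *cur = head, *next; cur; cur = next) {
                next = cur->next;
                printf("detaching devid %d\n", cur->devid);
                detach(&head, cur);
        }
        printf("list empty: %s\n", head == NULL ? "yes" : "no");
        return 0;
}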
@@ -2167,6 +2311,8 @@ static void protection_domain_free(struct protection_domain *domain)
2167 if (!domain) 2311 if (!domain)
2168 return; 2312 return;
2169 2313
2314 del_domain_from_list(domain);
2315
2170 if (domain->id) 2316 if (domain->id)
2171 domain_id_free(domain->id); 2317 domain_id_free(domain->id);
2172 2318
@@ -2182,9 +2328,13 @@ static struct protection_domain *protection_domain_alloc(void)
2182 return NULL; 2328 return NULL;
2183 2329
2184 spin_lock_init(&domain->lock); 2330 spin_lock_init(&domain->lock);
2331 mutex_init(&domain->api_lock);
2185 domain->id = domain_id_alloc(); 2332 domain->id = domain_id_alloc();
2186 if (!domain->id) 2333 if (!domain->id)
2187 goto out_err; 2334 goto out_err;
2335 INIT_LIST_HEAD(&domain->dev_list);
2336
2337 add_domain_to_list(domain);
2188 2338
2189 return domain; 2339 return domain;
2190 2340
@@ -2231,9 +2381,7 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2231 2381
2232 free_pagetable(domain); 2382 free_pagetable(domain);
2233 2383
2234 domain_id_free(domain->id); 2384 protection_domain_free(domain);
2235
2236 kfree(domain);
2237 2385
2238 dom->priv = NULL; 2386 dom->priv = NULL;
2239} 2387}
@@ -2241,26 +2389,23 @@ static void amd_iommu_domain_destroy(struct iommu_domain *dom)
2241static void amd_iommu_detach_device(struct iommu_domain *dom, 2389static void amd_iommu_detach_device(struct iommu_domain *dom,
2242 struct device *dev) 2390 struct device *dev)
2243{ 2391{
2244 struct protection_domain *domain = dom->priv; 2392 struct iommu_dev_data *dev_data = dev->archdata.iommu;
2245 struct amd_iommu *iommu; 2393 struct amd_iommu *iommu;
2246 struct pci_dev *pdev;
2247 u16 devid; 2394 u16 devid;
2248 2395
2249 if (dev->bus != &pci_bus_type) 2396 if (!check_device(dev))
2250 return; 2397 return;
2251 2398
2252 pdev = to_pci_dev(dev); 2399 devid = get_device_id(dev);
2253
2254 devid = calc_devid(pdev->bus->number, pdev->devfn);
2255 2400
2256 if (devid > 0) 2401 if (dev_data->domain != NULL)
2257 detach_device(domain, devid); 2402 detach_device(dev);
2258 2403
2259 iommu = amd_iommu_rlookup_table[devid]; 2404 iommu = amd_iommu_rlookup_table[devid];
2260 if (!iommu) 2405 if (!iommu)
2261 return; 2406 return;
2262 2407
2263 iommu_queue_inv_dev_entry(iommu, devid); 2408 iommu_flush_device(dev);
2264 iommu_completion_wait(iommu); 2409 iommu_completion_wait(iommu);
2265} 2410}
2266 2411
@@ -2268,35 +2413,30 @@ static int amd_iommu_attach_device(struct iommu_domain *dom,
2268 struct device *dev) 2413 struct device *dev)
2269{ 2414{
2270 struct protection_domain *domain = dom->priv; 2415 struct protection_domain *domain = dom->priv;
2271 struct protection_domain *old_domain; 2416 struct iommu_dev_data *dev_data;
2272 struct amd_iommu *iommu; 2417 struct amd_iommu *iommu;
2273 struct pci_dev *pdev; 2418 int ret;
2274 u16 devid; 2419 u16 devid;
2275 2420
2276 if (dev->bus != &pci_bus_type) 2421 if (!check_device(dev))
2277 return -EINVAL; 2422 return -EINVAL;
2278 2423
2279 pdev = to_pci_dev(dev); 2424 dev_data = dev->archdata.iommu;
2280 2425
2281 devid = calc_devid(pdev->bus->number, pdev->devfn); 2426 devid = get_device_id(dev);
2282
2283 if (devid >= amd_iommu_last_bdf ||
2284 devid != amd_iommu_alias_table[devid])
2285 return -EINVAL;
2286 2427
2287 iommu = amd_iommu_rlookup_table[devid]; 2428 iommu = amd_iommu_rlookup_table[devid];
2288 if (!iommu) 2429 if (!iommu)
2289 return -EINVAL; 2430 return -EINVAL;
2290 2431
2291 old_domain = domain_for_device(devid); 2432 if (dev_data->domain)
2292 if (old_domain) 2433 detach_device(dev);
2293 detach_device(old_domain, devid);
2294 2434
2295 attach_device(iommu, domain, devid); 2435 ret = attach_device(dev, domain);
2296 2436
2297 iommu_completion_wait(iommu); 2437 iommu_completion_wait(iommu);
2298 2438
2299 return 0; 2439 return ret;
2300} 2440}
2301 2441
2302static int amd_iommu_map_range(struct iommu_domain *dom, 2442static int amd_iommu_map_range(struct iommu_domain *dom,
@@ -2316,6 +2456,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2316 iova &= PAGE_MASK; 2456 iova &= PAGE_MASK;
2317 paddr &= PAGE_MASK; 2457 paddr &= PAGE_MASK;
2318 2458
2459 mutex_lock(&domain->api_lock);
2460
2319 for (i = 0; i < npages; ++i) { 2461 for (i = 0; i < npages; ++i) {
2320 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k); 2462 ret = iommu_map_page(domain, iova, paddr, prot, PM_MAP_4k);
2321 if (ret) 2463 if (ret)
@@ -2325,6 +2467,8 @@ static int amd_iommu_map_range(struct iommu_domain *dom,
2325 paddr += PAGE_SIZE; 2467 paddr += PAGE_SIZE;
2326 } 2468 }
2327 2469
2470 mutex_unlock(&domain->api_lock);
2471
2328 return 0; 2472 return 0;
2329} 2473}
2330 2474
@@ -2337,12 +2481,16 @@ static void amd_iommu_unmap_range(struct iommu_domain *dom,
2337 2481
2338 iova &= PAGE_MASK; 2482 iova &= PAGE_MASK;
2339 2483
2484 mutex_lock(&domain->api_lock);
2485
2340 for (i = 0; i < npages; ++i) { 2486 for (i = 0; i < npages; ++i) {
2341 iommu_unmap_page(domain, iova, PM_MAP_4k); 2487 iommu_unmap_page(domain, iova, PM_MAP_4k);
2342 iova += PAGE_SIZE; 2488 iova += PAGE_SIZE;
2343 } 2489 }
2344 2490
2345 iommu_flush_domain(domain->id); 2491 iommu_flush_tlb_pde(domain);
2492
2493 mutex_unlock(&domain->api_lock);
2346} 2494}
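The api_lock mutex added above serializes amd_iommu_map_range() and amd_iommu_unmap_range() against each other, so concurrent users of the IOMMU API cannot interleave page-table updates on the same domain. A minimal pthread sketch of that per-domain serialization follows; the domain type, the flat pte array and the page counts are placeholders, not the driver's real data structures.

#include <pthread.h>
#include <stdio.h>

#define NPAGES 1024

struct domain {
        pthread_mutex_t api_lock;    /* serializes map/unmap on this domain */
        int             pte[NPAGES]; /* 0 = unmapped, non-zero = mapped     */
};

static void map_range(struct domain *dom, int first, int npages, int val)
{
        pthread_mutex_lock(&dom->api_lock);
        for (int i = 0; i < npages; i++)
                dom->pte[first + i] = val;
        pthread_mutex_unlock(&dom->api_lock);
}

static void unmap_range(struct domain *dom, int first, int npages)
{
        pthread_mutex_lock(&dom->api_lock);
        for (int i = 0; i < npages; i++)
                dom->pte[first + i] = 0;
        /* the TLB flush would go here, still under the lock, as in the patch */
        pthread_mutex_unlock(&dom->api_lock);
}

int main(void)
{
        struct domain dom = { .api_lock = PTHREAD_MUTEX_INITIALIZER };

        map_range(&dom, 0, 16, 42);
        unmap_range(&dom, 0, 16);
        printf("pte[0] after unmap: %d\n", dom.pte[0]);
        return 0;
}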
2347 2495
2348static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom, 2496static phys_addr_t amd_iommu_iova_to_phys(struct iommu_domain *dom,
@@ -2393,10 +2541,11 @@ static struct iommu_ops amd_iommu_ops = {
2393 2541
2394int __init amd_iommu_init_passthrough(void) 2542int __init amd_iommu_init_passthrough(void)
2395{ 2543{
2544 struct amd_iommu *iommu;
2396 struct pci_dev *dev = NULL; 2545 struct pci_dev *dev = NULL;
2397 u16 devid, devid2; 2546 u16 devid;
2398 2547
2399 /* allocate passthroug domain */ 2548 /* allocate passthrough domain */
2400 pt_domain = protection_domain_alloc(); 2549 pt_domain = protection_domain_alloc();
2401 if (!pt_domain) 2550 if (!pt_domain)
2402 return -ENOMEM; 2551 return -ENOMEM;
@@ -2404,20 +2553,17 @@ int __init amd_iommu_init_passthrough(void)
2404 pt_domain->mode |= PAGE_MODE_NONE; 2553 pt_domain->mode |= PAGE_MODE_NONE;
2405 2554
2406 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) { 2555 while ((dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev)) != NULL) {
2407 struct amd_iommu *iommu;
2408 2556
2409 devid = calc_devid(dev->bus->number, dev->devfn); 2557 if (!check_device(&dev->dev))
2410 if (devid > amd_iommu_last_bdf)
2411 continue; 2558 continue;
2412 2559
2413 devid2 = amd_iommu_alias_table[devid]; 2560 devid = get_device_id(&dev->dev);
2414 2561
2415 iommu = amd_iommu_rlookup_table[devid2]; 2562 iommu = amd_iommu_rlookup_table[devid];
2416 if (!iommu) 2563 if (!iommu)
2417 continue; 2564 continue;
2418 2565
2419 __attach_device(iommu, pt_domain, devid); 2566 attach_device(&dev->dev, pt_domain);
2420 __attach_device(iommu, pt_domain, devid2);
2421 } 2567 }
2422 2568
2423 pr_info("AMD-Vi: Initialized for Passthrough Mode\n"); 2569 pr_info("AMD-Vi: Initialized for Passthrough Mode\n");
diff --git a/arch/x86/kernel/amd_iommu_init.c b/arch/x86/kernel/amd_iommu_init.c
index c20001e4f556..6360abf993d4 100644
--- a/arch/x86/kernel/amd_iommu_init.c
+++ b/arch/x86/kernel/amd_iommu_init.c
@@ -1,5 +1,5 @@
1/* 1/*
2 * Copyright (C) 2007-2008 Advanced Micro Devices, Inc. 2 * Copyright (C) 2007-2009 Advanced Micro Devices, Inc.
3 * Author: Joerg Roedel <joerg.roedel@amd.com> 3 * Author: Joerg Roedel <joerg.roedel@amd.com>
4 * Leo Duran <leo.duran@amd.com> 4 * Leo Duran <leo.duran@amd.com>
5 * 5 *
@@ -19,16 +19,18 @@
19 19
20#include <linux/pci.h> 20#include <linux/pci.h>
21#include <linux/acpi.h> 21#include <linux/acpi.h>
22#include <linux/gfp.h>
23#include <linux/list.h> 22#include <linux/list.h>
23#include <linux/slab.h>
24#include <linux/sysdev.h> 24#include <linux/sysdev.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/msi.h> 26#include <linux/msi.h>
27#include <asm/pci-direct.h> 27#include <asm/pci-direct.h>
28#include <asm/amd_iommu_proto.h>
28#include <asm/amd_iommu_types.h> 29#include <asm/amd_iommu_types.h>
29#include <asm/amd_iommu.h> 30#include <asm/amd_iommu.h>
30#include <asm/iommu.h> 31#include <asm/iommu.h>
31#include <asm/gart.h> 32#include <asm/gart.h>
33#include <asm/x86_init.h>
32 34
33/* 35/*
34 * definitions for the ACPI scanning code 36 * definitions for the ACPI scanning code
@@ -123,18 +125,29 @@ u16 amd_iommu_last_bdf; /* largest PCI device id we have
123 to handle */ 125 to handle */
124LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings 126LIST_HEAD(amd_iommu_unity_map); /* a list of required unity mappings
125 we find in ACPI */ 127 we find in ACPI */
126#ifdef CONFIG_IOMMU_STRESS
127bool amd_iommu_isolate = false;
128#else
129bool amd_iommu_isolate = true; /* if true, device isolation is
130 enabled */
131#endif
132
133bool amd_iommu_unmap_flush; /* if true, flush on every unmap */ 128bool amd_iommu_unmap_flush; /* if true, flush on every unmap */
134 129
135LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the 130LIST_HEAD(amd_iommu_list); /* list of all AMD IOMMUs in the
136 system */ 131 system */
137 132
133/* Array to assign indices to IOMMUs*/
134struct amd_iommu *amd_iommus[MAX_IOMMUS];
135int amd_iommus_present;
136
137/* IOMMUs have a non-present cache? */
138bool amd_iommu_np_cache __read_mostly;
139
140/*
141 * The ACPI table parsing functions set this variable on an error
142 */
143static int __initdata amd_iommu_init_err;
144
145/*
146 * List of protection domains - used during resume
147 */
148LIST_HEAD(amd_iommu_pd_list);
149spinlock_t amd_iommu_pd_lock;
150
138/* 151/*
139 * Pointer to the device table which is shared by all AMD IOMMUs 152 * Pointer to the device table which is shared by all AMD IOMMUs
140 * it is indexed by the PCI device id or the HT unit id and contains 153 * it is indexed by the PCI device id or the HT unit id and contains
@@ -157,12 +170,6 @@ u16 *amd_iommu_alias_table;
157struct amd_iommu **amd_iommu_rlookup_table; 170struct amd_iommu **amd_iommu_rlookup_table;
158 171
159/* 172/*
160 * The pd table (protection domain table) is used to find the protection domain
161 * data structure a device belongs to. Indexed with the PCI device id too.
162 */
163struct protection_domain **amd_iommu_pd_table;
164
165/*
166 * AMD IOMMU allows up to 2^16 different protection domains. This is a bitmap 173
167 * to know which ones are already in use. 174 * to know which ones are already in use.
168 */ 175 */
@@ -384,9 +391,11 @@ static int __init find_last_devid_acpi(struct acpi_table_header *table)
384 */ 391 */
385 for (i = 0; i < table->length; ++i) 392 for (i = 0; i < table->length; ++i)
386 checksum += p[i]; 393 checksum += p[i];
387 if (checksum != 0) 394 if (checksum != 0) {
388 /* ACPI table corrupt */ 395 /* ACPI table corrupt */
389 return -ENODEV; 396 amd_iommu_init_err = -ENODEV;
397 return 0;
398 }
390 399
391 p += IVRS_HEADER_LENGTH; 400 p += IVRS_HEADER_LENGTH;
392 401
@@ -429,7 +438,7 @@ static u8 * __init alloc_command_buffer(struct amd_iommu *iommu)
429 if (cmd_buf == NULL) 438 if (cmd_buf == NULL)
430 return NULL; 439 return NULL;
431 440
432 iommu->cmd_buf_size = CMD_BUFFER_SIZE; 441 iommu->cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
433 442
434 return cmd_buf; 443 return cmd_buf;
435} 444}
@@ -465,12 +474,13 @@ static void iommu_enable_command_buffer(struct amd_iommu *iommu)
465 &entry, sizeof(entry)); 474 &entry, sizeof(entry));
466 475
467 amd_iommu_reset_cmd_buffer(iommu); 476 amd_iommu_reset_cmd_buffer(iommu);
477 iommu->cmd_buf_size &= ~(CMD_BUFFER_UNINITIALIZED);
468} 478}
469 479
470static void __init free_command_buffer(struct amd_iommu *iommu) 480static void __init free_command_buffer(struct amd_iommu *iommu)
471{ 481{
472 free_pages((unsigned long)iommu->cmd_buf, 482 free_pages((unsigned long)iommu->cmd_buf,
473 get_order(iommu->cmd_buf_size)); 483 get_order(iommu->cmd_buf_size & ~(CMD_BUFFER_UNINITIALIZED)));
474} 484}
475 485
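The command-buffer change above folds a CMD_BUFFER_UNINITIALIZED flag into the high bits of cmd_buf_size, and every user of the size must mask the flag back out, as free_command_buffer() now does. The tiny sketch below shows the idea of carrying a status bit inside a size field; the flag position and the size are made-up values, not the driver's constants.

#include <stdio.h>

#define CMD_BUFFER_SIZE          8192
#define CMD_BUFFER_UNINITIALIZED (1UL << 24)  /* well above any real size */

int main(void)
{
        unsigned long cmd_buf_size;

        /* allocation: remember the size and mark the buffer uninitialized */
        cmd_buf_size = CMD_BUFFER_SIZE | CMD_BUFFER_UNINITIALIZED;
        printf("uninitialized: %s\n",
               (cmd_buf_size & CMD_BUFFER_UNINITIALIZED) ? "yes" : "no");

        /* enabling the buffer clears the flag */
        cmd_buf_size &= ~CMD_BUFFER_UNINITIALIZED;

        /* any consumer of the size must mask the flag off first */
        printf("size to free: %lu\n",
               cmd_buf_size & ~CMD_BUFFER_UNINITIALIZED);
        return 0;
}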
476/* allocates the memory where the IOMMU will log its events to */ 486/* allocates the memory where the IOMMU will log its events to */
@@ -838,7 +848,18 @@ static void __init free_iommu_all(void)
838static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h) 848static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
839{ 849{
840 spin_lock_init(&iommu->lock); 850 spin_lock_init(&iommu->lock);
851
852 /* Add IOMMU to internal data structures */
841 list_add_tail(&iommu->list, &amd_iommu_list); 853 list_add_tail(&iommu->list, &amd_iommu_list);
854 iommu->index = amd_iommus_present++;
855
856 if (unlikely(iommu->index >= MAX_IOMMUS)) {
857 WARN(1, "AMD-Vi: System has more IOMMUs than supported by this driver\n");
858 return -ENOSYS;
859 }
860
861 /* Index is fine - add IOMMU to the array */
862 amd_iommus[iommu->index] = iommu;
842 863
843 /* 864 /*
844 * Copy data from ACPI table entry to the iommu struct 865 * Copy data from ACPI table entry to the iommu struct
@@ -868,6 +889,9 @@ static int __init init_iommu_one(struct amd_iommu *iommu, struct ivhd_header *h)
868 init_iommu_from_acpi(iommu, h); 889 init_iommu_from_acpi(iommu, h);
869 init_iommu_devices(iommu); 890 init_iommu_devices(iommu);
870 891
892 if (iommu->cap & (1UL << IOMMU_CAP_NPCACHE))
893 amd_iommu_np_cache = true;
894
871 return pci_enable_device(iommu->dev); 895 return pci_enable_device(iommu->dev);
872} 896}
873 897
@@ -899,11 +923,16 @@ static int __init init_iommu_all(struct acpi_table_header *table)
899 h->mmio_phys); 923 h->mmio_phys);
900 924
901 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL); 925 iommu = kzalloc(sizeof(struct amd_iommu), GFP_KERNEL);
902 if (iommu == NULL) 926 if (iommu == NULL) {
903 return -ENOMEM; 927 amd_iommu_init_err = -ENOMEM;
928 return 0;
929 }
930
904 ret = init_iommu_one(iommu, h); 931 ret = init_iommu_one(iommu, h);
905 if (ret) 932 if (ret) {
906 return ret; 933 amd_iommu_init_err = ret;
934 return 0;
935 }
907 break; 936 break;
908 default: 937 default:
909 break; 938 break;
@@ -925,7 +954,7 @@ static int __init init_iommu_all(struct acpi_table_header *table)
925 * 954 *
926 ****************************************************************************/ 955 ****************************************************************************/
927 956
928static int __init iommu_setup_msi(struct amd_iommu *iommu) 957static int iommu_setup_msi(struct amd_iommu *iommu)
929{ 958{
930 int r; 959 int r;
931 960
@@ -1176,19 +1205,10 @@ static struct sys_device device_amd_iommu = {
1176 * functions. Finally it prints some information about AMD IOMMUs and 1205 * functions. Finally it prints some information about AMD IOMMUs and
1177 * the driver state and enables the hardware. 1206 * the driver state and enables the hardware.
1178 */ 1207 */
1179int __init amd_iommu_init(void) 1208static int __init amd_iommu_init(void)
1180{ 1209{
1181 int i, ret = 0; 1210 int i, ret = 0;
1182 1211
1183
1184 if (no_iommu) {
1185 printk(KERN_INFO "AMD-Vi disabled by kernel command line\n");
1186 return 0;
1187 }
1188
1189 if (!amd_iommu_detected)
1190 return -ENODEV;
1191
1192 /* 1212 /*
1193 * First parse ACPI tables to find the largest Bus/Dev/Func 1213 * First parse ACPI tables to find the largest Bus/Dev/Func
1194 * we need to handle. Upon this information the shared data 1214 * we need to handle. Upon this information the shared data
@@ -1197,6 +1217,10 @@ int __init amd_iommu_init(void)
1197 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0) 1217 if (acpi_table_parse("IVRS", find_last_devid_acpi) != 0)
1198 return -ENODEV; 1218 return -ENODEV;
1199 1219
1220 ret = amd_iommu_init_err;
1221 if (ret)
1222 goto out;
1223
1200 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE); 1224 dev_table_size = tbl_size(DEV_TABLE_ENTRY_SIZE);
1201 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE); 1225 alias_table_size = tbl_size(ALIAS_TABLE_ENTRY_SIZE);
1202 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE); 1226 rlookup_table_size = tbl_size(RLOOKUP_TABLE_ENTRY_SIZE);
@@ -1225,15 +1249,6 @@ int __init amd_iommu_init(void)
1225 if (amd_iommu_rlookup_table == NULL) 1249 if (amd_iommu_rlookup_table == NULL)
1226 goto free; 1250 goto free;
1227 1251
1228 /*
1229 * Protection Domain table - maps devices to protection domains
1230 * This table has the same size as the rlookup_table
1231 */
1232 amd_iommu_pd_table = (void *)__get_free_pages(GFP_KERNEL | __GFP_ZERO,
1233 get_order(rlookup_table_size));
1234 if (amd_iommu_pd_table == NULL)
1235 goto free;
1236
1237 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages( 1252 amd_iommu_pd_alloc_bitmap = (void *)__get_free_pages(
1238 GFP_KERNEL | __GFP_ZERO, 1253 GFP_KERNEL | __GFP_ZERO,
1239 get_order(MAX_DOMAIN_ID/8)); 1254 get_order(MAX_DOMAIN_ID/8));
@@ -1255,6 +1270,8 @@ int __init amd_iommu_init(void)
1255 */ 1270 */
1256 amd_iommu_pd_alloc_bitmap[0] = 1; 1271 amd_iommu_pd_alloc_bitmap[0] = 1;
1257 1272
1273 spin_lock_init(&amd_iommu_pd_lock);
1274
1258 /* 1275 /*
1259 * now the data structures are allocated and basically initialized 1276 * now the data structures are allocated and basically initialized
1260 * start the real acpi table scan 1277 * start the real acpi table scan
@@ -1263,9 +1280,19 @@ int __init amd_iommu_init(void)
1263 if (acpi_table_parse("IVRS", init_iommu_all) != 0) 1280 if (acpi_table_parse("IVRS", init_iommu_all) != 0)
1264 goto free; 1281 goto free;
1265 1282
1283 if (amd_iommu_init_err) {
1284 ret = amd_iommu_init_err;
1285 goto free;
1286 }
1287
1266 if (acpi_table_parse("IVRS", init_memory_definitions) != 0) 1288 if (acpi_table_parse("IVRS", init_memory_definitions) != 0)
1267 goto free; 1289 goto free;
1268 1290
1291 if (amd_iommu_init_err) {
1292 ret = amd_iommu_init_err;
1293 goto free;
1294 }
1295
1269 ret = sysdev_class_register(&amd_iommu_sysdev_class); 1296 ret = sysdev_class_register(&amd_iommu_sysdev_class);
1270 if (ret) 1297 if (ret)
1271 goto free; 1298 goto free;
@@ -1274,39 +1301,44 @@ int __init amd_iommu_init(void)
1274 if (ret) 1301 if (ret)
1275 goto free; 1302 goto free;
1276 1303
1304 ret = amd_iommu_init_devices();
1305 if (ret)
1306 goto free;
1307
1308 enable_iommus();
1309
1277 if (iommu_pass_through) 1310 if (iommu_pass_through)
1278 ret = amd_iommu_init_passthrough(); 1311 ret = amd_iommu_init_passthrough();
1279 else 1312 else
1280 ret = amd_iommu_init_dma_ops(); 1313 ret = amd_iommu_init_dma_ops();
1314
1281 if (ret) 1315 if (ret)
1282 goto free; 1316 goto free;
1283 1317
1284 enable_iommus(); 1318 amd_iommu_init_api();
1319
1320 amd_iommu_init_notifier();
1285 1321
1286 if (iommu_pass_through) 1322 if (iommu_pass_through)
1287 goto out; 1323 goto out;
1288 1324
1289 printk(KERN_INFO "AMD-Vi: device isolation ");
1290 if (amd_iommu_isolate)
1291 printk("enabled\n");
1292 else
1293 printk("disabled\n");
1294
1295 if (amd_iommu_unmap_flush) 1325 if (amd_iommu_unmap_flush)
1296 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n"); 1326 printk(KERN_INFO "AMD-Vi: IO/TLB flush on unmap enabled\n");
1297 else 1327 else
1298 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n"); 1328 printk(KERN_INFO "AMD-Vi: Lazy IO/TLB flushing enabled\n");
1299 1329
1330 x86_platform.iommu_shutdown = disable_iommus;
1300out: 1331out:
1301 return ret; 1332 return ret;
1302 1333
1303free: 1334free:
1335 disable_iommus();
1336
1337 amd_iommu_uninit_devices();
1338
1304 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap, 1339 free_pages((unsigned long)amd_iommu_pd_alloc_bitmap,
1305 get_order(MAX_DOMAIN_ID/8)); 1340 get_order(MAX_DOMAIN_ID/8));
1306 1341
1307 free_pages((unsigned long)amd_iommu_pd_table,
1308 get_order(rlookup_table_size));
1309
1310 free_pages((unsigned long)amd_iommu_rlookup_table, 1342 free_pages((unsigned long)amd_iommu_rlookup_table,
1311 get_order(rlookup_table_size)); 1343 get_order(rlookup_table_size));
1312 1344
@@ -1323,11 +1355,6 @@ free:
1323 goto out; 1355 goto out;
1324} 1356}
1325 1357
1326void amd_iommu_shutdown(void)
1327{
1328 disable_iommus();
1329}
1330
1331/**************************************************************************** 1358/****************************************************************************
1332 * 1359 *
1333 * Early detect code. This code runs at IOMMU detection time in the DMA 1360 * Early detect code. This code runs at IOMMU detection time in the DMA
@@ -1342,16 +1369,16 @@ static int __init early_amd_iommu_detect(struct acpi_table_header *table)
1342 1369
1343void __init amd_iommu_detect(void) 1370void __init amd_iommu_detect(void)
1344{ 1371{
1345 if (swiotlb || no_iommu || (iommu_detected && !gart_iommu_aperture)) 1372 if (no_iommu || (iommu_detected && !gart_iommu_aperture))
1346 return; 1373 return;
1347 1374
1348 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) { 1375 if (acpi_table_parse("IVRS", early_amd_iommu_detect) == 0) {
1349 iommu_detected = 1; 1376 iommu_detected = 1;
1350 amd_iommu_detected = 1; 1377 amd_iommu_detected = 1;
1351#ifdef CONFIG_GART_IOMMU 1378 x86_init.iommu.iommu_init = amd_iommu_init;
1352 gart_iommu_aperture_disabled = 1; 1379
1353 gart_iommu_aperture = 0; 1380 /* Make sure ACS will be enabled */
1354#endif 1381 pci_request_acs();
1355 } 1382 }
1356} 1383}
1357 1384
@@ -1372,10 +1399,6 @@ static int __init parse_amd_iommu_dump(char *str)
1372static int __init parse_amd_iommu_options(char *str) 1399static int __init parse_amd_iommu_options(char *str)
1373{ 1400{
1374 for (; *str; ++str) { 1401 for (; *str; ++str) {
1375 if (strncmp(str, "isolate", 7) == 0)
1376 amd_iommu_isolate = true;
1377 if (strncmp(str, "share", 5) == 0)
1378 amd_iommu_isolate = false;
1379 if (strncmp(str, "fullflush", 9) == 0) 1402 if (strncmp(str, "fullflush", 9) == 0)
1380 amd_iommu_unmap_flush = true; 1403 amd_iommu_unmap_flush = true;
1381 } 1404 }
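
A note on the error handling added above: init_iommu_all() and init_memory_definitions() run as acpi_table_parse() callbacks, and the handler's return value is not propagated back to the caller, so the patch stashes failures in amd_iommu_init_err and has amd_iommu_init() check that variable after each parse. A minimal standalone sketch of the same pattern (init_err, fake_table_parse() and init_one() are hypothetical stand-ins, not kernel APIs):

/* Illustrative sketch only -- not part of the patch. init_err mirrors
 * amd_iommu_init_err; fake_table_parse() and init_one() are hypothetical
 * stand-ins for acpi_table_parse() and the IVRS handlers. */
#include <errno.h>
#include <stdio.h>

static int init_err;                       /* plays the role of amd_iommu_init_err */

static int init_one(void)
{
        init_err = -ENOMEM;                /* pretend an allocation failed: stash it... */
        return 0;                          /* ...the table walker ignores this value anyway */
}

/* The table walker drops the handler's return value, like acpi_table_parse(). */
static int fake_table_parse(int (*handler)(void))
{
        handler();
        return 0;                          /* 0 means "table found and handler ran" */
}

static int subsystem_init(void)
{
        int ret;

        if (fake_table_parse(init_one) != 0)
                return -ENODEV;            /* table missing */

        ret = init_err;                    /* pick up any error the handler stashed */
        if (ret)
                return ret;

        return 0;
}

int main(void)
{
        printf("subsystem_init() = %d\n", subsystem_init());
        return 0;
}
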
diff --git a/arch/x86/kernel/apb_timer.c b/arch/x86/kernel/apb_timer.c
new file mode 100644
index 000000000000..a35347501d36
--- /dev/null
+++ b/arch/x86/kernel/apb_timer.c
@@ -0,0 +1,785 @@
1/*
2 * apb_timer.c: Driver for Langwell APB timers
3 *
4 * (C) Copyright 2009 Intel Corporation
5 * Author: Jacob Pan (jacob.jun.pan@intel.com)
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU General Public License
9 * as published by the Free Software Foundation; version 2
10 * of the License.
11 *
12 * Note:
13 * Langwell is the south complex of Intel Moorestown MID platform. There are
14 * eight external timers in total that can be used by the operating system.
15 * The timer information, such as frequency and addresses, is provided to the
16 * OS via SFI tables.
17 * Timer interrupts are routed via FW/HW emulated IOAPIC independently via
18 * individual redirection table entries (RTE).
19 * Unlike HPET, there is no master counter; therefore one of the timers is
20 * used as the clocksource. The overall allocation looks like:
21 * - timers 0 .. NR_CPUS-1 for per-CPU timers
22 * - one timer for the clocksource
23 * - one timer for the watchdog driver.
24 * It is also worth noting that the APB timer does not support true one-shot mode;
25 * free-running mode is used here to emulate one-shot mode.
26 * The APB timer can also be used as a broadcast timer along with the per-CPU local
27 * APIC timer, but by default the APB timer has a higher rating than the local APIC timers.
28 */
29
30#include <linux/clocksource.h>
31#include <linux/clockchips.h>
32#include <linux/delay.h>
33#include <linux/errno.h>
34#include <linux/init.h>
35#include <linux/sysdev.h>
36#include <linux/slab.h>
37#include <linux/pm.h>
38#include <linux/pci.h>
39#include <linux/sfi.h>
40#include <linux/interrupt.h>
41#include <linux/cpu.h>
42#include <linux/irq.h>
43
44#include <asm/fixmap.h>
45#include <asm/apb_timer.h>
46
47#define APBT_MASK CLOCKSOURCE_MASK(32)
48#define APBT_SHIFT 22
49#define APBT_CLOCKEVENT_RATING 150
50#define APBT_CLOCKSOURCE_RATING 250
51#define APBT_MIN_DELTA_USEC 200
52
53#define EVT_TO_APBT_DEV(evt) container_of(evt, struct apbt_dev, evt)
54#define APBT_CLOCKEVENT0_NUM (0)
55#define APBT_CLOCKEVENT1_NUM (1)
56#define APBT_CLOCKSOURCE_NUM (2)
57
58static unsigned long apbt_address;
59static int apb_timer_block_enabled;
60static void __iomem *apbt_virt_address;
61static int phy_cs_timer_id;
62
63/*
64 * Common DW APB timer info
65 */
66static uint64_t apbt_freq;
67
68static void apbt_set_mode(enum clock_event_mode mode,
69 struct clock_event_device *evt);
70static int apbt_next_event(unsigned long delta,
71 struct clock_event_device *evt);
72static cycle_t apbt_read_clocksource(struct clocksource *cs);
73static void apbt_restart_clocksource(struct clocksource *cs);
74
75struct apbt_dev {
76 struct clock_event_device evt;
77 unsigned int num;
78 int cpu;
79 unsigned int irq;
80 unsigned int tick;
81 unsigned int count;
82 unsigned int flags;
83 char name[10];
84};
85
86int disable_apbt_percpu __cpuinitdata;
87
88static DEFINE_PER_CPU(struct apbt_dev, cpu_apbt_dev);
89
90#ifdef CONFIG_SMP
91static unsigned int apbt_num_timers_used;
92static struct apbt_dev *apbt_devs;
93#endif
94
95static inline unsigned long apbt_readl_reg(unsigned long a)
96{
97 return readl(apbt_virt_address + a);
98}
99
100static inline void apbt_writel_reg(unsigned long d, unsigned long a)
101{
102 writel(d, apbt_virt_address + a);
103}
104
105static inline unsigned long apbt_readl(int n, unsigned long a)
106{
107 return readl(apbt_virt_address + a + n * APBTMRS_REG_SIZE);
108}
109
110static inline void apbt_writel(int n, unsigned long d, unsigned long a)
111{
112 writel(d, apbt_virt_address + a + n * APBTMRS_REG_SIZE);
113}
114
115static inline void apbt_set_mapping(void)
116{
117 struct sfi_timer_table_entry *mtmr;
118
119 if (apbt_virt_address) {
120 pr_debug("APBT base already mapped\n");
121 return;
122 }
123 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
124 if (mtmr == NULL) {
125 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
126 APBT_CLOCKEVENT0_NUM);
127 return;
128 }
129 apbt_address = (unsigned long)mtmr->phys_addr;
130 if (!apbt_address) {
131 printk(KERN_WARNING "No timer base from SFI, use default\n");
132 apbt_address = APBT_DEFAULT_BASE;
133 }
134 apbt_virt_address = ioremap_nocache(apbt_address, APBT_MMAP_SIZE);
135 if (apbt_virt_address) {
136 pr_debug("Mapped APBT physical addr %p at virtual addr %p\n",\
137 (void *)apbt_address, (void *)apbt_virt_address);
138 } else {
139 pr_debug("Failed mapping APBT phy address at %p\n",\
140 (void *)apbt_address);
141 goto panic_noapbt;
142 }
143 apbt_freq = mtmr->freq_hz / USEC_PER_SEC;
144 sfi_free_mtmr(mtmr);
145
146 /* Now figure out the physical timer id for clocksource device */
147 mtmr = sfi_get_mtmr(APBT_CLOCKSOURCE_NUM);
148 if (mtmr == NULL)
149 goto panic_noapbt;
150
151 /* Now figure out the physical timer id */
152 phy_cs_timer_id = (unsigned int)(mtmr->phys_addr & 0xff)
153 / APBTMRS_REG_SIZE;
154 pr_debug("Use timer %d for clocksource\n", phy_cs_timer_id);
155 return;
156
157panic_noapbt:
158 panic("Failed to setup APB system timer\n");
159
160}
161
162static inline void apbt_clear_mapping(void)
163{
164 iounmap(apbt_virt_address);
165 apbt_virt_address = NULL;
166}
167
168/*
169 * APBT timer interrupt enable / disable
170 */
171static inline int is_apbt_capable(void)
172{
173 return apbt_virt_address ? 1 : 0;
174}
175
176static struct clocksource clocksource_apbt = {
177 .name = "apbt",
178 .rating = APBT_CLOCKSOURCE_RATING,
179 .read = apbt_read_clocksource,
180 .mask = APBT_MASK,
181 .shift = APBT_SHIFT,
182 .flags = CLOCK_SOURCE_IS_CONTINUOUS,
183 .resume = apbt_restart_clocksource,
184};
185
186/* boot APB clock event device */
187static struct clock_event_device apbt_clockevent = {
188 .name = "apbt0",
189 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
190 .set_mode = apbt_set_mode,
191 .set_next_event = apbt_next_event,
192 .shift = APBT_SHIFT,
193 .irq = 0,
194 .rating = APBT_CLOCKEVENT_RATING,
195};
196
197/*
198 * If the user does not want to use the per-CPU APB timer, just give it a lower rating
199 * than the local APIC timer and skip the late per-CPU timer init.
200 */
201static inline int __init setup_x86_mrst_timer(char *arg)
202{
203 if (!arg)
204 return -EINVAL;
205
206 if (strcmp("apbt_only", arg) == 0)
207 disable_apbt_percpu = 0;
208 else if (strcmp("lapic_and_apbt", arg) == 0)
209 disable_apbt_percpu = 1;
210 else {
211 pr_warning("X86 MRST timer option %s not recognised"
212 " use x86_mrst_timer=apbt_only or lapic_and_apbt\n",
213 arg);
214 return -EINVAL;
215 }
216 return 0;
217}
218__setup("x86_mrst_timer=", setup_x86_mrst_timer);
219
220/*
221 * Start counting down from 0xffff_ffff. This is done by toggling the enable bit
222 * and then loading the initial load count with ~0.
223 */
224static void apbt_start_counter(int n)
225{
226 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
227
228 ctrl &= ~APBTMR_CONTROL_ENABLE;
229 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
230 apbt_writel(n, ~0, APBTMR_N_LOAD_COUNT);
231 /* enable, mask interrupt */
232 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
233 ctrl |= (APBTMR_CONTROL_ENABLE | APBTMR_CONTROL_INT);
234 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
235 /* read it once to get cached counter value initialized */
236 apbt_read_clocksource(&clocksource_apbt);
237}
238
239static irqreturn_t apbt_interrupt_handler(int irq, void *data)
240{
241 struct apbt_dev *dev = (struct apbt_dev *)data;
242 struct clock_event_device *aevt = &dev->evt;
243
244 if (!aevt->event_handler) {
245 printk(KERN_INFO "Spurious APBT timer interrupt on %d\n",
246 dev->num);
247 return IRQ_NONE;
248 }
249 aevt->event_handler(aevt);
250 return IRQ_HANDLED;
251}
252
253static void apbt_restart_clocksource(struct clocksource *cs)
254{
255 apbt_start_counter(phy_cs_timer_id);
256}
257
258/* Setup IRQ routing via IOAPIC */
259#ifdef CONFIG_SMP
260static void apbt_setup_irq(struct apbt_dev *adev)
261{
262 struct irq_chip *chip;
263 struct irq_desc *desc;
264
265 /* timer0 irq has been setup early */
266 if (adev->irq == 0)
267 return;
268 desc = irq_to_desc(adev->irq);
269 chip = get_irq_chip(adev->irq);
270 disable_irq(adev->irq);
271 desc->status |= IRQ_MOVE_PCNTXT;
272 irq_set_affinity(adev->irq, cpumask_of(adev->cpu));
273 /* APB timer irqs are set up as mp_irqs, timer is edge triggered */
274 set_irq_chip_and_handler_name(adev->irq, chip, handle_edge_irq, "edge");
275 enable_irq(adev->irq);
276 if (system_state == SYSTEM_BOOTING)
277 if (request_irq(adev->irq, apbt_interrupt_handler,
278 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
279 adev->name, adev)) {
280 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
281 adev->num);
282 }
283}
284#endif
285
286static void apbt_enable_int(int n)
287{
288 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
289 /* clear pending intr */
290 apbt_readl(n, APBTMR_N_EOI);
291 ctrl &= ~APBTMR_CONTROL_INT;
292 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
293}
294
295static void apbt_disable_int(int n)
296{
297 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
298
299 ctrl |= APBTMR_CONTROL_INT;
300 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
301}
302
303
304static int __init apbt_clockevent_register(void)
305{
306 struct sfi_timer_table_entry *mtmr;
307 struct apbt_dev *adev = &__get_cpu_var(cpu_apbt_dev);
308
309 mtmr = sfi_get_mtmr(APBT_CLOCKEVENT0_NUM);
310 if (mtmr == NULL) {
311 printk(KERN_ERR "Failed to get MTMR %d from SFI\n",
312 APBT_CLOCKEVENT0_NUM);
313 return -ENODEV;
314 }
315
316 /*
317 * We need to calculate the scaled math multiplication factor for
318 * nanosecond to apbt tick conversion.
319 * mult = (nsec/cycle)*2^APBT_SHIFT
320 */
321 apbt_clockevent.mult = div_sc((unsigned long) mtmr->freq_hz
322 , NSEC_PER_SEC, APBT_SHIFT);
323
324 /* Calculate the min / max delta */
325 apbt_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF,
326 &apbt_clockevent);
327 apbt_clockevent.min_delta_ns = clockevent_delta2ns(
328 APBT_MIN_DELTA_USEC*apbt_freq,
329 &apbt_clockevent);
330 /*
331 * Start apbt with the boot cpu mask and make it
332 * global if not used for per cpu timer.
333 */
334 apbt_clockevent.cpumask = cpumask_of(smp_processor_id());
335 adev->num = smp_processor_id();
336 memcpy(&adev->evt, &apbt_clockevent, sizeof(struct clock_event_device));
337
338 if (disable_apbt_percpu) {
339 apbt_clockevent.rating = APBT_CLOCKEVENT_RATING - 100;
340 global_clock_event = &adev->evt;
341 printk(KERN_DEBUG "%s clockevent registered as global\n",
342 global_clock_event->name);
343 }
344
345 if (request_irq(apbt_clockevent.irq, apbt_interrupt_handler,
346 IRQF_TIMER | IRQF_DISABLED | IRQF_NOBALANCING,
347 apbt_clockevent.name, adev)) {
348 printk(KERN_ERR "Failed request IRQ for APBT%d\n",
349 apbt_clockevent.irq);
350 }
351
352 clockevents_register_device(&adev->evt);
353 /* Start APBT 0 interrupts */
354 apbt_enable_int(APBT_CLOCKEVENT0_NUM);
355
356 sfi_free_mtmr(mtmr);
357 return 0;
358}
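
As a quick check of the mult/shift math used in apbt_clockevent_register() above (mult = (nsec/cycle) * 2^APBT_SHIFT via div_sc()), here is a standalone sketch. The 25 MHz rate is only an assumed example, borrowed from the free-running-mode comment later in this file, and sketch_div_sc() merely re-implements the div_sc() formula:

/* Standalone check of the clockevent mult/shift conversion -- not part of
 * the patch. Assumes an example timer frequency of 25 MHz. */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define APBT_SHIFT   22

/* Same formula as the kernel's div_sc(): (freq << shift) / nsec */
static uint32_t sketch_div_sc(uint64_t freq_hz, uint64_t nsec, int shift)
{
        return (uint32_t)((freq_hz << shift) / nsec);
}

int main(void)
{
        uint64_t freq_hz = 25000000ULL;                  /* assumed 25 MHz clock */
        uint32_t mult = sketch_div_sc(freq_hz, NSEC_PER_SEC, APBT_SHIFT);

        /* Converting a 1 ms event into timer ticks: ticks = ns * mult >> shift */
        uint64_t ns = 1000000ULL;
        uint64_t ticks = (ns * mult) >> APBT_SHIFT;

        printf("mult = %u\n", mult);                     /* ~104857 */
        printf("1 ms -> %llu ticks\n",                   /* ~25000, i.e. 1 ms at 25 MHz */
               (unsigned long long)ticks);
        return 0;
}
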
359
360#ifdef CONFIG_SMP
361/* Should be called once per CPU */
362void apbt_setup_secondary_clock(void)
363{
364 struct apbt_dev *adev;
365 struct clock_event_device *aevt;
366 int cpu;
367
368 /* Don't register boot CPU clockevent */
369 cpu = smp_processor_id();
370 if (cpu == boot_cpu_id)
371 return;
372 /*
373 * We need to calculate the scaled math multiplication factor for
374 * nanosecond to apbt tick conversion.
375 * mult = (nsec/cycle)*2^APBT_SHIFT
376 */
377 printk(KERN_INFO "Init per CPU clockevent %d\n", cpu);
378 adev = &per_cpu(cpu_apbt_dev, cpu);
379 aevt = &adev->evt;
380
381 memcpy(aevt, &apbt_clockevent, sizeof(*aevt));
382 aevt->cpumask = cpumask_of(cpu);
383 aevt->name = adev->name;
384 aevt->mode = CLOCK_EVT_MODE_UNUSED;
385
386 printk(KERN_INFO "Registering CPU %d clockevent device %s, mask %08x\n",
387 cpu, aevt->name, *(u32 *)aevt->cpumask);
388
389 apbt_setup_irq(adev);
390
391 clockevents_register_device(aevt);
392
393 apbt_enable_int(cpu);
394
395 return;
396}
397
398/*
399 * This notifier handles CPU hotplug events. In case of S0i3, non-boot
400 * CPUs are disabled/enabled frequently; for performance reasons, we keep the
401 * per-CPU timer irq registered so that we do not need to do free_irq/request_irq.
402 *
403 * TODO: it might be more reliable to directly disable the per-CPU clockevent device
404 * without the notifier chain. Currently, CPU 0 may get interrupts from other
405 * CPU timers during the offline process due to the ordering of notifications.
406 * The extra interrupt is harmless.
407 */
408static int apbt_cpuhp_notify(struct notifier_block *n,
409 unsigned long action, void *hcpu)
410{
411 unsigned long cpu = (unsigned long)hcpu;
412 struct apbt_dev *adev = &per_cpu(cpu_apbt_dev, cpu);
413
414 switch (action & 0xf) {
415 case CPU_DEAD:
416 apbt_disable_int(cpu);
417 if (system_state == SYSTEM_RUNNING)
418 pr_debug("skipping APBT CPU %lu offline\n", cpu);
419 else if (adev) {
420 pr_debug("APBT clockevent for cpu %lu offline\n", cpu);
421 free_irq(adev->irq, adev);
422 }
423 break;
424 default:
425 pr_debug("APBT notified %lu, no action\n", action);
426 }
427 return NOTIFY_OK;
428}
429
430static __init int apbt_late_init(void)
431{
432 if (disable_apbt_percpu || !apb_timer_block_enabled)
433 return 0;
434 /* This notifier should be called after workqueue is ready */
435 hotcpu_notifier(apbt_cpuhp_notify, -20);
436 return 0;
437}
438fs_initcall(apbt_late_init);
439#else
440
441void apbt_setup_secondary_clock(void) {}
442
443#endif /* CONFIG_SMP */
444
445static void apbt_set_mode(enum clock_event_mode mode,
446 struct clock_event_device *evt)
447{
448 unsigned long ctrl;
449 uint64_t delta;
450 int timer_num;
451 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
452
453 timer_num = adev->num;
454 pr_debug("%s CPU %d timer %d mode=%d\n",
455 __func__, first_cpu(*evt->cpumask), timer_num, mode);
456
457 switch (mode) {
458 case CLOCK_EVT_MODE_PERIODIC:
459 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * apbt_clockevent.mult;
460 delta >>= apbt_clockevent.shift;
461 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
462 ctrl |= APBTMR_CONTROL_MODE_PERIODIC;
463 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
464 /*
465 * Per DW APB databook p. 46, the timer has to be disabled before loading
466 * the counter, otherwise it may cause a sync problem.
467 */
468 ctrl &= ~APBTMR_CONTROL_ENABLE;
469 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
470 udelay(1);
471 pr_debug("Setting clock period %d for HZ %d\n", (int)delta, HZ);
472 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
473 ctrl |= APBTMR_CONTROL_ENABLE;
474 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
475 break;
476 /* APB timer does not have one-shot mode, use free running mode */
477 case CLOCK_EVT_MODE_ONESHOT:
478 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
479 /*
480 * Set free-running mode. This mode lets the timer reload the max
481 * timeout, which gives enough time (~3 min on a 25 MHz clock) to rearm
482 * the next event, thereby emulating one-shot mode.
483 */
484 ctrl &= ~APBTMR_CONTROL_ENABLE;
485 ctrl &= ~APBTMR_CONTROL_MODE_PERIODIC;
486
487 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
488 /* write again to set free running mode */
489 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
490
491 /*
492 * DW APB p. 46, load counter with all 1s before starting free
493 * running mode.
494 */
495 apbt_writel(timer_num, ~0, APBTMR_N_LOAD_COUNT);
496 ctrl &= ~APBTMR_CONTROL_INT;
497 ctrl |= APBTMR_CONTROL_ENABLE;
498 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
499 break;
500
501 case CLOCK_EVT_MODE_UNUSED:
502 case CLOCK_EVT_MODE_SHUTDOWN:
503 apbt_disable_int(timer_num);
504 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
505 ctrl &= ~APBTMR_CONTROL_ENABLE;
506 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
507 break;
508
509 case CLOCK_EVT_MODE_RESUME:
510 apbt_enable_int(timer_num);
511 break;
512 }
513}
514
515static int apbt_next_event(unsigned long delta,
516 struct clock_event_device *evt)
517{
518 unsigned long ctrl;
519 int timer_num;
520
521 struct apbt_dev *adev = EVT_TO_APBT_DEV(evt);
522
523 timer_num = adev->num;
524 /* Disable timer */
525 ctrl = apbt_readl(timer_num, APBTMR_N_CONTROL);
526 ctrl &= ~APBTMR_CONTROL_ENABLE;
527 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
528 /* write new count */
529 apbt_writel(timer_num, delta, APBTMR_N_LOAD_COUNT);
530 ctrl |= APBTMR_CONTROL_ENABLE;
531 apbt_writel(timer_num, ctrl, APBTMR_N_CONTROL);
532 return 0;
533}
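
The "(3min on 25MHz clock)" figure in the one-shot emulation comment above checks out: a free-running down-counter reloads from 2^32 - 1, so at the assumed 25 MHz rate it wraps only after

        2^32 ticks / 25,000,000 ticks/s  ~=  171.8 s  ~=  2.9 min,

which is the window apbt_next_event() has to disable, reload and re-enable the timer for the next event.
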
534
535/*
536 * The APB timer clock is not in sync with pclk on Langwell, which translates to
537 * unreliable read values caused by sampling errors. The error does not add up
538 * over time and only happens when a 0 is sampled as a 1 by mistake, so time
539 * would appear to go backwards. The following code tries to prevent time from
540 * traveling backwards; it is a little bit paranoid.
541 */
542static cycle_t apbt_read_clocksource(struct clocksource *cs)
543{
544 unsigned long t0, t1, t2;
545 static unsigned long last_read;
546
547bad_count:
548 t1 = apbt_readl(phy_cs_timer_id,
549 APBTMR_N_CURRENT_VALUE);
550 t2 = apbt_readl(phy_cs_timer_id,
551 APBTMR_N_CURRENT_VALUE);
552 if (unlikely(t1 < t2)) {
553 pr_debug("APBT: read current count error %lx:%lx:%lx\n",
554 t1, t2, t2 - t1);
555 goto bad_count;
556 }
557 /*
558 * Check against the cached last read to make sure time does not go backwards.
559 * It could be a normal rollover, but we do a triple check anyway.
560 */
561 if (unlikely(t2 > last_read)) {
562 /* check if we have a normal rollover */
563 unsigned long raw_intr_status =
564 apbt_readl_reg(APBTMRS_RAW_INT_STATUS);
565 /*
566 * cs timer interrupt is masked but raw intr bit is set if
567 * rollover occurs. then we read EOI reg to clear it.
568 */
569 if (raw_intr_status & (1 << phy_cs_timer_id)) {
570 apbt_readl(phy_cs_timer_id, APBTMR_N_EOI);
571 goto out;
572 }
573 pr_debug("APB CS going back %lx:%lx:%lx ",
574 t2, last_read, t2 - last_read);
575bad_count_x3:
576 pr_debug("triple check enforced\n");
577 t0 = apbt_readl(phy_cs_timer_id,
578 APBTMR_N_CURRENT_VALUE);
579 udelay(1);
580 t1 = apbt_readl(phy_cs_timer_id,
581 APBTMR_N_CURRENT_VALUE);
582 udelay(1);
583 t2 = apbt_readl(phy_cs_timer_id,
584 APBTMR_N_CURRENT_VALUE);
585 if ((t2 > t1) || (t1 > t0)) {
586 printk(KERN_ERR "Error: APB CS triple check failed\n");
587 goto bad_count_x3;
588 }
589 }
590out:
591 last_read = t2;
592 return (cycle_t)~t2;
593}
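
The sampling-error guard in apbt_read_clocksource() above boils down to "read the down-counter twice and retry if the second read is larger". A self-contained toy version of just that guard, with a simulated glitching counter (read_raw(), read_stable() and the glitch injection are all hypothetical, not kernel code):

/* Toy version of the double-read guard used above -- not part of the patch.
 * read_raw() simulates a down-counter that glitches on one sample. */
#include <stdint.h>
#include <stdio.h>

static uint32_t hw = 1000000;              /* simulated free-running down-counter */
static int calls;

static uint32_t read_raw(void)
{
        hw -= 3;                           /* the counter ticks down between reads */
        return (++calls == 2) ? hw + 100 : hw;   /* inject one upward glitch */
}

static uint32_t read_stable(void)
{
        uint32_t t1, t2;

        do {
                t1 = read_raw();
                t2 = read_raw();
                /*
                 * A down-counter must give t2 <= t1 for two back-to-back
                 * reads; anything else is a sampling glitch, so read again.
                 */
        } while (t1 < t2);

        return t2;
}

int main(void)
{
        printf("stable sample: %u\n", read_stable());   /* prints 999988 */
        return 0;
}
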
594
595static int apbt_clocksource_register(void)
596{
597 u64 start, now;
598 cycle_t t1;
599
600 /* Start the counter, use timer 2 as source, timer 0/1 for event */
601 apbt_start_counter(phy_cs_timer_id);
602
603 /* Verify whether apbt counter works */
604 t1 = apbt_read_clocksource(&clocksource_apbt);
605 rdtscll(start);
606
607 /*
608 * We don't know the TSC frequency yet, but waiting for
609 * 200000 TSC cycles is safe:
610 * 4 GHz == 50us
611 * 1 GHz == 200us
612 */
613 do {
614 rep_nop();
615 rdtscll(now);
616 } while ((now - start) < 200000UL);
617
618 /* APBT is the only always on clocksource, it has to work! */
619 if (t1 == apbt_read_clocksource(&clocksource_apbt))
620 panic("APBT counter not counting. APBT disabled\n");
621
622 /*
623 * initialize and register APBT clocksource
624 * convert that to ns/clock cycle
625 * mult = (ns/c) * 2^APBT_SHIFT
626 */
627 clocksource_apbt.mult = div_sc(MSEC_PER_SEC,
628 (unsigned long) apbt_freq, APBT_SHIFT);
629 clocksource_register(&clocksource_apbt);
630
631 return 0;
632}
633
634/*
635 * Early setup of the APBT timer: only timer 0 is used for booting, then we
636 * switch to per-CPU timers if possible.
637 * The function returns void; apb_timer_block_enabled records whether the
638 * timer block was successfully set up.
639 * Panics if setup fails, since this is the only platform timer on Moorestown.
640 */
641void __init apbt_time_init(void)
642{
643#ifdef CONFIG_SMP
644 int i;
645 struct sfi_timer_table_entry *p_mtmr;
646 unsigned int percpu_timer;
647 struct apbt_dev *adev;
648#endif
649
650 if (apb_timer_block_enabled)
651 return;
652 apbt_set_mapping();
653 if (apbt_virt_address) {
654 pr_debug("Found APBT version 0x%lx\n",\
655 apbt_readl_reg(APBTMRS_COMP_VERSION));
656 } else
657 goto out_noapbt;
658 /*
659 * Read the frequency and check for a sane value, for ESL model
660 * we extend the possible clock range to allow time scaling.
661 */
662
663 if (apbt_freq < APBT_MIN_FREQ || apbt_freq > APBT_MAX_FREQ) {
664 pr_debug("APBT has invalid freq 0x%llx\n", apbt_freq);
665 goto out_noapbt;
666 }
667 if (apbt_clocksource_register()) {
668 pr_debug("APBT has failed to register clocksource\n");
669 goto out_noapbt;
670 }
671 if (!apbt_clockevent_register())
672 apb_timer_block_enabled = 1;
673 else {
674 pr_debug("APBT has failed to register clockevent\n");
675 goto out_noapbt;
676 }
677#ifdef CONFIG_SMP
678 /* kernel cmdline disabled the per-CPU apb timer, so we will use lapic timers */
679 if (disable_apbt_percpu) {
680 printk(KERN_INFO "apbt: disabled per cpu timer\n");
681 return;
682 }
683 pr_debug("%s: %d CPUs online\n", __func__, num_online_cpus());
684 if (num_possible_cpus() <= sfi_mtimer_num) {
685 percpu_timer = 1;
686 apbt_num_timers_used = num_possible_cpus();
687 } else {
688 percpu_timer = 0;
689 apbt_num_timers_used = 1;
690 adev = &per_cpu(cpu_apbt_dev, 0);
691 adev->flags &= ~APBT_DEV_USED;
692 }
693 pr_debug("%s: %d APB timers used\n", __func__, apbt_num_timers_used);
694
695 /* here we set up per CPU timer data structure */
696 apbt_devs = kzalloc(sizeof(struct apbt_dev) * apbt_num_timers_used,
697 GFP_KERNEL);
698 if (!apbt_devs) {
699 printk(KERN_ERR "Failed to allocate APB timer devices\n");
700 return;
701 }
702 for (i = 0; i < apbt_num_timers_used; i++) {
703 adev = &per_cpu(cpu_apbt_dev, i);
704 adev->num = i;
705 adev->cpu = i;
706 p_mtmr = sfi_get_mtmr(i);
707 if (p_mtmr) {
708 adev->tick = p_mtmr->freq_hz;
709 adev->irq = p_mtmr->irq;
710 } else
711 printk(KERN_ERR "Failed to get timer for cpu %d\n", i);
712 adev->count = 0;
713 sprintf(adev->name, "apbt%d", i);
714 }
715#endif
716
717 return;
718
719out_noapbt:
720 apbt_clear_mapping();
721 apb_timer_block_enabled = 0;
722 panic("failed to enable APB timer\n");
723}
724
725static inline void apbt_disable(int n)
726{
727 if (is_apbt_capable()) {
728 unsigned long ctrl = apbt_readl(n, APBTMR_N_CONTROL);
729 ctrl &= ~APBTMR_CONTROL_ENABLE;
730 apbt_writel(n, ctrl, APBTMR_N_CONTROL);
731 }
732}
733
734/* called before apb_timer_enable, use early map */
735unsigned long apbt_quick_calibrate()
736{
737 int i, scale;
738 u64 old, new;
739 cycle_t t1, t2;
740 unsigned long khz = 0;
741 u32 loop, shift;
742
743 apbt_set_mapping();
744 apbt_start_counter(phy_cs_timer_id);
745
746 /* check if the timer can count down, otherwise return */
747 old = apbt_read_clocksource(&clocksource_apbt);
748 i = 10000;
749 while (--i) {
750 if (old != apbt_read_clocksource(&clocksource_apbt))
751 break;
752 }
753 if (!i)
754 goto failed;
755
756 /* count 16 ms */
757 loop = (apbt_freq * 1000) << 4;
758
759 /* restart the timer to ensure it won't get to 0 in the calibration */
760 apbt_start_counter(phy_cs_timer_id);
761
762 old = apbt_read_clocksource(&clocksource_apbt);
763 old += loop;
764
765 t1 = __native_read_tsc();
766
767 do {
768 new = apbt_read_clocksource(&clocksource_apbt);
769 } while (new < old);
770
771 t2 = __native_read_tsc();
772
773 shift = 5;
774 if (unlikely(loop >> shift == 0)) {
775 printk(KERN_INFO
776 "APBT TSC calibration failed, not enough resolution\n");
777 return 0;
778 }
779 scale = (int)div_u64((t2 - t1), loop >> shift);
780 khz = (scale * apbt_freq * 1000) >> shift;
781 printk(KERN_INFO "TSC freq calculated by APB timer is %lu khz\n", khz);
782 return khz;
783failed:
784 return 0;
785}
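
The arithmetic in apbt_quick_calibrate() can also be sanity-checked with concrete numbers. The sketch below assumes a 25 MHz timer (apbt_freq = 25, since it is stored in MHz) and a TSC that advances 16,000,000 cycles over the 16 ms window, i.e. a 1 GHz TSC; both figures are assumed example values, not measurements:

/* Standalone check of the apbt_quick_calibrate() math -- not part of the
 * patch. All input values are assumed examples. */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        uint64_t apbt_freq = 25;                              /* MHz, assumed 25 MHz timer */
        int shift = 5;

        uint32_t loop = (uint32_t)((apbt_freq * 1000) << 4);  /* 400000 counts = 16 ms */
        uint64_t tsc_delta = 16000000ULL;                     /* assumed TSC cycles in 16 ms */

        int scale = (int)(tsc_delta / (loop >> shift));       /* 16000000 / 12500 = 1280 */
        unsigned long khz =
                (unsigned long)((scale * apbt_freq * 1000) >> shift);

        printf("TSC freq calculated by APB timer is %lu khz\n", khz);  /* 1000000 */
        return 0;
}
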
diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c
index 128111d8ffe0..b5d8b0bcf235 100644
--- a/arch/x86/kernel/aperture_64.c
+++ b/arch/x86/kernel/aperture_64.c
@@ -28,6 +28,7 @@
28#include <asm/pci-direct.h> 28#include <asm/pci-direct.h>
29#include <asm/dma.h> 29#include <asm/dma.h>
30#include <asm/k8.h> 30#include <asm/k8.h>
31#include <asm/x86_init.h>
31 32
32int gart_iommu_aperture; 33int gart_iommu_aperture;
33int gart_iommu_aperture_disabled __initdata; 34int gart_iommu_aperture_disabled __initdata;
@@ -279,7 +280,8 @@ void __init early_gart_iommu_check(void)
279 * or BIOS forget to put that in reserved. 280 * or BIOS forget to put that in reserved.
280 * try to update e820 to make that region as reserved. 281 * try to update e820 to make that region as reserved.
281 */ 282 */
282 int i, fix, slot; 283 u32 agp_aper_base = 0, agp_aper_order = 0;
284 int i, fix, slot, valid_agp = 0;
283 u32 ctl; 285 u32 ctl;
284 u32 aper_size = 0, aper_order = 0, last_aper_order = 0; 286 u32 aper_size = 0, aper_order = 0, last_aper_order = 0;
285 u64 aper_base = 0, last_aper_base = 0; 287 u64 aper_base = 0, last_aper_base = 0;
@@ -289,6 +291,8 @@ void __init early_gart_iommu_check(void)
289 return; 291 return;
290 292
291 /* This is mostly duplicate of iommu_hole_init */ 293 /* This is mostly duplicate of iommu_hole_init */
294 agp_aper_base = search_agp_bridge(&agp_aper_order, &valid_agp);
295
292 fix = 0; 296 fix = 0;
293 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 297 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
294 int bus; 298 int bus;
@@ -341,10 +345,10 @@ void __init early_gart_iommu_check(void)
341 } 345 }
342 } 346 }
343 347
344 if (!fix) 348 if (valid_agp)
345 return; 349 return;
346 350
347 /* different nodes have different setting, disable them all at first*/ 351 /* disable them all at first */
348 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 352 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
349 int bus; 353 int bus;
350 int dev_base, dev_limit; 354 int dev_base, dev_limit;
@@ -389,6 +393,7 @@ void __init gart_iommu_hole_init(void)
389 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) { 393 for (i = 0; i < ARRAY_SIZE(bus_dev_ranges); i++) {
390 int bus; 394 int bus;
391 int dev_base, dev_limit; 395 int dev_base, dev_limit;
396 u32 ctl;
392 397
393 bus = bus_dev_ranges[i].bus; 398 bus = bus_dev_ranges[i].bus;
394 dev_base = bus_dev_ranges[i].dev_base; 399 dev_base = bus_dev_ranges[i].dev_base;
@@ -400,8 +405,21 @@ void __init gart_iommu_hole_init(void)
400 405
401 iommu_detected = 1; 406 iommu_detected = 1;
402 gart_iommu_aperture = 1; 407 gart_iommu_aperture = 1;
408 x86_init.iommu.iommu_init = gart_iommu_init;
409
410 ctl = read_pci_config(bus, slot, 3,
411 AMD64_GARTAPERTURECTL);
412
413 /*
414 * Before we do anything else disable the GART. It may
415 * still be enabled if we boot into a crash-kernel here.
416 * Reconfiguring the GART while it is enabled could have
417 * unknown side-effects.
418 */
419 ctl &= ~GARTEN;
420 write_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL, ctl);
403 421
404 aper_order = (read_pci_config(bus, slot, 3, AMD64_GARTAPERTURECTL) >> 1) & 7; 422 aper_order = (ctl >> 1) & 7;
405 aper_size = (32 * 1024 * 1024) << aper_order; 423 aper_size = (32 * 1024 * 1024) << aper_order;
406 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff; 424 aper_base = read_pci_config(bus, slot, 3, AMD64_GARTAPERTUREBASE) & 0x7fff;
407 aper_base <<= 25; 425 aper_base <<= 25;
@@ -456,8 +474,6 @@ out:
456 474
457 if (aper_alloc) { 475 if (aper_alloc) {
458 /* Got the aperture from the AGP bridge */ 476 /* Got the aperture from the AGP bridge */
459 } else if (swiotlb && !valid_agp) {
460 /* Do nothing */
461 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) || 477 } else if ((!no_iommu && max_pfn > MAX_DMA32_PFN) ||
462 force_iommu || 478 force_iommu ||
463 valid_agp || 479 valid_agp ||
diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile
index da7b7b9f8bd8..565c1bfc507d 100644
--- a/arch/x86/kernel/apic/Makefile
+++ b/arch/x86/kernel/apic/Makefile
@@ -2,7 +2,7 @@
2# Makefile for local APIC drivers and for the IO-APIC code 2# Makefile for local APIC drivers and for the IO-APIC code
3# 3#
4 4
5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o probe_$(BITS).o ipi.o nmi.o 5obj-$(CONFIG_X86_LOCAL_APIC) += apic.o apic_noop.o probe_$(BITS).o ipi.o nmi.o
6obj-$(CONFIG_X86_IO_APIC) += io_apic.o 6obj-$(CONFIG_X86_IO_APIC) += io_apic.o
7obj-$(CONFIG_SMP) += ipi.o 7obj-$(CONFIG_SMP) += ipi.o
8 8
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index 894aa97f0717..e5a4a1e01618 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -61,12 +61,6 @@ unsigned int boot_cpu_physical_apicid = -1U;
61 61
62/* 62/*
63 * The highest APIC ID seen during enumeration. 63 * The highest APIC ID seen during enumeration.
64 *
65 * On AMD, this determines the messaging protocol we can use: if all APIC IDs
66 * are in the 0 ... 7 range, then we can use logical addressing which
67 * has some performance advantages (better broadcasting).
68 *
69 * If there's an APIC ID above 8, we use physical addressing.
70 */ 64 */
71unsigned int max_physical_apicid; 65unsigned int max_physical_apicid;
72 66
@@ -241,28 +235,13 @@ static int modern_apic(void)
241} 235}
242 236
243/* 237/*
244 * bare function to substitute write operation 238 * right after this call the apic becomes NOOP driven
245 * and it's _that_ fast :) 239 * so apic->write/read doesn't do anything
246 */
247static void native_apic_write_dummy(u32 reg, u32 v)
248{
249 WARN_ON_ONCE((cpu_has_apic || !disable_apic));
250}
251
252static u32 native_apic_read_dummy(u32 reg)
253{
254 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
255 return 0;
256}
257
258/*
259 * right after this call apic->write/read doesn't do anything
260 * note that there is no restore operation it works one way
261 */ 240 */
262void apic_disable(void) 241void apic_disable(void)
263{ 242{
264 apic->read = native_apic_read_dummy; 243 pr_info("APIC: switched to apic NOOP\n");
265 apic->write = native_apic_write_dummy; 244 apic = &apic_noop;
266} 245}
267 246
268void native_apic_wait_icr_idle(void) 247void native_apic_wait_icr_idle(void)
@@ -459,7 +438,7 @@ static void lapic_timer_setup(enum clock_event_mode mode,
459 v = apic_read(APIC_LVTT); 438 v = apic_read(APIC_LVTT);
460 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR); 439 v |= (APIC_LVT_MASKED | LOCAL_TIMER_VECTOR);
461 apic_write(APIC_LVTT, v); 440 apic_write(APIC_LVTT, v);
462 apic_write(APIC_TMICT, 0xffffffff); 441 apic_write(APIC_TMICT, 0);
463 break; 442 break;
464 case CLOCK_EVT_MODE_RESUME: 443 case CLOCK_EVT_MODE_RESUME:
465 /* Nothing to do here */ 444 /* Nothing to do here */
@@ -602,7 +581,7 @@ calibrate_by_pmtimer(long deltapm, long *delta, long *deltatsc)
602 res = (((u64)(*deltatsc)) * pm_100ms); 581 res = (((u64)(*deltatsc)) * pm_100ms);
603 do_div(res, deltapm); 582 do_div(res, deltapm);
604 apic_printk(APIC_VERBOSE, "TSC delta adjusted to " 583 apic_printk(APIC_VERBOSE, "TSC delta adjusted to "
605 "PM-Timer: %lu (%ld) \n", 584 "PM-Timer: %lu (%ld)\n",
606 (unsigned long)res, *deltatsc); 585 (unsigned long)res, *deltatsc);
607 *deltatsc = (long)res; 586 *deltatsc = (long)res;
608 } 587 }
@@ -662,7 +641,7 @@ static int __init calibrate_APIC_clock(void)
662 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS; 641 calibration_result = (delta * APIC_DIVISOR) / LAPIC_CAL_LOOPS;
663 642
664 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta); 643 apic_printk(APIC_VERBOSE, "..... delta %ld\n", delta);
665 apic_printk(APIC_VERBOSE, "..... mult: %ld\n", lapic_clockevent.mult); 644 apic_printk(APIC_VERBOSE, "..... mult: %u\n", lapic_clockevent.mult);
666 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n", 645 apic_printk(APIC_VERBOSE, "..... calibration result: %u\n",
667 calibration_result); 646 calibration_result);
668 647
@@ -1356,7 +1335,7 @@ void enable_x2apic(void)
1356 1335
1357 rdmsr(MSR_IA32_APICBASE, msr, msr2); 1336 rdmsr(MSR_IA32_APICBASE, msr, msr2);
1358 if (!(msr & X2APIC_ENABLE)) { 1337 if (!(msr & X2APIC_ENABLE)) {
1359 pr_info("Enabling x2apic\n"); 1338 printk_once(KERN_INFO "Enabling x2apic\n");
1360 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0); 1339 wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, 0);
1361 } 1340 }
1362} 1341}
@@ -1392,14 +1371,11 @@ void __init enable_IR_x2apic(void)
1392 unsigned long flags; 1371 unsigned long flags;
1393 struct IO_APIC_route_entry **ioapic_entries = NULL; 1372 struct IO_APIC_route_entry **ioapic_entries = NULL;
1394 int ret, x2apic_enabled = 0; 1373 int ret, x2apic_enabled = 0;
1395 int dmar_table_init_ret = 0; 1374 int dmar_table_init_ret;
1396 1375
1397#ifdef CONFIG_INTR_REMAP
1398 dmar_table_init_ret = dmar_table_init(); 1376 dmar_table_init_ret = dmar_table_init();
1399 if (dmar_table_init_ret) 1377 if (dmar_table_init_ret && !x2apic_supported())
1400 pr_debug("dmar_table_init() failed with %d:\n", 1378 return;
1401 dmar_table_init_ret);
1402#endif
1403 1379
1404 ioapic_entries = alloc_ioapic_entries(); 1380 ioapic_entries = alloc_ioapic_entries();
1405 if (!ioapic_entries) { 1381 if (!ioapic_entries) {
@@ -1414,7 +1390,7 @@ void __init enable_IR_x2apic(void)
1414 } 1390 }
1415 1391
1416 local_irq_save(flags); 1392 local_irq_save(flags);
1417 mask_8259A(); 1393 legacy_pic->mask_all();
1418 mask_IO_APIC_setup(ioapic_entries); 1394 mask_IO_APIC_setup(ioapic_entries);
1419 1395
1420 if (dmar_table_init_ret) 1396 if (dmar_table_init_ret)
@@ -1446,7 +1422,7 @@ void __init enable_IR_x2apic(void)
1446nox2apic: 1422nox2apic:
1447 if (!ret) /* IR enabling failed */ 1423 if (!ret) /* IR enabling failed */
1448 restore_IO_APIC_setup(ioapic_entries); 1424 restore_IO_APIC_setup(ioapic_entries);
1449 unmask_8259A(); 1425 legacy_pic->restore_mask();
1450 local_irq_restore(flags); 1426 local_irq_restore(flags);
1451 1427
1452out: 1428out:
@@ -1664,8 +1640,8 @@ int __init APIC_init_uniprocessor(void)
1664 } 1640 }
1665#endif 1641#endif
1666 1642
1643#ifndef CONFIG_SMP
1667 enable_IR_x2apic(); 1644 enable_IR_x2apic();
1668#ifdef CONFIG_X86_64
1669 default_setup_apic_routing(); 1645 default_setup_apic_routing();
1670#endif 1646#endif
1671 1647
@@ -1915,18 +1891,6 @@ void __cpuinit generic_processor_info(int apicid, int version)
1915 if (apicid > max_physical_apicid) 1891 if (apicid > max_physical_apicid)
1916 max_physical_apicid = apicid; 1892 max_physical_apicid = apicid;
1917 1893
1918#ifdef CONFIG_X86_32
1919 switch (boot_cpu_data.x86_vendor) {
1920 case X86_VENDOR_INTEL:
1921 if (num_processors > 8)
1922 def_to_bigsmp = 1;
1923 break;
1924 case X86_VENDOR_AMD:
1925 if (max_physical_apicid >= 8)
1926 def_to_bigsmp = 1;
1927 }
1928#endif
1929
1930#if defined(CONFIG_SMP) || defined(CONFIG_X86_64) 1894#if defined(CONFIG_SMP) || defined(CONFIG_X86_64)
1931 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid; 1895 early_per_cpu(x86_cpu_to_apicid, cpu) = apicid;
1932 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid; 1896 early_per_cpu(x86_bios_cpu_apicid, cpu) = apicid;
@@ -2056,7 +2020,7 @@ static int lapic_resume(struct sys_device *dev)
2056 } 2020 }
2057 2021
2058 mask_IO_APIC_setup(ioapic_entries); 2022 mask_IO_APIC_setup(ioapic_entries);
2059 mask_8259A(); 2023 legacy_pic->mask_all();
2060 } 2024 }
2061 2025
2062 if (x2apic_mode) 2026 if (x2apic_mode)
@@ -2100,7 +2064,7 @@ static int lapic_resume(struct sys_device *dev)
2100 2064
2101 if (intr_remapping_enabled) { 2065 if (intr_remapping_enabled) {
2102 reenable_intr_remapping(x2apic_mode); 2066 reenable_intr_remapping(x2apic_mode);
2103 unmask_8259A(); 2067 legacy_pic->restore_mask();
2104 restore_IO_APIC_setup(ioapic_entries); 2068 restore_IO_APIC_setup(ioapic_entries);
2105 free_ioapic_entries(ioapic_entries); 2069 free_ioapic_entries(ioapic_entries);
2106 } 2070 }
diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c
index d0c99abc26c3..09d3b17ce0c2 100644
--- a/arch/x86/kernel/apic/apic_flat_64.c
+++ b/arch/x86/kernel/apic/apic_flat_64.c
@@ -223,7 +223,7 @@ struct apic apic_flat = {
223}; 223};
224 224
225/* 225/*
226 * Physflat mode is used when there are more than 8 CPUs on a AMD system. 226 * Physflat mode is used when there are more than 8 CPUs on a system.
227 * We cannot use logical delivery in this case because the mask 227 * We cannot use logical delivery in this case because the mask
228 * overflows, so use physical mode. 228 * overflows, so use physical mode.
229 */ 229 */
@@ -240,6 +240,11 @@ static int physflat_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
240 printk(KERN_DEBUG "system APIC only can use physical flat"); 240 printk(KERN_DEBUG "system APIC only can use physical flat");
241 return 1; 241 return 1;
242 } 242 }
243
244 if (!strncmp(oem_id, "IBM", 3) && !strncmp(oem_table_id, "EXA", 3)) {
245 printk(KERN_DEBUG "IBM Summit detected, will use apic physical");
246 return 1;
247 }
243#endif 248#endif
244 249
245 return 0; 250 return 0;
@@ -306,10 +311,7 @@ physflat_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
306 if (cpumask_test_cpu(cpu, cpu_online_mask)) 311 if (cpumask_test_cpu(cpu, cpu_online_mask))
307 break; 312 break;
308 } 313 }
309 if (cpu < nr_cpu_ids) 314 return per_cpu(x86_cpu_to_apicid, cpu);
310 return per_cpu(x86_cpu_to_apicid, cpu);
311
312 return BAD_APICID;
313} 315}
314 316
315struct apic apic_physflat = { 317struct apic apic_physflat = {
diff --git a/arch/x86/kernel/apic/apic_noop.c b/arch/x86/kernel/apic/apic_noop.c
new file mode 100644
index 000000000000..e31b9ffe25f5
--- /dev/null
+++ b/arch/x86/kernel/apic/apic_noop.c
@@ -0,0 +1,200 @@
1/*
2 * NOOP APIC driver.
3 *
4 * Does almost nothing and should be substituted by a real apic driver via
5 * probe routine.
6 *
7 * Though, in case the apic is disabled (for some reason), we try
8 * not to uglify the caller's code and still allow (some) apic routines,
9 * like self-ipi, etc., to be called.
10 */
11
12#include <linux/threads.h>
13#include <linux/cpumask.h>
14#include <linux/module.h>
15#include <linux/string.h>
16#include <linux/kernel.h>
17#include <linux/ctype.h>
18#include <linux/init.h>
19#include <linux/errno.h>
20#include <asm/fixmap.h>
21#include <asm/mpspec.h>
22#include <asm/apicdef.h>
23#include <asm/apic.h>
24#include <asm/setup.h>
25
26#include <linux/smp.h>
27#include <asm/ipi.h>
28
29#include <linux/interrupt.h>
30#include <asm/acpi.h>
31#include <asm/e820.h>
32
33static void noop_init_apic_ldr(void) { }
34static void noop_send_IPI_mask(const struct cpumask *cpumask, int vector) { }
35static void noop_send_IPI_mask_allbutself(const struct cpumask *cpumask, int vector) { }
36static void noop_send_IPI_allbutself(int vector) { }
37static void noop_send_IPI_all(int vector) { }
38static void noop_send_IPI_self(int vector) { }
39static void noop_apic_wait_icr_idle(void) { }
40static void noop_apic_icr_write(u32 low, u32 id) { }
41
42static int noop_wakeup_secondary_cpu(int apicid, unsigned long start_eip)
43{
44 return -1;
45}
46
47static u32 noop_safe_apic_wait_icr_idle(void)
48{
49 return 0;
50}
51
52static u64 noop_apic_icr_read(void)
53{
54 return 0;
55}
56
57static int noop_cpu_to_logical_apicid(int cpu)
58{
59 return 0;
60}
61
62static int noop_phys_pkg_id(int cpuid_apic, int index_msb)
63{
64 return 0;
65}
66
67static unsigned int noop_get_apic_id(unsigned long x)
68{
69 return 0;
70}
71
72static int noop_probe(void)
73{
74 /*
75 * NOOP apic should not ever be
76 * enabled via probe routine
77 */
78 return 0;
79}
80
81static int noop_apic_id_registered(void)
82{
83 /*
84 * if we were really "pedantic"
85 * we would pass read_apic_id() here
86 * but since NOOP assumes APIC ID = 0
87 * let's save a few cycles
88 */
89 return physid_isset(0, phys_cpu_present_map);
90}
91
92static const struct cpumask *noop_target_cpus(void)
93{
94 /* only BSP here */
95 return cpumask_of(0);
96}
97
98static unsigned long noop_check_apicid_used(physid_mask_t *map, int apicid)
99{
100 return physid_isset(apicid, *map);
101}
102
103static unsigned long noop_check_apicid_present(int bit)
104{
105 return physid_isset(bit, phys_cpu_present_map);
106}
107
108static void noop_vector_allocation_domain(int cpu, struct cpumask *retmask)
109{
110 if (cpu != 0)
111 pr_warning("APIC: Vector allocated for non-BSP cpu\n");
112 cpumask_clear(retmask);
113 cpumask_set_cpu(cpu, retmask);
114}
115
116int noop_apicid_to_node(int logical_apicid)
117{
118 /* we're always on node 0 */
119 return 0;
120}
121
122static u32 noop_apic_read(u32 reg)
123{
124 WARN_ON_ONCE((cpu_has_apic && !disable_apic));
125 return 0;
126}
127
128static void noop_apic_write(u32 reg, u32 v)
129{
130 WARN_ON_ONCE(cpu_has_apic && !disable_apic);
131}
132
133struct apic apic_noop = {
134 .name = "noop",
135 .probe = noop_probe,
136 .acpi_madt_oem_check = NULL,
137
138 .apic_id_registered = noop_apic_id_registered,
139
140 .irq_delivery_mode = dest_LowestPrio,
141 /* logical delivery broadcast to all CPUs: */
142 .irq_dest_mode = 1,
143
144 .target_cpus = noop_target_cpus,
145 .disable_esr = 0,
146 .dest_logical = APIC_DEST_LOGICAL,
147 .check_apicid_used = noop_check_apicid_used,
148 .check_apicid_present = noop_check_apicid_present,
149
150 .vector_allocation_domain = noop_vector_allocation_domain,
151 .init_apic_ldr = noop_init_apic_ldr,
152
153 .ioapic_phys_id_map = default_ioapic_phys_id_map,
154 .setup_apic_routing = NULL,
155 .multi_timer_check = NULL,
156 .apicid_to_node = noop_apicid_to_node,
157
158 .cpu_to_logical_apicid = noop_cpu_to_logical_apicid,
159 .cpu_present_to_apicid = default_cpu_present_to_apicid,
160 .apicid_to_cpu_present = physid_set_mask_of_physid,
161
162 .setup_portio_remap = NULL,
163 .check_phys_apicid_present = default_check_phys_apicid_present,
164 .enable_apic_mode = NULL,
165
166 .phys_pkg_id = noop_phys_pkg_id,
167
168 .mps_oem_check = NULL,
169
170 .get_apic_id = noop_get_apic_id,
171 .set_apic_id = NULL,
172 .apic_id_mask = 0x0F << 24,
173
174 .cpu_mask_to_apicid = default_cpu_mask_to_apicid,
175 .cpu_mask_to_apicid_and = default_cpu_mask_to_apicid_and,
176
177 .send_IPI_mask = noop_send_IPI_mask,
178 .send_IPI_mask_allbutself = noop_send_IPI_mask_allbutself,
179 .send_IPI_allbutself = noop_send_IPI_allbutself,
180 .send_IPI_all = noop_send_IPI_all,
181 .send_IPI_self = noop_send_IPI_self,
182
183 .wakeup_secondary_cpu = noop_wakeup_secondary_cpu,
184
185 /* should be safe */
186 .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW,
187 .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH,
188
189 .wait_for_init_deassert = NULL,
190
191 .smp_callin_clear_local_apic = NULL,
192 .inquire_remote_apic = NULL,
193
194 .read = noop_apic_read,
195 .write = noop_apic_write,
196 .icr_read = noop_apic_icr_read,
197 .icr_write = noop_apic_icr_write,
198 .wait_icr_idle = noop_apic_wait_icr_idle,
199 .safe_wait_icr_idle = noop_safe_apic_wait_icr_idle,
200};
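
apic_noop.c above is an instance of a common kernel pattern: callers always go through a single operations table, and when the hardware is absent or disabled the table is simply pointed at do-nothing implementations instead of sprinkling "if (disabled)" checks everywhere (that is what the apic_disable() hunk earlier does with "apic = &apic_noop"). A miniature, self-contained version of the pattern, with every name below hypothetical:

/* Miniature version of the NOOP-driver pattern -- illustrative only,
 * all names are hypothetical. */
#include <stdio.h>

struct dev_ops {
        unsigned int (*read)(unsigned int reg);
        void (*write)(unsigned int reg, unsigned int val);
};

static unsigned int real_read(unsigned int reg)             { return 0x42; }
static void real_write(unsigned int reg, unsigned int val)  { /* touch hardware */ }

static unsigned int noop_read(unsigned int reg)             { return 0; }
static void noop_write(unsigned int reg, unsigned int val)  { }

static const struct dev_ops real_ops = { real_read, real_write };
static const struct dev_ops noop_ops = { noop_read, noop_write };

/* Callers always go through 'ops'; disabling the device just repoints it. */
static const struct dev_ops *ops = &real_ops;

int main(void)
{
        printf("enabled read:  0x%x\n", ops->read(0));
        ops = &noop_ops;                     /* device disabled */
        printf("disabled read: 0x%x\n", ops->read(0));
        ops->write(0, 1);                    /* harmless no-op now */
        return 0;
}
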
diff --git a/arch/x86/kernel/apic/bigsmp_32.c b/arch/x86/kernel/apic/bigsmp_32.c
index 77a06413b6b2..cb804c5091b9 100644
--- a/arch/x86/kernel/apic/bigsmp_32.c
+++ b/arch/x86/kernel/apic/bigsmp_32.c
@@ -35,7 +35,7 @@ static const struct cpumask *bigsmp_target_cpus(void)
35#endif 35#endif
36} 36}
37 37
38static unsigned long bigsmp_check_apicid_used(physid_mask_t bitmap, int apicid) 38static unsigned long bigsmp_check_apicid_used(physid_mask_t *map, int apicid)
39{ 39{
40 return 0; 40 return 0;
41} 41}
@@ -93,11 +93,6 @@ static int bigsmp_cpu_present_to_apicid(int mps_cpu)
93 return BAD_APICID; 93 return BAD_APICID;
94} 94}
95 95
96static physid_mask_t bigsmp_apicid_to_cpu_present(int phys_apicid)
97{
98 return physid_mask_of_physid(phys_apicid);
99}
100
101/* Mapping from cpu number to logical apicid */ 96/* Mapping from cpu number to logical apicid */
102static inline int bigsmp_cpu_to_logical_apicid(int cpu) 97static inline int bigsmp_cpu_to_logical_apicid(int cpu)
103{ 98{
@@ -106,10 +101,10 @@ static inline int bigsmp_cpu_to_logical_apicid(int cpu)
106 return cpu_physical_id(cpu); 101 return cpu_physical_id(cpu);
107} 102}
108 103
109static physid_mask_t bigsmp_ioapic_phys_id_map(physid_mask_t phys_map) 104static void bigsmp_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
110{ 105{
111 /* For clustered we don't have a good way to do this yet - hack */ 106 /* For clustered we don't have a good way to do this yet - hack */
112 return physids_promote(0xFFL); 107 physids_promote(0xFFL, retmap);
113} 108}
114 109
115static int bigsmp_check_phys_apicid_present(int phys_apicid) 110static int bigsmp_check_phys_apicid_present(int phys_apicid)
@@ -136,10 +131,7 @@ static unsigned int bigsmp_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
136 if (cpumask_test_cpu(cpu, cpu_online_mask)) 131 if (cpumask_test_cpu(cpu, cpu_online_mask))
137 break; 132 break;
138 } 133 }
139 if (cpu < nr_cpu_ids) 134 return bigsmp_cpu_to_logical_apicid(cpu);
140 return bigsmp_cpu_to_logical_apicid(cpu);
141
142 return BAD_APICID;
143} 135}
144 136
145static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb) 137static int bigsmp_phys_pkg_id(int cpuid_apic, int index_msb)
@@ -230,7 +222,7 @@ struct apic apic_bigsmp = {
230 .apicid_to_node = bigsmp_apicid_to_node, 222 .apicid_to_node = bigsmp_apicid_to_node,
231 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid, 223 .cpu_to_logical_apicid = bigsmp_cpu_to_logical_apicid,
232 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid, 224 .cpu_present_to_apicid = bigsmp_cpu_present_to_apicid,
233 .apicid_to_cpu_present = bigsmp_apicid_to_cpu_present, 225 .apicid_to_cpu_present = physid_set_mask_of_physid,
234 .setup_portio_remap = NULL, 226 .setup_portio_remap = NULL,
235 .check_phys_apicid_present = bigsmp_check_phys_apicid_present, 227 .check_phys_apicid_present = bigsmp_check_phys_apicid_present,
236 .enable_apic_mode = NULL, 228 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/es7000_32.c b/arch/x86/kernel/apic/es7000_32.c
index 89174f847b49..03ba1b895f5e 100644
--- a/arch/x86/kernel/apic/es7000_32.c
+++ b/arch/x86/kernel/apic/es7000_32.c
@@ -27,6 +27,9 @@
27 * 27 *
28 * http://www.unisys.com 28 * http://www.unisys.com
29 */ 29 */
30
31#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
32
30#include <linux/notifier.h> 33#include <linux/notifier.h>
31#include <linux/spinlock.h> 34#include <linux/spinlock.h>
32#include <linux/cpumask.h> 35#include <linux/cpumask.h>
@@ -39,6 +42,7 @@
39#include <linux/errno.h> 42#include <linux/errno.h>
40#include <linux/acpi.h> 43#include <linux/acpi.h>
41#include <linux/init.h> 44#include <linux/init.h>
45#include <linux/gfp.h>
42#include <linux/nmi.h> 46#include <linux/nmi.h>
43#include <linux/smp.h> 47#include <linux/smp.h>
44#include <linux/io.h> 48#include <linux/io.h>
@@ -223,9 +227,9 @@ static int parse_unisys_oem(char *oemptr)
223 mip_addr = val; 227 mip_addr = val;
224 mip = (struct mip_reg *)val; 228 mip = (struct mip_reg *)val;
225 mip_reg = __va(mip); 229 mip_reg = __va(mip);
226 pr_debug("es7000_mipcfg: host_reg = 0x%lx \n", 230 pr_debug("host_reg = 0x%lx\n",
227 (unsigned long)host_reg); 231 (unsigned long)host_reg);
228 pr_debug("es7000_mipcfg: mip_reg = 0x%lx \n", 232 pr_debug("mip_reg = 0x%lx\n",
229 (unsigned long)mip_reg); 233 (unsigned long)mip_reg);
230 success++; 234 success++;
231 break; 235 break;
@@ -401,7 +405,7 @@ static void es7000_enable_apic_mode(void)
401 if (!es7000_plat) 405 if (!es7000_plat)
402 return; 406 return;
403 407
404 printk(KERN_INFO "ES7000: Enabling APIC mode.\n"); 408 pr_info("Enabling APIC mode.\n");
405 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg)); 409 memset(&es7000_mip_reg, 0, sizeof(struct mip_reg));
406 es7000_mip_reg.off_0x00 = MIP_SW_APIC; 410 es7000_mip_reg.off_0x00 = MIP_SW_APIC;
407 es7000_mip_reg.off_0x38 = MIP_VALID; 411 es7000_mip_reg.off_0x38 = MIP_VALID;
@@ -466,11 +470,11 @@ static const struct cpumask *es7000_target_cpus(void)
466 return cpumask_of(smp_processor_id()); 470 return cpumask_of(smp_processor_id());
467} 471}
468 472
469static unsigned long 473static unsigned long es7000_check_apicid_used(physid_mask_t *map, int apicid)
470es7000_check_apicid_used(physid_mask_t bitmap, int apicid)
471{ 474{
472 return 0; 475 return 0;
473} 476}
477
474static unsigned long es7000_check_apicid_present(int bit) 478static unsigned long es7000_check_apicid_present(int bit)
475{ 479{
476 return physid_isset(bit, phys_cpu_present_map); 480 return physid_isset(bit, phys_cpu_present_map);
@@ -514,8 +518,7 @@ static void es7000_setup_apic_routing(void)
514{ 518{
515 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id()); 519 int apic = per_cpu(x86_bios_cpu_apicid, smp_processor_id());
516 520
517 printk(KERN_INFO 521 pr_info("Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
518 "Enabling APIC mode: %s. Using %d I/O APICs, target cpus %lx\n",
519 (apic_version[apic] == 0x14) ? 522 (apic_version[apic] == 0x14) ?
520 "Physical Cluster" : "Logical Cluster", 523 "Physical Cluster" : "Logical Cluster",
521 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]); 524 nr_ioapics, cpumask_bits(es7000_target_cpus())[0]);
@@ -539,14 +542,10 @@ static int es7000_cpu_present_to_apicid(int mps_cpu)
539 542
540static int cpu_id; 543static int cpu_id;
541 544
542static physid_mask_t es7000_apicid_to_cpu_present(int phys_apicid) 545static void es7000_apicid_to_cpu_present(int phys_apicid, physid_mask_t *retmap)
543{ 546{
544 physid_mask_t mask; 547 physid_set_mask_of_physid(cpu_id, retmap);
545
546 mask = physid_mask_of_physid(cpu_id);
547 ++cpu_id; 548 ++cpu_id;
548
549 return mask;
550} 549}
551 550
552/* Mapping from cpu number to logical apicid */ 551/* Mapping from cpu number to logical apicid */
@@ -561,10 +560,10 @@ static int es7000_cpu_to_logical_apicid(int cpu)
561#endif 560#endif
562} 561}
563 562
564static physid_mask_t es7000_ioapic_phys_id_map(physid_mask_t phys_map) 563static void es7000_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
565{ 564{
566 /* For clustered we don't have a good way to do this yet - hack */ 565 /* For clustered we don't have a good way to do this yet - hack */
567 return physids_promote(0xff); 566 physids_promote(0xFFL, retmap);
568} 567}
569 568
570static int es7000_check_phys_apicid_present(int cpu_physical_apicid) 569static int es7000_check_phys_apicid_present(int cpu_physical_apicid)
diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c
index dc69f28489f5..eb2789c3f721 100644
--- a/arch/x86/kernel/apic/io_apic.c
+++ b/arch/x86/kernel/apic/io_apic.c
@@ -36,6 +36,7 @@
36#include <linux/freezer.h> 36#include <linux/freezer.h>
37#include <linux/kthread.h> 37#include <linux/kthread.h>
38#include <linux/jiffies.h> /* time_after() */ 38#include <linux/jiffies.h> /* time_after() */
39#include <linux/slab.h>
39#ifdef CONFIG_ACPI 40#ifdef CONFIG_ACPI
40#include <acpi/acpi_bus.h> 41#include <acpi/acpi_bus.h>
41#endif 42#endif
@@ -60,8 +61,6 @@
60#include <asm/irq_remapping.h> 61#include <asm/irq_remapping.h>
61#include <asm/hpet.h> 62#include <asm/hpet.h>
62#include <asm/hw_irq.h> 63#include <asm/hw_irq.h>
63#include <asm/uv/uv_hub.h>
64#include <asm/uv/uv_irq.h>
65 64
66#include <asm/apic.h> 65#include <asm/apic.h>
67 66
@@ -75,8 +74,8 @@
75 */ 74 */
76int sis_apic_bug = -1; 75int sis_apic_bug = -1;
77 76
78static DEFINE_SPINLOCK(ioapic_lock); 77static DEFINE_RAW_SPINLOCK(ioapic_lock);
79static DEFINE_SPINLOCK(vector_lock); 78static DEFINE_RAW_SPINLOCK(vector_lock);
80 79
81/* 80/*
82 * # of IRQ routing registers 81 * # of IRQ routing registers
@@ -96,8 +95,6 @@ struct mpc_intsrc mp_irqs[MAX_IRQ_SOURCES];
96/* # of MP IRQ source entries */ 95/* # of MP IRQ source entries */
97int mp_irq_entries; 96int mp_irq_entries;
98 97
99/* Number of legacy interrupts */
100static int nr_legacy_irqs __read_mostly = NR_IRQS_LEGACY;
101/* GSI interrupts */ 98/* GSI interrupts */
102static int nr_irqs_gsi = NR_IRQS_LEGACY; 99static int nr_irqs_gsi = NR_IRQS_LEGACY;
103 100
@@ -140,49 +137,12 @@ static struct irq_pin_list *get_one_free_irq_2_pin(int node)
140 return pin; 137 return pin;
141} 138}
142 139
143/*
144 * This is performance-critical, we want to do it O(1)
145 *
146 * Most irqs are mapped 1:1 with pins.
147 */
148struct irq_cfg {
149 struct irq_pin_list *irq_2_pin;
150 cpumask_var_t domain;
151 cpumask_var_t old_domain;
152 unsigned move_cleanup_count;
153 u8 vector;
154 u8 move_in_progress : 1;
155};
156
157/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */ 140/* irq_cfg is indexed by the sum of all RTEs in all I/O APICs. */
158#ifdef CONFIG_SPARSE_IRQ 141#ifdef CONFIG_SPARSE_IRQ
159static struct irq_cfg irq_cfgx[] = { 142static struct irq_cfg irq_cfgx[NR_IRQS_LEGACY];
160#else 143#else
161static struct irq_cfg irq_cfgx[NR_IRQS] = { 144static struct irq_cfg irq_cfgx[NR_IRQS];
162#endif 145#endif
163 [0] = { .vector = IRQ0_VECTOR, },
164 [1] = { .vector = IRQ1_VECTOR, },
165 [2] = { .vector = IRQ2_VECTOR, },
166 [3] = { .vector = IRQ3_VECTOR, },
167 [4] = { .vector = IRQ4_VECTOR, },
168 [5] = { .vector = IRQ5_VECTOR, },
169 [6] = { .vector = IRQ6_VECTOR, },
170 [7] = { .vector = IRQ7_VECTOR, },
171 [8] = { .vector = IRQ8_VECTOR, },
172 [9] = { .vector = IRQ9_VECTOR, },
173 [10] = { .vector = IRQ10_VECTOR, },
174 [11] = { .vector = IRQ11_VECTOR, },
175 [12] = { .vector = IRQ12_VECTOR, },
176 [13] = { .vector = IRQ13_VECTOR, },
177 [14] = { .vector = IRQ14_VECTOR, },
178 [15] = { .vector = IRQ15_VECTOR, },
179};
180
181void __init io_apic_disable_legacy(void)
182{
183 nr_legacy_irqs = 0;
184 nr_irqs_gsi = 0;
185}
186 146
187int __init arch_early_irq_init(void) 147int __init arch_early_irq_init(void)
188{ 148{
@@ -192,6 +152,11 @@ int __init arch_early_irq_init(void)
192 int node; 152 int node;
193 int i; 153 int i;
194 154
155 if (!legacy_pic->nr_legacy_irqs) {
156 nr_irqs_gsi = 0;
157 io_apic_irqs = ~0UL;
158 }
159
195 cfg = irq_cfgx; 160 cfg = irq_cfgx;
196 count = ARRAY_SIZE(irq_cfgx); 161 count = ARRAY_SIZE(irq_cfgx);
197 node= cpu_to_node(boot_cpu_id); 162 node= cpu_to_node(boot_cpu_id);
@@ -201,15 +166,21 @@ int __init arch_early_irq_init(void)
201 desc->chip_data = &cfg[i]; 166 desc->chip_data = &cfg[i];
202 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node); 167 zalloc_cpumask_var_node(&cfg[i].domain, GFP_NOWAIT, node);
203 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node); 168 zalloc_cpumask_var_node(&cfg[i].old_domain, GFP_NOWAIT, node);
204 if (i < nr_legacy_irqs) 169 /*
205 cpumask_setall(cfg[i].domain); 170 * For legacy IRQ's, start with assigning irq0 to irq15 to
171 * IRQ0_VECTOR to IRQ15_VECTOR on cpu 0.
172 */
173 if (i < legacy_pic->nr_legacy_irqs) {
174 cfg[i].vector = IRQ0_VECTOR + i;
175 cpumask_set_cpu(0, cfg[i].domain);
176 }
206 } 177 }
207 178
208 return 0; 179 return 0;
209} 180}
210 181
211#ifdef CONFIG_SPARSE_IRQ 182#ifdef CONFIG_SPARSE_IRQ
212static struct irq_cfg *irq_cfg(unsigned int irq) 183struct irq_cfg *irq_cfg(unsigned int irq)
213{ 184{
214 struct irq_cfg *cfg = NULL; 185 struct irq_cfg *cfg = NULL;
215 struct irq_desc *desc; 186 struct irq_desc *desc;
@@ -361,7 +332,7 @@ void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc)
361/* end for move_irq_desc */ 332/* end for move_irq_desc */
362 333
363#else 334#else
364static struct irq_cfg *irq_cfg(unsigned int irq) 335struct irq_cfg *irq_cfg(unsigned int irq)
365{ 336{
366 return irq < nr_irqs ? irq_cfgx + irq : NULL; 337 return irq < nr_irqs ? irq_cfgx + irq : NULL;
367} 338}
@@ -422,7 +393,7 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
422 struct irq_pin_list *entry; 393 struct irq_pin_list *entry;
423 unsigned long flags; 394 unsigned long flags;
424 395
425 spin_lock_irqsave(&ioapic_lock, flags); 396 raw_spin_lock_irqsave(&ioapic_lock, flags);
426 for_each_irq_pin(entry, cfg->irq_2_pin) { 397 for_each_irq_pin(entry, cfg->irq_2_pin) {
427 unsigned int reg; 398 unsigned int reg;
428 int pin; 399 int pin;
@@ -431,11 +402,11 @@ static bool io_apic_level_ack_pending(struct irq_cfg *cfg)
431 reg = io_apic_read(entry->apic, 0x10 + pin*2); 402 reg = io_apic_read(entry->apic, 0x10 + pin*2);
432 /* Is the remote IRR bit set? */ 403 /* Is the remote IRR bit set? */
433 if (reg & IO_APIC_REDIR_REMOTE_IRR) { 404 if (reg & IO_APIC_REDIR_REMOTE_IRR) {
434 spin_unlock_irqrestore(&ioapic_lock, flags); 405 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
435 return true; 406 return true;
436 } 407 }
437 } 408 }
438 spin_unlock_irqrestore(&ioapic_lock, flags); 409 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
439 410
440 return false; 411 return false;
441} 412}
@@ -449,10 +420,10 @@ static struct IO_APIC_route_entry ioapic_read_entry(int apic, int pin)
449{ 420{
450 union entry_union eu; 421 union entry_union eu;
451 unsigned long flags; 422 unsigned long flags;
452 spin_lock_irqsave(&ioapic_lock, flags); 423 raw_spin_lock_irqsave(&ioapic_lock, flags);
453 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin); 424 eu.w1 = io_apic_read(apic, 0x10 + 2 * pin);
454 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin); 425 eu.w2 = io_apic_read(apic, 0x11 + 2 * pin);
455 spin_unlock_irqrestore(&ioapic_lock, flags); 426 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
456 return eu.entry; 427 return eu.entry;
457} 428}
458 429
@@ -475,9 +446,9 @@ __ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
475void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e) 446void ioapic_write_entry(int apic, int pin, struct IO_APIC_route_entry e)
476{ 447{
477 unsigned long flags; 448 unsigned long flags;
478 spin_lock_irqsave(&ioapic_lock, flags); 449 raw_spin_lock_irqsave(&ioapic_lock, flags);
479 __ioapic_write_entry(apic, pin, e); 450 __ioapic_write_entry(apic, pin, e);
480 spin_unlock_irqrestore(&ioapic_lock, flags); 451 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
481} 452}
482 453
483/* 454/*
@@ -490,10 +461,10 @@ static void ioapic_mask_entry(int apic, int pin)
490 unsigned long flags; 461 unsigned long flags;
491 union entry_union eu = { .entry.mask = 1 }; 462 union entry_union eu = { .entry.mask = 1 };
492 463
493 spin_lock_irqsave(&ioapic_lock, flags); 464 raw_spin_lock_irqsave(&ioapic_lock, flags);
494 io_apic_write(apic, 0x10 + 2*pin, eu.w1); 465 io_apic_write(apic, 0x10 + 2*pin, eu.w1);
495 io_apic_write(apic, 0x11 + 2*pin, eu.w2); 466 io_apic_write(apic, 0x11 + 2*pin, eu.w2);
496 spin_unlock_irqrestore(&ioapic_lock, flags); 467 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
497} 468}
498 469
499/* 470/*
@@ -555,23 +526,41 @@ static void __init replace_pin_at_irq_node(struct irq_cfg *cfg, int node,
555 add_pin_to_irq_node(cfg, node, newapic, newpin); 526 add_pin_to_irq_node(cfg, node, newapic, newpin);
556} 527}
557 528
529static void __io_apic_modify_irq(struct irq_pin_list *entry,
530 int mask_and, int mask_or,
531 void (*final)(struct irq_pin_list *entry))
532{
533 unsigned int reg, pin;
534
535 pin = entry->pin;
536 reg = io_apic_read(entry->apic, 0x10 + pin * 2);
537 reg &= mask_and;
538 reg |= mask_or;
539 io_apic_modify(entry->apic, 0x10 + pin * 2, reg);
540 if (final)
541 final(entry);
542}
543
558static void io_apic_modify_irq(struct irq_cfg *cfg, 544static void io_apic_modify_irq(struct irq_cfg *cfg,
559 int mask_and, int mask_or, 545 int mask_and, int mask_or,
560 void (*final)(struct irq_pin_list *entry)) 546 void (*final)(struct irq_pin_list *entry))
561{ 547{
562 int pin;
563 struct irq_pin_list *entry; 548 struct irq_pin_list *entry;
564 549
565 for_each_irq_pin(entry, cfg->irq_2_pin) { 550 for_each_irq_pin(entry, cfg->irq_2_pin)
566 unsigned int reg; 551 __io_apic_modify_irq(entry, mask_and, mask_or, final);
567 pin = entry->pin; 552}
568 reg = io_apic_read(entry->apic, 0x10 + pin * 2); 553
569 reg &= mask_and; 554static void __mask_and_edge_IO_APIC_irq(struct irq_pin_list *entry)
570 reg |= mask_or; 555{
571 io_apic_modify(entry->apic, 0x10 + pin * 2, reg); 556 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_LEVEL_TRIGGER,
572 if (final) 557 IO_APIC_REDIR_MASKED, NULL);
573 final(entry); 558}
574 } 559
560static void __unmask_and_level_IO_APIC_irq(struct irq_pin_list *entry)
561{
562 __io_apic_modify_irq(entry, ~IO_APIC_REDIR_MASKED,
563 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
575} 564}
576 565
577static void __unmask_IO_APIC_irq(struct irq_cfg *cfg) 566static void __unmask_IO_APIC_irq(struct irq_cfg *cfg)
@@ -595,18 +584,6 @@ static void __mask_IO_APIC_irq(struct irq_cfg *cfg)
595 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync); 584 io_apic_modify_irq(cfg, ~0, IO_APIC_REDIR_MASKED, &io_apic_sync);
596} 585}
597 586
598static void __mask_and_edge_IO_APIC_irq(struct irq_cfg *cfg)
599{
600 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_LEVEL_TRIGGER,
601 IO_APIC_REDIR_MASKED, NULL);
602}
603
604static void __unmask_and_level_IO_APIC_irq(struct irq_cfg *cfg)
605{
606 io_apic_modify_irq(cfg, ~IO_APIC_REDIR_MASKED,
607 IO_APIC_REDIR_LEVEL_TRIGGER, NULL);
608}
609
610static void mask_IO_APIC_irq_desc(struct irq_desc *desc) 587static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
611{ 588{
612 struct irq_cfg *cfg = desc->chip_data; 589 struct irq_cfg *cfg = desc->chip_data;
@@ -614,9 +591,9 @@ static void mask_IO_APIC_irq_desc(struct irq_desc *desc)
614 591
615 BUG_ON(!cfg); 592 BUG_ON(!cfg);
616 593
617 spin_lock_irqsave(&ioapic_lock, flags); 594 raw_spin_lock_irqsave(&ioapic_lock, flags);
618 __mask_IO_APIC_irq(cfg); 595 __mask_IO_APIC_irq(cfg);
619 spin_unlock_irqrestore(&ioapic_lock, flags); 596 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
620} 597}
621 598
622static void unmask_IO_APIC_irq_desc(struct irq_desc *desc) 599static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
@@ -624,9 +601,9 @@ static void unmask_IO_APIC_irq_desc(struct irq_desc *desc)
624 struct irq_cfg *cfg = desc->chip_data; 601 struct irq_cfg *cfg = desc->chip_data;
625 unsigned long flags; 602 unsigned long flags;
626 603
627 spin_lock_irqsave(&ioapic_lock, flags); 604 raw_spin_lock_irqsave(&ioapic_lock, flags);
628 __unmask_IO_APIC_irq(cfg); 605 __unmask_IO_APIC_irq(cfg);
629 spin_unlock_irqrestore(&ioapic_lock, flags); 606 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
630} 607}
631 608
632static void mask_IO_APIC_irq(unsigned int irq) 609static void mask_IO_APIC_irq(unsigned int irq)
@@ -875,7 +852,7 @@ static int __init find_isa_irq_apic(int irq, int type)
875 */ 852 */
876static int EISA_ELCR(unsigned int irq) 853static int EISA_ELCR(unsigned int irq)
877{ 854{
878 if (irq < nr_legacy_irqs) { 855 if (irq < legacy_pic->nr_legacy_irqs) {
879 unsigned int port = 0x4d0 + (irq >> 3); 856 unsigned int port = 0x4d0 + (irq >> 3);
880 return (inb(port) >> (irq & 7)) & 1; 857 return (inb(port) >> (irq & 7)) & 1;
881 } 858 }
@@ -1150,12 +1127,12 @@ void lock_vector_lock(void)
1150 /* Used to the online set of cpus does not change 1127 /* Used to the online set of cpus does not change
1151 * during assign_irq_vector. 1128 * during assign_irq_vector.
1152 */ 1129 */
1153 spin_lock(&vector_lock); 1130 raw_spin_lock(&vector_lock);
1154} 1131}
1155 1132
1156void unlock_vector_lock(void) 1133void unlock_vector_lock(void)
1157{ 1134{
1158 spin_unlock(&vector_lock); 1135 raw_spin_unlock(&vector_lock);
1159} 1136}
1160 1137
1161static int 1138static int
@@ -1172,12 +1149,13 @@ __assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1172 * Also, we've got to be careful not to trash gate 1149 * Also, we've got to be careful not to trash gate
1173 * 0x80, because int 0x80 is hm, kind of importantish. ;) 1150 * 0x80, because int 0x80 is hm, kind of importantish. ;)
1174 */ 1151 */
1175 static int current_vector = FIRST_DEVICE_VECTOR, current_offset = 0; 1152 static int current_vector = FIRST_EXTERNAL_VECTOR + VECTOR_OFFSET_START;
1153 static int current_offset = VECTOR_OFFSET_START % 8;
1176 unsigned int old_vector; 1154 unsigned int old_vector;
1177 int cpu, err; 1155 int cpu, err;
1178 cpumask_var_t tmp_mask; 1156 cpumask_var_t tmp_mask;
1179 1157
1180 if ((cfg->move_in_progress) || cfg->move_cleanup_count) 1158 if (cfg->move_in_progress)
1181 return -EBUSY; 1159 return -EBUSY;
1182 1160
1183 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC)) 1161 if (!alloc_cpumask_var(&tmp_mask, GFP_ATOMIC))
@@ -1208,7 +1186,7 @@ next:
1208 if (vector >= first_system_vector) { 1186 if (vector >= first_system_vector) {
1209 /* If out of vectors on large boxen, must share them. */ 1187 /* If out of vectors on large boxen, must share them. */
1210 offset = (offset + 1) % 8; 1188 offset = (offset + 1) % 8;
1211 vector = FIRST_DEVICE_VECTOR + offset; 1189 vector = FIRST_EXTERNAL_VECTOR + offset;
1212 } 1190 }
1213 if (unlikely(current_vector == vector)) 1191 if (unlikely(current_vector == vector))
1214 continue; 1192 continue;
@@ -1237,15 +1215,14 @@ next:
1237 return err; 1215 return err;
1238} 1216}
1239 1217
1240static int 1218int assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1241assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask)
1242{ 1219{
1243 int err; 1220 int err;
1244 unsigned long flags; 1221 unsigned long flags;
1245 1222
1246 spin_lock_irqsave(&vector_lock, flags); 1223 raw_spin_lock_irqsave(&vector_lock, flags);
1247 err = __assign_irq_vector(irq, cfg, mask); 1224 err = __assign_irq_vector(irq, cfg, mask);
1248 spin_unlock_irqrestore(&vector_lock, flags); 1225 raw_spin_unlock_irqrestore(&vector_lock, flags);
1249 return err; 1226 return err;
1250} 1227}
1251 1228
@@ -1279,14 +1256,27 @@ static void __clear_irq_vector(int irq, struct irq_cfg *cfg)
1279void __setup_vector_irq(int cpu) 1256void __setup_vector_irq(int cpu)
1280{ 1257{
1281 /* Initialize vector_irq on a new cpu */ 1258 /* Initialize vector_irq on a new cpu */
1282 /* This function must be called with vector_lock held */
1283 int irq, vector; 1259 int irq, vector;
1284 struct irq_cfg *cfg; 1260 struct irq_cfg *cfg;
1285 struct irq_desc *desc; 1261 struct irq_desc *desc;
1286 1262
1263 /*
1264 * vector_lock will make sure that we don't run into irq vector
1265 * assignments that might be happening on another cpu in parallel,
1266 * while we setup our initial vector to irq mappings.
1267 */
1268 raw_spin_lock(&vector_lock);
1287 /* Mark the inuse vectors */ 1269 /* Mark the inuse vectors */
1288 for_each_irq_desc(irq, desc) { 1270 for_each_irq_desc(irq, desc) {
1289 cfg = desc->chip_data; 1271 cfg = desc->chip_data;
1272
1273 /*
1274 * If it is a legacy IRQ handled by the legacy PIC, this cpu
1275 * will be part of the irq_cfg's domain.
1276 */
1277 if (irq < legacy_pic->nr_legacy_irqs && !IO_APIC_IRQ(irq))
1278 cpumask_set_cpu(cpu, cfg->domain);
1279
1290 if (!cpumask_test_cpu(cpu, cfg->domain)) 1280 if (!cpumask_test_cpu(cpu, cfg->domain))
1291 continue; 1281 continue;
1292 vector = cfg->vector; 1282 vector = cfg->vector;
@@ -1302,6 +1292,7 @@ void __setup_vector_irq(int cpu)
1302 if (!cpumask_test_cpu(cpu, cfg->domain)) 1292 if (!cpumask_test_cpu(cpu, cfg->domain))
1303 per_cpu(vector_irq, cpu)[vector] = -1; 1293 per_cpu(vector_irq, cpu)[vector] = -1;
1304 } 1294 }
1295 raw_spin_unlock(&vector_lock);
1305} 1296}
1306 1297
1307static struct irq_chip ioapic_chip; 1298static struct irq_chip ioapic_chip;
@@ -1451,6 +1442,14 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1451 1442
1452 cfg = desc->chip_data; 1443 cfg = desc->chip_data;
1453 1444
1445 /*
1446 * For legacy irqs, cfg->domain starts with cpu 0 for legacy
1447 * controllers like 8259. Now that IO-APIC can handle this irq, update
1448 * the cfg->domain.
1449 */
1450 if (irq < legacy_pic->nr_legacy_irqs && cpumask_test_cpu(0, cfg->domain))
1451 apic->vector_allocation_domain(0, cfg->domain);
1452
1454 if (assign_irq_vector(irq, cfg, apic->target_cpus())) 1453 if (assign_irq_vector(irq, cfg, apic->target_cpus()))
1455 return; 1454 return;
1456 1455
@@ -1472,8 +1471,8 @@ static void setup_IO_APIC_irq(int apic_id, int pin, unsigned int irq, struct irq
1472 } 1471 }
1473 1472
1474 ioapic_register_intr(irq, desc, trigger); 1473 ioapic_register_intr(irq, desc, trigger);
1475 if (irq < nr_legacy_irqs) 1474 if (irq < legacy_pic->nr_legacy_irqs)
1476 disable_8259A_irq(irq); 1475 legacy_pic->chip->mask(irq);
1477 1476
1478 ioapic_write_entry(apic_id, pin, entry); 1477 ioapic_write_entry(apic_id, pin, entry);
1479} 1478}
@@ -1484,7 +1483,7 @@ static struct {
1484 1483
1485static void __init setup_IO_APIC_irqs(void) 1484static void __init setup_IO_APIC_irqs(void)
1486{ 1485{
1487 int apic_id = 0, pin, idx, irq; 1486 int apic_id, pin, idx, irq;
1488 int notcon = 0; 1487 int notcon = 0;
1489 struct irq_desc *desc; 1488 struct irq_desc *desc;
1490 struct irq_cfg *cfg; 1489 struct irq_cfg *cfg;
@@ -1492,14 +1491,7 @@ static void __init setup_IO_APIC_irqs(void)
1492 1491
1493 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n"); 1492 apic_printk(APIC_VERBOSE, KERN_DEBUG "init IO_APIC IRQs\n");
1494 1493
1495#ifdef CONFIG_ACPI 1494 for (apic_id = 0; apic_id < nr_ioapics; apic_id++)
1496 if (!acpi_disabled && acpi_ioapic) {
1497 apic_id = mp_find_ioapic(0);
1498 if (apic_id < 0)
1499 apic_id = 0;
1500 }
1501#endif
1502
1503 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) { 1495 for (pin = 0; pin < nr_ioapic_registers[apic_id]; pin++) {
1504 idx = find_irq_entry(apic_id, pin, mp_INT); 1496 idx = find_irq_entry(apic_id, pin, mp_INT);
1505 if (idx == -1) { 1497 if (idx == -1) {
@@ -1521,6 +1513,9 @@ static void __init setup_IO_APIC_irqs(void)
1521 1513
1522 irq = pin_2_irq(idx, apic_id, pin); 1514 irq = pin_2_irq(idx, apic_id, pin);
1523 1515
1516 if ((apic_id > 0) && (irq > 16))
1517 continue;
1518
1524 /* 1519 /*
1525 * Skip the timer IRQ if there's a quirk handler 1520 * Skip the timer IRQ if there's a quirk handler
1526 * installed and if it returns 1: 1521 * installed and if it returns 1:
@@ -1550,6 +1545,56 @@ static void __init setup_IO_APIC_irqs(void)
1550} 1545}
1551 1546
1552/* 1547/*
1548 * for the gsi that is not in first ioapic
1549 * but could not use acpi_register_gsi()
1550 * like some special sci in IBM x3330
1551 */
1552void setup_IO_APIC_irq_extra(u32 gsi)
1553{
1554 int apic_id = 0, pin, idx, irq;
1555 int node = cpu_to_node(boot_cpu_id);
1556 struct irq_desc *desc;
1557 struct irq_cfg *cfg;
1558
1559 /*
1560 * Convert 'gsi' to 'ioapic.pin'.
1561 */
1562 apic_id = mp_find_ioapic(gsi);
1563 if (apic_id < 0)
1564 return;
1565
1566 pin = mp_find_ioapic_pin(apic_id, gsi);
1567 idx = find_irq_entry(apic_id, pin, mp_INT);
1568 if (idx == -1)
1569 return;
1570
1571 irq = pin_2_irq(idx, apic_id, pin);
1572#ifdef CONFIG_SPARSE_IRQ
1573 desc = irq_to_desc(irq);
1574 if (desc)
1575 return;
1576#endif
1577 desc = irq_to_desc_alloc_node(irq, node);
1578 if (!desc) {
1579 printk(KERN_INFO "can not get irq_desc for %d\n", irq);
1580 return;
1581 }
1582
1583 cfg = desc->chip_data;
1584 add_pin_to_irq_node(cfg, node, apic_id, pin);
1585
1586 if (test_bit(pin, mp_ioapic_routing[apic_id].pin_programmed)) {
1587 pr_debug("Pin %d-%d already programmed\n",
1588 mp_ioapics[apic_id].apicid, pin);
1589 return;
1590 }
1591 set_bit(pin, mp_ioapic_routing[apic_id].pin_programmed);
1592
1593 setup_IO_APIC_irq(apic_id, pin, irq, desc,
1594 irq_trigger(idx), irq_polarity(idx));
1595}
1596
1597/*
1553 * Set up the timer pin, possibly with the 8259A-master behind. 1598 * Set up the timer pin, possibly with the 8259A-master behind.
1554 */ 1599 */
1555static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin, 1600static void __init setup_timer_IRQ0_pin(unsigned int apic_id, unsigned int pin,
@@ -1599,9 +1644,6 @@ __apicdebuginit(void) print_IO_APIC(void)
1599 struct irq_desc *desc; 1644 struct irq_desc *desc;
1600 unsigned int irq; 1645 unsigned int irq;
1601 1646
1602 if (apic_verbosity == APIC_QUIET)
1603 return;
1604
1605 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries); 1647 printk(KERN_DEBUG "number of MP IRQ sources: %d.\n", mp_irq_entries);
1606 for (i = 0; i < nr_ioapics; i++) 1648 for (i = 0; i < nr_ioapics; i++)
1607 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n", 1649 printk(KERN_DEBUG "number of IO-APIC #%d registers: %d.\n",
@@ -1615,14 +1657,14 @@ __apicdebuginit(void) print_IO_APIC(void)
1615 1657
1616 for (apic = 0; apic < nr_ioapics; apic++) { 1658 for (apic = 0; apic < nr_ioapics; apic++) {
1617 1659
1618 spin_lock_irqsave(&ioapic_lock, flags); 1660 raw_spin_lock_irqsave(&ioapic_lock, flags);
1619 reg_00.raw = io_apic_read(apic, 0); 1661 reg_00.raw = io_apic_read(apic, 0);
1620 reg_01.raw = io_apic_read(apic, 1); 1662 reg_01.raw = io_apic_read(apic, 1);
1621 if (reg_01.bits.version >= 0x10) 1663 if (reg_01.bits.version >= 0x10)
1622 reg_02.raw = io_apic_read(apic, 2); 1664 reg_02.raw = io_apic_read(apic, 2);
1623 if (reg_01.bits.version >= 0x20) 1665 if (reg_01.bits.version >= 0x20)
1624 reg_03.raw = io_apic_read(apic, 3); 1666 reg_03.raw = io_apic_read(apic, 3);
1625 spin_unlock_irqrestore(&ioapic_lock, flags); 1667 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1626 1668
1627 printk("\n"); 1669 printk("\n");
1628 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid); 1670 printk(KERN_DEBUG "IO APIC #%d......\n", mp_ioapics[apic].apicid);
@@ -1661,7 +1703,7 @@ __apicdebuginit(void) print_IO_APIC(void)
1661 printk(KERN_DEBUG ".... IRQ redirection table:\n"); 1703 printk(KERN_DEBUG ".... IRQ redirection table:\n");
1662 1704
1663 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol" 1705 printk(KERN_DEBUG " NR Dst Mask Trig IRR Pol"
1664 " Stat Dmod Deli Vect: \n"); 1706 " Stat Dmod Deli Vect:\n");
1665 1707
1666 for (i = 0; i <= reg_01.bits.entries; i++) { 1708 for (i = 0; i <= reg_01.bits.entries; i++) {
1667 struct IO_APIC_route_entry entry; 1709 struct IO_APIC_route_entry entry;
@@ -1708,9 +1750,6 @@ __apicdebuginit(void) print_APIC_field(int base)
1708{ 1750{
1709 int i; 1751 int i;
1710 1752
1711 if (apic_verbosity == APIC_QUIET)
1712 return;
1713
1714 printk(KERN_DEBUG); 1753 printk(KERN_DEBUG);
1715 1754
1716 for (i = 0; i < 8; i++) 1755 for (i = 0; i < 8; i++)
@@ -1724,9 +1763,6 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1724 unsigned int i, v, ver, maxlvt; 1763 unsigned int i, v, ver, maxlvt;
1725 u64 icr; 1764 u64 icr;
1726 1765
1727 if (apic_verbosity == APIC_QUIET)
1728 return;
1729
1730 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n", 1766 printk(KERN_DEBUG "printing local APIC contents on CPU#%d/%d:\n",
1731 smp_processor_id(), hard_smp_processor_id()); 1767 smp_processor_id(), hard_smp_processor_id());
1732 v = apic_read(APIC_ID); 1768 v = apic_read(APIC_ID);
@@ -1824,13 +1860,19 @@ __apicdebuginit(void) print_local_APIC(void *dummy)
1824 printk("\n"); 1860 printk("\n");
1825} 1861}
1826 1862
1827__apicdebuginit(void) print_all_local_APICs(void) 1863__apicdebuginit(void) print_local_APICs(int maxcpu)
1828{ 1864{
1829 int cpu; 1865 int cpu;
1830 1866
1867 if (!maxcpu)
1868 return;
1869
1831 preempt_disable(); 1870 preempt_disable();
1832 for_each_online_cpu(cpu) 1871 for_each_online_cpu(cpu) {
1872 if (cpu >= maxcpu)
1873 break;
1833 smp_call_function_single(cpu, print_local_APIC, NULL, 1); 1874 smp_call_function_single(cpu, print_local_APIC, NULL, 1);
1875 }
1834 preempt_enable(); 1876 preempt_enable();
1835} 1877}
1836 1878
@@ -1839,12 +1881,12 @@ __apicdebuginit(void) print_PIC(void)
1839 unsigned int v; 1881 unsigned int v;
1840 unsigned long flags; 1882 unsigned long flags;
1841 1883
1842 if (apic_verbosity == APIC_QUIET || !nr_legacy_irqs) 1884 if (!legacy_pic->nr_legacy_irqs)
1843 return; 1885 return;
1844 1886
1845 printk(KERN_DEBUG "\nprinting PIC contents\n"); 1887 printk(KERN_DEBUG "\nprinting PIC contents\n");
1846 1888
1847 spin_lock_irqsave(&i8259A_lock, flags); 1889 raw_spin_lock_irqsave(&i8259A_lock, flags);
1848 1890
1849 v = inb(0xa1) << 8 | inb(0x21); 1891 v = inb(0xa1) << 8 | inb(0x21);
1850 printk(KERN_DEBUG "... PIC IMR: %04x\n", v); 1892 printk(KERN_DEBUG "... PIC IMR: %04x\n", v);
@@ -1858,7 +1900,7 @@ __apicdebuginit(void) print_PIC(void)
1858 outb(0x0a,0xa0); 1900 outb(0x0a,0xa0);
1859 outb(0x0a,0x20); 1901 outb(0x0a,0x20);
1860 1902
1861 spin_unlock_irqrestore(&i8259A_lock, flags); 1903 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
1862 1904
1863 printk(KERN_DEBUG "... PIC ISR: %04x\n", v); 1905 printk(KERN_DEBUG "... PIC ISR: %04x\n", v);
1864 1906
@@ -1866,21 +1908,41 @@ __apicdebuginit(void) print_PIC(void)
1866 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v); 1908 printk(KERN_DEBUG "... PIC ELCR: %04x\n", v);
1867} 1909}
1868 1910
1869__apicdebuginit(int) print_all_ICs(void) 1911static int __initdata show_lapic = 1;
1912static __init int setup_show_lapic(char *arg)
1913{
1914 int num = -1;
1915
1916 if (strcmp(arg, "all") == 0) {
1917 show_lapic = CONFIG_NR_CPUS;
1918 } else {
1919 get_option(&arg, &num);
1920 if (num >= 0)
1921 show_lapic = num;
1922 }
1923
1924 return 1;
1925}
1926__setup("show_lapic=", setup_show_lapic);
1927
1928__apicdebuginit(int) print_ICs(void)
1870{ 1929{
1930 if (apic_verbosity == APIC_QUIET)
1931 return 0;
1932
1871 print_PIC(); 1933 print_PIC();
1872 1934
1873 /* don't print out if apic is not there */ 1935 /* don't print out if apic is not there */
1874 if (!cpu_has_apic && !apic_from_smp_config()) 1936 if (!cpu_has_apic && !apic_from_smp_config())
1875 return 0; 1937 return 0;
1876 1938
1877 print_all_local_APICs(); 1939 print_local_APICs(show_lapic);
1878 print_IO_APIC(); 1940 print_IO_APIC();
1879 1941
1880 return 0; 1942 return 0;
1881} 1943}
1882 1944
1883fs_initcall(print_all_ICs); 1945fs_initcall(print_ICs);
1884 1946
1885 1947
1886/* Where if anywhere is the i8259 connect in external int mode */ 1948/* Where if anywhere is the i8259 connect in external int mode */
@@ -1897,13 +1959,13 @@ void __init enable_IO_APIC(void)
1897 * The number of IO-APIC IRQ registers (== #pins): 1959 * The number of IO-APIC IRQ registers (== #pins):
1898 */ 1960 */
1899 for (apic = 0; apic < nr_ioapics; apic++) { 1961 for (apic = 0; apic < nr_ioapics; apic++) {
1900 spin_lock_irqsave(&ioapic_lock, flags); 1962 raw_spin_lock_irqsave(&ioapic_lock, flags);
1901 reg_01.raw = io_apic_read(apic, 1); 1963 reg_01.raw = io_apic_read(apic, 1);
1902 spin_unlock_irqrestore(&ioapic_lock, flags); 1964 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
1903 nr_ioapic_registers[apic] = reg_01.bits.entries+1; 1965 nr_ioapic_registers[apic] = reg_01.bits.entries+1;
1904 } 1966 }
1905 1967
1906 if (!nr_legacy_irqs) 1968 if (!legacy_pic->nr_legacy_irqs)
1907 return; 1969 return;
1908 1970
1909 for(apic = 0; apic < nr_ioapics; apic++) { 1971 for(apic = 0; apic < nr_ioapics; apic++) {
@@ -1960,7 +2022,7 @@ void disable_IO_APIC(void)
1960 */ 2022 */
1961 clear_IO_APIC(); 2023 clear_IO_APIC();
1962 2024
1963 if (!nr_legacy_irqs) 2025 if (!legacy_pic->nr_legacy_irqs)
1964 return; 2026 return;
1965 2027
1966 /* 2028 /*
@@ -2031,7 +2093,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2031 * This is broken; anything with a real cpu count has to 2093 * This is broken; anything with a real cpu count has to
2032 * circumvent this idiocy regardless. 2094 * circumvent this idiocy regardless.
2033 */ 2095 */
2034 phys_id_present_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 2096 apic->ioapic_phys_id_map(&phys_cpu_present_map, &phys_id_present_map);
2035 2097
2036 /* 2098 /*
2037 * Set the IOAPIC ID to the value stored in the MPC table. 2099 * Set the IOAPIC ID to the value stored in the MPC table.
@@ -2039,9 +2101,9 @@ void __init setup_ioapic_ids_from_mpc(void)
2039 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) { 2101 for (apic_id = 0; apic_id < nr_ioapics; apic_id++) {
2040 2102
2041 /* Read the register 0 value */ 2103 /* Read the register 0 value */
2042 spin_lock_irqsave(&ioapic_lock, flags); 2104 raw_spin_lock_irqsave(&ioapic_lock, flags);
2043 reg_00.raw = io_apic_read(apic_id, 0); 2105 reg_00.raw = io_apic_read(apic_id, 0);
2044 spin_unlock_irqrestore(&ioapic_lock, flags); 2106 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2045 2107
2046 old_id = mp_ioapics[apic_id].apicid; 2108 old_id = mp_ioapics[apic_id].apicid;
2047 2109
@@ -2058,7 +2120,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2058 * system must have a unique ID or we get lots of nice 2120 * system must have a unique ID or we get lots of nice
2059 * 'stuck on smp_invalidate_needed IPI wait' messages. 2121 * 'stuck on smp_invalidate_needed IPI wait' messages.
2060 */ 2122 */
2061 if (apic->check_apicid_used(phys_id_present_map, 2123 if (apic->check_apicid_used(&phys_id_present_map,
2062 mp_ioapics[apic_id].apicid)) { 2124 mp_ioapics[apic_id].apicid)) {
2063 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n", 2125 printk(KERN_ERR "BIOS bug, IO-APIC#%d ID %d is already used!...\n",
2064 apic_id, mp_ioapics[apic_id].apicid); 2126 apic_id, mp_ioapics[apic_id].apicid);
@@ -2073,7 +2135,7 @@ void __init setup_ioapic_ids_from_mpc(void)
2073 mp_ioapics[apic_id].apicid = i; 2135 mp_ioapics[apic_id].apicid = i;
2074 } else { 2136 } else {
2075 physid_mask_t tmp; 2137 physid_mask_t tmp;
2076 tmp = apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid); 2138 apic->apicid_to_cpu_present(mp_ioapics[apic_id].apicid, &tmp);
2077 apic_printk(APIC_VERBOSE, "Setting %d in the " 2139 apic_printk(APIC_VERBOSE, "Setting %d in the "
2078 "phys_id_present_map\n", 2140 "phys_id_present_map\n",
2079 mp_ioapics[apic_id].apicid); 2141 mp_ioapics[apic_id].apicid);
@@ -2100,16 +2162,16 @@ void __init setup_ioapic_ids_from_mpc(void)
2100 mp_ioapics[apic_id].apicid); 2162 mp_ioapics[apic_id].apicid);
2101 2163
2102 reg_00.bits.ID = mp_ioapics[apic_id].apicid; 2164 reg_00.bits.ID = mp_ioapics[apic_id].apicid;
2103 spin_lock_irqsave(&ioapic_lock, flags); 2165 raw_spin_lock_irqsave(&ioapic_lock, flags);
2104 io_apic_write(apic_id, 0, reg_00.raw); 2166 io_apic_write(apic_id, 0, reg_00.raw);
2105 spin_unlock_irqrestore(&ioapic_lock, flags); 2167 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2106 2168
2107 /* 2169 /*
2108 * Sanity check 2170 * Sanity check
2109 */ 2171 */
2110 spin_lock_irqsave(&ioapic_lock, flags); 2172 raw_spin_lock_irqsave(&ioapic_lock, flags);
2111 reg_00.raw = io_apic_read(apic_id, 0); 2173 reg_00.raw = io_apic_read(apic_id, 0);
2112 spin_unlock_irqrestore(&ioapic_lock, flags); 2174 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2113 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid) 2175 if (reg_00.bits.ID != mp_ioapics[apic_id].apicid)
2114 printk("could not set ID!\n"); 2176 printk("could not set ID!\n");
2115 else 2177 else
@@ -2192,15 +2254,15 @@ static unsigned int startup_ioapic_irq(unsigned int irq)
2192 unsigned long flags; 2254 unsigned long flags;
2193 struct irq_cfg *cfg; 2255 struct irq_cfg *cfg;
2194 2256
2195 spin_lock_irqsave(&ioapic_lock, flags); 2257 raw_spin_lock_irqsave(&ioapic_lock, flags);
2196 if (irq < nr_legacy_irqs) { 2258 if (irq < legacy_pic->nr_legacy_irqs) {
2197 disable_8259A_irq(irq); 2259 legacy_pic->chip->mask(irq);
2198 if (i8259A_irq_pending(irq)) 2260 if (legacy_pic->irq_pending(irq))
2199 was_pending = 1; 2261 was_pending = 1;
2200 } 2262 }
2201 cfg = irq_cfg(irq); 2263 cfg = irq_cfg(irq);
2202 __unmask_IO_APIC_irq(cfg); 2264 __unmask_IO_APIC_irq(cfg);
2203 spin_unlock_irqrestore(&ioapic_lock, flags); 2265 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2204 2266
2205 return was_pending; 2267 return was_pending;
2206} 2268}
@@ -2211,9 +2273,9 @@ static int ioapic_retrigger_irq(unsigned int irq)
2211 struct irq_cfg *cfg = irq_cfg(irq); 2273 struct irq_cfg *cfg = irq_cfg(irq);
2212 unsigned long flags; 2274 unsigned long flags;
2213 2275
2214 spin_lock_irqsave(&vector_lock, flags); 2276 raw_spin_lock_irqsave(&vector_lock, flags);
2215 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector); 2277 apic->send_IPI_mask(cpumask_of(cpumask_first(cfg->domain)), cfg->vector);
2216 spin_unlock_irqrestore(&vector_lock, flags); 2278 raw_spin_unlock_irqrestore(&vector_lock, flags);
2217 2279
2218 return 1; 2280 return 1;
2219} 2281}
@@ -2228,20 +2290,16 @@ static int ioapic_retrigger_irq(unsigned int irq)
2228 */ 2290 */
2229 2291
2230#ifdef CONFIG_SMP 2292#ifdef CONFIG_SMP
2231static void send_cleanup_vector(struct irq_cfg *cfg) 2293void send_cleanup_vector(struct irq_cfg *cfg)
2232{ 2294{
2233 cpumask_var_t cleanup_mask; 2295 cpumask_var_t cleanup_mask;
2234 2296
2235 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) { 2297 if (unlikely(!alloc_cpumask_var(&cleanup_mask, GFP_ATOMIC))) {
2236 unsigned int i; 2298 unsigned int i;
2237 cfg->move_cleanup_count = 0;
2238 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2239 cfg->move_cleanup_count++;
2240 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask) 2299 for_each_cpu_and(i, cfg->old_domain, cpu_online_mask)
2241 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR); 2300 apic->send_IPI_mask(cpumask_of(i), IRQ_MOVE_CLEANUP_VECTOR);
2242 } else { 2301 } else {
2243 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask); 2302 cpumask_and(cleanup_mask, cfg->old_domain, cpu_online_mask);
2244 cfg->move_cleanup_count = cpumask_weight(cleanup_mask);
2245 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR); 2303 apic->send_IPI_mask(cleanup_mask, IRQ_MOVE_CLEANUP_VECTOR);
2246 free_cpumask_var(cleanup_mask); 2304 free_cpumask_var(cleanup_mask);
2247 } 2305 }
@@ -2272,31 +2330,30 @@ static void __target_IO_APIC_irq(unsigned int irq, unsigned int dest, struct irq
2272 } 2330 }
2273} 2331}
2274 2332
2275static int
2276assign_irq_vector(int irq, struct irq_cfg *cfg, const struct cpumask *mask);
2277
2278/* 2333/*
2279 * Either sets desc->affinity to a valid value, and returns 2334 * Either sets desc->affinity to a valid value, and returns
2280 * ->cpu_mask_to_apicid of that, or returns BAD_APICID and 2335 * ->cpu_mask_to_apicid of that in dest_id, or returns -1 and
2281 * leaves desc->affinity untouched. 2336 * leaves desc->affinity untouched.
2282 */ 2337 */
2283static unsigned int 2338unsigned int
2284set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask) 2339set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask,
2340 unsigned int *dest_id)
2285{ 2341{
2286 struct irq_cfg *cfg; 2342 struct irq_cfg *cfg;
2287 unsigned int irq; 2343 unsigned int irq;
2288 2344
2289 if (!cpumask_intersects(mask, cpu_online_mask)) 2345 if (!cpumask_intersects(mask, cpu_online_mask))
2290 return BAD_APICID; 2346 return -1;
2291 2347
2292 irq = desc->irq; 2348 irq = desc->irq;
2293 cfg = desc->chip_data; 2349 cfg = desc->chip_data;
2294 if (assign_irq_vector(irq, cfg, mask)) 2350 if (assign_irq_vector(irq, cfg, mask))
2295 return BAD_APICID; 2351 return -1;
2296 2352
2297 cpumask_copy(desc->affinity, mask); 2353 cpumask_copy(desc->affinity, mask);
2298 2354
2299 return apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain); 2355 *dest_id = apic->cpu_mask_to_apicid_and(desc->affinity, cfg->domain);
2356 return 0;
2300} 2357}
2301 2358
2302static int 2359static int
@@ -2311,15 +2368,14 @@ set_ioapic_affinity_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
2311 irq = desc->irq; 2368 irq = desc->irq;
2312 cfg = desc->chip_data; 2369 cfg = desc->chip_data;
2313 2370
2314 spin_lock_irqsave(&ioapic_lock, flags); 2371 raw_spin_lock_irqsave(&ioapic_lock, flags);
2315 dest = set_desc_affinity(desc, mask); 2372 ret = set_desc_affinity(desc, mask, &dest);
2316 if (dest != BAD_APICID) { 2373 if (!ret) {
2317 /* Only the high 8 bits are valid. */ 2374 /* Only the high 8 bits are valid. */
2318 dest = SET_APIC_LOGICAL_ID(dest); 2375 dest = SET_APIC_LOGICAL_ID(dest);
2319 __target_IO_APIC_irq(irq, dest, cfg); 2376 __target_IO_APIC_irq(irq, dest, cfg);
2320 ret = 0;
2321 } 2377 }
2322 spin_unlock_irqrestore(&ioapic_lock, flags); 2378 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2323 2379
2324 return ret; 2380 return ret;
2325} 2381}
@@ -2432,8 +2488,13 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2432 continue; 2488 continue;
2433 2489
2434 cfg = irq_cfg(irq); 2490 cfg = irq_cfg(irq);
2435 spin_lock(&desc->lock); 2491 raw_spin_lock(&desc->lock);
2436 if (!cfg->move_cleanup_count) 2492
2493 /*
2494 * Check if the irq migration is in progress. If so, we
2495 * haven't received the cleanup request yet for this irq.
2496 */
2497 if (cfg->move_in_progress)
2437 goto unlock; 2498 goto unlock;
2438 2499
2439 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2500 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
@@ -2452,29 +2513,43 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void)
2452 goto unlock; 2513 goto unlock;
2453 } 2514 }
2454 __get_cpu_var(vector_irq)[vector] = -1; 2515 __get_cpu_var(vector_irq)[vector] = -1;
2455 cfg->move_cleanup_count--;
2456unlock: 2516unlock:
2457 spin_unlock(&desc->lock); 2517 raw_spin_unlock(&desc->lock);
2458 } 2518 }
2459 2519
2460 irq_exit(); 2520 irq_exit();
2461} 2521}
2462 2522
2463static void irq_complete_move(struct irq_desc **descp) 2523static void __irq_complete_move(struct irq_desc **descp, unsigned vector)
2464{ 2524{
2465 struct irq_desc *desc = *descp; 2525 struct irq_desc *desc = *descp;
2466 struct irq_cfg *cfg = desc->chip_data; 2526 struct irq_cfg *cfg = desc->chip_data;
2467 unsigned vector, me; 2527 unsigned me;
2468 2528
2469 if (likely(!cfg->move_in_progress)) 2529 if (likely(!cfg->move_in_progress))
2470 return; 2530 return;
2471 2531
2472 vector = ~get_irq_regs()->orig_ax;
2473 me = smp_processor_id(); 2532 me = smp_processor_id();
2474 2533
2475 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain)) 2534 if (vector == cfg->vector && cpumask_test_cpu(me, cfg->domain))
2476 send_cleanup_vector(cfg); 2535 send_cleanup_vector(cfg);
2477} 2536}
2537
2538static void irq_complete_move(struct irq_desc **descp)
2539{
2540 __irq_complete_move(descp, ~get_irq_regs()->orig_ax);
2541}
2542
2543void irq_force_complete_move(int irq)
2544{
2545 struct irq_desc *desc = irq_to_desc(irq);
2546 struct irq_cfg *cfg = desc->chip_data;
2547
2548 if (!cfg)
2549 return;
2550
2551 __irq_complete_move(&desc, cfg->vector);
2552}
2478#else 2553#else
2479static inline void irq_complete_move(struct irq_desc **descp) {} 2554static inline void irq_complete_move(struct irq_desc **descp) {}
2480#endif 2555#endif
@@ -2490,6 +2565,59 @@ static void ack_apic_edge(unsigned int irq)
2490 2565
2491atomic_t irq_mis_count; 2566atomic_t irq_mis_count;
2492 2567
2568/*
2569 * IO-APIC versions below 0x20 don't support EOI register.
2570 * For the record, here is the information about various versions:
2571 * 0Xh 82489DX
2572 * 1Xh I/OAPIC or I/O(x)APIC which are not PCI 2.2 Compliant
2573 * 2Xh I/O(x)APIC which is PCI 2.2 Compliant
2574 * 30h-FFh Reserved
2575 *
2576 * Some of the Intel ICH Specs (ICH2 to ICH5) documents the io-apic
2577 * version as 0x2. This is an error with documentation and these ICH chips
2578 * use io-apic's of version 0x20.
2579 *
2580 * For IO-APIC's with EOI register, we use that to do an explicit EOI.
2581 * Otherwise, we simulate the EOI message manually by changing the trigger
2582 * mode to edge and then back to level, with RTE being masked during this.
2583*/
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin) {
2589 if (mp_ioapics[entry->apic].apicver >= 0x20) {
2590 /*
2591 * Intr-remapping uses pin number as the virtual vector
2592 * in the RTE. Actual vector is programmed in
2593 * intr-remapping table entry. Hence for the io-apic
2594 * EOI we use the pin number.
2595 */
2596 if (irq_remapped(irq))
2597 io_apic_eoi(entry->apic, entry->pin);
2598 else
2599 io_apic_eoi(entry->apic, cfg->vector);
2600 } else {
2601 __mask_and_edge_IO_APIC_irq(entry);
2602 __unmask_and_level_IO_APIC_irq(entry);
2603 }
2604 }
2605}
2606
2607static void eoi_ioapic_irq(struct irq_desc *desc)
2608{
2609 struct irq_cfg *cfg;
2610 unsigned long flags;
2611 unsigned int irq;
2612
2613 irq = desc->irq;
2614 cfg = desc->chip_data;
2615
2616 raw_spin_lock_irqsave(&ioapic_lock, flags);
2617 __eoi_ioapic_irq(irq, cfg);
2618 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
2619}
2620
2493static void ack_apic_level(unsigned int irq) 2621static void ack_apic_level(unsigned int irq)
2494{ 2622{
2495 struct irq_desc *desc = irq_to_desc(irq); 2623 struct irq_desc *desc = irq_to_desc(irq);
@@ -2525,6 +2653,19 @@ static void ack_apic_level(unsigned int irq)
2525 * level-triggered interrupt. We mask the source for the time of the 2653 * level-triggered interrupt. We mask the source for the time of the
2526 * operation to prevent an edge-triggered interrupt escaping meanwhile. 2654 * operation to prevent an edge-triggered interrupt escaping meanwhile.
2527 * The idea is from Manfred Spraul. --macro 2655 * The idea is from Manfred Spraul. --macro
2656 *
2657 * Also in the case when cpu goes offline, fixup_irqs() will forward
2658 * any unhandled interrupt on the offlined cpu to the new cpu
2659 * destination that is handling the corresponding interrupt. This
2660 * interrupt forwarding is done via IPI's. Hence, in this case also
2661 * level-triggered io-apic interrupt will be seen as an edge
2662 * interrupt in the IRR. And we can't rely on the cpu's EOI
2663 * to be broadcasted to the IO-APIC's which will clear the remoteIRR
2664 * corresponding to the level-triggered interrupt. Hence on IO-APIC's
2665 * supporting EOI register, we do an explicit EOI to clear the
2666 * remote IRR and on IO-APIC's which don't have an EOI register,
2667 * we use the above logic (mask+edge followed by unmask+level) from
2668 * Manfred Spraul to clear the remote IRR.
2528 */ 2669 */
2529 cfg = desc->chip_data; 2670 cfg = desc->chip_data;
2530 i = cfg->vector; 2671 i = cfg->vector;
@@ -2536,6 +2677,19 @@ static void ack_apic_level(unsigned int irq)
2536 */ 2677 */
2537 ack_APIC_irq(); 2678 ack_APIC_irq();
2538 2679
2680 /*
2681 * Tail end of clearing remote IRR bit (either by delivering the EOI
2682 * message via io-apic EOI register write or simulating it using
2683 * mask+edge followed by unmask+level logic) manually when the
2684 * level triggered interrupt is seen as the edge triggered interrupt
2685 * at the cpu.
2686 */
2687 if (!(v & (1 << (i & 0x1f)))) {
2688 atomic_inc(&irq_mis_count);
2689
2690 eoi_ioapic_irq(desc);
2691 }
2692
2539 /* Now we can move and re-enable the irq */ 2693 /* Now we can move and re-enable the irq */
2540 if (unlikely(do_unmask_irq)) { 2694 if (unlikely(do_unmask_irq)) {
2541 /* Only migrate the irq if the ack has been received. 2695 /* Only migrate the irq if the ack has been received.
@@ -2569,41 +2723,9 @@ static void ack_apic_level(unsigned int irq)
2569 move_masked_irq(irq); 2723 move_masked_irq(irq);
2570 unmask_IO_APIC_irq_desc(desc); 2724 unmask_IO_APIC_irq_desc(desc);
2571 } 2725 }
2572
2573 /* Tail end of version 0x11 I/O APIC bug workaround */
2574 if (!(v & (1 << (i & 0x1f)))) {
2575 atomic_inc(&irq_mis_count);
2576 spin_lock(&ioapic_lock);
2577 __mask_and_edge_IO_APIC_irq(cfg);
2578 __unmask_and_level_IO_APIC_irq(cfg);
2579 spin_unlock(&ioapic_lock);
2580 }
2581} 2726}
2582 2727
2583#ifdef CONFIG_INTR_REMAP 2728#ifdef CONFIG_INTR_REMAP
2584static void __eoi_ioapic_irq(unsigned int irq, struct irq_cfg *cfg)
2585{
2586 struct irq_pin_list *entry;
2587
2588 for_each_irq_pin(entry, cfg->irq_2_pin)
2589 io_apic_eoi(entry->apic, entry->pin);
2590}
2591
2592static void
2593eoi_ioapic_irq(struct irq_desc *desc)
2594{
2595 struct irq_cfg *cfg;
2596 unsigned long flags;
2597 unsigned int irq;
2598
2599 irq = desc->irq;
2600 cfg = desc->chip_data;
2601
2602 spin_lock_irqsave(&ioapic_lock, flags);
2603 __eoi_ioapic_irq(irq, cfg);
2604 spin_unlock_irqrestore(&ioapic_lock, flags);
2605}
2606
2607static void ir_ack_apic_edge(unsigned int irq) 2729static void ir_ack_apic_edge(unsigned int irq)
2608{ 2730{
2609 ack_APIC_irq(); 2731 ack_APIC_irq();
@@ -2671,8 +2793,8 @@ static inline void init_IO_APIC_traps(void)
2671 * so default to an old-fashioned 8259 2793 * so default to an old-fashioned 8259
2672 * interrupt if we can.. 2794 * interrupt if we can..
2673 */ 2795 */
2674 if (irq < nr_legacy_irqs) 2796 if (irq < legacy_pic->nr_legacy_irqs)
2675 make_8259A_irq(irq); 2797 legacy_pic->make_irq(irq);
2676 else 2798 else
2677 /* Strange. Oh, well.. */ 2799 /* Strange. Oh, well.. */
2678 desc->chip = &no_irq_chip; 2800 desc->chip = &no_irq_chip;
@@ -2829,7 +2951,7 @@ static inline void __init check_timer(void)
2829 /* 2951 /*
2830 * get/set the timer IRQ vector: 2952 * get/set the timer IRQ vector:
2831 */ 2953 */
2832 disable_8259A_irq(0); 2954 legacy_pic->chip->mask(0);
2833 assign_irq_vector(0, cfg, apic->target_cpus()); 2955 assign_irq_vector(0, cfg, apic->target_cpus());
2834 2956
2835 /* 2957 /*
@@ -2842,7 +2964,7 @@ static inline void __init check_timer(void)
2842 * automatically. 2964 * automatically.
2843 */ 2965 */
2844 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT); 2966 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_EXTINT);
2845 init_8259A(1); 2967 legacy_pic->init(1);
2846#ifdef CONFIG_X86_32 2968#ifdef CONFIG_X86_32
2847 { 2969 {
2848 unsigned int ver; 2970 unsigned int ver;
@@ -2901,7 +3023,7 @@ static inline void __init check_timer(void)
2901 if (timer_irq_works()) { 3023 if (timer_irq_works()) {
2902 if (nmi_watchdog == NMI_IO_APIC) { 3024 if (nmi_watchdog == NMI_IO_APIC) {
2903 setup_nmi(); 3025 setup_nmi();
2904 enable_8259A_irq(0); 3026 legacy_pic->chip->unmask(0);
2905 } 3027 }
2906 if (disable_timer_pin_1 > 0) 3028 if (disable_timer_pin_1 > 0)
2907 clear_IO_APIC_pin(0, pin1); 3029 clear_IO_APIC_pin(0, pin1);
@@ -2924,14 +3046,14 @@ static inline void __init check_timer(void)
2924 */ 3046 */
2925 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2); 3047 replace_pin_at_irq_node(cfg, node, apic1, pin1, apic2, pin2);
2926 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector); 3048 setup_timer_IRQ0_pin(apic2, pin2, cfg->vector);
2927 enable_8259A_irq(0); 3049 legacy_pic->chip->unmask(0);
2928 if (timer_irq_works()) { 3050 if (timer_irq_works()) {
2929 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n"); 3051 apic_printk(APIC_QUIET, KERN_INFO "....... works.\n");
2930 timer_through_8259 = 1; 3052 timer_through_8259 = 1;
2931 if (nmi_watchdog == NMI_IO_APIC) { 3053 if (nmi_watchdog == NMI_IO_APIC) {
2932 disable_8259A_irq(0); 3054 legacy_pic->chip->mask(0);
2933 setup_nmi(); 3055 setup_nmi();
2934 enable_8259A_irq(0); 3056 legacy_pic->chip->unmask(0);
2935 } 3057 }
2936 goto out; 3058 goto out;
2937 } 3059 }
@@ -2939,7 +3061,7 @@ static inline void __init check_timer(void)
2939 * Cleanup, just in case ... 3061 * Cleanup, just in case ...
2940 */ 3062 */
2941 local_irq_disable(); 3063 local_irq_disable();
2942 disable_8259A_irq(0); 3064 legacy_pic->chip->mask(0);
2943 clear_IO_APIC_pin(apic2, pin2); 3065 clear_IO_APIC_pin(apic2, pin2);
2944 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n"); 3066 apic_printk(APIC_QUIET, KERN_INFO "....... failed.\n");
2945 } 3067 }
@@ -2958,22 +3080,22 @@ static inline void __init check_timer(void)
2958 3080
2959 lapic_register_intr(0, desc); 3081 lapic_register_intr(0, desc);
2960 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */ 3082 apic_write(APIC_LVT0, APIC_DM_FIXED | cfg->vector); /* Fixed mode */
2961 enable_8259A_irq(0); 3083 legacy_pic->chip->unmask(0);
2962 3084
2963 if (timer_irq_works()) { 3085 if (timer_irq_works()) {
2964 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n"); 3086 apic_printk(APIC_QUIET, KERN_INFO "..... works.\n");
2965 goto out; 3087 goto out;
2966 } 3088 }
2967 local_irq_disable(); 3089 local_irq_disable();
2968 disable_8259A_irq(0); 3090 legacy_pic->chip->mask(0);
2969 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector); 3091 apic_write(APIC_LVT0, APIC_LVT_MASKED | APIC_DM_FIXED | cfg->vector);
2970 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n"); 3092 apic_printk(APIC_QUIET, KERN_INFO "..... failed.\n");
2971 3093
2972 apic_printk(APIC_QUIET, KERN_INFO 3094 apic_printk(APIC_QUIET, KERN_INFO
2973 "...trying to set up timer as ExtINT IRQ...\n"); 3095 "...trying to set up timer as ExtINT IRQ...\n");
2974 3096
2975 init_8259A(0); 3097 legacy_pic->init(0);
2976 make_8259A_irq(0); 3098 legacy_pic->make_irq(0);
2977 apic_write(APIC_LVT0, APIC_DM_EXTINT); 3099 apic_write(APIC_LVT0, APIC_DM_EXTINT);
2978 3100
2979 unlock_ExtINT_logic(); 3101 unlock_ExtINT_logic();
@@ -3015,7 +3137,7 @@ void __init setup_IO_APIC(void)
3015 /* 3137 /*
3016 * calling enable_IO_APIC() is moved to setup_local_APIC for BP 3138 * calling enable_IO_APIC() is moved to setup_local_APIC for BP
3017 */ 3139 */
3018 io_apic_irqs = nr_legacy_irqs ? ~PIC_IRQS : ~0UL; 3140 io_apic_irqs = legacy_pic->nr_legacy_irqs ? ~PIC_IRQS : ~0UL;
3019 3141
3020 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n"); 3142 apic_printk(APIC_VERBOSE, "ENABLING IO-APIC IRQs\n");
3021 /* 3143 /*
@@ -3026,7 +3148,7 @@ void __init setup_IO_APIC(void)
3026 sync_Arb_IDs(); 3148 sync_Arb_IDs();
3027 setup_IO_APIC_irqs(); 3149 setup_IO_APIC_irqs();
3028 init_IO_APIC_traps(); 3150 init_IO_APIC_traps();
3029 if (nr_legacy_irqs) 3151 if (legacy_pic->nr_legacy_irqs)
3030 check_timer(); 3152 check_timer();
3031} 3153}
3032 3154
@@ -3075,13 +3197,13 @@ static int ioapic_resume(struct sys_device *dev)
3075 data = container_of(dev, struct sysfs_ioapic_data, dev); 3197 data = container_of(dev, struct sysfs_ioapic_data, dev);
3076 entry = data->entry; 3198 entry = data->entry;
3077 3199
3078 spin_lock_irqsave(&ioapic_lock, flags); 3200 raw_spin_lock_irqsave(&ioapic_lock, flags);
3079 reg_00.raw = io_apic_read(dev->id, 0); 3201 reg_00.raw = io_apic_read(dev->id, 0);
3080 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) { 3202 if (reg_00.bits.ID != mp_ioapics[dev->id].apicid) {
3081 reg_00.bits.ID = mp_ioapics[dev->id].apicid; 3203 reg_00.bits.ID = mp_ioapics[dev->id].apicid;
3082 io_apic_write(dev->id, 0, reg_00.raw); 3204 io_apic_write(dev->id, 0, reg_00.raw);
3083 } 3205 }
3084 spin_unlock_irqrestore(&ioapic_lock, flags); 3206 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3085 for (i = 0; i < nr_ioapic_registers[dev->id]; i++) 3207 for (i = 0; i < nr_ioapic_registers[dev->id]; i++)
3086 ioapic_write_entry(dev->id, i, entry[i]); 3208 ioapic_write_entry(dev->id, i, entry[i]);
3087 3209
@@ -3144,7 +3266,7 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3144 if (irq_want < nr_irqs_gsi) 3266 if (irq_want < nr_irqs_gsi)
3145 irq_want = nr_irqs_gsi; 3267 irq_want = nr_irqs_gsi;
3146 3268
3147 spin_lock_irqsave(&vector_lock, flags); 3269 raw_spin_lock_irqsave(&vector_lock, flags);
3148 for (new = irq_want; new < nr_irqs; new++) { 3270 for (new = irq_want; new < nr_irqs; new++) {
3149 desc_new = irq_to_desc_alloc_node(new, node); 3271 desc_new = irq_to_desc_alloc_node(new, node);
3150 if (!desc_new) { 3272 if (!desc_new) {
@@ -3157,19 +3279,17 @@ unsigned int create_irq_nr(unsigned int irq_want, int node)
3157 continue; 3279 continue;
3158 3280
3159 desc_new = move_irq_desc(desc_new, node); 3281 desc_new = move_irq_desc(desc_new, node);
3282 cfg_new = desc_new->chip_data;
3160 3283
3161 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0) 3284 if (__assign_irq_vector(new, cfg_new, apic->target_cpus()) == 0)
3162 irq = new; 3285 irq = new;
3163 break; 3286 break;
3164 } 3287 }
3165 spin_unlock_irqrestore(&vector_lock, flags); 3288 raw_spin_unlock_irqrestore(&vector_lock, flags);
3289
3290 if (irq > 0)
3291 dynamic_irq_init_keep_chip_data(irq);
3166 3292
3167 if (irq > 0) {
3168 dynamic_irq_init(irq);
3169 /* restore it, in case dynamic_irq_init clear it */
3170 if (desc_new)
3171 desc_new->chip_data = cfg_new;
3172 }
3173 return irq; 3293 return irq;
3174} 3294}
3175 3295
@@ -3191,27 +3311,21 @@ int create_irq(void)
3191void destroy_irq(unsigned int irq) 3311void destroy_irq(unsigned int irq)
3192{ 3312{
3193 unsigned long flags; 3313 unsigned long flags;
3194 struct irq_cfg *cfg;
3195 struct irq_desc *desc;
3196 3314
3197 /* store it, in case dynamic_irq_cleanup clear it */ 3315 dynamic_irq_cleanup_keep_chip_data(irq);
3198 desc = irq_to_desc(irq);
3199 cfg = desc->chip_data;
3200 dynamic_irq_cleanup(irq);
3201 /* connect back irq_cfg */
3202 desc->chip_data = cfg;
3203 3316
3204 free_irte(irq); 3317 free_irte(irq);
3205 spin_lock_irqsave(&vector_lock, flags); 3318 raw_spin_lock_irqsave(&vector_lock, flags);
3206 __clear_irq_vector(irq, cfg); 3319 __clear_irq_vector(irq, get_irq_chip_data(irq));
3207 spin_unlock_irqrestore(&vector_lock, flags); 3320 raw_spin_unlock_irqrestore(&vector_lock, flags);
3208} 3321}
3209 3322
3210/* 3323/*
3211 * MSI message composition 3324 * MSI message composition
3212 */ 3325 */
3213#ifdef CONFIG_PCI_MSI 3326#ifdef CONFIG_PCI_MSI
3214static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_msg *msg) 3327static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq,
3328 struct msi_msg *msg, u8 hpet_id)
3215{ 3329{
3216 struct irq_cfg *cfg; 3330 struct irq_cfg *cfg;
3217 int err; 3331 int err;
@@ -3245,7 +3359,10 @@ static int msi_compose_msg(struct pci_dev *pdev, unsigned int irq, struct msi_ms
3245 irte.dest_id = IRTE_DEST(dest); 3359 irte.dest_id = IRTE_DEST(dest);
3246 3360
3247 /* Set source-id of interrupt request */ 3361 /* Set source-id of interrupt request */
3248 set_msi_sid(&irte, pdev); 3362 if (pdev)
3363 set_msi_sid(&irte, pdev);
3364 else
3365 set_hpet_sid(&irte, hpet_id);
3249 3366
3250 modify_irte(irq, &irte); 3367 modify_irte(irq, &irte);
3251 3368
@@ -3291,8 +3408,7 @@ static int set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3291 struct msi_msg msg; 3408 struct msi_msg msg;
3292 unsigned int dest; 3409 unsigned int dest;
3293 3410
3294 dest = set_desc_affinity(desc, mask); 3411 if (set_desc_affinity(desc, mask, &dest))
3295 if (dest == BAD_APICID)
3296 return -1; 3412 return -1;
3297 3413
3298 cfg = desc->chip_data; 3414 cfg = desc->chip_data;
@@ -3324,8 +3440,7 @@ ir_set_msi_irq_affinity(unsigned int irq, const struct cpumask *mask)
3324 if (get_irte(irq, &irte)) 3440 if (get_irte(irq, &irte))
3325 return -1; 3441 return -1;
3326 3442
3327 dest = set_desc_affinity(desc, mask); 3443 if (set_desc_affinity(desc, mask, &dest))
3328 if (dest == BAD_APICID)
3329 return -1; 3444 return -1;
3330 3445
3331 irte.vector = cfg->vector; 3446 irte.vector = cfg->vector;
@@ -3410,7 +3525,7 @@ static int setup_msi_irq(struct pci_dev *dev, struct msi_desc *msidesc, int irq)
3410 int ret; 3525 int ret;
3411 struct msi_msg msg; 3526 struct msi_msg msg;
3412 3527
3413 ret = msi_compose_msg(dev, irq, &msg); 3528 ret = msi_compose_msg(dev, irq, &msg, -1);
3414 if (ret < 0) 3529 if (ret < 0)
3415 return ret; 3530 return ret;
3416 3531
@@ -3507,8 +3622,7 @@ static int dmar_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3507 struct msi_msg msg; 3622 struct msi_msg msg;
3508 unsigned int dest; 3623 unsigned int dest;
3509 3624
3510 dest = set_desc_affinity(desc, mask); 3625 if (set_desc_affinity(desc, mask, &dest))
3511 if (dest == BAD_APICID)
3512 return -1; 3626 return -1;
3513 3627
3514 cfg = desc->chip_data; 3628 cfg = desc->chip_data;
@@ -3543,7 +3657,7 @@ int arch_setup_dmar_msi(unsigned int irq)
3543 int ret; 3657 int ret;
3544 struct msi_msg msg; 3658 struct msi_msg msg;
3545 3659
3546 ret = msi_compose_msg(NULL, irq, &msg); 3660 ret = msi_compose_msg(NULL, irq, &msg, -1);
3547 if (ret < 0) 3661 if (ret < 0)
3548 return ret; 3662 return ret;
3549 dmar_msi_write(irq, &msg); 3663 dmar_msi_write(irq, &msg);
@@ -3563,8 +3677,7 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3563 struct msi_msg msg; 3677 struct msi_msg msg;
3564 unsigned int dest; 3678 unsigned int dest;
3565 3679
3566 dest = set_desc_affinity(desc, mask); 3680 if (set_desc_affinity(desc, mask, &dest))
3567 if (dest == BAD_APICID)
3568 return -1; 3681 return -1;
3569 3682
3570 cfg = desc->chip_data; 3683 cfg = desc->chip_data;
@@ -3583,6 +3696,19 @@ static int hpet_msi_set_affinity(unsigned int irq, const struct cpumask *mask)
3583 3696
3584#endif /* CONFIG_SMP */ 3697#endif /* CONFIG_SMP */
3585 3698
3699static struct irq_chip ir_hpet_msi_type = {
3700 .name = "IR-HPET_MSI",
3701 .unmask = hpet_msi_unmask,
3702 .mask = hpet_msi_mask,
3703#ifdef CONFIG_INTR_REMAP
3704 .ack = ir_ack_apic_edge,
3705#ifdef CONFIG_SMP
3706 .set_affinity = ir_set_msi_irq_affinity,
3707#endif
3708#endif
3709 .retrigger = ioapic_retrigger_irq,
3710};
3711
3586static struct irq_chip hpet_msi_type = { 3712static struct irq_chip hpet_msi_type = {
3587 .name = "HPET_MSI", 3713 .name = "HPET_MSI",
3588 .unmask = hpet_msi_unmask, 3714 .unmask = hpet_msi_unmask,
@@ -3594,20 +3720,36 @@ static struct irq_chip hpet_msi_type = {
3594 .retrigger = ioapic_retrigger_irq, 3720 .retrigger = ioapic_retrigger_irq,
3595}; 3721};
3596 3722
3597int arch_setup_hpet_msi(unsigned int irq) 3723int arch_setup_hpet_msi(unsigned int irq, unsigned int id)
3598{ 3724{
3599 int ret; 3725 int ret;
3600 struct msi_msg msg; 3726 struct msi_msg msg;
3601 struct irq_desc *desc = irq_to_desc(irq); 3727 struct irq_desc *desc = irq_to_desc(irq);
3602 3728
3603 ret = msi_compose_msg(NULL, irq, &msg); 3729 if (intr_remapping_enabled) {
3730 struct intel_iommu *iommu = map_hpet_to_ir(id);
3731 int index;
3732
3733 if (!iommu)
3734 return -1;
3735
3736 index = alloc_irte(iommu, irq, 1);
3737 if (index < 0)
3738 return -1;
3739 }
3740
3741 ret = msi_compose_msg(NULL, irq, &msg, id);
3604 if (ret < 0) 3742 if (ret < 0)
3605 return ret; 3743 return ret;
3606 3744
3607 hpet_msi_write(irq, &msg); 3745 hpet_msi_write(irq, &msg);
3608 desc->status |= IRQ_MOVE_PCNTXT; 3746 desc->status |= IRQ_MOVE_PCNTXT;
3609 set_irq_chip_and_handler_name(irq, &hpet_msi_type, handle_edge_irq, 3747 if (irq_remapped(irq))
3610 "edge"); 3748 set_irq_chip_and_handler_name(irq, &ir_hpet_msi_type,
3749 handle_edge_irq, "edge");
3750 else
3751 set_irq_chip_and_handler_name(irq, &hpet_msi_type,
3752 handle_edge_irq, "edge");
3611 3753
3612 return 0; 3754 return 0;
3613} 3755}
@@ -3641,8 +3783,7 @@ static int set_ht_irq_affinity(unsigned int irq, const struct cpumask *mask)
3641 struct irq_cfg *cfg; 3783 struct irq_cfg *cfg;
3642 unsigned int dest; 3784 unsigned int dest;
3643 3785
3644 dest = set_desc_affinity(desc, mask); 3786 if (set_desc_affinity(desc, mask, &dest))
3645 if (dest == BAD_APICID)
3646 return -1; 3787 return -1;
3647 3788
3648 cfg = desc->chip_data; 3789 cfg = desc->chip_data;
@@ -3708,83 +3849,14 @@ int arch_setup_ht_irq(unsigned int irq, struct pci_dev *dev)
3708} 3849}
3709#endif /* CONFIG_HT_IRQ */ 3850#endif /* CONFIG_HT_IRQ */
3710 3851
3711#ifdef CONFIG_X86_UV
3712/*
3713 * Re-target the irq to the specified CPU and enable the specified MMR located
3714 * on the specified blade to allow the sending of MSIs to the specified CPU.
3715 */
3716int arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
3717 unsigned long mmr_offset)
3718{
3719 const struct cpumask *eligible_cpu = cpumask_of(cpu);
3720 struct irq_cfg *cfg;
3721 int mmr_pnode;
3722 unsigned long mmr_value;
3723 struct uv_IO_APIC_route_entry *entry;
3724 unsigned long flags;
3725 int err;
3726
3727 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3728
3729 cfg = irq_cfg(irq);
3730
3731 err = assign_irq_vector(irq, cfg, eligible_cpu);
3732 if (err != 0)
3733 return err;
3734
3735 spin_lock_irqsave(&vector_lock, flags);
3736 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
3737 irq_name);
3738 spin_unlock_irqrestore(&vector_lock, flags);
3739
3740 mmr_value = 0;
3741 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3742 entry->vector = cfg->vector;
3743 entry->delivery_mode = apic->irq_delivery_mode;
3744 entry->dest_mode = apic->irq_dest_mode;
3745 entry->polarity = 0;
3746 entry->trigger = 0;
3747 entry->mask = 0;
3748 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
3749
3750 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3751 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3752
3753 if (cfg->move_in_progress)
3754 send_cleanup_vector(cfg);
3755
3756 return irq;
3757}
3758
3759/*
3760 * Disable the specified MMR located on the specified blade so that MSIs are
3761 * no longer allowed to be sent.
3762 */
3763void arch_disable_uv_irq(int mmr_blade, unsigned long mmr_offset)
3764{
3765 unsigned long mmr_value;
3766 struct uv_IO_APIC_route_entry *entry;
3767 int mmr_pnode;
3768
3769 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) != sizeof(unsigned long));
3770
3771 mmr_value = 0;
3772 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
3773 entry->mask = 1;
3774
3775 mmr_pnode = uv_blade_to_pnode(mmr_blade);
3776 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
3777}
3778#endif /* CONFIG_X86_64 */
3779
3780int __init io_apic_get_redir_entries (int ioapic) 3852int __init io_apic_get_redir_entries (int ioapic)
3781{ 3853{
3782 union IO_APIC_reg_01 reg_01; 3854 union IO_APIC_reg_01 reg_01;
3783 unsigned long flags; 3855 unsigned long flags;
3784 3856
3785 spin_lock_irqsave(&ioapic_lock, flags); 3857 raw_spin_lock_irqsave(&ioapic_lock, flags);
3786 reg_01.raw = io_apic_read(ioapic, 1); 3858 reg_01.raw = io_apic_read(ioapic, 1);
3787 spin_unlock_irqrestore(&ioapic_lock, flags); 3859 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3788 3860
3789 return reg_01.bits.entries; 3861 return reg_01.bits.entries;
3790} 3862}
@@ -3867,7 +3939,7 @@ static int __io_apic_set_pci_routing(struct device *dev, int irq,
3867 /* 3939 /*
3868 * IRQs < 16 are already in the irq_2_pin[] map 3940 * IRQs < 16 are already in the irq_2_pin[] map
3869 */ 3941 */
3870 if (irq >= nr_legacy_irqs) { 3942 if (irq >= legacy_pic->nr_legacy_irqs) {
3871 cfg = desc->chip_data; 3943 cfg = desc->chip_data;
3872 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) { 3944 if (add_pin_to_irq_node_nopanic(cfg, node, ioapic, pin)) {
3873 printk(KERN_INFO "can not add pin %d for irq %d\n", 3945 printk(KERN_INFO "can not add pin %d for irq %d\n",
@@ -3944,11 +4016,11 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3944 */ 4016 */
3945 4017
3946 if (physids_empty(apic_id_map)) 4018 if (physids_empty(apic_id_map))
3947 apic_id_map = apic->ioapic_phys_id_map(phys_cpu_present_map); 4019 apic->ioapic_phys_id_map(&phys_cpu_present_map, &apic_id_map);
3948 4020
3949 spin_lock_irqsave(&ioapic_lock, flags); 4021 raw_spin_lock_irqsave(&ioapic_lock, flags);
3950 reg_00.raw = io_apic_read(ioapic, 0); 4022 reg_00.raw = io_apic_read(ioapic, 0);
3951 spin_unlock_irqrestore(&ioapic_lock, flags); 4023 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3952 4024
3953 if (apic_id >= get_physical_broadcast()) { 4025 if (apic_id >= get_physical_broadcast()) {
3954 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying " 4026 printk(KERN_WARNING "IOAPIC[%d]: Invalid apic_id %d, trying "
@@ -3960,10 +4032,10 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3960 * Every APIC in a system must have a unique ID or we get lots of nice 4032 * Every APIC in a system must have a unique ID or we get lots of nice
3961 * 'stuck on smp_invalidate_needed IPI wait' messages. 4033 * 'stuck on smp_invalidate_needed IPI wait' messages.
3962 */ 4034 */
3963 if (apic->check_apicid_used(apic_id_map, apic_id)) { 4035 if (apic->check_apicid_used(&apic_id_map, apic_id)) {
3964 4036
3965 for (i = 0; i < get_physical_broadcast(); i++) { 4037 for (i = 0; i < get_physical_broadcast(); i++) {
3966 if (!apic->check_apicid_used(apic_id_map, i)) 4038 if (!apic->check_apicid_used(&apic_id_map, i))
3967 break; 4039 break;
3968 } 4040 }
3969 4041
@@ -3976,16 +4048,16 @@ int __init io_apic_get_unique_id(int ioapic, int apic_id)
3976 apic_id = i; 4048 apic_id = i;
3977 } 4049 }
3978 4050
3979 tmp = apic->apicid_to_cpu_present(apic_id); 4051 apic->apicid_to_cpu_present(apic_id, &tmp);
3980 physids_or(apic_id_map, apic_id_map, tmp); 4052 physids_or(apic_id_map, apic_id_map, tmp);
3981 4053
3982 if (reg_00.bits.ID != apic_id) { 4054 if (reg_00.bits.ID != apic_id) {
3983 reg_00.bits.ID = apic_id; 4055 reg_00.bits.ID = apic_id;
3984 4056
3985 spin_lock_irqsave(&ioapic_lock, flags); 4057 raw_spin_lock_irqsave(&ioapic_lock, flags);
3986 io_apic_write(ioapic, 0, reg_00.raw); 4058 io_apic_write(ioapic, 0, reg_00.raw);
3987 reg_00.raw = io_apic_read(ioapic, 0); 4059 reg_00.raw = io_apic_read(ioapic, 0);
3988 spin_unlock_irqrestore(&ioapic_lock, flags); 4060 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
3989 4061
3990 /* Sanity check */ 4062 /* Sanity check */
3991 if (reg_00.bits.ID != apic_id) { 4063 if (reg_00.bits.ID != apic_id) {
@@ -4006,9 +4078,9 @@ int __init io_apic_get_version(int ioapic)
4006 union IO_APIC_reg_01 reg_01; 4078 union IO_APIC_reg_01 reg_01;
4007 unsigned long flags; 4079 unsigned long flags;
4008 4080
4009 spin_lock_irqsave(&ioapic_lock, flags); 4081 raw_spin_lock_irqsave(&ioapic_lock, flags);
4010 reg_01.raw = io_apic_read(ioapic, 1); 4082 reg_01.raw = io_apic_read(ioapic, 1);
4011 spin_unlock_irqrestore(&ioapic_lock, flags); 4083 raw_spin_unlock_irqrestore(&ioapic_lock, flags);
4012 4084
4013 return reg_01.bits.version; 4085 return reg_01.bits.version;
4014} 4086}
@@ -4040,27 +4112,23 @@ int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity)
4040#ifdef CONFIG_SMP 4112#ifdef CONFIG_SMP
4041void __init setup_ioapic_dest(void) 4113void __init setup_ioapic_dest(void)
4042{ 4114{
4043 int pin, ioapic = 0, irq, irq_entry; 4115 int pin, ioapic, irq, irq_entry;
4044 struct irq_desc *desc; 4116 struct irq_desc *desc;
4045 const struct cpumask *mask; 4117 const struct cpumask *mask;
4046 4118
4047 if (skip_ioapic_setup == 1) 4119 if (skip_ioapic_setup == 1)
4048 return; 4120 return;
4049 4121
4050#ifdef CONFIG_ACPI 4122 for (ioapic = 0; ioapic < nr_ioapics; ioapic++)
4051 if (!acpi_disabled && acpi_ioapic) {
4052 ioapic = mp_find_ioapic(0);
4053 if (ioapic < 0)
4054 ioapic = 0;
4055 }
4056#endif
4057
4058 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) { 4123 for (pin = 0; pin < nr_ioapic_registers[ioapic]; pin++) {
4059 irq_entry = find_irq_entry(ioapic, pin, mp_INT); 4124 irq_entry = find_irq_entry(ioapic, pin, mp_INT);
4060 if (irq_entry == -1) 4125 if (irq_entry == -1)
4061 continue; 4126 continue;
4062 irq = pin_2_irq(irq_entry, ioapic, pin); 4127 irq = pin_2_irq(irq_entry, ioapic, pin);
4063 4128
4129 if ((ioapic > 0) && (irq > 16))
4130 continue;
4131
4064 desc = irq_to_desc(irq); 4132 desc = irq_to_desc(irq);
4065 4133
4066 /* 4134 /*
@@ -4106,7 +4174,7 @@ static struct resource * __init ioapic_setup_resources(int nr_ioapics)
4106 for (i = 0; i < nr_ioapics; i++) { 4174 for (i = 0; i < nr_ioapics; i++) {
4107 res[i].name = mem; 4175 res[i].name = mem;
4108 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY; 4176 res[i].flags = IORESOURCE_MEM | IORESOURCE_BUSY;
4109 sprintf(mem, "IOAPIC %u", i); 4177 snprintf(mem, IOAPIC_RESOURCE_NAME_SIZE, "IOAPIC %u", i);
4110 mem += IOAPIC_RESOURCE_NAME_SIZE; 4178 mem += IOAPIC_RESOURCE_NAME_SIZE;
4111 } 4179 }
4112 4180
@@ -4140,18 +4208,17 @@ void __init ioapic_init_mappings(void)
4140#ifdef CONFIG_X86_32 4208#ifdef CONFIG_X86_32
4141fake_ioapic_page: 4209fake_ioapic_page:
4142#endif 4210#endif
4143 ioapic_phys = (unsigned long) 4211 ioapic_phys = (unsigned long)alloc_bootmem_pages(PAGE_SIZE);
4144 alloc_bootmem_pages(PAGE_SIZE);
4145 ioapic_phys = __pa(ioapic_phys); 4212 ioapic_phys = __pa(ioapic_phys);
4146 } 4213 }
4147 set_fixmap_nocache(idx, ioapic_phys); 4214 set_fixmap_nocache(idx, ioapic_phys);
4148 apic_printk(APIC_VERBOSE, 4215 apic_printk(APIC_VERBOSE, "mapped IOAPIC to %08lx (%08lx)\n",
4149 "mapped IOAPIC to %08lx (%08lx)\n", 4216 __fix_to_virt(idx) + (ioapic_phys & ~PAGE_MASK),
4150 __fix_to_virt(idx), ioapic_phys); 4217 ioapic_phys);
4151 idx++; 4218 idx++;
4152 4219
4153 ioapic_res->start = ioapic_phys; 4220 ioapic_res->start = ioapic_phys;
4154 ioapic_res->end = ioapic_phys + (4 * 1024) - 1; 4221 ioapic_res->end = ioapic_phys + IO_APIC_SLOT_SIZE - 1;
4155 ioapic_res++; 4222 ioapic_res++;
4156 } 4223 }
4157} 4224}
@@ -4246,3 +4313,24 @@ void __init mp_register_ioapic(int id, u32 address, u32 gsi_base)
4246 4313
4247 nr_ioapics++; 4314 nr_ioapics++;
4248} 4315}
4316
4317/* Enable IOAPIC early just for system timer */
4318void __init pre_init_apic_IRQ0(void)
4319{
4320 struct irq_cfg *cfg;
4321 struct irq_desc *desc;
4322
4323 printk(KERN_INFO "Early APIC setup for system timer0\n");
4324#ifndef CONFIG_SMP
4325 phys_cpu_present_map = physid_mask_of_physid(boot_cpu_physical_apicid);
4326#endif
4327 desc = irq_to_desc_alloc_node(0, 0);
4328
4329 setup_local_APIC();
4330
4331 cfg = irq_cfg(0);
4332 add_pin_to_irq_node(cfg, 0, 0, 0);
4333 set_irq_chip_and_handler_name(0, &ioapic_chip, handle_edge_irq, "edge");
4334
4335 setup_IO_APIC_irq(0, 0, 0, desc, 0, 0);
4336}
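
The io_apic.c hunks above repeatedly swap the old "dest = set_desc_affinity(...); if (dest == BAD_APICID)" pattern for a form that reports failure through the return value and hands the destination back via a pointer. The fragment below is a minimal, self-contained sketch of that calling convention; the helper names and the BAD_APICID value are illustrative, not the kernel's implementation.

/*
 * Sketch of the calling-convention change: instead of overloading the
 * returned destination with BAD_APICID as an error marker, the new
 * style returns non-zero on failure and writes the destination through
 * an out-parameter.  Plain C, no kernel dependencies.
 */
#include <stdio.h>

#define BAD_APICID 0xffffu	/* illustrative error value */

/* old style: error encoded in the returned destination */
static unsigned int old_set_affinity(int have_target)
{
	return have_target ? 0x12 : BAD_APICID;
}

/* new style: status in the return value, destination via pointer */
static int new_set_affinity(int have_target, unsigned int *dest)
{
	if (!have_target)
		return -1;
	*dest = 0x12;
	return 0;
}

int main(void)
{
	unsigned int dest;

	if (new_set_affinity(1, &dest) == 0)
		printf("dest = %#x\n", dest);
	if (old_set_affinity(0) == BAD_APICID)
		printf("old style signalled failure in-band\n");
	return 0;
}
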
diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c
index 7ff61d6a188a..1edaf15c0b8e 100644
--- a/arch/x86/kernel/apic/nmi.c
+++ b/arch/x86/kernel/apic/nmi.c
@@ -18,6 +18,7 @@
18#include <linux/delay.h> 18#include <linux/delay.h>
19#include <linux/interrupt.h> 19#include <linux/interrupt.h>
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/slab.h>
21#include <linux/sysdev.h> 22#include <linux/sysdev.h>
22#include <linux/sysctl.h> 23#include <linux/sysctl.h>
23#include <linux/percpu.h> 24#include <linux/percpu.h>
@@ -39,7 +40,8 @@
39int unknown_nmi_panic; 40int unknown_nmi_panic;
40int nmi_watchdog_enabled; 41int nmi_watchdog_enabled;
41 42
42static cpumask_t backtrace_mask __read_mostly; 43/* For reliability, we're prepared to waste bits here. */
44static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly;
43 45
44/* nmi_active: 46/* nmi_active:
45 * >0: the lapic NMI watchdog is active, but can be disabled 47 * >0: the lapic NMI watchdog is active, but can be disabled
@@ -176,7 +178,7 @@ int __init check_nmi_watchdog(void)
176error: 178error:
177 if (nmi_watchdog == NMI_IO_APIC) { 179 if (nmi_watchdog == NMI_IO_APIC) {
178 if (!timer_through_8259) 180 if (!timer_through_8259)
179 disable_8259A_irq(0); 181 legacy_pic->chip->mask(0);
180 on_each_cpu(__acpi_nmi_disable, NULL, 1); 182 on_each_cpu(__acpi_nmi_disable, NULL, 1);
181 } 183 }
182 184
@@ -360,7 +362,7 @@ void stop_apic_nmi_watchdog(void *unused)
360 */ 362 */
361 363
362static DEFINE_PER_CPU(unsigned, last_irq_sum); 364static DEFINE_PER_CPU(unsigned, last_irq_sum);
363static DEFINE_PER_CPU(local_t, alert_counter); 365static DEFINE_PER_CPU(long, alert_counter);
364static DEFINE_PER_CPU(int, nmi_touch); 366static DEFINE_PER_CPU(int, nmi_touch);
365 367
366void touch_nmi_watchdog(void) 368void touch_nmi_watchdog(void)
@@ -414,15 +416,15 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
414 } 416 }
415 417
416 /* We can be called before check_nmi_watchdog, hence NULL check. */ 418 /* We can be called before check_nmi_watchdog, hence NULL check. */
417 if (cpumask_test_cpu(cpu, &backtrace_mask)) { 419 if (cpumask_test_cpu(cpu, to_cpumask(backtrace_mask))) {
418 static DEFINE_SPINLOCK(lock); /* Serialise the printks */ 420 static DEFINE_RAW_SPINLOCK(lock); /* Serialise the printks */
419 421
420 spin_lock(&lock); 422 raw_spin_lock(&lock);
421 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu); 423 printk(KERN_WARNING "NMI backtrace for cpu %d\n", cpu);
422 show_regs(regs); 424 show_regs(regs);
423 dump_stack(); 425 dump_stack();
424 spin_unlock(&lock); 426 raw_spin_unlock(&lock);
425 cpumask_clear_cpu(cpu, &backtrace_mask); 427 cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask));
426 428
427 rc = 1; 429 rc = 1;
428 } 430 }
@@ -437,8 +439,8 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
437 * Ayiee, looks like this CPU is stuck ... 439 * Ayiee, looks like this CPU is stuck ...
438 * wait a few IRQs (5 seconds) before doing the oops ... 440 * wait a few IRQs (5 seconds) before doing the oops ...
439 */ 441 */
440 local_inc(&__get_cpu_var(alert_counter)); 442 __this_cpu_inc(alert_counter);
441 if (local_read(&__get_cpu_var(alert_counter)) == 5 * nmi_hz) 443 if (__this_cpu_read(alert_counter) == 5 * nmi_hz)
442 /* 444 /*
443 * die_nmi will return ONLY if NOTIFY_STOP happens.. 445 * die_nmi will return ONLY if NOTIFY_STOP happens..
444 */ 446 */
@@ -446,7 +448,7 @@ nmi_watchdog_tick(struct pt_regs *regs, unsigned reason)
446 regs, panic_on_timeout); 448 regs, panic_on_timeout);
447 } else { 449 } else {
448 __get_cpu_var(last_irq_sum) = sum; 450 __get_cpu_var(last_irq_sum) = sum;
449 local_set(&__get_cpu_var(alert_counter), 0); 451 __this_cpu_write(alert_counter, 0);
450 } 452 }
451 453
452 /* see if the nmi watchdog went off */ 454 /* see if the nmi watchdog went off */
@@ -558,14 +560,14 @@ void arch_trigger_all_cpu_backtrace(void)
558{ 560{
559 int i; 561 int i;
560 562
561 cpumask_copy(&backtrace_mask, cpu_online_mask); 563 cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask);
562 564
563 printk(KERN_INFO "sending NMI to all CPUs:\n"); 565 printk(KERN_INFO "sending NMI to all CPUs:\n");
564 apic->send_IPI_all(NMI_VECTOR); 566 apic->send_IPI_all(NMI_VECTOR);
565 567
566 /* Wait for up to 10 seconds for all CPUs to do the backtrace */ 568 /* Wait for up to 10 seconds for all CPUs to do the backtrace */
567 for (i = 0; i < 10 * 1000; i++) { 569 for (i = 0; i < 10 * 1000; i++) {
568 if (cpumask_empty(&backtrace_mask)) 570 if (cpumask_empty(to_cpumask(backtrace_mask)))
569 break; 571 break;
570 mdelay(1); 572 mdelay(1);
571 } 573 }
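
The nmi.c hunk above shows two conversions: the per-cpu alert counter drops local_t in favour of a plain long manipulated with the __this_cpu_*() accessors, and the static cpumask_t becomes a DECLARE_BITMAP accessed through to_cpumask(). Below is a hedged, simplified sketch of both idioms with invented names, not the watchdog code itself.

/* Sketch only: assumes kernel context (percpu.h, cpumask.h). */

/* 1) per-cpu counter touched only by its owning CPU */
static DEFINE_PER_CPU(long, example_counter);

static void example_tick(void)
{
	__this_cpu_inc(example_counter);
	if (__this_cpu_read(example_counter) >= 5)
		__this_cpu_write(example_counter, 0);	/* reset after 5 ticks */
}

/* 2) fixed-size bitmap standing in for a static cpumask_t */
static DECLARE_BITMAP(example_mask, NR_CPUS) __read_mostly;

static bool example_cpu_flagged(int cpu)
{
	return cpumask_test_cpu(cpu, to_cpumask(example_mask));
}

static void example_flag_online_cpus(void)
{
	cpumask_copy(to_cpumask(example_mask), cpu_online_mask);
}
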
diff --git a/arch/x86/kernel/apic/numaq_32.c b/arch/x86/kernel/apic/numaq_32.c
index efa00e2b8505..3e28401f161c 100644
--- a/arch/x86/kernel/apic/numaq_32.c
+++ b/arch/x86/kernel/apic/numaq_32.c
@@ -225,7 +225,7 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
225 225
226 mpc_record = 0; 226 mpc_record = 0;
227 printk(KERN_INFO 227 printk(KERN_INFO
228 "Found an OEM MPC table at %8p - parsing it ... \n", oemtable); 228 "Found an OEM MPC table at %8p - parsing it...\n", oemtable);
229 229
230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) { 230 if (memcmp(oemtable->signature, MPC_OEM_SIGNATURE, 4)) {
231 printk(KERN_WARNING 231 printk(KERN_WARNING
@@ -264,11 +264,6 @@ static void __init smp_read_mpc_oem(struct mpc_table *mpc)
264static __init void early_check_numaq(void) 264static __init void early_check_numaq(void)
265{ 265{
266 /* 266 /*
267 * Find possible boot-time SMP configuration:
268 */
269 early_find_smp_config();
270
271 /*
272 * get boot-time SMP configuration: 267 * get boot-time SMP configuration:
273 */ 268 */
274 if (smp_found_config) 269 if (smp_found_config)
@@ -282,6 +277,7 @@ static __init void early_check_numaq(void)
282 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus; 277 x86_init.mpparse.mpc_oem_pci_bus = mpc_oem_pci_bus;
283 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info; 278 x86_init.mpparse.mpc_oem_bus_info = mpc_oem_bus_info;
284 x86_init.timers.tsc_pre_init = numaq_tsc_init; 279 x86_init.timers.tsc_pre_init = numaq_tsc_init;
280 x86_init.pci.init = pci_numaq_init;
285 } 281 }
286} 282}
287 283
@@ -334,10 +330,9 @@ static inline const struct cpumask *numaq_target_cpus(void)
334 return cpu_all_mask; 330 return cpu_all_mask;
335} 331}
336 332
337static inline unsigned long 333static unsigned long numaq_check_apicid_used(physid_mask_t *map, int apicid)
338numaq_check_apicid_used(physid_mask_t bitmap, int apicid)
339{ 334{
340 return physid_isset(apicid, bitmap); 335 return physid_isset(apicid, *map);
341} 336}
342 337
343static inline unsigned long numaq_check_apicid_present(int bit) 338static inline unsigned long numaq_check_apicid_present(int bit)
@@ -371,10 +366,10 @@ static inline int numaq_multi_timer_check(int apic, int irq)
371 return apic != 0 && irq == 0; 366 return apic != 0 && irq == 0;
372} 367}
373 368
374static inline physid_mask_t numaq_ioapic_phys_id_map(physid_mask_t phys_map) 369static inline void numaq_ioapic_phys_id_map(physid_mask_t *phys_map, physid_mask_t *retmap)
375{ 370{
376 /* We don't have a good way to do this yet - hack */ 371 /* We don't have a good way to do this yet - hack */
377 return physids_promote(0xFUL); 372 return physids_promote(0xFUL, retmap);
378} 373}
379 374
380static inline int numaq_cpu_to_logical_apicid(int cpu) 375static inline int numaq_cpu_to_logical_apicid(int cpu)
@@ -402,12 +397,12 @@ static inline int numaq_apicid_to_node(int logical_apicid)
402 return logical_apicid >> 4; 397 return logical_apicid >> 4;
403} 398}
404 399
405static inline physid_mask_t numaq_apicid_to_cpu_present(int logical_apicid) 400static void numaq_apicid_to_cpu_present(int logical_apicid, physid_mask_t *retmap)
406{ 401{
407 int node = numaq_apicid_to_node(logical_apicid); 402 int node = numaq_apicid_to_node(logical_apicid);
408 int cpu = __ffs(logical_apicid & 0xf); 403 int cpu = __ffs(logical_apicid & 0xf);
409 404
410 return physid_mask_of_physid(cpu + 4*node); 405 physid_set_mask_of_physid(cpu + 4*node, retmap);
411} 406}
412 407
413/* Where the IO area was mapped on multiquad, always 0 otherwise */ 408/* Where the IO area was mapped on multiquad, always 0 otherwise */
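
The numaq_32.c changes here (and the summit_32.c ones further down) move the physid_mask_t helpers from returning the mask by value to filling it in through a caller-provided pointer. A short sketch of the new shape of such callbacks, with made-up names:

/* Sketch only: assumes kernel context (asm/mpspec.h for physid_mask_t). */

/* callee fills the caller-provided mask instead of returning a copy */
static void example_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
{
	physid_set_mask_of_physid(apicid, retmap);
}

/* lookups now take a pointer and dereference it for the bit test */
static unsigned long example_check_apicid_used(physid_mask_t *map, int apicid)
{
	return physid_isset(apicid, *map);
}
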
diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c
index 0c0182cc947d..99d2fe016084 100644
--- a/arch/x86/kernel/apic/probe_32.c
+++ b/arch/x86/kernel/apic/probe_32.c
@@ -52,7 +52,32 @@ static int __init print_ipi_mode(void)
52} 52}
53late_initcall(print_ipi_mode); 53late_initcall(print_ipi_mode);
54 54
55void default_setup_apic_routing(void) 55void __init default_setup_apic_routing(void)
56{
57 int version = apic_version[boot_cpu_physical_apicid];
58
59 if (num_possible_cpus() > 8) {
60 switch (boot_cpu_data.x86_vendor) {
61 case X86_VENDOR_INTEL:
62 if (!APIC_XAPIC(version)) {
63 def_to_bigsmp = 0;
64 break;
65 }
66 /* If P4 and above fall through */
67 case X86_VENDOR_AMD:
68 def_to_bigsmp = 1;
69 }
70 }
71
72#ifdef CONFIG_X86_BIGSMP
73 generic_bigsmp_probe();
74#endif
75
76 if (apic->setup_apic_routing)
77 apic->setup_apic_routing();
78}
79
80static void setup_apic_flat_routing(void)
56{ 81{
57#ifdef CONFIG_X86_IO_APIC 82#ifdef CONFIG_X86_IO_APIC
58 printk(KERN_INFO 83 printk(KERN_INFO
@@ -103,12 +128,12 @@ struct apic apic_default = {
103 .init_apic_ldr = default_init_apic_ldr, 128 .init_apic_ldr = default_init_apic_ldr,
104 129
105 .ioapic_phys_id_map = default_ioapic_phys_id_map, 130 .ioapic_phys_id_map = default_ioapic_phys_id_map,
106 .setup_apic_routing = default_setup_apic_routing, 131 .setup_apic_routing = setup_apic_flat_routing,
107 .multi_timer_check = NULL, 132 .multi_timer_check = NULL,
108 .apicid_to_node = default_apicid_to_node, 133 .apicid_to_node = default_apicid_to_node,
109 .cpu_to_logical_apicid = default_cpu_to_logical_apicid, 134 .cpu_to_logical_apicid = default_cpu_to_logical_apicid,
110 .cpu_present_to_apicid = default_cpu_present_to_apicid, 135 .cpu_present_to_apicid = default_cpu_present_to_apicid,
111 .apicid_to_cpu_present = default_apicid_to_cpu_present, 136 .apicid_to_cpu_present = physid_set_mask_of_physid,
112 .setup_portio_remap = NULL, 137 .setup_portio_remap = NULL,
113 .check_phys_apicid_present = default_check_phys_apicid_present, 138 .check_phys_apicid_present = default_check_phys_apicid_present,
114 .enable_apic_mode = NULL, 139 .enable_apic_mode = NULL,
diff --git a/arch/x86/kernel/apic/probe_64.c b/arch/x86/kernel/apic/probe_64.c
index c4cbd3080c1c..83e9be4778e2 100644
--- a/arch/x86/kernel/apic/probe_64.c
+++ b/arch/x86/kernel/apic/probe_64.c
@@ -67,17 +67,8 @@ void __init default_setup_apic_routing(void)
67 } 67 }
68#endif 68#endif
69 69
70 if (apic == &apic_flat) { 70 if (apic == &apic_flat && num_possible_cpus() > 8)
71 switch (boot_cpu_data.x86_vendor) { 71 apic = &apic_physflat;
72 case X86_VENDOR_INTEL:
73 if (num_processors > 8)
74 apic = &apic_physflat;
75 break;
76 case X86_VENDOR_AMD:
77 if (max_physical_apicid >= 8)
78 apic = &apic_physflat;
79 }
80 }
81 72
82 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name); 73 printk(KERN_INFO "Setting APIC routing to %s\n", apic->name);
83 74
diff --git a/arch/x86/kernel/apic/summit_32.c b/arch/x86/kernel/apic/summit_32.c
index 645ecc4ff0be..9b419263d90d 100644
--- a/arch/x86/kernel/apic/summit_32.c
+++ b/arch/x86/kernel/apic/summit_32.c
@@ -183,7 +183,7 @@ static const struct cpumask *summit_target_cpus(void)
183 return cpumask_of(0); 183 return cpumask_of(0);
184} 184}
185 185
186static unsigned long summit_check_apicid_used(physid_mask_t bitmap, int apicid) 186static unsigned long summit_check_apicid_used(physid_mask_t *map, int apicid)
187{ 187{
188 return 0; 188 return 0;
189} 189}
@@ -261,15 +261,15 @@ static int summit_cpu_present_to_apicid(int mps_cpu)
261 return BAD_APICID; 261 return BAD_APICID;
262} 262}
263 263
264static physid_mask_t summit_ioapic_phys_id_map(physid_mask_t phys_id_map) 264static void summit_ioapic_phys_id_map(physid_mask_t *phys_id_map, physid_mask_t *retmap)
265{ 265{
266 /* For clustered we don't have a good way to do this yet - hack */ 266 /* For clustered we don't have a good way to do this yet - hack */
267 return physids_promote(0x0F); 267 physids_promote(0x0FL, retmap);
268} 268}
269 269
270static physid_mask_t summit_apicid_to_cpu_present(int apicid) 270static void summit_apicid_to_cpu_present(int apicid, physid_mask_t *retmap)
271{ 271{
272 return physid_mask_of_physid(0); 272 physid_set_mask_of_physid(0, retmap);
273} 273}
274 274
275static int summit_check_phys_apicid_present(int physical_apicid) 275static int summit_check_phys_apicid_present(int physical_apicid)
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index a5371ec36776..cf69c59f4910 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -148,10 +148,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
148 break; 148 break;
149 } 149 }
150 150
151 if (cpu < nr_cpu_ids) 151 return per_cpu(x86_cpu_to_logical_apicid, cpu);
152 return per_cpu(x86_cpu_to_logical_apicid, cpu);
153
154 return BAD_APICID;
155} 152}
156 153
157static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x) 154static unsigned int x2apic_cluster_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_phys.c b/arch/x86/kernel/apic/x2apic_phys.c
index a8989aadc99a..8972f38c5ced 100644
--- a/arch/x86/kernel/apic/x2apic_phys.c
+++ b/arch/x86/kernel/apic/x2apic_phys.c
@@ -146,10 +146,7 @@ x2apic_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
146 break; 146 break;
147 } 147 }
148 148
149 if (cpu < nr_cpu_ids) 149 return per_cpu(x86_cpu_to_apicid, cpu);
150 return per_cpu(x86_cpu_to_apicid, cpu);
151
152 return BAD_APICID;
153} 150}
154 151
155static unsigned int x2apic_phys_get_apic_id(unsigned long x) 152static unsigned int x2apic_phys_get_apic_id(unsigned long x)
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 326c25477d3d..c085d52dbaf2 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -5,7 +5,7 @@
5 * 5 *
6 * SGI UV APIC functions (note: not an Intel compatible APIC) 6 * SGI UV APIC functions (note: not an Intel compatible APIC)
7 * 7 *
8 * Copyright (C) 2007-2008 Silicon Graphics, Inc. All rights reserved. 8 * Copyright (C) 2007-2009 Silicon Graphics, Inc. All rights reserved.
9 */ 9 */
10#include <linux/cpumask.h> 10#include <linux/cpumask.h>
11#include <linux/hardirq.h> 11#include <linux/hardirq.h>
@@ -17,9 +17,12 @@
17#include <linux/ctype.h> 17#include <linux/ctype.h>
18#include <linux/sched.h> 18#include <linux/sched.h>
19#include <linux/timer.h> 19#include <linux/timer.h>
20#include <linux/slab.h>
20#include <linux/cpu.h> 21#include <linux/cpu.h>
21#include <linux/init.h> 22#include <linux/init.h>
22#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/pci.h>
25#include <linux/kdebug.h>
23 26
24#include <asm/uv/uv_mmrs.h> 27#include <asm/uv/uv_mmrs.h>
25#include <asm/uv/uv_hub.h> 28#include <asm/uv/uv_hub.h>
@@ -30,10 +33,27 @@
30#include <asm/apic.h> 33#include <asm/apic.h>
31#include <asm/ipi.h> 34#include <asm/ipi.h>
32#include <asm/smp.h> 35#include <asm/smp.h>
36#include <asm/x86_init.h>
33 37
34DEFINE_PER_CPU(int, x2apic_extra_bits); 38DEFINE_PER_CPU(int, x2apic_extra_bits);
35 39
40#define PR_DEVEL(fmt, args...) pr_devel("%s: " fmt, __func__, args)
41
36static enum uv_system_type uv_system_type; 42static enum uv_system_type uv_system_type;
43static u64 gru_start_paddr, gru_end_paddr;
44int uv_min_hub_revision_id;
45EXPORT_SYMBOL_GPL(uv_min_hub_revision_id);
46static DEFINE_SPINLOCK(uv_nmi_lock);
47
48static inline bool is_GRU_range(u64 start, u64 end)
49{
50 return start >= gru_start_paddr && end <= gru_end_paddr;
51}
52
53static bool uv_is_untracked_pat_range(u64 start, u64 end)
54{
55 return is_ISA_range(start, end) || is_GRU_range(start, end);
56}
37 57
38static int early_get_nodeid(void) 58static int early_get_nodeid(void)
39{ 59{
@@ -43,19 +63,28 @@ static int early_get_nodeid(void)
43 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr)); 63 mmr = early_ioremap(UV_LOCAL_MMR_BASE | UVH_NODE_ID, sizeof(*mmr));
44 node_id.v = *mmr; 64 node_id.v = *mmr;
45 early_iounmap(mmr, sizeof(*mmr)); 65 early_iounmap(mmr, sizeof(*mmr));
66
67 /* Currently, all blades have same revision number */
68 uv_min_hub_revision_id = node_id.s.revision;
69
46 return node_id.s.node_id; 70 return node_id.s.node_id;
47} 71}
48 72
49static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) 73static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
50{ 74{
75 int nodeid;
76
51 if (!strcmp(oem_id, "SGI")) { 77 if (!strcmp(oem_id, "SGI")) {
78 nodeid = early_get_nodeid();
79 x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
80 x86_platform.nmi_init = uv_nmi_init;
52 if (!strcmp(oem_table_id, "UVL")) 81 if (!strcmp(oem_table_id, "UVL"))
53 uv_system_type = UV_LEGACY_APIC; 82 uv_system_type = UV_LEGACY_APIC;
54 else if (!strcmp(oem_table_id, "UVX")) 83 else if (!strcmp(oem_table_id, "UVX"))
55 uv_system_type = UV_X2APIC; 84 uv_system_type = UV_X2APIC;
56 else if (!strcmp(oem_table_id, "UVH")) { 85 else if (!strcmp(oem_table_id, "UVH")) {
57 __get_cpu_var(x2apic_extra_bits) = 86 __get_cpu_var(x2apic_extra_bits) =
58 early_get_nodeid() << (UV_APIC_PNODE_SHIFT - 1); 87 nodeid << (UV_APIC_PNODE_SHIFT - 1);
59 uv_system_type = UV_NON_UNIQUE_APIC; 88 uv_system_type = UV_NON_UNIQUE_APIC;
60 return 1; 89 return 1;
61 } 90 }
@@ -92,11 +121,9 @@ EXPORT_SYMBOL_GPL(uv_possible_blades);
92unsigned long sn_rtc_cycles_per_second; 121unsigned long sn_rtc_cycles_per_second;
93EXPORT_SYMBOL(sn_rtc_cycles_per_second); 122EXPORT_SYMBOL(sn_rtc_cycles_per_second);
94 123
95/* Start with all IRQs pointing to boot CPU. IRQ balancing will shift them. */
96
97static const struct cpumask *uv_target_cpus(void) 124static const struct cpumask *uv_target_cpus(void)
98{ 125{
99 return cpumask_of(0); 126 return cpu_online_mask;
100} 127}
101 128
102static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask) 129static void uv_vector_allocation_domain(int cpu, struct cpumask *retmask)
@@ -212,10 +239,7 @@ uv_cpu_mask_to_apicid_and(const struct cpumask *cpumask,
212 if (cpumask_test_cpu(cpu, cpu_online_mask)) 239 if (cpumask_test_cpu(cpu, cpu_online_mask))
213 break; 240 break;
214 } 241 }
215 if (cpu < nr_cpu_ids) 242 return per_cpu(x86_cpu_to_apicid, cpu);
216 return per_cpu(x86_cpu_to_apicid, cpu);
217
218 return BAD_APICID;
219} 243}
220 244
221static unsigned int x2apic_get_apic_id(unsigned long x) 245static unsigned int x2apic_get_apic_id(unsigned long x)
@@ -364,13 +388,13 @@ static __init void get_lowmem_redirect(unsigned long *base, unsigned long *size)
364 388
365enum map_type {map_wb, map_uc}; 389enum map_type {map_wb, map_uc};
366 390
367static __init void map_high(char *id, unsigned long base, int shift, 391static __init void map_high(char *id, unsigned long base, int pshift,
368 int max_pnode, enum map_type map_type) 392 int bshift, int max_pnode, enum map_type map_type)
369{ 393{
370 unsigned long bytes, paddr; 394 unsigned long bytes, paddr;
371 395
372 paddr = base << shift; 396 paddr = base << pshift;
373 bytes = (1UL << shift) * (max_pnode + 1); 397 bytes = (1UL << bshift) * (max_pnode + 1);
374 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr, 398 printk(KERN_INFO "UV: Map %s_HI 0x%lx - 0x%lx\n", id, paddr,
375 paddr + bytes); 399 paddr + bytes);
376 if (map_type == map_uc) 400 if (map_type == map_uc)
@@ -385,8 +409,12 @@ static __init void map_gru_high(int max_pnode)
385 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT; 409 int shift = UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR_BASE_SHFT;
386 410
387 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR); 411 gru.v = uv_read_local_mmr(UVH_RH_GAM_GRU_OVERLAY_CONFIG_MMR);
388 if (gru.s.enable) 412 if (gru.s.enable) {
389 map_high("GRU", gru.s.base, shift, max_pnode, map_wb); 413 map_high("GRU", gru.s.base, shift, shift, max_pnode, map_wb);
414 gru_start_paddr = ((u64)gru.s.base << shift);
415 gru_end_paddr = gru_start_paddr + (1UL << shift) * (max_pnode + 1);
416
417 }
390} 418}
391 419
392static __init void map_mmr_high(int max_pnode) 420static __init void map_mmr_high(int max_pnode)
@@ -396,7 +424,7 @@ static __init void map_mmr_high(int max_pnode)
396 424
397 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR); 425 mmr.v = uv_read_local_mmr(UVH_RH_GAM_MMR_OVERLAY_CONFIG_MMR);
398 if (mmr.s.enable) 426 if (mmr.s.enable)
399 map_high("MMR", mmr.s.base, shift, max_pnode, map_uc); 427 map_high("MMR", mmr.s.base, shift, shift, max_pnode, map_uc);
400} 428}
401 429
402static __init void map_mmioh_high(int max_pnode) 430static __init void map_mmioh_high(int max_pnode)
@@ -406,7 +434,14 @@ static __init void map_mmioh_high(int max_pnode)
406 434
407 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR); 435 mmioh.v = uv_read_local_mmr(UVH_RH_GAM_MMIOH_OVERLAY_CONFIG_MMR);
408 if (mmioh.s.enable) 436 if (mmioh.s.enable)
409 map_high("MMIOH", mmioh.s.base, shift, max_pnode, map_uc); 437 map_high("MMIOH", mmioh.s.base, shift, mmioh.s.m_io,
438 max_pnode, map_uc);
439}
440
441static __init void map_low_mmrs(void)
442{
443 init_extra_mapping_uc(UV_GLOBAL_MMR32_BASE, UV_GLOBAL_MMR32_SIZE);
444 init_extra_mapping_uc(UV_LOCAL_MMR_BASE, UV_LOCAL_MMR_SIZE);
410} 445}
411 446
412static __init void uv_rtc_init(void) 447static __init void uv_rtc_init(void)
@@ -452,7 +487,7 @@ static void uv_heartbeat(unsigned long ignored)
452 487
453static void __cpuinit uv_heartbeat_enable(int cpu) 488static void __cpuinit uv_heartbeat_enable(int cpu)
454{ 489{
455 if (!uv_cpu_hub_info(cpu)->scir.enabled) { 490 while (!uv_cpu_hub_info(cpu)->scir.enabled) {
456 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer; 491 struct timer_list *timer = &uv_cpu_hub_info(cpu)->scir.timer;
457 492
458 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); 493 uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY);
@@ -460,11 +495,10 @@ static void __cpuinit uv_heartbeat_enable(int cpu)
460 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; 495 timer->expires = jiffies + SCIR_CPU_HB_INTERVAL;
461 add_timer_on(timer, cpu); 496 add_timer_on(timer, cpu);
462 uv_cpu_hub_info(cpu)->scir.enabled = 1; 497 uv_cpu_hub_info(cpu)->scir.enabled = 1;
463 }
464 498
465 /* check boot cpu */ 499 /* also ensure that boot cpu is enabled */
466 if (!uv_cpu_hub_info(0)->scir.enabled) 500 cpu = 0;
467 uv_heartbeat_enable(0); 501 }
468} 502}
469 503
470#ifdef CONFIG_HOTPLUG_CPU 504#ifdef CONFIG_HOTPLUG_CPU
@@ -523,6 +557,30 @@ late_initcall(uv_init_heartbeat);
523 557
524#endif /* !CONFIG_HOTPLUG_CPU */ 558#endif /* !CONFIG_HOTPLUG_CPU */
525 559
560/* Direct Legacy VGA I/O traffic to designated IOH */
561int uv_set_vga_state(struct pci_dev *pdev, bool decode,
562 unsigned int command_bits, bool change_bridge)
563{
564 int domain, bus, rc;
565
566 PR_DEVEL("devfn %x decode %d cmd %x chg_brdg %d\n",
567 pdev->devfn, decode, command_bits, change_bridge);
568
569 if (!change_bridge)
570 return 0;
571
572 if ((command_bits & PCI_COMMAND_IO) == 0)
573 return 0;
574
575 domain = pci_domain_nr(pdev->bus);
576 bus = pdev->bus->number;
577
578 rc = uv_bios_set_legacy_vga_target(decode, domain, bus);
579 PR_DEVEL("vga decode %d %x:%x, rc: %d\n", decode, domain, bus, rc);
580
581 return rc;
582}
583
526/* 584/*
527 * Called on each cpu to initialize the per_cpu UV data area. 585 * Called on each cpu to initialize the per_cpu UV data area.
528 * FIXME: hotplug not supported yet 586 * FIXME: hotplug not supported yet
@@ -539,6 +597,46 @@ void __cpuinit uv_cpu_init(void)
539 set_x2apic_extra_bits(uv_hub_info->pnode); 597 set_x2apic_extra_bits(uv_hub_info->pnode);
540} 598}
541 599
600/*
601 * When NMI is received, print a stack trace.
602 */
603int uv_handle_nmi(struct notifier_block *self, unsigned long reason, void *data)
604{
605 if (reason != DIE_NMI_IPI)
606 return NOTIFY_OK;
607 /*
608 * Use a lock so only one cpu prints at a time
609 * to prevent intermixed output.
610 */
611 spin_lock(&uv_nmi_lock);
612 pr_info("NMI stack dump cpu %u:\n", smp_processor_id());
613 dump_stack();
614 spin_unlock(&uv_nmi_lock);
615
616 return NOTIFY_STOP;
617}
618
619static struct notifier_block uv_dump_stack_nmi_nb = {
620 .notifier_call = uv_handle_nmi
621};
622
623void uv_register_nmi_notifier(void)
624{
625 if (register_die_notifier(&uv_dump_stack_nmi_nb))
626 printk(KERN_WARNING "UV NMI handler failed to register\n");
627}
628
629void uv_nmi_init(void)
630{
631 unsigned int value;
632
633 /*
634 * Unmask NMI on all cpus
635 */
636 value = apic_read(APIC_LVT1) | APIC_DM_NMI;
637 value &= ~APIC_LVT_MASKED;
638 apic_write(APIC_LVT1, value);
639}
542 640
543void __init uv_system_init(void) 641void __init uv_system_init(void)
544{ 642{
@@ -550,6 +648,8 @@ void __init uv_system_init(void)
550 unsigned long mmr_base, present, paddr; 648 unsigned long mmr_base, present, paddr;
551 unsigned short pnode_mask; 649 unsigned short pnode_mask;
552 650
651 map_low_mmrs();
652
553 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG); 653 m_n_config.v = uv_read_local_mmr(UVH_SI_ADDR_MAP_CONFIG);
554 m_val = m_n_config.s.m_skt; 654 m_val = m_n_config.s.m_skt;
555 n_val = m_n_config.s.n_skt; 655 n_val = m_n_config.s.n_skt;
@@ -602,13 +702,15 @@ void __init uv_system_init(void)
602 } 702 }
603 703
604 uv_bios_init(); 704 uv_bios_init();
605 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, 705 uv_bios_get_sn_info(0, &uv_type, &sn_partition_id, &sn_coherency_id,
606 &sn_coherency_id, &sn_region_size); 706 &sn_region_size, &system_serial_number);
607 uv_rtc_init(); 707 uv_rtc_init();
608 708
609 for_each_present_cpu(cpu) { 709 for_each_present_cpu(cpu) {
710 int apicid = per_cpu(x86_cpu_to_apicid, cpu);
711
610 nid = cpu_to_node(cpu); 712 nid = cpu_to_node(cpu);
611 pnode = uv_apicid_to_pnode(per_cpu(x86_cpu_to_apicid, cpu)); 713 pnode = uv_apicid_to_pnode(apicid);
612 blade = boot_pnode_to_blade(pnode); 714 blade = boot_pnode_to_blade(pnode);
613 lcpu = uv_blade_info[blade].nr_possible_cpus; 715 lcpu = uv_blade_info[blade].nr_possible_cpus;
614 uv_blade_info[blade].nr_possible_cpus++; 716 uv_blade_info[blade].nr_possible_cpus++;
@@ -629,15 +731,13 @@ void __init uv_system_init(void)
629 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra; 731 uv_cpu_hub_info(cpu)->gnode_extra = gnode_extra;
630 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base; 732 uv_cpu_hub_info(cpu)->global_mmr_base = mmr_base;
631 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id; 733 uv_cpu_hub_info(cpu)->coherency_domain_number = sn_coherency_id;
632 uv_cpu_hub_info(cpu)->scir.offset = SCIR_LOCAL_MMR_BASE + lcpu; 734 uv_cpu_hub_info(cpu)->scir.offset = uv_scir_offset(apicid);
633 uv_node_to_blade[nid] = blade; 735 uv_node_to_blade[nid] = blade;
634 uv_cpu_to_blade[cpu] = blade; 736 uv_cpu_to_blade[cpu] = blade;
635 max_pnode = max(pnode, max_pnode); 737 max_pnode = max(pnode, max_pnode);
636 738
637 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, " 739 printk(KERN_DEBUG "UV: cpu %d, apicid 0x%x, pnode %d, nid %d, lcpu %d, blade %d\n",
638 "lcpu %d, blade %d\n", 740 cpu, apicid, pnode, nid, lcpu, blade);
639 cpu, per_cpu(x86_cpu_to_apicid, cpu), pnode, nid,
640 lcpu, blade);
641 } 741 }
642 742
643 /* Add blade/pnode info for nodes without cpus */ 743 /* Add blade/pnode info for nodes without cpus */
@@ -658,5 +758,9 @@ void __init uv_system_init(void)
658 758
659 uv_cpu_init(); 759 uv_cpu_init();
660 uv_scir_register_cpu_notifier(); 760 uv_scir_register_cpu_notifier();
761 uv_register_nmi_notifier();
661 proc_mkdir("sgi_uv", NULL); 762 proc_mkdir("sgi_uv", NULL);
763
764 /* register Legacy VGA I/O redirection handler */
765 pci_register_set_vga_state(uv_set_vga_state);
662} 766}
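
Among the x2apic_uv_x.c additions is an NMI notifier (uv_handle_nmi plus uv_register_nmi_notifier). The skeleton below shows the general die-notifier pattern that code follows; the identifiers are invented for illustration and this is not the UV implementation.

/* Sketch only: assumes kernel context (kdebug.h, notifier.h). */

static int example_nmi_notify(struct notifier_block *self,
			      unsigned long reason, void *data)
{
	if (reason != DIE_NMI_IPI)
		return NOTIFY_OK;	/* not for us, let others look */

	pr_info("example: NMI seen on cpu %u\n", smp_processor_id());
	return NOTIFY_STOP;		/* consumed, stop the chain */
}

static struct notifier_block example_nmi_nb = {
	.notifier_call = example_nmi_notify,
};

static int __init example_nmi_setup(void)
{
	return register_die_notifier(&example_nmi_nb);
}
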
diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 151ace69a5aa..031aa887b0eb 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -204,7 +204,6 @@
204#include <linux/module.h> 204#include <linux/module.h>
205 205
206#include <linux/poll.h> 206#include <linux/poll.h>
207#include <linux/smp_lock.h>
208#include <linux/types.h> 207#include <linux/types.h>
209#include <linux/stddef.h> 208#include <linux/stddef.h>
210#include <linux/timer.h> 209#include <linux/timer.h>
@@ -403,6 +402,7 @@ static DECLARE_WAIT_QUEUE_HEAD(apm_waitqueue);
403static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue); 402static DECLARE_WAIT_QUEUE_HEAD(apm_suspend_waitqueue);
404static struct apm_user *user_list; 403static struct apm_user *user_list;
405static DEFINE_SPINLOCK(user_list_lock); 404static DEFINE_SPINLOCK(user_list_lock);
405static DEFINE_MUTEX(apm_mutex);
406 406
407/* 407/*
408 * Set up a segment that references the real mode segment 0x40 408 * Set up a segment that references the real mode segment 0x40
@@ -1531,7 +1531,7 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1531 return -EPERM; 1531 return -EPERM;
1532 switch (cmd) { 1532 switch (cmd) {
1533 case APM_IOC_STANDBY: 1533 case APM_IOC_STANDBY:
1534 lock_kernel(); 1534 mutex_lock(&apm_mutex);
1535 if (as->standbys_read > 0) { 1535 if (as->standbys_read > 0) {
1536 as->standbys_read--; 1536 as->standbys_read--;
1537 as->standbys_pending--; 1537 as->standbys_pending--;
@@ -1540,10 +1540,10 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1540 queue_event(APM_USER_STANDBY, as); 1540 queue_event(APM_USER_STANDBY, as);
1541 if (standbys_pending <= 0) 1541 if (standbys_pending <= 0)
1542 standby(); 1542 standby();
1543 unlock_kernel(); 1543 mutex_unlock(&apm_mutex);
1544 break; 1544 break;
1545 case APM_IOC_SUSPEND: 1545 case APM_IOC_SUSPEND:
1546 lock_kernel(); 1546 mutex_lock(&apm_mutex);
1547 if (as->suspends_read > 0) { 1547 if (as->suspends_read > 0) {
1548 as->suspends_read--; 1548 as->suspends_read--;
1549 as->suspends_pending--; 1549 as->suspends_pending--;
@@ -1552,13 +1552,14 @@ static long do_ioctl(struct file *filp, u_int cmd, u_long arg)
1552 queue_event(APM_USER_SUSPEND, as); 1552 queue_event(APM_USER_SUSPEND, as);
1553 if (suspends_pending <= 0) { 1553 if (suspends_pending <= 0) {
1554 ret = suspend(1); 1554 ret = suspend(1);
1555 mutex_unlock(&apm_mutex);
1555 } else { 1556 } else {
1556 as->suspend_wait = 1; 1557 as->suspend_wait = 1;
1558 mutex_unlock(&apm_mutex);
1557 wait_event_interruptible(apm_suspend_waitqueue, 1559 wait_event_interruptible(apm_suspend_waitqueue,
1558 as->suspend_wait == 0); 1560 as->suspend_wait == 0);
1559 ret = as->suspend_result; 1561 ret = as->suspend_result;
1560 } 1562 }
1561 unlock_kernel();
1562 return ret; 1563 return ret;
1563 default: 1564 default:
1564 return -ENOTTY; 1565 return -ENOTTY;
@@ -1608,12 +1609,10 @@ static int do_open(struct inode *inode, struct file *filp)
1608{ 1609{
1609 struct apm_user *as; 1610 struct apm_user *as;
1610 1611
1611 lock_kernel();
1612 as = kmalloc(sizeof(*as), GFP_KERNEL); 1612 as = kmalloc(sizeof(*as), GFP_KERNEL);
1613 if (as == NULL) { 1613 if (as == NULL) {
1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n", 1614 printk(KERN_ERR "apm: cannot allocate struct of size %d bytes\n",
1615 sizeof(*as)); 1615 sizeof(*as));
1616 unlock_kernel();
1617 return -ENOMEM; 1616 return -ENOMEM;
1618 } 1617 }
1619 as->magic = APM_BIOS_MAGIC; 1618 as->magic = APM_BIOS_MAGIC;
@@ -1635,7 +1634,6 @@ static int do_open(struct inode *inode, struct file *filp)
1635 user_list = as; 1634 user_list = as;
1636 spin_unlock(&user_list_lock); 1635 spin_unlock(&user_list_lock);
1637 filp->private_data = as; 1636 filp->private_data = as;
1638 unlock_kernel();
1639 return 0; 1637 return 0;
1640} 1638}
1641 1639
@@ -1994,8 +1992,8 @@ static int __init apm_is_horked_d850md(const struct dmi_system_id *d)
1994 apm_info.disabled = 1; 1992 apm_info.disabled = 1;
1995 printk(KERN_INFO "%s machine detected. " 1993 printk(KERN_INFO "%s machine detected. "
1996 "Disabling APM.\n", d->ident); 1994 "Disabling APM.\n", d->ident);
1997 printk(KERN_INFO "This bug is fixed in bios P15 which is available for \n"); 1995 printk(KERN_INFO "This bug is fixed in bios P15 which is available for\n");
1998 printk(KERN_INFO "download from support.intel.com \n"); 1996 printk(KERN_INFO "download from support.intel.com\n");
1999 } 1997 }
2000 return 0; 1998 return 0;
2001} 1999}
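
The apm_32.c hunks replace lock_kernel()/unlock_kernel() with a driver-local mutex, taking care to drop it before the interruptible wait in the suspend path. A condensed sketch of that locking shape, using hypothetical names rather than the APM driver's own:

/* Sketch only: assumes kernel context (mutex.h, wait.h). */

static DEFINE_MUTEX(example_mutex);
static DECLARE_WAIT_QUEUE_HEAD(example_waitqueue);
static int example_wait_flag;

static long example_suspend_ioctl(void)
{
	mutex_lock(&example_mutex);
	/* ... update shared bookkeeping under the mutex ... */
	mutex_unlock(&example_mutex);	/* released before sleeping */

	wait_event_interruptible(example_waitqueue, example_wait_flag == 0);
	return 0;
}
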
diff --git a/arch/x86/kernel/bios_uv.c b/arch/x86/kernel/bios_uv.c
index 63a88e1f987d..8bc57baaa9ad 100644
--- a/arch/x86/kernel/bios_uv.c
+++ b/arch/x86/kernel/bios_uv.c
@@ -15,8 +15,8 @@
15 * along with this program; if not, write to the Free Software 15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 16 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
17 * 17 *
18 * Copyright (c) 2008 Silicon Graphics, Inc. All Rights Reserved. 18 * Copyright (c) 2008-2009 Silicon Graphics, Inc. All Rights Reserved.
19 * Copyright (c) Russ Anderson 19 * Copyright (c) Russ Anderson <rja@sgi.com>
20 */ 20 */
21 21
22#include <linux/efi.h> 22#include <linux/efi.h>
@@ -30,6 +30,7 @@ static struct uv_systab uv_systab;
30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5) 30s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
31{ 31{
32 struct uv_systab *tab = &uv_systab; 32 struct uv_systab *tab = &uv_systab;
33 s64 ret;
33 34
34 if (!tab->function) 35 if (!tab->function)
35 /* 36 /*
@@ -37,9 +38,11 @@ s64 uv_bios_call(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, u64 a4, u64 a5)
37 */ 38 */
38 return BIOS_STATUS_UNIMPLEMENTED; 39 return BIOS_STATUS_UNIMPLEMENTED;
39 40
40 return efi_call6((void *)__va(tab->function), 41 ret = efi_call6((void *)__va(tab->function), (u64)which,
41 (u64)which, a1, a2, a3, a4, a5); 42 a1, a2, a3, a4, a5);
43 return ret;
42} 44}
45EXPORT_SYMBOL_GPL(uv_bios_call);
43 46
44s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3, 47s64 uv_bios_call_irqsave(enum uv_bios_cmd which, u64 a1, u64 a2, u64 a3,
45 u64 a4, u64 a5) 48 u64 a4, u64 a5)
@@ -73,11 +76,14 @@ long sn_coherency_id;
73EXPORT_SYMBOL_GPL(sn_coherency_id); 76EXPORT_SYMBOL_GPL(sn_coherency_id);
74long sn_region_size; 77long sn_region_size;
75EXPORT_SYMBOL_GPL(sn_region_size); 78EXPORT_SYMBOL_GPL(sn_region_size);
79long system_serial_number;
80EXPORT_SYMBOL_GPL(system_serial_number);
76int uv_type; 81int uv_type;
82EXPORT_SYMBOL_GPL(uv_type);
77 83
78 84
79s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher, 85s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
80 long *region) 86 long *region, long *ssn)
81{ 87{
82 s64 ret; 88 s64 ret;
83 u64 v0, v1; 89 u64 v0, v1;
@@ -97,25 +103,24 @@ s64 uv_bios_get_sn_info(int fc, int *uvtype, long *partid, long *coher,
97 *coher = part.coherence_id; 103 *coher = part.coherence_id;
98 if (region) 104 if (region)
99 *region = part.region_size; 105 *region = part.region_size;
106 if (ssn)
107 *ssn = v1;
100 return ret; 108 return ret;
101} 109}
110EXPORT_SYMBOL_GPL(uv_bios_get_sn_info);
102 111
103int 112int
104uv_bios_mq_watchlist_alloc(int blade, unsigned long addr, unsigned int mq_size, 113uv_bios_mq_watchlist_alloc(unsigned long addr, unsigned int mq_size,
105 unsigned long *intr_mmr_offset) 114 unsigned long *intr_mmr_offset)
106{ 115{
107 union uv_watchlist_u size_blade;
108 u64 watchlist; 116 u64 watchlist;
109 s64 ret; 117 s64 ret;
110 118
111 size_blade.size = mq_size;
112 size_blade.blade = blade;
113
114 /* 119 /*
115 * bios returns watchlist number or negative error number. 120 * bios returns watchlist number or negative error number.
116 */ 121 */
117 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr, 122 ret = (int)uv_bios_call_irqsave(UV_BIOS_WATCHLIST_ALLOC, addr,
118 size_blade.val, (u64)intr_mmr_offset, 123 mq_size, (u64)intr_mmr_offset,
119 (u64)&watchlist, 0); 124 (u64)&watchlist, 0);
120 if (ret < BIOS_STATUS_SUCCESS) 125 if (ret < BIOS_STATUS_SUCCESS)
121 return ret; 126 return ret;
@@ -158,6 +163,25 @@ s64 uv_bios_freq_base(u64 clock_type, u64 *ticks_per_second)
158} 163}
159EXPORT_SYMBOL_GPL(uv_bios_freq_base); 164EXPORT_SYMBOL_GPL(uv_bios_freq_base);
160 165
166/*
167 * uv_bios_set_legacy_vga_target - Set Legacy VGA I/O Target
168 * @decode: true to enable target, false to disable target
169 * @domain: PCI domain number
170 * @bus: PCI bus number
171 *
172 * Returns:
173 * 0: Success
174 * -EINVAL: Invalid domain or bus number
175 * -ENOSYS: Capability not available
176 * -EBUSY: Legacy VGA I/O cannot be retargeted at this time
177 */
178int uv_bios_set_legacy_vga_target(bool decode, int domain, int bus)
179{
180 return uv_bios_call(UV_BIOS_SET_LEGACY_VGA_TARGET,
181 (u64)decode, (u64)domain, (u64)bus, 0, 0);
182}
183EXPORT_SYMBOL_GPL(uv_bios_set_legacy_vga_target);
184
161 185
162#ifdef CONFIG_EFI 186#ifdef CONFIG_EFI
163void uv_bios_init(void) 187void uv_bios_init(void)
@@ -189,4 +213,3 @@ void uv_bios_init(void)
189 213
190void uv_bios_init(void) { } 214void uv_bios_init(void) { }
191#endif 215#endif
192
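
uv_bios_set_legacy_vga_target() documents its return codes in the kernel-doc above; a hypothetical caller that checks them might look like the sketch below (the function name and message are invented).

/* Sketch only: assumes kernel context and the wrapper added above. */

static int example_route_vga(struct pci_dev *pdev, bool decode)
{
	int rc;

	rc = uv_bios_set_legacy_vga_target(decode,
					   pci_domain_nr(pdev->bus),
					   pdev->bus->number);
	if (rc == -ENOSYS)
		printk(KERN_WARNING "example: BIOS cannot retarget VGA\n");

	return rc;
}
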
diff --git a/arch/x86/kernel/bootflag.c b/arch/x86/kernel/bootflag.c
index 30f25a75fe28..5de7f4c56971 100644
--- a/arch/x86/kernel/bootflag.c
+++ b/arch/x86/kernel/bootflag.c
@@ -5,7 +5,6 @@
5#include <linux/kernel.h> 5#include <linux/kernel.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/slab.h>
9#include <linux/spinlock.h> 8#include <linux/spinlock.h>
10#include <linux/acpi.h> 9#include <linux/acpi.h>
11#include <asm/io.h> 10#include <asm/io.h>
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index 68537e957a9b..c202b62f3671 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -5,6 +5,7 @@
5# Don't trace early stages of a secondary CPU boot 5# Don't trace early stages of a secondary CPU boot
6ifdef CONFIG_FUNCTION_TRACER 6ifdef CONFIG_FUNCTION_TRACER
7CFLAGS_REMOVE_common.o = -pg 7CFLAGS_REMOVE_common.o = -pg
8CFLAGS_REMOVE_perf_event.o = -pg
8endif 9endif
9 10
10# Make sure load_percpu_segment has no stackprotector 11# Make sure load_percpu_segment has no stackprotector
@@ -18,8 +19,6 @@ obj-y += vmware.o hypervisor.o sched.o
18obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o 19obj-$(CONFIG_X86_32) += bugs.o cmpxchg.o
19obj-$(CONFIG_X86_64) += bugs_64.o 20obj-$(CONFIG_X86_64) += bugs_64.o
20 21
21obj-$(CONFIG_X86_CPU_DEBUG) += cpu_debug.o
22
23obj-$(CONFIG_CPU_SUP_INTEL) += intel.o 22obj-$(CONFIG_CPU_SUP_INTEL) += intel.o
24obj-$(CONFIG_CPU_SUP_AMD) += amd.o 23obj-$(CONFIG_CPU_SUP_AMD) += amd.o
25obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o 24obj-$(CONFIG_CPU_SUP_CYRIX_32) += cyrix.o
diff --git a/arch/x86/kernel/cpu/addon_cpuid_features.c b/arch/x86/kernel/cpu/addon_cpuid_features.c
index c965e5212714..97ad79cdf688 100644
--- a/arch/x86/kernel/cpu/addon_cpuid_features.c
+++ b/arch/x86/kernel/cpu/addon_cpuid_features.c
@@ -32,6 +32,10 @@ void __cpuinit init_scattered_cpuid_features(struct cpuinfo_x86 *c)
32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = { 32 static const struct cpuid_bit __cpuinitconst cpuid_bits[] = {
33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 }, 33 { X86_FEATURE_IDA, CR_EAX, 1, 0x00000006 },
34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 }, 34 { X86_FEATURE_ARAT, CR_EAX, 2, 0x00000006 },
35 { X86_FEATURE_NPT, CR_EDX, 0, 0x8000000a },
36 { X86_FEATURE_LBRV, CR_EDX, 1, 0x8000000a },
37 { X86_FEATURE_SVML, CR_EDX, 2, 0x8000000a },
38 { X86_FEATURE_NRIPS, CR_EDX, 3, 0x8000000a },
35 { 0, 0, 0, 0 } 39 { 0, 0, 0, 0 }
36 }; 40 };
37 41
@@ -74,6 +78,7 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
74 unsigned int eax, ebx, ecx, edx, sub_index; 78 unsigned int eax, ebx, ecx, edx, sub_index;
75 unsigned int ht_mask_width, core_plus_mask_width; 79 unsigned int ht_mask_width, core_plus_mask_width;
76 unsigned int core_select_mask, core_level_siblings; 80 unsigned int core_select_mask, core_level_siblings;
81 static bool printed;
77 82
78 if (c->cpuid_level < 0xb) 83 if (c->cpuid_level < 0xb)
79 return; 84 return;
@@ -127,12 +132,14 @@ void __cpuinit detect_extended_topology(struct cpuinfo_x86 *c)
127 132
128 c->x86_max_cores = (core_level_siblings / smp_num_siblings); 133 c->x86_max_cores = (core_level_siblings / smp_num_siblings);
129 134
130 135 if (!printed) {
131 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 136 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
132 c->phys_proc_id); 137 c->phys_proc_id);
133 if (c->x86_max_cores > 1) 138 if (c->x86_max_cores > 1)
134 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 139 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
135 c->cpu_core_id); 140 c->cpu_core_id);
141 printed = 1;
142 }
136 return; 143 return;
137#endif 144#endif
138} 145}
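
Both this hunk and the common.c changes further down gate their topology printks behind a "print once" guard. The two equivalent idioms, in sketch form with invented messages:

/* Sketch only: assumes kernel context (printk.h). */

static void example_report(int id)
{
	static bool printed;

	if (!printed) {
		printk(KERN_INFO "example: physical id %d\n", id);
		printed = true;
	}

	/* same effect with no hand-rolled flag: */
	printk_once(KERN_INFO "example: first invocation\n");
}
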
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index c910a716a71c..e485825130d2 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -254,59 +254,36 @@ static int __cpuinit nearby_node(int apicid)
254 254
255/* 255/*
256 * Fixup core topology information for AMD multi-node processors. 256 * Fixup core topology information for AMD multi-node processors.
257 * Assumption 1: Number of cores in each internal node is the same. 257 * Assumption: Number of cores in each internal node is the same.
258 * Assumption 2: Mixed systems with both single-node and dual-node
259 * processors are not supported.
260 */ 258 */
261#ifdef CONFIG_X86_HT 259#ifdef CONFIG_X86_HT
262static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c) 260static void __cpuinit amd_fixup_dcm(struct cpuinfo_x86 *c)
263{ 261{
264#ifdef CONFIG_PCI 262 unsigned long long value;
265 u32 t, cpn; 263 u32 nodes, cores_per_node;
266 u8 n, n_id;
267 int cpu = smp_processor_id(); 264 int cpu = smp_processor_id();
268 265
266 if (!cpu_has(c, X86_FEATURE_NODEID_MSR))
267 return;
268
269 /* fixup topology information only once for a core */ 269 /* fixup topology information only once for a core */
270 if (cpu_has(c, X86_FEATURE_AMD_DCM)) 270 if (cpu_has(c, X86_FEATURE_AMD_DCM))
271 return; 271 return;
272 272
273 /* check for multi-node processor on boot cpu */ 273 rdmsrl(MSR_FAM10H_NODE_ID, value);
274 t = read_pci_config(0, 24, 3, 0xe8); 274
275 if (!(t & (1 << 29))) 275 nodes = ((value >> 3) & 7) + 1;
276 if (nodes == 1)
276 return; 277 return;
277 278
278 set_cpu_cap(c, X86_FEATURE_AMD_DCM); 279 set_cpu_cap(c, X86_FEATURE_AMD_DCM);
280 cores_per_node = c->x86_max_cores / nodes;
279 281
280 /* cores per node: each internal node has half the number of cores */ 282 /* store NodeID, use llc_shared_map to store sibling info */
281 cpn = c->x86_max_cores >> 1; 283 per_cpu(cpu_llc_id, cpu) = value & 7;
282
283 /* even-numbered NB_id of this dual-node processor */
284 n = c->phys_proc_id << 1;
285
286 /*
287 * determine internal node id and assign cores fifty-fifty to
288 * each node of the dual-node processor
289 */
290 t = read_pci_config(0, 24 + n, 3, 0xe8);
291 n = (t>>30) & 0x3;
292 if (n == 0) {
293 if (c->cpu_core_id < cpn)
294 n_id = 0;
295 else
296 n_id = 1;
297 } else {
298 if (c->cpu_core_id < cpn)
299 n_id = 1;
300 else
301 n_id = 0;
302 }
303
304 /* compute entire NodeID, use llc_shared_map to store sibling info */
305 per_cpu(cpu_llc_id, cpu) = (c->phys_proc_id << 1) + n_id;
306 284
307 /* fixup core id to be in range from 0 to cpn */ 285 /* fixup core id to be in range from 0 to (cores_per_node - 1) */
308 c->cpu_core_id = c->cpu_core_id % cpn; 286 c->cpu_core_id = c->cpu_core_id % cores_per_node;
309#endif
310} 287}
311#endif 288#endif
312 289
@@ -375,8 +352,6 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
375 node = nearby_node(apicid); 352 node = nearby_node(apicid);
376 } 353 }
377 numa_set_node(cpu, node); 354 numa_set_node(cpu, node);
378
379 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
380#endif 355#endif
381} 356}
382 357
@@ -535,7 +510,7 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c)
535 } 510 }
536 } 511 }
537 512
538 display_cacheinfo(c); 513 cpu_detect_cache_sizes(c);
539 514
540 /* Multi core CPU? */ 515 /* Multi core CPU? */
541 if (c->extended_cpuid_level >= 0x80000008) { 516 if (c->extended_cpuid_level >= 0x80000008) {
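
The amd.c rework derives the node count and node id from MSR_FAM10H_NODE_ID instead of probing PCI config space. The bit layout in the sketch below mirrors the diff (bits 2:0 hold the node id, bits 5:3 hold nodes-per-processor minus one); treat it as an illustration rather than a reference for the MSR format.

/* Sketch only: assumes kernel context (msr.h). */

static void example_read_node_topology(unsigned int *node_id,
				       unsigned int *nr_nodes)
{
	unsigned long long value;

	rdmsrl(MSR_FAM10H_NODE_ID, value);
	*node_id  = value & 7;			/* bits 2:0 */
	*nr_nodes = ((value >> 3) & 7) + 1;	/* bits 5:3, stored minus one */
}
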
diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c
index c95e831bb095..e58d978e0758 100644
--- a/arch/x86/kernel/cpu/centaur.c
+++ b/arch/x86/kernel/cpu/centaur.c
@@ -294,7 +294,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c)
294 set_cpu_cap(c, X86_FEATURE_REP_GOOD); 294 set_cpu_cap(c, X86_FEATURE_REP_GOOD);
295 } 295 }
296 296
297 display_cacheinfo(c); 297 cpu_detect_cache_sizes(c);
298} 298}
299 299
300enum { 300enum {
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index cc25c2b4a567..4868e4a951ee 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -61,7 +61,7 @@ void __init setup_cpu_local_masks(void)
61static void __cpuinit default_init(struct cpuinfo_x86 *c) 61static void __cpuinit default_init(struct cpuinfo_x86 *c)
62{ 62{
63#ifdef CONFIG_X86_64 63#ifdef CONFIG_X86_64
64 display_cacheinfo(c); 64 cpu_detect_cache_sizes(c);
65#else 65#else
66 /* Not much we can do here... */ 66 /* Not much we can do here... */
67 /* Check if at least it has cpuid */ 67 /* Check if at least it has cpuid */
@@ -383,7 +383,7 @@ static void __cpuinit get_model_name(struct cpuinfo_x86 *c)
383 } 383 }
384} 384}
385 385
386void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c) 386void __cpuinit cpu_detect_cache_sizes(struct cpuinfo_x86 *c)
387{ 387{
388 unsigned int n, dummy, ebx, ecx, edx, l2size; 388 unsigned int n, dummy, ebx, ecx, edx, l2size;
389 389
@@ -391,8 +391,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
391 391
392 if (n >= 0x80000005) { 392 if (n >= 0x80000005) {
393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx); 393 cpuid(0x80000005, &dummy, &ebx, &ecx, &edx);
394 printk(KERN_INFO "CPU: L1 I Cache: %dK (%d bytes/line), D cache %dK (%d bytes/line)\n",
395 edx>>24, edx&0xFF, ecx>>24, ecx&0xFF);
396 c->x86_cache_size = (ecx>>24) + (edx>>24); 394 c->x86_cache_size = (ecx>>24) + (edx>>24);
397#ifdef CONFIG_X86_64 395#ifdef CONFIG_X86_64
398 /* On K8 L1 TLB is inclusive, so don't count it */ 396 /* On K8 L1 TLB is inclusive, so don't count it */
@@ -422,9 +420,6 @@ void __cpuinit display_cacheinfo(struct cpuinfo_x86 *c)
422#endif 420#endif
423 421
424 c->x86_cache_size = l2size; 422 c->x86_cache_size = l2size;
425
426 printk(KERN_INFO "CPU: L2 Cache: %dK (%d bytes/line)\n",
427 l2size, ecx & 0xFF);
428} 423}
429 424
430void __cpuinit detect_ht(struct cpuinfo_x86 *c) 425void __cpuinit detect_ht(struct cpuinfo_x86 *c)
@@ -432,6 +427,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
432#ifdef CONFIG_X86_HT 427#ifdef CONFIG_X86_HT
433 u32 eax, ebx, ecx, edx; 428 u32 eax, ebx, ecx, edx;
434 int index_msb, core_bits; 429 int index_msb, core_bits;
430 static bool printed;
435 431
436 if (!cpu_has(c, X86_FEATURE_HT)) 432 if (!cpu_has(c, X86_FEATURE_HT))
437 return; 433 return;
@@ -447,7 +443,7 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
447 smp_num_siblings = (ebx & 0xff0000) >> 16; 443 smp_num_siblings = (ebx & 0xff0000) >> 16;
448 444
449 if (smp_num_siblings == 1) { 445 if (smp_num_siblings == 1) {
450 printk(KERN_INFO "CPU: Hyper-Threading is disabled\n"); 446 printk_once(KERN_INFO "CPU0: Hyper-Threading is disabled\n");
451 goto out; 447 goto out;
452 } 448 }
453 449
@@ -474,11 +470,12 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
474 ((1 << core_bits) - 1); 470 ((1 << core_bits) - 1);
475 471
476out: 472out:
477 if ((c->x86_max_cores * smp_num_siblings) > 1) { 473 if (!printed && (c->x86_max_cores * smp_num_siblings) > 1) {
478 printk(KERN_INFO "CPU: Physical Processor ID: %d\n", 474 printk(KERN_INFO "CPU: Physical Processor ID: %d\n",
479 c->phys_proc_id); 475 c->phys_proc_id);
480 printk(KERN_INFO "CPU: Processor Core ID: %d\n", 476 printk(KERN_INFO "CPU: Processor Core ID: %d\n",
481 c->cpu_core_id); 477 c->cpu_core_id);
478 printed = 1;
482 } 479 }
483#endif 480#endif
484} 481}
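detect_ht() above pulls the logical-processor count from CPUID leaf 1 (EBX bits 23:16) and later masks the APIC ID with (1 << core_bits) - 1 to recover the core id. The surrounding topology split is elided in this hunk, but the general pattern looks roughly like the user-space sketch below (field widths and the APIC ID are illustrative, not taken from the patch):

/* Rough illustration of splitting an initial APIC ID into
 * thread/core/package fields once sibling and core counts are known.
 */
#include <stdio.h>

static unsigned int count_order(unsigned int n)         /* ceil(log2(n)) */
{
        unsigned int order = 0;

        while ((1u << order) < n)
                order++;
        return order;
}

int main(void)
{
        unsigned int apicid   = 0x1b;   /* example initial APIC ID        */
        unsigned int siblings = 2;      /* threads per core               */
        unsigned int cores    = 4;      /* cores per package              */

        unsigned int thread_bits = count_order(siblings);
        unsigned int core_bits   = count_order(cores);

        unsigned int thread_id = apicid & ((1u << thread_bits) - 1);
        unsigned int core_id   = (apicid >> thread_bits) & ((1u << core_bits) - 1);
        unsigned int pkg_id    = apicid >> (thread_bits + core_bits);

        printf("pkg %u core %u thread %u\n", pkg_id, core_id, thread_id);
        return 0;
}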
@@ -659,24 +656,31 @@ void __init early_cpu_init(void)
659 const struct cpu_dev *const *cdev; 656 const struct cpu_dev *const *cdev;
660 int count = 0; 657 int count = 0;
661 658
659#ifdef PROCESSOR_SELECT
662 printk(KERN_INFO "KERNEL supported cpus:\n"); 660 printk(KERN_INFO "KERNEL supported cpus:\n");
661#endif
662
663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) { 663 for (cdev = __x86_cpu_dev_start; cdev < __x86_cpu_dev_end; cdev++) {
664 const struct cpu_dev *cpudev = *cdev; 664 const struct cpu_dev *cpudev = *cdev;
665 unsigned int j;
666 665
667 if (count >= X86_VENDOR_NUM) 666 if (count >= X86_VENDOR_NUM)
668 break; 667 break;
669 cpu_devs[count] = cpudev; 668 cpu_devs[count] = cpudev;
670 count++; 669 count++;
671 670
672 for (j = 0; j < 2; j++) { 671#ifdef PROCESSOR_SELECT
673 if (!cpudev->c_ident[j]) 672 {
674 continue; 673 unsigned int j;
675 printk(KERN_INFO " %s %s\n", cpudev->c_vendor, 674
676 cpudev->c_ident[j]); 675 for (j = 0; j < 2; j++) {
676 if (!cpudev->c_ident[j])
677 continue;
678 printk(KERN_INFO " %s %s\n", cpudev->c_vendor,
679 cpudev->c_ident[j]);
680 }
677 } 681 }
682#endif
678 } 683 }
679
680 early_identify_cpu(&boot_cpu_data); 684 early_identify_cpu(&boot_cpu_data);
681} 685}
682 686
@@ -837,10 +841,8 @@ static void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
837 boot_cpu_data.x86_capability[i] &= c->x86_capability[i]; 841 boot_cpu_data.x86_capability[i] &= c->x86_capability[i];
838 } 842 }
839 843
840#ifdef CONFIG_X86_MCE
841 /* Init Machine Check Exception if available. */ 844 /* Init Machine Check Exception if available. */
842 mcheck_init(c); 845 mcheck_cpu_init(c);
843#endif
844 846
845 select_idle_routine(c); 847 select_idle_routine(c);
846 848
@@ -1093,7 +1095,7 @@ static void clear_all_debug_regs(void)
1093 1095
1094void __cpuinit cpu_init(void) 1096void __cpuinit cpu_init(void)
1095{ 1097{
1096 struct orig_ist *orig_ist; 1098 struct orig_ist *oist;
1097 struct task_struct *me; 1099 struct task_struct *me;
1098 struct tss_struct *t; 1100 struct tss_struct *t;
1099 unsigned long v; 1101 unsigned long v;
@@ -1102,7 +1104,7 @@ void __cpuinit cpu_init(void)
1102 1104
1103 cpu = stack_smp_processor_id(); 1105 cpu = stack_smp_processor_id();
1104 t = &per_cpu(init_tss, cpu); 1106 t = &per_cpu(init_tss, cpu);
1105 orig_ist = &per_cpu(orig_ist, cpu); 1107 oist = &per_cpu(orig_ist, cpu);
1106 1108
1107#ifdef CONFIG_NUMA 1109#ifdef CONFIG_NUMA
1108 if (cpu != 0 && percpu_read(node_number) == 0 && 1110 if (cpu != 0 && percpu_read(node_number) == 0 &&
@@ -1115,7 +1117,7 @@ void __cpuinit cpu_init(void)
1115 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask)) 1117 if (cpumask_test_and_set_cpu(cpu, cpu_initialized_mask))
1116 panic("CPU#%d already initialized!\n", cpu); 1118 panic("CPU#%d already initialized!\n", cpu);
1117 1119
1118 printk(KERN_INFO "Initializing CPU#%d\n", cpu); 1120 pr_debug("Initializing CPU#%d\n", cpu);
1119 1121
1120 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE); 1122 clear_in_cr4(X86_CR4_VME|X86_CR4_PVI|X86_CR4_TSD|X86_CR4_DE);
1121 1123
@@ -1136,19 +1138,19 @@ void __cpuinit cpu_init(void)
1136 wrmsrl(MSR_KERNEL_GS_BASE, 0); 1138 wrmsrl(MSR_KERNEL_GS_BASE, 0);
1137 barrier(); 1139 barrier();
1138 1140
1139 check_efer(); 1141 x86_configure_nx();
1140 if (cpu != 0) 1142 if (cpu != 0)
1141 enable_x2apic(); 1143 enable_x2apic();
1142 1144
1143 /* 1145 /*
1144 * set up and load the per-CPU TSS 1146 * set up and load the per-CPU TSS
1145 */ 1147 */
1146 if (!orig_ist->ist[0]) { 1148 if (!oist->ist[0]) {
1147 char *estacks = per_cpu(exception_stacks, cpu); 1149 char *estacks = per_cpu(exception_stacks, cpu);
1148 1150
1149 for (v = 0; v < N_EXCEPTION_STACKS; v++) { 1151 for (v = 0; v < N_EXCEPTION_STACKS; v++) {
1150 estacks += exception_stack_sizes[v]; 1152 estacks += exception_stack_sizes[v];
1151 orig_ist->ist[v] = t->x86_tss.ist[v] = 1153 oist->ist[v] = t->x86_tss.ist[v] =
1152 (unsigned long)estacks; 1154 (unsigned long)estacks;
1153 } 1155 }
1154 } 1156 }
diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h
index 6de9a908e400..3624e8a0f71b 100644
--- a/arch/x86/kernel/cpu/cpu.h
+++ b/arch/x86/kernel/cpu/cpu.h
@@ -32,6 +32,6 @@ struct cpu_dev {
32extern const struct cpu_dev *const __x86_cpu_dev_start[], 32extern const struct cpu_dev *const __x86_cpu_dev_start[],
33 *const __x86_cpu_dev_end[]; 33 *const __x86_cpu_dev_end[];
34 34
35extern void display_cacheinfo(struct cpuinfo_x86 *c); 35extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c);
36 36
37#endif 37#endif
diff --git a/arch/x86/kernel/cpu/cpu_debug.c b/arch/x86/kernel/cpu/cpu_debug.c
deleted file mode 100644
index dca325c03999..000000000000
--- a/arch/x86/kernel/cpu/cpu_debug.c
+++ /dev/null
@@ -1,688 +0,0 @@
1/*
2 * CPU x86 architecture debug code
3 *
4 * Copyright(C) 2009 Jaswinder Singh Rajput
5 *
6 * For licencing details see kernel-base/COPYING
7 */
8
9#include <linux/interrupt.h>
10#include <linux/compiler.h>
11#include <linux/seq_file.h>
12#include <linux/debugfs.h>
13#include <linux/kprobes.h>
14#include <linux/uaccess.h>
15#include <linux/kernel.h>
16#include <linux/module.h>
17#include <linux/percpu.h>
18#include <linux/signal.h>
19#include <linux/errno.h>
20#include <linux/sched.h>
21#include <linux/types.h>
22#include <linux/init.h>
23#include <linux/slab.h>
24#include <linux/smp.h>
25
26#include <asm/cpu_debug.h>
27#include <asm/paravirt.h>
28#include <asm/system.h>
29#include <asm/traps.h>
30#include <asm/apic.h>
31#include <asm/desc.h>
32
33static DEFINE_PER_CPU(struct cpu_cpuX_base [CPU_REG_ALL_BIT], cpu_arr);
34static DEFINE_PER_CPU(struct cpu_private * [MAX_CPU_FILES], priv_arr);
35static DEFINE_PER_CPU(int, cpu_priv_count);
36
37static DEFINE_MUTEX(cpu_debug_lock);
38
39static struct dentry *cpu_debugfs_dir;
40
41static struct cpu_debug_base cpu_base[] = {
42 { "mc", CPU_MC, 0 },
43 { "monitor", CPU_MONITOR, 0 },
44 { "time", CPU_TIME, 0 },
45 { "pmc", CPU_PMC, 1 },
46 { "platform", CPU_PLATFORM, 0 },
47 { "apic", CPU_APIC, 0 },
48 { "poweron", CPU_POWERON, 0 },
49 { "control", CPU_CONTROL, 0 },
50 { "features", CPU_FEATURES, 0 },
51 { "lastbranch", CPU_LBRANCH, 0 },
52 { "bios", CPU_BIOS, 0 },
53 { "freq", CPU_FREQ, 0 },
54 { "mtrr", CPU_MTRR, 0 },
55 { "perf", CPU_PERF, 0 },
56 { "cache", CPU_CACHE, 0 },
57 { "sysenter", CPU_SYSENTER, 0 },
58 { "therm", CPU_THERM, 0 },
59 { "misc", CPU_MISC, 0 },
60 { "debug", CPU_DEBUG, 0 },
61 { "pat", CPU_PAT, 0 },
62 { "vmx", CPU_VMX, 0 },
63 { "call", CPU_CALL, 0 },
64 { "base", CPU_BASE, 0 },
65 { "ver", CPU_VER, 0 },
66 { "conf", CPU_CONF, 0 },
67 { "smm", CPU_SMM, 0 },
68 { "svm", CPU_SVM, 0 },
69 { "osvm", CPU_OSVM, 0 },
70 { "tss", CPU_TSS, 0 },
71 { "cr", CPU_CR, 0 },
72 { "dt", CPU_DT, 0 },
73 { "registers", CPU_REG_ALL, 0 },
74};
75
76static struct cpu_file_base cpu_file[] = {
77 { "index", CPU_REG_ALL, 0 },
78 { "value", CPU_REG_ALL, 1 },
79};
80
81/* CPU Registers Range */
82static struct cpu_debug_range cpu_reg_range[] = {
83 { 0x00000000, 0x00000001, CPU_MC, },
84 { 0x00000006, 0x00000007, CPU_MONITOR, },
85 { 0x00000010, 0x00000010, CPU_TIME, },
86 { 0x00000011, 0x00000013, CPU_PMC, },
87 { 0x00000017, 0x00000017, CPU_PLATFORM, },
88 { 0x0000001B, 0x0000001B, CPU_APIC, },
89 { 0x0000002A, 0x0000002B, CPU_POWERON, },
90 { 0x0000002C, 0x0000002C, CPU_FREQ, },
91 { 0x0000003A, 0x0000003A, CPU_CONTROL, },
92 { 0x00000040, 0x00000047, CPU_LBRANCH, },
93 { 0x00000060, 0x00000067, CPU_LBRANCH, },
94 { 0x00000079, 0x00000079, CPU_BIOS, },
95 { 0x00000088, 0x0000008A, CPU_CACHE, },
96 { 0x0000008B, 0x0000008B, CPU_BIOS, },
97 { 0x0000009B, 0x0000009B, CPU_MONITOR, },
98 { 0x000000C1, 0x000000C4, CPU_PMC, },
99 { 0x000000CD, 0x000000CD, CPU_FREQ, },
100 { 0x000000E7, 0x000000E8, CPU_PERF, },
101 { 0x000000FE, 0x000000FE, CPU_MTRR, },
102
103 { 0x00000116, 0x0000011E, CPU_CACHE, },
104 { 0x00000174, 0x00000176, CPU_SYSENTER, },
105 { 0x00000179, 0x0000017B, CPU_MC, },
106 { 0x00000186, 0x00000189, CPU_PMC, },
107 { 0x00000198, 0x00000199, CPU_PERF, },
108 { 0x0000019A, 0x0000019A, CPU_TIME, },
109 { 0x0000019B, 0x0000019D, CPU_THERM, },
110 { 0x000001A0, 0x000001A0, CPU_MISC, },
111 { 0x000001C9, 0x000001C9, CPU_LBRANCH, },
112 { 0x000001D7, 0x000001D8, CPU_LBRANCH, },
113 { 0x000001D9, 0x000001D9, CPU_DEBUG, },
114 { 0x000001DA, 0x000001E0, CPU_LBRANCH, },
115
116 { 0x00000200, 0x0000020F, CPU_MTRR, },
117 { 0x00000250, 0x00000250, CPU_MTRR, },
118 { 0x00000258, 0x00000259, CPU_MTRR, },
119 { 0x00000268, 0x0000026F, CPU_MTRR, },
120 { 0x00000277, 0x00000277, CPU_PAT, },
121 { 0x000002FF, 0x000002FF, CPU_MTRR, },
122
123 { 0x00000300, 0x00000311, CPU_PMC, },
124 { 0x00000345, 0x00000345, CPU_PMC, },
125 { 0x00000360, 0x00000371, CPU_PMC, },
126 { 0x0000038D, 0x00000390, CPU_PMC, },
127 { 0x000003A0, 0x000003BE, CPU_PMC, },
128 { 0x000003C0, 0x000003CD, CPU_PMC, },
129 { 0x000003E0, 0x000003E1, CPU_PMC, },
130 { 0x000003F0, 0x000003F2, CPU_PMC, },
131
132 { 0x00000400, 0x00000417, CPU_MC, },
133 { 0x00000480, 0x0000048B, CPU_VMX, },
134
135 { 0x00000600, 0x00000600, CPU_DEBUG, },
136 { 0x00000680, 0x0000068F, CPU_LBRANCH, },
137 { 0x000006C0, 0x000006CF, CPU_LBRANCH, },
138
139 { 0x000107CC, 0x000107D3, CPU_PMC, },
140
141 { 0xC0000080, 0xC0000080, CPU_FEATURES, },
142 { 0xC0000081, 0xC0000084, CPU_CALL, },
143 { 0xC0000100, 0xC0000102, CPU_BASE, },
144 { 0xC0000103, 0xC0000103, CPU_TIME, },
145
146 { 0xC0010000, 0xC0010007, CPU_PMC, },
147 { 0xC0010010, 0xC0010010, CPU_CONF, },
148 { 0xC0010015, 0xC0010015, CPU_CONF, },
149 { 0xC0010016, 0xC001001A, CPU_MTRR, },
150 { 0xC001001D, 0xC001001D, CPU_MTRR, },
151 { 0xC001001F, 0xC001001F, CPU_CONF, },
152 { 0xC0010030, 0xC0010035, CPU_BIOS, },
153 { 0xC0010044, 0xC0010048, CPU_MC, },
154 { 0xC0010050, 0xC0010056, CPU_SMM, },
155 { 0xC0010058, 0xC0010058, CPU_CONF, },
156 { 0xC0010060, 0xC0010060, CPU_CACHE, },
157 { 0xC0010061, 0xC0010068, CPU_SMM, },
158 { 0xC0010069, 0xC001006B, CPU_SMM, },
159 { 0xC0010070, 0xC0010071, CPU_SMM, },
160 { 0xC0010111, 0xC0010113, CPU_SMM, },
161 { 0xC0010114, 0xC0010118, CPU_SVM, },
162 { 0xC0010140, 0xC0010141, CPU_OSVM, },
163 { 0xC0011022, 0xC0011023, CPU_CONF, },
164};
165
166static int is_typeflag_valid(unsigned cpu, unsigned flag)
167{
168 int i;
169
170 /* Standard Registers should be always valid */
171 if (flag >= CPU_TSS)
172 return 1;
173
174 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
175 if (cpu_reg_range[i].flag == flag)
176 return 1;
177 }
178
179 /* Invalid */
180 return 0;
181}
182
183static unsigned get_cpu_range(unsigned cpu, unsigned *min, unsigned *max,
184 int index, unsigned flag)
185{
186 if (cpu_reg_range[index].flag == flag) {
187 *min = cpu_reg_range[index].min;
188 *max = cpu_reg_range[index].max;
189 } else
190 *max = 0;
191
192 return *max;
193}
194
195/* This function can also be called with seq = NULL for printk */
196static void print_cpu_data(struct seq_file *seq, unsigned type,
197 u32 low, u32 high)
198{
199 struct cpu_private *priv;
200 u64 val = high;
201
202 if (seq) {
203 priv = seq->private;
204 if (priv->file) {
205 val = (val << 32) | low;
206 seq_printf(seq, "0x%llx\n", val);
207 } else
208 seq_printf(seq, " %08x: %08x_%08x\n",
209 type, high, low);
210 } else
211 printk(KERN_INFO " %08x: %08x_%08x\n", type, high, low);
212}
213
214/* This function can also be called with seq = NULL for printk */
215static void print_msr(struct seq_file *seq, unsigned cpu, unsigned flag)
216{
217 unsigned msr, msr_min, msr_max;
218 struct cpu_private *priv;
219 u32 low, high;
220 int i;
221
222 if (seq) {
223 priv = seq->private;
224 if (priv->file) {
225 if (!rdmsr_safe_on_cpu(priv->cpu, priv->reg,
226 &low, &high))
227 print_cpu_data(seq, priv->reg, low, high);
228 return;
229 }
230 }
231
232 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
233 if (!get_cpu_range(cpu, &msr_min, &msr_max, i, flag))
234 continue;
235
236 for (msr = msr_min; msr <= msr_max; msr++) {
237 if (rdmsr_safe_on_cpu(cpu, msr, &low, &high))
238 continue;
239 print_cpu_data(seq, msr, low, high);
240 }
241 }
242}
243
244static void print_tss(void *arg)
245{
246 struct pt_regs *regs = task_pt_regs(current);
247 struct seq_file *seq = arg;
248 unsigned int seg;
249
250 seq_printf(seq, " RAX\t: %016lx\n", regs->ax);
251 seq_printf(seq, " RBX\t: %016lx\n", regs->bx);
252 seq_printf(seq, " RCX\t: %016lx\n", regs->cx);
253 seq_printf(seq, " RDX\t: %016lx\n", regs->dx);
254
255 seq_printf(seq, " RSI\t: %016lx\n", regs->si);
256 seq_printf(seq, " RDI\t: %016lx\n", regs->di);
257 seq_printf(seq, " RBP\t: %016lx\n", regs->bp);
258 seq_printf(seq, " ESP\t: %016lx\n", regs->sp);
259
260#ifdef CONFIG_X86_64
261 seq_printf(seq, " R08\t: %016lx\n", regs->r8);
262 seq_printf(seq, " R09\t: %016lx\n", regs->r9);
263 seq_printf(seq, " R10\t: %016lx\n", regs->r10);
264 seq_printf(seq, " R11\t: %016lx\n", regs->r11);
265 seq_printf(seq, " R12\t: %016lx\n", regs->r12);
266 seq_printf(seq, " R13\t: %016lx\n", regs->r13);
267 seq_printf(seq, " R14\t: %016lx\n", regs->r14);
268 seq_printf(seq, " R15\t: %016lx\n", regs->r15);
269#endif
270
271 asm("movl %%cs,%0" : "=r" (seg));
272 seq_printf(seq, " CS\t: %04x\n", seg);
273 asm("movl %%ds,%0" : "=r" (seg));
274 seq_printf(seq, " DS\t: %04x\n", seg);
275 seq_printf(seq, " SS\t: %04lx\n", regs->ss & 0xffff);
276 asm("movl %%es,%0" : "=r" (seg));
277 seq_printf(seq, " ES\t: %04x\n", seg);
278 asm("movl %%fs,%0" : "=r" (seg));
279 seq_printf(seq, " FS\t: %04x\n", seg);
280 asm("movl %%gs,%0" : "=r" (seg));
281 seq_printf(seq, " GS\t: %04x\n", seg);
282
283 seq_printf(seq, " EFLAGS\t: %016lx\n", regs->flags);
284
285 seq_printf(seq, " EIP\t: %016lx\n", regs->ip);
286}
287
288static void print_cr(void *arg)
289{
290 struct seq_file *seq = arg;
291
292 seq_printf(seq, " cr0\t: %016lx\n", read_cr0());
293 seq_printf(seq, " cr2\t: %016lx\n", read_cr2());
294 seq_printf(seq, " cr3\t: %016lx\n", read_cr3());
295 seq_printf(seq, " cr4\t: %016lx\n", read_cr4_safe());
296#ifdef CONFIG_X86_64
297 seq_printf(seq, " cr8\t: %016lx\n", read_cr8());
298#endif
299}
300
301static void print_desc_ptr(char *str, struct seq_file *seq, struct desc_ptr dt)
302{
303 seq_printf(seq, " %s\t: %016llx\n", str, (u64)(dt.address | dt.size));
304}
305
306static void print_dt(void *seq)
307{
308 struct desc_ptr dt;
309 unsigned long ldt;
310
311 /* IDT */
312 store_idt((struct desc_ptr *)&dt);
313 print_desc_ptr("IDT", seq, dt);
314
315 /* GDT */
316 store_gdt((struct desc_ptr *)&dt);
317 print_desc_ptr("GDT", seq, dt);
318
319 /* LDT */
320 store_ldt(ldt);
321 seq_printf(seq, " LDT\t: %016lx\n", ldt);
322
323 /* TR */
324 store_tr(ldt);
325 seq_printf(seq, " TR\t: %016lx\n", ldt);
326}
327
328static void print_dr(void *arg)
329{
330 struct seq_file *seq = arg;
331 unsigned long dr;
332 int i;
333
334 for (i = 0; i < 8; i++) {
335 /* Ignore db4, db5 */
336 if ((i == 4) || (i == 5))
337 continue;
338 get_debugreg(dr, i);
339 seq_printf(seq, " dr%d\t: %016lx\n", i, dr);
340 }
341
342 seq_printf(seq, "\n MSR\t:\n");
343}
344
345static void print_apic(void *arg)
346{
347 struct seq_file *seq = arg;
348
349#ifdef CONFIG_X86_LOCAL_APIC
350 seq_printf(seq, " LAPIC\t:\n");
351 seq_printf(seq, " ID\t\t: %08x\n", apic_read(APIC_ID) >> 24);
352 seq_printf(seq, " LVR\t\t: %08x\n", apic_read(APIC_LVR));
353 seq_printf(seq, " TASKPRI\t: %08x\n", apic_read(APIC_TASKPRI));
354 seq_printf(seq, " ARBPRI\t\t: %08x\n", apic_read(APIC_ARBPRI));
355 seq_printf(seq, " PROCPRI\t: %08x\n", apic_read(APIC_PROCPRI));
356 seq_printf(seq, " LDR\t\t: %08x\n", apic_read(APIC_LDR));
357 seq_printf(seq, " DFR\t\t: %08x\n", apic_read(APIC_DFR));
358 seq_printf(seq, " SPIV\t\t: %08x\n", apic_read(APIC_SPIV));
359 seq_printf(seq, " ISR\t\t: %08x\n", apic_read(APIC_ISR));
360 seq_printf(seq, " ESR\t\t: %08x\n", apic_read(APIC_ESR));
361 seq_printf(seq, " ICR\t\t: %08x\n", apic_read(APIC_ICR));
362 seq_printf(seq, " ICR2\t\t: %08x\n", apic_read(APIC_ICR2));
363 seq_printf(seq, " LVTT\t\t: %08x\n", apic_read(APIC_LVTT));
364 seq_printf(seq, " LVTTHMR\t: %08x\n", apic_read(APIC_LVTTHMR));
365 seq_printf(seq, " LVTPC\t\t: %08x\n", apic_read(APIC_LVTPC));
366 seq_printf(seq, " LVT0\t\t: %08x\n", apic_read(APIC_LVT0));
367 seq_printf(seq, " LVT1\t\t: %08x\n", apic_read(APIC_LVT1));
368 seq_printf(seq, " LVTERR\t\t: %08x\n", apic_read(APIC_LVTERR));
369 seq_printf(seq, " TMICT\t\t: %08x\n", apic_read(APIC_TMICT));
370 seq_printf(seq, " TMCCT\t\t: %08x\n", apic_read(APIC_TMCCT));
371 seq_printf(seq, " TDCR\t\t: %08x\n", apic_read(APIC_TDCR));
372 if (boot_cpu_has(X86_FEATURE_EXTAPIC)) {
373 unsigned int i, v, maxeilvt;
374
375 v = apic_read(APIC_EFEAT);
376 maxeilvt = (v >> 16) & 0xff;
377 seq_printf(seq, " EFEAT\t\t: %08x\n", v);
378 seq_printf(seq, " ECTRL\t\t: %08x\n", apic_read(APIC_ECTRL));
379
380 for (i = 0; i < maxeilvt; i++) {
381 v = apic_read(APIC_EILVTn(i));
382 seq_printf(seq, " EILVT%d\t\t: %08x\n", i, v);
383 }
384 }
385#endif /* CONFIG_X86_LOCAL_APIC */
386 seq_printf(seq, "\n MSR\t:\n");
387}
388
389static int cpu_seq_show(struct seq_file *seq, void *v)
390{
391 struct cpu_private *priv = seq->private;
392
393 if (priv == NULL)
394 return -EINVAL;
395
396 switch (cpu_base[priv->type].flag) {
397 case CPU_TSS:
398 smp_call_function_single(priv->cpu, print_tss, seq, 1);
399 break;
400 case CPU_CR:
401 smp_call_function_single(priv->cpu, print_cr, seq, 1);
402 break;
403 case CPU_DT:
404 smp_call_function_single(priv->cpu, print_dt, seq, 1);
405 break;
406 case CPU_DEBUG:
407 if (priv->file == CPU_INDEX_BIT)
408 smp_call_function_single(priv->cpu, print_dr, seq, 1);
409 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
410 break;
411 case CPU_APIC:
412 if (priv->file == CPU_INDEX_BIT)
413 smp_call_function_single(priv->cpu, print_apic, seq, 1);
414 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
415 break;
416
417 default:
418 print_msr(seq, priv->cpu, cpu_base[priv->type].flag);
419 break;
420 }
421 seq_printf(seq, "\n");
422
423 return 0;
424}
425
426static void *cpu_seq_start(struct seq_file *seq, loff_t *pos)
427{
428 if (*pos == 0) /* One time is enough ;-) */
429 return seq;
430
431 return NULL;
432}
433
434static void *cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
435{
436 (*pos)++;
437
438 return cpu_seq_start(seq, pos);
439}
440
441static void cpu_seq_stop(struct seq_file *seq, void *v)
442{
443}
444
445static const struct seq_operations cpu_seq_ops = {
446 .start = cpu_seq_start,
447 .next = cpu_seq_next,
448 .stop = cpu_seq_stop,
449 .show = cpu_seq_show,
450};
451
452static int cpu_seq_open(struct inode *inode, struct file *file)
453{
454 struct cpu_private *priv = inode->i_private;
455 struct seq_file *seq;
456 int err;
457
458 err = seq_open(file, &cpu_seq_ops);
459 if (!err) {
460 seq = file->private_data;
461 seq->private = priv;
462 }
463
464 return err;
465}
466
467static int write_msr(struct cpu_private *priv, u64 val)
468{
469 u32 low, high;
470
471 high = (val >> 32) & 0xffffffff;
472 low = val & 0xffffffff;
473
474 if (!wrmsr_safe_on_cpu(priv->cpu, priv->reg, low, high))
475 return 0;
476
477 return -EPERM;
478}
479
480static int write_cpu_register(struct cpu_private *priv, const char *buf)
481{
482 int ret = -EPERM;
483 u64 val;
484
485 ret = strict_strtoull(buf, 0, &val);
486 if (ret < 0)
487 return ret;
488
489 /* Supporting only MSRs */
490 if (priv->type < CPU_TSS_BIT)
491 return write_msr(priv, val);
492
493 return ret;
494}
495
496static ssize_t cpu_write(struct file *file, const char __user *ubuf,
497 size_t count, loff_t *off)
498{
499 struct seq_file *seq = file->private_data;
500 struct cpu_private *priv = seq->private;
501 char buf[19];
502
503 if ((priv == NULL) || (count >= sizeof(buf)))
504 return -EINVAL;
505
506 if (copy_from_user(&buf, ubuf, count))
507 return -EFAULT;
508
509 buf[count] = 0;
510
511 if ((cpu_base[priv->type].write) && (cpu_file[priv->file].write))
512 if (!write_cpu_register(priv, buf))
513 return count;
514
515 return -EACCES;
516}
517
518static const struct file_operations cpu_fops = {
519 .owner = THIS_MODULE,
520 .open = cpu_seq_open,
521 .read = seq_read,
522 .write = cpu_write,
523 .llseek = seq_lseek,
524 .release = seq_release,
525};
526
527static int cpu_create_file(unsigned cpu, unsigned type, unsigned reg,
528 unsigned file, struct dentry *dentry)
529{
530 struct cpu_private *priv = NULL;
531
 532	/* Already initialized */
533 if (file == CPU_INDEX_BIT)
534 if (per_cpu(cpu_arr[type].init, cpu))
535 return 0;
536
537 priv = kzalloc(sizeof(*priv), GFP_KERNEL);
538 if (priv == NULL)
539 return -ENOMEM;
540
541 priv->cpu = cpu;
542 priv->type = type;
543 priv->reg = reg;
544 priv->file = file;
545 mutex_lock(&cpu_debug_lock);
546 per_cpu(priv_arr[type], cpu) = priv;
547 per_cpu(cpu_priv_count, cpu)++;
548 mutex_unlock(&cpu_debug_lock);
549
550 if (file)
551 debugfs_create_file(cpu_file[file].name, S_IRUGO,
552 dentry, (void *)priv, &cpu_fops);
553 else {
554 debugfs_create_file(cpu_base[type].name, S_IRUGO,
555 per_cpu(cpu_arr[type].dentry, cpu),
556 (void *)priv, &cpu_fops);
557 mutex_lock(&cpu_debug_lock);
558 per_cpu(cpu_arr[type].init, cpu) = 1;
559 mutex_unlock(&cpu_debug_lock);
560 }
561
562 return 0;
563}
564
565static int cpu_init_regfiles(unsigned cpu, unsigned int type, unsigned reg,
566 struct dentry *dentry)
567{
568 unsigned file;
569 int err = 0;
570
571 for (file = 0; file < ARRAY_SIZE(cpu_file); file++) {
572 err = cpu_create_file(cpu, type, reg, file, dentry);
573 if (err)
574 return err;
575 }
576
577 return err;
578}
579
580static int cpu_init_msr(unsigned cpu, unsigned type, struct dentry *dentry)
581{
582 struct dentry *cpu_dentry = NULL;
583 unsigned reg, reg_min, reg_max;
584 int i, err = 0;
585 char reg_dir[12];
586 u32 low, high;
587
588 for (i = 0; i < ARRAY_SIZE(cpu_reg_range); i++) {
589 if (!get_cpu_range(cpu, &reg_min, &reg_max, i,
590 cpu_base[type].flag))
591 continue;
592
593 for (reg = reg_min; reg <= reg_max; reg++) {
594 if (rdmsr_safe_on_cpu(cpu, reg, &low, &high))
595 continue;
596
597 sprintf(reg_dir, "0x%x", reg);
598 cpu_dentry = debugfs_create_dir(reg_dir, dentry);
599 err = cpu_init_regfiles(cpu, type, reg, cpu_dentry);
600 if (err)
601 return err;
602 }
603 }
604
605 return err;
606}
607
608static int cpu_init_allreg(unsigned cpu, struct dentry *dentry)
609{
610 struct dentry *cpu_dentry = NULL;
611 unsigned type;
612 int err = 0;
613
614 for (type = 0; type < ARRAY_SIZE(cpu_base) - 1; type++) {
615 if (!is_typeflag_valid(cpu, cpu_base[type].flag))
616 continue;
617 cpu_dentry = debugfs_create_dir(cpu_base[type].name, dentry);
618 per_cpu(cpu_arr[type].dentry, cpu) = cpu_dentry;
619
620 if (type < CPU_TSS_BIT)
621 err = cpu_init_msr(cpu, type, cpu_dentry);
622 else
623 err = cpu_create_file(cpu, type, 0, CPU_INDEX_BIT,
624 cpu_dentry);
625 if (err)
626 return err;
627 }
628
629 return err;
630}
631
632static int cpu_init_cpu(void)
633{
634 struct dentry *cpu_dentry = NULL;
635 struct cpuinfo_x86 *cpui;
636 char cpu_dir[12];
637 unsigned cpu;
638 int err = 0;
639
640 for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
641 cpui = &cpu_data(cpu);
642 if (!cpu_has(cpui, X86_FEATURE_MSR))
643 continue;
644
645 sprintf(cpu_dir, "cpu%d", cpu);
646 cpu_dentry = debugfs_create_dir(cpu_dir, cpu_debugfs_dir);
647 err = cpu_init_allreg(cpu, cpu_dentry);
648
649 pr_info("cpu%d(%d) debug files %d\n",
650 cpu, nr_cpu_ids, per_cpu(cpu_priv_count, cpu));
651 if (per_cpu(cpu_priv_count, cpu) > MAX_CPU_FILES) {
652 pr_err("Register files count %d exceeds limit %d\n",
653 per_cpu(cpu_priv_count, cpu), MAX_CPU_FILES);
654 per_cpu(cpu_priv_count, cpu) = MAX_CPU_FILES;
655 err = -ENFILE;
656 }
657 if (err)
658 return err;
659 }
660
661 return err;
662}
663
664static int __init cpu_debug_init(void)
665{
666 cpu_debugfs_dir = debugfs_create_dir("cpu", arch_debugfs_dir);
667
668 return cpu_init_cpu();
669}
670
671static void __exit cpu_debug_exit(void)
672{
673 int i, cpu;
674
675 if (cpu_debugfs_dir)
676 debugfs_remove_recursive(cpu_debugfs_dir);
677
678 for (cpu = 0; cpu < nr_cpu_ids; cpu++)
679 for (i = 0; i < per_cpu(cpu_priv_count, cpu); i++)
680 kfree(per_cpu(priv_arr[i], cpu));
681}
682
683module_init(cpu_debug_init);
684module_exit(cpu_debug_exit);
685
686MODULE_AUTHOR("Jaswinder Singh Rajput");
687MODULE_DESCRIPTION("CPU Debug module");
688MODULE_LICENSE("GPL");
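The deleted cpu_debug interface exposed per-CPU MSR ranges through debugfs. The same registers stay reachable from user space via the generic msr character device; a minimal read sketch, assuming the msr driver is available and /dev/cpu/0/msr exists:

/* Read one MSR on CPU 0 via the msr driver. The file offset selects
 * the MSR number; 0x1B (IA32_APIC_BASE) also appears in the removed
 * cpu_reg_range table above.
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
        uint64_t val;
        int fd = open("/dev/cpu/0/msr", O_RDONLY);

        if (fd < 0) {
                perror("open /dev/cpu/0/msr");
                return 1;
        }
        if (pread(fd, &val, sizeof(val), 0x1B) != sizeof(val)) {
                perror("pread");
                close(fd);
                return 1;
        }
        printf("MSR 0x1B = 0x%016llx\n", (unsigned long long)val);
        close(fd);
        return 0;
}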
diff --git a/arch/x86/kernel/cpu/cpufreq/Kconfig b/arch/x86/kernel/cpu/cpufreq/Kconfig
index f138c6c389b9..870e6cc6ad28 100644
--- a/arch/x86/kernel/cpu/cpufreq/Kconfig
+++ b/arch/x86/kernel/cpu/cpufreq/Kconfig
@@ -10,6 +10,20 @@ if CPU_FREQ
10 10
11comment "CPUFreq processor drivers" 11comment "CPUFreq processor drivers"
12 12
13config X86_PCC_CPUFREQ
14 tristate "Processor Clocking Control interface driver"
15 depends on ACPI && ACPI_PROCESSOR
16 help
17 This driver adds support for the PCC interface.
18
19 For details, take a look at:
20 <file:Documentation/cpu-freq/pcc-cpufreq.txt>.
21
22 To compile this driver as a module, choose M here: the
23 module will be called pcc-cpufreq.
24
25 If in doubt, say N.
26
13config X86_ACPI_CPUFREQ 27config X86_ACPI_CPUFREQ
14 tristate "ACPI Processor P-States driver" 28 tristate "ACPI Processor P-States driver"
15 select CPU_FREQ_TABLE 29 select CPU_FREQ_TABLE
diff --git a/arch/x86/kernel/cpu/cpufreq/Makefile b/arch/x86/kernel/cpu/cpufreq/Makefile
index 509296df294d..1840c0a5170b 100644
--- a/arch/x86/kernel/cpu/cpufreq/Makefile
+++ b/arch/x86/kernel/cpu/cpufreq/Makefile
@@ -4,6 +4,7 @@
4 4
5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o 5obj-$(CONFIG_X86_POWERNOW_K8) += powernow-k8.o
6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o 6obj-$(CONFIG_X86_ACPI_CPUFREQ) += acpi-cpufreq.o
7obj-$(CONFIG_X86_PCC_CPUFREQ) += pcc-cpufreq.o
7obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o 8obj-$(CONFIG_X86_POWERNOW_K6) += powernow-k6.o
8obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o 9obj-$(CONFIG_X86_POWERNOW_K7) += powernow-k7.o
9obj-$(CONFIG_X86_LONGHAUL) += longhaul.o 10obj-$(CONFIG_X86_LONGHAUL) += longhaul.o
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 8b581d3905cb..459168083b77 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,6 +33,7 @@
33#include <linux/cpufreq.h> 33#include <linux/cpufreq.h>
34#include <linux/compiler.h> 34#include <linux/compiler.h>
35#include <linux/dmi.h> 35#include <linux/dmi.h>
36#include <linux/slab.h>
36#include <trace/events/power.h> 37#include <trace/events/power.h>
37 38
38#include <linux/acpi.h> 39#include <linux/acpi.h>
@@ -68,9 +69,9 @@ struct acpi_cpufreq_data {
68 unsigned int cpu_feature; 69 unsigned int cpu_feature;
69}; 70};
70 71
71static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data); 72static DEFINE_PER_CPU(struct acpi_cpufreq_data *, acfreq_data);
72 73
73static DEFINE_PER_CPU(struct aperfmperf, old_perf); 74static DEFINE_PER_CPU(struct aperfmperf, acfreq_old_perf);
74 75
75/* acpi_perf_data is a pointer to percpu data. */ 76/* acpi_perf_data is a pointer to percpu data. */
76static struct acpi_processor_performance *acpi_perf_data; 77static struct acpi_processor_performance *acpi_perf_data;
@@ -190,9 +191,11 @@ static void do_drv_write(void *_cmd)
190 191
191static void drv_read(struct drv_cmd *cmd) 192static void drv_read(struct drv_cmd *cmd)
192{ 193{
194 int err;
193 cmd->val = 0; 195 cmd->val = 0;
194 196
195 smp_call_function_single(cpumask_any(cmd->mask), do_drv_read, cmd, 1); 197 err = smp_call_function_any(cmd->mask, do_drv_read, cmd, 1);
198 WARN_ON_ONCE(err); /* smp_call_function_any() was buggy? */
196} 199}
197 200
198static void drv_write(struct drv_cmd *cmd) 201static void drv_write(struct drv_cmd *cmd)
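The drv_read() hunk above replaces smp_call_function_single(cpumask_any(cmd->mask), ...) with smp_call_function_any(), which can run the callback directly on the local CPU when that CPU is in the mask instead of always IPI-ing an arbitrary one. A rough sketch of the preference it adds (illustrative only; the real helper lives in kernel/smp.c and also weighs NUMA locality):

/* Illustrative preference order only, not the actual implementation. */
static int call_on_any(const struct cpumask *mask,
                       void (*func)(void *), void *info, int wait)
{
        int cpu = get_cpu();                    /* pin the current CPU */
        int ret;

        if (!cpumask_test_cpu(cpu, mask))       /* fall back to any CPU in mask */
                cpu = cpumask_any(mask);
        ret = smp_call_function_single(cpu, func, info, wait);
        put_cpu();
        return ret;
}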
@@ -214,14 +217,14 @@ static u32 get_cur_val(const struct cpumask *mask)
214 if (unlikely(cpumask_empty(mask))) 217 if (unlikely(cpumask_empty(mask)))
215 return 0; 218 return 0;
216 219
217 switch (per_cpu(drv_data, cpumask_first(mask))->cpu_feature) { 220 switch (per_cpu(acfreq_data, cpumask_first(mask))->cpu_feature) {
218 case SYSTEM_INTEL_MSR_CAPABLE: 221 case SYSTEM_INTEL_MSR_CAPABLE:
219 cmd.type = SYSTEM_INTEL_MSR_CAPABLE; 222 cmd.type = SYSTEM_INTEL_MSR_CAPABLE;
220 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS; 223 cmd.addr.msr.reg = MSR_IA32_PERF_STATUS;
221 break; 224 break;
222 case SYSTEM_IO_CAPABLE: 225 case SYSTEM_IO_CAPABLE:
223 cmd.type = SYSTEM_IO_CAPABLE; 226 cmd.type = SYSTEM_IO_CAPABLE;
224 perf = per_cpu(drv_data, cpumask_first(mask))->acpi_data; 227 perf = per_cpu(acfreq_data, cpumask_first(mask))->acpi_data;
225 cmd.addr.io.port = perf->control_register.address; 228 cmd.addr.io.port = perf->control_register.address;
226 cmd.addr.io.bit_width = perf->control_register.bit_width; 229 cmd.addr.io.bit_width = perf->control_register.bit_width;
227 break; 230 break;
@@ -268,8 +271,8 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
268 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1)) 271 if (smp_call_function_single(cpu, read_measured_perf_ctrs, &perf, 1))
269 return 0; 272 return 0;
270 273
271 ratio = calc_aperfmperf_ratio(&per_cpu(old_perf, cpu), &perf); 274 ratio = calc_aperfmperf_ratio(&per_cpu(acfreq_old_perf, cpu), &perf);
272 per_cpu(old_perf, cpu) = perf; 275 per_cpu(acfreq_old_perf, cpu) = perf;
273 276
274 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT; 277 retval = (policy->cpuinfo.max_freq * ratio) >> APERFMPERF_SHIFT;
275 278
@@ -278,7 +281,7 @@ static unsigned int get_measured_perf(struct cpufreq_policy *policy,
278 281
279static unsigned int get_cur_freq_on_cpu(unsigned int cpu) 282static unsigned int get_cur_freq_on_cpu(unsigned int cpu)
280{ 283{
281 struct acpi_cpufreq_data *data = per_cpu(drv_data, cpu); 284 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, cpu);
282 unsigned int freq; 285 unsigned int freq;
283 unsigned int cached_freq; 286 unsigned int cached_freq;
284 287
@@ -322,7 +325,7 @@ static unsigned int check_freqs(const struct cpumask *mask, unsigned int freq,
322static int acpi_cpufreq_target(struct cpufreq_policy *policy, 325static int acpi_cpufreq_target(struct cpufreq_policy *policy,
323 unsigned int target_freq, unsigned int relation) 326 unsigned int target_freq, unsigned int relation)
324{ 327{
325 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 328 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
326 struct acpi_processor_performance *perf; 329 struct acpi_processor_performance *perf;
327 struct cpufreq_freqs freqs; 330 struct cpufreq_freqs freqs;
328 struct drv_cmd cmd; 331 struct drv_cmd cmd;
@@ -416,7 +419,7 @@ out:
416 419
417static int acpi_cpufreq_verify(struct cpufreq_policy *policy) 420static int acpi_cpufreq_verify(struct cpufreq_policy *policy)
418{ 421{
419 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 422 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
420 423
421 dprintk("acpi_cpufreq_verify\n"); 424 dprintk("acpi_cpufreq_verify\n");
422 425
@@ -574,7 +577,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
574 return -ENOMEM; 577 return -ENOMEM;
575 578
576 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu); 579 data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
577 per_cpu(drv_data, cpu) = data; 580 per_cpu(acfreq_data, cpu) = data;
578 581
579 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC)) 582 if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
580 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS; 583 acpi_cpufreq_driver.flags |= CPUFREQ_CONST_LOOPS;
@@ -725,20 +728,20 @@ err_unreg:
725 acpi_processor_unregister_performance(perf, cpu); 728 acpi_processor_unregister_performance(perf, cpu);
726err_free: 729err_free:
727 kfree(data); 730 kfree(data);
728 per_cpu(drv_data, cpu) = NULL; 731 per_cpu(acfreq_data, cpu) = NULL;
729 732
730 return result; 733 return result;
731} 734}
732 735
733static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy) 736static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
734{ 737{
735 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 738 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
736 739
737 dprintk("acpi_cpufreq_cpu_exit\n"); 740 dprintk("acpi_cpufreq_cpu_exit\n");
738 741
739 if (data) { 742 if (data) {
740 cpufreq_frequency_table_put_attr(policy->cpu); 743 cpufreq_frequency_table_put_attr(policy->cpu);
741 per_cpu(drv_data, policy->cpu) = NULL; 744 per_cpu(acfreq_data, policy->cpu) = NULL;
742 acpi_processor_unregister_performance(data->acpi_data, 745 acpi_processor_unregister_performance(data->acpi_data,
743 policy->cpu); 746 policy->cpu);
744 kfree(data); 747 kfree(data);
@@ -749,7 +752,7 @@ static int acpi_cpufreq_cpu_exit(struct cpufreq_policy *policy)
749 752
750static int acpi_cpufreq_resume(struct cpufreq_policy *policy) 753static int acpi_cpufreq_resume(struct cpufreq_policy *policy)
751{ 754{
752 struct acpi_cpufreq_data *data = per_cpu(drv_data, policy->cpu); 755 struct acpi_cpufreq_data *data = per_cpu(acfreq_data, policy->cpu);
753 756
754 dprintk("acpi_cpufreq_resume\n"); 757 dprintk("acpi_cpufreq_resume\n");
755 758
@@ -764,14 +767,15 @@ static struct freq_attr *acpi_cpufreq_attr[] = {
764}; 767};
765 768
766static struct cpufreq_driver acpi_cpufreq_driver = { 769static struct cpufreq_driver acpi_cpufreq_driver = {
767 .verify = acpi_cpufreq_verify, 770 .verify = acpi_cpufreq_verify,
768 .target = acpi_cpufreq_target, 771 .target = acpi_cpufreq_target,
769 .init = acpi_cpufreq_cpu_init, 772 .bios_limit = acpi_processor_get_bios_limit,
770 .exit = acpi_cpufreq_cpu_exit, 773 .init = acpi_cpufreq_cpu_init,
771 .resume = acpi_cpufreq_resume, 774 .exit = acpi_cpufreq_cpu_exit,
772 .name = "acpi-cpufreq", 775 .resume = acpi_cpufreq_resume,
773 .owner = THIS_MODULE, 776 .name = "acpi-cpufreq",
774 .attr = acpi_cpufreq_attr, 777 .owner = THIS_MODULE,
778 .attr = acpi_cpufreq_attr,
775}; 779};
776 780
777static int __init acpi_cpufreq_init(void) 781static int __init acpi_cpufreq_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/elanfreq.c b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
index 006b278b0d5d..c587db472a75 100644
--- a/arch/x86/kernel/cpu/cpufreq/elanfreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/elanfreq.c
@@ -20,7 +20,6 @@
20#include <linux/module.h> 20#include <linux/module.h>
21#include <linux/init.h> 21#include <linux/init.h>
22 22
23#include <linux/slab.h>
24#include <linux/delay.h> 23#include <linux/delay.h>
25#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
26 25
diff --git a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
index ac27ec2264d5..16e3483be9e3 100644
--- a/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/gx-suspmod.c
@@ -80,6 +80,7 @@
80#include <linux/cpufreq.h> 80#include <linux/cpufreq.h>
81#include <linux/pci.h> 81#include <linux/pci.h>
82#include <linux/errno.h> 82#include <linux/errno.h>
83#include <linux/slab.h>
83 84
84#include <asm/processor-cyrix.h> 85#include <asm/processor-cyrix.h>
85 86
diff --git a/arch/x86/kernel/cpu/cpufreq/longhaul.c b/arch/x86/kernel/cpu/cpufreq/longhaul.c
index cabd2fa3fc93..7e7eea4f8261 100644
--- a/arch/x86/kernel/cpu/cpufreq/longhaul.c
+++ b/arch/x86/kernel/cpu/cpufreq/longhaul.c
@@ -885,7 +885,7 @@ static int __init longhaul_cpu_init(struct cpufreq_policy *policy)
885 885
886 /* Find ACPI data for processor */ 886 /* Find ACPI data for processor */
887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT, 887 acpi_walk_namespace(ACPI_TYPE_PROCESSOR, ACPI_ROOT_OBJECT,
888 ACPI_UINT32_MAX, &longhaul_walk_callback, 888 ACPI_UINT32_MAX, &longhaul_walk_callback, NULL,
889 NULL, (void *)&pr); 889 NULL, (void *)&pr);
890 890
891 /* Check ACPI support for C3 state */ 891 /* Check ACPI support for C3 state */
diff --git a/arch/x86/kernel/cpu/cpufreq/longrun.c b/arch/x86/kernel/cpu/cpufreq/longrun.c
index da5f70fcb766..e7b559d74c52 100644
--- a/arch/x86/kernel/cpu/cpufreq/longrun.c
+++ b/arch/x86/kernel/cpu/cpufreq/longrun.c
@@ -9,7 +9,6 @@
9#include <linux/kernel.h> 9#include <linux/kernel.h>
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/init.h> 11#include <linux/init.h>
12#include <linux/slab.h>
13#include <linux/cpufreq.h> 12#include <linux/cpufreq.h>
14#include <linux/timex.h> 13#include <linux/timex.h>
15 14
diff --git a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
index 869615193720..7b8a8ba67b07 100644
--- a/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
+++ b/arch/x86/kernel/cpu/cpufreq/p4-clockmod.c
@@ -25,7 +25,6 @@
25#include <linux/init.h> 25#include <linux/init.h>
26#include <linux/smp.h> 26#include <linux/smp.h>
27#include <linux/cpufreq.h> 27#include <linux/cpufreq.h>
28#include <linux/slab.h>
29#include <linux/cpumask.h> 28#include <linux/cpumask.h>
30#include <linux/timex.h> 29#include <linux/timex.h>
31 30
diff --git a/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
new file mode 100644
index 000000000000..ce7cde713e71
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpufreq/pcc-cpufreq.c
@@ -0,0 +1,621 @@
1/*
2 * pcc-cpufreq.c - Processor Clocking Control firmware cpufreq interface
3 *
4 * Copyright (C) 2009 Red Hat, Matthew Garrett <mjg@redhat.com>
5 * Copyright (C) 2009 Hewlett-Packard Development Company, L.P.
6 * Nagananda Chumbalkar <nagananda.chumbalkar@hp.com>
7 *
8 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
9 *
10 * This program is free software; you can redistribute it and/or modify
11 * it under the terms of the GNU General Public License as published by
12 * the Free Software Foundation; version 2 of the License.
13 *
14 * This program is distributed in the hope that it will be useful, but
15 * WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or NON
17 * INFRINGEMENT. See the GNU General Public License for more details.
18 *
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 675 Mass Ave, Cambridge, MA 02139, USA.
22 *
23 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
24 */
25
26#include <linux/kernel.h>
27#include <linux/module.h>
28#include <linux/init.h>
29#include <linux/smp.h>
30#include <linux/sched.h>
31#include <linux/cpufreq.h>
32#include <linux/compiler.h>
33#include <linux/slab.h>
34
35#include <linux/acpi.h>
36#include <linux/io.h>
37#include <linux/spinlock.h>
38#include <linux/uaccess.h>
39
40#include <acpi/processor.h>
41
42#define PCC_VERSION "1.00.00"
43#define POLL_LOOPS 300
44
45#define CMD_COMPLETE 0x1
46#define CMD_GET_FREQ 0x0
47#define CMD_SET_FREQ 0x1
48
49#define BUF_SZ 4
50
51#define dprintk(msg...) cpufreq_debug_printk(CPUFREQ_DEBUG_DRIVER, \
52 "pcc-cpufreq", msg)
53
54struct pcc_register_resource {
55 u8 descriptor;
56 u16 length;
57 u8 space_id;
58 u8 bit_width;
59 u8 bit_offset;
60 u8 access_size;
61 u64 address;
62} __attribute__ ((packed));
63
64struct pcc_memory_resource {
65 u8 descriptor;
66 u16 length;
67 u8 space_id;
68 u8 resource_usage;
69 u8 type_specific;
70 u64 granularity;
71 u64 minimum;
72 u64 maximum;
73 u64 translation_offset;
74 u64 address_length;
75} __attribute__ ((packed));
76
77static struct cpufreq_driver pcc_cpufreq_driver;
78
79struct pcc_header {
80 u32 signature;
81 u16 length;
82 u8 major;
83 u8 minor;
84 u32 features;
85 u16 command;
86 u16 status;
87 u32 latency;
88 u32 minimum_time;
89 u32 maximum_time;
90 u32 nominal;
91 u32 throttled_frequency;
92 u32 minimum_frequency;
93};
94
95static void __iomem *pcch_virt_addr;
96static struct pcc_header __iomem *pcch_hdr;
97
98static DEFINE_SPINLOCK(pcc_lock);
99
100static struct acpi_generic_address doorbell;
101
102static u64 doorbell_preserve;
103static u64 doorbell_write;
104
105static u8 OSC_UUID[16] = {0x63, 0x9B, 0x2C, 0x9F, 0x70, 0x91, 0x49, 0x1f,
106 0xBB, 0x4F, 0xA5, 0x98, 0x2F, 0xA1, 0xB5, 0x46};
107
108struct pcc_cpu {
109 u32 input_offset;
110 u32 output_offset;
111};
112
113static struct pcc_cpu *pcc_cpu_info;
114
115static int pcc_cpufreq_verify(struct cpufreq_policy *policy)
116{
117 cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
118 policy->cpuinfo.max_freq);
119 return 0;
120}
121
122static inline void pcc_cmd(void)
123{
124 u64 doorbell_value;
125 int i;
126
127 acpi_read(&doorbell_value, &doorbell);
128 acpi_write((doorbell_value & doorbell_preserve) | doorbell_write,
129 &doorbell);
130
131 for (i = 0; i < POLL_LOOPS; i++) {
132 if (ioread16(&pcch_hdr->status) & CMD_COMPLETE)
133 break;
134 }
135}
136
137static inline void pcc_clear_mapping(void)
138{
139 if (pcch_virt_addr)
140 iounmap(pcch_virt_addr);
141 pcch_virt_addr = NULL;
142}
143
144static unsigned int pcc_get_freq(unsigned int cpu)
145{
146 struct pcc_cpu *pcc_cpu_data;
147 unsigned int curr_freq;
148 unsigned int freq_limit;
149 u16 status;
150 u32 input_buffer;
151 u32 output_buffer;
152
153 spin_lock(&pcc_lock);
154
155 dprintk("get: get_freq for CPU %d\n", cpu);
156 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
157
158 input_buffer = 0x1;
159 iowrite32(input_buffer,
160 (pcch_virt_addr + pcc_cpu_data->input_offset));
161 iowrite16(CMD_GET_FREQ, &pcch_hdr->command);
162
163 pcc_cmd();
164
165 output_buffer =
166 ioread32(pcch_virt_addr + pcc_cpu_data->output_offset);
167
168 /* Clear the input buffer - we are done with the current command */
169 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
170
171 status = ioread16(&pcch_hdr->status);
172 if (status != CMD_COMPLETE) {
173 dprintk("get: FAILED: for CPU %d, status is %d\n",
174 cpu, status);
175 goto cmd_incomplete;
176 }
177 iowrite16(0, &pcch_hdr->status);
178 curr_freq = (((ioread32(&pcch_hdr->nominal) * (output_buffer & 0xff))
179 / 100) * 1000);
180
181 dprintk("get: SUCCESS: (virtual) output_offset for cpu %d is "
182 "0x%x, contains a value of: 0x%x. Speed is: %d MHz\n",
183 cpu, (pcch_virt_addr + pcc_cpu_data->output_offset),
184 output_buffer, curr_freq);
185
186 freq_limit = (output_buffer >> 8) & 0xff;
187 if (freq_limit != 0xff) {
188 dprintk("get: frequency for cpu %d is being temporarily"
189 " capped at %d\n", cpu, curr_freq);
190 }
191
192 spin_unlock(&pcc_lock);
193 return curr_freq;
194
195cmd_incomplete:
196 iowrite16(0, &pcch_hdr->status);
197 spin_unlock(&pcc_lock);
198 return -EINVAL;
199}
200
201static int pcc_cpufreq_target(struct cpufreq_policy *policy,
202 unsigned int target_freq,
203 unsigned int relation)
204{
205 struct pcc_cpu *pcc_cpu_data;
206 struct cpufreq_freqs freqs;
207 u16 status;
208 u32 input_buffer;
209 int cpu;
210
211 spin_lock(&pcc_lock);
212 cpu = policy->cpu;
213 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
214
215 dprintk("target: CPU %d should go to target freq: %d "
216 "(virtual) input_offset is 0x%x\n",
217 cpu, target_freq,
218 (pcch_virt_addr + pcc_cpu_data->input_offset));
219
220 freqs.new = target_freq;
221 freqs.cpu = cpu;
222 cpufreq_notify_transition(&freqs, CPUFREQ_PRECHANGE);
223
224 input_buffer = 0x1 | (((target_freq * 100)
225 / (ioread32(&pcch_hdr->nominal) * 1000)) << 8);
226 iowrite32(input_buffer,
227 (pcch_virt_addr + pcc_cpu_data->input_offset));
228 iowrite16(CMD_SET_FREQ, &pcch_hdr->command);
229
230 pcc_cmd();
231
232 /* Clear the input buffer - we are done with the current command */
233 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
234
235 status = ioread16(&pcch_hdr->status);
236 if (status != CMD_COMPLETE) {
237 dprintk("target: FAILED for cpu %d, with status: 0x%x\n",
238 cpu, status);
239 goto cmd_incomplete;
240 }
241 iowrite16(0, &pcch_hdr->status);
242
243 cpufreq_notify_transition(&freqs, CPUFREQ_POSTCHANGE);
244 dprintk("target: was SUCCESSFUL for cpu %d\n", cpu);
245 spin_unlock(&pcc_lock);
246
247 return 0;
248
249cmd_incomplete:
250 iowrite16(0, &pcch_hdr->status);
251 spin_unlock(&pcc_lock);
252 return -EINVAL;
253}
254
255static int pcc_get_offset(int cpu)
256{
257 acpi_status status;
258 struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
259 union acpi_object *pccp, *offset;
260 struct pcc_cpu *pcc_cpu_data;
261 struct acpi_processor *pr;
262 int ret = 0;
263
264 pr = per_cpu(processors, cpu);
265 pcc_cpu_data = per_cpu_ptr(pcc_cpu_info, cpu);
266
267 status = acpi_evaluate_object(pr->handle, "PCCP", NULL, &buffer);
268 if (ACPI_FAILURE(status))
269 return -ENODEV;
270
271 pccp = buffer.pointer;
272 if (!pccp || pccp->type != ACPI_TYPE_PACKAGE) {
273 ret = -ENODEV;
274 goto out_free;
275 };
276
277 offset = &(pccp->package.elements[0]);
278 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
279 ret = -ENODEV;
280 goto out_free;
281 }
282
283 pcc_cpu_data->input_offset = offset->integer.value;
284
285 offset = &(pccp->package.elements[1]);
286 if (!offset || offset->type != ACPI_TYPE_INTEGER) {
287 ret = -ENODEV;
288 goto out_free;
289 }
290
291 pcc_cpu_data->output_offset = offset->integer.value;
292
293 memset_io((pcch_virt_addr + pcc_cpu_data->input_offset), 0, BUF_SZ);
294 memset_io((pcch_virt_addr + pcc_cpu_data->output_offset), 0, BUF_SZ);
295
296 dprintk("pcc_get_offset: for CPU %d: pcc_cpu_data "
297 "input_offset: 0x%x, pcc_cpu_data output_offset: 0x%x\n",
298 cpu, pcc_cpu_data->input_offset, pcc_cpu_data->output_offset);
299out_free:
300 kfree(buffer.pointer);
301 return ret;
302}
303
304static int __init pcc_cpufreq_do_osc(acpi_handle *handle)
305{
306 acpi_status status;
307 struct acpi_object_list input;
308 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
309 union acpi_object in_params[4];
310 union acpi_object *out_obj;
311 u32 capabilities[2];
312 u32 errors;
313 u32 supported;
314 int ret = 0;
315
316 input.count = 4;
317 input.pointer = in_params;
318 input.count = 4;
319 input.pointer = in_params;
320 in_params[0].type = ACPI_TYPE_BUFFER;
321 in_params[0].buffer.length = 16;
322 in_params[0].buffer.pointer = OSC_UUID;
323 in_params[1].type = ACPI_TYPE_INTEGER;
324 in_params[1].integer.value = 1;
325 in_params[2].type = ACPI_TYPE_INTEGER;
326 in_params[2].integer.value = 2;
327 in_params[3].type = ACPI_TYPE_BUFFER;
328 in_params[3].buffer.length = 8;
329 in_params[3].buffer.pointer = (u8 *)&capabilities;
330
331 capabilities[0] = OSC_QUERY_ENABLE;
332 capabilities[1] = 0x1;
333
334 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
335 if (ACPI_FAILURE(status))
336 return -ENODEV;
337
338 if (!output.length)
339 return -ENODEV;
340
341 out_obj = output.pointer;
342 if (out_obj->type != ACPI_TYPE_BUFFER) {
343 ret = -ENODEV;
344 goto out_free;
345 }
346
347 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
348 if (errors) {
349 ret = -ENODEV;
350 goto out_free;
351 }
352
353 supported = *((u32 *)(out_obj->buffer.pointer + 4));
354 if (!(supported & 0x1)) {
355 ret = -ENODEV;
356 goto out_free;
357 }
358
359 kfree(output.pointer);
360 capabilities[0] = 0x0;
361 capabilities[1] = 0x1;
362
363 status = acpi_evaluate_object(*handle, "_OSC", &input, &output);
364 if (ACPI_FAILURE(status))
365 return -ENODEV;
366
367 if (!output.length)
368 return -ENODEV;
369
370 out_obj = output.pointer;
371 if (out_obj->type != ACPI_TYPE_BUFFER) {
372 ret = -ENODEV;
373 goto out_free;
374 }
375
376 errors = *((u32 *)out_obj->buffer.pointer) & ~(1 << 0);
377 if (errors) {
378 ret = -ENODEV;
379 goto out_free;
380 }
381
382 supported = *((u32 *)(out_obj->buffer.pointer + 4));
383 if (!(supported & 0x1)) {
384 ret = -ENODEV;
385 goto out_free;
386 }
387
388out_free:
389 kfree(output.pointer);
390 return ret;
391}
392
393static int __init pcc_cpufreq_probe(void)
394{
395 acpi_status status;
396 struct acpi_buffer output = {ACPI_ALLOCATE_BUFFER, NULL};
397 struct pcc_memory_resource *mem_resource;
398 struct pcc_register_resource *reg_resource;
399 union acpi_object *out_obj, *member;
400 acpi_handle handle, osc_handle;
401 int ret = 0;
402
403 status = acpi_get_handle(NULL, "\\_SB", &handle);
404 if (ACPI_FAILURE(status))
405 return -ENODEV;
406
407 status = acpi_get_handle(handle, "_OSC", &osc_handle);
408 if (ACPI_SUCCESS(status)) {
409 ret = pcc_cpufreq_do_osc(&osc_handle);
410 if (ret)
411 dprintk("probe: _OSC evaluation did not succeed\n");
412 /* Firmware's use of _OSC is optional */
413 ret = 0;
414 }
415
416 status = acpi_evaluate_object(handle, "PCCH", NULL, &output);
417 if (ACPI_FAILURE(status))
418 return -ENODEV;
419
420 out_obj = output.pointer;
421 if (out_obj->type != ACPI_TYPE_PACKAGE) {
422 ret = -ENODEV;
423 goto out_free;
424 }
425
426 member = &out_obj->package.elements[0];
427 if (member->type != ACPI_TYPE_BUFFER) {
428 ret = -ENODEV;
429 goto out_free;
430 }
431
432 mem_resource = (struct pcc_memory_resource *)member->buffer.pointer;
433
434 dprintk("probe: mem_resource descriptor: 0x%x,"
435 " length: %d, space_id: %d, resource_usage: %d,"
436 " type_specific: %d, granularity: 0x%llx,"
437 " minimum: 0x%llx, maximum: 0x%llx,"
438 " translation_offset: 0x%llx, address_length: 0x%llx\n",
439 mem_resource->descriptor, mem_resource->length,
440 mem_resource->space_id, mem_resource->resource_usage,
441 mem_resource->type_specific, mem_resource->granularity,
442 mem_resource->minimum, mem_resource->maximum,
443 mem_resource->translation_offset,
444 mem_resource->address_length);
445
446 if (mem_resource->space_id != ACPI_ADR_SPACE_SYSTEM_MEMORY) {
447 ret = -ENODEV;
448 goto out_free;
449 }
450
451 pcch_virt_addr = ioremap_nocache(mem_resource->minimum,
452 mem_resource->address_length);
453 if (pcch_virt_addr == NULL) {
454 dprintk("probe: could not map shared mem region\n");
455 goto out_free;
456 }
457 pcch_hdr = pcch_virt_addr;
458
459 dprintk("probe: PCCH header (virtual) addr: 0x%p\n", pcch_hdr);
460 dprintk("probe: PCCH header is at physical address: 0x%llx,"
461 " signature: 0x%x, length: %d bytes, major: %d, minor: %d,"
462 " supported features: 0x%x, command field: 0x%x,"
463 " status field: 0x%x, nominal latency: %d us\n",
464 mem_resource->minimum, ioread32(&pcch_hdr->signature),
465 ioread16(&pcch_hdr->length), ioread8(&pcch_hdr->major),
466 ioread8(&pcch_hdr->minor), ioread32(&pcch_hdr->features),
467 ioread16(&pcch_hdr->command), ioread16(&pcch_hdr->status),
468 ioread32(&pcch_hdr->latency));
469
470 dprintk("probe: min time between commands: %d us,"
471 " max time between commands: %d us,"
472 " nominal CPU frequency: %d MHz,"
473 " minimum CPU frequency: %d MHz,"
474 " minimum CPU frequency without throttling: %d MHz\n",
475 ioread32(&pcch_hdr->minimum_time),
476 ioread32(&pcch_hdr->maximum_time),
477 ioread32(&pcch_hdr->nominal),
478 ioread32(&pcch_hdr->throttled_frequency),
479 ioread32(&pcch_hdr->minimum_frequency));
480
481 member = &out_obj->package.elements[1];
482 if (member->type != ACPI_TYPE_BUFFER) {
483 ret = -ENODEV;
484 goto pcch_free;
485 }
486
487 reg_resource = (struct pcc_register_resource *)member->buffer.pointer;
488
489 doorbell.space_id = reg_resource->space_id;
490 doorbell.bit_width = reg_resource->bit_width;
491 doorbell.bit_offset = reg_resource->bit_offset;
492 doorbell.access_width = 64;
493 doorbell.address = reg_resource->address;
494
495 dprintk("probe: doorbell: space_id is %d, bit_width is %d, "
496 "bit_offset is %d, access_width is %d, address is 0x%llx\n",
497 doorbell.space_id, doorbell.bit_width, doorbell.bit_offset,
498 doorbell.access_width, reg_resource->address);
499
500 member = &out_obj->package.elements[2];
501 if (member->type != ACPI_TYPE_INTEGER) {
502 ret = -ENODEV;
503 goto pcch_free;
504 }
505
506 doorbell_preserve = member->integer.value;
507
508 member = &out_obj->package.elements[3];
509 if (member->type != ACPI_TYPE_INTEGER) {
510 ret = -ENODEV;
511 goto pcch_free;
512 }
513
514 doorbell_write = member->integer.value;
515
516 dprintk("probe: doorbell_preserve: 0x%llx,"
517 " doorbell_write: 0x%llx\n",
518 doorbell_preserve, doorbell_write);
519
520 pcc_cpu_info = alloc_percpu(struct pcc_cpu);
521 if (!pcc_cpu_info) {
522 ret = -ENOMEM;
523 goto pcch_free;
524 }
525
526 printk(KERN_DEBUG "pcc-cpufreq: (v%s) driver loaded with frequency"
527 " limits: %d MHz, %d MHz\n", PCC_VERSION,
528 ioread32(&pcch_hdr->minimum_frequency),
529 ioread32(&pcch_hdr->nominal));
530 kfree(output.pointer);
531 return ret;
532pcch_free:
533 pcc_clear_mapping();
534out_free:
535 kfree(output.pointer);
536 return ret;
537}
538
539static int pcc_cpufreq_cpu_init(struct cpufreq_policy *policy)
540{
541 unsigned int cpu = policy->cpu;
542 unsigned int result = 0;
543
544 if (!pcch_virt_addr) {
545 result = -1;
546 goto pcch_null;
547 }
548
549 result = pcc_get_offset(cpu);
550 if (result) {
551 dprintk("init: PCCP evaluation failed\n");
552 goto free;
553 }
554
555 policy->max = policy->cpuinfo.max_freq =
556 ioread32(&pcch_hdr->nominal) * 1000;
557 policy->min = policy->cpuinfo.min_freq =
558 ioread32(&pcch_hdr->minimum_frequency) * 1000;
559 policy->cur = pcc_get_freq(cpu);
560
561 dprintk("init: policy->max is %d, policy->min is %d\n",
562 policy->max, policy->min);
563
564 return 0;
565free:
566 pcc_clear_mapping();
567 free_percpu(pcc_cpu_info);
568pcch_null:
569 return result;
570}
571
572static int pcc_cpufreq_cpu_exit(struct cpufreq_policy *policy)
573{
574 return 0;
575}
576
577static struct cpufreq_driver pcc_cpufreq_driver = {
578 .flags = CPUFREQ_CONST_LOOPS,
579 .get = pcc_get_freq,
580 .verify = pcc_cpufreq_verify,
581 .target = pcc_cpufreq_target,
582 .init = pcc_cpufreq_cpu_init,
583 .exit = pcc_cpufreq_cpu_exit,
584 .name = "pcc-cpufreq",
585 .owner = THIS_MODULE,
586};
587
588static int __init pcc_cpufreq_init(void)
589{
590 int ret;
591
592 if (acpi_disabled)
593 return 0;
594
595 ret = pcc_cpufreq_probe();
596 if (ret) {
597 dprintk("pcc_cpufreq_init: PCCH evaluation failed\n");
598 return ret;
599 }
600
601 ret = cpufreq_register_driver(&pcc_cpufreq_driver);
602
603 return ret;
604}
605
606static void __exit pcc_cpufreq_exit(void)
607{
608 cpufreq_unregister_driver(&pcc_cpufreq_driver);
609
610 pcc_clear_mapping();
611
612 free_percpu(pcc_cpu_info);
613}
614
615MODULE_AUTHOR("Matthew Garrett, Naga Chumbalkar");
616MODULE_VERSION(PCC_VERSION);
617MODULE_DESCRIPTION("Processor Clocking Control interface driver");
618MODULE_LICENSE("GPL");
619
620late_initcall(pcc_cpufreq_init);
621module_exit(pcc_cpufreq_exit);
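
For orientation, the pcc-cpufreq file above follows the usual cpufreq driver shape of this kernel generation: probe the firmware channel once at late_initcall time, then register a struct cpufreq_driver whose callbacks do the per-CPU work. Below is only a minimal sketch of that skeleton; the foo_* names and fixed frequencies are invented, and a real driver would also set the transition latency and post cpufreq transition notifications.

/* Minimal cpufreq driver skeleton of the same shape as pcc-cpufreq.
 * All foo_* names are hypothetical; error handling is trimmed. */
#include <linux/module.h>
#include <linux/cpufreq.h>

static int foo_cpu_init(struct cpufreq_policy *policy)
{
	policy->cpuinfo.min_freq = policy->min = 1000000;	/* kHz */
	policy->cpuinfo.max_freq = policy->max = 2000000;	/* kHz */
	policy->cur = 2000000;
	return 0;
}

static int foo_cpu_exit(struct cpufreq_policy *policy)
{
	return 0;
}

static int foo_verify(struct cpufreq_policy *policy)
{
	cpufreq_verify_within_limits(policy, policy->cpuinfo.min_freq,
				     policy->cpuinfo.max_freq);
	return 0;
}

static int foo_target(struct cpufreq_policy *policy,
		      unsigned int target_freq, unsigned int relation)
{
	/* program the hardware here, then notify the cpufreq core */
	return 0;
}

static struct cpufreq_driver foo_driver = {
	.flags	= CPUFREQ_CONST_LOOPS,
	.verify	= foo_verify,
	.target	= foo_target,
	.init	= foo_cpu_init,
	.exit	= foo_cpu_exit,
	.name	= "foo-cpufreq",
	.owner	= THIS_MODULE,
};

static int __init foo_init(void)
{
	return cpufreq_register_driver(&foo_driver);
}

static void __exit foo_exit(void)
{
	cpufreq_unregister_driver(&foo_driver);
}

late_initcall(foo_init);
module_exit(foo_exit);
MODULE_LICENSE("GPL");
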
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
index f10dea409f40..b3379d6a5c57 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k6.c
@@ -13,7 +13,6 @@
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/cpufreq.h> 14#include <linux/cpufreq.h>
15#include <linux/ioport.h> 15#include <linux/ioport.h>
16#include <linux/slab.h>
17#include <linux/timex.h> 16#include <linux/timex.h>
18#include <linux/io.h> 17#include <linux/io.h>
19 18
@@ -164,7 +163,7 @@ static int powernow_k6_cpu_init(struct cpufreq_policy *policy)
164 } 163 }
165 164
166 /* cpuinfo and default policy values */ 165 /* cpuinfo and default policy values */
167 policy->cpuinfo.transition_latency = CPUFREQ_ETERNAL; 166 policy->cpuinfo.transition_latency = 200000;
168 policy->cur = busfreq * max_multiplier; 167 policy->cur = busfreq * max_multiplier;
169 168
170 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio); 169 result = cpufreq_frequency_table_cpuinfo(policy, clock_ratio);
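
One detail worth calling out in the powernow-k6 hunk: cpuinfo.transition_latency is expressed in nanoseconds, so the new value 200000 means 200 us, whereas CPUFREQ_ETERNAL (-1) marks the latency as unknown and makes dynamic governors such as ondemand reject the policy outright. A fragment, as it would appear inside a driver's ->init() callback; the 200 us figure is simply the number this patch picks:

	/* nanoseconds: 200000 == 200 us. CPUFREQ_ETERNAL means "unknown"
	 * and causes governors like ondemand to refuse the policy. */
	policy->cpuinfo.transition_latency = 200 * 1000;
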
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
index d47c775eb0ab..9a97116f89e5 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k7.c
@@ -714,14 +714,17 @@ static struct freq_attr *powernow_table_attr[] = {
714}; 714};
715 715
716static struct cpufreq_driver powernow_driver = { 716static struct cpufreq_driver powernow_driver = {
717 .verify = powernow_verify, 717 .verify = powernow_verify,
718 .target = powernow_target, 718 .target = powernow_target,
719 .get = powernow_get, 719 .get = powernow_get,
720 .init = powernow_cpu_init, 720#ifdef CONFIG_X86_POWERNOW_K7_ACPI
721 .exit = powernow_cpu_exit, 721 .bios_limit = acpi_processor_get_bios_limit,
722 .name = "powernow-k7", 722#endif
723 .owner = THIS_MODULE, 723 .init = powernow_cpu_init,
724 .attr = powernow_table_attr, 724 .exit = powernow_cpu_exit,
725 .name = "powernow-k7",
726 .owner = THIS_MODULE,
727 .attr = powernow_table_attr,
725}; 728};
726 729
727static int __init powernow_init(void) 730static int __init powernow_init(void)
diff --git a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
index 3f12dabeab52..b6215b9798e2 100644
--- a/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
+++ b/arch/x86/kernel/cpu/cpufreq/powernow-k8.c
@@ -806,7 +806,7 @@ static int find_psb_table(struct powernow_k8_data *data)
806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data, 806static void powernow_k8_acpi_pst_values(struct powernow_k8_data *data,
807 unsigned int index) 807 unsigned int index)
808{ 808{
809 acpi_integer control; 809 u64 control;
810 810
811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE)) 811 if (!data->acpi_data.state_count || (cpu_family == CPU_HW_PSTATE))
812 return; 812 return;
@@ -824,7 +824,7 @@ static int powernow_k8_cpu_init_acpi(struct powernow_k8_data *data)
824{ 824{
825 struct cpufreq_frequency_table *powernow_table; 825 struct cpufreq_frequency_table *powernow_table;
826 int ret_val = -ENODEV; 826 int ret_val = -ENODEV;
827 acpi_integer control, status; 827 u64 control, status;
828 828
829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) { 829 if (acpi_processor_register_performance(&data->acpi_data, data->cpu)) {
830 dprintk("register performance failed: bad ACPI data\n"); 830 dprintk("register performance failed: bad ACPI data\n");
@@ -929,7 +929,8 @@ static int fill_powernow_table_pstate(struct powernow_k8_data *data,
929 powernow_table[i].index = index; 929 powernow_table[i].index = index;
930 930
931 /* Frequency may be rounded for these */ 931 /* Frequency may be rounded for these */
932 if (boot_cpu_data.x86 == 0x10 || boot_cpu_data.x86 == 0x11) { 932 if ((boot_cpu_data.x86 == 0x10 && boot_cpu_data.x86_model < 10)
933 || boot_cpu_data.x86 == 0x11) {
933 powernow_table[i].frequency = 934 powernow_table[i].frequency =
934 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7); 935 freq_from_fid_did(lo & 0x3f, (lo >> 6) & 7);
935 } else 936 } else
@@ -948,7 +949,7 @@ static int fill_powernow_table_fidvid(struct powernow_k8_data *data,
948 u32 fid; 949 u32 fid;
949 u32 vid; 950 u32 vid;
950 u32 freq, index; 951 u32 freq, index;
951 acpi_integer status, control; 952 u64 status, control;
952 953
953 if (data->exttype) { 954 if (data->exttype) {
954 status = data->acpi_data.states[i].status; 955 status = data->acpi_data.states[i].status;
@@ -1118,7 +1119,7 @@ static int transition_frequency_pstate(struct powernow_k8_data *data,
1118static int powernowk8_target(struct cpufreq_policy *pol, 1119static int powernowk8_target(struct cpufreq_policy *pol,
1119 unsigned targfreq, unsigned relation) 1120 unsigned targfreq, unsigned relation)
1120{ 1121{
1121 cpumask_t oldmask; 1122 cpumask_var_t oldmask;
1122 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu); 1123 struct powernow_k8_data *data = per_cpu(powernow_data, pol->cpu);
1123 u32 checkfid; 1124 u32 checkfid;
1124 u32 checkvid; 1125 u32 checkvid;
@@ -1131,9 +1132,13 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1131 checkfid = data->currfid; 1132 checkfid = data->currfid;
1132 checkvid = data->currvid; 1133 checkvid = data->currvid;
1133 1134
1134 /* only run on specific CPU from here on */ 1135 /* only run on specific CPU from here on. */
1135 oldmask = current->cpus_allowed; 1136 /* This is poor form: use a workqueue or smp_call_function_single */
1136 set_cpus_allowed_ptr(current, &cpumask_of_cpu(pol->cpu)); 1137 if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
1138 return -ENOMEM;
1139
1140 cpumask_copy(oldmask, tsk_cpus_allowed(current));
1141 set_cpus_allowed_ptr(current, cpumask_of(pol->cpu));
1137 1142
1138 if (smp_processor_id() != pol->cpu) { 1143 if (smp_processor_id() != pol->cpu) {
1139 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu); 1144 printk(KERN_ERR PFX "limiting to cpu %u failed\n", pol->cpu);
@@ -1193,7 +1198,8 @@ static int powernowk8_target(struct cpufreq_policy *pol,
1193 ret = 0; 1198 ret = 0;
1194 1199
1195err_out: 1200err_out:
1196 set_cpus_allowed_ptr(current, &oldmask); 1201 set_cpus_allowed_ptr(current, oldmask);
1202 free_cpumask_var(oldmask);
1197 return ret; 1203 return ret;
1198} 1204}
1199 1205
@@ -1351,6 +1357,7 @@ static int __devexit powernowk8_cpu_exit(struct cpufreq_policy *pol)
1351 1357
1352 kfree(data->powernow_table); 1358 kfree(data->powernow_table);
1353 kfree(data); 1359 kfree(data);
1360 per_cpu(powernow_data, pol->cpu) = NULL;
1354 1361
1355 return 0; 1362 return 0;
1356} 1363}
@@ -1370,7 +1377,7 @@ static unsigned int powernowk8_get(unsigned int cpu)
1370 int err; 1377 int err;
1371 1378
1372 if (!data) 1379 if (!data)
1373 return -EINVAL; 1380 return 0;
1374 1381
1375 smp_call_function_single(cpu, query_values_on_cpu, &err, true); 1382 smp_call_function_single(cpu, query_values_on_cpu, &err, true);
1376 if (err) 1383 if (err)
@@ -1393,14 +1400,15 @@ static struct freq_attr *powernow_k8_attr[] = {
1393}; 1400};
1394 1401
1395static struct cpufreq_driver cpufreq_amd64_driver = { 1402static struct cpufreq_driver cpufreq_amd64_driver = {
1396 .verify = powernowk8_verify, 1403 .verify = powernowk8_verify,
1397 .target = powernowk8_target, 1404 .target = powernowk8_target,
1398 .init = powernowk8_cpu_init, 1405 .bios_limit = acpi_processor_get_bios_limit,
1399 .exit = __devexit_p(powernowk8_cpu_exit), 1406 .init = powernowk8_cpu_init,
1400 .get = powernowk8_get, 1407 .exit = __devexit_p(powernowk8_cpu_exit),
1401 .name = "powernow-k8", 1408 .get = powernowk8_get,
1402 .owner = THIS_MODULE, 1409 .name = "powernow-k8",
1403 .attr = powernow_k8_attr, 1410 .owner = THIS_MODULE,
1411 .attr = powernow_k8_attr,
1404}; 1412};
1405 1413
1406/* driver entry point for init */ 1414/* driver entry point for init */
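
The powernowk8_target() hunk above replaces an on-stack cpumask_t with a cpumask_var_t, which only becomes a real allocation when CONFIG_CPUMASK_OFFSTACK is enabled. A minimal sketch of the same save-affinity/pin/restore pattern follows; run_on_cpu() is a hypothetical helper, and as the patch comment itself notes, smp_call_function_single() or a workqueue is the cleaner tool for this job.

#include <linux/cpumask.h>
#include <linux/sched.h>
#include <linux/smp.h>
#include <linux/slab.h>

/* Hypothetical helper: pin the current task to 'cpu', do some per-CPU
 * work, then restore the task's previous affinity. */
static int run_on_cpu(int cpu)
{
	cpumask_var_t oldmask;

	/* real allocation only with CONFIG_CPUMASK_OFFSTACK */
	if (!alloc_cpumask_var(&oldmask, GFP_KERNEL))
		return -ENOMEM;

	cpumask_copy(oldmask, tsk_cpus_allowed(current));
	set_cpus_allowed_ptr(current, cpumask_of(cpu));

	if (smp_processor_id() != cpu) {	/* migration can still fail */
		set_cpus_allowed_ptr(current, oldmask);
		free_cpumask_var(oldmask);
		return -EIO;
	}

	/* ... per-CPU MSR work goes here ... */

	set_cpus_allowed_ptr(current, oldmask);
	free_cpumask_var(oldmask);
	return 0;
}
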
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
index 8d672ef162ce..9b1ff37de46a 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-centrino.c
@@ -20,6 +20,7 @@
20#include <linux/sched.h> /* current */ 20#include <linux/sched.h> /* current */
21#include <linux/delay.h> 21#include <linux/delay.h>
22#include <linux/compiler.h> 22#include <linux/compiler.h>
23#include <linux/gfp.h>
23 24
24#include <asm/msr.h> 25#include <asm/msr.h>
25#include <asm/processor.h> 26#include <asm/processor.h>
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
index 3ae5a7a3a500..561758e95180 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-ich.c
@@ -23,7 +23,6 @@
23#include <linux/init.h> 23#include <linux/init.h>
24#include <linux/cpufreq.h> 24#include <linux/cpufreq.h>
25#include <linux/pci.h> 25#include <linux/pci.h>
26#include <linux/slab.h>
27#include <linux/sched.h> 26#include <linux/sched.h>
28 27
29#include "speedstep-lib.h" 28#include "speedstep-lib.h"
@@ -39,7 +38,7 @@ static struct pci_dev *speedstep_chipset_dev;
39 38
40/* speedstep_processor 39/* speedstep_processor
41 */ 40 */
42static unsigned int speedstep_processor; 41static enum speedstep_processor speedstep_processor;
43 42
44static u32 pmbase; 43static u32 pmbase;
45 44
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
index f4c290b8482f..a94ec6be69fa 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.c
@@ -13,7 +13,6 @@
13#include <linux/moduleparam.h> 13#include <linux/moduleparam.h>
14#include <linux/init.h> 14#include <linux/init.h>
15#include <linux/cpufreq.h> 15#include <linux/cpufreq.h>
16#include <linux/slab.h>
17 16
18#include <asm/msr.h> 17#include <asm/msr.h>
19#include <asm/tsc.h> 18#include <asm/tsc.h>
@@ -34,7 +33,7 @@ static int relaxed_check;
34 * GET PROCESSOR CORE SPEED IN KHZ * 33 * GET PROCESSOR CORE SPEED IN KHZ *
35 *********************************************************************/ 34 *********************************************************************/
36 35
37static unsigned int pentium3_get_frequency(unsigned int processor) 36static unsigned int pentium3_get_frequency(enum speedstep_processor processor)
38{ 37{
39 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */ 38 /* See table 14 of p3_ds.pdf and table 22 of 29834003.pdf */
40 struct { 39 struct {
@@ -227,7 +226,7 @@ static unsigned int pentium4_get_frequency(void)
227 226
228 227
229/* Warning: may get called from smp_call_function_single. */ 228/* Warning: may get called from smp_call_function_single. */
230unsigned int speedstep_get_frequency(unsigned int processor) 229unsigned int speedstep_get_frequency(enum speedstep_processor processor)
231{ 230{
232 switch (processor) { 231 switch (processor) {
233 case SPEEDSTEP_CPU_PCORE: 232 case SPEEDSTEP_CPU_PCORE:
@@ -380,7 +379,7 @@ EXPORT_SYMBOL_GPL(speedstep_detect_processor);
380 * DETECT SPEEDSTEP SPEEDS * 379 * DETECT SPEEDSTEP SPEEDS *
381 *********************************************************************/ 380 *********************************************************************/
382 381
383unsigned int speedstep_get_freqs(unsigned int processor, 382unsigned int speedstep_get_freqs(enum speedstep_processor processor,
384 unsigned int *low_speed, 383 unsigned int *low_speed,
385 unsigned int *high_speed, 384 unsigned int *high_speed,
386 unsigned int *transition_latency, 385 unsigned int *transition_latency,
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
index 2b6c04e5a304..70d9cea1219d 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-lib.h
@@ -11,18 +11,18 @@
11 11
12 12
13/* processors */ 13/* processors */
14 14enum speedstep_processor {
15#define SPEEDSTEP_CPU_PIII_C_EARLY 0x00000001 /* Coppermine core */ 15 SPEEDSTEP_CPU_PIII_C_EARLY = 0x00000001, /* Coppermine core */
16#define SPEEDSTEP_CPU_PIII_C 0x00000002 /* Coppermine core */ 16 SPEEDSTEP_CPU_PIII_C = 0x00000002, /* Coppermine core */
17#define SPEEDSTEP_CPU_PIII_T 0x00000003 /* Tualatin core */ 17 SPEEDSTEP_CPU_PIII_T = 0x00000003, /* Tualatin core */
18#define SPEEDSTEP_CPU_P4M 0x00000004 /* P4-M */ 18 SPEEDSTEP_CPU_P4M = 0x00000004, /* P4-M */
19
20/* the following processors are not speedstep-capable and are not auto-detected 19/* the following processors are not speedstep-capable and are not auto-detected
21 * in speedstep_detect_processor(). However, their speed can be detected using 20 * in speedstep_detect_processor(). However, their speed can be detected using
22 * the speedstep_get_frequency() call. */ 21 * the speedstep_get_frequency() call. */
23#define SPEEDSTEP_CPU_PM 0xFFFFFF03 /* Pentium M */ 22 SPEEDSTEP_CPU_PM = 0xFFFFFF03, /* Pentium M */
24#define SPEEDSTEP_CPU_P4D 0xFFFFFF04 /* desktop P4 */ 23 SPEEDSTEP_CPU_P4D = 0xFFFFFF04, /* desktop P4 */
25#define SPEEDSTEP_CPU_PCORE 0xFFFFFF05 /* Core */ 24 SPEEDSTEP_CPU_PCORE = 0xFFFFFF05, /* Core */
25};
26 26
27/* speedstep states -- only two of them */ 27/* speedstep states -- only two of them */
28 28
@@ -31,10 +31,10 @@
31 31
32 32
33/* detect a speedstep-capable processor */ 33/* detect a speedstep-capable processor */
34extern unsigned int speedstep_detect_processor (void); 34extern enum speedstep_processor speedstep_detect_processor(void);
35 35
36/* detect the current speed (in khz) of the processor */ 36/* detect the current speed (in khz) of the processor */
37extern unsigned int speedstep_get_frequency(unsigned int processor); 37extern unsigned int speedstep_get_frequency(enum speedstep_processor processor);
38 38
39 39
40/* detect the low and high speeds of the processor. The callback 40/* detect the low and high speeds of the processor. The callback
@@ -42,7 +42,7 @@ extern unsigned int speedstep_get_frequency(unsigned int processor);
42 * SPEEDSTEP_LOW; the second argument is zero so that no 42 * SPEEDSTEP_LOW; the second argument is zero so that no
43 * cpufreq_notify_transition calls are initiated. 43 * cpufreq_notify_transition calls are initiated.
44 */ 44 */
45extern unsigned int speedstep_get_freqs(unsigned int processor, 45extern unsigned int speedstep_get_freqs(enum speedstep_processor processor,
46 unsigned int *low_speed, 46 unsigned int *low_speed,
47 unsigned int *high_speed, 47 unsigned int *high_speed,
48 unsigned int *transition_latency, 48 unsigned int *transition_latency,
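
The speedstep-lib.h change turns a family of #define constants into enum speedstep_processor: the parameter now documents itself, switch statements over it can be checked for missing cases, and passing an unrelated integer stands out in review. A contrived before/after sketch of the idea, with invented names:

/* Before: any integer silently converts into "processor". */
#define CPU_FOO	0x01
#define CPU_BAR	0x02
unsigned int get_freq_old(unsigned int processor);

/* After: the type names the intent; the values are unchanged. */
enum foo_processor {
	FOO_CPU_FOO = 0x01,
	FOO_CPU_BAR = 0x02,
};
unsigned int get_freq_new(enum foo_processor processor);
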
diff --git a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
index befea088e4f5..8abd869baabf 100644
--- a/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
+++ b/arch/x86/kernel/cpu/cpufreq/speedstep-smi.c
@@ -17,7 +17,6 @@
17#include <linux/moduleparam.h> 17#include <linux/moduleparam.h>
18#include <linux/init.h> 18#include <linux/init.h>
19#include <linux/cpufreq.h> 19#include <linux/cpufreq.h>
20#include <linux/slab.h>
21#include <linux/delay.h> 20#include <linux/delay.h>
22#include <linux/io.h> 21#include <linux/io.h>
23#include <asm/ist.h> 22#include <asm/ist.h>
@@ -35,7 +34,7 @@ static int smi_cmd;
35static unsigned int smi_sig; 34static unsigned int smi_sig;
36 35
37/* info about the processor */ 36/* info about the processor */
38static unsigned int speedstep_processor; 37static enum speedstep_processor speedstep_processor;
39 38
40/* 39/*
41 * There are only two frequency states for each processor. Values 40 * There are only two frequency states for each processor. Values
diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c
index 19807b89f058..4fbd384fb645 100644
--- a/arch/x86/kernel/cpu/cyrix.c
+++ b/arch/x86/kernel/cpu/cyrix.c
@@ -373,7 +373,7 @@ static void __cpuinit init_nsc(struct cpuinfo_x86 *c)
373 /* Handle the GX (Formerly known as the GX2) */ 373 /* Handle the GX (Formerly known as the GX2) */
374 374
375 if (c->x86 == 5 && c->x86_model == 5) 375 if (c->x86 == 5 && c->x86_model == 5)
376 display_cacheinfo(c); 376 cpu_detect_cache_sizes(c);
377 else 377 else
378 init_cyrix(c); 378 init_cyrix(c);
379} 379}
diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c
index 40e1835b35e8..1366c7cfd483 100644
--- a/arch/x86/kernel/cpu/intel.c
+++ b/arch/x86/kernel/cpu/intel.c
@@ -47,6 +47,27 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
47 (c->x86 == 0x6 && c->x86_model >= 0x0e)) 47 (c->x86 == 0x6 && c->x86_model >= 0x0e))
48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 48 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
49 49
50 /*
51 * Atom erratum AAE44/AAF40/AAG38/AAH41:
52 *
53 * A race condition between speculative fetches and invalidating
54 * a large page. This is worked around in microcode, but we
55 * need the microcode to have already been loaded... so if it is
56 * not, recommend a BIOS update and disable large pages.
57 */
58 if (c->x86 == 6 && c->x86_model == 0x1c && c->x86_mask <= 2) {
59 u32 ucode, junk;
60
61 wrmsr(MSR_IA32_UCODE_REV, 0, 0);
62 sync_core();
63 rdmsr(MSR_IA32_UCODE_REV, junk, ucode);
64
65 if (ucode < 0x20e) {
66 printk(KERN_WARNING "Atom PSE erratum detected, BIOS microcode update recommended\n");
67 clear_cpu_cap(c, X86_FEATURE_PSE);
68 }
69 }
70
50#ifdef CONFIG_X86_64 71#ifdef CONFIG_X86_64
51 set_cpu_cap(c, X86_FEATURE_SYSENTER32); 72 set_cpu_cap(c, X86_FEATURE_SYSENTER32);
52#else 73#else
@@ -70,8 +91,8 @@ static void __cpuinit early_init_intel(struct cpuinfo_x86 *c)
70 if (c->x86_power & (1 << 8)) { 91 if (c->x86_power & (1 << 8)) {
71 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); 92 set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC);
72 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC); 93 set_cpu_cap(c, X86_FEATURE_NONSTOP_TSC);
73 set_cpu_cap(c, X86_FEATURE_TSC_RELIABLE); 94 if (!check_tsc_unstable())
74 sched_clock_stable = 1; 95 sched_clock_stable = 1;
75 } 96 }
76 97
77 /* 98 /*
@@ -263,11 +284,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c)
263 /* Don't do the funky fallback heuristics the AMD version employs 284 /* Don't do the funky fallback heuristics the AMD version employs
264 for now. */ 285 for now. */
265 node = apicid_to_node[apicid]; 286 node = apicid_to_node[apicid];
266 if (node == NUMA_NO_NODE || !node_online(node)) 287 if (node == NUMA_NO_NODE)
267 node = first_node(node_online_map); 288 node = first_node(node_online_map);
289 else if (!node_online(node)) {
290 /* reuse the value from init_cpu_to_node() */
291 node = cpu_to_node(cpu);
292 }
268 numa_set_node(cpu, node); 293 numa_set_node(cpu, node);
269
270 printk(KERN_INFO "CPU %d/0x%x -> Node %d\n", cpu, apicid, node);
271#endif 294#endif
272} 295}
273 296
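
The Atom erratum check added to early_init_intel() uses the standard sequence for reading the running microcode revision: write 0 to MSR_IA32_UCODE_REV, execute a serializing CPUID (sync_core()), then read the revision back from the MSR's high half. A stripped-down sketch of just that read, for illustration:

#include <linux/types.h>
#include <asm/msr.h>
#include <asm/processor.h>

static u32 read_ucode_rev(void)
{
	u32 junk, rev;

	wrmsr(MSR_IA32_UCODE_REV, 0, 0);	/* clear the MSR first */
	sync_core();				/* CPUID latches the revision */
	rdmsr(MSR_IA32_UCODE_REV, junk, rev);	/* revision is in the high half */

	return rev;
}
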
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 3167c3d72596..94d8e475744c 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -18,6 +18,7 @@
18#include <asm/processor.h> 18#include <asm/processor.h>
19#include <linux/smp.h> 19#include <linux/smp.h>
20#include <asm/k8.h> 20#include <asm/k8.h>
21#include <asm/smp.h>
21 22
22#define LVL_1_INST 1 23#define LVL_1_INST 1
23#define LVL_1_DATA 2 24#define LVL_1_DATA 2
@@ -31,6 +32,8 @@ struct _cache_table {
31 short size; 32 short size;
32}; 33};
33 34
35#define MB(x) ((x) * 1024)
36
34/* All the cache descriptor types we care about (no TLB or 37/* All the cache descriptor types we care about (no TLB or
35 trace cache entries) */ 38 trace cache entries) */
36 39
@@ -44,9 +47,9 @@ static const struct _cache_table __cpuinitconst cache_table[] =
44 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */ 47 { 0x0d, LVL_1_DATA, 16 }, /* 4-way set assoc, 64 byte line size */
45 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */ 48 { 0x21, LVL_2, 256 }, /* 8-way set assoc, 64 byte line size */
46 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 49 { 0x22, LVL_3, 512 }, /* 4-way set assoc, sectored cache, 64 byte line size */
47 { 0x23, LVL_3, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 50 { 0x23, LVL_3, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
48 { 0x25, LVL_3, 2048 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 51 { 0x25, LVL_3, MB(2) }, /* 8-way set assoc, sectored cache, 64 byte line size */
49 { 0x29, LVL_3, 4096 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 52 { 0x29, LVL_3, MB(4) }, /* 8-way set assoc, sectored cache, 64 byte line size */
50 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */ 53 { 0x2c, LVL_1_DATA, 32 }, /* 8-way set assoc, 64 byte line size */
51 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */ 54 { 0x30, LVL_1_INST, 32 }, /* 8-way set assoc, 64 byte line size */
52 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 55 { 0x39, LVL_2, 128 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -59,16 +62,16 @@ static const struct _cache_table __cpuinitconst cache_table[] =
59 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */ 62 { 0x41, LVL_2, 128 }, /* 4-way set assoc, 32 byte line size */
60 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */ 63 { 0x42, LVL_2, 256 }, /* 4-way set assoc, 32 byte line size */
61 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */ 64 { 0x43, LVL_2, 512 }, /* 4-way set assoc, 32 byte line size */
62 { 0x44, LVL_2, 1024 }, /* 4-way set assoc, 32 byte line size */ 65 { 0x44, LVL_2, MB(1) }, /* 4-way set assoc, 32 byte line size */
63 { 0x45, LVL_2, 2048 }, /* 4-way set assoc, 32 byte line size */ 66 { 0x45, LVL_2, MB(2) }, /* 4-way set assoc, 32 byte line size */
64 { 0x46, LVL_3, 4096 }, /* 4-way set assoc, 64 byte line size */ 67 { 0x46, LVL_3, MB(4) }, /* 4-way set assoc, 64 byte line size */
65 { 0x47, LVL_3, 8192 }, /* 8-way set assoc, 64 byte line size */ 68 { 0x47, LVL_3, MB(8) }, /* 8-way set assoc, 64 byte line size */
66 { 0x49, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 69 { 0x49, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
67 { 0x4a, LVL_3, 6144 }, /* 12-way set assoc, 64 byte line size */ 70 { 0x4a, LVL_3, MB(6) }, /* 12-way set assoc, 64 byte line size */
68 { 0x4b, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 71 { 0x4b, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
69 { 0x4c, LVL_3, 12288 }, /* 12-way set assoc, 64 byte line size */ 72 { 0x4c, LVL_3, MB(12) }, /* 12-way set assoc, 64 byte line size */
70 { 0x4d, LVL_3, 16384 }, /* 16-way set assoc, 64 byte line size */ 73 { 0x4d, LVL_3, MB(16) }, /* 16-way set assoc, 64 byte line size */
71 { 0x4e, LVL_2, 6144 }, /* 24-way set assoc, 64 byte line size */ 74 { 0x4e, LVL_2, MB(6) }, /* 24-way set assoc, 64 byte line size */
72 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 75 { 0x60, LVL_1_DATA, 16 }, /* 8-way set assoc, sectored cache, 64 byte line size */
73 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 76 { 0x66, LVL_1_DATA, 8 }, /* 4-way set assoc, sectored cache, 64 byte line size */
74 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */ 77 { 0x67, LVL_1_DATA, 16 }, /* 4-way set assoc, sectored cache, 64 byte line size */
@@ -77,31 +80,34 @@ static const struct _cache_table __cpuinitconst cache_table[] =
77 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */ 80 { 0x71, LVL_TRACE, 16 }, /* 8-way set assoc */
78 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */ 81 { 0x72, LVL_TRACE, 32 }, /* 8-way set assoc */
79 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */ 82 { 0x73, LVL_TRACE, 64 }, /* 8-way set assoc */
80 { 0x78, LVL_2, 1024 }, /* 4-way set assoc, 64 byte line size */ 83 { 0x78, LVL_2, MB(1) }, /* 4-way set assoc, 64 byte line size */
81 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 84 { 0x79, LVL_2, 128 }, /* 8-way set assoc, sectored cache, 64 byte line size */
82 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 85 { 0x7a, LVL_2, 256 }, /* 8-way set assoc, sectored cache, 64 byte line size */
83 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 86 { 0x7b, LVL_2, 512 }, /* 8-way set assoc, sectored cache, 64 byte line size */
84 { 0x7c, LVL_2, 1024 }, /* 8-way set assoc, sectored cache, 64 byte line size */ 87 { 0x7c, LVL_2, MB(1) }, /* 8-way set assoc, sectored cache, 64 byte line size */
85 { 0x7d, LVL_2, 2048 }, /* 8-way set assoc, 64 byte line size */ 88 { 0x7d, LVL_2, MB(2) }, /* 8-way set assoc, 64 byte line size */
86 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */ 89 { 0x7f, LVL_2, 512 }, /* 2-way set assoc, 64 byte line size */
87 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */ 90 { 0x82, LVL_2, 256 }, /* 8-way set assoc, 32 byte line size */
88 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */ 91 { 0x83, LVL_2, 512 }, /* 8-way set assoc, 32 byte line size */
89 { 0x84, LVL_2, 1024 }, /* 8-way set assoc, 32 byte line size */ 92 { 0x84, LVL_2, MB(1) }, /* 8-way set assoc, 32 byte line size */
90 { 0x85, LVL_2, 2048 }, /* 8-way set assoc, 32 byte line size */ 93 { 0x85, LVL_2, MB(2) }, /* 8-way set assoc, 32 byte line size */
91 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */ 94 { 0x86, LVL_2, 512 }, /* 4-way set assoc, 64 byte line size */
92 { 0x87, LVL_2, 1024 }, /* 8-way set assoc, 64 byte line size */ 95 { 0x87, LVL_2, MB(1) }, /* 8-way set assoc, 64 byte line size */
93 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */ 96 { 0xd0, LVL_3, 512 }, /* 4-way set assoc, 64 byte line size */
94 { 0xd1, LVL_3, 1024 }, /* 4-way set assoc, 64 byte line size */ 97 { 0xd1, LVL_3, MB(1) }, /* 4-way set assoc, 64 byte line size */
95 { 0xd2, LVL_3, 2048 }, /* 4-way set assoc, 64 byte line size */ 98 { 0xd2, LVL_3, MB(2) }, /* 4-way set assoc, 64 byte line size */
96 { 0xd6, LVL_3, 1024 }, /* 8-way set assoc, 64 byte line size */ 99 { 0xd6, LVL_3, MB(1) }, /* 8-way set assoc, 64 byte line size */
97 { 0xd7, LVL_3, 2038 }, /* 8-way set assoc, 64 byte line size */ 100 { 0xd7, LVL_3, MB(2) }, /* 8-way set assoc, 64 byte line size */
98 { 0xd8, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 101 { 0xd8, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
99 { 0xdc, LVL_3, 2048 }, /* 12-way set assoc, 64 byte line size */ 102 { 0xdc, LVL_3, MB(2) }, /* 12-way set assoc, 64 byte line size */
100 { 0xdd, LVL_3, 4096 }, /* 12-way set assoc, 64 byte line size */ 103 { 0xdd, LVL_3, MB(4) }, /* 12-way set assoc, 64 byte line size */
101 { 0xde, LVL_3, 8192 }, /* 12-way set assoc, 64 byte line size */ 104 { 0xde, LVL_3, MB(8) }, /* 12-way set assoc, 64 byte line size */
102 { 0xe2, LVL_3, 2048 }, /* 16-way set assoc, 64 byte line size */ 105 { 0xe2, LVL_3, MB(2) }, /* 16-way set assoc, 64 byte line size */
103 { 0xe3, LVL_3, 4096 }, /* 16-way set assoc, 64 byte line size */ 106 { 0xe3, LVL_3, MB(4) }, /* 16-way set assoc, 64 byte line size */
104 { 0xe4, LVL_3, 8192 }, /* 16-way set assoc, 64 byte line size */ 107 { 0xe4, LVL_3, MB(8) }, /* 16-way set assoc, 64 byte line size */
108 { 0xea, LVL_3, MB(12) }, /* 24-way set assoc, 64 byte line size */
109 { 0xeb, LVL_3, MB(18) }, /* 24-way set assoc, 64 byte line size */
110 { 0xec, LVL_3, MB(24) }, /* 24-way set assoc, 64 byte line size */
105 { 0x00, 0, 0} 111 { 0x00, 0, 0}
106}; 112};
107 113
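
All sizes in cache_table are kilobytes, so MB(x) is simply ((x) * 1024); the macro exists only to keep the multi-megabyte L3 entries readable. The table is consumed by matching CPUID leaf 2 descriptor bytes against it; the user-space toy below shows the lookup idea with a two-entry table (descriptors 0x44/0x45 borrowed from the real table, everything else invented):

#include <stdio.h>

#define MB(x) ((x) * 1024)		/* table sizes are in KB */

struct cache_entry {
	unsigned char descriptor;	/* CPUID leaf 2 descriptor byte */
	unsigned short size_kb;
};

static const struct cache_entry toy_table[] = {
	{ 0x44, MB(1) },		/* 1 MB L2 */
	{ 0x45, MB(2) },		/* 2 MB L2 */
	{ 0x00, 0 },			/* terminator */
};

static int lookup(unsigned char desc)
{
	int i;

	for (i = 0; toy_table[i].descriptor != 0; i++)
		if (toy_table[i].descriptor == desc)
			return toy_table[i].size_kb;
	return 0;
}

int main(void)
{
	printf("descriptor 0x45 -> %d KB\n", lookup(0x45));
	return 0;
}
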
@@ -147,7 +153,8 @@ struct _cpuid4_info {
147 union _cpuid4_leaf_ebx ebx; 153 union _cpuid4_leaf_ebx ebx;
148 union _cpuid4_leaf_ecx ecx; 154 union _cpuid4_leaf_ecx ecx;
149 unsigned long size; 155 unsigned long size;
150 unsigned long can_disable; 156 bool can_disable;
157 unsigned int l3_indices;
151 DECLARE_BITMAP(shared_cpu_map, NR_CPUS); 158 DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
152}; 159};
153 160
@@ -157,7 +164,8 @@ struct _cpuid4_info_regs {
157 union _cpuid4_leaf_ebx ebx; 164 union _cpuid4_leaf_ebx ebx;
158 union _cpuid4_leaf_ecx ecx; 165 union _cpuid4_leaf_ecx ecx;
159 unsigned long size; 166 unsigned long size;
160 unsigned long can_disable; 167 bool can_disable;
168 unsigned int l3_indices;
161}; 169};
162 170
163unsigned short num_cache_leaves; 171unsigned short num_cache_leaves;
@@ -287,6 +295,36 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
287 (ebx->split.ways_of_associativity + 1) - 1; 295 (ebx->split.ways_of_associativity + 1) - 1;
288} 296}
289 297
298struct _cache_attr {
299 struct attribute attr;
300 ssize_t (*show)(struct _cpuid4_info *, char *);
301 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
302};
303
304#ifdef CONFIG_CPU_SUP_AMD
305static unsigned int __cpuinit amd_calc_l3_indices(void)
306{
307 /*
308 * We're called over smp_call_function_single() and therefore
309 * are on the correct cpu.
310 */
311 int cpu = smp_processor_id();
312 int node = cpu_to_node(cpu);
313 struct pci_dev *dev = node_to_k8_nb_misc(node);
314 unsigned int sc0, sc1, sc2, sc3;
315 u32 val = 0;
316
317 pci_read_config_dword(dev, 0x1C4, &val);
318
319 /* calculate subcache sizes */
320 sc0 = !(val & BIT(0));
321 sc1 = !(val & BIT(4));
322 sc2 = !(val & BIT(8)) + !(val & BIT(9));
323 sc3 = !(val & BIT(12)) + !(val & BIT(13));
324
325 return (max(max(max(sc0, sc1), sc2), sc3) << 10) - 1;
326}
327
290static void __cpuinit 328static void __cpuinit
291amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) 329amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
292{ 330{
@@ -296,13 +334,108 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
296 if (boot_cpu_data.x86 == 0x11) 334 if (boot_cpu_data.x86 == 0x11)
297 return; 335 return;
298 336
299 /* see erratum #382 */ 337 /* see errata #382 and #388 */
300 if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) 338 if ((boot_cpu_data.x86 == 0x10) &&
339 ((boot_cpu_data.x86_model < 0x8) ||
340 (boot_cpu_data.x86_mask < 0x1)))
301 return; 341 return;
302 342
303 this_leaf->can_disable = 1; 343 /* not in virtualized environments */
344 if (num_k8_northbridges == 0)
345 return;
346
347 this_leaf->can_disable = true;
348 this_leaf->l3_indices = amd_calc_l3_indices();
349}
350
351static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
352 unsigned int index)
353{
354 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
355 int node = amd_get_nb_id(cpu);
356 struct pci_dev *dev = node_to_k8_nb_misc(node);
357 unsigned int reg = 0;
358
359 if (!this_leaf->can_disable)
360 return -EINVAL;
361
362 if (!dev)
363 return -EINVAL;
364
365 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
366 return sprintf(buf, "0x%08x\n", reg);
304} 367}
305 368
369#define SHOW_CACHE_DISABLE(index) \
370static ssize_t \
371show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
372{ \
373 return show_cache_disable(this_leaf, buf, index); \
374}
375SHOW_CACHE_DISABLE(0)
376SHOW_CACHE_DISABLE(1)
377
378static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
379 const char *buf, size_t count, unsigned int index)
380{
381 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
382 int node = amd_get_nb_id(cpu);
383 struct pci_dev *dev = node_to_k8_nb_misc(node);
384 unsigned long val = 0;
385
386#define SUBCACHE_MASK (3UL << 20)
387#define SUBCACHE_INDEX 0xfff
388
389 if (!this_leaf->can_disable)
390 return -EINVAL;
391
392 if (!capable(CAP_SYS_ADMIN))
393 return -EPERM;
394
395 if (!dev)
396 return -EINVAL;
397
398 if (strict_strtoul(buf, 10, &val) < 0)
399 return -EINVAL;
400
401 /* do not allow writes outside of allowed bits */
402 if ((val & ~(SUBCACHE_MASK | SUBCACHE_INDEX)) ||
403 ((val & SUBCACHE_INDEX) > this_leaf->l3_indices))
404 return -EINVAL;
405
406 val |= BIT(30);
407 pci_write_config_dword(dev, 0x1BC + index * 4, val);
408 /*
409 * We need to WBINVD on a core on the node containing the L3 cache whose
410 * indices we disable; a simple wbinvd() is therefore not sufficient.
411 */
412 wbinvd_on_cpu(cpu);
413 pci_write_config_dword(dev, 0x1BC + index * 4, val | BIT(31));
414 return count;
415}
416
417#define STORE_CACHE_DISABLE(index) \
418static ssize_t \
419store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
420 const char *buf, size_t count) \
421{ \
422 return store_cache_disable(this_leaf, buf, count, index); \
423}
424STORE_CACHE_DISABLE(0)
425STORE_CACHE_DISABLE(1)
426
427static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
428 show_cache_disable_0, store_cache_disable_0);
429static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
430 show_cache_disable_1, store_cache_disable_1);
431
432#else /* CONFIG_CPU_SUP_AMD */
433static void __cpuinit
434amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf)
435{
436};
437#endif /* CONFIG_CPU_SUP_AMD */
438
306static int 439static int
307__cpuinit cpuid4_cache_lookup_regs(int index, 440__cpuinit cpuid4_cache_lookup_regs(int index,
308 struct _cpuid4_info_regs *this_leaf) 441 struct _cpuid4_info_regs *this_leaf)
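
The show/store pair added above surfaces as cache_disable_0 and cache_disable_1 files under the L3 leaf in sysfs, normally /sys/devices/system/cpu/cpuN/cache/index3/ on these parts. A small user-space sketch of reading one of them follows; the path is illustrative, reading prints the raw 0x%08x register value, and writing a decimal index (root only) really disables that L3 index, so the write is left as a comment.

#include <stdio.h>

int main(void)
{
	/* Illustrative path; L3 is typically index3 on family 0x10 CPUs. */
	const char *path =
		"/sys/devices/system/cpu/cpu0/cache/index3/cache_disable_0";
	char buf[32];
	FILE *f = fopen(path, "r");

	if (!f)
		return 1;			/* no L3 index-disable support */
	if (fgets(buf, sizeof(buf), f))
		printf("raw register: %s", buf);	/* e.g. "0x00000000" */
	fclose(f);

	/*
	 * Writing is root-only and takes a decimal value, parsed with
	 * strict_strtoul(buf, 10, ...): an L3 index, optionally with the
	 * subcache bits 20-21 set. Deliberately not done here.
	 */
	return 0;
}
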
@@ -488,22 +621,6 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
488#endif 621#endif
489 } 622 }
490 623
491 if (trace)
492 printk(KERN_INFO "CPU: Trace cache: %dK uops", trace);
493 else if (l1i)
494 printk(KERN_INFO "CPU: L1 I cache: %dK", l1i);
495
496 if (l1d)
497 printk(KERN_CONT ", L1 D cache: %dK\n", l1d);
498 else
499 printk(KERN_CONT "\n");
500
501 if (l2)
502 printk(KERN_INFO "CPU: L2 cache: %dK\n", l2);
503
504 if (l3)
505 printk(KERN_INFO "CPU: L3 cache: %dK\n", l3);
506
507 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d)); 624 c->x86_cache_size = l3 ? l3 : (l2 ? l2 : (l1i+l1d));
508 625
509 return l2; 626 return l2;
@@ -512,8 +629,8 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
512#ifdef CONFIG_SYSFS 629#ifdef CONFIG_SYSFS
513 630
514/* pointer to _cpuid4_info array (for each cache leaf) */ 631/* pointer to _cpuid4_info array (for each cache leaf) */
515static DEFINE_PER_CPU(struct _cpuid4_info *, cpuid4_info); 632static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
516#define CPUID4_INFO_IDX(x, y) (&((per_cpu(cpuid4_info, x))[y])) 633#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
517 634
518/* returns CPUs that share the index cache with cpu */ 635/* returns CPUs that share the index cache with cpu */
519int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index) 636int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
@@ -537,18 +654,19 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
537{ 654{
538 struct _cpuid4_info *this_leaf, *sibling_leaf; 655 struct _cpuid4_info *this_leaf, *sibling_leaf;
539 unsigned long num_threads_sharing; 656 unsigned long num_threads_sharing;
540 int index_msb, i; 657 int index_msb, i, sibling;
541 struct cpuinfo_x86 *c = &cpu_data(cpu); 658 struct cpuinfo_x86 *c = &cpu_data(cpu);
542 659
543 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) { 660 if ((index == 3) && (c->x86_vendor == X86_VENDOR_AMD)) {
544 struct cpuinfo_x86 *d; 661 for_each_cpu(i, c->llc_shared_map) {
545 for_each_online_cpu(i) { 662 if (!per_cpu(ici_cpuid4_info, i))
546 if (!per_cpu(cpuid4_info, i))
547 continue; 663 continue;
548 d = &cpu_data(i);
549 this_leaf = CPUID4_INFO_IDX(i, index); 664 this_leaf = CPUID4_INFO_IDX(i, index);
550 cpumask_copy(to_cpumask(this_leaf->shared_cpu_map), 665 for_each_cpu(sibling, c->llc_shared_map) {
551 d->llc_shared_map); 666 if (!cpu_online(sibling))
667 continue;
668 set_bit(sibling, this_leaf->shared_cpu_map);
669 }
552 } 670 }
553 return; 671 return;
554 } 672 }
@@ -565,7 +683,7 @@ static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
565 c->apicid >> index_msb) { 683 c->apicid >> index_msb) {
566 cpumask_set_cpu(i, 684 cpumask_set_cpu(i,
567 to_cpumask(this_leaf->shared_cpu_map)); 685 to_cpumask(this_leaf->shared_cpu_map));
568 if (i != cpu && per_cpu(cpuid4_info, i)) { 686 if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
569 sibling_leaf = 687 sibling_leaf =
570 CPUID4_INFO_IDX(i, index); 688 CPUID4_INFO_IDX(i, index);
571 cpumask_set_cpu(cpu, to_cpumask( 689 cpumask_set_cpu(cpu, to_cpumask(
@@ -604,8 +722,8 @@ static void __cpuinit free_cache_attributes(unsigned int cpu)
604 for (i = 0; i < num_cache_leaves; i++) 722 for (i = 0; i < num_cache_leaves; i++)
605 cache_remove_shared_cpu_map(cpu, i); 723 cache_remove_shared_cpu_map(cpu, i);
606 724
607 kfree(per_cpu(cpuid4_info, cpu)); 725 kfree(per_cpu(ici_cpuid4_info, cpu));
608 per_cpu(cpuid4_info, cpu) = NULL; 726 per_cpu(ici_cpuid4_info, cpu) = NULL;
609} 727}
610 728
611static int 729static int
@@ -644,15 +762,15 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
644 if (num_cache_leaves == 0) 762 if (num_cache_leaves == 0)
645 return -ENOENT; 763 return -ENOENT;
646 764
647 per_cpu(cpuid4_info, cpu) = kzalloc( 765 per_cpu(ici_cpuid4_info, cpu) = kzalloc(
648 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); 766 sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
649 if (per_cpu(cpuid4_info, cpu) == NULL) 767 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
650 return -ENOMEM; 768 return -ENOMEM;
651 769
652 smp_call_function_single(cpu, get_cpu_leaves, &retval, true); 770 smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
653 if (retval) { 771 if (retval) {
654 kfree(per_cpu(cpuid4_info, cpu)); 772 kfree(per_cpu(ici_cpuid4_info, cpu));
655 per_cpu(cpuid4_info, cpu) = NULL; 773 per_cpu(ici_cpuid4_info, cpu) = NULL;
656 } 774 }
657 775
658 return retval; 776 return retval;
@@ -664,7 +782,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu)
664extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ 782extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */
665 783
666/* pointer to kobject for cpuX/cache */ 784/* pointer to kobject for cpuX/cache */
667static DEFINE_PER_CPU(struct kobject *, cache_kobject); 785static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
668 786
669struct _index_kobject { 787struct _index_kobject {
670 struct kobject kobj; 788 struct kobject kobj;
@@ -673,8 +791,8 @@ struct _index_kobject {
673}; 791};
674 792
675/* pointer to array of kobjects for cpuX/cache/indexY */ 793/* pointer to array of kobjects for cpuX/cache/indexY */
676static DEFINE_PER_CPU(struct _index_kobject *, index_kobject); 794static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
677#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(index_kobject, x))[y])) 795#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
678 796
679#define show_one_plus(file_name, object, val) \ 797#define show_one_plus(file_name, object, val) \
680static ssize_t show_##file_name \ 798static ssize_t show_##file_name \
@@ -740,82 +858,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf)
740#define to_object(k) container_of(k, struct _index_kobject, kobj) 858#define to_object(k) container_of(k, struct _index_kobject, kobj)
741#define to_attr(a) container_of(a, struct _cache_attr, attr) 859#define to_attr(a) container_of(a, struct _cache_attr, attr)
742 860
743static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
744 unsigned int index)
745{
746 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
747 int node = cpu_to_node(cpu);
748 struct pci_dev *dev = node_to_k8_nb_misc(node);
749 unsigned int reg = 0;
750
751 if (!this_leaf->can_disable)
752 return -EINVAL;
753
754 if (!dev)
755 return -EINVAL;
756
757 pci_read_config_dword(dev, 0x1BC + index * 4, &reg);
758 return sprintf(buf, "%x\n", reg);
759}
760
761#define SHOW_CACHE_DISABLE(index) \
762static ssize_t \
763show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \
764{ \
765 return show_cache_disable(this_leaf, buf, index); \
766}
767SHOW_CACHE_DISABLE(0)
768SHOW_CACHE_DISABLE(1)
769
770static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
771 const char *buf, size_t count, unsigned int index)
772{
773 int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
774 int node = cpu_to_node(cpu);
775 struct pci_dev *dev = node_to_k8_nb_misc(node);
776 unsigned long val = 0;
777 unsigned int scrubber = 0;
778
779 if (!this_leaf->can_disable)
780 return -EINVAL;
781
782 if (!capable(CAP_SYS_ADMIN))
783 return -EPERM;
784
785 if (!dev)
786 return -EINVAL;
787
788 if (strict_strtoul(buf, 10, &val) < 0)
789 return -EINVAL;
790
791 val |= 0xc0000000;
792
793 pci_read_config_dword(dev, 0x58, &scrubber);
794 scrubber &= ~0x1f000000;
795 pci_write_config_dword(dev, 0x58, scrubber);
796
797 pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000);
798 wbinvd();
799 pci_write_config_dword(dev, 0x1BC + index * 4, val);
800 return count;
801}
802
803#define STORE_CACHE_DISABLE(index) \
804static ssize_t \
805store_cache_disable_##index(struct _cpuid4_info *this_leaf, \
806 const char *buf, size_t count) \
807{ \
808 return store_cache_disable(this_leaf, buf, count, index); \
809}
810STORE_CACHE_DISABLE(0)
811STORE_CACHE_DISABLE(1)
812
813struct _cache_attr {
814 struct attribute attr;
815 ssize_t (*show)(struct _cpuid4_info *, char *);
816 ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count);
817};
818
819#define define_one_ro(_name) \ 861#define define_one_ro(_name) \
820static struct _cache_attr _name = \ 862static struct _cache_attr _name = \
821 __ATTR(_name, 0444, show_##_name, NULL) 863 __ATTR(_name, 0444, show_##_name, NULL)
@@ -830,23 +872,28 @@ define_one_ro(size);
830define_one_ro(shared_cpu_map); 872define_one_ro(shared_cpu_map);
831define_one_ro(shared_cpu_list); 873define_one_ro(shared_cpu_list);
832 874
833static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, 875#define DEFAULT_SYSFS_CACHE_ATTRS \
834 show_cache_disable_0, store_cache_disable_0); 876 &type.attr, \
835static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, 877 &level.attr, \
836 show_cache_disable_1, store_cache_disable_1); 878 &coherency_line_size.attr, \
879 &physical_line_partition.attr, \
880 &ways_of_associativity.attr, \
881 &number_of_sets.attr, \
882 &size.attr, \
883 &shared_cpu_map.attr, \
884 &shared_cpu_list.attr
837 885
838static struct attribute *default_attrs[] = { 886static struct attribute *default_attrs[] = {
839 &type.attr, 887 DEFAULT_SYSFS_CACHE_ATTRS,
840 &level.attr, 888 NULL
841 &coherency_line_size.attr, 889};
842 &physical_line_partition.attr, 890
843 &ways_of_associativity.attr, 891static struct attribute *default_l3_attrs[] = {
844 &number_of_sets.attr, 892 DEFAULT_SYSFS_CACHE_ATTRS,
845 &size.attr, 893#ifdef CONFIG_CPU_SUP_AMD
846 &shared_cpu_map.attr,
847 &shared_cpu_list.attr,
848 &cache_disable_0.attr, 894 &cache_disable_0.attr,
849 &cache_disable_1.attr, 895 &cache_disable_1.attr,
896#endif
850 NULL 897 NULL
851}; 898};
852 899
@@ -877,7 +924,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
877 return ret; 924 return ret;
878} 925}
879 926
880static struct sysfs_ops sysfs_ops = { 927static const struct sysfs_ops sysfs_ops = {
881 .show = show, 928 .show = show,
882 .store = store, 929 .store = store,
883}; 930};
@@ -893,10 +940,10 @@ static struct kobj_type ktype_percpu_entry = {
893 940
894static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu) 941static void __cpuinit cpuid4_cache_sysfs_exit(unsigned int cpu)
895{ 942{
896 kfree(per_cpu(cache_kobject, cpu)); 943 kfree(per_cpu(ici_cache_kobject, cpu));
897 kfree(per_cpu(index_kobject, cpu)); 944 kfree(per_cpu(ici_index_kobject, cpu));
898 per_cpu(cache_kobject, cpu) = NULL; 945 per_cpu(ici_cache_kobject, cpu) = NULL;
899 per_cpu(index_kobject, cpu) = NULL; 946 per_cpu(ici_index_kobject, cpu) = NULL;
900 free_cache_attributes(cpu); 947 free_cache_attributes(cpu);
901} 948}
902 949
@@ -912,14 +959,14 @@ static int __cpuinit cpuid4_cache_sysfs_init(unsigned int cpu)
912 return err; 959 return err;
913 960
914 /* Allocate all required memory */ 961 /* Allocate all required memory */
915 per_cpu(cache_kobject, cpu) = 962 per_cpu(ici_cache_kobject, cpu) =
916 kzalloc(sizeof(struct kobject), GFP_KERNEL); 963 kzalloc(sizeof(struct kobject), GFP_KERNEL);
917 if (unlikely(per_cpu(cache_kobject, cpu) == NULL)) 964 if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
918 goto err_out; 965 goto err_out;
919 966
920 per_cpu(index_kobject, cpu) = kzalloc( 967 per_cpu(ici_index_kobject, cpu) = kzalloc(
921 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); 968 sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
922 if (unlikely(per_cpu(index_kobject, cpu) == NULL)) 969 if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
923 goto err_out; 970 goto err_out;
924 971
925 return 0; 972 return 0;
@@ -937,13 +984,14 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
937 unsigned int cpu = sys_dev->id; 984 unsigned int cpu = sys_dev->id;
938 unsigned long i, j; 985 unsigned long i, j;
939 struct _index_kobject *this_object; 986 struct _index_kobject *this_object;
987 struct _cpuid4_info *this_leaf;
940 int retval; 988 int retval;
941 989
942 retval = cpuid4_cache_sysfs_init(cpu); 990 retval = cpuid4_cache_sysfs_init(cpu);
943 if (unlikely(retval < 0)) 991 if (unlikely(retval < 0))
944 return retval; 992 return retval;
945 993
946 retval = kobject_init_and_add(per_cpu(cache_kobject, cpu), 994 retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
947 &ktype_percpu_entry, 995 &ktype_percpu_entry,
948 &sys_dev->kobj, "%s", "cache"); 996 &sys_dev->kobj, "%s", "cache");
949 if (retval < 0) { 997 if (retval < 0) {
@@ -955,14 +1003,22 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
955 this_object = INDEX_KOBJECT_PTR(cpu, i); 1003 this_object = INDEX_KOBJECT_PTR(cpu, i);
956 this_object->cpu = cpu; 1004 this_object->cpu = cpu;
957 this_object->index = i; 1005 this_object->index = i;
1006
1007 this_leaf = CPUID4_INFO_IDX(cpu, i);
1008
1009 if (this_leaf->can_disable)
1010 ktype_cache.default_attrs = default_l3_attrs;
1011 else
1012 ktype_cache.default_attrs = default_attrs;
1013
958 retval = kobject_init_and_add(&(this_object->kobj), 1014 retval = kobject_init_and_add(&(this_object->kobj),
959 &ktype_cache, 1015 &ktype_cache,
960 per_cpu(cache_kobject, cpu), 1016 per_cpu(ici_cache_kobject, cpu),
961 "index%1lu", i); 1017 "index%1lu", i);
962 if (unlikely(retval)) { 1018 if (unlikely(retval)) {
963 for (j = 0; j < i; j++) 1019 for (j = 0; j < i; j++)
964 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); 1020 kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
965 kobject_put(per_cpu(cache_kobject, cpu)); 1021 kobject_put(per_cpu(ici_cache_kobject, cpu));
966 cpuid4_cache_sysfs_exit(cpu); 1022 cpuid4_cache_sysfs_exit(cpu);
967 return retval; 1023 return retval;
968 } 1024 }
@@ -970,7 +1026,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev)
970 } 1026 }
971 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); 1027 cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
972 1028
973 kobject_uevent(per_cpu(cache_kobject, cpu), KOBJ_ADD); 1029 kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
974 return 0; 1030 return 0;
975} 1031}
976 1032
@@ -979,7 +1035,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
979 unsigned int cpu = sys_dev->id; 1035 unsigned int cpu = sys_dev->id;
980 unsigned long i; 1036 unsigned long i;
981 1037
982 if (per_cpu(cpuid4_info, cpu) == NULL) 1038 if (per_cpu(ici_cpuid4_info, cpu) == NULL)
983 return; 1039 return;
984 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) 1040 if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
985 return; 1041 return;
@@ -987,7 +1043,7 @@ static void __cpuinit cache_remove_dev(struct sys_device * sys_dev)
987 1043
988 for (i = 0; i < num_cache_leaves; i++) 1044 for (i = 0; i < num_cache_leaves; i++)
989 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); 1045 kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
990 kobject_put(per_cpu(cache_kobject, cpu)); 1046 kobject_put(per_cpu(ici_cache_kobject, cpu));
991 cpuid4_cache_sysfs_exit(cpu); 1047 cpuid4_cache_sysfs_exit(cpu);
992} 1048}
993 1049
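
The cpuid4_info/cache_kobject/index_kobject per-CPU variables gain an ici_ prefix above; since per-CPU variables share the ordinary global symbol namespace in this kernel, a driver-specific prefix keeps them from colliding with other names. A generic sketch of the pattern with invented mydrv_/foo_ names:

#include <linux/percpu.h>

struct foo_state {
	int dummy;
};

/* Per-CPU pointer; the mydrv_ prefix namespaces the global symbol. */
static DEFINE_PER_CPU(struct foo_state *, mydrv_foo_state);

static void foo_peek(unsigned int cpu)
{
	/* per_cpu() reaches into a given CPU's slot by index. */
	struct foo_state *st = per_cpu(mydrv_foo_state, cpu);

	if (!st)
		return;
	/* ... use st ... */
}
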
diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c
index 472763d92098..e7dbde7bfedb 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-inject.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c
@@ -22,6 +22,7 @@
22#include <linux/kdebug.h> 22#include <linux/kdebug.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/sched.h> 24#include <linux/sched.h>
25#include <linux/gfp.h>
25#include <asm/mce.h> 26#include <asm/mce.h>
26#include <asm/apic.h> 27#include <asm/apic.h>
27 28
@@ -74,7 +75,7 @@ static void raise_exception(struct mce *m, struct pt_regs *pregs)
74 m->finished = 0; 75 m->finished = 0;
75} 76}
76 77
77static cpumask_t mce_inject_cpumask; 78static cpumask_var_t mce_inject_cpumask;
78 79
79static int mce_raise_notify(struct notifier_block *self, 80static int mce_raise_notify(struct notifier_block *self,
80 unsigned long val, void *data) 81 unsigned long val, void *data)
@@ -82,9 +83,9 @@ static int mce_raise_notify(struct notifier_block *self,
82 struct die_args *args = (struct die_args *)data; 83 struct die_args *args = (struct die_args *)data;
83 int cpu = smp_processor_id(); 84 int cpu = smp_processor_id();
84 struct mce *m = &__get_cpu_var(injectm); 85 struct mce *m = &__get_cpu_var(injectm);
85 if (val != DIE_NMI_IPI || !cpu_isset(cpu, mce_inject_cpumask)) 86 if (val != DIE_NMI_IPI || !cpumask_test_cpu(cpu, mce_inject_cpumask))
86 return NOTIFY_DONE; 87 return NOTIFY_DONE;
87 cpu_clear(cpu, mce_inject_cpumask); 88 cpumask_clear_cpu(cpu, mce_inject_cpumask);
88 if (m->inject_flags & MCJ_EXCEPTION) 89 if (m->inject_flags & MCJ_EXCEPTION)
89 raise_exception(m, args->regs); 90 raise_exception(m, args->regs);
90 else if (m->status) 91 else if (m->status)
@@ -148,22 +149,22 @@ static void raise_mce(struct mce *m)
148 unsigned long start; 149 unsigned long start;
149 int cpu; 150 int cpu;
150 get_online_cpus(); 151 get_online_cpus();
151 mce_inject_cpumask = cpu_online_map; 152 cpumask_copy(mce_inject_cpumask, cpu_online_mask);
152 cpu_clear(get_cpu(), mce_inject_cpumask); 153 cpumask_clear_cpu(get_cpu(), mce_inject_cpumask);
153 for_each_online_cpu(cpu) { 154 for_each_online_cpu(cpu) {
154 struct mce *mcpu = &per_cpu(injectm, cpu); 155 struct mce *mcpu = &per_cpu(injectm, cpu);
155 if (!mcpu->finished || 156 if (!mcpu->finished ||
156 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) 157 MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM)
157 cpu_clear(cpu, mce_inject_cpumask); 158 cpumask_clear_cpu(cpu, mce_inject_cpumask);
158 } 159 }
159 if (!cpus_empty(mce_inject_cpumask)) 160 if (!cpumask_empty(mce_inject_cpumask))
160 apic->send_IPI_mask(&mce_inject_cpumask, NMI_VECTOR); 161 apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
161 start = jiffies; 162 start = jiffies;
162 while (!cpus_empty(mce_inject_cpumask)) { 163 while (!cpumask_empty(mce_inject_cpumask)) {
163 if (!time_before(jiffies, start + 2*HZ)) { 164 if (!time_before(jiffies, start + 2*HZ)) {
164 printk(KERN_ERR 165 printk(KERN_ERR
165 "Timeout waiting for mce inject NMI %lx\n", 166 "Timeout waiting for mce inject NMI %lx\n",
166 *cpus_addr(mce_inject_cpumask)); 167 *cpumask_bits(mce_inject_cpumask));
167 break; 168 break;
168 } 169 }
169 cpu_relax(); 170 cpu_relax();
@@ -210,6 +211,8 @@ static ssize_t mce_write(struct file *filp, const char __user *ubuf,
210 211
211static int inject_init(void) 212static int inject_init(void)
212{ 213{
214 if (!alloc_cpumask_var(&mce_inject_cpumask, GFP_KERNEL))
215 return -ENOMEM;
213 printk(KERN_INFO "Machine check injector initialized\n"); 216 printk(KERN_INFO "Machine check injector initialized\n");
214 mce_chrdev_ops.write = mce_write; 217 mce_chrdev_ops.write = mce_write;
215 register_die_notifier(&mce_raise_nb); 218 register_die_notifier(&mce_raise_nb);
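
raise_mce() above waits for the injection cpumask to drain with a bounded busy-wait; note the jiffies arithmetic uses time_before() rather than a plain comparison, which stays correct across jiffies wrap-around. The same pattern in isolation, where the done flag merely stands in for cpumask_empty() on the injection mask:

#include <linux/types.h>
#include <linux/jiffies.h>
#include <asm/processor.h>

/* Spin for at most two seconds waiting for *done to become true. */
static bool wait_up_to_2s(volatile bool *done)
{
	unsigned long start = jiffies;

	while (!*done) {
		if (!time_before(jiffies, start + 2 * HZ))
			return false;		/* timed out */
		cpu_relax();
	}
	return true;
}
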
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 721a77ca8115..8a6f0afa767e 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/sysfs.h> 27#include <linux/sysfs.h>
28#include <linux/types.h> 28#include <linux/types.h>
29#include <linux/slab.h>
29#include <linux/init.h> 30#include <linux/init.h>
30#include <linux/kmod.h> 31#include <linux/kmod.h>
31#include <linux/poll.h> 32#include <linux/poll.h>
@@ -46,6 +47,16 @@
46 47
47#include "mce-internal.h" 48#include "mce-internal.h"
48 49
50static DEFINE_MUTEX(mce_read_mutex);
51
52#define rcu_dereference_check_mce(p) \
53 rcu_dereference_check((p), \
54 rcu_read_lock_sched_held() || \
55 lockdep_is_held(&mce_read_mutex))
56
57#define CREATE_TRACE_POINTS
58#include <trace/events/mce.h>
59
49int mce_disabled __read_mostly; 60int mce_disabled __read_mostly;
50 61
51#define MISC_MCELOG_MINOR 227 62#define MISC_MCELOG_MINOR 227
@@ -85,18 +96,26 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_wait);
85static DEFINE_PER_CPU(struct mce, mces_seen); 96static DEFINE_PER_CPU(struct mce, mces_seen);
86static int cpu_missing; 97static int cpu_missing;
87 98
88static void default_decode_mce(struct mce *m) 99/*
100 * CPU/chipset specific EDAC code can register a notifier call here to print
101 * MCE errors in a human-readable form.
102 */
103ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain);
104EXPORT_SYMBOL_GPL(x86_mce_decoder_chain);
105
106static int default_decode_mce(struct notifier_block *nb, unsigned long val,
107 void *data)
89{ 108{
90 pr_emerg("No human readable MCE decoding support on this CPU type.\n"); 109 pr_emerg("No human readable MCE decoding support on this CPU type.\n");
91 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n"); 110 pr_emerg("Run the message through 'mcelog --ascii' to decode.\n");
111
112 return NOTIFY_STOP;
92} 113}
93 114
94/* 115static struct notifier_block mce_dec_nb = {
95 * CPU/chipset specific EDAC code can register a callback here to print 116 .notifier_call = default_decode_mce,
96 * MCE errors in a human-readable form: 117 .priority = -1,
97 */ 118};
98void (*x86_mce_decode_callback)(struct mce *m) = default_decode_mce;
99EXPORT_SYMBOL(x86_mce_decode_callback);
100 119
101/* MCA banks polled by the period polling timer for corrected events */ 120/* MCA banks polled by the period polling timer for corrected events */
102DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { 121DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = {
@@ -141,10 +160,13 @@ void mce_log(struct mce *mce)
141{ 160{
142 unsigned next, entry; 161 unsigned next, entry;
143 162
163 /* Emit the trace record: */
164 trace_mce_record(mce);
165
144 mce->finished = 0; 166 mce->finished = 0;
145 wmb(); 167 wmb();
146 for (;;) { 168 for (;;) {
147 entry = rcu_dereference(mcelog.next); 169 entry = rcu_dereference_check_mce(mcelog.next);
148 for (;;) { 170 for (;;) {
149 /* 171 /*
150 * When the buffer fills up discard new entries. 172 * When the buffer fills up discard new entries.
@@ -204,9 +226,9 @@ static void print_mce(struct mce *m)
204 226
205 /* 227 /*
206 * Print out human-readable details about the MCE error, 228 * Print out human-readable details about the MCE error,
207 * (if the CPU has an implementation for that): 229 * (if the CPU has an implementation for that)
208 */ 230 */
209 x86_mce_decode_callback(m); 231 atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m);
210} 232}
211 233
212static void print_mce_head(void) 234static void print_mce_head(void)
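
With the decode hook converted from a single function pointer into the x86_mce_decoder_chain atomic notifier, an EDAC-style decoder now hooks in by registering a notifier block; the built-in stub sits at priority -1, so a registration at the default priority 0 runs first and can return NOTIFY_STOP to keep the stub quiet. A hedged sketch of such a registration, assuming the chain is declared via <asm/mce.h> as in mainline (the decoder body itself is just a stub):

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/notifier.h>
#include <asm/mce.h>

static int my_decode_mce(struct notifier_block *nb, unsigned long val,
			 void *data)
{
	struct mce *m = data;

	pr_emerg("decoded: bank %d, status 0x%llx\n", m->bank, m->status);

	return NOTIFY_STOP;	/* handled; suppress the default stub */
}

static struct notifier_block my_mce_dec = {
	.notifier_call	= my_decode_mce,
};

static int __init my_dec_init(void)
{
	atomic_notifier_chain_register(&x86_mce_decoder_chain, &my_mce_dec);
	return 0;
}

static void __exit my_dec_exit(void)
{
	atomic_notifier_chain_unregister(&x86_mce_decoder_chain, &my_mce_dec);
}

module_init(my_dec_init);
module_exit(my_dec_exit);
MODULE_LICENSE("GPL");
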
@@ -1122,7 +1144,7 @@ static int check_interval = 5 * 60; /* 5 minutes */
1122static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */ 1144static DEFINE_PER_CPU(int, mce_next_interval); /* in jiffies */
1123static DEFINE_PER_CPU(struct timer_list, mce_timer); 1145static DEFINE_PER_CPU(struct timer_list, mce_timer);
1124 1146
1125static void mcheck_timer(unsigned long data) 1147static void mce_start_timer(unsigned long data)
1126{ 1148{
1127 struct timer_list *t = &per_cpu(mce_timer, data); 1149 struct timer_list *t = &per_cpu(mce_timer, data);
1128 int *n; 1150 int *n;
@@ -1187,7 +1209,7 @@ int mce_notify_irq(void)
1187} 1209}
1188EXPORT_SYMBOL_GPL(mce_notify_irq); 1210EXPORT_SYMBOL_GPL(mce_notify_irq);
1189 1211
1190static int mce_banks_init(void) 1212static int __cpuinit __mcheck_cpu_mce_banks_init(void)
1191{ 1213{
1192 int i; 1214 int i;
1193 1215
@@ -1206,7 +1228,7 @@ static int mce_banks_init(void)
1206/* 1228/*
1207 * Initialize Machine Checks for a CPU. 1229 * Initialize Machine Checks for a CPU.
1208 */ 1230 */
1209static int __cpuinit mce_cap_init(void) 1231static int __cpuinit __mcheck_cpu_cap_init(void)
1210{ 1232{
1211 unsigned b; 1233 unsigned b;
1212 u64 cap; 1234 u64 cap;
@@ -1228,7 +1250,7 @@ static int __cpuinit mce_cap_init(void)
1228 WARN_ON(banks != 0 && b != banks); 1250 WARN_ON(banks != 0 && b != banks);
1229 banks = b; 1251 banks = b;
1230 if (!mce_banks) { 1252 if (!mce_banks) {
1231 int err = mce_banks_init(); 1253 int err = __mcheck_cpu_mce_banks_init();
1232 1254
1233 if (err) 1255 if (err)
1234 return err; 1256 return err;
@@ -1244,7 +1266,7 @@ static int __cpuinit mce_cap_init(void)
1244 return 0; 1266 return 0;
1245} 1267}
1246 1268
1247static void mce_init(void) 1269static void __mcheck_cpu_init_generic(void)
1248{ 1270{
1249 mce_banks_t all_banks; 1271 mce_banks_t all_banks;
1250 u64 cap; 1272 u64 cap;
@@ -1273,7 +1295,7 @@ static void mce_init(void)
1273} 1295}
1274 1296
1275/* Add per CPU specific workarounds here */ 1297/* Add per CPU specific workarounds here */
1276static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c) 1298static int __cpuinit __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
1277{ 1299{
1278 if (c->x86_vendor == X86_VENDOR_UNKNOWN) { 1300 if (c->x86_vendor == X86_VENDOR_UNKNOWN) {
1279 pr_info("MCE: unknown CPU type - not enabling MCE support.\n"); 1301 pr_info("MCE: unknown CPU type - not enabling MCE support.\n");
@@ -1341,7 +1363,7 @@ static int __cpuinit mce_cpu_quirks(struct cpuinfo_x86 *c)
1341 return 0; 1363 return 0;
1342} 1364}
1343 1365
1344static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c) 1366static void __cpuinit __mcheck_cpu_ancient_init(struct cpuinfo_x86 *c)
1345{ 1367{
1346 if (c->x86 != 5) 1368 if (c->x86 != 5)
1347 return; 1369 return;
@@ -1355,7 +1377,7 @@ static void __cpuinit mce_ancient_init(struct cpuinfo_x86 *c)
1355 } 1377 }
1356} 1378}
1357 1379
1358static void mce_cpu_features(struct cpuinfo_x86 *c) 1380static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
1359{ 1381{
1360 switch (c->x86_vendor) { 1382 switch (c->x86_vendor) {
1361 case X86_VENDOR_INTEL: 1383 case X86_VENDOR_INTEL:
@@ -1369,18 +1391,19 @@ static void mce_cpu_features(struct cpuinfo_x86 *c)
1369 } 1391 }
1370} 1392}
1371 1393
1372static void mce_init_timer(void) 1394static void __mcheck_cpu_init_timer(void)
1373{ 1395{
1374 struct timer_list *t = &__get_cpu_var(mce_timer); 1396 struct timer_list *t = &__get_cpu_var(mce_timer);
1375 int *n = &__get_cpu_var(mce_next_interval); 1397 int *n = &__get_cpu_var(mce_next_interval);
1376 1398
1399 setup_timer(t, mce_start_timer, smp_processor_id());
1400
1377 if (mce_ignore_ce) 1401 if (mce_ignore_ce)
1378 return; 1402 return;
1379 1403
1380 *n = check_interval * HZ; 1404 *n = check_interval * HZ;
1381 if (!*n) 1405 if (!*n)
1382 return; 1406 return;
1383 setup_timer(t, mcheck_timer, smp_processor_id());
1384 t->expires = round_jiffies(jiffies + *n); 1407 t->expires = round_jiffies(jiffies + *n);
1385 add_timer_on(t, smp_processor_id()); 1408 add_timer_on(t, smp_processor_id());
1386} 1409}
@@ -1400,27 +1423,28 @@ void (*machine_check_vector)(struct pt_regs *, long error_code) =
1400 * Called for each booted CPU to set up machine checks. 1423 * Called for each booted CPU to set up machine checks.
1401 * Must be called with preempt off: 1424 * Must be called with preempt off:
1402 */ 1425 */
1403void __cpuinit mcheck_init(struct cpuinfo_x86 *c) 1426void __cpuinit mcheck_cpu_init(struct cpuinfo_x86 *c)
1404{ 1427{
1405 if (mce_disabled) 1428 if (mce_disabled)
1406 return; 1429 return;
1407 1430
1408 mce_ancient_init(c); 1431 __mcheck_cpu_ancient_init(c);
1409 1432
1410 if (!mce_available(c)) 1433 if (!mce_available(c))
1411 return; 1434 return;
1412 1435
1413 if (mce_cap_init() < 0 || mce_cpu_quirks(c) < 0) { 1436 if (__mcheck_cpu_cap_init() < 0 || __mcheck_cpu_apply_quirks(c) < 0) {
1414 mce_disabled = 1; 1437 mce_disabled = 1;
1415 return; 1438 return;
1416 } 1439 }
1417 1440
1418 machine_check_vector = do_machine_check; 1441 machine_check_vector = do_machine_check;
1419 1442
1420 mce_init(); 1443 __mcheck_cpu_init_generic();
1421 mce_cpu_features(c); 1444 __mcheck_cpu_init_vendor(c);
1422 mce_init_timer(); 1445 __mcheck_cpu_init_timer();
1423 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work); 1446 INIT_WORK(&__get_cpu_var(mce_work), mce_process_work);
1447
1424} 1448}
1425 1449
1426/* 1450/*
@@ -1469,8 +1493,6 @@ static void collect_tscs(void *data)
1469 rdtscll(cpu_tsc[smp_processor_id()]); 1493 rdtscll(cpu_tsc[smp_processor_id()]);
1470} 1494}
1471 1495
1472static DEFINE_MUTEX(mce_read_mutex);
1473
1474static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize, 1496static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1475 loff_t *off) 1497 loff_t *off)
1476{ 1498{
@@ -1484,7 +1506,7 @@ static ssize_t mce_read(struct file *filp, char __user *ubuf, size_t usize,
1484 return -ENOMEM; 1506 return -ENOMEM;
1485 1507
1486 mutex_lock(&mce_read_mutex); 1508 mutex_lock(&mce_read_mutex);
1487 next = rcu_dereference(mcelog.next); 1509 next = rcu_dereference_check_mce(mcelog.next);
1488 1510
1489 /* Only supports full reads right now */ 1511 /* Only supports full reads right now */
1490 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) { 1512 if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) {
@@ -1549,7 +1571,7 @@ timeout:
1549static unsigned int mce_poll(struct file *file, poll_table *wait) 1571static unsigned int mce_poll(struct file *file, poll_table *wait)
1550{ 1572{
1551 poll_wait(file, &mce_wait, wait); 1573 poll_wait(file, &mce_wait, wait);
1552 if (rcu_dereference(mcelog.next)) 1574 if (rcu_dereference_check_mce(mcelog.next))
1553 return POLLIN | POLLRDNORM; 1575 return POLLIN | POLLRDNORM;
1554 return 0; 1576 return 0;
1555} 1577}
@@ -1640,6 +1662,15 @@ static int __init mcheck_enable(char *str)
1640} 1662}
1641__setup("mce", mcheck_enable); 1663__setup("mce", mcheck_enable);
1642 1664
1665int __init mcheck_init(void)
1666{
1667 atomic_notifier_chain_register(&x86_mce_decoder_chain, &mce_dec_nb);
1668
1669 mcheck_intel_therm_init();
1670
1671 return 0;
1672}
1673
1643/* 1674/*
1644 * Sysfs support 1675 * Sysfs support
1645 */ 1676 */
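
mcheck_init() is a new one-shot boot-CPU entry point, separate from the per-CPU mcheck_cpu_init() above: it registers the fallback decoder and snapshots the thermal LVT before anything can reprogram it (see mcheck_intel_therm_init() in therm_throt.c below). The actual call site is outside this hunk; the ordering it implies is roughly the following hedged sketch (hypothetical_bsp_boot is not real code):

#include <asm/mce.h>
#include <asm/processor.h>

/* Hypothetical ordering sketch, not the actual boot path. */
void __init hypothetical_bsp_boot(void)
{
	mcheck_init();				/* once, on the boot CPU */
	/* ... rest of early boot ... */
	mcheck_cpu_init(&boot_cpu_data);	/* per CPU, preemption off */
}
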
@@ -1648,7 +1679,7 @@ __setup("mce", mcheck_enable);
1648 * Disable machine checks on suspend and shutdown. We can't really handle 1679 * Disable machine checks on suspend and shutdown. We can't really handle
1649 * them later. 1680 * them later.
1650 */ 1681 */
1651static int mce_disable(void) 1682static int mce_disable_error_reporting(void)
1652{ 1683{
1653 int i; 1684 int i;
1654 1685
@@ -1663,12 +1694,12 @@ static int mce_disable(void)
1663 1694
1664static int mce_suspend(struct sys_device *dev, pm_message_t state) 1695static int mce_suspend(struct sys_device *dev, pm_message_t state)
1665{ 1696{
1666 return mce_disable(); 1697 return mce_disable_error_reporting();
1667} 1698}
1668 1699
1669static int mce_shutdown(struct sys_device *dev) 1700static int mce_shutdown(struct sys_device *dev)
1670{ 1701{
1671 return mce_disable(); 1702 return mce_disable_error_reporting();
1672} 1703}
1673 1704
1674/* 1705/*
@@ -1678,8 +1709,8 @@ static int mce_shutdown(struct sys_device *dev)
1678 */ 1709 */
1679static int mce_resume(struct sys_device *dev) 1710static int mce_resume(struct sys_device *dev)
1680{ 1711{
1681 mce_init(); 1712 __mcheck_cpu_init_generic();
1682 mce_cpu_features(&current_cpu_data); 1713 __mcheck_cpu_init_vendor(&current_cpu_data);
1683 1714
1684 return 0; 1715 return 0;
1685} 1716}
@@ -1689,8 +1720,8 @@ static void mce_cpu_restart(void *data)
1689 del_timer_sync(&__get_cpu_var(mce_timer)); 1720 del_timer_sync(&__get_cpu_var(mce_timer));
1690 if (!mce_available(&current_cpu_data)) 1721 if (!mce_available(&current_cpu_data))
1691 return; 1722 return;
1692 mce_init(); 1723 __mcheck_cpu_init_generic();
1693 mce_init_timer(); 1724 __mcheck_cpu_init_timer();
1694} 1725}
1695 1726
1696/* Reinit MCEs after user configuration changes */ 1727/* Reinit MCEs after user configuration changes */
@@ -1716,7 +1747,7 @@ static void mce_enable_ce(void *all)
1716 cmci_reenable(); 1747 cmci_reenable();
1717 cmci_recheck(); 1748 cmci_recheck();
1718 if (all) 1749 if (all)
1719 mce_init_timer(); 1750 __mcheck_cpu_init_timer();
1720} 1751}
1721 1752
1722static struct sysdev_class mce_sysclass = { 1753static struct sysdev_class mce_sysclass = {
@@ -1904,7 +1935,7 @@ error2:
1904 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr); 1935 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[j].attr);
1905error: 1936error:
1906 while (--i >= 0) 1937 while (--i >= 0)
1907 sysdev_remove_file(&per_cpu(mce_dev, cpu), &mce_banks[i].attr); 1938 sysdev_remove_file(&per_cpu(mce_dev, cpu), mce_attrs[i]);
1908 1939
1909 sysdev_unregister(&per_cpu(mce_dev, cpu)); 1940 sysdev_unregister(&per_cpu(mce_dev, cpu));
1910 1941
@@ -1929,13 +1960,14 @@ static __cpuinit void mce_remove_device(unsigned int cpu)
1929} 1960}
1930 1961
1931/* Make sure there are no machine checks on offlined CPUs. */ 1962/* Make sure there are no machine checks on offlined CPUs. */
1932static void mce_disable_cpu(void *h) 1963static void __cpuinit mce_disable_cpu(void *h)
1933{ 1964{
1934 unsigned long action = *(unsigned long *)h; 1965 unsigned long action = *(unsigned long *)h;
1935 int i; 1966 int i;
1936 1967
1937 if (!mce_available(&current_cpu_data)) 1968 if (!mce_available(&current_cpu_data))
1938 return; 1969 return;
1970
1939 if (!(action & CPU_TASKS_FROZEN)) 1971 if (!(action & CPU_TASKS_FROZEN))
1940 cmci_clear(); 1972 cmci_clear();
1941 for (i = 0; i < banks; i++) { 1973 for (i = 0; i < banks; i++) {
@@ -1946,7 +1978,7 @@ static void mce_disable_cpu(void *h)
1946 } 1978 }
1947} 1979}
1948 1980
1949static void mce_reenable_cpu(void *h) 1981static void __cpuinit mce_reenable_cpu(void *h)
1950{ 1982{
1951 unsigned long action = *(unsigned long *)h; 1983 unsigned long action = *(unsigned long *)h;
1952 int i; 1984 int i;
@@ -1991,9 +2023,11 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
1991 break; 2023 break;
1992 case CPU_DOWN_FAILED: 2024 case CPU_DOWN_FAILED:
1993 case CPU_DOWN_FAILED_FROZEN: 2025 case CPU_DOWN_FAILED_FROZEN:
1994 t->expires = round_jiffies(jiffies + 2026 if (!mce_ignore_ce && check_interval) {
2027 t->expires = round_jiffies(jiffies +
1995 __get_cpu_var(mce_next_interval)); 2028 __get_cpu_var(mce_next_interval));
1996 add_timer_on(t, cpu); 2029 add_timer_on(t, cpu);
2030 }
1997 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1); 2031 smp_call_function_single(cpu, mce_reenable_cpu, &action, 1);
1998 break; 2032 break;
1999 case CPU_POST_DEAD: 2033 case CPU_POST_DEAD:
@@ -2016,6 +2050,7 @@ static __init void mce_init_banks(void)
2016 struct mce_bank *b = &mce_banks[i]; 2050 struct mce_bank *b = &mce_banks[i];
2017 struct sysdev_attribute *a = &b->attr; 2051 struct sysdev_attribute *a = &b->attr;
2018 2052
2053 sysfs_attr_init(&a->attr);
2019 a->attr.name = b->attrname; 2054 a->attr.name = b->attrname;
2020 snprintf(b->attrname, ATTR_LEN, "bank%d", i); 2055 snprintf(b->attrname, ATTR_LEN, "bank%d", i);
2021 2056
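
sysfs_attr_init() gives a runtime-initialized attribute its own lockdep class; without it, lockdep-enabled kernels complain about attributes whose struct attribute was not statically defined, which is the case for the dynamically allocated mce_banks array. A hedged sketch of the same pattern for any dynamically allocated sysdev attribute (alloc_dyn_attr is hypothetical):

#include <linux/slab.h>
#include <linux/sysfs.h>
#include <linux/sysdev.h>

/* Hypothetical helper showing the sysfs_attr_init() requirement. */
static struct sysdev_attribute *alloc_dyn_attr(const char *name)
{
	struct sysdev_attribute *a = kzalloc(sizeof(*a), GFP_KERNEL);

	if (!a)
		return NULL;

	sysfs_attr_init(&a->attr);	/* per-attribute lockdep key */
	a->attr.name = name;
	a->attr.mode = 0444;

	return a;
}
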
@@ -2025,7 +2060,7 @@ static __init void mce_init_banks(void)
2025 } 2060 }
2026} 2061}
2027 2062
2028static __init int mce_init_device(void) 2063static __init int mcheck_init_device(void)
2029{ 2064{
2030 int err; 2065 int err;
2031 int i = 0; 2066 int i = 0;
@@ -2053,7 +2088,7 @@ static __init int mce_init_device(void)
2053 return err; 2088 return err;
2054} 2089}
2055 2090
2056device_initcall(mce_init_device); 2091device_initcall(mcheck_init_device);
2057 2092
2058/* 2093/*
2059 * Old style boot options parsing. Only for compatibility. 2094 * Old style boot options parsing. Only for compatibility.
@@ -2101,7 +2136,7 @@ static int fake_panic_set(void *data, u64 val)
2101DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get, 2136DEFINE_SIMPLE_ATTRIBUTE(fake_panic_fops, fake_panic_get,
2102 fake_panic_set, "%llu\n"); 2137 fake_panic_set, "%llu\n");
2103 2138
2104static int __init mce_debugfs_init(void) 2139static int __init mcheck_debugfs_init(void)
2105{ 2140{
2106 struct dentry *dmce, *ffake_panic; 2141 struct dentry *dmce, *ffake_panic;
2107 2142
@@ -2115,5 +2150,5 @@ static int __init mce_debugfs_init(void)
2115 2150
2116 return 0; 2151 return 0;
2117} 2152}
2118late_initcall(mce_debugfs_init); 2153late_initcall(mcheck_debugfs_init);
2119#endif 2154#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index 83a3d1f4efca..224392d8fe8c 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -21,6 +21,7 @@
21#include <linux/errno.h> 21#include <linux/errno.h>
22#include <linux/sched.h> 22#include <linux/sched.h>
23#include <linux/sysfs.h> 23#include <linux/sysfs.h>
24#include <linux/slab.h>
24#include <linux/init.h> 25#include <linux/init.h>
25#include <linux/cpu.h> 26#include <linux/cpu.h>
26#include <linux/smp.h> 27#include <linux/smp.h>
@@ -388,7 +389,7 @@ static ssize_t store(struct kobject *kobj, struct attribute *attr,
388 return ret; 389 return ret;
389} 390}
390 391
391static struct sysfs_ops threshold_ops = { 392static const struct sysfs_ops threshold_ops = {
392 .show = show, 393 .show = show,
393 .store = store, 394 .store = store,
394}; 395};
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 7c785634af2b..62b48e40920a 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -5,6 +5,7 @@
5 * Author: Andi Kleen 5 * Author: Andi Kleen
6 */ 6 */
7 7
8#include <linux/gfp.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/interrupt.h> 10#include <linux/interrupt.h>
10#include <linux/percpu.h> 11#include <linux/percpu.h>
@@ -95,7 +96,7 @@ static void cmci_discover(int banks, int boot)
95 96
96 /* Already owned by someone else? */ 97 /* Already owned by someone else? */
97 if (val & CMCI_EN) { 98 if (val & CMCI_EN) {
98 if (test_and_clear_bit(i, owned) || boot) 99 if (test_and_clear_bit(i, owned) && !boot)
99 print_update("SHD", &hdr, i); 100 print_update("SHD", &hdr, i);
100 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 101 __clear_bit(i, __get_cpu_var(mce_poll_banks));
101 continue; 102 continue;
@@ -107,7 +108,7 @@ static void cmci_discover(int banks, int boot)
107 108
108 /* Did the enable bit stick? -- the bank supports CMCI */ 109 /* Did the enable bit stick? -- the bank supports CMCI */
109 if (val & CMCI_EN) { 110 if (val & CMCI_EN) {
110 if (!test_and_set_bit(i, owned) || boot) 111 if (!test_and_set_bit(i, owned) && !boot)
111 print_update("CMCI", &hdr, i); 112 print_update("CMCI", &hdr, i);
112 __clear_bit(i, __get_cpu_var(mce_poll_banks)); 113 __clear_bit(i, __get_cpu_var(mce_poll_banks));
113 } else { 114 } else {
diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c
index b3a1dba75330..81c499eceb21 100644
--- a/arch/x86/kernel/cpu/mcheck/therm_throt.c
+++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c
@@ -49,6 +49,8 @@ static DEFINE_PER_CPU(struct thermal_state, thermal_state);
49 49
50static atomic_t therm_throt_en = ATOMIC_INIT(0); 50static atomic_t therm_throt_en = ATOMIC_INIT(0);
51 51
52static u32 lvtthmr_init __read_mostly;
53
52#ifdef CONFIG_SYSFS 54#ifdef CONFIG_SYSFS
53#define define_therm_throt_sysdev_one_ro(_name) \ 55#define define_therm_throt_sysdev_one_ro(_name) \
54 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL) 56 static SYSDEV_ATTR(_name, 0444, therm_throt_sysdev_show_##_name, NULL)
@@ -254,14 +256,34 @@ asmlinkage void smp_thermal_interrupt(struct pt_regs *regs)
254 ack_APIC_irq(); 256 ack_APIC_irq();
255} 257}
256 258
259/* Thermal monitoring depends on APIC, ACPI and clock modulation */
260static int intel_thermal_supported(struct cpuinfo_x86 *c)
261{
262 if (!cpu_has_apic)
263 return 0;
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return 0;
266 return 1;
267}
268
269void __init mcheck_intel_therm_init(void)
270{
271 /*
 272 * This function is called only on the boot CPU. Save the BSP's initial
 273 * thermal LVT value so it can later be used to restore the
 274 * BIOS-programmed thermal LVT entry on the APs.
275 */
276 if (intel_thermal_supported(&boot_cpu_data))
277 lvtthmr_init = apic_read(APIC_LVTTHMR);
278}
279
257void intel_init_thermal(struct cpuinfo_x86 *c) 280void intel_init_thermal(struct cpuinfo_x86 *c)
258{ 281{
259 unsigned int cpu = smp_processor_id(); 282 unsigned int cpu = smp_processor_id();
260 int tm2 = 0; 283 int tm2 = 0;
261 u32 l, h; 284 u32 l, h;
262 285
263 /* Thermal monitoring depends on ACPI and clock modulation*/ 286 if (!intel_thermal_supported(c))
264 if (!cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_ACC))
265 return; 287 return;
266 288
267 /* 289 /*
@@ -270,7 +292,20 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
270 * since it might be delivered via SMI already: 292 * since it might be delivered via SMI already:
271 */ 293 */
272 rdmsr(MSR_IA32_MISC_ENABLE, l, h); 294 rdmsr(MSR_IA32_MISC_ENABLE, l, h);
273 h = apic_read(APIC_LVTTHMR); 295
296 /*
 297 * The thermal LVT entry on every AP initially reads 0x10000: the
 298 * APs are woken up by the BSP with an INIT-SIPI-SIPI sequence, and
 299 * receiving the INIT IPI resets their LVT registers to 0 with only
 300 * the mask bits set to 1.
 301 * Always restore the value the BIOS programmed, based on the copy
 302 * saved on the BSP, since the BIOS programs the same value for all
 303 * threads/cores.
304 */
305 apic_write(APIC_LVTTHMR, lvtthmr_init);
306
307 h = lvtthmr_init;
308
274 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) { 309 if ((l & MSR_IA32_MISC_ENABLE_TM1) && (h & APIC_DM_SMI)) {
275 printk(KERN_DEBUG 310 printk(KERN_DEBUG
276 "CPU%d: Thermal monitoring handled by SMI\n", cpu); 311 "CPU%d: Thermal monitoring handled by SMI\n", cpu);
@@ -312,8 +347,8 @@ void intel_init_thermal(struct cpuinfo_x86 *c)
312 l = apic_read(APIC_LVTTHMR); 347 l = apic_read(APIC_LVTTHMR);
313 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED); 348 apic_write(APIC_LVTTHMR, l & ~APIC_LVT_MASKED);
314 349
315 printk(KERN_INFO "CPU%d: Thermal monitoring enabled (%s)\n", 350 printk_once(KERN_INFO "CPU0: Thermal monitoring enabled (%s)\n",
316 cpu, tm2 ? "TM2" : "TM1"); 351 tm2 ? "TM2" : "TM1");
317 352
318 /* enable thermal throttle processing */ 353 /* enable thermal throttle processing */
319 atomic_set(&therm_throt_en, 1); 354 atomic_set(&therm_throt_en, 1);
diff --git a/arch/x86/kernel/cpu/mtrr/Makefile b/arch/x86/kernel/cpu/mtrr/Makefile
index f4361b56f8e9..ad9e5ed81181 100644
--- a/arch/x86/kernel/cpu/mtrr/Makefile
+++ b/arch/x86/kernel/cpu/mtrr/Makefile
@@ -1,3 +1,3 @@
1obj-y := main.o if.o generic.o state.o cleanup.o 1obj-y := main.o if.o generic.o cleanup.o
2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o 2obj-$(CONFIG_X86_32) += amd.o cyrix.o centaur.o
3 3
diff --git a/arch/x86/kernel/cpu/mtrr/amd.c b/arch/x86/kernel/cpu/mtrr/amd.c
index 33af14110dfd..92ba9cd31c9a 100644
--- a/arch/x86/kernel/cpu/mtrr/amd.c
+++ b/arch/x86/kernel/cpu/mtrr/amd.c
@@ -108,7 +108,7 @@ amd_validate_add_page(unsigned long base, unsigned long size, unsigned int type)
108 return 0; 108 return 0;
109} 109}
110 110
111static struct mtrr_ops amd_mtrr_ops = { 111static const struct mtrr_ops amd_mtrr_ops = {
112 .vendor = X86_VENDOR_AMD, 112 .vendor = X86_VENDOR_AMD,
113 .set = amd_set_mtrr, 113 .set = amd_set_mtrr,
114 .get = amd_get_mtrr, 114 .get = amd_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/centaur.c b/arch/x86/kernel/cpu/mtrr/centaur.c
index de89f14eff3a..316fe3e60a97 100644
--- a/arch/x86/kernel/cpu/mtrr/centaur.c
+++ b/arch/x86/kernel/cpu/mtrr/centaur.c
@@ -110,7 +110,7 @@ centaur_validate_add_page(unsigned long base, unsigned long size, unsigned int t
110 return 0; 110 return 0;
111} 111}
112 112
113static struct mtrr_ops centaur_mtrr_ops = { 113static const struct mtrr_ops centaur_mtrr_ops = {
114 .vendor = X86_VENDOR_CENTAUR, 114 .vendor = X86_VENDOR_CENTAUR,
115 .set = centaur_set_mcr, 115 .set = centaur_set_mcr,
116 .get = centaur_get_mcr, 116 .get = centaur_get_mcr,
diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c
index 73c86db5acbe..06130b52f012 100644
--- a/arch/x86/kernel/cpu/mtrr/cleanup.c
+++ b/arch/x86/kernel/cpu/mtrr/cleanup.c
@@ -22,10 +22,10 @@
22#include <linux/pci.h> 22#include <linux/pci.h>
23#include <linux/smp.h> 23#include <linux/smp.h>
24#include <linux/cpu.h> 24#include <linux/cpu.h>
25#include <linux/sort.h>
26#include <linux/mutex.h> 25#include <linux/mutex.h>
27#include <linux/uaccess.h> 26#include <linux/uaccess.h>
28#include <linux/kvm_para.h> 27#include <linux/kvm_para.h>
28#include <linux/range.h>
29 29
30#include <asm/processor.h> 30#include <asm/processor.h>
31#include <asm/e820.h> 31#include <asm/e820.h>
@@ -34,11 +34,6 @@
34 34
35#include "mtrr.h" 35#include "mtrr.h"
36 36
37struct res_range {
38 unsigned long start;
39 unsigned long end;
40};
41
42struct var_mtrr_range_state { 37struct var_mtrr_range_state {
43 unsigned long base_pfn; 38 unsigned long base_pfn;
44 unsigned long size_pfn; 39 unsigned long size_pfn;
@@ -56,7 +51,7 @@ struct var_mtrr_state {
56/* Should be related to MTRR_VAR_RANGES nums */ 51/* Should be related to MTRR_VAR_RANGES nums */
57#define RANGE_NUM 256 52#define RANGE_NUM 256
58 53
59static struct res_range __initdata range[RANGE_NUM]; 54static struct range __initdata range[RANGE_NUM];
60static int __initdata nr_range; 55static int __initdata nr_range;
61 56
62static struct var_mtrr_range_state __initdata range_state[RANGE_NUM]; 57static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
@@ -64,117 +59,11 @@ static struct var_mtrr_range_state __initdata range_state[RANGE_NUM];
64static int __initdata debug_print; 59static int __initdata debug_print;
65#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0) 60#define Dprintk(x...) do { if (debug_print) printk(KERN_DEBUG x); } while (0)
66 61
67
68static int __init
69add_range(struct res_range *range, int nr_range,
70 unsigned long start, unsigned long end)
71{
72 /* Out of slots: */
73 if (nr_range >= RANGE_NUM)
74 return nr_range;
75
76 range[nr_range].start = start;
77 range[nr_range].end = end;
78
79 nr_range++;
80
81 return nr_range;
82}
83
84static int __init
85add_range_with_merge(struct res_range *range, int nr_range,
86 unsigned long start, unsigned long end)
87{
88 int i;
89
90 /* Try to merge it with old one: */
91 for (i = 0; i < nr_range; i++) {
92 unsigned long final_start, final_end;
93 unsigned long common_start, common_end;
94
95 if (!range[i].end)
96 continue;
97
98 common_start = max(range[i].start, start);
99 common_end = min(range[i].end, end);
100 if (common_start > common_end + 1)
101 continue;
102
103 final_start = min(range[i].start, start);
104 final_end = max(range[i].end, end);
105
106 range[i].start = final_start;
107 range[i].end = final_end;
108 return nr_range;
109 }
110
111 /* Need to add it: */
112 return add_range(range, nr_range, start, end);
113}
114
115static void __init
116subtract_range(struct res_range *range, unsigned long start, unsigned long end)
117{
118 int i, j;
119
120 for (j = 0; j < RANGE_NUM; j++) {
121 if (!range[j].end)
122 continue;
123
124 if (start <= range[j].start && end >= range[j].end) {
125 range[j].start = 0;
126 range[j].end = 0;
127 continue;
128 }
129
130 if (start <= range[j].start && end < range[j].end &&
131 range[j].start < end + 1) {
132 range[j].start = end + 1;
133 continue;
134 }
135
136
137 if (start > range[j].start && end >= range[j].end &&
138 range[j].end > start - 1) {
139 range[j].end = start - 1;
140 continue;
141 }
142
143 if (start > range[j].start && end < range[j].end) {
144 /* Find the new spare: */
145 for (i = 0; i < RANGE_NUM; i++) {
146 if (range[i].end == 0)
147 break;
148 }
149 if (i < RANGE_NUM) {
150 range[i].end = range[j].end;
151 range[i].start = end + 1;
152 } else {
153 printk(KERN_ERR "run of slot in ranges\n");
154 }
155 range[j].end = start - 1;
156 continue;
157 }
158 }
159}
160
161static int __init cmp_range(const void *x1, const void *x2)
162{
163 const struct res_range *r1 = x1;
164 const struct res_range *r2 = x2;
165 long start1, start2;
166
167 start1 = r1->start;
168 start2 = r2->start;
169
170 return start1 - start2;
171}
172
173#define BIOS_BUG_MSG KERN_WARNING \ 62#define BIOS_BUG_MSG KERN_WARNING \
174 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n" 63 "WARNING: BIOS bug: VAR MTRR %d contains strange UC entry under 1M, check with your system vendor!\n"
175 64
176static int __init 65static int __init
177x86_get_mtrr_mem_range(struct res_range *range, int nr_range, 66x86_get_mtrr_mem_range(struct range *range, int nr_range,
178 unsigned long extra_remove_base, 67 unsigned long extra_remove_base,
179 unsigned long extra_remove_size) 68 unsigned long extra_remove_size)
180{ 69{
@@ -188,14 +77,14 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
188 continue; 77 continue;
189 base = range_state[i].base_pfn; 78 base = range_state[i].base_pfn;
190 size = range_state[i].size_pfn; 79 size = range_state[i].size_pfn;
191 nr_range = add_range_with_merge(range, nr_range, base, 80 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range,
192 base + size - 1); 81 base, base + size);
193 } 82 }
194 if (debug_print) { 83 if (debug_print) {
195 printk(KERN_DEBUG "After WB checking\n"); 84 printk(KERN_DEBUG "After WB checking\n");
196 for (i = 0; i < nr_range; i++) 85 for (i = 0; i < nr_range; i++)
197 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 86 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
198 range[i].start, range[i].end + 1); 87 range[i].start, range[i].end);
199 } 88 }
200 89
201 /* Take out UC ranges: */ 90 /* Take out UC ranges: */
@@ -217,51 +106,43 @@ x86_get_mtrr_mem_range(struct res_range *range, int nr_range,
217 size -= (1<<(20-PAGE_SHIFT)) - base; 106 size -= (1<<(20-PAGE_SHIFT)) - base;
218 base = 1<<(20-PAGE_SHIFT); 107 base = 1<<(20-PAGE_SHIFT);
219 } 108 }
220 subtract_range(range, base, base + size - 1); 109 subtract_range(range, RANGE_NUM, base, base + size);
221 } 110 }
222 if (extra_remove_size) 111 if (extra_remove_size)
223 subtract_range(range, extra_remove_base, 112 subtract_range(range, RANGE_NUM, extra_remove_base,
224 extra_remove_base + extra_remove_size - 1); 113 extra_remove_base + extra_remove_size);
225 114
226 /* get new range num */
227 nr_range = 0;
228 for (i = 0; i < RANGE_NUM; i++) {
229 if (!range[i].end)
230 continue;
231 nr_range++;
232 }
233 if (debug_print) { 115 if (debug_print) {
234 printk(KERN_DEBUG "After UC checking\n"); 116 printk(KERN_DEBUG "After UC checking\n");
235 for (i = 0; i < nr_range; i++) 117 for (i = 0; i < RANGE_NUM; i++) {
236 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 118 if (!range[i].end)
237 range[i].start, range[i].end + 1); 119 continue;
120 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
121 range[i].start, range[i].end);
122 }
238 } 123 }
239 124
240 /* sort the ranges */ 125 /* sort the ranges */
241 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 126 nr_range = clean_sort_range(range, RANGE_NUM);
242 if (debug_print) { 127 if (debug_print) {
243 printk(KERN_DEBUG "After sorting\n"); 128 printk(KERN_DEBUG "After sorting\n");
244 for (i = 0; i < nr_range; i++) 129 for (i = 0; i < nr_range; i++)
245 printk(KERN_DEBUG "MTRR MAP PFN: %016lx - %016lx\n", 130 printk(KERN_DEBUG "MTRR MAP PFN: %016llx - %016llx\n",
246 range[i].start, range[i].end + 1); 131 range[i].start, range[i].end);
247 } 132 }
248 133
249 /* clear those is not used */
250 for (i = nr_range; i < RANGE_NUM; i++)
251 memset(&range[i], 0, sizeof(range[i]));
252
253 return nr_range; 134 return nr_range;
254} 135}
255 136
256#ifdef CONFIG_MTRR_SANITIZER 137#ifdef CONFIG_MTRR_SANITIZER
257 138
258static unsigned long __init sum_ranges(struct res_range *range, int nr_range) 139static unsigned long __init sum_ranges(struct range *range, int nr_range)
259{ 140{
260 unsigned long sum = 0; 141 unsigned long sum = 0;
261 int i; 142 int i;
262 143
263 for (i = 0; i < nr_range; i++) 144 for (i = 0; i < nr_range; i++)
264 sum += range[i].end + 1 - range[i].start; 145 sum += range[i].end - range[i].start;
265 146
266 return sum; 147 return sum;
267} 148}
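
The conversion from the file-local struct res_range helpers to the generic struct range API in <linux/range.h> also changes the convention from inclusive to exclusive end points: callers now pass base + size rather than base + size - 1, and a range's length is simply end - start. A hedged sketch of the new calling convention, assuming the helpers behave as the converted code above uses them (range_convention_demo and the values are illustrative):

#include <linux/range.h>

#define DEMO_NR	8

static struct range demo_range[DEMO_NR];

static int __init range_convention_demo(void)
{
	int nr = 0;

	/* [0x100, 0x200) -- the end is exclusive, so no "- 1" */
	nr = add_range_with_merge(demo_range, DEMO_NR, nr, 0x100, 0x200);
	/* touching/overlapping ranges are merged: now [0x100, 0x300) */
	nr = add_range_with_merge(demo_range, DEMO_NR, nr, 0x200, 0x300);
	/* punch a hole; the slot may be split into two pieces */
	subtract_range(demo_range, DEMO_NR, 0x180, 0x280);
	/* drop empty slots, sort by start, return the live count */
	nr = clean_sort_range(demo_range, DEMO_NR);

	return nr;	/* each remaining piece covers end - start units */
}
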
@@ -590,7 +471,7 @@ static int __init parse_mtrr_spare_reg(char *arg)
590early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg); 471early_param("mtrr_spare_reg_nr", parse_mtrr_spare_reg);
591 472
592static int __init 473static int __init
593x86_setup_var_mtrrs(struct res_range *range, int nr_range, 474x86_setup_var_mtrrs(struct range *range, int nr_range,
594 u64 chunk_size, u64 gran_size) 475 u64 chunk_size, u64 gran_size)
595{ 476{
596 struct var_mtrr_state var_state; 477 struct var_mtrr_state var_state;
@@ -608,7 +489,7 @@ x86_setup_var_mtrrs(struct res_range *range, int nr_range,
608 /* Write the range: */ 489 /* Write the range: */
609 for (i = 0; i < nr_range; i++) { 490 for (i = 0; i < nr_range; i++) {
610 set_var_mtrr_range(&var_state, range[i].start, 491 set_var_mtrr_range(&var_state, range[i].start,
611 range[i].end - range[i].start + 1); 492 range[i].end - range[i].start);
612 } 493 }
613 494
614 /* Write the last range: */ 495 /* Write the last range: */
@@ -689,8 +570,6 @@ static int __init mtrr_need_cleanup(void)
689 continue; 570 continue;
690 if (!size) 571 if (!size)
691 type = MTRR_NUM_TYPES; 572 type = MTRR_NUM_TYPES;
692 if (type == MTRR_TYPE_WRPROT)
693 type = MTRR_TYPE_UNCACHABLE;
694 num[type]++; 573 num[type]++;
695 } 574 }
696 575
@@ -713,7 +592,7 @@ mtrr_calc_range_state(u64 chunk_size, u64 gran_size,
713 unsigned long x_remove_base, 592 unsigned long x_remove_base,
714 unsigned long x_remove_size, int i) 593 unsigned long x_remove_size, int i)
715{ 594{
716 static struct res_range range_new[RANGE_NUM]; 595 static struct range range_new[RANGE_NUM];
717 unsigned long range_sums_new; 596 unsigned long range_sums_new;
718 static int nr_range_new; 597 static int nr_range_new;
719 int num_reg; 598 int num_reg;
@@ -840,10 +719,10 @@ int __init mtrr_cleanup(unsigned address_bits)
840 * [0, 1M) should always be covered by var mtrr with WB 719 * [0, 1M) should always be covered by var mtrr with WB
841 * and fixed mtrrs should take effect before var mtrr for it: 720 * and fixed mtrrs should take effect before var mtrr for it:
842 */ 721 */
843 nr_range = add_range_with_merge(range, nr_range, 0, 722 nr_range = add_range_with_merge(range, RANGE_NUM, nr_range, 0,
844 (1ULL<<(20 - PAGE_SHIFT)) - 1); 723 1ULL<<(20 - PAGE_SHIFT));
845 /* Sort the ranges: */ 724 /* Sort the ranges: */
846 sort(range, nr_range, sizeof(struct res_range), cmp_range, NULL); 725 sort_range(range, nr_range);
847 726
848 range_sums = sum_ranges(range, nr_range); 727 range_sums = sum_ranges(range, nr_range);
849 printk(KERN_INFO "total RAM covered: %ldM\n", 728 printk(KERN_INFO "total RAM covered: %ldM\n",
@@ -1060,9 +939,9 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1060 nr_range = 0; 939 nr_range = 0;
1061 if (mtrr_tom2) { 940 if (mtrr_tom2) {
1062 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT)); 941 range[nr_range].start = (1ULL<<(32 - PAGE_SHIFT));
1063 range[nr_range].end = (mtrr_tom2 >> PAGE_SHIFT) - 1; 942 range[nr_range].end = mtrr_tom2 >> PAGE_SHIFT;
1064 if (highest_pfn < range[nr_range].end + 1) 943 if (highest_pfn < range[nr_range].end)
1065 highest_pfn = range[nr_range].end + 1; 944 highest_pfn = range[nr_range].end;
1066 nr_range++; 945 nr_range++;
1067 } 946 }
1068 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0); 947 nr_range = x86_get_mtrr_mem_range(range, nr_range, 0, 0);
@@ -1074,15 +953,15 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn)
1074 953
1075 /* Check the holes: */ 954 /* Check the holes: */
1076 for (i = 0; i < nr_range - 1; i++) { 955 for (i = 0; i < nr_range - 1; i++) {
1077 if (range[i].end + 1 < range[i+1].start) 956 if (range[i].end < range[i+1].start)
1078 total_trim_size += real_trim_memory(range[i].end + 1, 957 total_trim_size += real_trim_memory(range[i].end,
1079 range[i+1].start); 958 range[i+1].start);
1080 } 959 }
1081 960
1082 /* Check the top: */ 961 /* Check the top: */
1083 i = nr_range - 1; 962 i = nr_range - 1;
1084 if (range[i].end + 1 < end_pfn) 963 if (range[i].end < end_pfn)
1085 total_trim_size += real_trim_memory(range[i].end + 1, 964 total_trim_size += real_trim_memory(range[i].end,
1086 end_pfn); 965 end_pfn);
1087 966
1088 if (total_trim_size) { 967 if (total_trim_size) {
diff --git a/arch/x86/kernel/cpu/mtrr/cyrix.c b/arch/x86/kernel/cpu/mtrr/cyrix.c
index 228d982ce09c..68a3343e5798 100644
--- a/arch/x86/kernel/cpu/mtrr/cyrix.c
+++ b/arch/x86/kernel/cpu/mtrr/cyrix.c
@@ -265,7 +265,7 @@ static void cyrix_set_all(void)
265 post_set(); 265 post_set();
266} 266}
267 267
268static struct mtrr_ops cyrix_mtrr_ops = { 268static const struct mtrr_ops cyrix_mtrr_ops = {
269 .vendor = X86_VENDOR_CYRIX, 269 .vendor = X86_VENDOR_CYRIX,
270 .set_all = cyrix_set_all, 270 .set_all = cyrix_set_all,
271 .set = cyrix_set_arr, 271 .set = cyrix_set_arr,
diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c
index 55da0c5f68dd..fd31a441c61c 100644
--- a/arch/x86/kernel/cpu/mtrr/generic.c
+++ b/arch/x86/kernel/cpu/mtrr/generic.c
@@ -6,7 +6,6 @@
6 6
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/slab.h>
10#include <linux/io.h> 9#include <linux/io.h>
11#include <linux/mm.h> 10#include <linux/mm.h>
12 11
@@ -464,7 +463,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base,
464 tmp |= ~((1<<(hi - 1)) - 1); 463 tmp |= ~((1<<(hi - 1)) - 1);
465 464
466 if (tmp != mask_lo) { 465 if (tmp != mask_lo) {
467 WARN_ONCE(1, KERN_INFO "mtrr: your BIOS has set up an incorrect mask, fixing it up.\n"); 466 printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n");
468 mask_lo = tmp; 467 mask_lo = tmp;
469 } 468 }
470 } 469 }
@@ -570,7 +569,7 @@ static unsigned long set_mtrr_state(void)
570 569
571 570
572static unsigned long cr4; 571static unsigned long cr4;
573static DEFINE_SPINLOCK(set_atomicity_lock); 572static DEFINE_RAW_SPINLOCK(set_atomicity_lock);
574 573
575/* 574/*
576 * Since we are disabling the cache don't allow any interrupts, 575 * Since we are disabling the cache don't allow any interrupts,
@@ -590,7 +589,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock)
590 * changes to the way the kernel boots 589 * changes to the way the kernel boots
591 */ 590 */
592 591
593 spin_lock(&set_atomicity_lock); 592 raw_spin_lock(&set_atomicity_lock);
594 593
595 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */ 594 /* Enter the no-fill (CD=1, NW=0) cache mode and flush caches. */
596 cr0 = read_cr0() | X86_CR0_CD; 595 cr0 = read_cr0() | X86_CR0_CD;
@@ -627,7 +626,7 @@ static void post_set(void) __releases(set_atomicity_lock)
627 /* Restore value of CR4 */ 626 /* Restore value of CR4 */
628 if (cpu_has_pge) 627 if (cpu_has_pge)
629 write_cr4(cr4); 628 write_cr4(cr4);
630 spin_unlock(&set_atomicity_lock); 629 raw_spin_unlock(&set_atomicity_lock);
631} 630}
632 631
633static void generic_set_all(void) 632static void generic_set_all(void)
@@ -752,7 +751,7 @@ int positive_have_wrcomb(void)
752/* 751/*
753 * Generic structure... 752 * Generic structure...
754 */ 753 */
755struct mtrr_ops generic_mtrr_ops = { 754const struct mtrr_ops generic_mtrr_ops = {
756 .use_intel_if = 1, 755 .use_intel_if = 1,
757 .set_all = generic_set_all, 756 .set_all = generic_set_all,
758 .get = generic_get_mtrr, 757 .get = generic_get_mtrr,
diff --git a/arch/x86/kernel/cpu/mtrr/if.c b/arch/x86/kernel/cpu/mtrr/if.c
index 3c1b12d461d1..79289632cb27 100644
--- a/arch/x86/kernel/cpu/mtrr/if.c
+++ b/arch/x86/kernel/cpu/mtrr/if.c
@@ -4,6 +4,8 @@
4#include <linux/proc_fs.h> 4#include <linux/proc_fs.h>
5#include <linux/module.h> 5#include <linux/module.h>
6#include <linux/ctype.h> 6#include <linux/ctype.h>
7#include <linux/string.h>
8#include <linux/slab.h>
7#include <linux/init.h> 9#include <linux/init.h>
8 10
9#define LINE_SIZE 80 11#define LINE_SIZE 80
@@ -133,8 +135,7 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
133 return -EINVAL; 135 return -EINVAL;
134 136
135 base = simple_strtoull(line + 5, &ptr, 0); 137 base = simple_strtoull(line + 5, &ptr, 0);
136 while (isspace(*ptr)) 138 ptr = skip_spaces(ptr);
137 ptr++;
138 139
139 if (strncmp(ptr, "size=", 5)) 140 if (strncmp(ptr, "size=", 5))
140 return -EINVAL; 141 return -EINVAL;
@@ -142,14 +143,11 @@ mtrr_write(struct file *file, const char __user *buf, size_t len, loff_t * ppos)
142 size = simple_strtoull(ptr + 5, &ptr, 0); 143 size = simple_strtoull(ptr + 5, &ptr, 0);
143 if ((base & 0xfff) || (size & 0xfff)) 144 if ((base & 0xfff) || (size & 0xfff))
144 return -EINVAL; 145 return -EINVAL;
145 while (isspace(*ptr)) 146 ptr = skip_spaces(ptr);
146 ptr++;
147 147
148 if (strncmp(ptr, "type=", 5)) 148 if (strncmp(ptr, "type=", 5))
149 return -EINVAL; 149 return -EINVAL;
150 ptr += 5; 150 ptr = skip_spaces(ptr + 5);
151 while (isspace(*ptr))
152 ptr++;
153 151
154 for (i = 0; i < MTRR_NUM_TYPES; ++i) { 152 for (i = 0; i < MTRR_NUM_TYPES; ++i) {
155 if (strcmp(ptr, mtrr_strings[i])) 153 if (strcmp(ptr, mtrr_strings[i]))
diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c
index 84e83de54575..79556bd9b602 100644
--- a/arch/x86/kernel/cpu/mtrr/main.c
+++ b/arch/x86/kernel/cpu/mtrr/main.c
@@ -60,14 +60,14 @@ static DEFINE_MUTEX(mtrr_mutex);
60u64 size_or_mask, size_and_mask; 60u64 size_or_mask, size_and_mask;
61static bool mtrr_aps_delayed_init; 61static bool mtrr_aps_delayed_init;
62 62
63static struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM]; 63static const struct mtrr_ops *mtrr_ops[X86_VENDOR_NUM];
64 64
65struct mtrr_ops *mtrr_if; 65const struct mtrr_ops *mtrr_if;
66 66
67static void set_mtrr(unsigned int reg, unsigned long base, 67static void set_mtrr(unsigned int reg, unsigned long base,
68 unsigned long size, mtrr_type type); 68 unsigned long size, mtrr_type type);
69 69
70void set_mtrr_ops(struct mtrr_ops *ops) 70void set_mtrr_ops(const struct mtrr_ops *ops)
71{ 71{
72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM) 72 if (ops->vendor && ops->vendor < X86_VENDOR_NUM)
73 mtrr_ops[ops->vendor] = ops; 73 mtrr_ops[ops->vendor] = ops;
@@ -145,6 +145,7 @@ struct set_mtrr_data {
145 145
146/** 146/**
147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs. 147 * ipi_handler - Synchronisation handler. Executed by "other" CPUs.
148 * @info: pointer to mtrr configuration data
148 * 149 *
149 * Returns nothing. 150 * Returns nothing.
150 */ 151 */
diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h
index a501dee9a87a..df5e41f31a27 100644
--- a/arch/x86/kernel/cpu/mtrr/mtrr.h
+++ b/arch/x86/kernel/cpu/mtrr/mtrr.h
@@ -32,7 +32,7 @@ extern int generic_get_free_region(unsigned long base, unsigned long size,
32extern int generic_validate_add_page(unsigned long base, unsigned long size, 32extern int generic_validate_add_page(unsigned long base, unsigned long size,
33 unsigned int type); 33 unsigned int type);
34 34
35extern struct mtrr_ops generic_mtrr_ops; 35extern const struct mtrr_ops generic_mtrr_ops;
36 36
37extern int positive_have_wrcomb(void); 37extern int positive_have_wrcomb(void);
38 38
@@ -53,10 +53,10 @@ void fill_mtrr_var_range(unsigned int index,
53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi); 53 u32 base_lo, u32 base_hi, u32 mask_lo, u32 mask_hi);
54void get_mtrr_state(void); 54void get_mtrr_state(void);
55 55
56extern void set_mtrr_ops(struct mtrr_ops *ops); 56extern void set_mtrr_ops(const struct mtrr_ops *ops);
57 57
58extern u64 size_or_mask, size_and_mask; 58extern u64 size_or_mask, size_and_mask;
59extern struct mtrr_ops *mtrr_if; 59extern const struct mtrr_ops *mtrr_if;
60 60
61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd) 61#define is_cpu(vnd) (mtrr_if && mtrr_if->vendor == X86_VENDOR_##vnd)
62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1) 62#define use_intel() (mtrr_if && mtrr_if->use_intel_if == 1)
diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c
deleted file mode 100644
index dfc80b4e6b0d..000000000000
--- a/arch/x86/kernel/cpu/mtrr/state.c
+++ /dev/null
@@ -1,94 +0,0 @@
1#include <linux/init.h>
2#include <linux/io.h>
3#include <linux/mm.h>
4
5#include <asm/processor-cyrix.h>
6#include <asm/processor-flags.h>
7#include <asm/mtrr.h>
8#include <asm/msr.h>
9
10#include "mtrr.h"
11
12/* Put the processor into a state where MTRRs can be safely set */
13void set_mtrr_prepare_save(struct set_mtrr_context *ctxt)
14{
15 unsigned int cr0;
16
17 /* Disable interrupts locally */
18 local_irq_save(ctxt->flags);
19
20 if (use_intel() || is_cpu(CYRIX)) {
21
22 /* Save value of CR4 and clear Page Global Enable (bit 7) */
23 if (cpu_has_pge) {
24 ctxt->cr4val = read_cr4();
25 write_cr4(ctxt->cr4val & ~X86_CR4_PGE);
26 }
27
28 /*
29 * Disable and flush caches. Note that wbinvd flushes the TLBs
30 * as a side-effect
31 */
32 cr0 = read_cr0() | X86_CR0_CD;
33 wbinvd();
34 write_cr0(cr0);
35 wbinvd();
36
37 if (use_intel()) {
38 /* Save MTRR state */
39 rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi);
40 } else {
41 /*
42 * Cyrix ARRs -
43 * everything else were excluded at the top
44 */
45 ctxt->ccr3 = getCx86(CX86_CCR3);
46 }
47 }
48}
49
50void set_mtrr_cache_disable(struct set_mtrr_context *ctxt)
51{
52 if (use_intel()) {
53 /* Disable MTRRs, and set the default type to uncached */
54 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL,
55 ctxt->deftype_hi);
56 } else {
57 if (is_cpu(CYRIX)) {
58 /* Cyrix ARRs - everything else were excluded at the top */
59 setCx86(CX86_CCR3, (ctxt->ccr3 & 0x0f) | 0x10);
60 }
61 }
62}
63
64/* Restore the processor after a set_mtrr_prepare */
65void set_mtrr_done(struct set_mtrr_context *ctxt)
66{
67 if (use_intel() || is_cpu(CYRIX)) {
68
69 /* Flush caches and TLBs */
70 wbinvd();
71
72 /* Restore MTRRdefType */
73 if (use_intel()) {
74 /* Intel (P6) standard MTRRs */
75 mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo,
76 ctxt->deftype_hi);
77 } else {
78 /*
79 * Cyrix ARRs -
80 * everything else was excluded at the top
81 */
82 setCx86(CX86_CCR3, ctxt->ccr3);
83 }
84
85 /* Enable caches */
86 write_cr0(read_cr0() & 0xbfffffff);
87
88 /* Restore value of CR4 */
89 if (cpu_has_pge)
90 write_cr4(ctxt->cr4val);
91 }
92 /* Re-enable interrupts locally (if enabled previously) */
93 local_irq_restore(ctxt->flags);
94}
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b5801c311846..db5bdc8addf8 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -7,6 +7,7 @@
7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter 7 * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter
8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com> 8 * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra <pzijlstr@redhat.com>
9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com> 9 * Copyright (C) 2009 Intel Corporation, <markus.t.metzger@intel.com>
10 * Copyright (C) 2009 Google, Inc., Stephane Eranian
10 * 11 *
11 * For licencing details see kernel-base/COPYING 12 * For licencing details see kernel-base/COPYING
12 */ 13 */
@@ -20,12 +21,15 @@
20#include <linux/kdebug.h> 21#include <linux/kdebug.h>
21#include <linux/sched.h> 22#include <linux/sched.h>
22#include <linux/uaccess.h> 23#include <linux/uaccess.h>
24#include <linux/slab.h>
23#include <linux/highmem.h> 25#include <linux/highmem.h>
24#include <linux/cpu.h> 26#include <linux/cpu.h>
27#include <linux/bitops.h>
25 28
26#include <asm/apic.h> 29#include <asm/apic.h>
27#include <asm/stacktrace.h> 30#include <asm/stacktrace.h>
28#include <asm/nmi.h> 31#include <asm/nmi.h>
32#include <asm/compat.h>
29 33
30static u64 perf_event_mask __read_mostly; 34static u64 perf_event_mask __read_mostly;
31 35
@@ -68,15 +72,60 @@ struct debug_store {
68 u64 pebs_event_reset[MAX_PEBS_EVENTS]; 72 u64 pebs_event_reset[MAX_PEBS_EVENTS];
69}; 73};
70 74
75struct event_constraint {
76 union {
77 unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
78 u64 idxmsk64;
79 };
80 u64 code;
81 u64 cmask;
82 int weight;
83};
84
85struct amd_nb {
86 int nb_id; /* NorthBridge id */
87 int refcnt; /* reference count */
88 struct perf_event *owners[X86_PMC_IDX_MAX];
89 struct event_constraint event_constraints[X86_PMC_IDX_MAX];
90};
91
71struct cpu_hw_events { 92struct cpu_hw_events {
72 struct perf_event *events[X86_PMC_IDX_MAX]; 93 struct perf_event *events[X86_PMC_IDX_MAX]; /* in counter order */
73 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
74 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; 94 unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
75 unsigned long interrupts; 95 unsigned long interrupts;
76 int enabled; 96 int enabled;
77 struct debug_store *ds; 97 struct debug_store *ds;
98
99 int n_events;
100 int n_added;
101 int assign[X86_PMC_IDX_MAX]; /* event to counter assignment */
102 u64 tags[X86_PMC_IDX_MAX];
103 struct perf_event *event_list[X86_PMC_IDX_MAX]; /* in enabled order */
104 struct amd_nb *amd_nb;
78}; 105};
79 106
107#define __EVENT_CONSTRAINT(c, n, m, w) {\
108 { .idxmsk64 = (n) }, \
109 .code = (c), \
110 .cmask = (m), \
111 .weight = (w), \
112}
113
114#define EVENT_CONSTRAINT(c, n, m) \
115 __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n))
116
117#define INTEL_EVENT_CONSTRAINT(c, n) \
118 EVENT_CONSTRAINT(c, n, INTEL_ARCH_EVTSEL_MASK)
119
120#define FIXED_EVENT_CONSTRAINT(c, n) \
121 EVENT_CONSTRAINT(c, (1ULL << (32+n)), INTEL_ARCH_FIXED_MASK)
122
123#define EVENT_CONSTRAINT_END \
124 EVENT_CONSTRAINT(0, 0, 0)
125
126#define for_each_event_constraint(e, c) \
127 for ((e) = (c); (e)->cmask; (e)++)
128
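
The new struct event_constraint ties an event selector (code, compared under cmask) to the set of counters it may use (idxmsk/idxmsk64), with weight precomputed as the number of usable counters. A hedged sketch of how a table built from these macros would be defined and consulted, matching the get_event_constraints() callback signature added to struct x86_pmu below; it relies on the macros and types from this file, the entries and the 0x3 counter mask are purely illustrative, and NULL here just means "no event-specific constraint" in the sketch:

/* Illustrative table: 0xc0 restricted to counters 0-1, 0x3c placed on
 * fixed counter 1 (FIXED_EVENT_CONSTRAINT sets bit 32+n of idxmsk). */
static struct event_constraint demo_event_constraints[] =
{
	INTEL_EVENT_CONSTRAINT(0xc0, 0x3),
	FIXED_EVENT_CONSTRAINT(0x3c, 1),
	EVENT_CONSTRAINT_END
};

static struct event_constraint *
demo_get_event_constraints(struct cpu_hw_events *cpuc,
			   struct perf_event *event)
{
	struct event_constraint *c;

	for_each_event_constraint(c, demo_event_constraints) {
		if ((event->hw.config & c->cmask) == c->code)
			return c;
	}

	return NULL;	/* sketch only: no specific constraint found */
}
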
80/* 129/*
81 * struct x86_pmu - generic x86 pmu 130 * struct x86_pmu - generic x86 pmu
82 */ 131 */
@@ -86,8 +135,8 @@ struct x86_pmu {
86 int (*handle_irq)(struct pt_regs *); 135 int (*handle_irq)(struct pt_regs *);
87 void (*disable_all)(void); 136 void (*disable_all)(void);
88 void (*enable_all)(void); 137 void (*enable_all)(void);
89 void (*enable)(struct hw_perf_event *, int); 138 void (*enable)(struct perf_event *);
90 void (*disable)(struct hw_perf_event *, int); 139 void (*disable)(struct perf_event *);
91 unsigned eventsel; 140 unsigned eventsel;
92 unsigned perfctr; 141 unsigned perfctr;
93 u64 (*event_map)(int); 142 u64 (*event_map)(int);
@@ -102,78 +151,28 @@ struct x86_pmu {
102 u64 intel_ctrl; 151 u64 intel_ctrl;
103 void (*enable_bts)(u64 config); 152 void (*enable_bts)(u64 config);
104 void (*disable_bts)(void); 153 void (*disable_bts)(void);
105};
106 154
107static struct x86_pmu x86_pmu __read_mostly; 155 struct event_constraint *
156 (*get_event_constraints)(struct cpu_hw_events *cpuc,
157 struct perf_event *event);
108 158
109static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = { 159 void (*put_event_constraints)(struct cpu_hw_events *cpuc,
110 .enabled = 1, 160 struct perf_event *event);
111}; 161 struct event_constraint *event_constraints;
112 162
113/* 163 int (*cpu_prepare)(int cpu);
114 * Not sure about some of these 164 void (*cpu_starting)(int cpu);
115 */ 165 void (*cpu_dying)(int cpu);
116static const u64 p6_perfmon_event_map[] = 166 void (*cpu_dead)(int cpu);
117{
118 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
119 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
120 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
121 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
122 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
123 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
124 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
125}; 167};
126 168
127static u64 p6_pmu_event_map(int hw_event) 169static struct x86_pmu x86_pmu __read_mostly;
128{
129 return p6_perfmon_event_map[hw_event];
130}
131
132/*
133 * Event setting that is specified not to count anything.
134 * We use this to effectively disable a counter.
135 *
136 * L2_RQSTS with 0 MESI unit mask.
137 */
138#define P6_NOP_EVENT 0x0000002EULL
139
140static u64 p6_pmu_raw_event(u64 hw_event)
141{
142#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
143#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
144#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
145#define P6_EVNTSEL_INV_MASK 0x00800000ULL
146#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
147
148#define P6_EVNTSEL_MASK \
149 (P6_EVNTSEL_EVENT_MASK | \
150 P6_EVNTSEL_UNIT_MASK | \
151 P6_EVNTSEL_EDGE_MASK | \
152 P6_EVNTSEL_INV_MASK | \
153 P6_EVNTSEL_REG_MASK)
154
155 return hw_event & P6_EVNTSEL_MASK;
156}
157
158 170
159/* 171static DEFINE_PER_CPU(struct cpu_hw_events, cpu_hw_events) = {
160 * Intel PerfMon v3. Used on Core2 and later. 172 .enabled = 1,
161 */
162static const u64 intel_perfmon_event_map[] =
163{
164 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
165 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
166 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
167 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
168 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
169 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
170 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
171}; 173};
172 174
173static u64 intel_pmu_event_map(int hw_event) 175static int x86_perf_event_set_period(struct perf_event *event);
174{
175 return intel_perfmon_event_map[hw_event];
176}
177 176
178/* 177/*
179 * Generalized hw caching related hw_event table, filled 178 * Generalized hw caching related hw_event table, filled
@@ -190,435 +189,18 @@ static u64 __read_mostly hw_cache_event_ids
190 [PERF_COUNT_HW_CACHE_OP_MAX] 189 [PERF_COUNT_HW_CACHE_OP_MAX]
191 [PERF_COUNT_HW_CACHE_RESULT_MAX]; 190 [PERF_COUNT_HW_CACHE_RESULT_MAX];
192 191
193static const u64 nehalem_hw_cache_event_ids
194 [PERF_COUNT_HW_CACHE_MAX]
195 [PERF_COUNT_HW_CACHE_OP_MAX]
196 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
197{
198 [ C(L1D) ] = {
199 [ C(OP_READ) ] = {
200 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
201 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
202 },
203 [ C(OP_WRITE) ] = {
204 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
205 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
206 },
207 [ C(OP_PREFETCH) ] = {
208 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
209 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
210 },
211 },
212 [ C(L1I ) ] = {
213 [ C(OP_READ) ] = {
214 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
215 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
216 },
217 [ C(OP_WRITE) ] = {
218 [ C(RESULT_ACCESS) ] = -1,
219 [ C(RESULT_MISS) ] = -1,
220 },
221 [ C(OP_PREFETCH) ] = {
222 [ C(RESULT_ACCESS) ] = 0x0,
223 [ C(RESULT_MISS) ] = 0x0,
224 },
225 },
226 [ C(LL ) ] = {
227 [ C(OP_READ) ] = {
228 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
229 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
230 },
231 [ C(OP_WRITE) ] = {
232 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
233 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
234 },
235 [ C(OP_PREFETCH) ] = {
236 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
237 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
238 },
239 },
240 [ C(DTLB) ] = {
241 [ C(OP_READ) ] = {
242 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
243 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
244 },
245 [ C(OP_WRITE) ] = {
246 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
247 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
248 },
249 [ C(OP_PREFETCH) ] = {
250 [ C(RESULT_ACCESS) ] = 0x0,
251 [ C(RESULT_MISS) ] = 0x0,
252 },
253 },
254 [ C(ITLB) ] = {
255 [ C(OP_READ) ] = {
256 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
257 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
258 },
259 [ C(OP_WRITE) ] = {
260 [ C(RESULT_ACCESS) ] = -1,
261 [ C(RESULT_MISS) ] = -1,
262 },
263 [ C(OP_PREFETCH) ] = {
264 [ C(RESULT_ACCESS) ] = -1,
265 [ C(RESULT_MISS) ] = -1,
266 },
267 },
268 [ C(BPU ) ] = {
269 [ C(OP_READ) ] = {
270 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
271 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
272 },
273 [ C(OP_WRITE) ] = {
274 [ C(RESULT_ACCESS) ] = -1,
275 [ C(RESULT_MISS) ] = -1,
276 },
277 [ C(OP_PREFETCH) ] = {
278 [ C(RESULT_ACCESS) ] = -1,
279 [ C(RESULT_MISS) ] = -1,
280 },
281 },
282};
283
284static const u64 core2_hw_cache_event_ids
285 [PERF_COUNT_HW_CACHE_MAX]
286 [PERF_COUNT_HW_CACHE_OP_MAX]
287 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
288{
289 [ C(L1D) ] = {
290 [ C(OP_READ) ] = {
291 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
292 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
293 },
294 [ C(OP_WRITE) ] = {
295 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
296 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
297 },
298 [ C(OP_PREFETCH) ] = {
299 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
300 [ C(RESULT_MISS) ] = 0,
301 },
302 },
303 [ C(L1I ) ] = {
304 [ C(OP_READ) ] = {
305 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
306 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
307 },
308 [ C(OP_WRITE) ] = {
309 [ C(RESULT_ACCESS) ] = -1,
310 [ C(RESULT_MISS) ] = -1,
311 },
312 [ C(OP_PREFETCH) ] = {
313 [ C(RESULT_ACCESS) ] = 0,
314 [ C(RESULT_MISS) ] = 0,
315 },
316 },
317 [ C(LL ) ] = {
318 [ C(OP_READ) ] = {
319 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
320 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
321 },
322 [ C(OP_WRITE) ] = {
323 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
324 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
325 },
326 [ C(OP_PREFETCH) ] = {
327 [ C(RESULT_ACCESS) ] = 0,
328 [ C(RESULT_MISS) ] = 0,
329 },
330 },
331 [ C(DTLB) ] = {
332 [ C(OP_READ) ] = {
333 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
334 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
335 },
336 [ C(OP_WRITE) ] = {
337 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
338 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
339 },
340 [ C(OP_PREFETCH) ] = {
341 [ C(RESULT_ACCESS) ] = 0,
342 [ C(RESULT_MISS) ] = 0,
343 },
344 },
345 [ C(ITLB) ] = {
346 [ C(OP_READ) ] = {
347 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
348 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
349 },
350 [ C(OP_WRITE) ] = {
351 [ C(RESULT_ACCESS) ] = -1,
352 [ C(RESULT_MISS) ] = -1,
353 },
354 [ C(OP_PREFETCH) ] = {
355 [ C(RESULT_ACCESS) ] = -1,
356 [ C(RESULT_MISS) ] = -1,
357 },
358 },
359 [ C(BPU ) ] = {
360 [ C(OP_READ) ] = {
361 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
362 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
363 },
364 [ C(OP_WRITE) ] = {
365 [ C(RESULT_ACCESS) ] = -1,
366 [ C(RESULT_MISS) ] = -1,
367 },
368 [ C(OP_PREFETCH) ] = {
369 [ C(RESULT_ACCESS) ] = -1,
370 [ C(RESULT_MISS) ] = -1,
371 },
372 },
373};
374
375static const u64 atom_hw_cache_event_ids
376 [PERF_COUNT_HW_CACHE_MAX]
377 [PERF_COUNT_HW_CACHE_OP_MAX]
378 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
379{
380 [ C(L1D) ] = {
381 [ C(OP_READ) ] = {
382 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
383 [ C(RESULT_MISS) ] = 0,
384 },
385 [ C(OP_WRITE) ] = {
386 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
387 [ C(RESULT_MISS) ] = 0,
388 },
389 [ C(OP_PREFETCH) ] = {
390 [ C(RESULT_ACCESS) ] = 0x0,
391 [ C(RESULT_MISS) ] = 0,
392 },
393 },
394 [ C(L1I ) ] = {
395 [ C(OP_READ) ] = {
396 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
397 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
398 },
399 [ C(OP_WRITE) ] = {
400 [ C(RESULT_ACCESS) ] = -1,
401 [ C(RESULT_MISS) ] = -1,
402 },
403 [ C(OP_PREFETCH) ] = {
404 [ C(RESULT_ACCESS) ] = 0,
405 [ C(RESULT_MISS) ] = 0,
406 },
407 },
408 [ C(LL ) ] = {
409 [ C(OP_READ) ] = {
410 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
411 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
412 },
413 [ C(OP_WRITE) ] = {
414 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
415 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
416 },
417 [ C(OP_PREFETCH) ] = {
418 [ C(RESULT_ACCESS) ] = 0,
419 [ C(RESULT_MISS) ] = 0,
420 },
421 },
422 [ C(DTLB) ] = {
423 [ C(OP_READ) ] = {
424 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
425 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
426 },
427 [ C(OP_WRITE) ] = {
428 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
429 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
430 },
431 [ C(OP_PREFETCH) ] = {
432 [ C(RESULT_ACCESS) ] = 0,
433 [ C(RESULT_MISS) ] = 0,
434 },
435 },
436 [ C(ITLB) ] = {
437 [ C(OP_READ) ] = {
438 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
439 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
440 },
441 [ C(OP_WRITE) ] = {
442 [ C(RESULT_ACCESS) ] = -1,
443 [ C(RESULT_MISS) ] = -1,
444 },
445 [ C(OP_PREFETCH) ] = {
446 [ C(RESULT_ACCESS) ] = -1,
447 [ C(RESULT_MISS) ] = -1,
448 },
449 },
450 [ C(BPU ) ] = {
451 [ C(OP_READ) ] = {
452 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
453 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
454 },
455 [ C(OP_WRITE) ] = {
456 [ C(RESULT_ACCESS) ] = -1,
457 [ C(RESULT_MISS) ] = -1,
458 },
459 [ C(OP_PREFETCH) ] = {
460 [ C(RESULT_ACCESS) ] = -1,
461 [ C(RESULT_MISS) ] = -1,
462 },
463 },
464};
465
466static u64 intel_pmu_raw_event(u64 hw_event)
467{
468#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
469#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
470#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
471#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
472#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
473
474#define CORE_EVNTSEL_MASK \
475 (CORE_EVNTSEL_EVENT_MASK | \
476 CORE_EVNTSEL_UNIT_MASK | \
477 CORE_EVNTSEL_EDGE_MASK | \
478 CORE_EVNTSEL_INV_MASK | \
479 CORE_EVNTSEL_REG_MASK)
480
481 return hw_event & CORE_EVNTSEL_MASK;
482}
483
484static const u64 amd_hw_cache_event_ids
485 [PERF_COUNT_HW_CACHE_MAX]
486 [PERF_COUNT_HW_CACHE_OP_MAX]
487 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
488{
489 [ C(L1D) ] = {
490 [ C(OP_READ) ] = {
491 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
492 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
493 },
494 [ C(OP_WRITE) ] = {
495 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
496 [ C(RESULT_MISS) ] = 0,
497 },
498 [ C(OP_PREFETCH) ] = {
499 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
500 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
501 },
502 },
503 [ C(L1I ) ] = {
504 [ C(OP_READ) ] = {
505 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
506 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
507 },
508 [ C(OP_WRITE) ] = {
509 [ C(RESULT_ACCESS) ] = -1,
510 [ C(RESULT_MISS) ] = -1,
511 },
512 [ C(OP_PREFETCH) ] = {
513 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
514 [ C(RESULT_MISS) ] = 0,
515 },
516 },
517 [ C(LL ) ] = {
518 [ C(OP_READ) ] = {
519 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
520 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
521 },
522 [ C(OP_WRITE) ] = {
523 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
524 [ C(RESULT_MISS) ] = 0,
525 },
526 [ C(OP_PREFETCH) ] = {
527 [ C(RESULT_ACCESS) ] = 0,
528 [ C(RESULT_MISS) ] = 0,
529 },
530 },
531 [ C(DTLB) ] = {
532 [ C(OP_READ) ] = {
533 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
534 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DTLB Miss */
535 },
536 [ C(OP_WRITE) ] = {
537 [ C(RESULT_ACCESS) ] = 0,
538 [ C(RESULT_MISS) ] = 0,
539 },
540 [ C(OP_PREFETCH) ] = {
541 [ C(RESULT_ACCESS) ] = 0,
542 [ C(RESULT_MISS) ] = 0,
543 },
544 },
545 [ C(ITLB) ] = {
546 [ C(OP_READ) ] = {
547 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
548 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
549 },
550 [ C(OP_WRITE) ] = {
551 [ C(RESULT_ACCESS) ] = -1,
552 [ C(RESULT_MISS) ] = -1,
553 },
554 [ C(OP_PREFETCH) ] = {
555 [ C(RESULT_ACCESS) ] = -1,
556 [ C(RESULT_MISS) ] = -1,
557 },
558 },
559 [ C(BPU ) ] = {
560 [ C(OP_READ) ] = {
561 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
562 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
563 },
564 [ C(OP_WRITE) ] = {
565 [ C(RESULT_ACCESS) ] = -1,
566 [ C(RESULT_MISS) ] = -1,
567 },
568 [ C(OP_PREFETCH) ] = {
569 [ C(RESULT_ACCESS) ] = -1,
570 [ C(RESULT_MISS) ] = -1,
571 },
572 },
573};
574
575/*
576 * AMD Performance Monitor K7 and later.
577 */
578static const u64 amd_perfmon_event_map[] =
579{
580 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
581 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
582 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
583 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
584 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
585 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
586};
587
588static u64 amd_pmu_event_map(int hw_event)
589{
590 return amd_perfmon_event_map[hw_event];
591}
592
593static u64 amd_pmu_raw_event(u64 hw_event)
594{
595#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL
596#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
597#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
598#define K7_EVNTSEL_INV_MASK 0x000800000ULL
599#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
600
601#define K7_EVNTSEL_MASK \
602 (K7_EVNTSEL_EVENT_MASK | \
603 K7_EVNTSEL_UNIT_MASK | \
604 K7_EVNTSEL_EDGE_MASK | \
605 K7_EVNTSEL_INV_MASK | \
606 K7_EVNTSEL_REG_MASK)
607
608 return hw_event & K7_EVNTSEL_MASK;
609}
610
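The two raw_event() filters above pass through only the architecturally defined EVNTSEL fields (event select, unit mask, edge, invert and counter mask); everything else, notably the enable and interrupt bits, stays under kernel control. A minimal user-space sketch of the Core variant, with an invented example value:

	#include <stdint.h>
	#include <stdio.h>

	#define CORE_EVNTSEL_EVENT_MASK		0x000000FFULL
	#define CORE_EVNTSEL_UNIT_MASK		0x0000FF00ULL
	#define CORE_EVNTSEL_EDGE_MASK		0x00040000ULL
	#define CORE_EVNTSEL_INV_MASK		0x00800000ULL
	#define CORE_EVNTSEL_REG_MASK		0xFF000000ULL

	/* keep only the fields a user may specify in a raw event */
	static uint64_t core_raw_event(uint64_t hw_event)
	{
		return hw_event & (CORE_EVNTSEL_EVENT_MASK |
				   CORE_EVNTSEL_UNIT_MASK |
				   CORE_EVNTSEL_EDGE_MASK |
				   CORE_EVNTSEL_INV_MASK |
				   CORE_EVNTSEL_REG_MASK);
	}

	int main(void)
	{
		/* umask 0x41, event 0x2e, plus a stray enable bit (bit 22) */
		uint64_t cfg = 0x0040412eULL;

		printf("raw config %#llx is filtered to %#llx\n",
		       (unsigned long long)cfg,
		       (unsigned long long)core_raw_event(cfg));
		return 0;
	}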
611/* 192/*
612 * Propagate event elapsed time into the generic event. 193 * Propagate event elapsed time into the generic event.
613 * Can only be executed on the CPU where the event is active. 194 * Can only be executed on the CPU where the event is active.
614 * Returns the delta events processed. 195 * Returns the delta events processed.
615 */ 196 */
616static u64 197static u64
617x86_perf_event_update(struct perf_event *event, 198x86_perf_event_update(struct perf_event *event)
618 struct hw_perf_event *hwc, int idx)
619{ 199{
200 struct hw_perf_event *hwc = &event->hw;
620 int shift = 64 - x86_pmu.event_bits; 201 int shift = 64 - x86_pmu.event_bits;
621 u64 prev_raw_count, new_raw_count; 202 u64 prev_raw_count, new_raw_count;
203 int idx = hwc->idx;
622 s64 delta; 204 s64 delta;
623 205
624 if (idx == X86_PMC_IDX_FIXED_BTS) 206 if (idx == X86_PMC_IDX_FIXED_BTS)
@@ -718,7 +300,7 @@ static inline bool bts_available(void)
718 return x86_pmu.enable_bts != NULL; 300 return x86_pmu.enable_bts != NULL;
719} 301}
720 302
721static inline void init_debug_store_on_cpu(int cpu) 303static void init_debug_store_on_cpu(int cpu)
722{ 304{
723 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; 305 struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds;
724 306
@@ -730,7 +312,7 @@ static inline void init_debug_store_on_cpu(int cpu)
730 (u32)((u64)(unsigned long)ds >> 32)); 312 (u32)((u64)(unsigned long)ds >> 32));
731} 313}
732 314
733static inline void fini_debug_store_on_cpu(int cpu) 315static void fini_debug_store_on_cpu(int cpu)
734{ 316{
735 if (!per_cpu(cpu_hw_events, cpu).ds) 317 if (!per_cpu(cpu_hw_events, cpu).ds)
736 return; 318 return;
@@ -859,42 +441,6 @@ set_ext_hw_attr(struct hw_perf_event *hwc, struct perf_event_attr *attr)
859 return 0; 441 return 0;
860} 442}
861 443
862static void intel_pmu_enable_bts(u64 config)
863{
864 unsigned long debugctlmsr;
865
866 debugctlmsr = get_debugctlmsr();
867
868 debugctlmsr |= X86_DEBUGCTL_TR;
869 debugctlmsr |= X86_DEBUGCTL_BTS;
870 debugctlmsr |= X86_DEBUGCTL_BTINT;
871
872 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
873 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
874
875 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
876 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
877
878 update_debugctlmsr(debugctlmsr);
879}
880
881static void intel_pmu_disable_bts(void)
882{
883 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
884 unsigned long debugctlmsr;
885
886 if (!cpuc->ds)
887 return;
888
889 debugctlmsr = get_debugctlmsr();
890
891 debugctlmsr &=
892 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
893 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
894
895 update_debugctlmsr(debugctlmsr);
896}
897
898/* 444/*
899 * Setup the hardware configuration for a given attr_type 445 * Setup the hardware configuration for a given attr_type
900 */ 446 */
@@ -932,6 +478,10 @@ static int __hw_perf_event_init(struct perf_event *event)
932 */ 478 */
933 hwc->config = ARCH_PERFMON_EVENTSEL_INT; 479 hwc->config = ARCH_PERFMON_EVENTSEL_INT;
934 480
481 hwc->idx = -1;
482 hwc->last_cpu = -1;
483 hwc->last_tag = ~0ULL;
484
935 /* 485 /*
936 * Count user and OS events unless requested not to. 486 * Count user and OS events unless requested not to.
937 */ 487 */
@@ -960,6 +510,9 @@ static int __hw_perf_event_init(struct perf_event *event)
960 */ 510 */
961 if (attr->type == PERF_TYPE_RAW) { 511 if (attr->type == PERF_TYPE_RAW) {
962 hwc->config |= x86_pmu.raw_event(attr->config); 512 hwc->config |= x86_pmu.raw_event(attr->config);
513 if ((hwc->config & ARCH_PERFMON_EVENTSEL_ANY) &&
514 perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
515 return -EACCES;
963 return 0; 516 return 0;
964 } 517 }
965 518
@@ -999,216 +552,314 @@ static int __hw_perf_event_init(struct perf_event *event)
999 return 0; 552 return 0;
1000} 553}
1001 554
1002static void p6_pmu_disable_all(void) 555static void x86_pmu_disable_all(void)
1003{ 556{
1004 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 557 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1005 u64 val; 558 int idx;
1006
1007 if (!cpuc->enabled)
1008 return;
1009 559
1010 cpuc->enabled = 0; 560 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1011 barrier(); 561 u64 val;
1012 562
1013 /* p6 only has one enable register */ 563 if (!test_bit(idx, cpuc->active_mask))
1014 rdmsrl(MSR_P6_EVNTSEL0, val); 564 continue;
1015 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 565 rdmsrl(x86_pmu.eventsel + idx, val);
1016 wrmsrl(MSR_P6_EVNTSEL0, val); 566 if (!(val & ARCH_PERFMON_EVENTSEL_ENABLE))
567 continue;
568 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
569 wrmsrl(x86_pmu.eventsel + idx, val);
570 }
1017} 571}
1018 572
1019static void intel_pmu_disable_all(void) 573void hw_perf_disable(void)
1020{ 574{
1021 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 575 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1022 576
577 if (!x86_pmu_initialized())
578 return;
579
1023 if (!cpuc->enabled) 580 if (!cpuc->enabled)
1024 return; 581 return;
1025 582
583 cpuc->n_added = 0;
1026 cpuc->enabled = 0; 584 cpuc->enabled = 0;
1027 barrier(); 585 barrier();
1028 586
1029 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); 587 x86_pmu.disable_all();
1030
1031 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
1032 intel_pmu_disable_bts();
1033} 588}
1034 589
1035static void amd_pmu_disable_all(void) 590static void x86_pmu_enable_all(void)
1036{ 591{
1037 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 592 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1038 int idx; 593 int idx;
1039 594
1040 if (!cpuc->enabled)
1041 return;
1042
1043 cpuc->enabled = 0;
1044 /*
1045 * ensure we write the disable before we start disabling the
1046 * events proper, so that amd_pmu_enable_event() does the
1047 * right thing.
1048 */
1049 barrier();
1050
1051 for (idx = 0; idx < x86_pmu.num_events; idx++) { 595 for (idx = 0; idx < x86_pmu.num_events; idx++) {
596 struct perf_event *event = cpuc->events[idx];
1052 u64 val; 597 u64 val;
1053 598
1054 if (!test_bit(idx, cpuc->active_mask)) 599 if (!test_bit(idx, cpuc->active_mask))
1055 continue; 600 continue;
1056 rdmsrl(MSR_K7_EVNTSEL0 + idx, val); 601
1057 if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) 602 val = event->hw.config;
1058 continue; 603 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
1059 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 604 wrmsrl(x86_pmu.eventsel + idx, val);
1060 wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
1061 } 605 }
1062} 606}
1063 607
1064void hw_perf_disable(void) 608static const struct pmu pmu;
609
610static inline int is_x86_event(struct perf_event *event)
1065{ 611{
1066 if (!x86_pmu_initialized()) 612 return event->pmu == &pmu;
1067 return;
1068 return x86_pmu.disable_all();
1069} 613}
1070 614
1071static void p6_pmu_enable_all(void) 615static int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign)
1072{ 616{
1073 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 617 struct event_constraint *c, *constraints[X86_PMC_IDX_MAX];
1074 unsigned long val; 618 unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
619 int i, j, w, wmax, num = 0;
620 struct hw_perf_event *hwc;
1075 621
1076 if (cpuc->enabled) 622 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1077 return;
1078 623
1079 cpuc->enabled = 1; 624 for (i = 0; i < n; i++) {
1080 barrier(); 625 c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]);
626 constraints[i] = c;
627 }
1081 628
1082 /* p6 only has one enable register */ 629 /*
1083 rdmsrl(MSR_P6_EVNTSEL0, val); 630 * fastpath, try to reuse previous register
1084 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 631 */
1085 wrmsrl(MSR_P6_EVNTSEL0, val); 632 for (i = 0; i < n; i++) {
1086} 633 hwc = &cpuc->event_list[i]->hw;
634 c = constraints[i];
1087 635
1088static void intel_pmu_enable_all(void) 636 /* never assigned */
1089{ 637 if (hwc->idx == -1)
1090 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 638 break;
1091 639
1092 if (cpuc->enabled) 640 /* constraint still honored */
1093 return; 641 if (!test_bit(hwc->idx, c->idxmsk))
642 break;
1094 643
1095 cpuc->enabled = 1; 644 /* not already used */
1096 barrier(); 645 if (test_bit(hwc->idx, used_mask))
646 break;
1097 647
1098 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); 648 __set_bit(hwc->idx, used_mask);
649 if (assign)
650 assign[i] = hwc->idx;
651 }
652 if (i == n)
653 goto done;
1099 654
1100 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) { 655 /*
1101 struct perf_event *event = 656 * begin slow path
1102 cpuc->events[X86_PMC_IDX_FIXED_BTS]; 657 */
1103 658
1104 if (WARN_ON_ONCE(!event)) 659 bitmap_zero(used_mask, X86_PMC_IDX_MAX);
1105 return;
1106 660
1107 intel_pmu_enable_bts(event->hw.config); 661 /*
1108 } 662 * weight = number of possible counters
1109} 663 *
664 * 1 = most constrained, only works on one counter
665 * wmax = least constrained, works on any counter
666 *
667 * assign events to counters starting with most
668 * constrained events.
669 */
670 wmax = x86_pmu.num_events;
1110 671
1111static void amd_pmu_enable_all(void) 672 /*
1112{ 673 * when fixed event counters are present,
1113 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 674 * wmax is incremented by 1 to account
1114 int idx; 675 * for one more choice
676 */
677 if (x86_pmu.num_events_fixed)
678 wmax++;
1115 679
1116 if (cpuc->enabled) 680 for (w = 1, num = n; num && w <= wmax; w++) {
1117 return; 681 /* for each event */
682 for (i = 0; num && i < n; i++) {
683 c = constraints[i];
684 hwc = &cpuc->event_list[i]->hw;
1118 685
1119 cpuc->enabled = 1; 686 if (c->weight != w)
1120 barrier(); 687 continue;
1121 688
1122 for (idx = 0; idx < x86_pmu.num_events; idx++) { 689 for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) {
1123 struct perf_event *event = cpuc->events[idx]; 690 if (!test_bit(j, used_mask))
1124 u64 val; 691 break;
692 }
1125 693
1126 if (!test_bit(idx, cpuc->active_mask)) 694 if (j == X86_PMC_IDX_MAX)
1127 continue; 695 break;
1128 696
1129 val = event->hw.config; 697 __set_bit(j, used_mask);
1130 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 698
1131 wrmsrl(MSR_K7_EVNTSEL0 + idx, val); 699 if (assign)
700 assign[i] = j;
701 num--;
702 }
1132 } 703 }
704done:
705 /*
706 * scheduling failed or is just a simulation,
707 * free resources if necessary
708 */
709 if (!assign || num) {
710 for (i = 0; i < n; i++) {
711 if (x86_pmu.put_event_constraints)
712 x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]);
713 }
714 }
715 return num ? -ENOSPC : 0;
1133} 716}
1134 717
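x86_schedule_events() above assigns events to counters in order of increasing constraint weight, so an event that can only live on one counter is placed before fully flexible ones. A self-contained user-space sketch of that slow path, with an invented counter count and constraint masks (the kernel's fastpath that tries to reuse the previous assignment is omitted here):

	#include <stdio.h>

	#define NCOUNTERS 4

	struct ev {
		const char	*name;
		unsigned int	idxmsk;	/* bitmask of counters the event may use */
	};

	static int weight(unsigned int m)
	{
		return __builtin_popcount(m);
	}

	/* most-constrained-first greedy assignment, as in the slow path above */
	static int schedule(struct ev *evs, int n, int *assign)
	{
		unsigned int used = 0;
		int w, i, j, num = n;

		for (w = 1; num && w <= NCOUNTERS; w++) {
			for (i = 0; num && i < n; i++) {
				if (weight(evs[i].idxmsk) != w)
					continue;
				for (j = 0; j < NCOUNTERS; j++)
					if ((evs[i].idxmsk & (1u << j)) &&
					    !(used & (1u << j)))
						break;
				if (j == NCOUNTERS)
					return -1;	/* the kernel reports -ENOSPC */
				used |= 1u << j;
				assign[i] = j;
				num--;
			}
		}
		return num ? -1 : 0;
	}

	int main(void)
	{
		struct ev evs[] = {
			{ "cycles",       0xf },	/* any counter */
			{ "pinned-evt",   0x4 },	/* only counter 2 */
			{ "instructions", 0xf },	/* any counter */
		};
		int assign[3], i;

		if (schedule(evs, 3, assign))
			return 1;
		for (i = 0; i < 3; i++)
			printf("%-12s -> counter %d\n", evs[i].name, assign[i]);
		return 0;
	}

With these invented masks the pinned event (weight 1) grabs counter 2 first, and the two flexible events then fall back to counters 0 and 1.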
1135void hw_perf_enable(void) 718/*
719 * dogrp: true if we must collect sibling events (group)
720 * returns total number of events and error code
721 */
722static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, bool dogrp)
1136{ 723{
1137 if (!x86_pmu_initialized()) 724 struct perf_event *event;
1138 return; 725 int n, max_count;
1139 x86_pmu.enable_all();
1140}
1141 726
1142static inline u64 intel_pmu_get_status(void) 727 max_count = x86_pmu.num_events + x86_pmu.num_events_fixed;
1143{
1144 u64 status;
1145 728
1146 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); 729 /* current number of events already accepted */
730 n = cpuc->n_events;
1147 731
1148 return status; 732 if (is_x86_event(leader)) {
1149} 733 if (n >= max_count)
734 return -ENOSPC;
735 cpuc->event_list[n] = leader;
736 n++;
737 }
738 if (!dogrp)
739 return n;
1150 740
1151static inline void intel_pmu_ack_status(u64 ack) 741 list_for_each_entry(event, &leader->sibling_list, group_entry) {
1152{ 742 if (!is_x86_event(event) ||
1153 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); 743 event->state <= PERF_EVENT_STATE_OFF)
1154} 744 continue;
1155 745
1156static inline void x86_pmu_enable_event(struct hw_perf_event *hwc, int idx) 746 if (n >= max_count)
1157{ 747 return -ENOSPC;
1158 (void)checking_wrmsrl(hwc->config_base + idx,
1159 hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
1160}
1161 748
1162static inline void x86_pmu_disable_event(struct hw_perf_event *hwc, int idx) 749 cpuc->event_list[n] = event;
1163{ 750 n++;
1164 (void)checking_wrmsrl(hwc->config_base + idx, hwc->config); 751 }
752 return n;
1165} 753}
1166 754
1167static inline void 755static inline void x86_assign_hw_event(struct perf_event *event,
1168intel_pmu_disable_fixed(struct hw_perf_event *hwc, int __idx) 756 struct cpu_hw_events *cpuc, int i)
1169{ 757{
1170 int idx = __idx - X86_PMC_IDX_FIXED; 758 struct hw_perf_event *hwc = &event->hw;
1171 u64 ctrl_val, mask;
1172 759
1173 mask = 0xfULL << (idx * 4); 760 hwc->idx = cpuc->assign[i];
761 hwc->last_cpu = smp_processor_id();
762 hwc->last_tag = ++cpuc->tags[i];
1174 763
1175 rdmsrl(hwc->config_base, ctrl_val); 764 if (hwc->idx == X86_PMC_IDX_FIXED_BTS) {
1176 ctrl_val &= ~mask; 765 hwc->config_base = 0;
1177 (void)checking_wrmsrl(hwc->config_base, ctrl_val); 766 hwc->event_base = 0;
767 } else if (hwc->idx >= X86_PMC_IDX_FIXED) {
768 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
769 /*
770 * We set it so that event_base + idx in wrmsr/rdmsr maps to
771 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
772 */
773 hwc->event_base =
774 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
775 } else {
776 hwc->config_base = x86_pmu.eventsel;
777 hwc->event_base = x86_pmu.perfctr;
778 }
1178} 779}
1179 780
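The event_base bias in x86_assign_hw_event() is worth spelling out: for fixed counters it is chosen so that the generic "event_base + idx" access used elsewhere lands on MSR_ARCH_PERFMON_FIXED_CTR0..2. A quick sketch using the customary constants (0x309 and an index base of 32 are assumptions of this example, not taken from the patch):

	#include <stdio.h>

	#define MSR_ARCH_PERFMON_FIXED_CTR0	0x309	/* assumed architectural value */
	#define X86_PMC_IDX_FIXED		32	/* assumed first fixed-counter index */

	int main(void)
	{
		unsigned int event_base =
			MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
		int idx;

		/* fixed counters occupy the generic index space at 32, 33, 34 */
		for (idx = X86_PMC_IDX_FIXED; idx < X86_PMC_IDX_FIXED + 3; idx++)
			printf("idx %d -> MSR %#x\n", idx, event_base + idx);
		return 0;
	}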
1180static inline void 781static inline int match_prev_assignment(struct hw_perf_event *hwc,
1181p6_pmu_disable_event(struct hw_perf_event *hwc, int idx) 782 struct cpu_hw_events *cpuc,
783 int i)
1182{ 784{
1183 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 785 return hwc->idx == cpuc->assign[i] &&
1184 u64 val = P6_NOP_EVENT; 786 hwc->last_cpu == smp_processor_id() &&
1185 787 hwc->last_tag == cpuc->tags[i];
1186 if (cpuc->enabled)
1187 val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
1188
1189 (void)checking_wrmsrl(hwc->config_base + idx, val);
1190} 788}
1191 789
1192static inline void 790static int x86_pmu_start(struct perf_event *event);
1193intel_pmu_disable_event(struct hw_perf_event *hwc, int idx) 791static void x86_pmu_stop(struct perf_event *event);
792
793void hw_perf_enable(void)
1194{ 794{
1195 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 795 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1196 intel_pmu_disable_bts(); 796 struct perf_event *event;
797 struct hw_perf_event *hwc;
798 int i;
799
800 if (!x86_pmu_initialized())
1197 return; 801 return;
1198 }
1199 802
1200 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { 803 if (cpuc->enabled)
1201 intel_pmu_disable_fixed(hwc, idx);
1202 return; 804 return;
805
806 if (cpuc->n_added) {
807 int n_running = cpuc->n_events - cpuc->n_added;
808 /*
809 * apply assignment obtained either from
810 * hw_perf_group_sched_in() or x86_pmu_enable()
811 *
812 * step1: save events moving to new counters
813 * step2: reprogram moved events into new counters
814 */
815 for (i = 0; i < n_running; i++) {
816 event = cpuc->event_list[i];
817 hwc = &event->hw;
818
819 /*
820 * we can avoid reprogramming counter if:
821 * - assigned same counter as last time
822 * - running on same CPU as last time
823 * - no other event has used the counter since
824 */
825 if (hwc->idx == -1 ||
826 match_prev_assignment(hwc, cpuc, i))
827 continue;
828
829 x86_pmu_stop(event);
830 }
831
832 for (i = 0; i < cpuc->n_events; i++) {
833 event = cpuc->event_list[i];
834 hwc = &event->hw;
835
836 if (!match_prev_assignment(hwc, cpuc, i))
837 x86_assign_hw_event(event, cpuc, i);
838 else if (i < n_running)
839 continue;
840
841 x86_pmu_start(event);
842 }
843 cpuc->n_added = 0;
844 perf_events_lapic_init();
1203 } 845 }
1204 846
1205 x86_pmu_disable_event(hwc, idx); 847 cpuc->enabled = 1;
848 barrier();
849
850 x86_pmu.enable_all();
1206} 851}
1207 852
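The reprogramming-avoidance test used by hw_perf_enable() above is a three-way comparison: an event keeps its counter only if the counter index, the CPU and the per-slot tag all still match. A small user-space sketch with invented values:

	#include <stdbool.h>
	#include <stdio.h>

	struct hw_event {
		int			idx;		/* counter used last time */
		int			last_cpu;
		unsigned long long	last_tag;
	};

	struct slot_state {
		int			assign;		/* counter chosen by the scheduler */
		unsigned long long	tag;		/* bumped whenever the slot is reused */
	};

	static bool match_prev_assignment(const struct hw_event *hwc,
					  const struct slot_state *slot, int cpu)
	{
		return hwc->idx == slot->assign &&
		       hwc->last_cpu == cpu &&
		       hwc->last_tag == slot->tag;
	}

	int main(void)
	{
		struct hw_event hwc = { .idx = 1, .last_cpu = 0, .last_tag = 7 };
		struct slot_state slot = { .assign = 1, .tag = 7 };

		printf("same counter/cpu/tag: %s\n",
		       match_prev_assignment(&hwc, &slot, 0) ? "keep" : "reprogram");

		slot.tag = 8;	/* another event used the counter in between */
		printf("tag bumped          : %s\n",
		       match_prev_assignment(&hwc, &slot, 0) ? "keep" : "reprogram");
		return 0;
	}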
1208static inline void 853static inline void __x86_pmu_enable_event(struct hw_perf_event *hwc)
1209amd_pmu_disable_event(struct hw_perf_event *hwc, int idx)
1210{ 854{
1211 x86_pmu_disable_event(hwc, idx); 855 (void)checking_wrmsrl(hwc->config_base + hwc->idx,
856 hwc->config | ARCH_PERFMON_EVENTSEL_ENABLE);
857}
858
859static inline void x86_pmu_disable_event(struct perf_event *event)
860{
861 struct hw_perf_event *hwc = &event->hw;
862 (void)checking_wrmsrl(hwc->config_base + hwc->idx, hwc->config);
1212} 863}
1213 864
1214static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left); 865static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
@@ -1218,18 +869,18 @@ static DEFINE_PER_CPU(u64 [X86_PMC_IDX_MAX], pmc_prev_left);
1218 * To be called with the event disabled in hw: 869 * To be called with the event disabled in hw:
1219 */ 870 */
1220static int 871static int
1221x86_perf_event_set_period(struct perf_event *event, 872x86_perf_event_set_period(struct perf_event *event)
1222 struct hw_perf_event *hwc, int idx)
1223{ 873{
874 struct hw_perf_event *hwc = &event->hw;
1224 s64 left = atomic64_read(&hwc->period_left); 875 s64 left = atomic64_read(&hwc->period_left);
1225 s64 period = hwc->sample_period; 876 s64 period = hwc->sample_period;
1226 int err, ret = 0; 877 int err, ret = 0, idx = hwc->idx;
1227 878
1228 if (idx == X86_PMC_IDX_FIXED_BTS) 879 if (idx == X86_PMC_IDX_FIXED_BTS)
1229 return 0; 880 return 0;
1230 881
1231 /* 882 /*
1232 * If we are way outside a reasoable range then just skip forward: 883 * If we are way outside a reasonable range then just skip forward:
1233 */ 884 */
1234 if (unlikely(left <= -period)) { 885 if (unlikely(left <= -period)) {
1235 left = period; 886 left = period;
@@ -1269,157 +920,63 @@ x86_perf_event_set_period(struct perf_event *event,
1269 return ret; 920 return ret;
1270} 921}
1271 922
1272static inline void 923static void x86_pmu_enable_event(struct perf_event *event)
1273intel_pmu_enable_fixed(struct hw_perf_event *hwc, int __idx)
1274{
1275 int idx = __idx - X86_PMC_IDX_FIXED;
1276 u64 ctrl_val, bits, mask;
1277 int err;
1278
1279 /*
1280 * Enable IRQ generation (0x8),
1281 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
1282 * if requested:
1283 */
1284 bits = 0x8ULL;
1285 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
1286 bits |= 0x2;
1287 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
1288 bits |= 0x1;
1289 bits <<= (idx * 4);
1290 mask = 0xfULL << (idx * 4);
1291
1292 rdmsrl(hwc->config_base, ctrl_val);
1293 ctrl_val &= ~mask;
1294 ctrl_val |= bits;
1295 err = checking_wrmsrl(hwc->config_base, ctrl_val);
1296}
1297
1298static void p6_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1299{ 924{
1300 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 925 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1301 u64 val;
1302
1303 val = hwc->config;
1304 if (cpuc->enabled) 926 if (cpuc->enabled)
1305 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 927 __x86_pmu_enable_event(&event->hw);
1306
1307 (void)checking_wrmsrl(hwc->config_base + idx, val);
1308} 928}
1309 929
1310 930/*
1311static void intel_pmu_enable_event(struct hw_perf_event *hwc, int idx) 931 * activate a single event
1312{ 932 *
1313 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) { 933 * The event is added to the group of enabled events
1314 if (!__get_cpu_var(cpu_hw_events).enabled) 934 * but only if it can be scehduled with existing events.
1315 return; 935 *
1316 936 * Called with PMU disabled. If successful and return value 1,
1317 intel_pmu_enable_bts(hwc->config); 937 * then guaranteed to call perf_enable() and hw_perf_enable()
1318 return; 938 */
1319 } 939static int x86_pmu_enable(struct perf_event *event)
1320
1321 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
1322 intel_pmu_enable_fixed(hwc, idx);
1323 return;
1324 }
1325
1326 x86_pmu_enable_event(hwc, idx);
1327}
1328
1329static void amd_pmu_enable_event(struct hw_perf_event *hwc, int idx)
1330{ 940{
1331 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 941 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
942 struct hw_perf_event *hwc;
943 int assign[X86_PMC_IDX_MAX];
944 int n, n0, ret;
1332 945
1333 if (cpuc->enabled) 946 hwc = &event->hw;
1334 x86_pmu_enable_event(hwc, idx);
1335}
1336
1337static int
1338fixed_mode_idx(struct perf_event *event, struct hw_perf_event *hwc)
1339{
1340 unsigned int hw_event;
1341
1342 hw_event = hwc->config & ARCH_PERFMON_EVENT_MASK;
1343 947
1344 if (unlikely((hw_event == 948 n0 = cpuc->n_events;
1345 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) && 949 n = collect_events(cpuc, event, false);
1346 (hwc->sample_period == 1))) 950 if (n < 0)
1347 return X86_PMC_IDX_FIXED_BTS; 951 return n;
1348 952
1349 if (!x86_pmu.num_events_fixed) 953 ret = x86_schedule_events(cpuc, n, assign);
1350 return -1; 954 if (ret)
955 return ret;
956 /*
957 * copy new assignment, now we know it is possible
958 * will be used by hw_perf_enable()
959 */
960 memcpy(cpuc->assign, assign, n*sizeof(int));
1351 961
1352 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_INSTRUCTIONS))) 962 cpuc->n_events = n;
1353 return X86_PMC_IDX_FIXED_INSTRUCTIONS; 963 cpuc->n_added += n - n0;
1354 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_CPU_CYCLES)))
1355 return X86_PMC_IDX_FIXED_CPU_CYCLES;
1356 if (unlikely(hw_event == x86_pmu.event_map(PERF_COUNT_HW_BUS_CYCLES)))
1357 return X86_PMC_IDX_FIXED_BUS_CYCLES;
1358 964
1359 return -1; 965 return 0;
1360} 966}
1361 967
1362/* 968static int x86_pmu_start(struct perf_event *event)
1363 * Find a PMC slot for the freshly enabled / scheduled in event:
1364 */
1365static int x86_pmu_enable(struct perf_event *event)
1366{ 969{
1367 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 970 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1368 struct hw_perf_event *hwc = &event->hw; 971 int idx = event->hw.idx;
1369 int idx;
1370 972
1371 idx = fixed_mode_idx(event, hwc); 973 if (idx == -1)
1372 if (idx == X86_PMC_IDX_FIXED_BTS) { 974 return -EAGAIN;
1373 /* BTS is already occupied. */
1374 if (test_and_set_bit(idx, cpuc->used_mask))
1375 return -EAGAIN;
1376
1377 hwc->config_base = 0;
1378 hwc->event_base = 0;
1379 hwc->idx = idx;
1380 } else if (idx >= 0) {
1381 /*
1382 * Try to get the fixed event, if that is already taken
1383 * then try to get a generic event:
1384 */
1385 if (test_and_set_bit(idx, cpuc->used_mask))
1386 goto try_generic;
1387
1388 hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL;
1389 /*
1390 * We set it so that event_base + idx in wrmsr/rdmsr maps to
1391 * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2:
1392 */
1393 hwc->event_base =
1394 MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED;
1395 hwc->idx = idx;
1396 } else {
1397 idx = hwc->idx;
1398 /* Try to get the previous generic event again */
1399 if (test_and_set_bit(idx, cpuc->used_mask)) {
1400try_generic:
1401 idx = find_first_zero_bit(cpuc->used_mask,
1402 x86_pmu.num_events);
1403 if (idx == x86_pmu.num_events)
1404 return -EAGAIN;
1405
1406 set_bit(idx, cpuc->used_mask);
1407 hwc->idx = idx;
1408 }
1409 hwc->config_base = x86_pmu.eventsel;
1410 hwc->event_base = x86_pmu.perfctr;
1411 }
1412
1413 perf_events_lapic_init();
1414
1415 x86_pmu.disable(hwc, idx);
1416 975
976 x86_perf_event_set_period(event);
1417 cpuc->events[idx] = event; 977 cpuc->events[idx] = event;
1418 set_bit(idx, cpuc->active_mask); 978 __set_bit(idx, cpuc->active_mask);
1419 979 x86_pmu.enable(event);
1420 x86_perf_event_set_period(event, hwc, idx);
1421 x86_pmu.enable(hwc, idx);
1422
1423 perf_event_update_userpage(event); 980 perf_event_update_userpage(event);
1424 981
1425 return 0; 982 return 0;
@@ -1427,14 +984,8 @@ try_generic:
1427 984
1428static void x86_pmu_unthrottle(struct perf_event *event) 985static void x86_pmu_unthrottle(struct perf_event *event)
1429{ 986{
1430 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 987 int ret = x86_pmu_start(event);
1431 struct hw_perf_event *hwc = &event->hw; 988 WARN_ON_ONCE(ret);
1432
1433 if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX ||
1434 cpuc->events[hwc->idx] != event))
1435 return;
1436
1437 x86_pmu.enable(hwc, hwc->idx);
1438} 989}
1439 990
1440void perf_event_print_debug(void) 991void perf_event_print_debug(void)
@@ -1464,7 +1015,7 @@ void perf_event_print_debug(void)
1464 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); 1015 pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow);
1465 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); 1016 pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed);
1466 } 1017 }
1467 pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); 1018 pr_info("CPU#%d: active: %016llx\n", cpu, *(u64 *)cpuc->active_mask);
1468 1019
1469 for (idx = 0; idx < x86_pmu.num_events; idx++) { 1020 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1470 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); 1021 rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl);
@@ -1488,254 +1039,50 @@ void perf_event_print_debug(void)
1488 local_irq_restore(flags); 1039 local_irq_restore(flags);
1489} 1040}
1490 1041
1491static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) 1042static void x86_pmu_stop(struct perf_event *event)
1492{
1493 struct debug_store *ds = cpuc->ds;
1494 struct bts_record {
1495 u64 from;
1496 u64 to;
1497 u64 flags;
1498 };
1499 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
1500 struct bts_record *at, *top;
1501 struct perf_output_handle handle;
1502 struct perf_event_header header;
1503 struct perf_sample_data data;
1504 struct pt_regs regs;
1505
1506 if (!event)
1507 return;
1508
1509 if (!ds)
1510 return;
1511
1512 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
1513 top = (struct bts_record *)(unsigned long)ds->bts_index;
1514
1515 if (top <= at)
1516 return;
1517
1518 ds->bts_index = ds->bts_buffer_base;
1519
1520
1521 data.period = event->hw.last_period;
1522 data.addr = 0;
1523 regs.ip = 0;
1524
1525 /*
1526 * Prepare a generic sample, i.e. fill in the invariant fields.
1527 * We will overwrite the from and to address before we output
1528 * the sample.
1529 */
1530 perf_prepare_sample(&header, &data, event, &regs);
1531
1532 if (perf_output_begin(&handle, event,
1533 header.size * (top - at), 1, 1))
1534 return;
1535
1536 for (; at < top; at++) {
1537 data.ip = at->from;
1538 data.addr = at->to;
1539
1540 perf_output_sample(&handle, &header, &data, event);
1541 }
1542
1543 perf_output_end(&handle);
1544
1545 /* There's new data available. */
1546 event->hw.interrupts++;
1547 event->pending_kill = POLL_IN;
1548}
1549
1550static void x86_pmu_disable(struct perf_event *event)
1551{ 1043{
1552 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events); 1044 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1553 struct hw_perf_event *hwc = &event->hw; 1045 struct hw_perf_event *hwc = &event->hw;
1554 int idx = hwc->idx; 1046 int idx = hwc->idx;
1555 1047
1556 /* 1048 if (!__test_and_clear_bit(idx, cpuc->active_mask))
1557 * Must be done before we disable, otherwise the nmi handler 1049 return;
1558 * could reenable again:
1559 */
1560 clear_bit(idx, cpuc->active_mask);
1561 x86_pmu.disable(hwc, idx);
1562 1050
1563 /* 1051 x86_pmu.disable(event);
1564 * Make sure the cleared pointer becomes visible before we
1565 * (potentially) free the event:
1566 */
1567 barrier();
1568 1052
1569 /* 1053 /*
1570 * Drain the remaining delta count out of a event 1054 * Drain the remaining delta count out of a event
1571 * that we are disabling: 1055 * that we are disabling:
1572 */ 1056 */
1573 x86_perf_event_update(event, hwc, idx); 1057 x86_perf_event_update(event);
1574
1575 /* Drain the remaining BTS records. */
1576 if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
1577 intel_pmu_drain_bts_buffer(cpuc);
1578 1058
1579 cpuc->events[idx] = NULL; 1059 cpuc->events[idx] = NULL;
1580 clear_bit(idx, cpuc->used_mask);
1581
1582 perf_event_update_userpage(event);
1583}
1584
1585/*
1586 * Save and restart an expired event. Called by NMI contexts,
1587 * so it has to be careful about preempting normal event ops:
1588 */
1589static int intel_pmu_save_and_restart(struct perf_event *event)
1590{
1591 struct hw_perf_event *hwc = &event->hw;
1592 int idx = hwc->idx;
1593 int ret;
1594
1595 x86_perf_event_update(event, hwc, idx);
1596 ret = x86_perf_event_set_period(event, hwc, idx);
1597
1598 if (event->state == PERF_EVENT_STATE_ACTIVE)
1599 intel_pmu_enable_event(hwc, idx);
1600
1601 return ret;
1602} 1060}
1603 1061
1604static void intel_pmu_reset(void) 1062static void x86_pmu_disable(struct perf_event *event)
1605{
1606 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
1607 unsigned long flags;
1608 int idx;
1609
1610 if (!x86_pmu.num_events)
1611 return;
1612
1613 local_irq_save(flags);
1614
1615 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
1616
1617 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1618 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
1619 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
1620 }
1621 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
1622 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
1623 }
1624 if (ds)
1625 ds->bts_index = ds->bts_buffer_base;
1626
1627 local_irq_restore(flags);
1628}
1629
1630static int p6_pmu_handle_irq(struct pt_regs *regs)
1631{
1632 struct perf_sample_data data;
1633 struct cpu_hw_events *cpuc;
1634 struct perf_event *event;
1635 struct hw_perf_event *hwc;
1636 int idx, handled = 0;
1637 u64 val;
1638
1639 data.addr = 0;
1640
1641 cpuc = &__get_cpu_var(cpu_hw_events);
1642
1643 for (idx = 0; idx < x86_pmu.num_events; idx++) {
1644 if (!test_bit(idx, cpuc->active_mask))
1645 continue;
1646
1647 event = cpuc->events[idx];
1648 hwc = &event->hw;
1649
1650 val = x86_perf_event_update(event, hwc, idx);
1651 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1652 continue;
1653
1654 /*
1655 * event overflow
1656 */
1657 handled = 1;
1658 data.period = event->hw.last_period;
1659
1660 if (!x86_perf_event_set_period(event, hwc, idx))
1661 continue;
1662
1663 if (perf_event_overflow(event, 1, &data, regs))
1664 p6_pmu_disable_event(hwc, idx);
1665 }
1666
1667 if (handled)
1668 inc_irq_stat(apic_perf_irqs);
1669
1670 return handled;
1671}
1672
1673/*
1674 * This handler is triggered by the local APIC, so the APIC IRQ handling
1675 * rules apply:
1676 */
1677static int intel_pmu_handle_irq(struct pt_regs *regs)
1678{ 1063{
1679 struct perf_sample_data data; 1064 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1680 struct cpu_hw_events *cpuc; 1065 int i;
1681 int bit, loops;
1682 u64 ack, status;
1683
1684 data.addr = 0;
1685
1686 cpuc = &__get_cpu_var(cpu_hw_events);
1687
1688 perf_disable();
1689 intel_pmu_drain_bts_buffer(cpuc);
1690 status = intel_pmu_get_status();
1691 if (!status) {
1692 perf_enable();
1693 return 0;
1694 }
1695
1696 loops = 0;
1697again:
1698 if (++loops > 100) {
1699 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
1700 perf_event_print_debug();
1701 intel_pmu_reset();
1702 perf_enable();
1703 return 1;
1704 }
1705 1066
1706 inc_irq_stat(apic_perf_irqs); 1067 x86_pmu_stop(event);
1707 ack = status;
1708 for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
1709 struct perf_event *event = cpuc->events[bit];
1710 1068
1711 clear_bit(bit, (unsigned long *) &status); 1069 for (i = 0; i < cpuc->n_events; i++) {
1712 if (!test_bit(bit, cpuc->active_mask)) 1070 if (event == cpuc->event_list[i]) {
1713 continue;
1714 1071
1715 if (!intel_pmu_save_and_restart(event)) 1072 if (x86_pmu.put_event_constraints)
1716 continue; 1073 x86_pmu.put_event_constraints(cpuc, event);
1717 1074
1718 data.period = event->hw.last_period; 1075 while (++i < cpuc->n_events)
1076 cpuc->event_list[i-1] = cpuc->event_list[i];
1719 1077
1720 if (perf_event_overflow(event, 1, &data, regs)) 1078 --cpuc->n_events;
1721 intel_pmu_disable_event(&event->hw, bit); 1079 break;
1080 }
1722 } 1081 }
1723 1082 perf_event_update_userpage(event);
1724 intel_pmu_ack_status(ack);
1725
1726 /*
1727 * Repeat if there is more work to be done:
1728 */
1729 status = intel_pmu_get_status();
1730 if (status)
1731 goto again;
1732
1733 perf_enable();
1734
1735 return 1;
1736} 1083}
1737 1084
1738static int amd_pmu_handle_irq(struct pt_regs *regs) 1085static int x86_pmu_handle_irq(struct pt_regs *regs)
1739{ 1086{
1740 struct perf_sample_data data; 1087 struct perf_sample_data data;
1741 struct cpu_hw_events *cpuc; 1088 struct cpu_hw_events *cpuc;
@@ -1744,7 +1091,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1744 int idx, handled = 0; 1091 int idx, handled = 0;
1745 u64 val; 1092 u64 val;
1746 1093
1747 data.addr = 0; 1094 perf_sample_data_init(&data, 0);
1748 1095
1749 cpuc = &__get_cpu_var(cpu_hw_events); 1096 cpuc = &__get_cpu_var(cpu_hw_events);
1750 1097
@@ -1755,7 +1102,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1755 event = cpuc->events[idx]; 1102 event = cpuc->events[idx];
1756 hwc = &event->hw; 1103 hwc = &event->hw;
1757 1104
1758 val = x86_perf_event_update(event, hwc, idx); 1105 val = x86_perf_event_update(event);
1759 if (val & (1ULL << (x86_pmu.event_bits - 1))) 1106 if (val & (1ULL << (x86_pmu.event_bits - 1)))
1760 continue; 1107 continue;
1761 1108
@@ -1765,11 +1112,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
1765 handled = 1; 1112 handled = 1;
1766 data.period = event->hw.last_period; 1113 data.period = event->hw.last_period;
1767 1114
1768 if (!x86_perf_event_set_period(event, hwc, idx)) 1115 if (!x86_perf_event_set_period(event))
1769 continue; 1116 continue;
1770 1117
1771 if (perf_event_overflow(event, 1, &data, regs)) 1118 if (perf_event_overflow(event, 1, &data, regs))
1772 amd_pmu_disable_event(hwc, idx); 1119 x86_pmu_stop(event);
1773 } 1120 }
1774 1121
1775 if (handled) 1122 if (handled)
@@ -1852,196 +1199,186 @@ static __read_mostly struct notifier_block perf_event_nmi_notifier = {
1852 .priority = 1 1199 .priority = 1
1853}; 1200};
1854 1201
1855static struct x86_pmu p6_pmu = { 1202static struct event_constraint unconstrained;
1856 .name = "p6", 1203static struct event_constraint emptyconstraint;
1857 .handle_irq = p6_pmu_handle_irq,
1858 .disable_all = p6_pmu_disable_all,
1859 .enable_all = p6_pmu_enable_all,
1860 .enable = p6_pmu_enable_event,
1861 .disable = p6_pmu_disable_event,
1862 .eventsel = MSR_P6_EVNTSEL0,
1863 .perfctr = MSR_P6_PERFCTR0,
1864 .event_map = p6_pmu_event_map,
1865 .raw_event = p6_pmu_raw_event,
1866 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
1867 .apic = 1,
1868 .max_period = (1ULL << 31) - 1,
1869 .version = 0,
1870 .num_events = 2,
1871 /*
1872 * Events have 40 bits implemented. However they are designed such
1873 * that bits [32-39] are sign extensions of bit 31. As such the
1874 * effective width of a event for P6-like PMU is 32 bits only.
1875 *
1876 * See IA-32 Intel Architecture Software developer manual Vol 3B
1877 */
1878 .event_bits = 32,
1879 .event_mask = (1ULL << 32) - 1,
1880};
1881 1204
1882static struct x86_pmu intel_pmu = { 1205static struct event_constraint *
1883 .name = "Intel", 1206x86_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
1884 .handle_irq = intel_pmu_handle_irq, 1207{
1885 .disable_all = intel_pmu_disable_all, 1208 struct event_constraint *c;
1886 .enable_all = intel_pmu_enable_all,
1887 .enable = intel_pmu_enable_event,
1888 .disable = intel_pmu_disable_event,
1889 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
1890 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
1891 .event_map = intel_pmu_event_map,
1892 .raw_event = intel_pmu_raw_event,
1893 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
1894 .apic = 1,
1895 /*
1896 * Intel PMCs cannot be accessed sanely above 32 bit width,
1897 * so we install an artificial 1<<31 period regardless of
1898 * the generic event period:
1899 */
1900 .max_period = (1ULL << 31) - 1,
1901 .enable_bts = intel_pmu_enable_bts,
1902 .disable_bts = intel_pmu_disable_bts,
1903};
1904 1209
1905static struct x86_pmu amd_pmu = { 1210 if (x86_pmu.event_constraints) {
1906 .name = "AMD", 1211 for_each_event_constraint(c, x86_pmu.event_constraints) {
1907 .handle_irq = amd_pmu_handle_irq, 1212 if ((event->hw.config & c->cmask) == c->code)
1908 .disable_all = amd_pmu_disable_all, 1213 return c;
1909 .enable_all = amd_pmu_enable_all, 1214 }
1910 .enable = amd_pmu_enable_event, 1215 }
1911 .disable = amd_pmu_disable_event, 1216
1912 .eventsel = MSR_K7_EVNTSEL0, 1217 return &unconstrained;
1913 .perfctr = MSR_K7_PERFCTR0, 1218}
1914 .event_map = amd_pmu_event_map,
1915 .raw_event = amd_pmu_raw_event,
1916 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
1917 .num_events = 4,
1918 .event_bits = 48,
1919 .event_mask = (1ULL << 48) - 1,
1920 .apic = 1,
1921 /* use highest bit to detect overflow */
1922 .max_period = (1ULL << 47) - 1,
1923};
1924 1219
1925static int p6_pmu_init(void) 1220static int x86_event_sched_in(struct perf_event *event,
1221 struct perf_cpu_context *cpuctx)
1926{ 1222{
1927 switch (boot_cpu_data.x86_model) { 1223 int ret = 0;
1928 case 1:
1929 case 3: /* Pentium Pro */
1930 case 5:
1931 case 6: /* Pentium II */
1932 case 7:
1933 case 8:
1934 case 11: /* Pentium III */
1935 break;
1936 case 9:
1937 case 13:
1938 /* Pentium M */
1939 break;
1940 default:
1941 pr_cont("unsupported p6 CPU model %d ",
1942 boot_cpu_data.x86_model);
1943 return -ENODEV;
1944 }
1945 1224
1946 x86_pmu = p6_pmu; 1225 event->state = PERF_EVENT_STATE_ACTIVE;
1226 event->oncpu = smp_processor_id();
1227 event->tstamp_running += event->ctx->time - event->tstamp_stopped;
1947 1228
1948 if (!cpu_has_apic) { 1229 if (!is_x86_event(event))
1949 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); 1230 ret = event->pmu->enable(event);
1950 pr_info("no hardware sampling interrupt available.\n");
1951 x86_pmu.apic = 0;
1952 }
1953 1231
1954 return 0; 1232 if (!ret && !is_software_event(event))
1233 cpuctx->active_oncpu++;
1234
1235 if (!ret && event->attr.exclusive)
1236 cpuctx->exclusive = 1;
1237
1238 return ret;
1955} 1239}
1956 1240
1957static int intel_pmu_init(void) 1241static void x86_event_sched_out(struct perf_event *event,
1242 struct perf_cpu_context *cpuctx)
1958{ 1243{
1959 union cpuid10_edx edx; 1244 event->state = PERF_EVENT_STATE_INACTIVE;
1960 union cpuid10_eax eax; 1245 event->oncpu = -1;
1961 unsigned int unused;
1962 unsigned int ebx;
1963 int version;
1964
1965 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
1966 /* check for P6 processor family */
1967 if (boot_cpu_data.x86 == 6) {
1968 return p6_pmu_init();
1969 } else {
1970 return -ENODEV;
1971 }
1972 }
1973 1246
1974 /* 1247 if (!is_x86_event(event))
1975 * Check whether the Architectural PerfMon supports 1248 event->pmu->disable(event);
1976 * Branch Misses Retired hw_event or not.
1977 */
1978 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
1979 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
1980 return -ENODEV;
1981 1249
1982 version = eax.split.version_id; 1250 event->tstamp_running -= event->ctx->time - event->tstamp_stopped;
1983 if (version < 2) 1251
1984 return -ENODEV; 1252 if (!is_software_event(event))
1253 cpuctx->active_oncpu--;
1985 1254
1986 x86_pmu = intel_pmu; 1255 if (event->attr.exclusive || !cpuctx->active_oncpu)
1987 x86_pmu.version = version; 1256 cpuctx->exclusive = 0;
1988 x86_pmu.num_events = eax.split.num_events; 1257}
1989 x86_pmu.event_bits = eax.split.bit_width;
1990 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
1991 1258
1259/*
1260 * Called to enable a whole group of events.
1261 * Returns 1 if the group was enabled, or -EAGAIN if it could not be.
1262 * Assumes the caller has disabled interrupts and has
1263 * frozen the PMU with hw_perf_save_disable.
1264 *
1265 * called with PMU disabled. If successful and return value 1,
1266 * then guaranteed to call perf_enable() and hw_perf_enable()
1267 */
1268int hw_perf_group_sched_in(struct perf_event *leader,
1269 struct perf_cpu_context *cpuctx,
1270 struct perf_event_context *ctx)
1271{
1272 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
1273 struct perf_event *sub;
1274 int assign[X86_PMC_IDX_MAX];
1275 int n0, n1, ret;
1276
1277 /* n0 = total number of events */
1278 n0 = collect_events(cpuc, leader, true);
1279 if (n0 < 0)
1280 return n0;
1281
1282 ret = x86_schedule_events(cpuc, n0, assign);
1283 if (ret)
1284 return ret;
1285
1286 ret = x86_event_sched_in(leader, cpuctx);
1287 if (ret)
1288 return ret;
1289
1290 n1 = 1;
1291 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
1292 if (sub->state > PERF_EVENT_STATE_OFF) {
1293 ret = x86_event_sched_in(sub, cpuctx);
1294 if (ret)
1295 goto undo;
1296 ++n1;
1297 }
1298 }
1992 /* 1299 /*
1993 * Quirk: v2 perfmon does not report fixed-purpose events, so 1300 * copy new assignment, now we know it is possible
1994 * assume at least 3 events: 1301 * will be used by hw_perf_enable()
1995 */ 1302 */
1996 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3); 1303 memcpy(cpuc->assign, assign, n0*sizeof(int));
1304
1305 cpuc->n_events = n0;
1306 cpuc->n_added += n1;
1307 ctx->nr_active += n1;
1997 1308
1998 /* 1309 /*
1999 * Install the hw-cache-events table: 1310 * 1 means successful and events are active
1311 * This is not quite true because we defer
1312 * actual activation until hw_perf_enable() but
1313 * this way we ensure caller won't try to enable
1314 * individual events
2000 */ 1315 */
2001 switch (boot_cpu_data.x86_model) { 1316 return 1;
2002 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ 1317undo:
2003 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ 1318 x86_event_sched_out(leader, cpuctx);
2004 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ 1319 n0 = 1;
2005 case 29: /* six-core 45 nm xeon "Dunnington" */ 1320 list_for_each_entry(sub, &leader->sibling_list, group_entry) {
2006 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids, 1321 if (sub->state == PERF_EVENT_STATE_ACTIVE) {
2007 sizeof(hw_cache_event_ids)); 1322 x86_event_sched_out(sub, cpuctx);
2008 1323 if (++n0 == n1)
2009 pr_cont("Core2 events, "); 1324 break;
1325 }
1326 }
1327 return ret;
1328}
1329
1330#include "perf_event_amd.c"
1331#include "perf_event_p6.c"
1332#include "perf_event_intel.c"
1333
1334static int __cpuinit
1335x86_pmu_notifier(struct notifier_block *self, unsigned long action, void *hcpu)
1336{
1337 unsigned int cpu = (long)hcpu;
1338 int ret = NOTIFY_OK;
1339
1340 switch (action & ~CPU_TASKS_FROZEN) {
1341 case CPU_UP_PREPARE:
1342 if (x86_pmu.cpu_prepare)
1343 ret = x86_pmu.cpu_prepare(cpu);
2010 break; 1344 break;
2011 default:
2012 case 26:
2013 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
2014 sizeof(hw_cache_event_ids));
2015 1345
2016 pr_cont("Nehalem/Corei7 events, "); 1346 case CPU_STARTING:
1347 if (x86_pmu.cpu_starting)
1348 x86_pmu.cpu_starting(cpu);
2017 break; 1349 break;
2018 case 28:
2019 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
2020 sizeof(hw_cache_event_ids));
2021 1350
2022 pr_cont("Atom events, "); 1351 case CPU_DYING:
1352 if (x86_pmu.cpu_dying)
1353 x86_pmu.cpu_dying(cpu);
1354 break;
1355
1356 case CPU_UP_CANCELED:
1357 case CPU_DEAD:
1358 if (x86_pmu.cpu_dead)
1359 x86_pmu.cpu_dead(cpu);
1360 break;
1361
1362 default:
2023 break; 1363 break;
2024 } 1364 }
2025 return 0; 1365
1366 return ret;
2026} 1367}
2027 1368
2028static int amd_pmu_init(void) 1369static void __init pmu_check_apic(void)
2029{ 1370{
2030 /* Performance-monitoring supported from K7 and later: */ 1371 if (cpu_has_apic)
2031 if (boot_cpu_data.x86 < 6) 1372 return;
2032 return -ENODEV;
2033
2034 x86_pmu = amd_pmu;
2035
2036 /* Events are common for all AMDs */
2037 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
2038 sizeof(hw_cache_event_ids));
2039 1373
2040 return 0; 1374 x86_pmu.apic = 0;
1375 pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n");
1376 pr_info("no hardware sampling interrupt available.\n");
2041} 1377}
2042 1378
2043void __init init_hw_perf_events(void) 1379void __init init_hw_perf_events(void)
2044{ 1380{
1381 struct event_constraint *c;
2045 int err; 1382 int err;
2046 1383
2047 pr_info("Performance Events: "); 1384 pr_info("Performance Events: ");
@@ -2061,6 +1398,8 @@ void __init init_hw_perf_events(void)
2061 return; 1398 return;
2062 } 1399 }
2063 1400
1401 pmu_check_apic();
1402
2064 pr_cont("%s PMU driver.\n", x86_pmu.name); 1403 pr_cont("%s PMU driver.\n", x86_pmu.name);
2065 1404
2066 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) { 1405 if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {
@@ -2084,6 +1423,20 @@ void __init init_hw_perf_events(void)
2084 perf_events_lapic_init(); 1423 perf_events_lapic_init();
2085 register_die_notifier(&perf_event_nmi_notifier); 1424 register_die_notifier(&perf_event_nmi_notifier);
2086 1425
1426 unconstrained = (struct event_constraint)
1427 __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_events) - 1,
1428 0, x86_pmu.num_events);
1429
1430 if (x86_pmu.event_constraints) {
1431 for_each_event_constraint(c, x86_pmu.event_constraints) {
1432 if (c->cmask != INTEL_ARCH_FIXED_MASK)
1433 continue;
1434
1435 c->idxmsk64 |= (1ULL << x86_pmu.num_events) - 1;
1436 c->weight += x86_pmu.num_events;
1437 }
1438 }
1439
2087 pr_info("... version: %d\n", x86_pmu.version); 1440 pr_info("... version: %d\n", x86_pmu.version);
2088 pr_info("... bit width: %d\n", x86_pmu.event_bits); 1441 pr_info("... bit width: %d\n", x86_pmu.event_bits);
2089 pr_info("... generic registers: %d\n", x86_pmu.num_events); 1442 pr_info("... generic registers: %d\n", x86_pmu.num_events);
@@ -2091,25 +1444,92 @@ void __init init_hw_perf_events(void)
2091 pr_info("... max period: %016Lx\n", x86_pmu.max_period); 1444 pr_info("... max period: %016Lx\n", x86_pmu.max_period);
2092 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed); 1445 pr_info("... fixed-purpose events: %d\n", x86_pmu.num_events_fixed);
2093 pr_info("... event mask: %016Lx\n", perf_event_mask); 1446 pr_info("... event mask: %016Lx\n", perf_event_mask);
1447
1448 perf_cpu_notifier(x86_pmu_notifier);
2094} 1449}
2095 1450
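The loop near the end of init_hw_perf_events() widens each fixed-counter constraint with the generic-counter bits so such events can still fall back to a generic counter. A rough numeric sketch, assuming 4 generic counters and a fixed-counter index base of 32 (both are assumptions of the example):

	#include <stdint.h>
	#include <stdio.h>

	#define NUM_GENERIC	4
	#define IDX_FIXED	32	/* assumed first fixed-counter bit */

	int main(void)
	{
		/* constraint for a fixed-counter event: only bit 32 set, weight 1 */
		uint64_t idxmsk = 1ULL << IDX_FIXED;
		int weight = 1;

		/* the widening done above: additionally allow any generic counter */
		idxmsk |= (1ULL << NUM_GENERIC) - 1;
		weight += NUM_GENERIC;

		/* prints idxmsk 0x10000000f, weight 5 */
		printf("idxmsk %#llx, weight %d\n",
		       (unsigned long long)idxmsk, weight);
		return 0;
	}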
2096static inline void x86_pmu_read(struct perf_event *event) 1451static inline void x86_pmu_read(struct perf_event *event)
2097{ 1452{
2098 x86_perf_event_update(event, &event->hw, event->hw.idx); 1453 x86_perf_event_update(event);
2099} 1454}
2100 1455
2101static const struct pmu pmu = { 1456static const struct pmu pmu = {
2102 .enable = x86_pmu_enable, 1457 .enable = x86_pmu_enable,
2103 .disable = x86_pmu_disable, 1458 .disable = x86_pmu_disable,
1459 .start = x86_pmu_start,
1460 .stop = x86_pmu_stop,
2104 .read = x86_pmu_read, 1461 .read = x86_pmu_read,
2105 .unthrottle = x86_pmu_unthrottle, 1462 .unthrottle = x86_pmu_unthrottle,
2106}; 1463};
2107 1464
1465/*
1466 * validate a single event group
1467 *
1468 * validation includes:
1469 * - check events are compatible with each other
1470 * - events do not compete for the same counter
1471 * - number of events <= number of counters
1472 *
1473 * validation ensures the group can be loaded onto the
1474 * PMU if it was the only group available.
1475 */
1476static int validate_group(struct perf_event *event)
1477{
1478 struct perf_event *leader = event->group_leader;
1479 struct cpu_hw_events *fake_cpuc;
1480 int ret, n;
1481
1482 ret = -ENOMEM;
1483 fake_cpuc = kmalloc(sizeof(*fake_cpuc), GFP_KERNEL | __GFP_ZERO);
1484 if (!fake_cpuc)
1485 goto out;
1486
1487 /*
1488 * the event is not yet connected with its
1489 * siblings, therefore we must first collect
1490 * existing siblings, then add the new event
1491 * before we can simulate the scheduling
1492 */
1493 ret = -ENOSPC;
1494 n = collect_events(fake_cpuc, leader, true);
1495 if (n < 0)
1496 goto out_free;
1497
1498 fake_cpuc->n_events = n;
1499 n = collect_events(fake_cpuc, event, false);
1500 if (n < 0)
1501 goto out_free;
1502
1503 fake_cpuc->n_events = n;
1504
1505 ret = x86_schedule_events(fake_cpuc, n, NULL);
1506
1507out_free:
1508 kfree(fake_cpuc);
1509out:
1510 return ret;
1511}
1512
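validate_group() above never touches the live per-CPU state: it copies the candidate group into a throw-away cpu_hw_events and runs the scheduler with assign == NULL as a pure feasibility check. A simplified user-space sketch of the same idea (the feasibility test here is a plain greedy pass rather than the weight-ordered scheduler, and all names are invented):

	#include <stdio.h>
	#include <string.h>

	#define NCOUNTERS 4

	struct fake_cpuc {
		int		n_events;
		unsigned int	idxmsk[NCOUNTERS + 1];	/* one constraint mask per event */
	};

	/* greedy feasibility check standing in for x86_schedule_events(..., NULL) */
	static int schedule_sim(const struct fake_cpuc *c)
	{
		unsigned int used = 0;
		int i, j;

		for (i = 0; i < c->n_events; i++) {
			for (j = 0; j < NCOUNTERS; j++)
				if ((c->idxmsk[i] & (1u << j)) && !(used & (1u << j)))
					break;
			if (j == NCOUNTERS)
				return -1;		/* -ENOSPC */
			used |= 1u << j;
		}
		return 0;
	}

	static int validate_group(const struct fake_cpuc *live, unsigned int new_msk)
	{
		struct fake_cpuc fake;

		/* simulate on a copy; the live state is never modified */
		memcpy(&fake, live, sizeof(fake));
		if (fake.n_events > NCOUNTERS)
			return -1;
		fake.idxmsk[fake.n_events++] = new_msk;
		return schedule_sim(&fake);
	}

	int main(void)
	{
		struct fake_cpuc live = { .n_events = 1, .idxmsk = { 0x1 } };

		printf("flexible event:            %s\n",
		       validate_group(&live, 0xf) ? "rejected" : "accepted");
		printf("event pinned to counter 0: %s\n",
		       validate_group(&live, 0x1) ? "rejected" : "accepted");
		return 0;
	}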
2108const struct pmu *hw_perf_event_init(struct perf_event *event) 1513const struct pmu *hw_perf_event_init(struct perf_event *event)
2109{ 1514{
1515 const struct pmu *tmp;
2110 int err; 1516 int err;
2111 1517
2112 err = __hw_perf_event_init(event); 1518 err = __hw_perf_event_init(event);
1519 if (!err) {
1520 /*
1521 * we temporarily connect event to its pmu
1522 * such that validate_group() can classify
1523 * it as an x86 event using is_x86_event()
1524 */
1525 tmp = event->pmu;
1526 event->pmu = &pmu;
1527
1528 if (event->group_leader != event)
1529 err = validate_group(event);
1530
1531 event->pmu = tmp;
1532 }
2113 if (err) { 1533 if (err) {
2114 if (event->destroy) 1534 if (event->destroy)
2115 event->destroy(event); 1535 event->destroy(event);
@@ -2132,7 +1552,6 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip)
2132 1552
2133static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); 1553static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry);
2134static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); 1554static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry);
2135static DEFINE_PER_CPU(int, in_nmi_frame);
2136 1555
2137 1556
2138static void 1557static void
@@ -2148,9 +1567,6 @@ static void backtrace_warning(void *data, char *msg)
2148 1567
2149static int backtrace_stack(void *data, char *name) 1568static int backtrace_stack(void *data, char *name)
2150{ 1569{
2151 per_cpu(in_nmi_frame, smp_processor_id()) =
2152 x86_is_stack_id(NMI_STACK, name);
2153
2154 return 0; 1570 return 0;
2155} 1571}
2156 1572
@@ -2158,9 +1574,6 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
2158{ 1574{
2159 struct perf_callchain_entry *entry = data; 1575 struct perf_callchain_entry *entry = data;
2160 1576
2161 if (per_cpu(in_nmi_frame, smp_processor_id()))
2162 return;
2163
2164 if (reliable) 1577 if (reliable)
2165 callchain_store(entry, addr); 1578 callchain_store(entry, addr);
2166} 1579}
@@ -2170,6 +1583,7 @@ static const struct stacktrace_ops backtrace_ops = {
2170 .warning_symbol = backtrace_warning_symbol, 1583 .warning_symbol = backtrace_warning_symbol,
2171 .stack = backtrace_stack, 1584 .stack = backtrace_stack,
2172 .address = backtrace_address, 1585 .address = backtrace_address,
1586 .walk_stack = print_context_stack_bp,
2173}; 1587};
2174 1588
2175#include "../dumpstack.h" 1589#include "../dumpstack.h"
@@ -2180,7 +1594,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry)
2180 callchain_store(entry, PERF_CONTEXT_KERNEL); 1594 callchain_store(entry, PERF_CONTEXT_KERNEL);
2181 callchain_store(entry, regs->ip); 1595 callchain_store(entry, regs->ip);
2182 1596
2183 dump_trace(NULL, regs, NULL, 0, &backtrace_ops, entry); 1597 dump_trace(NULL, regs, NULL, regs->bp, &backtrace_ops, entry);
2184} 1598}
2185 1599
2186/* 1600/*
@@ -2218,14 +1632,42 @@ copy_from_user_nmi(void *to, const void __user *from, unsigned long n)
2218 return len; 1632 return len;
2219} 1633}
2220 1634
2221static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) 1635#ifdef CONFIG_COMPAT
1636static inline int
1637perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
2222{ 1638{
2223 unsigned long bytes; 1639 /* 32-bit process in 64-bit kernel. */
1640 struct stack_frame_ia32 frame;
1641 const void __user *fp;
1642
1643 if (!test_thread_flag(TIF_IA32))
1644 return 0;
1645
1646 fp = compat_ptr(regs->bp);
1647 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1648 unsigned long bytes;
1649 frame.next_frame = 0;
1650 frame.return_address = 0;
2224 1651
2225 bytes = copy_from_user_nmi(frame, fp, sizeof(*frame)); 1652 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1653 if (bytes != sizeof(frame))
1654 break;
2226 1655
2227 return bytes == sizeof(*frame); 1656 if (fp < compat_ptr(regs->sp))
1657 break;
1658
1659 callchain_store(entry, frame.return_address);
1660 fp = compat_ptr(frame.next_frame);
1661 }
1662 return 1;
2228} 1663}
1664#else
1665static inline int
1666perf_callchain_user32(struct pt_regs *regs, struct perf_callchain_entry *entry)
1667{
1668 return 0;
1669}
1670#endif
2229 1671
2230static void 1672static void
2231perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) 1673perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
@@ -2241,11 +1683,16 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry)
2241 callchain_store(entry, PERF_CONTEXT_USER); 1683 callchain_store(entry, PERF_CONTEXT_USER);
2242 callchain_store(entry, regs->ip); 1684 callchain_store(entry, regs->ip);
2243 1685
1686 if (perf_callchain_user32(regs, entry))
1687 return;
1688
2244 while (entry->nr < PERF_MAX_STACK_DEPTH) { 1689 while (entry->nr < PERF_MAX_STACK_DEPTH) {
1690 unsigned long bytes;
2245 frame.next_frame = NULL; 1691 frame.next_frame = NULL;
2246 frame.return_address = 0; 1692 frame.return_address = 0;
2247 1693
2248 if (!copy_stack_frame(fp, &frame)) 1694 bytes = copy_from_user_nmi(&frame, fp, sizeof(frame));
1695 if (bytes != sizeof(frame))
2249 break; 1696 break;
2250 1697
2251 if ((unsigned long)fp < regs->sp) 1698 if ((unsigned long)fp < regs->sp)
@@ -2266,9 +1713,6 @@ perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry)
2266 1713
2267 is_user = user_mode(regs); 1714 is_user = user_mode(regs);
2268 1715
2269 if (!current || current->pid == 0)
2270 return;
2271
2272 if (is_user && current->state != TASK_RUNNING) 1716 if (is_user && current->state != TASK_RUNNING)
2273 return; 1717 return;
2274 1718
@@ -2295,7 +1739,14 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
2295 return entry; 1739 return entry;
2296} 1740}
2297 1741
2298void hw_perf_event_setup_online(int cpu) 1742void perf_arch_fetch_caller_regs(struct pt_regs *regs, unsigned long ip, int skip)
2299{ 1743{
2300 init_debug_store_on_cpu(cpu); 1744 regs->ip = ip;
1745 /*
1746 * perf_arch_fetch_caller_regs adds another call, we need to increment
1747 * the skip level
1748 */
1749 regs->bp = rewind_frame_pointer(skip + 1);
1750 regs->cs = __KERNEL_CS;
1751 local_save_flags(regs->flags);
2301} 1752}
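
The user callchain changes above drop the copy_stack_frame() helper in favour of direct copy_from_user_nmi() calls and add a compat path, but the walk itself stays the classic frame-pointer chase: read a (next_frame, return_address) pair at the current frame pointer, record the return address, and stop once the pointer stops moving toward higher addresses or the depth limit is hit. Below is a minimal user-space sketch of that loop, assuming the program is built with -fno-omit-frame-pointer; it walks its own stack directly instead of going through copy_from_user_nmi(), and names such as record_callchain() are purely illustrative.

/*
 * User-space model of the frame-pointer walk in perf_callchain_user():
 * follow saved frame pointers, record return addresses, stop when the
 * chain no longer grows upward or a depth limit is reached.
 * Build with: gcc -O0 -fno-omit-frame-pointer callchain.c
 */
#include <stdio.h>

struct stack_frame {
	struct stack_frame *next_frame;
	unsigned long return_address;
};

#define MAX_STACK_DEPTH 16

static void __attribute__((noinline)) record_callchain(void)
{
	struct stack_frame *fp = __builtin_frame_address(0);
	int depth = 0;

	while (fp && depth < MAX_STACK_DEPTH) {
		/* the kernel copies this pair with copy_from_user_nmi() */
		struct stack_frame frame = *fp;

		printf(" #%d %#lx\n", depth++, frame.return_address);

		/* frames must move toward higher addresses, cf. the fp < regs->sp check */
		if (frame.next_frame <= fp)
			break;
		fp = frame.next_frame;
	}
}

static void __attribute__((noinline)) level2(void) { record_callchain(); }
static void __attribute__((noinline)) level1(void) { level2(); }

int main(void)
{
	level1();
	return 0;
}

The 32-bit compat variant added above does the same walk through struct stack_frame_ia32 and compat_ptr(), since a 32-bit task stores 4-byte frame links.
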
diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c
new file mode 100644
index 000000000000..db6f7d4056e1
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_amd.c
@@ -0,0 +1,422 @@
1#ifdef CONFIG_CPU_SUP_AMD
2
3static DEFINE_RAW_SPINLOCK(amd_nb_lock);
4
5static __initconst u64 amd_hw_cache_event_ids
6 [PERF_COUNT_HW_CACHE_MAX]
7 [PERF_COUNT_HW_CACHE_OP_MAX]
8 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
9{
10 [ C(L1D) ] = {
11 [ C(OP_READ) ] = {
12 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
13 [ C(RESULT_MISS) ] = 0x0041, /* Data Cache Misses */
14 },
15 [ C(OP_WRITE) ] = {
16 [ C(RESULT_ACCESS) ] = 0x0142, /* Data Cache Refills :system */
17 [ C(RESULT_MISS) ] = 0,
18 },
19 [ C(OP_PREFETCH) ] = {
20 [ C(RESULT_ACCESS) ] = 0x0267, /* Data Prefetcher :attempts */
21 [ C(RESULT_MISS) ] = 0x0167, /* Data Prefetcher :cancelled */
22 },
23 },
24 [ C(L1I ) ] = {
25 [ C(OP_READ) ] = {
26 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction cache fetches */
27 [ C(RESULT_MISS) ] = 0x0081, /* Instruction cache misses */
28 },
29 [ C(OP_WRITE) ] = {
30 [ C(RESULT_ACCESS) ] = -1,
31 [ C(RESULT_MISS) ] = -1,
32 },
33 [ C(OP_PREFETCH) ] = {
34 [ C(RESULT_ACCESS) ] = 0x014B, /* Prefetch Instructions :Load */
35 [ C(RESULT_MISS) ] = 0,
36 },
37 },
38 [ C(LL ) ] = {
39 [ C(OP_READ) ] = {
40 [ C(RESULT_ACCESS) ] = 0x037D, /* Requests to L2 Cache :IC+DC */
41 [ C(RESULT_MISS) ] = 0x037E, /* L2 Cache Misses : IC+DC */
42 },
43 [ C(OP_WRITE) ] = {
44 [ C(RESULT_ACCESS) ] = 0x017F, /* L2 Fill/Writeback */
45 [ C(RESULT_MISS) ] = 0,
46 },
47 [ C(OP_PREFETCH) ] = {
48 [ C(RESULT_ACCESS) ] = 0,
49 [ C(RESULT_MISS) ] = 0,
50 },
51 },
52 [ C(DTLB) ] = {
53 [ C(OP_READ) ] = {
54 [ C(RESULT_ACCESS) ] = 0x0040, /* Data Cache Accesses */
55 [ C(RESULT_MISS) ] = 0x0046, /* L1 DTLB and L2 DLTB Miss */
56 },
57 [ C(OP_WRITE) ] = {
58 [ C(RESULT_ACCESS) ] = 0,
59 [ C(RESULT_MISS) ] = 0,
60 },
61 [ C(OP_PREFETCH) ] = {
62 [ C(RESULT_ACCESS) ] = 0,
63 [ C(RESULT_MISS) ] = 0,
64 },
65 },
66 [ C(ITLB) ] = {
67 [ C(OP_READ) ] = {
 68 [ C(RESULT_ACCESS) ] = 0x0080, /* Instruction fetches */
69 [ C(RESULT_MISS) ] = 0x0085, /* Instr. fetch ITLB misses */
70 },
71 [ C(OP_WRITE) ] = {
72 [ C(RESULT_ACCESS) ] = -1,
73 [ C(RESULT_MISS) ] = -1,
74 },
75 [ C(OP_PREFETCH) ] = {
76 [ C(RESULT_ACCESS) ] = -1,
77 [ C(RESULT_MISS) ] = -1,
78 },
79 },
80 [ C(BPU ) ] = {
81 [ C(OP_READ) ] = {
82 [ C(RESULT_ACCESS) ] = 0x00c2, /* Retired Branch Instr. */
83 [ C(RESULT_MISS) ] = 0x00c3, /* Retired Mispredicted BI */
84 },
85 [ C(OP_WRITE) ] = {
86 [ C(RESULT_ACCESS) ] = -1,
87 [ C(RESULT_MISS) ] = -1,
88 },
89 [ C(OP_PREFETCH) ] = {
90 [ C(RESULT_ACCESS) ] = -1,
91 [ C(RESULT_MISS) ] = -1,
92 },
93 },
94};
95
96/*
97 * AMD Performance Monitor K7 and later.
98 */
99static const u64 amd_perfmon_event_map[] =
100{
101 [PERF_COUNT_HW_CPU_CYCLES] = 0x0076,
102 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
103 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080,
104 [PERF_COUNT_HW_CACHE_MISSES] = 0x0081,
105 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
106 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
107};
108
109static u64 amd_pmu_event_map(int hw_event)
110{
111 return amd_perfmon_event_map[hw_event];
112}
113
114static u64 amd_pmu_raw_event(u64 hw_event)
115{
116#define K7_EVNTSEL_EVENT_MASK 0xF000000FFULL
117#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL
118#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL
119#define K7_EVNTSEL_INV_MASK 0x000800000ULL
120#define K7_EVNTSEL_REG_MASK 0x0FF000000ULL
121
122#define K7_EVNTSEL_MASK \
123 (K7_EVNTSEL_EVENT_MASK | \
124 K7_EVNTSEL_UNIT_MASK | \
125 K7_EVNTSEL_EDGE_MASK | \
126 K7_EVNTSEL_INV_MASK | \
127 K7_EVNTSEL_REG_MASK)
128
129 return hw_event & K7_EVNTSEL_MASK;
130}
131
132/*
133 * AMD64 events are detected based on their event codes.
134 */
135static inline int amd_is_nb_event(struct hw_perf_event *hwc)
136{
137 return (hwc->config & 0xe0) == 0xe0;
138}
139
140static inline int amd_has_nb(struct cpu_hw_events *cpuc)
141{
142 struct amd_nb *nb = cpuc->amd_nb;
143
144 return nb && nb->nb_id != -1;
145}
146
147static void amd_put_event_constraints(struct cpu_hw_events *cpuc,
148 struct perf_event *event)
149{
150 struct hw_perf_event *hwc = &event->hw;
151 struct amd_nb *nb = cpuc->amd_nb;
152 int i;
153
154 /*
155 * only care about NB events
156 */
157 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
158 return;
159
160 /*
161 * need to scan whole list because event may not have
162 * been assigned during scheduling
163 *
164 * no race condition possible because event can only
165 * be removed on one CPU at a time AND PMU is disabled
166 * when we come here
167 */
168 for (i = 0; i < x86_pmu.num_events; i++) {
169 if (nb->owners[i] == event) {
170 cmpxchg(nb->owners+i, event, NULL);
171 break;
172 }
173 }
174}
175
176 /*
177 * AMD64 NorthBridge events need special treatment because
178 * counter access needs to be synchronized across all cores
179 * of a package. Refer to BKDG section 3.12
180 *
181 * NB events are events measuring L3 cache, Hypertransport
182 * traffic. They are identified by an event code >= 0xe00.
 183 * They measure events on the NorthBridge which is shared
184 * by all cores on a package. NB events are counted on a
185 * shared set of counters. When a NB event is programmed
186 * in a counter, the data actually comes from a shared
187 * counter. Thus, access to those counters needs to be
188 * synchronized.
189 *
190 * We implement the synchronization such that no two cores
191 * can be measuring NB events using the same counters. Thus,
192 * we maintain a per-NB allocation table. The available slot
193 * is propagated using the event_constraint structure.
194 *
195 * We provide only one choice for each NB event based on
196 * the fact that only NB events have restrictions. Consequently,
197 * if a counter is available, there is a guarantee the NB event
198 * will be assigned to it. If no slot is available, an empty
199 * constraint is returned and scheduling will eventually fail
200 * for this event.
201 *
 202 * Note that all cores attached to the same NB compete for the same
203 * counters to host NB events, this is why we use atomic ops. Some
204 * multi-chip CPUs may have more than one NB.
205 *
206 * Given that resources are allocated (cmpxchg), they must be
207 * eventually freed for others to use. This is accomplished by
208 * calling amd_put_event_constraints().
209 *
210 * Non NB events are not impacted by this restriction.
211 */
212static struct event_constraint *
213amd_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
214{
215 struct hw_perf_event *hwc = &event->hw;
216 struct amd_nb *nb = cpuc->amd_nb;
217 struct perf_event *old = NULL;
218 int max = x86_pmu.num_events;
219 int i, j, k = -1;
220
221 /*
222 * if not NB event or no NB, then no constraints
223 */
224 if (!(amd_has_nb(cpuc) && amd_is_nb_event(hwc)))
225 return &unconstrained;
226
227 /*
228 * detect if already present, if so reuse
229 *
230 * cannot merge with actual allocation
231 * because of possible holes
232 *
233 * event can already be present yet not assigned (in hwc->idx)
234 * because of successive calls to x86_schedule_events() from
235 * hw_perf_group_sched_in() without hw_perf_enable()
236 */
237 for (i = 0; i < max; i++) {
238 /*
239 * keep track of first free slot
240 */
241 if (k == -1 && !nb->owners[i])
242 k = i;
243
244 /* already present, reuse */
245 if (nb->owners[i] == event)
246 goto done;
247 }
248 /*
249 * not present, so grab a new slot
250 * starting either at:
251 */
252 if (hwc->idx != -1) {
253 /* previous assignment */
254 i = hwc->idx;
255 } else if (k != -1) {
256 /* start from free slot found */
257 i = k;
258 } else {
259 /*
260 * event not found, no slot found in
261 * first pass, try again from the
262 * beginning
263 */
264 i = 0;
265 }
266 j = i;
267 do {
268 old = cmpxchg(nb->owners+i, NULL, event);
269 if (!old)
270 break;
271 if (++i == max)
272 i = 0;
273 } while (i != j);
274done:
275 if (!old)
276 return &nb->event_constraints[i];
277
278 return &emptyconstraint;
279}
280
281static struct amd_nb *amd_alloc_nb(int cpu, int nb_id)
282{
283 struct amd_nb *nb;
284 int i;
285
286 nb = kmalloc(sizeof(struct amd_nb), GFP_KERNEL);
287 if (!nb)
288 return NULL;
289
290 memset(nb, 0, sizeof(*nb));
291 nb->nb_id = nb_id;
292
293 /*
294 * initialize all possible NB constraints
295 */
296 for (i = 0; i < x86_pmu.num_events; i++) {
297 __set_bit(i, nb->event_constraints[i].idxmsk);
298 nb->event_constraints[i].weight = 1;
299 }
300 return nb;
301}
302
303static int amd_pmu_cpu_prepare(int cpu)
304{
305 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
306
307 WARN_ON_ONCE(cpuc->amd_nb);
308
309 if (boot_cpu_data.x86_max_cores < 2)
310 return NOTIFY_OK;
311
312 cpuc->amd_nb = amd_alloc_nb(cpu, -1);
313 if (!cpuc->amd_nb)
314 return NOTIFY_BAD;
315
316 return NOTIFY_OK;
317}
318
319static void amd_pmu_cpu_starting(int cpu)
320{
321 struct cpu_hw_events *cpuc = &per_cpu(cpu_hw_events, cpu);
322 struct amd_nb *nb;
323 int i, nb_id;
324
325 if (boot_cpu_data.x86_max_cores < 2)
326 return;
327
328 nb_id = amd_get_nb_id(cpu);
329 WARN_ON_ONCE(nb_id == BAD_APICID);
330
331 raw_spin_lock(&amd_nb_lock);
332
333 for_each_online_cpu(i) {
334 nb = per_cpu(cpu_hw_events, i).amd_nb;
335 if (WARN_ON_ONCE(!nb))
336 continue;
337
338 if (nb->nb_id == nb_id) {
339 kfree(cpuc->amd_nb);
340 cpuc->amd_nb = nb;
341 break;
342 }
343 }
344
345 cpuc->amd_nb->nb_id = nb_id;
346 cpuc->amd_nb->refcnt++;
347
348 raw_spin_unlock(&amd_nb_lock);
349}
350
351static void amd_pmu_cpu_dead(int cpu)
352{
353 struct cpu_hw_events *cpuhw;
354
355 if (boot_cpu_data.x86_max_cores < 2)
356 return;
357
358 cpuhw = &per_cpu(cpu_hw_events, cpu);
359
360 raw_spin_lock(&amd_nb_lock);
361
362 if (cpuhw->amd_nb) {
363 struct amd_nb *nb = cpuhw->amd_nb;
364
365 if (nb->nb_id == -1 || --nb->refcnt == 0)
366 kfree(nb);
367
368 cpuhw->amd_nb = NULL;
369 }
370
371 raw_spin_unlock(&amd_nb_lock);
372}
373
374static __initconst struct x86_pmu amd_pmu = {
375 .name = "AMD",
376 .handle_irq = x86_pmu_handle_irq,
377 .disable_all = x86_pmu_disable_all,
378 .enable_all = x86_pmu_enable_all,
379 .enable = x86_pmu_enable_event,
380 .disable = x86_pmu_disable_event,
381 .eventsel = MSR_K7_EVNTSEL0,
382 .perfctr = MSR_K7_PERFCTR0,
383 .event_map = amd_pmu_event_map,
384 .raw_event = amd_pmu_raw_event,
385 .max_events = ARRAY_SIZE(amd_perfmon_event_map),
386 .num_events = 4,
387 .event_bits = 48,
388 .event_mask = (1ULL << 48) - 1,
389 .apic = 1,
390 /* use highest bit to detect overflow */
391 .max_period = (1ULL << 47) - 1,
392 .get_event_constraints = amd_get_event_constraints,
393 .put_event_constraints = amd_put_event_constraints,
394
395 .cpu_prepare = amd_pmu_cpu_prepare,
396 .cpu_starting = amd_pmu_cpu_starting,
397 .cpu_dead = amd_pmu_cpu_dead,
398};
399
400static __init int amd_pmu_init(void)
401{
402 /* Performance-monitoring supported from K7 and later: */
403 if (boot_cpu_data.x86 < 6)
404 return -ENODEV;
405
406 x86_pmu = amd_pmu;
407
408 /* Events are common for all AMDs */
409 memcpy(hw_cache_event_ids, amd_hw_cache_event_ids,
410 sizeof(hw_cache_event_ids));
411
412 return 0;
413}
414
415#else /* CONFIG_CPU_SUP_AMD */
416
417static int amd_pmu_init(void)
418{
419 return 0;
420}
421
422#endif
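
The comment block above describes the scheme that amd_get_event_constraints()/amd_put_event_constraints() implement: a per-northbridge ownership table with one slot per counter, claimed lock-free with compare-and-swap starting at a preferred index and wrapping around, and released by swapping the owner back to NULL. The stand-alone sketch below models just that claim/release logic with C11 atomics in place of the kernel's cmpxchg(); the array size, function names and the bare void * event handle are illustrative, not kernel API.

/*
 * Minimal model of the per-NB counter ownership table: claim a slot with
 * compare-and-swap, starting at a preferred index and wrapping, release
 * it by swapping the owner back to NULL.
 */
#include <stdatomic.h>
#include <stdio.h>

#define NUM_COUNTERS 4

static _Atomic(void *) nb_owners[NUM_COUNTERS];

/* returns the claimed slot, or -1 if every slot is owned by someone else */
static int claim_nb_slot(void *event, int preferred)
{
	int i = (preferred >= 0 && preferred < NUM_COUNTERS) ? preferred : 0;
	int start = i;

	do {
		void *expected = NULL;

		if (atomic_load(&nb_owners[i]) == event)
			return i;	/* already ours from an earlier pass, reuse it */
		if (atomic_compare_exchange_strong(&nb_owners[i], &expected, event))
			return i;	/* grabbed a free slot */
		if (++i == NUM_COUNTERS)
			i = 0;
	} while (i != start);

	return -1;			/* maps to the empty constraint in the kernel */
}

static void release_nb_slot(void *event)
{
	for (int i = 0; i < NUM_COUNTERS; i++) {
		void *expected = event;

		if (atomic_compare_exchange_strong(&nb_owners[i], &expected, NULL))
			break;
	}
}

int main(void)
{
	int ev1, ev2;
	int s1 = claim_nb_slot(&ev1, 2);
	int s2 = claim_nb_slot(&ev2, 2);	/* collides with ev1, falls through to slot 3 */

	printf("ev1 -> slot %d, ev2 -> slot %d\n", s1, s2);
	release_nb_slot(&ev2);
	release_nb_slot(&ev1);
	return 0;
}

The kernel version additionally does a first pass that both looks for an existing assignment and remembers the first free slot; the sketch folds that into the claim loop for brevity.
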
diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c
new file mode 100644
index 000000000000..9c794ac87837
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_intel.c
@@ -0,0 +1,980 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Intel PerfMon, used on Core and later.
5 */
6static const u64 intel_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x003c,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x4f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x412e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x013c,
15};
16
17static struct event_constraint intel_core_event_constraints[] =
18{
19 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
20 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
21 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
22 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
23 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
24 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FP_COMP_INSTR_RET */
25 EVENT_CONSTRAINT_END
26};
27
28static struct event_constraint intel_core2_event_constraints[] =
29{
30 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
31 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
32 /*
33 * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event
34 * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed
35 * ratio between these counters.
36 */
37 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
38 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
39 INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */
40 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
41 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
42 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
43 INTEL_EVENT_CONSTRAINT(0x18, 0x1), /* IDLE_DURING_DIV */
44 INTEL_EVENT_CONSTRAINT(0x19, 0x2), /* DELAYED_BYPASS */
45 INTEL_EVENT_CONSTRAINT(0xa1, 0x1), /* RS_UOPS_DISPATCH_CYCLES */
46 INTEL_EVENT_CONSTRAINT(0xc9, 0x1), /* ITLB_MISS_RETIRED (T30-9) */
47 INTEL_EVENT_CONSTRAINT(0xcb, 0x1), /* MEM_LOAD_RETIRED */
48 EVENT_CONSTRAINT_END
49};
50
51static struct event_constraint intel_nehalem_event_constraints[] =
52{
53 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
54 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
55 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
56 INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */
57 INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */
58 INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */
59 INTEL_EVENT_CONSTRAINT(0x43, 0x3), /* L1D_ALL_REF */
60 INTEL_EVENT_CONSTRAINT(0x48, 0x3), /* L1D_PEND_MISS */
61 INTEL_EVENT_CONSTRAINT(0x4e, 0x3), /* L1D_PREFETCH */
62 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
63 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
64 EVENT_CONSTRAINT_END
65};
66
67static struct event_constraint intel_westmere_event_constraints[] =
68{
69 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
70 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
71 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
72 INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */
73 INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */
74 INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */
75 EVENT_CONSTRAINT_END
76};
77
78static struct event_constraint intel_gen_event_constraints[] =
79{
80 FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */
81 FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */
82 /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */
83 EVENT_CONSTRAINT_END
84};
85
86static u64 intel_pmu_event_map(int hw_event)
87{
88 return intel_perfmon_event_map[hw_event];
89}
90
91static __initconst u64 westmere_hw_cache_event_ids
92 [PERF_COUNT_HW_CACHE_MAX]
93 [PERF_COUNT_HW_CACHE_OP_MAX]
94 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
95{
96 [ C(L1D) ] = {
97 [ C(OP_READ) ] = {
98 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
99 [ C(RESULT_MISS) ] = 0x0151, /* L1D.REPL */
100 },
101 [ C(OP_WRITE) ] = {
 102 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
103 [ C(RESULT_MISS) ] = 0x0251, /* L1D.M_REPL */
104 },
105 [ C(OP_PREFETCH) ] = {
106 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
107 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
108 },
109 },
110 [ C(L1I ) ] = {
111 [ C(OP_READ) ] = {
112 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
113 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
114 },
115 [ C(OP_WRITE) ] = {
116 [ C(RESULT_ACCESS) ] = -1,
117 [ C(RESULT_MISS) ] = -1,
118 },
119 [ C(OP_PREFETCH) ] = {
120 [ C(RESULT_ACCESS) ] = 0x0,
121 [ C(RESULT_MISS) ] = 0x0,
122 },
123 },
124 [ C(LL ) ] = {
125 [ C(OP_READ) ] = {
126 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
127 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
128 },
129 [ C(OP_WRITE) ] = {
130 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
131 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
132 },
133 [ C(OP_PREFETCH) ] = {
134 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
135 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
136 },
137 },
138 [ C(DTLB) ] = {
139 [ C(OP_READ) ] = {
140 [ C(RESULT_ACCESS) ] = 0x010b, /* MEM_INST_RETIRED.LOADS */
141 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
142 },
143 [ C(OP_WRITE) ] = {
 144 [ C(RESULT_ACCESS) ] = 0x020b, /* MEM_INST_RETIRED.STORES */
145 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
146 },
147 [ C(OP_PREFETCH) ] = {
148 [ C(RESULT_ACCESS) ] = 0x0,
149 [ C(RESULT_MISS) ] = 0x0,
150 },
151 },
152 [ C(ITLB) ] = {
153 [ C(OP_READ) ] = {
154 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
155 [ C(RESULT_MISS) ] = 0x0185, /* ITLB_MISSES.ANY */
156 },
157 [ C(OP_WRITE) ] = {
158 [ C(RESULT_ACCESS) ] = -1,
159 [ C(RESULT_MISS) ] = -1,
160 },
161 [ C(OP_PREFETCH) ] = {
162 [ C(RESULT_ACCESS) ] = -1,
163 [ C(RESULT_MISS) ] = -1,
164 },
165 },
166 [ C(BPU ) ] = {
167 [ C(OP_READ) ] = {
168 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
169 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
170 },
171 [ C(OP_WRITE) ] = {
172 [ C(RESULT_ACCESS) ] = -1,
173 [ C(RESULT_MISS) ] = -1,
174 },
175 [ C(OP_PREFETCH) ] = {
176 [ C(RESULT_ACCESS) ] = -1,
177 [ C(RESULT_MISS) ] = -1,
178 },
179 },
180};
181
182static __initconst u64 nehalem_hw_cache_event_ids
183 [PERF_COUNT_HW_CACHE_MAX]
184 [PERF_COUNT_HW_CACHE_OP_MAX]
185 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
186{
187 [ C(L1D) ] = {
188 [ C(OP_READ) ] = {
189 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
190 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
191 },
192 [ C(OP_WRITE) ] = {
193 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
194 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
195 },
196 [ C(OP_PREFETCH) ] = {
197 [ C(RESULT_ACCESS) ] = 0x014e, /* L1D_PREFETCH.REQUESTS */
198 [ C(RESULT_MISS) ] = 0x024e, /* L1D_PREFETCH.MISS */
199 },
200 },
201 [ C(L1I ) ] = {
202 [ C(OP_READ) ] = {
203 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
204 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
205 },
206 [ C(OP_WRITE) ] = {
207 [ C(RESULT_ACCESS) ] = -1,
208 [ C(RESULT_MISS) ] = -1,
209 },
210 [ C(OP_PREFETCH) ] = {
211 [ C(RESULT_ACCESS) ] = 0x0,
212 [ C(RESULT_MISS) ] = 0x0,
213 },
214 },
215 [ C(LL ) ] = {
216 [ C(OP_READ) ] = {
217 [ C(RESULT_ACCESS) ] = 0x0324, /* L2_RQSTS.LOADS */
218 [ C(RESULT_MISS) ] = 0x0224, /* L2_RQSTS.LD_MISS */
219 },
220 [ C(OP_WRITE) ] = {
221 [ C(RESULT_ACCESS) ] = 0x0c24, /* L2_RQSTS.RFOS */
222 [ C(RESULT_MISS) ] = 0x0824, /* L2_RQSTS.RFO_MISS */
223 },
224 [ C(OP_PREFETCH) ] = {
225 [ C(RESULT_ACCESS) ] = 0x4f2e, /* LLC Reference */
226 [ C(RESULT_MISS) ] = 0x412e, /* LLC Misses */
227 },
228 },
229 [ C(DTLB) ] = {
230 [ C(OP_READ) ] = {
231 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
232 [ C(RESULT_MISS) ] = 0x0108, /* DTLB_LOAD_MISSES.ANY */
233 },
234 [ C(OP_WRITE) ] = {
235 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
236 [ C(RESULT_MISS) ] = 0x010c, /* MEM_STORE_RETIRED.DTLB_MISS */
237 },
238 [ C(OP_PREFETCH) ] = {
239 [ C(RESULT_ACCESS) ] = 0x0,
240 [ C(RESULT_MISS) ] = 0x0,
241 },
242 },
243 [ C(ITLB) ] = {
244 [ C(OP_READ) ] = {
245 [ C(RESULT_ACCESS) ] = 0x01c0, /* INST_RETIRED.ANY_P */
246 [ C(RESULT_MISS) ] = 0x20c8, /* ITLB_MISS_RETIRED */
247 },
248 [ C(OP_WRITE) ] = {
249 [ C(RESULT_ACCESS) ] = -1,
250 [ C(RESULT_MISS) ] = -1,
251 },
252 [ C(OP_PREFETCH) ] = {
253 [ C(RESULT_ACCESS) ] = -1,
254 [ C(RESULT_MISS) ] = -1,
255 },
256 },
257 [ C(BPU ) ] = {
258 [ C(OP_READ) ] = {
259 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ALL_BRANCHES */
260 [ C(RESULT_MISS) ] = 0x03e8, /* BPU_CLEARS.ANY */
261 },
262 [ C(OP_WRITE) ] = {
263 [ C(RESULT_ACCESS) ] = -1,
264 [ C(RESULT_MISS) ] = -1,
265 },
266 [ C(OP_PREFETCH) ] = {
267 [ C(RESULT_ACCESS) ] = -1,
268 [ C(RESULT_MISS) ] = -1,
269 },
270 },
271};
272
273static __initconst u64 core2_hw_cache_event_ids
274 [PERF_COUNT_HW_CACHE_MAX]
275 [PERF_COUNT_HW_CACHE_OP_MAX]
276 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
277{
278 [ C(L1D) ] = {
279 [ C(OP_READ) ] = {
280 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI */
281 [ C(RESULT_MISS) ] = 0x0140, /* L1D_CACHE_LD.I_STATE */
282 },
283 [ C(OP_WRITE) ] = {
284 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI */
285 [ C(RESULT_MISS) ] = 0x0141, /* L1D_CACHE_ST.I_STATE */
286 },
287 [ C(OP_PREFETCH) ] = {
288 [ C(RESULT_ACCESS) ] = 0x104e, /* L1D_PREFETCH.REQUESTS */
289 [ C(RESULT_MISS) ] = 0,
290 },
291 },
292 [ C(L1I ) ] = {
293 [ C(OP_READ) ] = {
294 [ C(RESULT_ACCESS) ] = 0x0080, /* L1I.READS */
295 [ C(RESULT_MISS) ] = 0x0081, /* L1I.MISSES */
296 },
297 [ C(OP_WRITE) ] = {
298 [ C(RESULT_ACCESS) ] = -1,
299 [ C(RESULT_MISS) ] = -1,
300 },
301 [ C(OP_PREFETCH) ] = {
302 [ C(RESULT_ACCESS) ] = 0,
303 [ C(RESULT_MISS) ] = 0,
304 },
305 },
306 [ C(LL ) ] = {
307 [ C(OP_READ) ] = {
308 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
309 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
310 },
311 [ C(OP_WRITE) ] = {
312 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
313 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
314 },
315 [ C(OP_PREFETCH) ] = {
316 [ C(RESULT_ACCESS) ] = 0,
317 [ C(RESULT_MISS) ] = 0,
318 },
319 },
320 [ C(DTLB) ] = {
321 [ C(OP_READ) ] = {
322 [ C(RESULT_ACCESS) ] = 0x0f40, /* L1D_CACHE_LD.MESI (alias) */
323 [ C(RESULT_MISS) ] = 0x0208, /* DTLB_MISSES.MISS_LD */
324 },
325 [ C(OP_WRITE) ] = {
326 [ C(RESULT_ACCESS) ] = 0x0f41, /* L1D_CACHE_ST.MESI (alias) */
327 [ C(RESULT_MISS) ] = 0x0808, /* DTLB_MISSES.MISS_ST */
328 },
329 [ C(OP_PREFETCH) ] = {
330 [ C(RESULT_ACCESS) ] = 0,
331 [ C(RESULT_MISS) ] = 0,
332 },
333 },
334 [ C(ITLB) ] = {
335 [ C(OP_READ) ] = {
336 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
337 [ C(RESULT_MISS) ] = 0x1282, /* ITLBMISSES */
338 },
339 [ C(OP_WRITE) ] = {
340 [ C(RESULT_ACCESS) ] = -1,
341 [ C(RESULT_MISS) ] = -1,
342 },
343 [ C(OP_PREFETCH) ] = {
344 [ C(RESULT_ACCESS) ] = -1,
345 [ C(RESULT_MISS) ] = -1,
346 },
347 },
348 [ C(BPU ) ] = {
349 [ C(OP_READ) ] = {
350 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
351 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
352 },
353 [ C(OP_WRITE) ] = {
354 [ C(RESULT_ACCESS) ] = -1,
355 [ C(RESULT_MISS) ] = -1,
356 },
357 [ C(OP_PREFETCH) ] = {
358 [ C(RESULT_ACCESS) ] = -1,
359 [ C(RESULT_MISS) ] = -1,
360 },
361 },
362};
363
364static __initconst u64 atom_hw_cache_event_ids
365 [PERF_COUNT_HW_CACHE_MAX]
366 [PERF_COUNT_HW_CACHE_OP_MAX]
367 [PERF_COUNT_HW_CACHE_RESULT_MAX] =
368{
369 [ C(L1D) ] = {
370 [ C(OP_READ) ] = {
371 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE.LD */
372 [ C(RESULT_MISS) ] = 0,
373 },
374 [ C(OP_WRITE) ] = {
375 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE.ST */
376 [ C(RESULT_MISS) ] = 0,
377 },
378 [ C(OP_PREFETCH) ] = {
379 [ C(RESULT_ACCESS) ] = 0x0,
380 [ C(RESULT_MISS) ] = 0,
381 },
382 },
383 [ C(L1I ) ] = {
384 [ C(OP_READ) ] = {
385 [ C(RESULT_ACCESS) ] = 0x0380, /* L1I.READS */
386 [ C(RESULT_MISS) ] = 0x0280, /* L1I.MISSES */
387 },
388 [ C(OP_WRITE) ] = {
389 [ C(RESULT_ACCESS) ] = -1,
390 [ C(RESULT_MISS) ] = -1,
391 },
392 [ C(OP_PREFETCH) ] = {
393 [ C(RESULT_ACCESS) ] = 0,
394 [ C(RESULT_MISS) ] = 0,
395 },
396 },
397 [ C(LL ) ] = {
398 [ C(OP_READ) ] = {
399 [ C(RESULT_ACCESS) ] = 0x4f29, /* L2_LD.MESI */
400 [ C(RESULT_MISS) ] = 0x4129, /* L2_LD.ISTATE */
401 },
402 [ C(OP_WRITE) ] = {
403 [ C(RESULT_ACCESS) ] = 0x4f2A, /* L2_ST.MESI */
404 [ C(RESULT_MISS) ] = 0x412A, /* L2_ST.ISTATE */
405 },
406 [ C(OP_PREFETCH) ] = {
407 [ C(RESULT_ACCESS) ] = 0,
408 [ C(RESULT_MISS) ] = 0,
409 },
410 },
411 [ C(DTLB) ] = {
412 [ C(OP_READ) ] = {
413 [ C(RESULT_ACCESS) ] = 0x2140, /* L1D_CACHE_LD.MESI (alias) */
414 [ C(RESULT_MISS) ] = 0x0508, /* DTLB_MISSES.MISS_LD */
415 },
416 [ C(OP_WRITE) ] = {
417 [ C(RESULT_ACCESS) ] = 0x2240, /* L1D_CACHE_ST.MESI (alias) */
418 [ C(RESULT_MISS) ] = 0x0608, /* DTLB_MISSES.MISS_ST */
419 },
420 [ C(OP_PREFETCH) ] = {
421 [ C(RESULT_ACCESS) ] = 0,
422 [ C(RESULT_MISS) ] = 0,
423 },
424 },
425 [ C(ITLB) ] = {
426 [ C(OP_READ) ] = {
427 [ C(RESULT_ACCESS) ] = 0x00c0, /* INST_RETIRED.ANY_P */
428 [ C(RESULT_MISS) ] = 0x0282, /* ITLB.MISSES */
429 },
430 [ C(OP_WRITE) ] = {
431 [ C(RESULT_ACCESS) ] = -1,
432 [ C(RESULT_MISS) ] = -1,
433 },
434 [ C(OP_PREFETCH) ] = {
435 [ C(RESULT_ACCESS) ] = -1,
436 [ C(RESULT_MISS) ] = -1,
437 },
438 },
439 [ C(BPU ) ] = {
440 [ C(OP_READ) ] = {
441 [ C(RESULT_ACCESS) ] = 0x00c4, /* BR_INST_RETIRED.ANY */
442 [ C(RESULT_MISS) ] = 0x00c5, /* BP_INST_RETIRED.MISPRED */
443 },
444 [ C(OP_WRITE) ] = {
445 [ C(RESULT_ACCESS) ] = -1,
446 [ C(RESULT_MISS) ] = -1,
447 },
448 [ C(OP_PREFETCH) ] = {
449 [ C(RESULT_ACCESS) ] = -1,
450 [ C(RESULT_MISS) ] = -1,
451 },
452 },
453};
454
455static u64 intel_pmu_raw_event(u64 hw_event)
456{
457#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL
458#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL
459#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL
460#define CORE_EVNTSEL_INV_MASK 0x00800000ULL
461#define CORE_EVNTSEL_REG_MASK 0xFF000000ULL
462
463#define CORE_EVNTSEL_MASK \
464 (INTEL_ARCH_EVTSEL_MASK | \
465 INTEL_ARCH_UNIT_MASK | \
466 INTEL_ARCH_EDGE_MASK | \
467 INTEL_ARCH_INV_MASK | \
468 INTEL_ARCH_CNT_MASK)
469
470 return hw_event & CORE_EVNTSEL_MASK;
471}
472
473static void intel_pmu_enable_bts(u64 config)
474{
475 unsigned long debugctlmsr;
476
477 debugctlmsr = get_debugctlmsr();
478
479 debugctlmsr |= X86_DEBUGCTL_TR;
480 debugctlmsr |= X86_DEBUGCTL_BTS;
481 debugctlmsr |= X86_DEBUGCTL_BTINT;
482
483 if (!(config & ARCH_PERFMON_EVENTSEL_OS))
484 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_OS;
485
486 if (!(config & ARCH_PERFMON_EVENTSEL_USR))
487 debugctlmsr |= X86_DEBUGCTL_BTS_OFF_USR;
488
489 update_debugctlmsr(debugctlmsr);
490}
491
492static void intel_pmu_disable_bts(void)
493{
494 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
495 unsigned long debugctlmsr;
496
497 if (!cpuc->ds)
498 return;
499
500 debugctlmsr = get_debugctlmsr();
501
502 debugctlmsr &=
503 ~(X86_DEBUGCTL_TR | X86_DEBUGCTL_BTS | X86_DEBUGCTL_BTINT |
504 X86_DEBUGCTL_BTS_OFF_OS | X86_DEBUGCTL_BTS_OFF_USR);
505
506 update_debugctlmsr(debugctlmsr);
507}
508
509static void intel_pmu_disable_all(void)
510{
511 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
512
513 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
514
515 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask))
516 intel_pmu_disable_bts();
517}
518
519static void intel_pmu_enable_all(void)
520{
521 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
522
523 wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
524
525 if (test_bit(X86_PMC_IDX_FIXED_BTS, cpuc->active_mask)) {
526 struct perf_event *event =
527 cpuc->events[X86_PMC_IDX_FIXED_BTS];
528
529 if (WARN_ON_ONCE(!event))
530 return;
531
532 intel_pmu_enable_bts(event->hw.config);
533 }
534}
535
536static inline u64 intel_pmu_get_status(void)
537{
538 u64 status;
539
540 rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status);
541
542 return status;
543}
544
545static inline void intel_pmu_ack_status(u64 ack)
546{
547 wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack);
548}
549
550static inline void
551intel_pmu_disable_fixed(struct hw_perf_event *hwc)
552{
553 int idx = hwc->idx - X86_PMC_IDX_FIXED;
554 u64 ctrl_val, mask;
555
556 mask = 0xfULL << (idx * 4);
557
558 rdmsrl(hwc->config_base, ctrl_val);
559 ctrl_val &= ~mask;
560 (void)checking_wrmsrl(hwc->config_base, ctrl_val);
561}
562
563static void intel_pmu_drain_bts_buffer(void)
564{
565 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
566 struct debug_store *ds = cpuc->ds;
567 struct bts_record {
568 u64 from;
569 u64 to;
570 u64 flags;
571 };
572 struct perf_event *event = cpuc->events[X86_PMC_IDX_FIXED_BTS];
573 struct bts_record *at, *top;
574 struct perf_output_handle handle;
575 struct perf_event_header header;
576 struct perf_sample_data data;
577 struct pt_regs regs;
578
579 if (!event)
580 return;
581
582 if (!ds)
583 return;
584
585 at = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
586 top = (struct bts_record *)(unsigned long)ds->bts_index;
587
588 if (top <= at)
589 return;
590
591 ds->bts_index = ds->bts_buffer_base;
592
593 perf_sample_data_init(&data, 0);
594
595 data.period = event->hw.last_period;
596 regs.ip = 0;
597
598 /*
599 * Prepare a generic sample, i.e. fill in the invariant fields.
600 * We will overwrite the from and to address before we output
601 * the sample.
602 */
603 perf_prepare_sample(&header, &data, event, &regs);
604
605 if (perf_output_begin(&handle, event,
606 header.size * (top - at), 1, 1))
607 return;
608
609 for (; at < top; at++) {
610 data.ip = at->from;
611 data.addr = at->to;
612
613 perf_output_sample(&handle, &header, &data, event);
614 }
615
616 perf_output_end(&handle);
617
618 /* There's new data available. */
619 event->hw.interrupts++;
620 event->pending_kill = POLL_IN;
621}
622
623static inline void
624intel_pmu_disable_event(struct perf_event *event)
625{
626 struct hw_perf_event *hwc = &event->hw;
627
628 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
629 intel_pmu_disable_bts();
630 intel_pmu_drain_bts_buffer();
631 return;
632 }
633
634 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
635 intel_pmu_disable_fixed(hwc);
636 return;
637 }
638
639 x86_pmu_disable_event(event);
640}
641
642static inline void
643intel_pmu_enable_fixed(struct hw_perf_event *hwc)
644{
645 int idx = hwc->idx - X86_PMC_IDX_FIXED;
646 u64 ctrl_val, bits, mask;
647 int err;
648
649 /*
650 * Enable IRQ generation (0x8),
651 * and enable ring-3 counting (0x2) and ring-0 counting (0x1)
652 * if requested:
653 */
654 bits = 0x8ULL;
655 if (hwc->config & ARCH_PERFMON_EVENTSEL_USR)
656 bits |= 0x2;
657 if (hwc->config & ARCH_PERFMON_EVENTSEL_OS)
658 bits |= 0x1;
659
660 /*
661 * ANY bit is supported in v3 and up
662 */
663 if (x86_pmu.version > 2 && hwc->config & ARCH_PERFMON_EVENTSEL_ANY)
664 bits |= 0x4;
665
666 bits <<= (idx * 4);
667 mask = 0xfULL << (idx * 4);
668
669 rdmsrl(hwc->config_base, ctrl_val);
670 ctrl_val &= ~mask;
671 ctrl_val |= bits;
672 err = checking_wrmsrl(hwc->config_base, ctrl_val);
673}
674
675static void intel_pmu_enable_event(struct perf_event *event)
676{
677 struct hw_perf_event *hwc = &event->hw;
678
679 if (unlikely(hwc->idx == X86_PMC_IDX_FIXED_BTS)) {
680 if (!__get_cpu_var(cpu_hw_events).enabled)
681 return;
682
683 intel_pmu_enable_bts(hwc->config);
684 return;
685 }
686
687 if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
688 intel_pmu_enable_fixed(hwc);
689 return;
690 }
691
692 __x86_pmu_enable_event(hwc);
693}
694
695/*
696 * Save and restart an expired event. Called by NMI contexts,
697 * so it has to be careful about preempting normal event ops:
698 */
699static int intel_pmu_save_and_restart(struct perf_event *event)
700{
701 x86_perf_event_update(event);
702 return x86_perf_event_set_period(event);
703}
704
705static void intel_pmu_reset(void)
706{
707 struct debug_store *ds = __get_cpu_var(cpu_hw_events).ds;
708 unsigned long flags;
709 int idx;
710
711 if (!x86_pmu.num_events)
712 return;
713
714 local_irq_save(flags);
715
716 printk("clearing PMU state on CPU#%d\n", smp_processor_id());
717
718 for (idx = 0; idx < x86_pmu.num_events; idx++) {
719 checking_wrmsrl(x86_pmu.eventsel + idx, 0ull);
720 checking_wrmsrl(x86_pmu.perfctr + idx, 0ull);
721 }
722 for (idx = 0; idx < x86_pmu.num_events_fixed; idx++) {
723 checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull);
724 }
725 if (ds)
726 ds->bts_index = ds->bts_buffer_base;
727
728 local_irq_restore(flags);
729}
730
731/*
732 * This handler is triggered by the local APIC, so the APIC IRQ handling
733 * rules apply:
734 */
735static int intel_pmu_handle_irq(struct pt_regs *regs)
736{
737 struct perf_sample_data data;
738 struct cpu_hw_events *cpuc;
739 int bit, loops;
740 u64 ack, status;
741
742 perf_sample_data_init(&data, 0);
743
744 cpuc = &__get_cpu_var(cpu_hw_events);
745
746 intel_pmu_disable_all();
747 intel_pmu_drain_bts_buffer();
748 status = intel_pmu_get_status();
749 if (!status) {
750 intel_pmu_enable_all();
751 return 0;
752 }
753
754 loops = 0;
755again:
756 if (++loops > 100) {
757 WARN_ONCE(1, "perfevents: irq loop stuck!\n");
758 perf_event_print_debug();
759 intel_pmu_reset();
760 goto done;
761 }
762
763 inc_irq_stat(apic_perf_irqs);
764 ack = status;
765 for_each_set_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) {
766 struct perf_event *event = cpuc->events[bit];
767
768 if (!test_bit(bit, cpuc->active_mask))
769 continue;
770
771 if (!intel_pmu_save_and_restart(event))
772 continue;
773
774 data.period = event->hw.last_period;
775
776 if (perf_event_overflow(event, 1, &data, regs))
777 x86_pmu_stop(event);
778 }
779
780 intel_pmu_ack_status(ack);
781
782 /*
783 * Repeat if there is more work to be done:
784 */
785 status = intel_pmu_get_status();
786 if (status)
787 goto again;
788
789done:
790 intel_pmu_enable_all();
791 return 1;
792}
793
794static struct event_constraint bts_constraint =
795 EVENT_CONSTRAINT(0, 1ULL << X86_PMC_IDX_FIXED_BTS, 0);
796
797static struct event_constraint *
798intel_special_constraints(struct perf_event *event)
799{
800 unsigned int hw_event;
801
802 hw_event = event->hw.config & INTEL_ARCH_EVENT_MASK;
803
804 if (unlikely((hw_event ==
805 x86_pmu.event_map(PERF_COUNT_HW_BRANCH_INSTRUCTIONS)) &&
806 (event->hw.sample_period == 1))) {
807
808 return &bts_constraint;
809 }
810 return NULL;
811}
812
813static struct event_constraint *
814intel_get_event_constraints(struct cpu_hw_events *cpuc, struct perf_event *event)
815{
816 struct event_constraint *c;
817
818 c = intel_special_constraints(event);
819 if (c)
820 return c;
821
822 return x86_get_event_constraints(cpuc, event);
823}
824
825static __initconst struct x86_pmu core_pmu = {
826 .name = "core",
827 .handle_irq = x86_pmu_handle_irq,
828 .disable_all = x86_pmu_disable_all,
829 .enable_all = x86_pmu_enable_all,
830 .enable = x86_pmu_enable_event,
831 .disable = x86_pmu_disable_event,
832 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
833 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
834 .event_map = intel_pmu_event_map,
835 .raw_event = intel_pmu_raw_event,
836 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
837 .apic = 1,
838 /*
839 * Intel PMCs cannot be accessed sanely above 32 bit width,
840 * so we install an artificial 1<<31 period regardless of
841 * the generic event period:
842 */
843 .max_period = (1ULL << 31) - 1,
844 .get_event_constraints = intel_get_event_constraints,
845 .event_constraints = intel_core_event_constraints,
846};
847
848static __initconst struct x86_pmu intel_pmu = {
849 .name = "Intel",
850 .handle_irq = intel_pmu_handle_irq,
851 .disable_all = intel_pmu_disable_all,
852 .enable_all = intel_pmu_enable_all,
853 .enable = intel_pmu_enable_event,
854 .disable = intel_pmu_disable_event,
855 .eventsel = MSR_ARCH_PERFMON_EVENTSEL0,
856 .perfctr = MSR_ARCH_PERFMON_PERFCTR0,
857 .event_map = intel_pmu_event_map,
858 .raw_event = intel_pmu_raw_event,
859 .max_events = ARRAY_SIZE(intel_perfmon_event_map),
860 .apic = 1,
861 /*
862 * Intel PMCs cannot be accessed sanely above 32 bit width,
863 * so we install an artificial 1<<31 period regardless of
864 * the generic event period:
865 */
866 .max_period = (1ULL << 31) - 1,
867 .enable_bts = intel_pmu_enable_bts,
868 .disable_bts = intel_pmu_disable_bts,
869 .get_event_constraints = intel_get_event_constraints,
870
871 .cpu_starting = init_debug_store_on_cpu,
872 .cpu_dying = fini_debug_store_on_cpu,
873};
874
875static __init int intel_pmu_init(void)
876{
877 union cpuid10_edx edx;
878 union cpuid10_eax eax;
879 unsigned int unused;
880 unsigned int ebx;
881 int version;
882
883 if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
884 /* check for P6 processor family */
885 if (boot_cpu_data.x86 == 6) {
886 return p6_pmu_init();
887 } else {
888 return -ENODEV;
889 }
890 }
891
892 /*
893 * Check whether the Architectural PerfMon supports
894 * Branch Misses Retired hw_event or not.
895 */
896 cpuid(10, &eax.full, &ebx, &unused, &edx.full);
897 if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED)
898 return -ENODEV;
899
900 version = eax.split.version_id;
901 if (version < 2)
902 x86_pmu = core_pmu;
903 else
904 x86_pmu = intel_pmu;
905
906 x86_pmu.version = version;
907 x86_pmu.num_events = eax.split.num_events;
908 x86_pmu.event_bits = eax.split.bit_width;
909 x86_pmu.event_mask = (1ULL << eax.split.bit_width) - 1;
910
911 /*
912 * Quirk: v2 perfmon does not report fixed-purpose events, so
913 * assume at least 3 events:
914 */
915 if (version > 1)
916 x86_pmu.num_events_fixed = max((int)edx.split.num_events_fixed, 3);
917
918 /*
919 * Install the hw-cache-events table:
920 */
921 switch (boot_cpu_data.x86_model) {
922 case 14: /* 65 nm core solo/duo, "Yonah" */
923 pr_cont("Core events, ");
924 break;
925
926 case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */
927 case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */
928 case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */
929 case 29: /* six-core 45 nm xeon "Dunnington" */
930 memcpy(hw_cache_event_ids, core2_hw_cache_event_ids,
931 sizeof(hw_cache_event_ids));
932
933 x86_pmu.event_constraints = intel_core2_event_constraints;
934 pr_cont("Core2 events, ");
935 break;
936
937 case 26: /* 45 nm nehalem, "Bloomfield" */
938 case 30: /* 45 nm nehalem, "Lynnfield" */
939 case 46: /* 45 nm nehalem-ex, "Beckton" */
940 memcpy(hw_cache_event_ids, nehalem_hw_cache_event_ids,
941 sizeof(hw_cache_event_ids));
942
943 x86_pmu.event_constraints = intel_nehalem_event_constraints;
944 pr_cont("Nehalem/Corei7 events, ");
945 break;
946 case 28: /* Atom */
947 memcpy(hw_cache_event_ids, atom_hw_cache_event_ids,
948 sizeof(hw_cache_event_ids));
949
950 x86_pmu.event_constraints = intel_gen_event_constraints;
951 pr_cont("Atom events, ");
952 break;
953
954 case 37: /* 32 nm nehalem, "Clarkdale" */
955 case 44: /* 32 nm nehalem, "Gulftown" */
956 memcpy(hw_cache_event_ids, westmere_hw_cache_event_ids,
957 sizeof(hw_cache_event_ids));
958
959 x86_pmu.event_constraints = intel_westmere_event_constraints;
960 pr_cont("Westmere events, ");
961 break;
962
963 default:
964 /*
965 * default constraints for v2 and up
966 */
967 x86_pmu.event_constraints = intel_gen_event_constraints;
968 pr_cont("generic architected perfmon, ");
969 }
970 return 0;
971}
972
973#else /* CONFIG_CPU_SUP_INTEL */
974
975static int intel_pmu_init(void)
976{
977 return 0;
978}
979
980#endif /* CONFIG_CPU_SUP_INTEL */
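
Of the pieces added above, intel_pmu_enable_fixed() is the least self-explanatory: each fixed counter owns a 4-bit field in the IA32_FIXED_CTR_CTRL MSR, and enabling a counter means rebuilding only that field (bit 3 for PMI on overflow, bit 1 for ring-3 counting, bit 0 for ring-0 counting, and bit 2 for the any-thread option on PMU v3+) while leaving its neighbours alone. The helper below is a stand-alone model of that bit arithmetic; it performs no MSR access, and while the EVTSEL_* flag values mirror the ARCH_PERFMON_EVENTSEL_* bits, the function itself is illustrative rather than kernel code.

/*
 * Model of the read-modify-write done by intel_pmu_enable_fixed():
 * clear the counter's 4-bit field in IA32_FIXED_CTR_CTRL, then install
 * the PMI/ring-0/ring-3/any-thread bits derived from the event config.
 */
#include <stdint.h>
#include <stdio.h>

#define EVTSEL_USR	(1u << 16)	/* ring-3 counting, like ARCH_PERFMON_EVENTSEL_USR */
#define EVTSEL_OS	(1u << 17)	/* ring-0 counting, like ARCH_PERFMON_EVENTSEL_OS  */
#define EVTSEL_ANY	(1u << 21)	/* any-thread bit, PMU version 3 and later         */

static uint64_t fixed_ctr_ctrl_enable(uint64_t ctrl, int idx,
				      uint32_t config, int pmu_version)
{
	uint64_t bits = 0x8;			/* always enable PMI on overflow */

	if (config & EVTSEL_USR)
		bits |= 0x2;
	if (config & EVTSEL_OS)
		bits |= 0x1;
	if (pmu_version > 2 && (config & EVTSEL_ANY))
		bits |= 0x4;

	ctrl &= ~(0xfULL << (idx * 4));		/* clear this counter's field... */
	return ctrl | (bits << (idx * 4));	/* ...and install the new bits   */
}

int main(void)
{
	/* fixed counter 1 (CPU_CLK_UNHALTED.CORE), user+kernel counting, PMU v3 */
	uint64_t ctrl = fixed_ctr_ctrl_enable(0, 1, EVTSEL_USR | EVTSEL_OS, 3);

	printf("IA32_FIXED_CTR_CTRL = %#llx\n", (unsigned long long)ctrl);
	return 0;
}

With idx = 1 and user+kernel counting this yields 0xb0, i.e. field 1 set to PMI|ring-3|ring-0 — the same value the rdmsrl()/checking_wrmsrl() pair above would install.
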
diff --git a/arch/x86/kernel/cpu/perf_event_p6.c b/arch/x86/kernel/cpu/perf_event_p6.c
new file mode 100644
index 000000000000..a330485d14da
--- /dev/null
+++ b/arch/x86/kernel/cpu/perf_event_p6.c
@@ -0,0 +1,159 @@
1#ifdef CONFIG_CPU_SUP_INTEL
2
3/*
4 * Not sure about some of these
5 */
6static const u64 p6_perfmon_event_map[] =
7{
8 [PERF_COUNT_HW_CPU_CYCLES] = 0x0079,
9 [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0,
10 [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0f2e,
11 [PERF_COUNT_HW_CACHE_MISSES] = 0x012e,
12 [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4,
13 [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5,
14 [PERF_COUNT_HW_BUS_CYCLES] = 0x0062,
15};
16
17static u64 p6_pmu_event_map(int hw_event)
18{
19 return p6_perfmon_event_map[hw_event];
20}
21
22/*
23 * Event setting that is specified not to count anything.
24 * We use this to effectively disable a counter.
25 *
26 * L2_RQSTS with 0 MESI unit mask.
27 */
28#define P6_NOP_EVENT 0x0000002EULL
29
30static u64 p6_pmu_raw_event(u64 hw_event)
31{
32#define P6_EVNTSEL_EVENT_MASK 0x000000FFULL
33#define P6_EVNTSEL_UNIT_MASK 0x0000FF00ULL
34#define P6_EVNTSEL_EDGE_MASK 0x00040000ULL
35#define P6_EVNTSEL_INV_MASK 0x00800000ULL
36#define P6_EVNTSEL_REG_MASK 0xFF000000ULL
37
38#define P6_EVNTSEL_MASK \
39 (P6_EVNTSEL_EVENT_MASK | \
40 P6_EVNTSEL_UNIT_MASK | \
41 P6_EVNTSEL_EDGE_MASK | \
42 P6_EVNTSEL_INV_MASK | \
43 P6_EVNTSEL_REG_MASK)
44
45 return hw_event & P6_EVNTSEL_MASK;
46}
47
48static struct event_constraint p6_event_constraints[] =
49{
50 INTEL_EVENT_CONSTRAINT(0xc1, 0x1), /* FLOPS */
51 INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */
52 INTEL_EVENT_CONSTRAINT(0x11, 0x1), /* FP_ASSIST */
53 INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */
54 INTEL_EVENT_CONSTRAINT(0x13, 0x2), /* DIV */
55 INTEL_EVENT_CONSTRAINT(0x14, 0x1), /* CYCLES_DIV_BUSY */
56 EVENT_CONSTRAINT_END
57};
58
59static void p6_pmu_disable_all(void)
60{
61 u64 val;
62
63 /* p6 only has one enable register */
64 rdmsrl(MSR_P6_EVNTSEL0, val);
65 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
66 wrmsrl(MSR_P6_EVNTSEL0, val);
67}
68
69static void p6_pmu_enable_all(void)
70{
71 unsigned long val;
72
73 /* p6 only has one enable register */
74 rdmsrl(MSR_P6_EVNTSEL0, val);
75 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
76 wrmsrl(MSR_P6_EVNTSEL0, val);
77}
78
79static inline void
80p6_pmu_disable_event(struct perf_event *event)
81{
82 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
83 struct hw_perf_event *hwc = &event->hw;
84 u64 val = P6_NOP_EVENT;
85
86 if (cpuc->enabled)
87 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
88
89 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
90}
91
92static void p6_pmu_enable_event(struct perf_event *event)
93{
94 struct cpu_hw_events *cpuc = &__get_cpu_var(cpu_hw_events);
95 struct hw_perf_event *hwc = &event->hw;
96 u64 val;
97
98 val = hwc->config;
99 if (cpuc->enabled)
100 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
101
102 (void)checking_wrmsrl(hwc->config_base + hwc->idx, val);
103}
104
105static __initconst struct x86_pmu p6_pmu = {
106 .name = "p6",
107 .handle_irq = x86_pmu_handle_irq,
108 .disable_all = p6_pmu_disable_all,
109 .enable_all = p6_pmu_enable_all,
110 .enable = p6_pmu_enable_event,
111 .disable = p6_pmu_disable_event,
112 .eventsel = MSR_P6_EVNTSEL0,
113 .perfctr = MSR_P6_PERFCTR0,
114 .event_map = p6_pmu_event_map,
115 .raw_event = p6_pmu_raw_event,
116 .max_events = ARRAY_SIZE(p6_perfmon_event_map),
117 .apic = 1,
118 .max_period = (1ULL << 31) - 1,
119 .version = 0,
120 .num_events = 2,
121 /*
122 * Events have 40 bits implemented. However they are designed such
123 * that bits [32-39] are sign extensions of bit 31. As such the
 124 * effective width of an event for a P6-like PMU is 32 bits only.
125 *
126 * See IA-32 Intel Architecture Software developer manual Vol 3B
127 */
128 .event_bits = 32,
129 .event_mask = (1ULL << 32) - 1,
130 .get_event_constraints = x86_get_event_constraints,
131 .event_constraints = p6_event_constraints,
132};
133
134static __init int p6_pmu_init(void)
135{
136 switch (boot_cpu_data.x86_model) {
137 case 1:
138 case 3: /* Pentium Pro */
139 case 5:
140 case 6: /* Pentium II */
141 case 7:
142 case 8:
143 case 11: /* Pentium III */
144 case 9:
145 case 13:
146 /* Pentium M */
147 break;
148 default:
149 pr_cont("unsupported p6 CPU model %d ",
150 boot_cpu_data.x86_model);
151 return -ENODEV;
152 }
153
154 x86_pmu = p6_pmu;
155
156 return 0;
157}
158
159#endif /* CONFIG_CPU_SUP_INTEL */
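
The event_bits = 32 setting above follows from the comment in the p6_pmu definition: the counters implement 40 bits, but a write is sign-extended from bit 31 into bits 32-39, so only 32 bits can effectively be programmed and max_period has to stay below 2^31. The snippet below models that write behaviour under the sign-extension rule quoted from the manual; it is plain C for illustration, not kernel code, and the function name is made up.

/*
 * Model of a write to a P6-family performance counter: the low 32 bits
 * are taken as-is and bit 31 is replicated into bits 32-39, so a counter
 * programmed with -(period) overflows after exactly 'period' increments.
 */
#include <stdint.h>
#include <stdio.h>

static uint64_t p6_counter_write(uint32_t value)
{
	uint64_t ctr = value;

	if (value & 0x80000000u)		/* bit 31 set ...                 */
		ctr |= 0xffULL << 32;		/* ... is copied into bits 32-39  */

	return ctr & ((1ULL << 40) - 1);	/* the counter implements 40 bits */
}

int main(void)
{
	uint32_t period = 100000;
	uint64_t ctr = p6_counter_write((uint32_t)-period);
	uint64_t to_overflow = (1ULL << 40) - ctr;

	printf("programmed %#llx, overflow after %llu increments\n",
	       (unsigned long long)ctr, (unsigned long long)to_overflow);
	return 0;
}
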
diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c
index fab786f60ed6..fb329e9f8494 100644
--- a/arch/x86/kernel/cpu/perfctr-watchdog.c
+++ b/arch/x86/kernel/cpu/perfctr-watchdog.c
@@ -115,17 +115,6 @@ int avail_to_resrv_perfctr_nmi_bit(unsigned int counter)
115 115
116 return !test_bit(counter, perfctr_nmi_owner); 116 return !test_bit(counter, perfctr_nmi_owner);
117} 117}
118
 119/* checks an msr for availability */
120int avail_to_resrv_perfctr_nmi(unsigned int msr)
121{
122 unsigned int counter;
123
124 counter = nmi_perfctr_msr_to_bit(msr);
125 BUG_ON(counter > NMI_MAX_COUNTER_BITS);
126
127 return !test_bit(counter, perfctr_nmi_owner);
128}
129EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit); 118EXPORT_SYMBOL(avail_to_resrv_perfctr_nmi_bit);
130 119
131int reserve_perfctr_nmi(unsigned int msr) 120int reserve_perfctr_nmi(unsigned int msr)
@@ -691,7 +680,7 @@ static int setup_intel_arch_watchdog(unsigned nmi_hz)
691 cpu_nmi_set_wd_enabled(); 680 cpu_nmi_set_wd_enabled();
692 681
693 apic_write(APIC_LVTPC, APIC_DM_NMI); 682 apic_write(APIC_LVTPC, APIC_DM_NMI);
694 evntsel |= ARCH_PERFMON_EVENTSEL0_ENABLE; 683 evntsel |= ARCH_PERFMON_EVENTSEL_ENABLE;
695 wrmsr(evntsel_msr, evntsel, 0); 684 wrmsr(evntsel_msr, evntsel, 0);
696 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1); 685 intel_arch_wd_ops.checkbit = 1ULL << (eax.split.bit_width - 1);
697 return 1; 686 return 1;
@@ -712,7 +701,7 @@ static void probe_nmi_watchdog(void)
712 switch (boot_cpu_data.x86_vendor) { 701 switch (boot_cpu_data.x86_vendor) {
713 case X86_VENDOR_AMD: 702 case X86_VENDOR_AMD:
714 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 && 703 if (boot_cpu_data.x86 != 6 && boot_cpu_data.x86 != 15 &&
715 boot_cpu_data.x86 != 16) 704 boot_cpu_data.x86 != 16 && boot_cpu_data.x86 != 17)
716 return; 705 return;
717 wd_ops = &k7_wd_ops; 706 wd_ops = &k7_wd_ops;
718 break; 707 break;
diff --git a/arch/x86/kernel/cpu/transmeta.c b/arch/x86/kernel/cpu/transmeta.c
index bb62b3e5caad..28000743bbb0 100644
--- a/arch/x86/kernel/cpu/transmeta.c
+++ b/arch/x86/kernel/cpu/transmeta.c
@@ -26,7 +26,7 @@ static void __cpuinit init_transmeta(struct cpuinfo_x86 *c)
26 26
27 early_init_transmeta(c); 27 early_init_transmeta(c);
28 28
29 display_cacheinfo(c); 29 cpu_detect_cache_sizes(c);
30 30
31 /* Print CMS and CPU revision */ 31 /* Print CMS and CPU revision */
32 max = cpuid_eax(0x80860000); 32 max = cpuid_eax(0x80860000);
diff --git a/arch/x86/kernel/cpu/vmware.c b/arch/x86/kernel/cpu/vmware.c
index 1cbed97b59cf..dfdb4dba2320 100644
--- a/arch/x86/kernel/cpu/vmware.c
+++ b/arch/x86/kernel/cpu/vmware.c
@@ -22,6 +22,7 @@
22 */ 22 */
23 23
24#include <linux/dmi.h> 24#include <linux/dmi.h>
25#include <linux/module.h>
25#include <asm/div64.h> 26#include <asm/div64.h>
26#include <asm/vmware.h> 27#include <asm/vmware.h>
27#include <asm/x86_init.h> 28#include <asm/x86_init.h>
@@ -101,6 +102,7 @@ int vmware_platform(void)
101 102
102 return 0; 103 return 0;
103} 104}
105EXPORT_SYMBOL(vmware_platform);
104 106
105/* 107/*
106 * VMware hypervisor takes care of exporting a reliable TSC to the guest. 108 * VMware hypervisor takes care of exporting a reliable TSC to the guest.
diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c
index 6a52d4b36a30..8b862d5900fe 100644
--- a/arch/x86/kernel/cpuid.c
+++ b/arch/x86/kernel/cpuid.c
@@ -40,6 +40,7 @@
40#include <linux/cpu.h> 40#include <linux/cpu.h>
41#include <linux/notifier.h> 41#include <linux/notifier.h>
42#include <linux/uaccess.h> 42#include <linux/uaccess.h>
43#include <linux/gfp.h>
43 44
44#include <asm/processor.h> 45#include <asm/processor.h>
45#include <asm/msr.h> 46#include <asm/msr.h>
@@ -116,21 +117,16 @@ static int cpuid_open(struct inode *inode, struct file *file)
116{ 117{
117 unsigned int cpu; 118 unsigned int cpu;
118 struct cpuinfo_x86 *c; 119 struct cpuinfo_x86 *c;
119 int ret = 0;
120
121 lock_kernel();
122 120
123 cpu = iminor(file->f_path.dentry->d_inode); 121 cpu = iminor(file->f_path.dentry->d_inode);
124 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) { 122 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
125 ret = -ENXIO; /* No such CPU */ 123 return -ENXIO; /* No such CPU */
126 goto out; 124
127 }
128 c = &cpu_data(cpu); 125 c = &cpu_data(cpu);
129 if (c->cpuid_level < 0) 126 if (c->cpuid_level < 0)
130 ret = -EIO; /* CPUID not supported */ 127 return -EIO; /* CPUID not supported */
131out: 128
132 unlock_kernel(); 129 return 0;
133 return ret;
134} 130}
135 131
136/* 132/*
@@ -192,7 +188,8 @@ static int __init cpuid_init(void)
192 int i, err = 0; 188 int i, err = 0;
193 i = 0; 189 i = 0;
194 190
195 if (register_chrdev(CPUID_MAJOR, "cpu/cpuid", &cpuid_fops)) { 191 if (__register_chrdev(CPUID_MAJOR, 0, NR_CPUS,
192 "cpu/cpuid", &cpuid_fops)) {
196 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n", 193 printk(KERN_ERR "cpuid: unable to get major %d for cpuid\n",
197 CPUID_MAJOR); 194 CPUID_MAJOR);
198 err = -EBUSY; 195 err = -EBUSY;
@@ -221,7 +218,7 @@ out_class:
221 } 218 }
222 class_destroy(cpuid_class); 219 class_destroy(cpuid_class);
223out_chrdev: 220out_chrdev:
224 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 221 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
225out: 222out:
226 return err; 223 return err;
227} 224}
@@ -233,7 +230,7 @@ static void __exit cpuid_exit(void)
233 for_each_online_cpu(cpu) 230 for_each_online_cpu(cpu)
234 cpuid_device_destroy(cpu); 231 cpuid_device_destroy(cpu);
235 class_destroy(cpuid_class); 232 class_destroy(cpuid_class);
236 unregister_chrdev(CPUID_MAJOR, "cpu/cpuid"); 233 __unregister_chrdev(CPUID_MAJOR, 0, NR_CPUS, "cpu/cpuid");
237 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier); 234 unregister_hotcpu_notifier(&cpuid_class_cpu_notifier);
238} 235}
239 236
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 5e409dc298a4..ebd4c51d096a 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -27,8 +27,6 @@
27#include <asm/cpu.h> 27#include <asm/cpu.h>
28#include <asm/reboot.h> 28#include <asm/reboot.h>
29#include <asm/virtext.h> 29#include <asm/virtext.h>
30#include <asm/iommu.h>
31
32 30
33#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC) 31#if defined(CONFIG_SMP) && defined(CONFIG_X86_LOCAL_APIC)
34 32
@@ -104,10 +102,5 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
104#ifdef CONFIG_HPET_TIMER 102#ifdef CONFIG_HPET_TIMER
105 hpet_disable(); 103 hpet_disable();
106#endif 104#endif
107
108#ifdef CONFIG_X86_64
109 pci_iommu_shutdown();
110#endif
111
112 crash_save_cpu(regs, safe_smp_processor_id()); 105 crash_save_cpu(regs, safe_smp_processor_id());
113} 106}
diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c
index cd97ce18c29d..67414550c3cc 100644
--- a/arch/x86/kernel/crash_dump_32.c
+++ b/arch/x86/kernel/crash_dump_32.c
@@ -5,6 +5,7 @@
5 * Copyright (C) IBM Corporation, 2004. All rights reserved 5 * Copyright (C) IBM Corporation, 2004. All rights reserved
6 */ 6 */
7 7
8#include <linux/slab.h>
8#include <linux/errno.h> 9#include <linux/errno.h>
9#include <linux/highmem.h> 10#include <linux/highmem.h>
10#include <linux/crash_dump.h> 11#include <linux/crash_dump.h>
diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c
index ef42a038f1a6..1c47390dd0e5 100644
--- a/arch/x86/kernel/ds.c
+++ b/arch/x86/kernel/ds.c
@@ -265,13 +265,13 @@ struct ds_context {
265 int cpu; 265 int cpu;
266}; 266};
267 267
268static DEFINE_PER_CPU(struct ds_context *, cpu_context); 268static DEFINE_PER_CPU(struct ds_context *, cpu_ds_context);
269 269
270 270
271static struct ds_context *ds_get_context(struct task_struct *task, int cpu) 271static struct ds_context *ds_get_context(struct task_struct *task, int cpu)
272{ 272{
273 struct ds_context **p_context = 273 struct ds_context **p_context =
274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); 274 (task ? &task->thread.ds_ctx : &per_cpu(cpu_ds_context, cpu));
275 struct ds_context *context = NULL; 275 struct ds_context *context = NULL;
276 struct ds_context *new_context = NULL; 276 struct ds_context *new_context = NULL;
277 277
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 2d8a371d4339..6d817554780a 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -109,6 +109,32 @@ print_context_stack(struct thread_info *tinfo,
109 } 109 }
110 return bp; 110 return bp;
111} 111}
112EXPORT_SYMBOL_GPL(print_context_stack);
113
114unsigned long
115print_context_stack_bp(struct thread_info *tinfo,
116 unsigned long *stack, unsigned long bp,
117 const struct stacktrace_ops *ops, void *data,
118 unsigned long *end, int *graph)
119{
120 struct stack_frame *frame = (struct stack_frame *)bp;
121 unsigned long *ret_addr = &frame->return_address;
122
123 while (valid_stack_ptr(tinfo, ret_addr, sizeof(*ret_addr), end)) {
124 unsigned long addr = *ret_addr;
125
126 if (!__kernel_text_address(addr))
127 break;
128
129 ops->address(data, addr, 1);
130 frame = frame->next_frame;
131 ret_addr = &frame->return_address;
132 print_ftrace_graph_addr(addr, data, ops, tinfo, graph);
133 }
134
135 return (unsigned long)frame;
136}
137EXPORT_SYMBOL_GPL(print_context_stack_bp);
112 138
113 139
114static void 140static void
@@ -141,10 +167,11 @@ static void print_trace_address(void *data, unsigned long addr, int reliable)
141} 167}
142 168
143static const struct stacktrace_ops print_trace_ops = { 169static const struct stacktrace_ops print_trace_ops = {
144 .warning = print_trace_warning, 170 .warning = print_trace_warning,
145 .warning_symbol = print_trace_warning_symbol, 171 .warning_symbol = print_trace_warning_symbol,
146 .stack = print_trace_stack, 172 .stack = print_trace_stack,
147 .address = print_trace_address, 173 .address = print_trace_address,
174 .walk_stack = print_context_stack,
148}; 175};
149 176
150void 177void
@@ -188,7 +215,7 @@ void dump_stack(void)
188} 215}
189EXPORT_SYMBOL(dump_stack); 216EXPORT_SYMBOL(dump_stack);
190 217
191static raw_spinlock_t die_lock = __RAW_SPIN_LOCK_UNLOCKED; 218static arch_spinlock_t die_lock = __ARCH_SPIN_LOCK_UNLOCKED;
192static int die_owner = -1; 219static int die_owner = -1;
193static unsigned int die_nest_count; 220static unsigned int die_nest_count;
194 221
@@ -207,11 +234,11 @@ unsigned __kprobes long oops_begin(void)
207 /* racy, but better than risking deadlock. */ 234 /* racy, but better than risking deadlock. */
208 raw_local_irq_save(flags); 235 raw_local_irq_save(flags);
209 cpu = smp_processor_id(); 236 cpu = smp_processor_id();
210 if (!__raw_spin_trylock(&die_lock)) { 237 if (!arch_spin_trylock(&die_lock)) {
211 if (cpu == die_owner) 238 if (cpu == die_owner)
212 /* nested oops. should stop eventually */; 239 /* nested oops. should stop eventually */;
213 else 240 else
214 __raw_spin_lock(&die_lock); 241 arch_spin_lock(&die_lock);
215 } 242 }
216 die_nest_count++; 243 die_nest_count++;
217 die_owner = cpu; 244 die_owner = cpu;
@@ -231,7 +258,7 @@ void __kprobes oops_end(unsigned long flags, struct pt_regs *regs, int signr)
231 die_nest_count--; 258 die_nest_count--;
232 if (!die_nest_count) 259 if (!die_nest_count)
233 /* Nest count reaches zero, release the lock. */ 260 /* Nest count reaches zero, release the lock. */
234 __raw_spin_unlock(&die_lock); 261 arch_spin_unlock(&die_lock);
235 raw_local_irq_restore(flags); 262 raw_local_irq_restore(flags);
236 oops_exit(); 263 oops_exit();
237 264
@@ -268,11 +295,12 @@ int __kprobes __die(const char *str, struct pt_regs *regs, long err)
268 295
269 show_registers(regs); 296 show_registers(regs);
270#ifdef CONFIG_X86_32 297#ifdef CONFIG_X86_32
271 sp = (unsigned long) (&regs->sp); 298 if (user_mode_vm(regs)) {
272 savesegment(ss, ss);
273 if (user_mode(regs)) {
274 sp = regs->sp; 299 sp = regs->sp;
275 ss = regs->ss & 0xffff; 300 ss = regs->ss & 0xffff;
301 } else {
302 sp = kernel_stack_pointer(regs);
303 savesegment(ss, ss);
276 } 304 }
277 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip); 305 printk(KERN_EMERG "EIP: [<%08lx>] ", regs->ip);
278 print_symbol("%s", regs->ip); 306 print_symbol("%s", regs->ip);
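
print_context_stack_bp() above walks the stack purely by frame pointer: it follows bp through struct stack_frame records and reports each saved return address that still points into kernel text. A stand-alone sketch of that walk over hand-built frames; a NULL/zero check stands in for the kernel's valid_stack_ptr() and __kernel_text_address() tests:

#include <stdio.h>

struct stack_frame {
        struct stack_frame *next_frame;
        unsigned long return_address;
};

/* Follow bp through the frame chain and report each saved return
 * address; the chain ends at the first empty frame in this demo. */
static void walk_frames(struct stack_frame *frame)
{
        while (frame && frame->return_address) {
                printf("  [<%#lx>]\n", frame->return_address);
                frame = frame->next_frame;
        }
}

int main(void)
{
        struct stack_frame f2 = { NULL, 0 };
        struct stack_frame f1 = { &f2, 0xc0102345UL };
        struct stack_frame f0 = { &f1, 0xc0101234UL };

        walk_frames(&f0);
        return 0;
}
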
diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h
index 81086c227ab7..e1a93be4fd44 100644
--- a/arch/x86/kernel/dumpstack.h
+++ b/arch/x86/kernel/dumpstack.h
@@ -14,11 +14,7 @@
14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :) 14#define get_bp(bp) asm("movq %%rbp, %0" : "=r" (bp) :)
15#endif 15#endif
16 16
17extern unsigned long 17#include <linux/uaccess.h>
18print_context_stack(struct thread_info *tinfo,
19 unsigned long *stack, unsigned long bp,
20 const struct stacktrace_ops *ops, void *data,
21 unsigned long *end, int *graph);
22 18
23extern void 19extern void
24show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, 20show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs,
@@ -35,4 +31,26 @@ struct stack_frame {
35 struct stack_frame *next_frame; 31 struct stack_frame *next_frame;
36 unsigned long return_address; 32 unsigned long return_address;
37}; 33};
34
35struct stack_frame_ia32 {
36 u32 next_frame;
37 u32 return_address;
38};
39
40static inline unsigned long rewind_frame_pointer(int n)
41{
42 struct stack_frame *frame;
43
44 get_bp(frame);
45
46#ifdef CONFIG_FRAME_POINTER
47 while (n--) {
48 if (probe_kernel_address(&frame->next_frame, frame))
49 break;
50 }
38#endif 51#endif
52
53 return (unsigned long)frame;
54}
55
56#endif /* DUMPSTACK_H */
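
rewind_frame_pointer() above hops n saved frame pointers up the live call chain, and stack_frame_ia32 mirrors the 4-byte frame layout of a compat task. A user-space sketch of the same hop over faked frames; the kernel guards each dereference with probe_kernel_address(), which the sketch replaces with a NULL check:

#include <stdio.h>
#include <stdint.h>

struct stack_frame { struct stack_frame *next_frame; unsigned long return_address; };
struct stack_frame_ia32 { uint32_t next_frame; uint32_t return_address; };

/* Hop n saved frame pointers up a (faked) call chain, stopping early
 * if the chain runs out. */
static unsigned long rewind_frames(struct stack_frame *frame, int n)
{
        while (n-- && frame->next_frame)
                frame = frame->next_frame;
        return (unsigned long)frame;
}

int main(void)
{
        struct stack_frame f2 = { NULL, 0x3000 };
        struct stack_frame f1 = { &f2, 0x2000 };
        struct stack_frame f0 = { &f1, 0x1000 };

        printf("rewound to frame with return address %#lx\n",
               ((struct stack_frame *)rewind_frames(&f0, 2))->return_address);
        printf("native frame %zu bytes, ia32 frame %zu bytes\n",
               sizeof(struct stack_frame), sizeof(struct stack_frame_ia32));
        return 0;
}
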
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index f7dd2a7c3bf4..11540a189d93 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -10,19 +10,14 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21/* Just a stub for now */
22int x86_is_stack_id(int id, char *name)
23{
24 return 0;
25}
26 21
27void dump_trace(struct task_struct *task, struct pt_regs *regs, 22void dump_trace(struct task_struct *task, struct pt_regs *regs,
28 unsigned long *stack, unsigned long bp, 23 unsigned long *stack, unsigned long bp,
@@ -35,6 +30,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
35 30
36 if (!stack) { 31 if (!stack) {
37 unsigned long dummy; 32 unsigned long dummy;
33
38 stack = &dummy; 34 stack = &dummy;
39 if (task && task != current) 35 if (task && task != current)
40 stack = (unsigned long *)task->thread.sp; 36 stack = (unsigned long *)task->thread.sp;
@@ -57,8 +53,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
57 53
58 context = (struct thread_info *) 54 context = (struct thread_info *)
59 ((unsigned long)stack & (~(THREAD_SIZE - 1))); 55 ((unsigned long)stack & (~(THREAD_SIZE - 1)));
60 bp = print_context_stack(context, stack, bp, ops, 56 bp = ops->walk_stack(context, stack, bp, ops, data, NULL, &graph);
61 data, NULL, &graph);
62 57
63 stack = (unsigned long *)context->previous_esp; 58 stack = (unsigned long *)context->previous_esp;
64 if (!stack) 59 if (!stack)
@@ -72,7 +67,7 @@ EXPORT_SYMBOL(dump_trace);
72 67
73void 68void
74show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 69show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
75 unsigned long *sp, unsigned long bp, char *log_lvl) 70 unsigned long *sp, unsigned long bp, char *log_lvl)
76{ 71{
77 unsigned long *stack; 72 unsigned long *stack;
78 int i; 73 int i;
@@ -156,4 +151,3 @@ int is_valid_bugaddr(unsigned long ip)
156 151
157 return ud2 == 0x0b0f; 152 return ud2 == 0x0b0f;
158} 153}
159
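
dump_trace() now calls ops->walk_stack() instead of hard-coding print_context_stack(), so the walker is pluggable per caller. A compact sketch of that callback indirection; the types and the "scan every word" walker below are simplified stand-ins:

#include <stdio.h>

struct walk_ops;
typedef unsigned long (*walk_fn)(const struct walk_ops *ops,
                                 unsigned long *stack, unsigned long bp);

struct walk_ops {
        void (*address)(unsigned long addr);
        walk_fn walk_stack;
};

/* "Scan everything" walker: report every non-zero word on the stack. */
static unsigned long walk_all_words(const struct walk_ops *ops,
                                    unsigned long *stack, unsigned long bp)
{
        for (; *stack; stack++)
                ops->address(*stack);
        return bp;
}

static void show_addr(unsigned long addr)
{
        printf("  [<%#lx>]\n", addr);
}

int main(void)
{
        unsigned long fake_stack[] = { 0x1000, 0x2000, 0 };
        struct walk_ops ops = { .address = show_addr,
                                .walk_stack = walk_all_words };

        ops.walk_stack(&ops, fake_stack, 0);    /* as dump_trace() now does */
        return 0;
}
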
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index a071e6be177e..272c9f1f05f3 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -10,34 +10,31 @@
10#include <linux/module.h> 10#include <linux/module.h>
11#include <linux/ptrace.h> 11#include <linux/ptrace.h>
12#include <linux/kexec.h> 12#include <linux/kexec.h>
13#include <linux/sysfs.h>
13#include <linux/bug.h> 14#include <linux/bug.h>
14#include <linux/nmi.h> 15#include <linux/nmi.h>
15#include <linux/sysfs.h>
16 16
17#include <asm/stacktrace.h> 17#include <asm/stacktrace.h>
18 18
19#include "dumpstack.h" 19#include "dumpstack.h"
20 20
21#define N_EXCEPTION_STACKS_END \
22 (N_EXCEPTION_STACKS + DEBUG_STKSZ/EXCEPTION_STKSZ - 2)
21 23
22static char x86_stack_ids[][8] = { 24static char x86_stack_ids[][8] = {
23 [DEBUG_STACK - 1] = "#DB", 25 [ DEBUG_STACK-1 ] = "#DB",
24 [NMI_STACK - 1] = "NMI", 26 [ NMI_STACK-1 ] = "NMI",
25 [DOUBLEFAULT_STACK - 1] = "#DF", 27 [ DOUBLEFAULT_STACK-1 ] = "#DF",
26 [STACKFAULT_STACK - 1] = "#SS", 28 [ STACKFAULT_STACK-1 ] = "#SS",
27 [MCE_STACK - 1] = "#MC", 29 [ MCE_STACK-1 ] = "#MC",
28#if DEBUG_STKSZ > EXCEPTION_STKSZ 30#if DEBUG_STKSZ > EXCEPTION_STKSZ
29 [N_EXCEPTION_STACKS ... 31 [ N_EXCEPTION_STACKS ...
30 N_EXCEPTION_STACKS + DEBUG_STKSZ / EXCEPTION_STKSZ - 2] = "#DB[?]" 32 N_EXCEPTION_STACKS_END ] = "#DB[?]"
31#endif 33#endif
32 }; 34};
33
34int x86_is_stack_id(int id, char *name)
35{
36 return x86_stack_ids[id - 1] == name;
37}
38 35
39static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, 36static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
40 unsigned *usedp, char **idp) 37 unsigned *usedp, char **idp)
41{ 38{
42 unsigned k; 39 unsigned k;
43 40
@@ -101,6 +98,41 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack,
101 return NULL; 98 return NULL;
102} 99}
103 100
101static inline int
102in_irq_stack(unsigned long *stack, unsigned long *irq_stack,
103 unsigned long *irq_stack_end)
104{
105 return (stack >= irq_stack && stack < irq_stack_end);
106}
107
108/*
109 * We are returning from the irq stack and go to the previous one.
110 * If the previous stack is also in the irq stack, then bp in the first
111 * frame of the irq stack points to the previous, interrupted one.
112 * Otherwise we have another level of indirection: We first save
113 * the bp of the previous stack, then we switch the stack to the irq one
114 * and save a new bp that links to the previous one.
115 * (See save_args())
116 */
117static inline unsigned long
118fixup_bp_irq_link(unsigned long bp, unsigned long *stack,
119 unsigned long *irq_stack, unsigned long *irq_stack_end)
120{
121#ifdef CONFIG_FRAME_POINTER
122 struct stack_frame *frame = (struct stack_frame *)bp;
123 unsigned long next;
124
125 if (!in_irq_stack(stack, irq_stack, irq_stack_end)) {
126 if (!probe_kernel_address(&frame->next_frame, next))
127 return next;
128 else
129 WARN_ONCE(1, "Perf: bad frame pointer = %p in "
130 "callchain\n", &frame->next_frame);
131 }
132#endif
133 return bp;
134}
135
104/* 136/*
105 * x86-64 can have up to three kernel stacks: 137 * x86-64 can have up to three kernel stacks:
106 * process stack 138 * process stack
@@ -157,8 +189,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
157 if (ops->stack(data, id) < 0) 189 if (ops->stack(data, id) < 0)
158 break; 190 break;
159 191
160 bp = print_context_stack(tinfo, stack, bp, ops, 192 bp = ops->walk_stack(tinfo, stack, bp, ops,
161 data, estack_end, &graph); 193 data, estack_end, &graph);
162 ops->stack(data, "<EOE>"); 194 ops->stack(data, "<EOE>");
163 /* 195 /*
164 * We link to the next stack via the 196 * We link to the next stack via the
@@ -173,10 +205,10 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
173 irq_stack = irq_stack_end - 205 irq_stack = irq_stack_end -
174 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); 206 (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack);
175 207
176 if (stack >= irq_stack && stack < irq_stack_end) { 208 if (in_irq_stack(stack, irq_stack, irq_stack_end)) {
177 if (ops->stack(data, "IRQ") < 0) 209 if (ops->stack(data, "IRQ") < 0)
178 break; 210 break;
179 bp = print_context_stack(tinfo, stack, bp, 211 bp = ops->walk_stack(tinfo, stack, bp,
180 ops, data, irq_stack_end, &graph); 212 ops, data, irq_stack_end, &graph);
181 /* 213 /*
182 * We link to the next stack (which would be 214 * We link to the next stack (which would be
@@ -184,6 +216,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
184 * pointer (index -1 to end) in the IRQ stack: 216 * pointer (index -1 to end) in the IRQ stack:
185 */ 217 */
186 stack = (unsigned long *) (irq_stack_end[-1]); 218 stack = (unsigned long *) (irq_stack_end[-1]);
219 bp = fixup_bp_irq_link(bp, stack, irq_stack,
220 irq_stack_end);
187 irq_stack_end = NULL; 221 irq_stack_end = NULL;
188 ops->stack(data, "EOI"); 222 ops->stack(data, "EOI");
189 continue; 223 continue;
@@ -195,28 +229,31 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs,
195 /* 229 /*
196 * This handles the process stack: 230 * This handles the process stack:
197 */ 231 */
198 bp = print_context_stack(tinfo, stack, bp, ops, data, NULL, &graph); 232 bp = ops->walk_stack(tinfo, stack, bp, ops, data, NULL, &graph);
199 put_cpu(); 233 put_cpu();
200} 234}
201EXPORT_SYMBOL(dump_trace); 235EXPORT_SYMBOL(dump_trace);
202 236
203void 237void
204show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, 238show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
205 unsigned long *sp, unsigned long bp, char *log_lvl) 239 unsigned long *sp, unsigned long bp, char *log_lvl)
206{ 240{
241 unsigned long *irq_stack_end;
242 unsigned long *irq_stack;
207 unsigned long *stack; 243 unsigned long *stack;
244 int cpu;
208 int i; 245 int i;
209 const int cpu = smp_processor_id(); 246
210 unsigned long *irq_stack_end = 247 preempt_disable();
211 (unsigned long *)(per_cpu(irq_stack_ptr, cpu)); 248 cpu = smp_processor_id();
212 unsigned long *irq_stack = 249
213 (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE); 250 irq_stack_end = (unsigned long *)(per_cpu(irq_stack_ptr, cpu));
251 irq_stack = (unsigned long *)(per_cpu(irq_stack_ptr, cpu) - IRQ_STACK_SIZE);
214 252
215 /* 253 /*
216 * debugging aid: "show_stack(NULL, NULL);" prints the 254 * Debugging aid: "show_stack(NULL, NULL);" prints the
217 * back trace for this cpu. 255 * back trace for this cpu:
218 */ 256 */
219
220 if (sp == NULL) { 257 if (sp == NULL) {
221 if (task) 258 if (task)
222 sp = (unsigned long *)task->thread.sp; 259 sp = (unsigned long *)task->thread.sp;
@@ -240,6 +277,8 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
240 printk(" %016lx", *stack++); 277 printk(" %016lx", *stack++);
241 touch_nmi_watchdog(); 278 touch_nmi_watchdog();
242 } 279 }
280 preempt_enable();
281
243 printk("\n"); 282 printk("\n");
244 show_trace_log_lvl(task, regs, sp, bp, log_lvl); 283 show_trace_log_lvl(task, regs, sp, bp, log_lvl);
245} 284}
@@ -253,6 +292,7 @@ void show_registers(struct pt_regs *regs)
253 292
254 sp = regs->sp; 293 sp = regs->sp;
255 printk("CPU %d ", cpu); 294 printk("CPU %d ", cpu);
295 print_modules();
256 __show_regs(regs, 1); 296 __show_regs(regs, 1);
257 printk("Process %s (pid: %d, threadinfo %p, task %p)\n", 297 printk("Process %s (pid: %d, threadinfo %p, task %p)\n",
258 cur->comm, cur->pid, task_thread_info(cur), cur); 298 cur->comm, cur->pid, task_thread_info(cur), cur);
@@ -303,4 +343,3 @@ int is_valid_bugaddr(unsigned long ip)
303 343
304 return ud2 == 0x0b0f; 344 return ud2 == 0x0b0f;
305} 345}
306
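
in_irq_stack() above is a plain half-open range test, and the per-CPU irq_stack_ptr reads are now bracketed by preempt_disable()/preempt_enable() so the CPU cannot change underneath them. A sketch of just the range test; the addresses and array size are made up:

#include <stdio.h>

/* Half-open range test: a pointer is "on the IRQ stack" iff
 * irq_stack <= p < irq_stack_end. */
static int in_stack_range(unsigned long *p, unsigned long *lo, unsigned long *hi)
{
        return p >= lo && p < hi;
}

int main(void)
{
        static unsigned long irq_stack[512];    /* stand-in for the per-CPU IRQ stack */
        unsigned long *end = irq_stack + 512;

        printf("%d %d\n",
               in_stack_range(&irq_stack[10], irq_stack, end),  /* 1 */
               in_stack_range(end, irq_stack, end));            /* 0: end is exclusive */
        return 0;
}
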
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index d17d482a04f4..7bca3c6a02fb 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -12,21 +12,13 @@
12#include <linux/types.h> 12#include <linux/types.h>
13#include <linux/init.h> 13#include <linux/init.h>
14#include <linux/bootmem.h> 14#include <linux/bootmem.h>
15#include <linux/ioport.h>
16#include <linux/string.h>
17#include <linux/kexec.h>
18#include <linux/module.h>
19#include <linux/mm.h>
20#include <linux/pfn.h> 15#include <linux/pfn.h>
21#include <linux/suspend.h> 16#include <linux/suspend.h>
22#include <linux/firmware-map.h> 17#include <linux/firmware-map.h>
23 18
24#include <asm/pgtable.h>
25#include <asm/page.h>
26#include <asm/e820.h> 19#include <asm/e820.h>
27#include <asm/proto.h> 20#include <asm/proto.h>
28#include <asm/setup.h> 21#include <asm/setup.h>
29#include <asm/trampoline.h>
30 22
31/* 23/*
32 * The e820 map is the map that gets modified e.g. with command line parameters 24 * The e820 map is the map that gets modified e.g. with command line parameters
@@ -517,31 +509,55 @@ u64 __init e820_remove_range(u64 start, u64 size, unsigned old_type,
517 int checktype) 509 int checktype)
518{ 510{
519 int i; 511 int i;
512 u64 end;
520 u64 real_removed_size = 0; 513 u64 real_removed_size = 0;
521 514
522 if (size > (ULLONG_MAX - start)) 515 if (size > (ULLONG_MAX - start))
523 size = ULLONG_MAX - start; 516 size = ULLONG_MAX - start;
524 517
518 end = start + size;
519 printk(KERN_DEBUG "e820 remove range: %016Lx - %016Lx ",
520 (unsigned long long) start,
521 (unsigned long long) end);
522 if (checktype)
523 e820_print_type(old_type);
524 printk(KERN_CONT "\n");
525
525 for (i = 0; i < e820.nr_map; i++) { 526 for (i = 0; i < e820.nr_map; i++) {
526 struct e820entry *ei = &e820.map[i]; 527 struct e820entry *ei = &e820.map[i];
527 u64 final_start, final_end; 528 u64 final_start, final_end;
529 u64 ei_end;
528 530
529 if (checktype && ei->type != old_type) 531 if (checktype && ei->type != old_type)
530 continue; 532 continue;
533
534 ei_end = ei->addr + ei->size;
531 /* totally covered? */ 535 /* totally covered? */
532 if (ei->addr >= start && 536 if (ei->addr >= start && ei_end <= end) {
533 (ei->addr + ei->size) <= (start + size)) {
534 real_removed_size += ei->size; 537 real_removed_size += ei->size;
535 memset(ei, 0, sizeof(struct e820entry)); 538 memset(ei, 0, sizeof(struct e820entry));
536 continue; 539 continue;
537 } 540 }
541
542 /* new range is totally covered? */
543 if (ei->addr < start && ei_end > end) {
544 e820_add_region(end, ei_end - end, ei->type);
545 ei->size = start - ei->addr;
546 real_removed_size += size;
547 continue;
548 }
549
538 /* partially covered */ 550 /* partially covered */
539 final_start = max(start, ei->addr); 551 final_start = max(start, ei->addr);
540 final_end = min(start + size, ei->addr + ei->size); 552 final_end = min(end, ei_end);
541 if (final_start >= final_end) 553 if (final_start >= final_end)
542 continue; 554 continue;
543 real_removed_size += final_end - final_start; 555 real_removed_size += final_end - final_start;
544 556
557 /*
558 * left range could be head or tail, so need to update
559 * size at first.
560 */
545 ei->size -= final_end - final_start; 561 ei->size -= final_end - final_start;
546 if (ei->addr < final_start) 562 if (ei->addr < final_start)
547 continue; 563 continue;
@@ -722,310 +738,44 @@ core_initcall(e820_mark_nvs_memory);
722#endif 738#endif
723 739
724/* 740/*
725 * Early reserved memory areas. 741 * Find a free area with specified alignment in a specific range.
726 */
727#define MAX_EARLY_RES 20
728
729struct early_res {
730 u64 start, end;
731 char name[16];
732 char overlap_ok;
733};
734static struct early_res early_res[MAX_EARLY_RES] __initdata = {
735 { 0, PAGE_SIZE, "BIOS data page" }, /* BIOS data page */
736 {}
737};
738
739static int __init find_overlapped_early(u64 start, u64 end)
740{
741 int i;
742 struct early_res *r;
743
744 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
745 r = &early_res[i];
746 if (end > r->start && start < r->end)
747 break;
748 }
749
750 return i;
751}
752
753/*
754 * Drop the i-th range from the early reservation map,
755 * by copying any higher ranges down one over it, and
756 * clearing what had been the last slot.
757 */
758static void __init drop_range(int i)
759{
760 int j;
761
762 for (j = i + 1; j < MAX_EARLY_RES && early_res[j].end; j++)
763 ;
764
765 memmove(&early_res[i], &early_res[i + 1],
766 (j - 1 - i) * sizeof(struct early_res));
767
768 early_res[j - 1].end = 0;
769}
770
771/*
772 * Split any existing ranges that:
773 * 1) are marked 'overlap_ok', and
774 * 2) overlap with the stated range [start, end)
775 * into whatever portion (if any) of the existing range is entirely
776 * below or entirely above the stated range. Drop the portion
777 * of the existing range that overlaps with the stated range,
778 * which will allow the caller of this routine to then add that
779 * stated range without conflicting with any existing range.
780 */ 742 */
781static void __init drop_overlaps_that_are_ok(u64 start, u64 end) 743u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
782{ 744{
783 int i; 745 int i;
784 struct early_res *r;
785 u64 lower_start, lower_end;
786 u64 upper_start, upper_end;
787 char name[16];
788 746
789 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) { 747 for (i = 0; i < e820.nr_map; i++) {
790 r = &early_res[i]; 748 struct e820entry *ei = &e820.map[i];
749 u64 addr;
750 u64 ei_start, ei_last;
791 751
792 /* Continue past non-overlapping ranges */ 752 if (ei->type != E820_RAM)
793 if (end <= r->start || start >= r->end)
794 continue; 753 continue;
795 754
796 /* 755 ei_last = ei->addr + ei->size;
797 * Leave non-ok overlaps as is; let caller 756 ei_start = ei->addr;
798 * panic "Overlapping early reservations" 757 addr = find_early_area(ei_start, ei_last, start, end,
799 * when it hits this overlap. 758 size, align);
800 */
801 if (!r->overlap_ok)
802 return;
803
804 /*
805 * We have an ok overlap. We will drop it from the early
806 * reservation map, and add back in any non-overlapping
807 * portions (lower or upper) as separate, overlap_ok,
808 * non-overlapping ranges.
809 */
810
811 /* 1. Note any non-overlapping (lower or upper) ranges. */
812 strncpy(name, r->name, sizeof(name) - 1);
813
814 lower_start = lower_end = 0;
815 upper_start = upper_end = 0;
816 if (r->start < start) {
817 lower_start = r->start;
818 lower_end = start;
819 }
820 if (r->end > end) {
821 upper_start = end;
822 upper_end = r->end;
823 }
824
825 /* 2. Drop the original ok overlapping range */
826 drop_range(i);
827
828 i--; /* resume for-loop on copied down entry */
829
830 /* 3. Add back in any non-overlapping ranges. */
831 if (lower_end)
832 reserve_early_overlap_ok(lower_start, lower_end, name);
833 if (upper_end)
834 reserve_early_overlap_ok(upper_start, upper_end, name);
835 }
836}
837
838static void __init __reserve_early(u64 start, u64 end, char *name,
839 int overlap_ok)
840{
841 int i;
842 struct early_res *r;
843
844 i = find_overlapped_early(start, end);
845 if (i >= MAX_EARLY_RES)
846 panic("Too many early reservations");
847 r = &early_res[i];
848 if (r->end)
849 panic("Overlapping early reservations "
850 "%llx-%llx %s to %llx-%llx %s\n",
851 start, end - 1, name?name:"", r->start,
852 r->end - 1, r->name);
853 r->start = start;
854 r->end = end;
855 r->overlap_ok = overlap_ok;
856 if (name)
857 strncpy(r->name, name, sizeof(r->name) - 1);
858}
859
860/*
861 * A few early reservations come here.
862 *
863 * The 'overlap_ok' in the name of this routine does -not- mean it
864 * is ok for these reservations to overlap an earlier reservation.
865 * Rather it means that it is ok for subsequent reservations to
866 * overlap this one.
867 *
868 * Use this entry point to reserve early ranges when you are doing
869 * so out of "Paranoia", reserving perhaps more memory than you need,
870 * just in case, and don't mind a subsequent overlapping reservation
871 * that is known to be needed.
872 *
873 * The drop_overlaps_that_are_ok() call here isn't really needed.
874 * It would be needed if we had two colliding 'overlap_ok'
875 * reservations, so that the second such would not panic on the
876 * overlap with the first. We don't have any such as of this
877 * writing, but might as well tolerate such if it happens in
878 * the future.
879 */
880void __init reserve_early_overlap_ok(u64 start, u64 end, char *name)
881{
882 drop_overlaps_that_are_ok(start, end);
883 __reserve_early(start, end, name, 1);
884}
885
886/*
887 * Most early reservations come here.
888 *
889 * We first have drop_overlaps_that_are_ok() drop any pre-existing
890 * 'overlap_ok' ranges, so that we can then reserve this memory
891 * range without risk of panic'ing on an overlapping overlap_ok
892 * early reservation.
893 */
894void __init reserve_early(u64 start, u64 end, char *name)
895{
896 if (start >= end)
897 return;
898
899 drop_overlaps_that_are_ok(start, end);
900 __reserve_early(start, end, name, 0);
901}
902
903void __init free_early(u64 start, u64 end)
904{
905 struct early_res *r;
906 int i;
907
908 i = find_overlapped_early(start, end);
909 r = &early_res[i];
910 if (i >= MAX_EARLY_RES || r->end != end || r->start != start)
911 panic("free_early on not reserved area: %llx-%llx!",
912 start, end - 1);
913
914 drop_range(i);
915}
916 759
917void __init early_res_to_bootmem(u64 start, u64 end) 760 if (addr != -1ULL)
918{ 761 return addr;
919 int i, count;
920 u64 final_start, final_end;
921
922 count = 0;
923 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++)
924 count++;
925
926 printk(KERN_INFO "(%d early reservations) ==> bootmem [%010llx - %010llx]\n",
927 count, start, end);
928 for (i = 0; i < count; i++) {
929 struct early_res *r = &early_res[i];
930 printk(KERN_INFO " #%d [%010llx - %010llx] %16s", i,
931 r->start, r->end, r->name);
932 final_start = max(start, r->start);
933 final_end = min(end, r->end);
934 if (final_start >= final_end) {
935 printk(KERN_CONT "\n");
936 continue;
937 }
938 printk(KERN_CONT " ==> [%010llx - %010llx]\n",
939 final_start, final_end);
940 reserve_bootmem_generic(final_start, final_end - final_start,
941 BOOTMEM_DEFAULT);
942 } 762 }
763 return -1ULL;
943} 764}
944 765
945/* Check for already reserved areas */ 766u64 __init find_fw_memmap_area(u64 start, u64 end, u64 size, u64 align)
946static inline int __init bad_addr(u64 *addrp, u64 size, u64 align)
947{
948 int i;
949 u64 addr = *addrp;
950 int changed = 0;
951 struct early_res *r;
952again:
953 i = find_overlapped_early(addr, addr + size);
954 r = &early_res[i];
955 if (i < MAX_EARLY_RES && r->end) {
956 *addrp = addr = round_up(r->end, align);
957 changed = 1;
958 goto again;
959 }
960 return changed;
961}
962
963/* Check for already reserved areas */
964static inline int __init bad_addr_size(u64 *addrp, u64 *sizep, u64 align)
965{ 767{
966 int i; 768 return find_e820_area(start, end, size, align);
967 u64 addr = *addrp, last;
968 u64 size = *sizep;
969 int changed = 0;
970again:
971 last = addr + size;
972 for (i = 0; i < MAX_EARLY_RES && early_res[i].end; i++) {
973 struct early_res *r = &early_res[i];
974 if (last > r->start && addr < r->start) {
975 size = r->start - addr;
976 changed = 1;
977 goto again;
978 }
979 if (last > r->end && addr < r->end) {
980 addr = round_up(r->end, align);
981 size = last - addr;
982 changed = 1;
983 goto again;
984 }
985 if (last <= r->end && addr >= r->start) {
986 (*sizep)++;
987 return 0;
988 }
989 }
990 if (changed) {
991 *addrp = addr;
992 *sizep = size;
993 }
994 return changed;
995} 769}
996 770
997/* 771u64 __init get_max_mapped(void)
998 * Find a free area with specified alignment in a specific range.
999 */
1000u64 __init find_e820_area(u64 start, u64 end, u64 size, u64 align)
1001{ 772{
1002 int i; 773 u64 end = max_pfn_mapped;
1003 774
1004 for (i = 0; i < e820.nr_map; i++) { 775 end <<= PAGE_SHIFT;
1005 struct e820entry *ei = &e820.map[i];
1006 u64 addr, last;
1007 u64 ei_last;
1008 776
1009 if (ei->type != E820_RAM) 777 return end;
1010 continue;
1011 addr = round_up(ei->addr, align);
1012 ei_last = ei->addr + ei->size;
1013 if (addr < start)
1014 addr = round_up(start, align);
1015 if (addr >= ei_last)
1016 continue;
1017 while (bad_addr(&addr, size, align) && addr+size <= ei_last)
1018 ;
1019 last = addr + size;
1020 if (last > ei_last)
1021 continue;
1022 if (last > end)
1023 continue;
1024 return addr;
1025 }
1026 return -1ULL;
1027} 778}
1028
1029/* 779/*
1030 * Find next free range after *start 780 * Find next free range after *start
1031 */ 781 */
@@ -1035,25 +785,19 @@ u64 __init find_e820_area_size(u64 start, u64 *sizep, u64 align)
1035 785
1036 for (i = 0; i < e820.nr_map; i++) { 786 for (i = 0; i < e820.nr_map; i++) {
1037 struct e820entry *ei = &e820.map[i]; 787 struct e820entry *ei = &e820.map[i];
1038 u64 addr, last; 788 u64 addr;
1039 u64 ei_last; 789 u64 ei_start, ei_last;
1040 790
1041 if (ei->type != E820_RAM) 791 if (ei->type != E820_RAM)
1042 continue; 792 continue;
1043 addr = round_up(ei->addr, align); 793
1044 ei_last = ei->addr + ei->size; 794 ei_last = ei->addr + ei->size;
1045 if (addr < start) 795 ei_start = ei->addr;
1046 addr = round_up(start, align); 796 addr = find_early_area_size(ei_start, ei_last, start,
1047 if (addr >= ei_last) 797 sizep, align);
1048 continue; 798
1049 *sizep = ei_last - addr; 799 if (addr != -1ULL)
1050 while (bad_addr_size(&addr, sizep, align) && 800 return addr;
1051 addr + *sizep <= ei_last)
1052 ;
1053 last = addr + *sizep;
1054 if (last > ei_last)
1055 continue;
1056 return addr;
1057 } 801 }
1058 802
1059 return -1ULL; 803 return -1ULL;
@@ -1412,6 +1156,8 @@ void __init e820_reserve_resources_late(void)
1412 end = MAX_RESOURCE_SIZE; 1156 end = MAX_RESOURCE_SIZE;
1413 if (start >= end) 1157 if (start >= end)
1414 continue; 1158 continue;
1159 printk(KERN_DEBUG "reserve RAM buffer: %016llx - %016llx ",
1160 start, end);
1415 reserve_region_with_split(&iomem_resource, start, end, 1161 reserve_region_with_split(&iomem_resource, start, end,
1416 "RAM buffer"); 1162 "RAM buffer");
1417 } 1163 }
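
e820_remove_range() now handles a removal that falls strictly inside one entry by splitting it: the tail is re-added as a new region and the original entry is shrunk to the head. A stand-alone sketch of the overlap cases with simplified types and example addresses, not the kernel routine itself:

#include <stdio.h>

struct range { unsigned long long addr, size; };

/*
 * Remove [start, end) from *e: fully covered entries are emptied, an
 * interior removal splits the entry into head + tail (tail returned
 * via *tail, return value 1), and a partial overlap trims one side.
 */
static int remove_range(struct range *e, unsigned long long start,
                        unsigned long long end, struct range *tail)
{
        unsigned long long e_end = e->addr + e->size;

        if (end <= e->addr || start >= e_end)   /* no overlap */
                return 0;
        if (start <= e->addr && end >= e_end) { /* totally covered */
                e->size = 0;
                return 0;
        }
        if (start > e->addr && end < e_end) {   /* split: head + tail */
                tail->addr = end;
                tail->size = e_end - end;
                e->size = start - e->addr;
                return 1;
        }
        if (start <= e->addr) {                 /* trim the front */
                e->addr = end;
                e->size = e_end - end;
        } else {                                /* trim the back */
                e->size = start - e->addr;
        }
        return 0;
}

int main(void)
{
        struct range e = { 0x100000, 0x100000 }, tail;

        if (remove_range(&e, 0x140000, 0x150000, &tail))
                printf("head %#llx+%#llx, tail %#llx+%#llx\n",
                       e.addr, e.size, tail.addr, tail.size);
        return 0;
}
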
diff --git a/arch/x86/kernel/efi.c b/arch/x86/kernel/efi.c
index cdcfb122f256..c2fa9b8b497e 100644
--- a/arch/x86/kernel/efi.c
+++ b/arch/x86/kernel/efi.c
@@ -362,7 +362,7 @@ void __init efi_init(void)
362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n"); 362 printk(KERN_ERR PFX "Could not map the firmware vendor!\n");
363 early_iounmap(tmp, 2); 363 early_iounmap(tmp, 2);
364 364
365 printk(KERN_INFO "EFI v%u.%.02u by %s \n", 365 printk(KERN_INFO "EFI v%u.%.02u by %s\n",
366 efi.systab->hdr.revision >> 16, 366 efi.systab->hdr.revision >> 16,
367 efi.systab->hdr.revision & 0xffff, vendor); 367 efi.systab->hdr.revision & 0xffff, vendor);
368 368
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index c097e7d607c6..44a8e0dc6737 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -334,6 +334,10 @@ ENTRY(ret_from_fork)
334END(ret_from_fork) 334END(ret_from_fork)
335 335
336/* 336/*
337 * Interrupt exit functions should be protected against kprobes
338 */
339 .pushsection .kprobes.text, "ax"
340/*
337 * Return to user mode is not as complex as all this looks, 341 * Return to user mode is not as complex as all this looks,
338 * but we want the default path for a system call return to 342 * but we want the default path for a system call return to
339 * go as quickly as possible which is why some of this is 343 * go as quickly as possible which is why some of this is
@@ -383,6 +387,10 @@ need_resched:
383END(resume_kernel) 387END(resume_kernel)
384#endif 388#endif
385 CFI_ENDPROC 389 CFI_ENDPROC
390/*
391 * End of kprobes section
392 */
393 .popsection
386 394
387/* SYSENTER_RETURN points to after the "sysenter" instruction in 395/* SYSENTER_RETURN points to after the "sysenter" instruction in
388 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */ 396 the vsyscall page. See vsyscall-sysentry.S, which defines the symbol. */
@@ -513,6 +521,10 @@ sysexit_audit:
513 PTGS_TO_GS_EX 521 PTGS_TO_GS_EX
514ENDPROC(ia32_sysenter_target) 522ENDPROC(ia32_sysenter_target)
515 523
524/*
525 * syscall stub including irq exit should be protected against kprobes
526 */
527 .pushsection .kprobes.text, "ax"
516 # system call handler stub 528 # system call handler stub
517ENTRY(system_call) 529ENTRY(system_call)
518 RING0_INT_FRAME # can't unwind into user space anyway 530 RING0_INT_FRAME # can't unwind into user space anyway
@@ -705,26 +717,69 @@ syscall_badsys:
705 jmp resume_userspace 717 jmp resume_userspace
706END(syscall_badsys) 718END(syscall_badsys)
707 CFI_ENDPROC 719 CFI_ENDPROC
720/*
721 * End of kprobes section
722 */
723 .popsection
708 724
709/* 725/*
710 * System calls that need a pt_regs pointer. 726 * System calls that need a pt_regs pointer.
711 */ 727 */
712#define PTREGSCALL(name) \ 728#define PTREGSCALL0(name) \
713 ALIGN; \ 729 ALIGN; \
714ptregs_##name: \ 730ptregs_##name: \
715 leal 4(%esp),%eax; \ 731 leal 4(%esp),%eax; \
716 jmp sys_##name; 732 jmp sys_##name;
717 733
718PTREGSCALL(iopl) 734#define PTREGSCALL1(name) \
719PTREGSCALL(fork) 735 ALIGN; \
720PTREGSCALL(clone) 736ptregs_##name: \
721PTREGSCALL(vfork) 737 leal 4(%esp),%edx; \
722PTREGSCALL(execve) 738 movl (PT_EBX+4)(%esp),%eax; \
723PTREGSCALL(sigaltstack) 739 jmp sys_##name;
724PTREGSCALL(sigreturn) 740
725PTREGSCALL(rt_sigreturn) 741#define PTREGSCALL2(name) \
726PTREGSCALL(vm86) 742 ALIGN; \
727PTREGSCALL(vm86old) 743ptregs_##name: \
744 leal 4(%esp),%ecx; \
745 movl (PT_ECX+4)(%esp),%edx; \
746 movl (PT_EBX+4)(%esp),%eax; \
747 jmp sys_##name;
748
749#define PTREGSCALL3(name) \
750 ALIGN; \
751ptregs_##name: \
752 leal 4(%esp),%eax; \
753 pushl %eax; \
754 movl PT_EDX(%eax),%ecx; \
755 movl PT_ECX(%eax),%edx; \
756 movl PT_EBX(%eax),%eax; \
757 call sys_##name; \
758 addl $4,%esp; \
759 ret
760
761PTREGSCALL1(iopl)
762PTREGSCALL0(fork)
763PTREGSCALL0(vfork)
764PTREGSCALL3(execve)
765PTREGSCALL2(sigaltstack)
766PTREGSCALL0(sigreturn)
767PTREGSCALL0(rt_sigreturn)
768PTREGSCALL2(vm86)
769PTREGSCALL1(vm86old)
770
771/* Clone is an oddball. The 4th arg is in %edi */
772 ALIGN;
773ptregs_clone:
774 leal 4(%esp),%eax
775 pushl %eax
776 pushl PT_EDI(%eax)
777 movl PT_EDX(%eax),%ecx
778 movl PT_ECX(%eax),%edx
779 movl PT_EBX(%eax),%eax
780 call sys_clone
781 addl $8,%esp
782 ret
728 783
729.macro FIXUP_ESPFIX_STACK 784.macro FIXUP_ESPFIX_STACK
730/* 785/*
@@ -814,6 +869,10 @@ common_interrupt:
814ENDPROC(common_interrupt) 869ENDPROC(common_interrupt)
815 CFI_ENDPROC 870 CFI_ENDPROC
816 871
872/*
873 * Irq entries should be protected against kprobes
874 */
875 .pushsection .kprobes.text, "ax"
817#define BUILD_INTERRUPT3(name, nr, fn) \ 876#define BUILD_INTERRUPT3(name, nr, fn) \
818ENTRY(name) \ 877ENTRY(name) \
819 RING0_INT_FRAME; \ 878 RING0_INT_FRAME; \
@@ -980,16 +1039,16 @@ ENTRY(spurious_interrupt_bug)
980 jmp error_code 1039 jmp error_code
981 CFI_ENDPROC 1040 CFI_ENDPROC
982END(spurious_interrupt_bug) 1041END(spurious_interrupt_bug)
1042/*
1043 * End of kprobes section
1044 */
1045 .popsection
983 1046
984ENTRY(kernel_thread_helper) 1047ENTRY(kernel_thread_helper)
985 pushl $0 # fake return address for unwinder 1048 pushl $0 # fake return address for unwinder
986 CFI_STARTPROC 1049 CFI_STARTPROC
987 movl %edx,%eax 1050 movl %edi,%eax
988 push %edx 1051 call *%esi
989 CFI_ADJUST_CFA_OFFSET 4
990 call *%ebx
991 push %eax
992 CFI_ADJUST_CFA_OFFSET 4
993 call do_exit 1052 call do_exit
994 ud2 # padding for call trace 1053 ud2 # padding for call trace
995 CFI_ENDPROC 1054 CFI_ENDPROC
@@ -1185,17 +1244,14 @@ END(ftrace_graph_caller)
1185 1244
1186.globl return_to_handler 1245.globl return_to_handler
1187return_to_handler: 1246return_to_handler:
1188 pushl $0
1189 pushl %eax 1247 pushl %eax
1190 pushl %ecx
1191 pushl %edx 1248 pushl %edx
1192 movl %ebp, %eax 1249 movl %ebp, %eax
1193 call ftrace_return_to_handler 1250 call ftrace_return_to_handler
1194 movl %eax, 0xc(%esp) 1251 movl %eax, %ecx
1195 popl %edx 1252 popl %edx
1196 popl %ecx
1197 popl %eax 1253 popl %eax
1198 ret 1254 jmp *%ecx
1199#endif 1255#endif
1200 1256
1201.section .rodata,"a" 1257.section .rodata,"a"
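
The PTREGSCALL0..3 macros above generate per-arity stubs that pull syscall arguments out of the saved register frame before jumping to the plain sys_* handler, so the handlers no longer need to take a pt_regs themselves (clone stays special because its fourth argument lives in %edi). A rough C model of what a two-argument stub does; the struct, register layout and names here are illustrative only:

#include <stdio.h>

struct fake_regs { unsigned long bx, cx, dx; };  /* ebx=arg1, ecx=arg2, edx=arg3 */

static long sys_demo2(unsigned long arg1, unsigned long arg2)
{
        return (long)(arg1 + arg2);
}

/* PTREGSCALL2-style stub: fetch two arguments from the saved frame and
 * call the ordinary handler. */
static long ptregs_demo2(struct fake_regs *regs)
{
        return sys_demo2(regs->bx, regs->cx);
}

int main(void)
{
        struct fake_regs regs = { .bx = 40, .cx = 2 };

        printf("%ld\n", ptregs_demo2(&regs));   /* 42 */
        return 0;
}
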
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 5e9b0e538a18..b9ec6cd7796f 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -155,11 +155,11 @@ GLOBAL(return_to_handler)
155 155
156 call ftrace_return_to_handler 156 call ftrace_return_to_handler
157 157
158 movq %rax, 16(%rsp) 158 movq %rax, %rdi
159 movq 8(%rsp), %rdx 159 movq 8(%rsp), %rdx
160 movq (%rsp), %rax 160 movq (%rsp), %rax
161 addq $16, %rsp 161 addq $24, %rsp
162 retq 162 jmp *%rdi
163#endif 163#endif
164 164
165 165
@@ -803,6 +803,10 @@ END(interrupt)
803 call \func 803 call \func
804 .endm 804 .endm
805 805
806/*
807 * Interrupt entry/exit should be protected against kprobes
808 */
809 .pushsection .kprobes.text, "ax"
806 /* 810 /*
807 * The interrupt stubs push (~vector+0x80) onto the stack and 811 * The interrupt stubs push (~vector+0x80) onto the stack and
808 * then jump to common_interrupt. 812 * then jump to common_interrupt.
@@ -941,6 +945,10 @@ ENTRY(retint_kernel)
941 945
942 CFI_ENDPROC 946 CFI_ENDPROC
943END(common_interrupt) 947END(common_interrupt)
948/*
949 * End of kprobes section
950 */
951 .popsection
944 952
945/* 953/*
946 * APIC interrupts. 954 * APIC interrupts.
@@ -969,8 +977,8 @@ apicinterrupt UV_BAU_MESSAGE \
969#endif 977#endif
970apicinterrupt LOCAL_TIMER_VECTOR \ 978apicinterrupt LOCAL_TIMER_VECTOR \
971 apic_timer_interrupt smp_apic_timer_interrupt 979 apic_timer_interrupt smp_apic_timer_interrupt
972apicinterrupt GENERIC_INTERRUPT_VECTOR \ 980apicinterrupt X86_PLATFORM_IPI_VECTOR \
973 generic_interrupt smp_generic_interrupt 981 x86_platform_ipi smp_x86_platform_ipi
974 982
975#ifdef CONFIG_SMP 983#ifdef CONFIG_SMP
976apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \ 984apicinterrupt INVALIDATE_TLB_VECTOR_START+0 \
@@ -1070,10 +1078,10 @@ ENTRY(\sym)
1070 TRACE_IRQS_OFF 1078 TRACE_IRQS_OFF
1071 movq %rsp,%rdi /* pt_regs pointer */ 1079 movq %rsp,%rdi /* pt_regs pointer */
1072 xorl %esi,%esi /* no error code */ 1080 xorl %esi,%esi /* no error code */
1073 PER_CPU(init_tss, %rbp) 1081 PER_CPU(init_tss, %r12)
1074 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1082 subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1075 call \do_sym 1083 call \do_sym
1076 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) 1084 addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12)
1077 jmp paranoid_exit /* %ebx: no swapgs flag */ 1085 jmp paranoid_exit /* %ebx: no swapgs flag */
1078 CFI_ENDPROC 1086 CFI_ENDPROC
1079END(\sym) 1087END(\sym)
@@ -1160,63 +1168,20 @@ bad_gs:
1160 jmp 2b 1168 jmp 2b
1161 .previous 1169 .previous
1162 1170
1163/* 1171ENTRY(kernel_thread_helper)
1164 * Create a kernel thread.
1165 *
1166 * C extern interface:
1167 * extern long kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
1168 *
1169 * asm input arguments:
1170 * rdi: fn, rsi: arg, rdx: flags
1171 */
1172ENTRY(kernel_thread)
1173 CFI_STARTPROC
1174 FAKE_STACK_FRAME $child_rip
1175 SAVE_ALL
1176
1177 # rdi: flags, rsi: usp, rdx: will be &pt_regs
1178 movq %rdx,%rdi
1179 orq kernel_thread_flags(%rip),%rdi
1180 movq $-1, %rsi
1181 movq %rsp, %rdx
1182
1183 xorl %r8d,%r8d
1184 xorl %r9d,%r9d
1185
1186 # clone now
1187 call do_fork
1188 movq %rax,RAX(%rsp)
1189 xorl %edi,%edi
1190
1191 /*
1192 * It isn't worth to check for reschedule here,
1193 * so internally to the x86_64 port you can rely on kernel_thread()
1194 * not to reschedule the child before returning, this avoids the need
1195 * of hacks for example to fork off the per-CPU idle tasks.
1196 * [Hopefully no generic code relies on the reschedule -AK]
1197 */
1198 RESTORE_ALL
1199 UNFAKE_STACK_FRAME
1200 ret
1201 CFI_ENDPROC
1202END(kernel_thread)
1203
1204ENTRY(child_rip)
1205 pushq $0 # fake return address 1172 pushq $0 # fake return address
1206 CFI_STARTPROC 1173 CFI_STARTPROC
1207 /* 1174 /*
1208 * Here we are in the child and the registers are set as they were 1175 * Here we are in the child and the registers are set as they were
1209 * at kernel_thread() invocation in the parent. 1176 * at kernel_thread() invocation in the parent.
1210 */ 1177 */
1211 movq %rdi, %rax 1178 call *%rsi
1212 movq %rsi, %rdi
1213 call *%rax
1214 # exit 1179 # exit
1215 mov %eax, %edi 1180 mov %eax, %edi
1216 call do_exit 1181 call do_exit
1217 ud2 # padding for call trace 1182 ud2 # padding for call trace
1218 CFI_ENDPROC 1183 CFI_ENDPROC
1219END(child_rip) 1184END(kernel_thread_helper)
1220 1185
1221/* 1186/*
1222 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly. 1187 * execve(). This function needs to use IRET, not SYSRET, to set up all state properly.
@@ -1493,12 +1458,17 @@ error_kernelspace:
1493 leaq irq_return(%rip),%rcx 1458 leaq irq_return(%rip),%rcx
1494 cmpq %rcx,RIP+8(%rsp) 1459 cmpq %rcx,RIP+8(%rsp)
1495 je error_swapgs 1460 je error_swapgs
1496 movl %ecx,%ecx /* zero extend */ 1461 movl %ecx,%eax /* zero extend */
1497 cmpq %rcx,RIP+8(%rsp) 1462 cmpq %rax,RIP+8(%rsp)
1498 je error_swapgs 1463 je bstep_iret
1499 cmpq $gs_change,RIP+8(%rsp) 1464 cmpq $gs_change,RIP+8(%rsp)
1500 je error_swapgs 1465 je error_swapgs
1501 jmp error_sti 1466 jmp error_sti
1467
1468bstep_iret:
1469 /* Fix truncated RIP */
1470 movq %rcx,RIP+8(%rsp)
1471 jmp error_swapgs
1502END(error_entry) 1472END(error_entry)
1503 1473
1504 1474
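
With kernel_thread() moved to C, only kernel_thread_helper remains in assembly: the child calls the thread function whose pointer and argument were staged in registers, then hands the return value to do_exit(). A user-space model of that contract, with exit() standing in for do_exit() and plain function pointers standing in for the register setup:

#include <stdio.h>
#include <stdlib.h>

static int thread_fn(void *arg)
{
        printf("thread arg: %s\n", (const char *)arg);
        return 0;
}

/* Call the staged function with its argument, then exit with its
 * return value; never returns to the caller. */
static void helper(int (*fn)(void *), void *arg)
{
        exit(fn(arg));
}

int main(void)
{
        helper(thread_fn, "hello");
        /* not reached */
}
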
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 9dbb527e1652..cd37469b54ee 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -9,6 +9,8 @@
9 * the dangers of modifying code on the run. 9 * the dangers of modifying code on the run.
10 */ 10 */
11 11
12#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
13
12#include <linux/spinlock.h> 14#include <linux/spinlock.h>
13#include <linux/hardirq.h> 15#include <linux/hardirq.h>
14#include <linux/uaccess.h> 16#include <linux/uaccess.h>
@@ -28,14 +30,32 @@
28 30
29#ifdef CONFIG_DYNAMIC_FTRACE 31#ifdef CONFIG_DYNAMIC_FTRACE
30 32
33/*
34 * modifying_code is set to notify NMIs that they need to use
35 * memory barriers when entering or exiting. But we don't want
36 * to burden NMIs with unnecessary memory barriers when code
37 * modification is not being done (which is most of the time).
38 *
39 * A mutex is already held when ftrace_arch_code_modify_prepare
40 * and post_process are called. No locks need to be taken here.
41 *
42 * Stop machine will make sure currently running NMIs are done
43 * and new NMIs will see the updated variable before we need
44 * to worry about NMIs doing memory barriers.
45 */
46static int modifying_code __read_mostly;
47static DEFINE_PER_CPU(int, save_modifying_code);
48
31int ftrace_arch_code_modify_prepare(void) 49int ftrace_arch_code_modify_prepare(void)
32{ 50{
33 set_kernel_text_rw(); 51 set_kernel_text_rw();
52 modifying_code = 1;
34 return 0; 53 return 0;
35} 54}
36 55
37int ftrace_arch_code_modify_post_process(void) 56int ftrace_arch_code_modify_post_process(void)
38{ 57{
58 modifying_code = 0;
39 set_kernel_text_ro(); 59 set_kernel_text_ro();
40 return 0; 60 return 0;
41} 61}
@@ -147,6 +167,11 @@ static void ftrace_mod_code(void)
147 167
148void ftrace_nmi_enter(void) 168void ftrace_nmi_enter(void)
149{ 169{
170 __get_cpu_var(save_modifying_code) = modifying_code;
171
172 if (!__get_cpu_var(save_modifying_code))
173 return;
174
150 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) { 175 if (atomic_inc_return(&nmi_running) & MOD_CODE_WRITE_FLAG) {
151 smp_rmb(); 176 smp_rmb();
152 ftrace_mod_code(); 177 ftrace_mod_code();
@@ -158,6 +183,9 @@ void ftrace_nmi_enter(void)
158 183
159void ftrace_nmi_exit(void) 184void ftrace_nmi_exit(void)
160{ 185{
186 if (!__get_cpu_var(save_modifying_code))
187 return;
188
161 /* Finish all executions before clearing nmi_running */ 189 /* Finish all executions before clearing nmi_running */
162 smp_mb(); 190 smp_mb();
163 atomic_dec(&nmi_running); 191 atomic_dec(&nmi_running);
@@ -187,9 +215,26 @@ static void wait_for_nmi(void)
187 nmi_wait_count++; 215 nmi_wait_count++;
188} 216}
189 217
218static inline int
219within(unsigned long addr, unsigned long start, unsigned long end)
220{
221 return addr >= start && addr < end;
222}
223
190static int 224static int
191do_ftrace_mod_code(unsigned long ip, void *new_code) 225do_ftrace_mod_code(unsigned long ip, void *new_code)
192{ 226{
227 /*
228 * On x86_64, kernel text mappings are mapped read-only with
229 * CONFIG_DEBUG_RODATA. So we use the kernel identity mapping instead
230 * of the kernel text mapping to modify the kernel text.
231 *
232 * For 32bit kernels, these mappings are same and we can use
233 * kernel identity mapping to modify code.
234 */
235 if (within(ip, (unsigned long)_text, (unsigned long)_etext))
236 ip = (unsigned long)__va(__pa(ip));
237
193 mod_code_ip = (void *)ip; 238 mod_code_ip = (void *)ip;
194 mod_code_newcode = new_code; 239 mod_code_newcode = new_code;
195 240
@@ -336,15 +381,15 @@ int __init ftrace_dyn_arch_init(void *data)
336 381
337 switch (faulted) { 382 switch (faulted) {
338 case 0: 383 case 0:
339 pr_info("ftrace: converting mcount calls to 0f 1f 44 00 00\n"); 384 pr_info("converting mcount calls to 0f 1f 44 00 00\n");
340 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE); 385 memcpy(ftrace_nop, ftrace_test_p6nop, MCOUNT_INSN_SIZE);
341 break; 386 break;
342 case 1: 387 case 1:
343 pr_info("ftrace: converting mcount calls to 66 66 66 66 90\n"); 388 pr_info("converting mcount calls to 66 66 66 66 90\n");
344 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE); 389 memcpy(ftrace_nop, ftrace_test_nop5, MCOUNT_INSN_SIZE);
345 break; 390 break;
346 case 2: 391 case 2:
347 pr_info("ftrace: converting mcount calls to jmp . + 5\n"); 392 pr_info("converting mcount calls to jmp . + 5\n");
348 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE); 393 memcpy(ftrace_nop, ftrace_test_jmp, MCOUNT_INSN_SIZE);
349 break; 394 break;
350 } 395 }
@@ -465,85 +510,3 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr,
465 } 510 }
466} 511}
467#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ 512#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
468
469#ifdef CONFIG_FTRACE_SYSCALLS
470
471extern unsigned long __start_syscalls_metadata[];
472extern unsigned long __stop_syscalls_metadata[];
473extern unsigned long *sys_call_table;
474
475static struct syscall_metadata **syscalls_metadata;
476
477static struct syscall_metadata *find_syscall_meta(unsigned long *syscall)
478{
479 struct syscall_metadata *start;
480 struct syscall_metadata *stop;
481 char str[KSYM_SYMBOL_LEN];
482
483
484 start = (struct syscall_metadata *)__start_syscalls_metadata;
485 stop = (struct syscall_metadata *)__stop_syscalls_metadata;
486 kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str);
487
488 for ( ; start < stop; start++) {
489 if (start->name && !strcmp(start->name, str))
490 return start;
491 }
492 return NULL;
493}
494
495struct syscall_metadata *syscall_nr_to_meta(int nr)
496{
497 if (!syscalls_metadata || nr >= NR_syscalls || nr < 0)
498 return NULL;
499
500 return syscalls_metadata[nr];
501}
502
503int syscall_name_to_nr(char *name)
504{
505 int i;
506
507 if (!syscalls_metadata)
508 return -1;
509
510 for (i = 0; i < NR_syscalls; i++) {
511 if (syscalls_metadata[i]) {
512 if (!strcmp(syscalls_metadata[i]->name, name))
513 return i;
514 }
515 }
516 return -1;
517}
518
519void set_syscall_enter_id(int num, int id)
520{
521 syscalls_metadata[num]->enter_id = id;
522}
523
524void set_syscall_exit_id(int num, int id)
525{
526 syscalls_metadata[num]->exit_id = id;
527}
528
529static int __init arch_init_ftrace_syscalls(void)
530{
531 int i;
532 struct syscall_metadata *meta;
533 unsigned long **psys_syscall_table = &sys_call_table;
534
535 syscalls_metadata = kzalloc(sizeof(*syscalls_metadata) *
536 NR_syscalls, GFP_KERNEL);
537 if (!syscalls_metadata) {
538 WARN_ON(1);
539 return -ENOMEM;
540 }
541
542 for (i = 0; i < NR_syscalls; i++) {
543 meta = find_syscall_meta(psys_syscall_table[i]);
544 syscalls_metadata[i] = meta;
545 }
546 return 0;
547}
548arch_initcall(arch_init_ftrace_syscalls);
549#endif
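
ftrace_nmi_enter()/exit() above sample modifying_code once per NMI into a per-CPU copy, so entry and exit agree on the fast or slow path even if the global flag flips while the NMI runs. A stand-alone sketch of that pairing, with plain ints in place of the per-CPU variable and printf in place of the barrier work:

#include <stdio.h>

static int modifying_code;              /* set around text patching */
static int save_modifying_code;         /* per-CPU copy in the kernel */

static void nmi_enter(void)
{
        save_modifying_code = modifying_code;
        if (!save_modifying_code)
                return;                 /* fast path: no barriers needed */
        printf("enter: sync with code modifier\n");
}

static void nmi_exit(void)
{
        if (!save_modifying_code)
                return;
        printf("exit: release code modifier\n");
}

int main(void)
{
        nmi_enter(); nmi_exit();        /* fast path */
        modifying_code = 1;
        nmi_enter();
        modifying_code = 0;             /* flag drops mid-NMI... */
        nmi_exit();                     /* ...but exit still pairs with enter */
        return 0;
}
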
diff --git a/arch/x86/kernel/geode_32.c b/arch/x86/kernel/geode_32.c
deleted file mode 100644
index 9b08e852fd1a..000000000000
--- a/arch/x86/kernel/geode_32.c
+++ /dev/null
@@ -1,196 +0,0 @@
1/*
2 * AMD Geode southbridge support code
3 * Copyright (C) 2006, Advanced Micro Devices, Inc.
4 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of version 2 of the GNU General Public License
8 * as published by the Free Software Foundation.
9 */
10
11#include <linux/kernel.h>
12#include <linux/module.h>
13#include <linux/ioport.h>
14#include <linux/io.h>
15#include <asm/msr.h>
16#include <asm/geode.h>
17
18static struct {
19 char *name;
20 u32 msr;
21 int size;
22 u32 base;
23} lbars[] = {
24 { "geode-pms", MSR_LBAR_PMS, LBAR_PMS_SIZE, 0 },
25 { "geode-acpi", MSR_LBAR_ACPI, LBAR_ACPI_SIZE, 0 },
26 { "geode-gpio", MSR_LBAR_GPIO, LBAR_GPIO_SIZE, 0 },
27 { "geode-mfgpt", MSR_LBAR_MFGPT, LBAR_MFGPT_SIZE, 0 }
28};
29
30static void __init init_lbars(void)
31{
32 u32 lo, hi;
33 int i;
34
35 for (i = 0; i < ARRAY_SIZE(lbars); i++) {
36 rdmsr(lbars[i].msr, lo, hi);
37 if (hi & 0x01)
38 lbars[i].base = lo & 0x0000ffff;
39
40 if (lbars[i].base == 0)
41 printk(KERN_ERR "geode: Couldn't initialize '%s'\n",
42 lbars[i].name);
43 }
44}
45
46int geode_get_dev_base(unsigned int dev)
47{
48 BUG_ON(dev >= ARRAY_SIZE(lbars));
49 return lbars[dev].base;
50}
51EXPORT_SYMBOL_GPL(geode_get_dev_base);
52
53/* === GPIO API === */
54
55void geode_gpio_set(u32 gpio, unsigned int reg)
56{
57 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
58
59 if (!base)
60 return;
61
62 /* low bank register */
63 if (gpio & 0xFFFF)
64 outl(gpio & 0xFFFF, base + reg);
65 /* high bank register */
66 gpio >>= 16;
67 if (gpio)
68 outl(gpio, base + 0x80 + reg);
69}
70EXPORT_SYMBOL_GPL(geode_gpio_set);
71
72void geode_gpio_clear(u32 gpio, unsigned int reg)
73{
74 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
75
76 if (!base)
77 return;
78
79 /* low bank register */
80 if (gpio & 0xFFFF)
81 outl((gpio & 0xFFFF) << 16, base + reg);
82 /* high bank register */
83 gpio &= (0xFFFF << 16);
84 if (gpio)
85 outl(gpio, base + 0x80 + reg);
86}
87EXPORT_SYMBOL_GPL(geode_gpio_clear);
88
89int geode_gpio_isset(u32 gpio, unsigned int reg)
90{
91 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
92 u32 val;
93
94 if (!base)
95 return 0;
96
97 /* low bank register */
98 if (gpio & 0xFFFF) {
99 val = inl(base + reg) & (gpio & 0xFFFF);
100 if ((gpio & 0xFFFF) == val)
101 return 1;
102 }
103 /* high bank register */
104 gpio >>= 16;
105 if (gpio) {
106 val = inl(base + 0x80 + reg) & gpio;
107 if (gpio == val)
108 return 1;
109 }
110 return 0;
111}
112EXPORT_SYMBOL_GPL(geode_gpio_isset);
113
114void geode_gpio_set_irq(unsigned int group, unsigned int irq)
115{
116 u32 lo, hi;
117
118 if (group > 7 || irq > 15)
119 return;
120
121 rdmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
122
123 lo &= ~(0xF << (group * 4));
124 lo |= (irq & 0xF) << (group * 4);
125
126 wrmsr(MSR_PIC_ZSEL_HIGH, lo, hi);
127}
128EXPORT_SYMBOL_GPL(geode_gpio_set_irq);
129
130void geode_gpio_setup_event(unsigned int gpio, int pair, int pme)
131{
132 u32 base = geode_get_dev_base(GEODE_DEV_GPIO);
133 u32 offset, shift, val;
134
135 if (gpio >= 24)
136 offset = GPIO_MAP_W;
137 else if (gpio >= 16)
138 offset = GPIO_MAP_Z;
139 else if (gpio >= 8)
140 offset = GPIO_MAP_Y;
141 else
142 offset = GPIO_MAP_X;
143
144 shift = (gpio % 8) * 4;
145
146 val = inl(base + offset);
147
148 /* Clear whatever was there before */
149 val &= ~(0xF << shift);
150
151 /* And set the new value */
152
153 val |= ((pair & 7) << shift);
154
155 /* Set the PME bit if this is a PME event */
156
157 if (pme)
158 val |= (1 << (shift + 3));
159
160 outl(val, base + offset);
161}
162EXPORT_SYMBOL_GPL(geode_gpio_setup_event);
163
164int geode_has_vsa2(void)
165{
166 static int has_vsa2 = -1;
167
168 if (has_vsa2 == -1) {
169 u16 val;
170
171 /*
172 * The VSA has virtual registers that we can query for a
173 * signature.
174 */
175 outw(VSA_VR_UNLOCK, VSA_VRC_INDEX);
176 outw(VSA_VR_SIGNATURE, VSA_VRC_INDEX);
177
178 val = inw(VSA_VRC_DATA);
179 has_vsa2 = (val == AMD_VSA_SIG || val == GSW_VSA_SIG);
180 }
181
182 return has_vsa2;
183}
184EXPORT_SYMBOL_GPL(geode_has_vsa2);
185
186static int __init geode_southbridge_init(void)
187{
188 if (!is_geode())
189 return -ENODEV;
190
191 init_lbars();
192 (void) mfgpt_timer_setup();
193 return 0;
194}
195
196postcore_initcall(geode_southbridge_init);
diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c
index 4f8e2507e8f3..b2e246037392 100644
--- a/arch/x86/kernel/head32.c
+++ b/arch/x86/kernel/head32.c
@@ -7,6 +7,7 @@
7 7
8#include <linux/init.h> 8#include <linux/init.h>
9#include <linux/start_kernel.h> 9#include <linux/start_kernel.h>
10#include <linux/mm.h>
10 11
11#include <asm/setup.h> 12#include <asm/setup.h>
12#include <asm/sections.h> 13#include <asm/sections.h>
@@ -29,16 +30,25 @@ static void __init i386_default_early_setup(void)
29 30
30void __init i386_start_kernel(void) 31void __init i386_start_kernel(void)
31{ 32{
32 reserve_trampoline_memory(); 33#ifdef CONFIG_X86_TRAMPOLINE
34 /*
35 * But first pinch a few for the stack/trampoline stuff
36 * FIXME: Don't need the extra page at 4K, but need to fix
37 * trampoline before removing it. (see the GDT stuff)
38 */
39 reserve_early_overlap_ok(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE,
40 "EX TRAMPOLINE");
41#endif
33 42
34 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 43 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
35 44
36#ifdef CONFIG_BLK_DEV_INITRD 45#ifdef CONFIG_BLK_DEV_INITRD
37 /* Reserve INITRD */ 46 /* Reserve INITRD */
38 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 47 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
48 /* Assume only end is not page aligned */
39 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 49 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
40 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 50 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
41 u64 ramdisk_end = ramdisk_image + ramdisk_size; 51 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
42 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 52 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
43 } 53 }
44#endif 54#endif
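
The initrd end is now rounded up with PAGE_ALIGN() before the range is reserved, on the assumption that only the end may be unaligned (head64.c below gets the same treatment). A quick sketch of that rounding, with a made-up image address and size:

#include <stdio.h>

#define PAGE_SIZE       4096UL
#define PAGE_ALIGN(x)   (((x) + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1))

int main(void)
{
        unsigned long ramdisk_image = 0x1000000;        /* start assumed page aligned */
        unsigned long ramdisk_size  = 0x123456;         /* arbitrary, unaligned size */
        unsigned long ramdisk_end   = PAGE_ALIGN(ramdisk_image + ramdisk_size);

        printf("raw end %#lx, reserved end %#lx\n",
               ramdisk_image + ramdisk_size, ramdisk_end);
        return 0;
}
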
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index 0b06cd778fd9..7147143fd614 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -98,16 +98,15 @@ void __init x86_64_start_reservations(char *real_mode_data)
98{ 98{
99 copy_bootdata(__va(real_mode_data)); 99 copy_bootdata(__va(real_mode_data));
100 100
101 reserve_trampoline_memory();
102
103 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); 101 reserve_early(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS");
104 102
105#ifdef CONFIG_BLK_DEV_INITRD 103#ifdef CONFIG_BLK_DEV_INITRD
106 /* Reserve INITRD */ 104 /* Reserve INITRD */
107 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) { 105 if (boot_params.hdr.type_of_loader && boot_params.hdr.ramdisk_image) {
106 /* Assume only end is not page aligned */
108 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; 107 unsigned long ramdisk_image = boot_params.hdr.ramdisk_image;
109 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; 108 unsigned long ramdisk_size = boot_params.hdr.ramdisk_size;
110 unsigned long ramdisk_end = ramdisk_image + ramdisk_size; 109 unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
111 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK"); 110 reserve_early(ramdisk_image, ramdisk_end, "RAMDISK");
112 } 111 }
113#endif 112#endif
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index 050c278481b1..37c3d4b17d85 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -18,6 +18,8 @@
18#include <asm/asm-offsets.h> 18#include <asm/asm-offsets.h>
19#include <asm/setup.h> 19#include <asm/setup.h>
20#include <asm/processor-flags.h> 20#include <asm/processor-flags.h>
21#include <asm/msr-index.h>
22#include <asm/cpufeature.h>
21#include <asm/percpu.h> 23#include <asm/percpu.h>
22 24
23/* Physical address */ 25/* Physical address */
@@ -297,25 +299,27 @@ ENTRY(startup_32_smp)
297 orl %edx,%eax 299 orl %edx,%eax
298 movl %eax,%cr4 300 movl %eax,%cr4
299 301
300 btl $5, %eax # check if PAE is enabled 302 testb $X86_CR4_PAE, %al # check if PAE is enabled
301 jnc 6f 303 jz 6f
302 304
303 /* Check if extended functions are implemented */ 305 /* Check if extended functions are implemented */
304 movl $0x80000000, %eax 306 movl $0x80000000, %eax
305 cpuid 307 cpuid
306 cmpl $0x80000000, %eax 308 /* Value must be in the range 0x80000001 to 0x8000ffff */
307 jbe 6f 309 subl $0x80000001, %eax
310 cmpl $(0x8000ffff-0x80000001), %eax
311 ja 6f
308 mov $0x80000001, %eax 312 mov $0x80000001, %eax
309 cpuid 313 cpuid
310 /* Execute Disable bit supported? */ 314 /* Execute Disable bit supported? */
311 btl $20, %edx 315 btl $(X86_FEATURE_NX & 31), %edx
312 jnc 6f 316 jnc 6f
313 317
314 /* Setup EFER (Extended Feature Enable Register) */ 318 /* Setup EFER (Extended Feature Enable Register) */
315 movl $0xc0000080, %ecx 319 movl $MSR_EFER, %ecx
316 rdmsr 320 rdmsr
317 321
318 btsl $11, %eax 322 btsl $_EFER_NX, %eax
319 /* Make changes effective */ 323 /* Make changes effective */
320 wrmsr 324 wrmsr
321 325
@@ -438,8 +442,8 @@ is386: movl $2,%ecx # set MP
438 */ 442 */
439 cmpb $0,ready 443 cmpb $0,ready
440 jne 1f 444 jne 1f
441 movl $per_cpu__gdt_page,%eax 445 movl $gdt_page,%eax
442 movl $per_cpu__stack_canary,%ecx 446 movl $stack_canary,%ecx
443 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax) 447 movw %cx, 8 * GDT_ENTRY_STACK_CANARY + 2(%eax)
444 shrl $16, %ecx 448 shrl $16, %ecx
445 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax) 449 movb %cl, 8 * GDT_ENTRY_STACK_CANARY + 4(%eax)
@@ -702,7 +706,7 @@ idt_descr:
702 .word 0 # 32 bit align gdt_desc.address 706 .word 0 # 32 bit align gdt_desc.address
703ENTRY(early_gdt_descr) 707ENTRY(early_gdt_descr)
704 .word GDT_ENTRIES*8-1 708 .word GDT_ENTRIES*8-1
705 .long per_cpu__gdt_page /* Overwritten for secondary CPUs */ 709 .long gdt_page /* Overwritten for secondary CPUs */
706 710
707/* 711/*
708 * The boot_gdt must mirror the equivalent in setup.S and is 712 * The boot_gdt must mirror the equivalent in setup.S and is
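
The extended-CPUID check above is rewritten as a single unsigned subtract-and-compare: after subtracting 0x80000001, any value up to (0x8000ffff - 0x80000001) is in range, and "ja" rejects everything else. A small C model of that test with sample leaf values:

#include <stdio.h>

/* Accept exactly 0x80000001..0x8000ffff using one unsigned compare. */
static int ext_cpuid_in_range(unsigned int max_ext_leaf)
{
        return (max_ext_leaf - 0x80000001u) <= (0x8000ffffu - 0x80000001u);
}

int main(void)
{
        printf("%d %d %d\n",
               ext_cpuid_in_range(0x80000000u),         /* 0: too small */
               ext_cpuid_in_range(0x80000008u),         /* 1 */
               ext_cpuid_in_range(0x8001ffffu));        /* 0: too large */
        return 0;
}
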
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 780cd928fcd5..3d1e6f16b7a6 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -27,7 +27,7 @@
27#define GET_CR2_INTO_RCX movq %cr2, %rcx 27#define GET_CR2_INTO_RCX movq %cr2, %rcx
28#endif 28#endif
29 29
30/* we are not able to switch in one step to the final KERNEL ADRESS SPACE 30/* we are not able to switch in one step to the final KERNEL ADDRESS SPACE
31 * because we need identity-mapped pages. 31 * because we need identity-mapped pages.
32 * 32 *
33 */ 33 */
@@ -212,8 +212,8 @@ ENTRY(secondary_startup_64)
212 */ 212 */
213 lgdt early_gdt_descr(%rip) 213 lgdt early_gdt_descr(%rip)
214 214
215 /* set up data segments. actually 0 would do too */ 215 /* set up data segments */
216 movl $__KERNEL_DS,%eax 216 xorl %eax,%eax
217 movl %eax,%ds 217 movl %eax,%ds
218 movl %eax,%ss 218 movl %eax,%ss
219 movl %eax,%es 219 movl %eax,%es
@@ -262,11 +262,11 @@ ENTRY(secondary_startup_64)
262 .quad x86_64_start_kernel 262 .quad x86_64_start_kernel
263 ENTRY(initial_gs) 263 ENTRY(initial_gs)
264 .quad INIT_PER_CPU_VAR(irq_stack_union) 264 .quad INIT_PER_CPU_VAR(irq_stack_union)
265 __FINITDATA
266 265
267 ENTRY(stack_start) 266 ENTRY(stack_start)
268 .quad init_thread_union+THREAD_SIZE-8 267 .quad init_thread_union+THREAD_SIZE-8
269 .word 0 268 .word 0
269 __FINITDATA
270 270
271bad_address: 271bad_address:
272 jmp bad_address 272 jmp bad_address
@@ -340,6 +340,7 @@ ENTRY(name)
340 i = i + 1 ; \ 340 i = i + 1 ; \
341 .endr 341 .endr
342 342
343 .data
343 /* 344 /*
344 * This default setting generates an ident mapping at address 0x100000 345 * This default setting generates an ident mapping at address 0x100000
345 * and a mapping for the kernel that precisely maps virtual address 346 * and a mapping for the kernel that precisely maps virtual address
diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c
index dedc2bddf7a5..23b4ecdffa9b 100644
--- a/arch/x86/kernel/hpet.c
+++ b/arch/x86/kernel/hpet.c
@@ -4,6 +4,7 @@
4#include <linux/sysdev.h> 4#include <linux/sysdev.h>
5#include <linux/delay.h> 5#include <linux/delay.h>
6#include <linux/errno.h> 6#include <linux/errno.h>
7#include <linux/slab.h>
7#include <linux/hpet.h> 8#include <linux/hpet.h>
8#include <linux/init.h> 9#include <linux/init.h>
9#include <linux/cpu.h> 10#include <linux/cpu.h>
@@ -33,6 +34,9 @@
33 * HPET address is set in acpi/boot.c, when an ACPI entry exists 34 * HPET address is set in acpi/boot.c, when an ACPI entry exists
34 */ 35 */
35unsigned long hpet_address; 36unsigned long hpet_address;
37u8 hpet_blockid; /* OS timer block num */
38u8 hpet_msi_disable;
39
36#ifdef CONFIG_PCI_MSI 40#ifdef CONFIG_PCI_MSI
37static unsigned long hpet_num_timers; 41static unsigned long hpet_num_timers;
38#endif 42#endif
@@ -47,12 +51,12 @@ struct hpet_dev {
47 char name[10]; 51 char name[10];
48}; 52};
49 53
50unsigned long hpet_readl(unsigned long a) 54inline unsigned int hpet_readl(unsigned int a)
51{ 55{
52 return readl(hpet_virt_address + a); 56 return readl(hpet_virt_address + a);
53} 57}
54 58
55static inline void hpet_writel(unsigned long d, unsigned long a) 59static inline void hpet_writel(unsigned int d, unsigned int a)
56{ 60{
57 writel(d, hpet_virt_address + a); 61 writel(d, hpet_virt_address + a);
58} 62}
@@ -167,7 +171,7 @@ do { \
167 171
168static void hpet_reserve_msi_timers(struct hpet_data *hd); 172static void hpet_reserve_msi_timers(struct hpet_data *hd);
169 173
170static void hpet_reserve_platform_timers(unsigned long id) 174static void hpet_reserve_platform_timers(unsigned int id)
171{ 175{
172 struct hpet __iomem *hpet = hpet_virt_address; 176 struct hpet __iomem *hpet = hpet_virt_address;
173 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2]; 177 struct hpet_timer __iomem *timer = &hpet->hpet_timers[2];
@@ -205,7 +209,7 @@ static void hpet_reserve_platform_timers(unsigned long id)
205 209
206} 210}
207#else 211#else
208static void hpet_reserve_platform_timers(unsigned long id) { } 212static void hpet_reserve_platform_timers(unsigned int id) { }
209#endif 213#endif
210 214
211/* 215/*
@@ -246,7 +250,7 @@ static void hpet_reset_counter(void)
246 250
247static void hpet_start_counter(void) 251static void hpet_start_counter(void)
248{ 252{
249 unsigned long cfg = hpet_readl(HPET_CFG); 253 unsigned int cfg = hpet_readl(HPET_CFG);
250 cfg |= HPET_CFG_ENABLE; 254 cfg |= HPET_CFG_ENABLE;
251 hpet_writel(cfg, HPET_CFG); 255 hpet_writel(cfg, HPET_CFG);
252} 256}
@@ -263,7 +267,7 @@ static void hpet_resume_device(void)
263 force_hpet_resume(); 267 force_hpet_resume();
264} 268}
265 269
266static void hpet_resume_counter(void) 270static void hpet_resume_counter(struct clocksource *cs)
267{ 271{
268 hpet_resume_device(); 272 hpet_resume_device();
269 hpet_restart_counter(); 273 hpet_restart_counter();
@@ -271,7 +275,7 @@ static void hpet_resume_counter(void)
271 275
272static void hpet_enable_legacy_int(void) 276static void hpet_enable_legacy_int(void)
273{ 277{
274 unsigned long cfg = hpet_readl(HPET_CFG); 278 unsigned int cfg = hpet_readl(HPET_CFG);
275 279
276 cfg |= HPET_CFG_LEGACY; 280 cfg |= HPET_CFG_LEGACY;
277 hpet_writel(cfg, HPET_CFG); 281 hpet_writel(cfg, HPET_CFG);
@@ -314,7 +318,7 @@ static int hpet_setup_msi_irq(unsigned int irq);
314static void hpet_set_mode(enum clock_event_mode mode, 318static void hpet_set_mode(enum clock_event_mode mode,
315 struct clock_event_device *evt, int timer) 319 struct clock_event_device *evt, int timer)
316{ 320{
317 unsigned long cfg, cmp, now; 321 unsigned int cfg, cmp, now;
318 uint64_t delta; 322 uint64_t delta;
319 323
320 switch (mode) { 324 switch (mode) {
@@ -323,7 +327,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
323 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult; 327 delta = ((uint64_t)(NSEC_PER_SEC/HZ)) * evt->mult;
324 delta >>= evt->shift; 328 delta >>= evt->shift;
325 now = hpet_readl(HPET_COUNTER); 329 now = hpet_readl(HPET_COUNTER);
326 cmp = now + (unsigned long) delta; 330 cmp = now + (unsigned int) delta;
327 cfg = hpet_readl(HPET_Tn_CFG(timer)); 331 cfg = hpet_readl(HPET_Tn_CFG(timer));
328 /* Make sure we use edge triggered interrupts */ 332 /* Make sure we use edge triggered interrupts */
329 cfg &= ~HPET_TN_LEVEL; 333 cfg &= ~HPET_TN_LEVEL;
@@ -339,7 +343,7 @@ static void hpet_set_mode(enum clock_event_mode mode,
339 * (See AMD-8111 HyperTransport I/O Hub Data Sheet, 343 * (See AMD-8111 HyperTransport I/O Hub Data Sheet,
340 * Publication # 24674) 344 * Publication # 24674)
341 */ 345 */
342 hpet_writel((unsigned long) delta, HPET_Tn_CMP(timer)); 346 hpet_writel((unsigned int) delta, HPET_Tn_CMP(timer));
343 hpet_start_counter(); 347 hpet_start_counter();
344 hpet_print_config(); 348 hpet_print_config();
345 break; 349 break;
@@ -383,13 +387,30 @@ static int hpet_next_event(unsigned long delta,
383 hpet_writel(cnt, HPET_Tn_CMP(timer)); 387 hpet_writel(cnt, HPET_Tn_CMP(timer));
384 388
385 /* 389 /*
386 * We need to read back the CMP register to make sure that 390 * We need to read back the CMP register on certain HPET
387 * what we wrote hit the chip before we compare it to the 391 * implementations (ATI chipsets) which seem to delay the
388 * counter. 392 * transfer of the compare register into the internal compare
393 * logic. With small deltas this might actually be too late as
394 * the counter could already be higher than the compare value
395 * at that point and we would wait for the next hpet interrupt
396 * forever. We found out that reading the CMP register back
397 * forces the transfer so we can rely on the comparison with
398 * the counter register below. If the read back from the
399 * compare register does not match the value we programmed
400 * then we might have a real hardware problem. We can not do
401 * much about it here, but at least alert the user/admin with
402 * a prominent warning.
 403 * An erratum on some chipsets (ICH9, ...) causes a comparator read
 404 * immediately following a write to return the old value. The
 405 * workaround is to read the value a second time when the first
 406 * read returns the old value.
389 */ 407 */
390 WARN_ON_ONCE((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt); 408 if (unlikely((u32)hpet_readl(HPET_Tn_CMP(timer)) != cnt)) {
409 WARN_ONCE(hpet_readl(HPET_Tn_CMP(timer)) != cnt,
410 KERN_WARNING "hpet: compare register read back failed.\n");
411 }
391 412
392 return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; 413 return (s32)(hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0;
393} 414}
394 415
395static void hpet_legacy_set_mode(enum clock_event_mode mode, 416static void hpet_legacy_set_mode(enum clock_event_mode mode,
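For reference, a small C sketch (not part of the patch) of the signed-difference test the hunk above ends with; it relies on 32-bit wraparound, which is also why the explicit (u32) cast can be dropped once hpet_readl() returns unsigned int:

static int sketch_hpet_missed_event(u32 counter, u32 cnt)
{
        /*
         * cnt still ahead of the counter: the unsigned difference wraps
         * around and looks negative as s32, so no event was missed.
         * counter already at or past cnt: the difference is a small
         * positive number and we report -ETIME so the caller retries.
         */
        return (s32)(counter - cnt) >= 0 ? -ETIME : 0;
}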
@@ -415,7 +436,7 @@ static struct hpet_dev *hpet_devs;
415void hpet_msi_unmask(unsigned int irq) 436void hpet_msi_unmask(unsigned int irq)
416{ 437{
417 struct hpet_dev *hdev = get_irq_data(irq); 438 struct hpet_dev *hdev = get_irq_data(irq);
418 unsigned long cfg; 439 unsigned int cfg;
419 440
420 /* unmask it */ 441 /* unmask it */
421 cfg = hpet_readl(HPET_Tn_CFG(hdev->num)); 442 cfg = hpet_readl(HPET_Tn_CFG(hdev->num));
@@ -425,7 +446,7 @@ void hpet_msi_unmask(unsigned int irq)
425 446
426void hpet_msi_mask(unsigned int irq) 447void hpet_msi_mask(unsigned int irq)
427{ 448{
428 unsigned long cfg; 449 unsigned int cfg;
429 struct hpet_dev *hdev = get_irq_data(irq); 450 struct hpet_dev *hdev = get_irq_data(irq);
430 451
431 /* mask it */ 452 /* mask it */
@@ -467,7 +488,7 @@ static int hpet_msi_next_event(unsigned long delta,
467 488
468static int hpet_setup_msi_irq(unsigned int irq) 489static int hpet_setup_msi_irq(unsigned int irq)
469{ 490{
470 if (arch_setup_hpet_msi(irq)) { 491 if (arch_setup_hpet_msi(irq, hpet_blockid)) {
471 destroy_irq(irq); 492 destroy_irq(irq);
472 return -EINVAL; 493 return -EINVAL;
473 } 494 }
@@ -584,6 +605,11 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
584 unsigned int num_timers_used = 0; 605 unsigned int num_timers_used = 0;
585 int i; 606 int i;
586 607
608 if (hpet_msi_disable)
609 return;
610
611 if (boot_cpu_has(X86_FEATURE_ARAT))
612 return;
587 id = hpet_readl(HPET_ID); 613 id = hpet_readl(HPET_ID);
588 614
589 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT); 615 num_timers = ((id & HPET_ID_NUMBER) >> HPET_ID_NUMBER_SHIFT);
@@ -598,7 +624,7 @@ static void hpet_msi_capability_lookup(unsigned int start_timer)
598 624
599 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) { 625 for (i = start_timer; i < num_timers - RESERVE_TIMERS; i++) {
600 struct hpet_dev *hdev = &hpet_devs[num_timers_used]; 626 struct hpet_dev *hdev = &hpet_devs[num_timers_used];
601 unsigned long cfg = hpet_readl(HPET_Tn_CFG(i)); 627 unsigned int cfg = hpet_readl(HPET_Tn_CFG(i));
602 628
603 /* Only consider HPET timer with MSI support */ 629 /* Only consider HPET timer with MSI support */
604 if (!(cfg & HPET_TN_FSB_CAP)) 630 if (!(cfg & HPET_TN_FSB_CAP))
@@ -813,7 +839,7 @@ static int hpet_clocksource_register(void)
813 */ 839 */
814int __init hpet_enable(void) 840int __init hpet_enable(void)
815{ 841{
816 unsigned long id; 842 unsigned int id;
817 int i; 843 int i;
818 844
819 if (!is_hpet_capable()) 845 if (!is_hpet_capable())
@@ -872,10 +898,8 @@ int __init hpet_enable(void)
872 898
873 if (id & HPET_ID_LEGSUP) { 899 if (id & HPET_ID_LEGSUP) {
874 hpet_legacy_clockevent_register(); 900 hpet_legacy_clockevent_register();
875 hpet_msi_capability_lookup(2);
876 return 1; 901 return 1;
877 } 902 }
878 hpet_msi_capability_lookup(0);
879 return 0; 903 return 0;
880 904
881out_nohpet: 905out_nohpet:
@@ -908,9 +932,20 @@ static __init int hpet_late_init(void)
908 if (!hpet_virt_address) 932 if (!hpet_virt_address)
909 return -ENODEV; 933 return -ENODEV;
910 934
935 if (hpet_readl(HPET_ID) & HPET_ID_LEGSUP)
936 hpet_msi_capability_lookup(2);
937 else
938 hpet_msi_capability_lookup(0);
939
911 hpet_reserve_platform_timers(hpet_readl(HPET_ID)); 940 hpet_reserve_platform_timers(hpet_readl(HPET_ID));
912 hpet_print_config(); 941 hpet_print_config();
913 942
943 if (hpet_msi_disable)
944 return 0;
945
946 if (boot_cpu_has(X86_FEATURE_ARAT))
947 return 0;
948
914 for_each_online_cpu(cpu) { 949 for_each_online_cpu(cpu) {
915 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu); 950 hpet_cpuhp_notify(NULL, CPU_ONLINE, (void *)(long)cpu);
916 } 951 }
@@ -925,7 +960,7 @@ fs_initcall(hpet_late_init);
925void hpet_disable(void) 960void hpet_disable(void)
926{ 961{
927 if (is_hpet_capable()) { 962 if (is_hpet_capable()) {
928 unsigned long cfg = hpet_readl(HPET_CFG); 963 unsigned int cfg = hpet_readl(HPET_CFG);
929 964
930 if (hpet_legacy_int_enabled) { 965 if (hpet_legacy_int_enabled) {
931 cfg &= ~HPET_CFG_LEGACY; 966 cfg &= ~HPET_CFG_LEGACY;
@@ -965,8 +1000,8 @@ static int hpet_prev_update_sec;
965static struct rtc_time hpet_alarm_time; 1000static struct rtc_time hpet_alarm_time;
966static unsigned long hpet_pie_count; 1001static unsigned long hpet_pie_count;
967static u32 hpet_t1_cmp; 1002static u32 hpet_t1_cmp;
968static unsigned long hpet_default_delta; 1003static u32 hpet_default_delta;
969static unsigned long hpet_pie_delta; 1004static u32 hpet_pie_delta;
970static unsigned long hpet_pie_limit; 1005static unsigned long hpet_pie_limit;
971 1006
972static rtc_irq_handler irq_handler; 1007static rtc_irq_handler irq_handler;
@@ -1017,7 +1052,8 @@ EXPORT_SYMBOL_GPL(hpet_unregister_irq_handler);
1017 */ 1052 */
1018int hpet_rtc_timer_init(void) 1053int hpet_rtc_timer_init(void)
1019{ 1054{
1020 unsigned long cfg, cnt, delta, flags; 1055 unsigned int cfg, cnt, delta;
1056 unsigned long flags;
1021 1057
1022 if (!is_hpet_enabled()) 1058 if (!is_hpet_enabled())
1023 return 0; 1059 return 0;
@@ -1027,7 +1063,7 @@ int hpet_rtc_timer_init(void)
1027 1063
1028 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1064 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1029 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT; 1065 clc >>= hpet_clockevent.shift + DEFAULT_RTC_SHIFT;
1030 hpet_default_delta = (unsigned long) clc; 1066 hpet_default_delta = clc;
1031 } 1067 }
1032 1068
1033 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) 1069 if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit)
@@ -1113,7 +1149,8 @@ int hpet_set_periodic_freq(unsigned long freq)
1113 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC; 1149 clc = (uint64_t) hpet_clockevent.mult * NSEC_PER_SEC;
1114 do_div(clc, freq); 1150 do_div(clc, freq);
1115 clc >>= hpet_clockevent.shift; 1151 clc >>= hpet_clockevent.shift;
1116 hpet_pie_delta = (unsigned long) clc; 1152 hpet_pie_delta = clc;
1153 hpet_pie_limit = 0;
1117 } 1154 }
1118 return 1; 1155 return 1;
1119} 1156}
@@ -1127,7 +1164,7 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq);
1127 1164
1128static void hpet_rtc_timer_reinit(void) 1165static void hpet_rtc_timer_reinit(void)
1129{ 1166{
1130 unsigned long cfg, delta; 1167 unsigned int cfg, delta;
1131 int lost_ints = -1; 1168 int lost_ints = -1;
1132 1169
1133 if (unlikely(!hpet_rtc_flags)) { 1170 if (unlikely(!hpet_rtc_flags)) {
diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
new file mode 100644
index 000000000000..d6cc065f519f
--- /dev/null
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -0,0 +1,530 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) 2007 Alan Stern
17 * Copyright (C) 2009 IBM Corporation
18 * Copyright (C) 2009 Frederic Weisbecker <fweisbec@gmail.com>
19 *
20 * Authors: Alan Stern <stern@rowland.harvard.edu>
21 * K.Prasad <prasad@linux.vnet.ibm.com>
22 * Frederic Weisbecker <fweisbec@gmail.com>
23 */
24
25/*
26 * HW_breakpoint: a unified kernel/user-space hardware breakpoint facility,
27 * using the CPU's debug registers.
28 */
29
30#include <linux/perf_event.h>
31#include <linux/hw_breakpoint.h>
32#include <linux/irqflags.h>
33#include <linux/notifier.h>
34#include <linux/kallsyms.h>
35#include <linux/kprobes.h>
36#include <linux/percpu.h>
37#include <linux/kdebug.h>
38#include <linux/kernel.h>
39#include <linux/module.h>
40#include <linux/sched.h>
41#include <linux/init.h>
42#include <linux/smp.h>
43
44#include <asm/hw_breakpoint.h>
45#include <asm/processor.h>
46#include <asm/debugreg.h>
47
48/* Per cpu debug control register value */
49DEFINE_PER_CPU(unsigned long, cpu_dr7);
50EXPORT_PER_CPU_SYMBOL(cpu_dr7);
51
52/* Per cpu debug address registers values */
53static DEFINE_PER_CPU(unsigned long, cpu_debugreg[HBP_NUM]);
54
55/*
56 * Stores the breakpoints currently in use on each breakpoint address
57 * register for each cpus
58 */
59static DEFINE_PER_CPU(struct perf_event *, bp_per_reg[HBP_NUM]);
60
61
62static inline unsigned long
63__encode_dr7(int drnum, unsigned int len, unsigned int type)
64{
65 unsigned long bp_info;
66
67 bp_info = (len | type) & 0xf;
68 bp_info <<= (DR_CONTROL_SHIFT + drnum * DR_CONTROL_SIZE);
69 bp_info |= (DR_GLOBAL_ENABLE << (drnum * DR_ENABLE_SIZE));
70
71 return bp_info;
72}
73
74/*
75 * Encode the length, type, Exact, and Enable bits for a particular breakpoint
76 * as stored in debug register 7.
77 */
78unsigned long encode_dr7(int drnum, unsigned int len, unsigned int type)
79{
80 return __encode_dr7(drnum, len, type) | DR_GLOBAL_SLOWDOWN;
81}
82
83/*
84 * Decode the length and type bits for a particular breakpoint as
85 * stored in debug register 7. Return the "enabled" status.
86 */
87int decode_dr7(unsigned long dr7, int bpnum, unsigned *len, unsigned *type)
88{
89 int bp_info = dr7 >> (DR_CONTROL_SHIFT + bpnum * DR_CONTROL_SIZE);
90
91 *len = (bp_info & 0xc) | 0x40;
92 *type = (bp_info & 0x3) | 0x80;
93
94 return (dr7 >> (bpnum * DR_ENABLE_SIZE)) & 0x3;
95}
96
97/*
98 * Install a perf counter breakpoint.
99 *
100 * We seek a free debug address register and use it for this
101 * breakpoint. Eventually we enable it in the debug control register.
102 *
103 * Atomic: we hold the counter->ctx->lock and we only handle variables
104 * and registers local to this cpu.
105 */
106int arch_install_hw_breakpoint(struct perf_event *bp)
107{
108 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
109 unsigned long *dr7;
110 int i;
111
112 for (i = 0; i < HBP_NUM; i++) {
113 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
114
115 if (!*slot) {
116 *slot = bp;
117 break;
118 }
119 }
120
121 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
122 return -EBUSY;
123
124 set_debugreg(info->address, i);
125 __get_cpu_var(cpu_debugreg[i]) = info->address;
126
127 dr7 = &__get_cpu_var(cpu_dr7);
128 *dr7 |= encode_dr7(i, info->len, info->type);
129
130 set_debugreg(*dr7, 7);
131
132 return 0;
133}
134
135/*
136 * Uninstall the breakpoint contained in the given counter.
137 *
138 * First we search the debug address register it uses and then we disable
139 * it.
140 *
141 * Atomic: we hold the counter->ctx->lock and we only handle variables
142 * and registers local to this cpu.
143 */
144void arch_uninstall_hw_breakpoint(struct perf_event *bp)
145{
146 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
147 unsigned long *dr7;
148 int i;
149
150 for (i = 0; i < HBP_NUM; i++) {
151 struct perf_event **slot = &__get_cpu_var(bp_per_reg[i]);
152
153 if (*slot == bp) {
154 *slot = NULL;
155 break;
156 }
157 }
158
159 if (WARN_ONCE(i == HBP_NUM, "Can't find any breakpoint slot"))
160 return;
161
162 dr7 = &__get_cpu_var(cpu_dr7);
163 *dr7 &= ~__encode_dr7(i, info->len, info->type);
164
165 set_debugreg(*dr7, 7);
166}
167
168static int get_hbp_len(u8 hbp_len)
169{
170 unsigned int len_in_bytes = 0;
171
172 switch (hbp_len) {
173 case X86_BREAKPOINT_LEN_1:
174 len_in_bytes = 1;
175 break;
176 case X86_BREAKPOINT_LEN_2:
177 len_in_bytes = 2;
178 break;
179 case X86_BREAKPOINT_LEN_4:
180 len_in_bytes = 4;
181 break;
182#ifdef CONFIG_X86_64
183 case X86_BREAKPOINT_LEN_8:
184 len_in_bytes = 8;
185 break;
186#endif
187 }
188 return len_in_bytes;
189}
190
191/*
192 * Check for virtual address in user space.
193 */
194int arch_check_va_in_userspace(unsigned long va, u8 hbp_len)
195{
196 unsigned int len;
197
198 len = get_hbp_len(hbp_len);
199
200 return (va <= TASK_SIZE - len);
201}
202
203/*
204 * Check for virtual address in kernel space.
205 */
206static int arch_check_va_in_kernelspace(unsigned long va, u8 hbp_len)
207{
208 unsigned int len;
209
210 len = get_hbp_len(hbp_len);
211
212 return (va >= TASK_SIZE) && ((va + len - 1) >= TASK_SIZE);
213}
214
215int arch_bp_generic_fields(int x86_len, int x86_type,
216 int *gen_len, int *gen_type)
217{
218 /* Len */
219 switch (x86_len) {
220 case X86_BREAKPOINT_LEN_1:
221 *gen_len = HW_BREAKPOINT_LEN_1;
222 break;
223 case X86_BREAKPOINT_LEN_2:
224 *gen_len = HW_BREAKPOINT_LEN_2;
225 break;
226 case X86_BREAKPOINT_LEN_4:
227 *gen_len = HW_BREAKPOINT_LEN_4;
228 break;
229#ifdef CONFIG_X86_64
230 case X86_BREAKPOINT_LEN_8:
231 *gen_len = HW_BREAKPOINT_LEN_8;
232 break;
233#endif
234 default:
235 return -EINVAL;
236 }
237
238 /* Type */
239 switch (x86_type) {
240 case X86_BREAKPOINT_EXECUTE:
241 *gen_type = HW_BREAKPOINT_X;
242 break;
243 case X86_BREAKPOINT_WRITE:
244 *gen_type = HW_BREAKPOINT_W;
245 break;
246 case X86_BREAKPOINT_RW:
247 *gen_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R;
248 break;
249 default:
250 return -EINVAL;
251 }
252
253 return 0;
254}
255
256
257static int arch_build_bp_info(struct perf_event *bp)
258{
259 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
260
261 info->address = bp->attr.bp_addr;
262
263 /* Len */
264 switch (bp->attr.bp_len) {
265 case HW_BREAKPOINT_LEN_1:
266 info->len = X86_BREAKPOINT_LEN_1;
267 break;
268 case HW_BREAKPOINT_LEN_2:
269 info->len = X86_BREAKPOINT_LEN_2;
270 break;
271 case HW_BREAKPOINT_LEN_4:
272 info->len = X86_BREAKPOINT_LEN_4;
273 break;
274#ifdef CONFIG_X86_64
275 case HW_BREAKPOINT_LEN_8:
276 info->len = X86_BREAKPOINT_LEN_8;
277 break;
278#endif
279 default:
280 return -EINVAL;
281 }
282
283 /* Type */
284 switch (bp->attr.bp_type) {
285 case HW_BREAKPOINT_W:
286 info->type = X86_BREAKPOINT_WRITE;
287 break;
288 case HW_BREAKPOINT_W | HW_BREAKPOINT_R:
289 info->type = X86_BREAKPOINT_RW;
290 break;
291 case HW_BREAKPOINT_X:
292 info->type = X86_BREAKPOINT_EXECUTE;
293 break;
294 default:
295 return -EINVAL;
296 }
297
298 return 0;
299}
300/*
301 * Validate the arch-specific HW Breakpoint register settings
302 */
303int arch_validate_hwbkpt_settings(struct perf_event *bp,
304 struct task_struct *tsk)
305{
306 struct arch_hw_breakpoint *info = counter_arch_bp(bp);
307 unsigned int align;
308 int ret;
309
310
311 ret = arch_build_bp_info(bp);
312 if (ret)
313 return ret;
314
315 ret = -EINVAL;
316
317 if (info->type == X86_BREAKPOINT_EXECUTE)
318 /*
319 * Ptrace-refactoring code
320 * For now, we'll allow instruction breakpoint only for user-space
321 * addresses
322 */
323 if ((!arch_check_va_in_userspace(info->address, info->len)) &&
324 info->len != X86_BREAKPOINT_EXECUTE)
325 return ret;
326
327 switch (info->len) {
328 case X86_BREAKPOINT_LEN_1:
329 align = 0;
330 break;
331 case X86_BREAKPOINT_LEN_2:
332 align = 1;
333 break;
334 case X86_BREAKPOINT_LEN_4:
335 align = 3;
336 break;
337#ifdef CONFIG_X86_64
338 case X86_BREAKPOINT_LEN_8:
339 align = 7;
340 break;
341#endif
342 default:
343 return ret;
344 }
345
346 /*
347 * Check that the low-order bits of the address are appropriate
348 * for the alignment implied by len.
349 */
350 if (info->address & align)
351 return -EINVAL;
352
353 /* Check that the virtual address is in the proper range */
354 if (tsk) {
355 if (!arch_check_va_in_userspace(info->address, info->len))
356 return -EFAULT;
357 } else {
358 if (!arch_check_va_in_kernelspace(info->address, info->len))
359 return -EFAULT;
360 }
361
362 return 0;
363}
364
365/*
366 * Dump the debug register contents to the user.
 367 * We can't dump our per-cpu values because they
 368 * may contain a cpu-wide breakpoint, something that
 369 * doesn't belong to the current task.
370 *
371 * TODO: include non-ptrace user breakpoints (perf)
372 */
373void aout_dump_debugregs(struct user *dump)
374{
375 int i;
376 int dr7 = 0;
377 struct perf_event *bp;
378 struct arch_hw_breakpoint *info;
379 struct thread_struct *thread = &current->thread;
380
381 for (i = 0; i < HBP_NUM; i++) {
382 bp = thread->ptrace_bps[i];
383
384 if (bp && !bp->attr.disabled) {
385 dump->u_debugreg[i] = bp->attr.bp_addr;
386 info = counter_arch_bp(bp);
387 dr7 |= encode_dr7(i, info->len, info->type);
388 } else {
389 dump->u_debugreg[i] = 0;
390 }
391 }
392
393 dump->u_debugreg[4] = 0;
394 dump->u_debugreg[5] = 0;
395 dump->u_debugreg[6] = current->thread.debugreg6;
396
397 dump->u_debugreg[7] = dr7;
398}
399EXPORT_SYMBOL_GPL(aout_dump_debugregs);
400
401/*
402 * Release the user breakpoints used by ptrace
403 */
404void flush_ptrace_hw_breakpoint(struct task_struct *tsk)
405{
406 int i;
407 struct thread_struct *t = &tsk->thread;
408
409 for (i = 0; i < HBP_NUM; i++) {
410 unregister_hw_breakpoint(t->ptrace_bps[i]);
411 t->ptrace_bps[i] = NULL;
412 }
413}
414
415void hw_breakpoint_restore(void)
416{
417 set_debugreg(__get_cpu_var(cpu_debugreg[0]), 0);
418 set_debugreg(__get_cpu_var(cpu_debugreg[1]), 1);
419 set_debugreg(__get_cpu_var(cpu_debugreg[2]), 2);
420 set_debugreg(__get_cpu_var(cpu_debugreg[3]), 3);
421 set_debugreg(current->thread.debugreg6, 6);
422 set_debugreg(__get_cpu_var(cpu_dr7), 7);
423}
424EXPORT_SYMBOL_GPL(hw_breakpoint_restore);
425
426/*
427 * Handle debug exception notifications.
428 *
429 * Return value is either NOTIFY_STOP or NOTIFY_DONE as explained below.
430 *
431 * NOTIFY_DONE returned if one of the following conditions is true.
432 * i) When the causative address is from user-space and the exception
433 * is a valid one, i.e. not triggered as a result of lazy debug register
434 * switching
435 * ii) When there are more bits than trap<n> set in DR6 register (such
436 * as BD, BS or BT) indicating that more than one debug condition is
437 * met and requires some more action in do_debug().
438 *
439 * NOTIFY_STOP returned for all other cases
440 *
441 */
442static int __kprobes hw_breakpoint_handler(struct die_args *args)
443{
444 int i, cpu, rc = NOTIFY_STOP;
445 struct perf_event *bp;
446 unsigned long dr7, dr6;
447 unsigned long *dr6_p;
448
449 /* The DR6 value is pointed by args->err */
450 dr6_p = (unsigned long *)ERR_PTR(args->err);
451 dr6 = *dr6_p;
452
453 /* Do an early return if no trap bits are set in DR6 */
454 if ((dr6 & DR_TRAP_BITS) == 0)
455 return NOTIFY_DONE;
456
457 get_debugreg(dr7, 7);
458 /* Disable breakpoints during exception handling */
459 set_debugreg(0UL, 7);
460 /*
461 * Assert that local interrupts are disabled
462 * Reset the DRn bits in the virtualized register value.
463 * The ptrace trigger routine will add in whatever is needed.
464 */
465 current->thread.debugreg6 &= ~DR_TRAP_BITS;
466 cpu = get_cpu();
467
468 /* Handle all the breakpoints that were triggered */
469 for (i = 0; i < HBP_NUM; ++i) {
470 if (likely(!(dr6 & (DR_TRAP0 << i))))
471 continue;
472
473 /*
474 * The counter may be concurrently released but that can only
475 * occur from a call_rcu() path. We can then safely fetch
476 * the breakpoint, use its callback, touch its counter
477 * while we are in an rcu_read_lock() path.
478 */
479 rcu_read_lock();
480
481 bp = per_cpu(bp_per_reg[i], cpu);
482 /*
483 * Reset the 'i'th TRAP bit in dr6 to denote completion of
484 * exception handling
485 */
486 (*dr6_p) &= ~(DR_TRAP0 << i);
487 /*
488 * bp can be NULL due to lazy debug register switching
489 * or due to concurrent perf counter removing.
490 */
491 if (!bp) {
492 rcu_read_unlock();
493 break;
494 }
495
496 perf_bp_event(bp, args->regs);
497
498 rcu_read_unlock();
499 }
500 /*
501 * Further processing in do_debug() is needed for a) user-space
502 * breakpoints (to generate signals) and b) when the system has
503 * taken exception due to multiple causes
504 */
505 if ((current->thread.debugreg6 & DR_TRAP_BITS) ||
506 (dr6 & (~DR_TRAP_BITS)))
507 rc = NOTIFY_DONE;
508
509 set_debugreg(dr7, 7);
510 put_cpu();
511
512 return rc;
513}
514
515/*
516 * Handle debug exception notifications.
517 */
518int __kprobes hw_breakpoint_exceptions_notify(
519 struct notifier_block *unused, unsigned long val, void *data)
520{
521 if (val != DIE_DEBUG)
522 return NOTIFY_DONE;
523
524 return hw_breakpoint_handler(data);
525}
526
527void hw_breakpoint_pmu_read(struct perf_event *bp)
528{
529 /* TODO */
530}
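For reference, a worked example (not part of the patch) of the DR7 encoding implemented above, assuming the standard DR7 bit layout behind the X86_BREAKPOINT_* and DR_* constants:

static void sketch_dr7_encoding_example(void)
{
        /*
         * Slot 1, 4-byte write breakpoint: the (len | type) & 0xf nibble
         * is 0xd (len bits 11 = 4 bytes, R/W bits 01 = write).  It lands
         * in bits 20-23 (DR_CONTROL_SHIFT + 1 * DR_CONTROL_SIZE), the
         * global-enable bit for slot 1 is bit 3 (DR_GLOBAL_ENABLE shifted
         * by 1 * DR_ENABLE_SIZE), and encode_dr7() also sets GE (bit 9):
         *
         *      0x00d00000 | 0x8 | 0x200 == 0x00d00208
         */
        unsigned long dr7 = encode_dr7(1, X86_BREAKPOINT_LEN_4,
                                       X86_BREAKPOINT_WRITE);

        WARN_ON(dr7 != 0x00d00208UL);
}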
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index f2f8540a7f3d..54c31c285488 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -8,6 +8,7 @@
8#include <linux/module.h> 8#include <linux/module.h>
9#include <linux/regset.h> 9#include <linux/regset.h>
10#include <linux/sched.h> 10#include <linux/sched.h>
11#include <linux/slab.h>
11 12
12#include <asm/sigcontext.h> 13#include <asm/sigcontext.h>
13#include <asm/processor.h> 14#include <asm/processor.h>
@@ -164,6 +165,11 @@ int init_fpu(struct task_struct *tsk)
164 return 0; 165 return 0;
165} 166}
166 167
168/*
169 * The xstateregs_active() routine is the same as the fpregs_active() routine,
170 * as the "regset->n" for the xstate regset will be updated based on the feature
 171 * capabilities supported by xsave.
172 */
167int fpregs_active(struct task_struct *target, const struct user_regset *regset) 173int fpregs_active(struct task_struct *target, const struct user_regset *regset)
168{ 174{
169 return tsk_used_math(target) ? regset->n : 0; 175 return tsk_used_math(target) ? regset->n : 0;
@@ -204,8 +210,6 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
204 if (ret) 210 if (ret)
205 return ret; 211 return ret;
206 212
207 set_stopped_child_used_math(target);
208
209 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, 213 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
210 &target->thread.xstate->fxsave, 0, -1); 214 &target->thread.xstate->fxsave, 0, -1);
211 215
@@ -224,6 +228,68 @@ int xfpregs_set(struct task_struct *target, const struct user_regset *regset,
224 return ret; 228 return ret;
225} 229}
226 230
231int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
232 unsigned int pos, unsigned int count,
233 void *kbuf, void __user *ubuf)
234{
235 int ret;
236
237 if (!cpu_has_xsave)
238 return -ENODEV;
239
240 ret = init_fpu(target);
241 if (ret)
242 return ret;
243
244 /*
 245 * Copy the 48 bytes defined by software first into the xstate
246 * memory layout in the thread struct, so that we can copy the entire
247 * xstateregs to the user using one user_regset_copyout().
248 */
249 memcpy(&target->thread.xstate->fxsave.sw_reserved,
250 xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
251
252 /*
253 * Copy the xstate memory layout.
254 */
255 ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
256 &target->thread.xstate->xsave, 0, -1);
257 return ret;
258}
259
260int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
261 unsigned int pos, unsigned int count,
262 const void *kbuf, const void __user *ubuf)
263{
264 int ret;
265 struct xsave_hdr_struct *xsave_hdr;
266
267 if (!cpu_has_xsave)
268 return -ENODEV;
269
270 ret = init_fpu(target);
271 if (ret)
272 return ret;
273
274 ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
275 &target->thread.xstate->xsave, 0, -1);
276
277 /*
278 * mxcsr reserved bits must be masked to zero for security reasons.
279 */
280 target->thread.xstate->fxsave.mxcsr &= mxcsr_feature_mask;
281
282 xsave_hdr = &target->thread.xstate->xsave.xsave_hdr;
283
284 xsave_hdr->xstate_bv &= pcntxt_mask;
285 /*
286 * These bits must be zero.
287 */
288 xsave_hdr->reserved1[0] = xsave_hdr->reserved1[1] = 0;
289
290 return ret;
291}
292
227#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 293#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
228 294
229/* 295/*
@@ -404,8 +470,6 @@ int fpregs_set(struct task_struct *target, const struct user_regset *regset,
404 if (ret) 470 if (ret)
405 return ret; 471 return ret;
406 472
407 set_stopped_child_used_math(target);
408
409 if (!HAVE_HWFP) 473 if (!HAVE_HWFP)
410 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf); 474 return fpregs_soft_set(target, regset, pos, count, kbuf, ubuf);
411 475
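For reference, a rough sketch (not part of the patch) of the header that xstateregs_set() above sanitizes. It sits at offset 512 of the xsave area; the field names follow the kernel's struct xsave_hdr_struct, and the array sizes here are assumptions based on the architectural layout:

struct xsave_hdr_sketch {
        u64 xstate_bv;          /* which state components are present;
                                 * masked with pcntxt_mask by the setter */
        u64 reserved1[2];       /* must be zero -- cleared by the setter */
        u64 reserved2[5];       /* must be zero */
};

Masking xstate_bv with pcntxt_mask keeps user space from claiming state components the CPU (or kernel) does not actually support.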
diff --git a/arch/x86/kernel/i8259.c b/arch/x86/kernel/i8259.c
index df89102bef80..7c9f02c130f3 100644
--- a/arch/x86/kernel/i8259.c
+++ b/arch/x86/kernel/i8259.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/init.h> 9#include <linux/init.h>
11#include <linux/kernel_stat.h> 10#include <linux/kernel_stat.h>
@@ -32,8 +31,14 @@
32 */ 31 */
33 32
34static int i8259A_auto_eoi; 33static int i8259A_auto_eoi;
35DEFINE_SPINLOCK(i8259A_lock); 34DEFINE_RAW_SPINLOCK(i8259A_lock);
36static void mask_and_ack_8259A(unsigned int); 35static void mask_and_ack_8259A(unsigned int);
36static void mask_8259A(void);
37static void unmask_8259A(void);
38static void disable_8259A_irq(unsigned int irq);
39static void enable_8259A_irq(unsigned int irq);
40static void init_8259A(int auto_eoi);
41static int i8259A_irq_pending(unsigned int irq);
37 42
38struct irq_chip i8259A_chip = { 43struct irq_chip i8259A_chip = {
39 .name = "XT-PIC", 44 .name = "XT-PIC",
@@ -63,51 +68,51 @@ unsigned int cached_irq_mask = 0xffff;
63 */ 68 */
64unsigned long io_apic_irqs; 69unsigned long io_apic_irqs;
65 70
66void disable_8259A_irq(unsigned int irq) 71static void disable_8259A_irq(unsigned int irq)
67{ 72{
68 unsigned int mask = 1 << irq; 73 unsigned int mask = 1 << irq;
69 unsigned long flags; 74 unsigned long flags;
70 75
71 spin_lock_irqsave(&i8259A_lock, flags); 76 raw_spin_lock_irqsave(&i8259A_lock, flags);
72 cached_irq_mask |= mask; 77 cached_irq_mask |= mask;
73 if (irq & 8) 78 if (irq & 8)
74 outb(cached_slave_mask, PIC_SLAVE_IMR); 79 outb(cached_slave_mask, PIC_SLAVE_IMR);
75 else 80 else
76 outb(cached_master_mask, PIC_MASTER_IMR); 81 outb(cached_master_mask, PIC_MASTER_IMR);
77 spin_unlock_irqrestore(&i8259A_lock, flags); 82 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
78} 83}
79 84
80void enable_8259A_irq(unsigned int irq) 85static void enable_8259A_irq(unsigned int irq)
81{ 86{
82 unsigned int mask = ~(1 << irq); 87 unsigned int mask = ~(1 << irq);
83 unsigned long flags; 88 unsigned long flags;
84 89
85 spin_lock_irqsave(&i8259A_lock, flags); 90 raw_spin_lock_irqsave(&i8259A_lock, flags);
86 cached_irq_mask &= mask; 91 cached_irq_mask &= mask;
87 if (irq & 8) 92 if (irq & 8)
88 outb(cached_slave_mask, PIC_SLAVE_IMR); 93 outb(cached_slave_mask, PIC_SLAVE_IMR);
89 else 94 else
90 outb(cached_master_mask, PIC_MASTER_IMR); 95 outb(cached_master_mask, PIC_MASTER_IMR);
91 spin_unlock_irqrestore(&i8259A_lock, flags); 96 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
92} 97}
93 98
94int i8259A_irq_pending(unsigned int irq) 99static int i8259A_irq_pending(unsigned int irq)
95{ 100{
96 unsigned int mask = 1<<irq; 101 unsigned int mask = 1<<irq;
97 unsigned long flags; 102 unsigned long flags;
98 int ret; 103 int ret;
99 104
100 spin_lock_irqsave(&i8259A_lock, flags); 105 raw_spin_lock_irqsave(&i8259A_lock, flags);
101 if (irq < 8) 106 if (irq < 8)
102 ret = inb(PIC_MASTER_CMD) & mask; 107 ret = inb(PIC_MASTER_CMD) & mask;
103 else 108 else
104 ret = inb(PIC_SLAVE_CMD) & (mask >> 8); 109 ret = inb(PIC_SLAVE_CMD) & (mask >> 8);
105 spin_unlock_irqrestore(&i8259A_lock, flags); 110 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
106 111
107 return ret; 112 return ret;
108} 113}
109 114
110void make_8259A_irq(unsigned int irq) 115static void make_8259A_irq(unsigned int irq)
111{ 116{
112 disable_irq_nosync(irq); 117 disable_irq_nosync(irq);
113 io_apic_irqs &= ~(1<<irq); 118 io_apic_irqs &= ~(1<<irq);
@@ -150,7 +155,7 @@ static void mask_and_ack_8259A(unsigned int irq)
150 unsigned int irqmask = 1 << irq; 155 unsigned int irqmask = 1 << irq;
151 unsigned long flags; 156 unsigned long flags;
152 157
153 spin_lock_irqsave(&i8259A_lock, flags); 158 raw_spin_lock_irqsave(&i8259A_lock, flags);
154 /* 159 /*
155 * Lightweight spurious IRQ detection. We do not want 160 * Lightweight spurious IRQ detection. We do not want
156 * to overdo spurious IRQ handling - it's usually a sign 161 * to overdo spurious IRQ handling - it's usually a sign
@@ -183,7 +188,7 @@ handle_real_irq:
183 outb(cached_master_mask, PIC_MASTER_IMR); 188 outb(cached_master_mask, PIC_MASTER_IMR);
184 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */ 189 outb(0x60+irq, PIC_MASTER_CMD); /* 'Specific EOI to master */
185 } 190 }
186 spin_unlock_irqrestore(&i8259A_lock, flags); 191 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
187 return; 192 return;
188 193
189spurious_8259A_irq: 194spurious_8259A_irq:
@@ -281,37 +286,37 @@ static int __init i8259A_init_sysfs(void)
281 286
282device_initcall(i8259A_init_sysfs); 287device_initcall(i8259A_init_sysfs);
283 288
284void mask_8259A(void) 289static void mask_8259A(void)
285{ 290{
286 unsigned long flags; 291 unsigned long flags;
287 292
288 spin_lock_irqsave(&i8259A_lock, flags); 293 raw_spin_lock_irqsave(&i8259A_lock, flags);
289 294
290 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 295 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
291 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 296 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
292 297
293 spin_unlock_irqrestore(&i8259A_lock, flags); 298 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
294} 299}
295 300
296void unmask_8259A(void) 301static void unmask_8259A(void)
297{ 302{
298 unsigned long flags; 303 unsigned long flags;
299 304
300 spin_lock_irqsave(&i8259A_lock, flags); 305 raw_spin_lock_irqsave(&i8259A_lock, flags);
301 306
302 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 307 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
303 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 308 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
304 309
305 spin_unlock_irqrestore(&i8259A_lock, flags); 310 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
306} 311}
307 312
308void init_8259A(int auto_eoi) 313static void init_8259A(int auto_eoi)
309{ 314{
310 unsigned long flags; 315 unsigned long flags;
311 316
312 i8259A_auto_eoi = auto_eoi; 317 i8259A_auto_eoi = auto_eoi;
313 318
314 spin_lock_irqsave(&i8259A_lock, flags); 319 raw_spin_lock_irqsave(&i8259A_lock, flags);
315 320
316 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */ 321 outb(0xff, PIC_MASTER_IMR); /* mask all of 8259A-1 */
317 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */ 322 outb(0xff, PIC_SLAVE_IMR); /* mask all of 8259A-2 */
@@ -356,5 +361,49 @@ void init_8259A(int auto_eoi)
356 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */ 361 outb(cached_master_mask, PIC_MASTER_IMR); /* restore master IRQ mask */
357 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */ 362 outb(cached_slave_mask, PIC_SLAVE_IMR); /* restore slave IRQ mask */
358 363
359 spin_unlock_irqrestore(&i8259A_lock, flags); 364 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
360} 365}
366
367/*
 368 * Make the i8259 a driver so that PIC functions can be selected at run time.
 369 * The goal is to keep a single x86 binary compatible across PC-compatible and
 370 * non-PC-compatible platforms, such as x86 MID.
371 */
372
373static void legacy_pic_noop(void) { };
374static void legacy_pic_uint_noop(unsigned int unused) { };
375static void legacy_pic_int_noop(int unused) { };
376
377static struct irq_chip dummy_pic_chip = {
378 .name = "dummy pic",
379 .mask = legacy_pic_uint_noop,
380 .unmask = legacy_pic_uint_noop,
381 .disable = legacy_pic_uint_noop,
382 .mask_ack = legacy_pic_uint_noop,
383};
384static int legacy_pic_irq_pending_noop(unsigned int irq)
385{
386 return 0;
387}
388
389struct legacy_pic null_legacy_pic = {
390 .nr_legacy_irqs = 0,
391 .chip = &dummy_pic_chip,
392 .mask_all = legacy_pic_noop,
393 .restore_mask = legacy_pic_noop,
394 .init = legacy_pic_int_noop,
395 .irq_pending = legacy_pic_irq_pending_noop,
396 .make_irq = legacy_pic_uint_noop,
397};
398
399struct legacy_pic default_legacy_pic = {
400 .nr_legacy_irqs = NR_IRQS_LEGACY,
401 .chip = &i8259A_chip,
402 .mask_all = mask_8259A,
403 .restore_mask = unmask_8259A,
404 .init = init_8259A,
405 .irq_pending = i8259A_irq_pending,
406 .make_irq = make_8259A_irq,
407};
408
409struct legacy_pic *legacy_pic = &default_legacy_pic;
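For reference, a minimal sketch (not part of the patch) of how a non-PC platform setup path is expected to use the new indirection; the function name is made up, only the assignment matters:

static void __init sketch_platform_setup(void)
{
        /* No 8259A on this board: report zero legacy IRQs and route all
         * legacy-PIC operations to the no-op implementation. */
        legacy_pic = &null_legacy_pic;
}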
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 99c4d308f16b..8eec0ec59af2 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -103,9 +103,10 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
103 * on system-call entry - see also fork() and the signal handling 103 * on system-call entry - see also fork() and the signal handling
104 * code. 104 * code.
105 */ 105 */
106static int do_iopl(unsigned int level, struct pt_regs *regs) 106long sys_iopl(unsigned int level, struct pt_regs *regs)
107{ 107{
108 unsigned int old = (regs->flags >> 12) & 3; 108 unsigned int old = (regs->flags >> 12) & 3;
109 struct thread_struct *t = &current->thread;
109 110
110 if (level > 3) 111 if (level > 3)
111 return -EINVAL; 112 return -EINVAL;
@@ -115,29 +116,8 @@ static int do_iopl(unsigned int level, struct pt_regs *regs)
115 return -EPERM; 116 return -EPERM;
116 } 117 }
117 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12); 118 regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
118
119 return 0;
120}
121
122#ifdef CONFIG_X86_32
123long sys_iopl(struct pt_regs *regs)
124{
125 unsigned int level = regs->bx;
126 struct thread_struct *t = &current->thread;
127 int rc;
128
129 rc = do_iopl(level, regs);
130 if (rc < 0)
131 goto out;
132
133 t->iopl = level << 12; 119 t->iopl = level << 12;
134 set_iopl_mask(t->iopl); 120 set_iopl_mask(t->iopl);
135out: 121
136 return rc; 122 return 0;
137}
138#else
139asmlinkage long sys_iopl(unsigned int level, struct pt_regs *regs)
140{
141 return do_iopl(level, regs);
142} 123}
143#endif
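For reference, a small C sketch (not part of the patch) of the EFLAGS arithmetic the unified sys_iopl() uses: the IOPL field occupies bits 12-13 of EFLAGS, so X86_EFLAGS_IOPL is 0x3000 and the shifts above fall out directly:

static void sketch_iopl_bits(struct pt_regs *regs, unsigned int level)
{
        unsigned int old = (regs->flags >> 12) & 3;     /* current IOPL */

        /* Installing level 3 sets both bits: 3 << 12 == 0x3000 */
        regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | (level << 12);
        (void)old;
}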
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 04bbd5278568..91fd0c70a18a 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -18,7 +18,7 @@
18atomic_t irq_err_count; 18atomic_t irq_err_count;
19 19
20/* Function pointer for generic interrupt vector handling */ 20/* Function pointer for generic interrupt vector handling */
21void (*generic_interrupt_extension)(void) = NULL; 21void (*x86_platform_ipi_callback)(void) = NULL;
22 22
23/* 23/*
24 * 'what should we do if we get a hw irq event on an illegal vector'. 24 * 'what should we do if we get a hw irq event on an illegal vector'.
@@ -72,10 +72,10 @@ static int show_other_interrupts(struct seq_file *p, int prec)
72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); 72 seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs);
73 seq_printf(p, " Performance pending work\n"); 73 seq_printf(p, " Performance pending work\n");
74#endif 74#endif
75 if (generic_interrupt_extension) { 75 if (x86_platform_ipi_callback) {
76 seq_printf(p, "%*s: ", prec, "PLT"); 76 seq_printf(p, "%*s: ", prec, "PLT");
77 for_each_online_cpu(j) 77 for_each_online_cpu(j)
78 seq_printf(p, "%10u ", irq_stats(j)->generic_irqs); 78 seq_printf(p, "%10u ", irq_stats(j)->x86_platform_ipis);
79 seq_printf(p, " Platform interrupts\n"); 79 seq_printf(p, " Platform interrupts\n");
80 } 80 }
81#ifdef CONFIG_SMP 81#ifdef CONFIG_SMP
@@ -92,17 +92,17 @@ static int show_other_interrupts(struct seq_file *p, int prec)
92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count); 92 seq_printf(p, "%10u ", irq_stats(j)->irq_tlb_count);
93 seq_printf(p, " TLB shootdowns\n"); 93 seq_printf(p, " TLB shootdowns\n");
94#endif 94#endif
95#ifdef CONFIG_X86_MCE 95#ifdef CONFIG_X86_THERMAL_VECTOR
96 seq_printf(p, "%*s: ", prec, "TRM"); 96 seq_printf(p, "%*s: ", prec, "TRM");
97 for_each_online_cpu(j) 97 for_each_online_cpu(j)
98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count); 98 seq_printf(p, "%10u ", irq_stats(j)->irq_thermal_count);
99 seq_printf(p, " Thermal event interrupts\n"); 99 seq_printf(p, " Thermal event interrupts\n");
100# ifdef CONFIG_X86_MCE_THRESHOLD 100#endif
101#ifdef CONFIG_X86_MCE_THRESHOLD
101 seq_printf(p, "%*s: ", prec, "THR"); 102 seq_printf(p, "%*s: ", prec, "THR");
102 for_each_online_cpu(j) 103 for_each_online_cpu(j)
103 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count); 104 seq_printf(p, "%10u ", irq_stats(j)->irq_threshold_count);
104 seq_printf(p, " Threshold APIC interrupts\n"); 105 seq_printf(p, " Threshold APIC interrupts\n");
105# endif
106#endif 106#endif
107#ifdef CONFIG_X86_MCE 107#ifdef CONFIG_X86_MCE
108 seq_printf(p, "%*s: ", prec, "MCE"); 108 seq_printf(p, "%*s: ", prec, "MCE");
@@ -149,7 +149,7 @@ int show_interrupts(struct seq_file *p, void *v)
149 if (!desc) 149 if (!desc)
150 return 0; 150 return 0;
151 151
152 spin_lock_irqsave(&desc->lock, flags); 152 raw_spin_lock_irqsave(&desc->lock, flags);
153 for_each_online_cpu(j) 153 for_each_online_cpu(j)
154 any_count |= kstat_irqs_cpu(i, j); 154 any_count |= kstat_irqs_cpu(i, j);
155 action = desc->action; 155 action = desc->action;
@@ -170,7 +170,7 @@ int show_interrupts(struct seq_file *p, void *v)
170 170
171 seq_putc(p, '\n'); 171 seq_putc(p, '\n');
172out: 172out:
173 spin_unlock_irqrestore(&desc->lock, flags); 173 raw_spin_unlock_irqrestore(&desc->lock, flags);
174 return 0; 174 return 0;
175} 175}
176 176
@@ -187,18 +187,18 @@ u64 arch_irq_stat_cpu(unsigned int cpu)
187 sum += irq_stats(cpu)->apic_perf_irqs; 187 sum += irq_stats(cpu)->apic_perf_irqs;
188 sum += irq_stats(cpu)->apic_pending_irqs; 188 sum += irq_stats(cpu)->apic_pending_irqs;
189#endif 189#endif
190 if (generic_interrupt_extension) 190 if (x86_platform_ipi_callback)
191 sum += irq_stats(cpu)->generic_irqs; 191 sum += irq_stats(cpu)->x86_platform_ipis;
192#ifdef CONFIG_SMP 192#ifdef CONFIG_SMP
193 sum += irq_stats(cpu)->irq_resched_count; 193 sum += irq_stats(cpu)->irq_resched_count;
194 sum += irq_stats(cpu)->irq_call_count; 194 sum += irq_stats(cpu)->irq_call_count;
195 sum += irq_stats(cpu)->irq_tlb_count; 195 sum += irq_stats(cpu)->irq_tlb_count;
196#endif 196#endif
197#ifdef CONFIG_X86_MCE 197#ifdef CONFIG_X86_THERMAL_VECTOR
198 sum += irq_stats(cpu)->irq_thermal_count; 198 sum += irq_stats(cpu)->irq_thermal_count;
199# ifdef CONFIG_X86_MCE_THRESHOLD 199#endif
200#ifdef CONFIG_X86_MCE_THRESHOLD
200 sum += irq_stats(cpu)->irq_threshold_count; 201 sum += irq_stats(cpu)->irq_threshold_count;
201# endif
202#endif 202#endif
203#ifdef CONFIG_X86_MCE 203#ifdef CONFIG_X86_MCE
204 sum += per_cpu(mce_exception_count, cpu); 204 sum += per_cpu(mce_exception_count, cpu);
@@ -251,9 +251,9 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs)
251} 251}
252 252
253/* 253/*
254 * Handler for GENERIC_INTERRUPT_VECTOR. 254 * Handler for X86_PLATFORM_IPI_VECTOR.
255 */ 255 */
256void smp_generic_interrupt(struct pt_regs *regs) 256void smp_x86_platform_ipi(struct pt_regs *regs)
257{ 257{
258 struct pt_regs *old_regs = set_irq_regs(regs); 258 struct pt_regs *old_regs = set_irq_regs(regs);
259 259
@@ -263,10 +263,10 @@ void smp_generic_interrupt(struct pt_regs *regs)
263 263
264 irq_enter(); 264 irq_enter();
265 265
266 inc_irq_stat(generic_irqs); 266 inc_irq_stat(x86_platform_ipis);
267 267
268 if (generic_interrupt_extension) 268 if (x86_platform_ipi_callback)
269 generic_interrupt_extension(); 269 x86_platform_ipi_callback();
270 270
271 irq_exit(); 271 irq_exit();
272 272
@@ -274,3 +274,93 @@ void smp_generic_interrupt(struct pt_regs *regs)
274} 274}
275 275
276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq); 276EXPORT_SYMBOL_GPL(vector_used_by_percpu_irq);
277
278#ifdef CONFIG_HOTPLUG_CPU
279/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
280void fixup_irqs(void)
281{
282 unsigned int irq, vector;
283 static int warned;
284 struct irq_desc *desc;
285
286 for_each_irq_desc(irq, desc) {
287 int break_affinity = 0;
288 int set_affinity = 1;
289 const struct cpumask *affinity;
290
291 if (!desc)
292 continue;
293 if (irq == 2)
294 continue;
295
 296 /* interrupts are disabled at this point */
297 raw_spin_lock(&desc->lock);
298
299 affinity = desc->affinity;
300 if (!irq_has_action(irq) ||
301 cpumask_equal(affinity, cpu_online_mask)) {
302 raw_spin_unlock(&desc->lock);
303 continue;
304 }
305
306 /*
 307 * Complete the irq move. This cpu is going down, and in the
 308 * non-intr-remapping case we can't wait till this interrupt
 309 * arrives at this cpu before completing the irq move.
310 */
311 irq_force_complete_move(irq);
312
313 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
314 break_affinity = 1;
315 affinity = cpu_all_mask;
316 }
317
318 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->mask)
319 desc->chip->mask(irq);
320
321 if (desc->chip->set_affinity)
322 desc->chip->set_affinity(irq, affinity);
323 else if (!(warned++))
324 set_affinity = 0;
325
326 if (!(desc->status & IRQ_MOVE_PCNTXT) && desc->chip->unmask)
327 desc->chip->unmask(irq);
328
329 raw_spin_unlock(&desc->lock);
330
331 if (break_affinity && set_affinity)
332 printk("Broke affinity for irq %i\n", irq);
333 else if (!set_affinity)
334 printk("Cannot set affinity for irq %i\n", irq);
335 }
336
337 /*
 338 * We can remove mdelay() and then send spurious interrupts to
339 * new cpu targets for all the irqs that were handled previously by
340 * this cpu. While it works, I have seen spurious interrupt messages
341 * (nothing wrong but still...).
342 *
343 * So for now, retain mdelay(1) and check the IRR and then send those
344 * interrupts to new targets as this cpu is already offlined...
345 */
346 mdelay(1);
347
348 for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
349 unsigned int irr;
350
351 if (__get_cpu_var(vector_irq)[vector] < 0)
352 continue;
353
354 irr = apic_read(APIC_IRR + (vector / 32 * 0x10));
355 if (irr & (1 << (vector % 32))) {
356 irq = __get_cpu_var(vector_irq)[vector];
357
358 desc = irq_to_desc(irq);
359 raw_spin_lock(&desc->lock);
360 if (desc->chip->retrigger)
361 desc->chip->retrigger(irq);
362 raw_spin_unlock(&desc->lock);
363 }
364 }
365}
366#endif
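For reference, a short sketch (not part of the patch) of how the retrigger loop above indexes the local APIC's IRR: the IRR is a 256-bit bitmap spread over eight 32-bit registers spaced 0x10 apart, so vector 0x31, for example, lives in the second register at bit 17:

static int sketch_vector_pending(unsigned int vector)
{
        unsigned int irr = apic_read(APIC_IRR + (vector / 32 * 0x10));

        return !!(irr & (1U << (vector % 32)));
}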
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 7d35d0fe2329..10709f29d166 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -211,48 +211,3 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
211 211
212 return true; 212 return true;
213} 213}
214
215#ifdef CONFIG_HOTPLUG_CPU
216
217/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
218void fixup_irqs(void)
219{
220 unsigned int irq;
221 struct irq_desc *desc;
222
223 for_each_irq_desc(irq, desc) {
224 const struct cpumask *affinity;
225
226 if (!desc)
227 continue;
228 if (irq == 2)
229 continue;
230
231 affinity = desc->affinity;
232 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
233 printk("Breaking affinity for irq %i\n", irq);
234 affinity = cpu_all_mask;
235 }
236 if (desc->chip->set_affinity)
237 desc->chip->set_affinity(irq, affinity);
238 else if (desc->action)
239 printk_once("Cannot set affinity for irq %i\n", irq);
240 }
241
242#if 0
243 barrier();
244 /* Ingo Molnar says: "after the IO-APIC masks have been redirected
245 [note the nop - the interrupt-enable boundary on x86 is two
246 instructions from sti] - to flush out pending hardirqs and
247 IPIs. After this point nothing is supposed to reach this CPU." */
248 __asm__ __volatile__("sti; nop; cli");
249 barrier();
250#else
251 /* That doesn't seem sufficient. Give it 1ms. */
252 local_irq_enable();
253 mdelay(1);
254 local_irq_disable();
255#endif
256}
257#endif
258
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 977d8b43a0dd..acf8fbf8fbda 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -62,64 +62,6 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
62 return true; 62 return true;
63} 63}
64 64
65#ifdef CONFIG_HOTPLUG_CPU
66/* A cpu has been removed from cpu_online_mask. Reset irq affinities. */
67void fixup_irqs(void)
68{
69 unsigned int irq;
70 static int warned;
71 struct irq_desc *desc;
72
73 for_each_irq_desc(irq, desc) {
74 int break_affinity = 0;
75 int set_affinity = 1;
76 const struct cpumask *affinity;
77
78 if (!desc)
79 continue;
80 if (irq == 2)
81 continue;
82
83 /* interrupt's are disabled at this point */
84 spin_lock(&desc->lock);
85
86 affinity = desc->affinity;
87 if (!irq_has_action(irq) ||
88 cpumask_equal(affinity, cpu_online_mask)) {
89 spin_unlock(&desc->lock);
90 continue;
91 }
92
93 if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
94 break_affinity = 1;
95 affinity = cpu_all_mask;
96 }
97
98 if (desc->chip->mask)
99 desc->chip->mask(irq);
100
101 if (desc->chip->set_affinity)
102 desc->chip->set_affinity(irq, affinity);
103 else if (!(warned++))
104 set_affinity = 0;
105
106 if (desc->chip->unmask)
107 desc->chip->unmask(irq);
108
109 spin_unlock(&desc->lock);
110
111 if (break_affinity && set_affinity)
112 printk("Broke affinity for irq %i\n", irq);
113 else if (!set_affinity)
114 printk("Cannot set affinity for irq %i\n", irq);
115 }
116
117 /* That doesn't seem sufficient. Give it 1ms. */
118 local_irq_enable();
119 mdelay(1);
120 local_irq_disable();
121}
122#endif
123 65
124extern void call_softirq(void); 66extern void call_softirq(void);
125 67
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f5fa64c0b37e..a760ce1a2c0d 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -5,7 +5,6 @@
5#include <linux/ioport.h> 5#include <linux/ioport.h>
6#include <linux/interrupt.h> 6#include <linux/interrupt.h>
7#include <linux/timex.h> 7#include <linux/timex.h>
8#include <linux/slab.h>
9#include <linux/random.h> 8#include <linux/random.h>
10#include <linux/kprobes.h> 9#include <linux/kprobes.h>
11#include <linux/init.h> 10#include <linux/init.h>
@@ -84,24 +83,7 @@ static struct irqaction irq2 = {
84}; 83};
85 84
86DEFINE_PER_CPU(vector_irq_t, vector_irq) = { 85DEFINE_PER_CPU(vector_irq_t, vector_irq) = {
87 [0 ... IRQ0_VECTOR - 1] = -1, 86 [0 ... NR_VECTORS - 1] = -1,
88 [IRQ0_VECTOR] = 0,
89 [IRQ1_VECTOR] = 1,
90 [IRQ2_VECTOR] = 2,
91 [IRQ3_VECTOR] = 3,
92 [IRQ4_VECTOR] = 4,
93 [IRQ5_VECTOR] = 5,
94 [IRQ6_VECTOR] = 6,
95 [IRQ7_VECTOR] = 7,
96 [IRQ8_VECTOR] = 8,
97 [IRQ9_VECTOR] = 9,
98 [IRQ10_VECTOR] = 10,
99 [IRQ11_VECTOR] = 11,
100 [IRQ12_VECTOR] = 12,
101 [IRQ13_VECTOR] = 13,
102 [IRQ14_VECTOR] = 14,
103 [IRQ15_VECTOR] = 15,
104 [IRQ15_VECTOR + 1 ... NR_VECTORS - 1] = -1
105}; 87};
106 88
107int vector_used_by_percpu_irq(unsigned int vector) 89int vector_used_by_percpu_irq(unsigned int vector)
@@ -123,12 +105,12 @@ void __init init_ISA_irqs(void)
123#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC) 105#if defined(CONFIG_X86_64) || defined(CONFIG_X86_LOCAL_APIC)
124 init_bsp_APIC(); 106 init_bsp_APIC();
125#endif 107#endif
126 init_8259A(0); 108 legacy_pic->init(0);
127 109
128 /* 110 /*
129 * 16 old-style INTA-cycle interrupts: 111 * 16 old-style INTA-cycle interrupts:
130 */ 112 */
131 for (i = 0; i < NR_IRQS_LEGACY; i++) { 113 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++) {
132 struct irq_desc *desc = irq_to_desc(i); 114 struct irq_desc *desc = irq_to_desc(i);
133 115
134 desc->status = IRQ_DISABLED; 116 desc->status = IRQ_DISABLED;
@@ -142,9 +124,44 @@ void __init init_ISA_irqs(void)
142 124
143void __init init_IRQ(void) 125void __init init_IRQ(void)
144{ 126{
127 int i;
128
129 /*
 130 * On cpu 0, assign IRQ0_VECTOR..IRQ15_VECTOR to IRQs 0..15.
 131 * If these IRQs are handled by legacy interrupt controllers like the
 132 * PIC, then this configuration will likely be static after boot. If
 133 * these IRQs are handled by more modern controllers like the IO-APIC,
 134 * then this vector space can be freed and re-used dynamically as the
 135 * irqs migrate etc.
136 */
137 for (i = 0; i < legacy_pic->nr_legacy_irqs; i++)
138 per_cpu(vector_irq, 0)[IRQ0_VECTOR + i] = i;
139
145 x86_init.irqs.intr_init(); 140 x86_init.irqs.intr_init();
146} 141}
147 142
143/*
144 * Setup the vector to irq mappings.
145 */
146void setup_vector_irq(int cpu)
147{
148#ifndef CONFIG_X86_IO_APIC
149 int irq;
150
151 /*
 152 * On most platforms, the legacy PIC delivers its interrupts on the
 153 * boot cpu. But there are certain platforms where PIC interrupts are
 154 * delivered to multiple cpus. If the legacy IRQ is handled by the
 155 * legacy PIC, set up the static legacy vector-to-irq mapping for the
 156 * new cpu that is coming online:
157 */
158 for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
159 per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] = irq;
160#endif
161
162 __setup_vector_irq(cpu);
163}
164
148static void __init smp_intr_init(void) 165static void __init smp_intr_init(void)
149{ 166{
150#ifdef CONFIG_SMP 167#ifdef CONFIG_SMP
@@ -203,8 +220,8 @@ static void __init apic_intr_init(void)
203 /* self generated IPI for local APIC timer */ 220 /* self generated IPI for local APIC timer */
204 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); 221 alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt);
205 222
206 /* generic IPI for platform specific use */ 223 /* IPI for X86 platform specific use */
207 alloc_intr_gate(GENERIC_INTERRUPT_VECTOR, generic_interrupt); 224 alloc_intr_gate(X86_PLATFORM_IPI_VECTOR, x86_platform_ipi);
208 225
209 /* IPI vectors for APIC spurious and error interrupts */ 226 /* IPI vectors for APIC spurious and error interrupts */
210 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); 227 alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt);
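For reference, a small sketch (not part of the patch) of the invariant the new boot-time loop and setup_vector_irq() establish: with the default PIC, the IRQ0_VECTOR..IRQ15_VECTOR slots map back to IRQs 0..15, and every other slot stays -1 until a vector is allocated dynamically:

static void sketch_check_legacy_vectors(int cpu)
{
        int irq;

        for (irq = 0; irq < legacy_pic->nr_legacy_irqs; irq++)
                WARN_ON(per_cpu(vector_irq, cpu)[IRQ0_VECTOR + irq] != irq);
}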
diff --git a/arch/x86/kernel/k8.c b/arch/x86/kernel/k8.c
index cbc4332a77b2..0f7bc20cfcde 100644
--- a/arch/x86/kernel/k8.c
+++ b/arch/x86/kernel/k8.c
@@ -2,8 +2,8 @@
2 * Shared support code for AMD K8 northbridges and derivates. 2 * Shared support code for AMD K8 northbridges and derivates.
3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2. 3 * Copyright 2006 Andi Kleen, SUSE Labs. Subject to GPLv2.
4 */ 4 */
5#include <linux/gfp.h>
6#include <linux/types.h> 5#include <linux/types.h>
6#include <linux/slab.h>
7#include <linux/init.h> 7#include <linux/init.h>
8#include <linux/errno.h> 8#include <linux/errno.h>
9#include <linux/module.h> 9#include <linux/module.h>
@@ -121,3 +121,17 @@ void k8_flush_garts(void)
121} 121}
122EXPORT_SYMBOL_GPL(k8_flush_garts); 122EXPORT_SYMBOL_GPL(k8_flush_garts);
123 123
124static __init int init_k8_nbs(void)
125{
126 int err = 0;
127
128 err = cache_k8_northbridges();
129
130 if (err < 0)
131 printk(KERN_NOTICE "K8 NB: Cannot enumerate AMD northbridges.\n");
132
133 return err;
134}
135
136/* This has to go after the PCI subsystem */
137fs_initcall(init_k8_nbs);
diff --git a/arch/x86/kernel/kdebugfs.c b/arch/x86/kernel/kdebugfs.c
index e444357375ce..8afd9f321f10 100644
--- a/arch/x86/kernel/kdebugfs.c
+++ b/arch/x86/kernel/kdebugfs.c
@@ -9,6 +9,7 @@
9#include <linux/debugfs.h> 9#include <linux/debugfs.h>
10#include <linux/uaccess.h> 10#include <linux/uaccess.h>
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/slab.h>
12#include <linux/init.h> 13#include <linux/init.h>
13#include <linux/stat.h> 14#include <linux/stat.h>
14#include <linux/io.h> 15#include <linux/io.h>
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 8d82a77a3f3b..b2258ca91003 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -42,7 +42,9 @@
42#include <linux/init.h> 42#include <linux/init.h>
43#include <linux/smp.h> 43#include <linux/smp.h>
44#include <linux/nmi.h> 44#include <linux/nmi.h>
45#include <linux/hw_breakpoint.h>
45 46
47#include <asm/debugreg.h>
46#include <asm/apicdef.h> 48#include <asm/apicdef.h>
47#include <asm/system.h> 49#include <asm/system.h>
48 50
@@ -85,10 +87,15 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
85 gdb_regs[GDB_DS] = regs->ds; 87 gdb_regs[GDB_DS] = regs->ds;
86 gdb_regs[GDB_ES] = regs->es; 88 gdb_regs[GDB_ES] = regs->es;
87 gdb_regs[GDB_CS] = regs->cs; 89 gdb_regs[GDB_CS] = regs->cs;
88 gdb_regs[GDB_SS] = __KERNEL_DS;
89 gdb_regs[GDB_FS] = 0xFFFF; 90 gdb_regs[GDB_FS] = 0xFFFF;
90 gdb_regs[GDB_GS] = 0xFFFF; 91 gdb_regs[GDB_GS] = 0xFFFF;
91 gdb_regs[GDB_SP] = (int)&regs->sp; 92 if (user_mode_vm(regs)) {
93 gdb_regs[GDB_SS] = regs->ss;
94 gdb_regs[GDB_SP] = regs->sp;
95 } else {
96 gdb_regs[GDB_SS] = __KERNEL_DS;
97 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
98 }
92#else 99#else
93 gdb_regs[GDB_R8] = regs->r8; 100 gdb_regs[GDB_R8] = regs->r8;
94 gdb_regs[GDB_R9] = regs->r9; 101 gdb_regs[GDB_R9] = regs->r9;
@@ -101,7 +108,7 @@ void pt_regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
101 gdb_regs32[GDB_PS] = regs->flags; 108 gdb_regs32[GDB_PS] = regs->flags;
102 gdb_regs32[GDB_CS] = regs->cs; 109 gdb_regs32[GDB_CS] = regs->cs;
103 gdb_regs32[GDB_SS] = regs->ss; 110 gdb_regs32[GDB_SS] = regs->ss;
104 gdb_regs[GDB_SP] = regs->sp; 111 gdb_regs[GDB_SP] = kernel_stack_pointer(regs);
105#endif 112#endif
106} 113}
107 114
@@ -198,41 +205,81 @@ void gdb_regs_to_pt_regs(unsigned long *gdb_regs, struct pt_regs *regs)
198 205
199static struct hw_breakpoint { 206static struct hw_breakpoint {
200 unsigned enabled; 207 unsigned enabled;
201 unsigned type;
202 unsigned len;
203 unsigned long addr; 208 unsigned long addr;
209 int len;
210 int type;
211 struct perf_event **pev;
204} breakinfo[4]; 212} breakinfo[4];
205 213
206static void kgdb_correct_hw_break(void) 214static void kgdb_correct_hw_break(void)
207{ 215{
208 unsigned long dr7;
209 int correctit = 0;
210 int breakbit;
211 int breakno; 216 int breakno;
212 217
213 get_debugreg(dr7, 7);
214 for (breakno = 0; breakno < 4; breakno++) { 218 for (breakno = 0; breakno < 4; breakno++) {
215 breakbit = 2 << (breakno << 1); 219 struct perf_event *bp;
216 if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { 220 struct arch_hw_breakpoint *info;
217 correctit = 1; 221 int val;
218 dr7 |= breakbit; 222 int cpu = raw_smp_processor_id();
219 dr7 &= ~(0xf0000 << (breakno << 2)); 223 if (!breakinfo[breakno].enabled)
220 dr7 |= ((breakinfo[breakno].len << 2) | 224 continue;
221 breakinfo[breakno].type) << 225 bp = *per_cpu_ptr(breakinfo[breakno].pev, cpu);
222 ((breakno << 2) + 16); 226 info = counter_arch_bp(bp);
223 if (breakno >= 0 && breakno <= 3) 227 if (bp->attr.disabled != 1)
224 set_debugreg(breakinfo[breakno].addr, breakno); 228 continue;
225 229 bp->attr.bp_addr = breakinfo[breakno].addr;
226 } else { 230 bp->attr.bp_len = breakinfo[breakno].len;
227 if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { 231 bp->attr.bp_type = breakinfo[breakno].type;
228 correctit = 1; 232 info->address = breakinfo[breakno].addr;
229 dr7 &= ~breakbit; 233 info->len = breakinfo[breakno].len;
230 dr7 &= ~(0xf0000 << (breakno << 2)); 234 info->type = breakinfo[breakno].type;
231 } 235 val = arch_install_hw_breakpoint(bp);
232 } 236 if (!val)
237 bp->attr.disabled = 0;
233 } 238 }
234 if (correctit) 239 hw_breakpoint_restore();
235 set_debugreg(dr7, 7); 240}
241
242static int hw_break_reserve_slot(int breakno)
243{
244 int cpu;
245 int cnt = 0;
246 struct perf_event **pevent;
247
248 for_each_online_cpu(cpu) {
249 cnt++;
250 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
251 if (dbg_reserve_bp_slot(*pevent))
252 goto fail;
253 }
254
255 return 0;
256
257fail:
258 for_each_online_cpu(cpu) {
259 cnt--;
260 if (!cnt)
261 break;
262 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
263 dbg_release_bp_slot(*pevent);
264 }
265 return -1;
266}
267
268static int hw_break_release_slot(int breakno)
269{
270 struct perf_event **pevent;
271 int cpu;
272
273 for_each_online_cpu(cpu) {
274 pevent = per_cpu_ptr(breakinfo[breakno].pev, cpu);
275 if (dbg_release_bp_slot(*pevent))
276 /*
 277	 * The debugger is responsible for handling the retry on
278 * remove failure.
279 */
280 return -1;
281 }
282 return 0;
236} 283}
237 284
238static int 285static int
@@ -246,6 +293,10 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
246 if (i == 4) 293 if (i == 4)
247 return -1; 294 return -1;
248 295
296 if (hw_break_release_slot(i)) {
297 printk(KERN_ERR "Cannot remove hw breakpoint at %lx\n", addr);
298 return -1;
299 }
249 breakinfo[i].enabled = 0; 300 breakinfo[i].enabled = 0;
250 301
251 return 0; 302 return 0;
@@ -254,15 +305,23 @@ kgdb_remove_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
254static void kgdb_remove_all_hw_break(void) 305static void kgdb_remove_all_hw_break(void)
255{ 306{
256 int i; 307 int i;
308 int cpu = raw_smp_processor_id();
309 struct perf_event *bp;
257 310
258 for (i = 0; i < 4; i++) 311 for (i = 0; i < 4; i++) {
259 memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); 312 if (!breakinfo[i].enabled)
313 continue;
314 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
315 if (bp->attr.disabled == 1)
316 continue;
317 arch_uninstall_hw_breakpoint(bp);
318 bp->attr.disabled = 1;
319 }
260} 320}
261 321
262static int 322static int
263kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype) 323kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
264{ 324{
265 unsigned type;
266 int i; 325 int i;
267 326
268 for (i = 0; i < 4; i++) 327 for (i = 0; i < 4; i++)
@@ -273,27 +332,42 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
273 332
274 switch (bptype) { 333 switch (bptype) {
275 case BP_HARDWARE_BREAKPOINT: 334 case BP_HARDWARE_BREAKPOINT:
276 type = 0; 335 len = 1;
277 len = 1; 336 breakinfo[i].type = X86_BREAKPOINT_EXECUTE;
278 break; 337 break;
279 case BP_WRITE_WATCHPOINT: 338 case BP_WRITE_WATCHPOINT:
280 type = 1; 339 breakinfo[i].type = X86_BREAKPOINT_WRITE;
281 break; 340 break;
282 case BP_ACCESS_WATCHPOINT: 341 case BP_ACCESS_WATCHPOINT:
283 type = 3; 342 breakinfo[i].type = X86_BREAKPOINT_RW;
284 break; 343 break;
285 default: 344 default:
286 return -1; 345 return -1;
287 } 346 }
288 347 switch (len) {
289 if (len == 1 || len == 2 || len == 4) 348 case 1:
290 breakinfo[i].len = len - 1; 349 breakinfo[i].len = X86_BREAKPOINT_LEN_1;
291 else 350 break;
351 case 2:
352 breakinfo[i].len = X86_BREAKPOINT_LEN_2;
353 break;
354 case 4:
355 breakinfo[i].len = X86_BREAKPOINT_LEN_4;
356 break;
357#ifdef CONFIG_X86_64
358 case 8:
359 breakinfo[i].len = X86_BREAKPOINT_LEN_8;
360 break;
361#endif
362 default:
292 return -1; 363 return -1;
293 364 }
294 breakinfo[i].enabled = 1;
295 breakinfo[i].addr = addr; 365 breakinfo[i].addr = addr;
296 breakinfo[i].type = type; 366 if (hw_break_reserve_slot(i)) {
367 breakinfo[i].addr = 0;
368 return -1;
369 }
370 breakinfo[i].enabled = 1;
297 371
298 return 0; 372 return 0;
299} 373}
@@ -308,8 +382,21 @@ kgdb_set_hw_break(unsigned long addr, int len, enum kgdb_bptype bptype)
308 */ 382 */
309void kgdb_disable_hw_debug(struct pt_regs *regs) 383void kgdb_disable_hw_debug(struct pt_regs *regs)
310{ 384{
385 int i;
386 int cpu = raw_smp_processor_id();
387 struct perf_event *bp;
388
311 /* Disable hardware debugging while we are in kgdb: */ 389 /* Disable hardware debugging while we are in kgdb: */
312 set_debugreg(0UL, 7); 390 set_debugreg(0UL, 7);
391 for (i = 0; i < 4; i++) {
392 if (!breakinfo[i].enabled)
393 continue;
394 bp = *per_cpu_ptr(breakinfo[i].pev, cpu);
395 if (bp->attr.disabled == 1)
396 continue;
397 arch_uninstall_hw_breakpoint(bp);
398 bp->attr.disabled = 1;
399 }
313} 400}
314 401
315/** 402/**
@@ -373,7 +460,6 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
373 struct pt_regs *linux_regs) 460 struct pt_regs *linux_regs)
374{ 461{
375 unsigned long addr; 462 unsigned long addr;
376 unsigned long dr6;
377 char *ptr; 463 char *ptr;
378 int newPC; 464 int newPC;
379 465
@@ -395,25 +481,10 @@ int kgdb_arch_handle_exception(int e_vector, int signo, int err_code,
395 /* set the trace bit if we're stepping */ 481 /* set the trace bit if we're stepping */
396 if (remcomInBuffer[0] == 's') { 482 if (remcomInBuffer[0] == 's') {
397 linux_regs->flags |= X86_EFLAGS_TF; 483 linux_regs->flags |= X86_EFLAGS_TF;
398 kgdb_single_step = 1;
399 atomic_set(&kgdb_cpu_doing_single_step, 484 atomic_set(&kgdb_cpu_doing_single_step,
400 raw_smp_processor_id()); 485 raw_smp_processor_id());
401 } 486 }
402 487
403 get_debugreg(dr6, 6);
404 if (!(dr6 & 0x4000)) {
405 int breakno;
406
407 for (breakno = 0; breakno < 4; breakno++) {
408 if (dr6 & (1 << breakno) &&
409 breakinfo[breakno].type == 0) {
410 /* Set restore flag: */
411 linux_regs->flags |= X86_EFLAGS_RF;
412 break;
413 }
414 }
415 }
416 set_debugreg(0UL, 6);
417 kgdb_correct_hw_break(); 488 kgdb_correct_hw_break();
418 489
419 return 0; 490 return 0;
@@ -434,6 +505,11 @@ single_step_cont(struct pt_regs *regs, struct die_args *args)
434 "resuming...\n"); 505 "resuming...\n");
435 kgdb_arch_handle_exception(args->trapnr, args->signr, 506 kgdb_arch_handle_exception(args->trapnr, args->signr,
436 args->err, "c", "", regs); 507 args->err, "c", "", regs);
508 /*
509 * Reset the BS bit in dr6 (pointed by args->err) to
510 * denote completion of processing
511 */
512 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
437 513
438 return NOTIFY_STOP; 514 return NOTIFY_STOP;
439} 515}
@@ -476,8 +552,7 @@ static int __kgdb_notify(struct die_args *args, unsigned long cmd)
476 break; 552 break;
477 553
478 case DIE_DEBUG: 554 case DIE_DEBUG:
479 if (atomic_read(&kgdb_cpu_doing_single_step) == 555 if (atomic_read(&kgdb_cpu_doing_single_step) != -1) {
480 raw_smp_processor_id()) {
481 if (user_mode(regs)) 556 if (user_mode(regs))
482 return single_step_cont(regs, args); 557 return single_step_cont(regs, args);
483 break; 558 break;
@@ -530,7 +605,42 @@ static struct notifier_block kgdb_notifier = {
530 */ 605 */
531int kgdb_arch_init(void) 606int kgdb_arch_init(void)
532{ 607{
533 return register_die_notifier(&kgdb_notifier); 608 int i, cpu;
609 int ret;
610 struct perf_event_attr attr;
611 struct perf_event **pevent;
612
613 ret = register_die_notifier(&kgdb_notifier);
614 if (ret != 0)
615 return ret;
616 /*
 617	 * Pre-allocate the hw breakpoint structures in the non-atomic
 618	 * portion of kgdb because this operation requires mutexes to
619 * complete.
620 */
621 hw_breakpoint_init(&attr);
622 attr.bp_addr = (unsigned long)kgdb_arch_init;
623 attr.bp_len = HW_BREAKPOINT_LEN_1;
624 attr.bp_type = HW_BREAKPOINT_W;
625 attr.disabled = 1;
626 for (i = 0; i < 4; i++) {
627 breakinfo[i].pev = register_wide_hw_breakpoint(&attr, NULL);
628 if (IS_ERR(breakinfo[i].pev)) {
629 printk(KERN_ERR "kgdb: Could not allocate hw breakpoints\n");
630 breakinfo[i].pev = NULL;
631 kgdb_arch_exit();
632 return -1;
633 }
634 for_each_online_cpu(cpu) {
635 pevent = per_cpu_ptr(breakinfo[i].pev, cpu);
636 pevent[0]->hw.sample_period = 1;
637 if (pevent[0]->destroy != NULL) {
638 pevent[0]->destroy = NULL;
639 release_bp_slot(*pevent);
640 }
641 }
642 }
643 return ret;
534} 644}
535 645
536/** 646/**
@@ -541,6 +651,13 @@ int kgdb_arch_init(void)
541 */ 651 */
542void kgdb_arch_exit(void) 652void kgdb_arch_exit(void)
543{ 653{
654 int i;
655 for (i = 0; i < 4; i++) {
656 if (breakinfo[i].pev) {
657 unregister_wide_hw_breakpoint(breakinfo[i].pev);
658 breakinfo[i].pev = NULL;
659 }
660 }
544 unregister_die_notifier(&kgdb_notifier); 661 unregister_die_notifier(&kgdb_notifier);
545} 662}
546 663
diff --git a/arch/x86/kernel/kprobes.c b/arch/x86/kernel/kprobes.c
index 7b5169d2b000..1658efdfb4e5 100644
--- a/arch/x86/kernel/kprobes.c
+++ b/arch/x86/kernel/kprobes.c
@@ -48,31 +48,23 @@
48#include <linux/preempt.h> 48#include <linux/preempt.h>
49#include <linux/module.h> 49#include <linux/module.h>
50#include <linux/kdebug.h> 50#include <linux/kdebug.h>
51#include <linux/kallsyms.h>
52#include <linux/ftrace.h>
51 53
52#include <asm/cacheflush.h> 54#include <asm/cacheflush.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
54#include <asm/pgtable.h> 56#include <asm/pgtable.h>
55#include <asm/uaccess.h> 57#include <asm/uaccess.h>
56#include <asm/alternative.h> 58#include <asm/alternative.h>
59#include <asm/insn.h>
60#include <asm/debugreg.h>
57 61
58void jprobe_return_end(void); 62void jprobe_return_end(void);
59 63
60DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; 64DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL;
61DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); 65DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk);
62 66
63#ifdef CONFIG_X86_64 67#define stack_addr(regs) ((unsigned long *)kernel_stack_pointer(regs))
64#define stack_addr(regs) ((unsigned long *)regs->sp)
65#else
66/*
67 * "&regs->sp" looks wrong, but it's correct for x86_32. x86_32 CPUs
68 * don't save the ss and esp registers if the CPU is already in kernel
69 * mode when it traps. So for kprobes, regs->sp and regs->ss are not
70 * the [nonexistent] saved stack pointer and ss register, but rather
71 * the top 8 bytes of the pre-int3 stack. So &regs->sp happens to
72 * point to the top of the pre-int3 stack.
73 */
74#define stack_addr(regs) ((unsigned long *)&regs->sp)
75#endif
76 68
77#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\ 69#define W(row, b0, b1, b2, b3, b4, b5, b6, b7, b8, b9, ba, bb, bc, bd, be, bf)\
78 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \ 70 (((b0##UL << 0x0)|(b1##UL << 0x1)|(b2##UL << 0x2)|(b3##UL << 0x3) | \
@@ -106,50 +98,6 @@ static const u32 twobyte_is_boostable[256 / 32] = {
106 /* ----------------------------------------------- */ 98 /* ----------------------------------------------- */
107 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */ 99 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
108}; 100};
109static const u32 onebyte_has_modrm[256 / 32] = {
110 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
111 /* ----------------------------------------------- */
112 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 00 */
113 W(0x10, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 10 */
114 W(0x20, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) | /* 20 */
115 W(0x30, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0) , /* 30 */
116 W(0x40, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 40 */
117 W(0x50, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 50 */
118 W(0x60, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0) | /* 60 */
119 W(0x70, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 70 */
120 W(0x80, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 80 */
121 W(0x90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 90 */
122 W(0xa0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* a0 */
123 W(0xb0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* b0 */
124 W(0xc0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* c0 */
125 W(0xd0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1) , /* d0 */
126 W(0xe0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* e0 */
127 W(0xf0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1) /* f0 */
128 /* ----------------------------------------------- */
129 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
130};
131static const u32 twobyte_has_modrm[256 / 32] = {
132 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
133 /* ----------------------------------------------- */
134 W(0x00, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1) | /* 0f */
135 W(0x10, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0) , /* 1f */
136 W(0x20, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1) | /* 2f */
137 W(0x30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) , /* 3f */
138 W(0x40, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 4f */
139 W(0x50, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 5f */
140 W(0x60, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* 6f */
141 W(0x70, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1) , /* 7f */
142 W(0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0) | /* 8f */
143 W(0x90, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* 9f */
144 W(0xa0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1) | /* af */
145 W(0xb0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1) , /* bf */
146 W(0xc0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0) | /* cf */
147 W(0xd0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) , /* df */
148 W(0xe0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1) | /* ef */
149 W(0xf0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0) /* ff */
150 /* ----------------------------------------------- */
151 /* 0 1 2 3 4 5 6 7 8 9 a b c d e f */
152};
153#undef W 101#undef W
154 102
155struct kretprobe_blackpoint kretprobe_blacklist[] = { 103struct kretprobe_blackpoint kretprobe_blacklist[] = {
@@ -159,16 +107,22 @@ struct kretprobe_blackpoint kretprobe_blacklist[] = {
159}; 107};
160const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist); 108const int kretprobe_blacklist_size = ARRAY_SIZE(kretprobe_blacklist);
161 109
162/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/ 110static void __kprobes __synthesize_relative_insn(void *from, void *to, u8 op)
163static void __kprobes set_jmp_op(void *from, void *to)
164{ 111{
165 struct __arch_jmp_op { 112 struct __arch_relative_insn {
166 char op; 113 u8 op;
167 s32 raddr; 114 s32 raddr;
168 } __attribute__((packed)) * jop; 115 } __attribute__((packed)) *insn;
169 jop = (struct __arch_jmp_op *)from; 116
170 jop->raddr = (s32)((long)(to) - ((long)(from) + 5)); 117 insn = (struct __arch_relative_insn *)from;
171 jop->op = RELATIVEJUMP_INSTRUCTION; 118 insn->raddr = (s32)((long)(to) - ((long)(from) + 5));
119 insn->op = op;
120}
121
122/* Insert a jump instruction at address 'from', which jumps to address 'to'.*/
123static void __kprobes synthesize_reljump(void *from, void *to)
124{
125 __synthesize_relative_insn(from, to, RELATIVEJUMP_OPCODE);
172} 126}
173 127
174/* 128/*
@@ -244,6 +198,75 @@ retry:
244 } 198 }
245} 199}
246 200
201/* Recover the probed instruction at addr for further analysis. */
202static int recover_probed_instruction(kprobe_opcode_t *buf, unsigned long addr)
203{
204 struct kprobe *kp;
205 kp = get_kprobe((void *)addr);
206 if (!kp)
207 return -EINVAL;
208
209 /*
 210	 * Basically, kp->ainsn.insn holds the original instruction.
 211	 * However, a RIP-relative instruction cannot be single-stepped at a
 212	 * different place, so __copy_instruction() tweaks the displacement of
 213	 * that instruction. In that case, we can't recover the instruction
 214	 * from kp->ainsn.insn.
 215	 *
 216	 * On the other hand, kp->opcode holds a copy of the first byte of
 217	 * the probed instruction, which was overwritten by int3. Since
 218	 * the instruction at kp->addr is not modified by kprobes except
 219	 * for the first byte, we can recover the original instruction
 220	 * from it and kp->opcode.
221 */
222 memcpy(buf, kp->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t));
223 buf[0] = kp->opcode;
224 return 0;
225}
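As a rough userspace sketch (not the kernel code; byte values are made up) of what recover_probed_instruction() does: copy the probed bytes and put the saved original first byte back over the int3 (0xcc):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define MAX_INSN_SIZE 16

int main(void)
{
	/* Pretend this is the probed memory: int3 followed by the insn tail. */
	uint8_t probed[MAX_INSN_SIZE] = { 0xcc, 0x89, 0xe5 };
	uint8_t saved_opcode = 0x55;	/* original first byte (push %rbp) */
	uint8_t buf[MAX_INSN_SIZE];

	memcpy(buf, probed, MAX_INSN_SIZE);
	buf[0] = saved_opcode;		/* undo the int3 patch */

	printf("recovered: %02x %02x %02x\n", buf[0], buf[1], buf[2]);
	return 0;
}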
226
227/* Dummy buffers for kallsyms_lookup */
228static char __dummy_buf[KSYM_NAME_LEN];
229
230/* Check if paddr is at an instruction boundary */
231static int __kprobes can_probe(unsigned long paddr)
232{
233 int ret;
234 unsigned long addr, offset = 0;
235 struct insn insn;
236 kprobe_opcode_t buf[MAX_INSN_SIZE];
237
238 if (!kallsyms_lookup(paddr, NULL, &offset, NULL, __dummy_buf))
239 return 0;
240
241 /* Decode instructions */
242 addr = paddr - offset;
243 while (addr < paddr) {
244 kernel_insn_init(&insn, (void *)addr);
245 insn_get_opcode(&insn);
246
247 /*
248 * Check if the instruction has been modified by another
249 * kprobe, in which case we replace the breakpoint by the
250 * original instruction in our buffer.
251 */
252 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
253 ret = recover_probed_instruction(buf, addr);
254 if (ret)
255 /*
256 * Another debugging subsystem might insert
257 * this breakpoint. In that case, we can't
258 * recover it.
259 */
260 return 0;
261 kernel_insn_init(&insn, buf);
262 }
263 insn_get_length(&insn);
264 addr += insn.length;
265 }
266
267 return (addr == paddr);
268}
269
247/* 270/*
248 * Returns non-zero if opcode modifies the interrupt flag. 271 * Returns non-zero if opcode modifies the interrupt flag.
249 */ 272 */
@@ -268,86 +291,67 @@ static int __kprobes is_IF_modifier(kprobe_opcode_t *insn)
268} 291}
269 292
270/* 293/*
271 * Adjust the displacement if the instruction uses the %rip-relative 294 * Copy an instruction and adjust the displacement if the instruction
272 * addressing mode. 295 * uses the %rip-relative addressing mode.
273 * If it does, Return the address of the 32-bit displacement word. 296 * If it does, Return the address of the 32-bit displacement word.
274 * If not, return null. 297 * If not, return null.
275 * Only applicable to 64-bit x86. 298 * Only applicable to 64-bit x86.
276 */ 299 */
277static void __kprobes fix_riprel(struct kprobe *p) 300static int __kprobes __copy_instruction(u8 *dest, u8 *src, int recover)
278{ 301{
279#ifdef CONFIG_X86_64 302 struct insn insn;
280 u8 *insn = p->ainsn.insn; 303 int ret;
281 s64 disp; 304 kprobe_opcode_t buf[MAX_INSN_SIZE];
282 int need_modrm; 305
283 306 kernel_insn_init(&insn, src);
284 /* Skip legacy instruction prefixes. */ 307 if (recover) {
285 while (1) { 308 insn_get_opcode(&insn);
286 switch (*insn) { 309 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
287 case 0x66: 310 ret = recover_probed_instruction(buf,
288 case 0x67: 311 (unsigned long)src);
289 case 0x2e: 312 if (ret)
290 case 0x3e: 313 return 0;
291 case 0x26: 314 kernel_insn_init(&insn, buf);
292 case 0x64:
293 case 0x65:
294 case 0x36:
295 case 0xf0:
296 case 0xf3:
297 case 0xf2:
298 ++insn;
299 continue;
300 } 315 }
301 break;
302 } 316 }
317 insn_get_length(&insn);
318 memcpy(dest, insn.kaddr, insn.length);
303 319
304 /* Skip REX instruction prefix. */ 320#ifdef CONFIG_X86_64
305 if (is_REX_prefix(insn)) 321 if (insn_rip_relative(&insn)) {
306 ++insn; 322 s64 newdisp;
307 323 u8 *disp;
308 if (*insn == 0x0f) { 324 kernel_insn_init(&insn, dest);
309 /* Two-byte opcode. */ 325 insn_get_displacement(&insn);
310 ++insn; 326 /*
311 need_modrm = test_bit(*insn, 327 * The copied instruction uses the %rip-relative addressing
312 (unsigned long *)twobyte_has_modrm); 328 * mode. Adjust the displacement for the difference between
313 } else 329 * the original location of this instruction and the location
314 /* One-byte opcode. */ 330 * of the copy that will actually be run. The tricky bit here
315 need_modrm = test_bit(*insn, 331 * is making sure that the sign extension happens correctly in
316 (unsigned long *)onebyte_has_modrm); 332 * this calculation, since we need a signed 32-bit result to
317 333 * be sign-extended to 64 bits when it's added to the %rip
318 if (need_modrm) { 334 * value and yield the same 64-bit result that the sign-
319 u8 modrm = *++insn; 335 * extension of the original signed 32-bit displacement would
320 if ((modrm & 0xc7) == 0x05) { 336 * have given.
321 /* %rip+disp32 addressing mode */ 337 */
322 /* Displacement follows ModRM byte. */ 338 newdisp = (u8 *) src + (s64) insn.displacement.value -
323 ++insn; 339 (u8 *) dest;
324 /* 340 BUG_ON((s64) (s32) newdisp != newdisp); /* Sanity check. */
325 * The copied instruction uses the %rip-relative 341 disp = (u8 *) dest + insn_offset_displacement(&insn);
326 * addressing mode. Adjust the displacement for the 342 *(s32 *) disp = (s32) newdisp;
327 * difference between the original location of this
328 * instruction and the location of the copy that will
329 * actually be run. The tricky bit here is making sure
330 * that the sign extension happens correctly in this
331 * calculation, since we need a signed 32-bit result to
332 * be sign-extended to 64 bits when it's added to the
333 * %rip value and yield the same 64-bit result that the
334 * sign-extension of the original signed 32-bit
335 * displacement would have given.
336 */
337 disp = (u8 *) p->addr + *((s32 *) insn) -
338 (u8 *) p->ainsn.insn;
339 BUG_ON((s64) (s32) disp != disp); /* Sanity check. */
340 *(s32 *)insn = (s32) disp;
341 }
342 } 343 }
343#endif 344#endif
345 return insn.length;
344} 346}
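The displacement fixup above can be illustrated with a small standalone sketch (not the kernel code; the addresses are invented): the copied instruction must still resolve to the same absolute target, so the new displacement is the old source address plus the old displacement minus the address of the copy.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t src  = 0xffffffff81000000ULL;	/* original instruction */
	uint64_t dest = 0xffffffffa0002000ULL;	/* out-of-line copy */
	int32_t  old_disp = 0x1234;		/* disp32 in the original */

	/*
	 * The CPU resolves %rip-relative operands as rip_after_insn + disp,
	 * so keeping the same target from the copy requires:
	 *   new_disp = (src + old_disp) - dest
	 */
	int64_t new_disp = (int64_t)(src + (int64_t)old_disp - dest);

	/* The kernel BUG()s if this does not fit in a signed 32-bit value. */
	printf("new displacement = %lld (fits in s32: %d)\n",
	       (long long)new_disp, (int64_t)(int32_t)new_disp == new_disp);
	return 0;
}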
345 347
346static void __kprobes arch_copy_kprobe(struct kprobe *p) 348static void __kprobes arch_copy_kprobe(struct kprobe *p)
347{ 349{
348 memcpy(p->ainsn.insn, p->addr, MAX_INSN_SIZE * sizeof(kprobe_opcode_t)); 350 /*
349 351 * Copy an instruction without recovering int3, because it will be
350 fix_riprel(p); 352 * put by another subsystem.
353 */
354 __copy_instruction(p->ainsn.insn, p->addr, 0);
351 355
352 if (can_boost(p->addr)) 356 if (can_boost(p->addr))
353 p->ainsn.boostable = 0; 357 p->ainsn.boostable = 0;
@@ -359,6 +363,11 @@ static void __kprobes arch_copy_kprobe(struct kprobe *p)
359 363
360int __kprobes arch_prepare_kprobe(struct kprobe *p) 364int __kprobes arch_prepare_kprobe(struct kprobe *p)
361{ 365{
366 if (alternatives_text_reserved(p->addr, p->addr))
367 return -EINVAL;
368
369 if (!can_probe((unsigned long)p->addr))
370 return -EILSEQ;
362 /* insn: must be on special executable page on x86. */ 371 /* insn: must be on special executable page on x86. */
363 p->ainsn.insn = get_insn_slot(); 372 p->ainsn.insn = get_insn_slot();
364 if (!p->ainsn.insn) 373 if (!p->ainsn.insn)
@@ -423,18 +432,6 @@ static void __kprobes restore_btf(void)
423 update_debugctlmsr(current->thread.debugctlmsr); 432 update_debugctlmsr(current->thread.debugctlmsr);
424} 433}
425 434
426static void __kprobes prepare_singlestep(struct kprobe *p, struct pt_regs *regs)
427{
428 clear_btf();
429 regs->flags |= X86_EFLAGS_TF;
430 regs->flags &= ~X86_EFLAGS_IF;
431 /* single step inline if the instruction is an int3 */
432 if (p->opcode == BREAKPOINT_INSTRUCTION)
433 regs->ip = (unsigned long)p->addr;
434 else
435 regs->ip = (unsigned long)p->ainsn.insn;
436}
437
438void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri, 435void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
439 struct pt_regs *regs) 436 struct pt_regs *regs)
440{ 437{
@@ -446,20 +443,50 @@ void __kprobes arch_prepare_kretprobe(struct kretprobe_instance *ri,
446 *sara = (unsigned long) &kretprobe_trampoline; 443 *sara = (unsigned long) &kretprobe_trampoline;
447} 444}
448 445
446#ifdef CONFIG_OPTPROBES
447static int __kprobes setup_detour_execution(struct kprobe *p,
448 struct pt_regs *regs,
449 int reenter);
450#else
451#define setup_detour_execution(p, regs, reenter) (0)
452#endif
453
449static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, 454static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs,
450 struct kprobe_ctlblk *kcb) 455 struct kprobe_ctlblk *kcb, int reenter)
451{ 456{
452#if !defined(CONFIG_PREEMPT) || defined(CONFIG_FREEZER) 457 if (setup_detour_execution(p, regs, reenter))
458 return;
459
460#if !defined(CONFIG_PREEMPT)
453 if (p->ainsn.boostable == 1 && !p->post_handler) { 461 if (p->ainsn.boostable == 1 && !p->post_handler) {
454 /* Boost up -- we can execute copied instructions directly */ 462 /* Boost up -- we can execute copied instructions directly */
455 reset_current_kprobe(); 463 if (!reenter)
464 reset_current_kprobe();
465 /*
466 * Reentering boosted probe doesn't reset current_kprobe,
467 * nor set current_kprobe, because it doesn't use single
468 * stepping.
469 */
456 regs->ip = (unsigned long)p->ainsn.insn; 470 regs->ip = (unsigned long)p->ainsn.insn;
457 preempt_enable_no_resched(); 471 preempt_enable_no_resched();
458 return; 472 return;
459 } 473 }
460#endif 474#endif
461 prepare_singlestep(p, regs); 475 if (reenter) {
462 kcb->kprobe_status = KPROBE_HIT_SS; 476 save_previous_kprobe(kcb);
477 set_current_kprobe(p, regs, kcb);
478 kcb->kprobe_status = KPROBE_REENTER;
479 } else
480 kcb->kprobe_status = KPROBE_HIT_SS;
481 /* Prepare real single stepping */
482 clear_btf();
483 regs->flags |= X86_EFLAGS_TF;
484 regs->flags &= ~X86_EFLAGS_IF;
485 /* single step inline if the instruction is an int3 */
486 if (p->opcode == BREAKPOINT_INSTRUCTION)
487 regs->ip = (unsigned long)p->addr;
488 else
489 regs->ip = (unsigned long)p->ainsn.insn;
463} 490}
464 491
465/* 492/*
@@ -472,37 +499,21 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
472{ 499{
473 switch (kcb->kprobe_status) { 500 switch (kcb->kprobe_status) {
474 case KPROBE_HIT_SSDONE: 501 case KPROBE_HIT_SSDONE:
475#ifdef CONFIG_X86_64
476 /* TODO: Provide re-entrancy from post_kprobes_handler() and
477 * avoid exception stack corruption while single-stepping on
478 * the instruction of the new probe.
479 */
480 arch_disarm_kprobe(p);
481 regs->ip = (unsigned long)p->addr;
482 reset_current_kprobe();
483 preempt_enable_no_resched();
484 break;
485#endif
486 case KPROBE_HIT_ACTIVE: 502 case KPROBE_HIT_ACTIVE:
487 save_previous_kprobe(kcb);
488 set_current_kprobe(p, regs, kcb);
489 kprobes_inc_nmissed_count(p); 503 kprobes_inc_nmissed_count(p);
490 prepare_singlestep(p, regs); 504 setup_singlestep(p, regs, kcb, 1);
491 kcb->kprobe_status = KPROBE_REENTER;
492 break; 505 break;
493 case KPROBE_HIT_SS: 506 case KPROBE_HIT_SS:
494 if (p == kprobe_running()) { 507 /* A probe has been hit in the codepath leading up to, or just
495 regs->flags &= ~X86_EFLAGS_TF; 508 * after, single-stepping of a probed instruction. This entire
496 regs->flags |= kcb->kprobe_saved_flags; 509 * codepath should strictly reside in .kprobes.text section.
497 return 0; 510 * Raise a BUG or we'll continue in an endless reentering loop
498 } else { 511 * and eventually a stack overflow.
499 /* A probe has been hit in the codepath leading up 512 */
500 * to, or just after, single-stepping of a probed 513 printk(KERN_WARNING "Unrecoverable kprobe detected at %p.\n",
501 * instruction. This entire codepath should strictly 514 p->addr);
502 * reside in .kprobes.text section. Raise a warning 515 dump_kprobe(p);
503 * to highlight this peculiar case. 516 BUG();
504 */
505 }
506 default: 517 default:
507 /* impossible cases */ 518 /* impossible cases */
508 WARN_ON(1); 519 WARN_ON(1);
@@ -514,7 +525,7 @@ static int __kprobes reenter_kprobe(struct kprobe *p, struct pt_regs *regs,
514 525
515/* 526/*
516 * Interrupts are disabled on entry as trap3 is an interrupt gate and they 527 * Interrupts are disabled on entry as trap3 is an interrupt gate and they
517 * remain disabled thorough out this function. 528 * remain disabled throughout this function.
518 */ 529 */
519static int __kprobes kprobe_handler(struct pt_regs *regs) 530static int __kprobes kprobe_handler(struct pt_regs *regs)
520{ 531{
@@ -523,20 +534,6 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
523 struct kprobe_ctlblk *kcb; 534 struct kprobe_ctlblk *kcb;
524 535
525 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); 536 addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
526 if (*addr != BREAKPOINT_INSTRUCTION) {
527 /*
528 * The breakpoint instruction was removed right
529 * after we hit it. Another cpu has removed
530 * either a probepoint or a debugger breakpoint
531 * at this address. In either case, no further
532 * handling of this interrupt is appropriate.
533 * Back up over the (now missing) int3 and run
534 * the original instruction.
535 */
536 regs->ip = (unsigned long)addr;
537 return 1;
538 }
539
540 /* 537 /*
541 * We don't want to be preempted for the entire 538 * We don't want to be preempted for the entire
542 * duration of kprobe processing. We conditionally 539 * duration of kprobe processing. We conditionally
@@ -565,13 +562,26 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
565 * more here. 562 * more here.
566 */ 563 */
567 if (!p->pre_handler || !p->pre_handler(p, regs)) 564 if (!p->pre_handler || !p->pre_handler(p, regs))
568 setup_singlestep(p, regs, kcb); 565 setup_singlestep(p, regs, kcb, 0);
569 return 1; 566 return 1;
570 } 567 }
568 } else if (*addr != BREAKPOINT_INSTRUCTION) {
569 /*
570 * The breakpoint instruction was removed right
571 * after we hit it. Another cpu has removed
572 * either a probepoint or a debugger breakpoint
573 * at this address. In either case, no further
574 * handling of this interrupt is appropriate.
575 * Back up over the (now missing) int3 and run
576 * the original instruction.
577 */
578 regs->ip = (unsigned long)addr;
579 preempt_enable_no_resched();
580 return 1;
571 } else if (kprobe_running()) { 581 } else if (kprobe_running()) {
572 p = __get_cpu_var(current_kprobe); 582 p = __get_cpu_var(current_kprobe);
573 if (p->break_handler && p->break_handler(p, regs)) { 583 if (p->break_handler && p->break_handler(p, regs)) {
574 setup_singlestep(p, regs, kcb); 584 setup_singlestep(p, regs, kcb, 0);
575 return 1; 585 return 1;
576 } 586 }
577 } /* else: not a kprobe fault; let the kernel handle it */ 587 } /* else: not a kprobe fault; let the kernel handle it */
@@ -580,6 +590,69 @@ static int __kprobes kprobe_handler(struct pt_regs *regs)
580 return 0; 590 return 0;
581} 591}
582 592
593#ifdef CONFIG_X86_64
594#define SAVE_REGS_STRING \
595 /* Skip cs, ip, orig_ax. */ \
596 " subq $24, %rsp\n" \
597 " pushq %rdi\n" \
598 " pushq %rsi\n" \
599 " pushq %rdx\n" \
600 " pushq %rcx\n" \
601 " pushq %rax\n" \
602 " pushq %r8\n" \
603 " pushq %r9\n" \
604 " pushq %r10\n" \
605 " pushq %r11\n" \
606 " pushq %rbx\n" \
607 " pushq %rbp\n" \
608 " pushq %r12\n" \
609 " pushq %r13\n" \
610 " pushq %r14\n" \
611 " pushq %r15\n"
612#define RESTORE_REGS_STRING \
613 " popq %r15\n" \
614 " popq %r14\n" \
615 " popq %r13\n" \
616 " popq %r12\n" \
617 " popq %rbp\n" \
618 " popq %rbx\n" \
619 " popq %r11\n" \
620 " popq %r10\n" \
621 " popq %r9\n" \
622 " popq %r8\n" \
623 " popq %rax\n" \
624 " popq %rcx\n" \
625 " popq %rdx\n" \
626 " popq %rsi\n" \
627 " popq %rdi\n" \
628 /* Skip orig_ax, ip, cs */ \
629 " addq $24, %rsp\n"
630#else
631#define SAVE_REGS_STRING \
632 /* Skip cs, ip, orig_ax and gs. */ \
633 " subl $16, %esp\n" \
634 " pushl %fs\n" \
635 " pushl %ds\n" \
636 " pushl %es\n" \
637 " pushl %eax\n" \
638 " pushl %ebp\n" \
639 " pushl %edi\n" \
640 " pushl %esi\n" \
641 " pushl %edx\n" \
642 " pushl %ecx\n" \
643 " pushl %ebx\n"
644#define RESTORE_REGS_STRING \
645 " popl %ebx\n" \
646 " popl %ecx\n" \
647 " popl %edx\n" \
648 " popl %esi\n" \
649 " popl %edi\n" \
650 " popl %ebp\n" \
651 " popl %eax\n" \
652 /* Skip ds, es, fs, gs, orig_ax, and ip. Note: don't pop cs here*/\
653 " addl $24, %esp\n"
654#endif
655
583/* 656/*
584 * When a retprobed function returns, this code saves registers and 657 * When a retprobed function returns, this code saves registers and
585 * calls trampoline_handler() runs, which calls the kretprobe's handler. 658 * calls trampoline_handler() runs, which calls the kretprobe's handler.
@@ -593,65 +666,16 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
593 /* We don't bother saving the ss register */ 666 /* We don't bother saving the ss register */
594 " pushq %rsp\n" 667 " pushq %rsp\n"
595 " pushfq\n" 668 " pushfq\n"
596 /* 669 SAVE_REGS_STRING
597 * Skip cs, ip, orig_ax.
598 * trampoline_handler() will plug in these values
599 */
600 " subq $24, %rsp\n"
601 " pushq %rdi\n"
602 " pushq %rsi\n"
603 " pushq %rdx\n"
604 " pushq %rcx\n"
605 " pushq %rax\n"
606 " pushq %r8\n"
607 " pushq %r9\n"
608 " pushq %r10\n"
609 " pushq %r11\n"
610 " pushq %rbx\n"
611 " pushq %rbp\n"
612 " pushq %r12\n"
613 " pushq %r13\n"
614 " pushq %r14\n"
615 " pushq %r15\n"
616 " movq %rsp, %rdi\n" 670 " movq %rsp, %rdi\n"
617 " call trampoline_handler\n" 671 " call trampoline_handler\n"
618 /* Replace saved sp with true return address. */ 672 /* Replace saved sp with true return address. */
619 " movq %rax, 152(%rsp)\n" 673 " movq %rax, 152(%rsp)\n"
620 " popq %r15\n" 674 RESTORE_REGS_STRING
621 " popq %r14\n"
622 " popq %r13\n"
623 " popq %r12\n"
624 " popq %rbp\n"
625 " popq %rbx\n"
626 " popq %r11\n"
627 " popq %r10\n"
628 " popq %r9\n"
629 " popq %r8\n"
630 " popq %rax\n"
631 " popq %rcx\n"
632 " popq %rdx\n"
633 " popq %rsi\n"
634 " popq %rdi\n"
635 /* Skip orig_ax, ip, cs */
636 " addq $24, %rsp\n"
637 " popfq\n" 675 " popfq\n"
638#else 676#else
639 " pushf\n" 677 " pushf\n"
640 /* 678 SAVE_REGS_STRING
641 * Skip cs, ip, orig_ax and gs.
642 * trampoline_handler() will plug in these values
643 */
644 " subl $16, %esp\n"
645 " pushl %fs\n"
646 " pushl %es\n"
647 " pushl %ds\n"
648 " pushl %eax\n"
649 " pushl %ebp\n"
650 " pushl %edi\n"
651 " pushl %esi\n"
652 " pushl %edx\n"
653 " pushl %ecx\n"
654 " pushl %ebx\n"
655 " movl %esp, %eax\n" 679 " movl %esp, %eax\n"
656 " call trampoline_handler\n" 680 " call trampoline_handler\n"
657 /* Move flags to cs */ 681 /* Move flags to cs */
@@ -659,15 +683,7 @@ static void __used __kprobes kretprobe_trampoline_holder(void)
659 " movl %edx, 52(%esp)\n" 683 " movl %edx, 52(%esp)\n"
660 /* Replace saved flags with true return address. */ 684 /* Replace saved flags with true return address. */
661 " movl %eax, 56(%esp)\n" 685 " movl %eax, 56(%esp)\n"
662 " popl %ebx\n" 686 RESTORE_REGS_STRING
663 " popl %ecx\n"
664 " popl %edx\n"
665 " popl %esi\n"
666 " popl %edi\n"
667 " popl %ebp\n"
668 " popl %eax\n"
669 /* Skip ds, es, fs, gs, orig_ax and ip */
670 " addl $24, %esp\n"
671 " popf\n" 687 " popf\n"
672#endif 688#endif
673 " ret\n"); 689 " ret\n");
@@ -835,8 +851,8 @@ static void __kprobes resume_execution(struct kprobe *p,
835 * These instructions can be executed directly if it 851 * These instructions can be executed directly if it
836 * jumps back to correct address. 852 * jumps back to correct address.
837 */ 853 */
838 set_jmp_op((void *)regs->ip, 854 synthesize_reljump((void *)regs->ip,
839 (void *)orig_ip + (regs->ip - copy_ip)); 855 (void *)orig_ip + (regs->ip - copy_ip));
840 p->ainsn.boostable = 1; 856 p->ainsn.boostable = 1;
841 } else { 857 } else {
842 p->ainsn.boostable = -1; 858 p->ainsn.boostable = -1;
@@ -851,7 +867,7 @@ no_change:
851 867
852/* 868/*
853 * Interrupts are disabled on entry as trap1 is an interrupt gate and they 869 * Interrupts are disabled on entry as trap1 is an interrupt gate and they
854 * remain disabled thoroughout this function. 870 * remain disabled throughout this function.
855 */ 871 */
856static int __kprobes post_kprobe_handler(struct pt_regs *regs) 872static int __kprobes post_kprobe_handler(struct pt_regs *regs)
857{ 873{
@@ -967,8 +983,14 @@ int __kprobes kprobe_exceptions_notify(struct notifier_block *self,
967 ret = NOTIFY_STOP; 983 ret = NOTIFY_STOP;
968 break; 984 break;
969 case DIE_DEBUG: 985 case DIE_DEBUG:
970 if (post_kprobe_handler(args->regs)) 986 if (post_kprobe_handler(args->regs)) {
987 /*
988 * Reset the BS bit in dr6 (pointed by args->err) to
989 * denote completion of processing
990 */
991 (*(unsigned long *)ERR_PTR(args->err)) &= ~DR_STEP;
971 ret = NOTIFY_STOP; 992 ret = NOTIFY_STOP;
993 }
972 break; 994 break;
973 case DIE_GPF: 995 case DIE_GPF:
974 /* 996 /*
@@ -1057,6 +1079,358 @@ int __kprobes longjmp_break_handler(struct kprobe *p, struct pt_regs *regs)
1057 return 0; 1079 return 0;
1058} 1080}
1059 1081
1082
1083#ifdef CONFIG_OPTPROBES
1084
1085/* Insert a call instruction at address 'from', which calls address 'to'.*/
1086static void __kprobes synthesize_relcall(void *from, void *to)
1087{
1088 __synthesize_relative_insn(from, to, RELATIVECALL_OPCODE);
1089}
1090
1091/* Insert a move instruction which sets a pointer to eax/rdi (1st arg). */
1092static void __kprobes synthesize_set_arg1(kprobe_opcode_t *addr,
1093 unsigned long val)
1094{
1095#ifdef CONFIG_X86_64
1096 *addr++ = 0x48;
1097 *addr++ = 0xbf;
1098#else
1099 *addr++ = 0xb8;
1100#endif
1101 *(unsigned long *)addr = val;
1102}
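A hedged note on the magic bytes above: 0x48 0xbf followed by a 64-bit immediate decodes as movabs $val, %rdi (the first argument register on x86-64), and 0xb8 followed by a 32-bit immediate is mov $val, %eax, which serves as the first argument register under the 32-bit kernel's regparm(3) convention. A minimal sketch (not kernel code) that assembles the 64-bit form into a buffer:

#include <stdio.h>
#include <string.h>
#include <stdint.h>

int main(void)
{
	uint8_t buf[10];
	uint64_t val = 0xffffffff81abcdefULL;	/* made-up pointer value */

	buf[0] = 0x48;				/* REX.W prefix */
	buf[1] = 0xbf;				/* mov $imm64, %rdi */
	memcpy(&buf[2], &val, sizeof(val));	/* little-endian immediate */

	for (int i = 0; i < (int)sizeof(buf); i++)
		printf("%02x ", buf[i]);
	printf("\n");
	return 0;
}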
1103
1104void __kprobes kprobes_optinsn_template_holder(void)
1105{
1106 asm volatile (
1107 ".global optprobe_template_entry\n"
1108 "optprobe_template_entry: \n"
1109#ifdef CONFIG_X86_64
1110 /* We don't bother saving the ss register */
1111 " pushq %rsp\n"
1112 " pushfq\n"
1113 SAVE_REGS_STRING
1114 " movq %rsp, %rsi\n"
1115 ".global optprobe_template_val\n"
1116 "optprobe_template_val: \n"
1117 ASM_NOP5
1118 ASM_NOP5
1119 ".global optprobe_template_call\n"
1120 "optprobe_template_call: \n"
1121 ASM_NOP5
1122 /* Move flags to rsp */
1123 " movq 144(%rsp), %rdx\n"
1124 " movq %rdx, 152(%rsp)\n"
1125 RESTORE_REGS_STRING
1126 /* Skip flags entry */
1127 " addq $8, %rsp\n"
1128 " popfq\n"
1129#else /* CONFIG_X86_32 */
1130 " pushf\n"
1131 SAVE_REGS_STRING
1132 " movl %esp, %edx\n"
1133 ".global optprobe_template_val\n"
1134 "optprobe_template_val: \n"
1135 ASM_NOP5
1136 ".global optprobe_template_call\n"
1137 "optprobe_template_call: \n"
1138 ASM_NOP5
1139 RESTORE_REGS_STRING
1140 " addl $4, %esp\n" /* skip cs */
1141 " popf\n"
1142#endif
1143 ".global optprobe_template_end\n"
1144 "optprobe_template_end: \n");
1145}
1146
1147#define TMPL_MOVE_IDX \
1148 ((long)&optprobe_template_val - (long)&optprobe_template_entry)
1149#define TMPL_CALL_IDX \
1150 ((long)&optprobe_template_call - (long)&optprobe_template_entry)
1151#define TMPL_END_IDX \
1152 ((long)&optprobe_template_end - (long)&optprobe_template_entry)
1153
1154#define INT3_SIZE sizeof(kprobe_opcode_t)
1155
 1156/* Optimized kprobe callback function: called from optinsn */
1157static void __kprobes optimized_callback(struct optimized_kprobe *op,
1158 struct pt_regs *regs)
1159{
1160 struct kprobe_ctlblk *kcb = get_kprobe_ctlblk();
1161
1162 preempt_disable();
1163 if (kprobe_running()) {
1164 kprobes_inc_nmissed_count(&op->kp);
1165 } else {
1166 /* Save skipped registers */
1167#ifdef CONFIG_X86_64
1168 regs->cs = __KERNEL_CS;
1169#else
1170 regs->cs = __KERNEL_CS | get_kernel_rpl();
1171 regs->gs = 0;
1172#endif
1173 regs->ip = (unsigned long)op->kp.addr + INT3_SIZE;
1174 regs->orig_ax = ~0UL;
1175
1176 __get_cpu_var(current_kprobe) = &op->kp;
1177 kcb->kprobe_status = KPROBE_HIT_ACTIVE;
1178 opt_pre_handler(&op->kp, regs);
1179 __get_cpu_var(current_kprobe) = NULL;
1180 }
1181 preempt_enable_no_resched();
1182}
1183
1184static int __kprobes copy_optimized_instructions(u8 *dest, u8 *src)
1185{
1186 int len = 0, ret;
1187
1188 while (len < RELATIVEJUMP_SIZE) {
1189 ret = __copy_instruction(dest + len, src + len, 1);
1190 if (!ret || !can_boost(dest + len))
1191 return -EINVAL;
1192 len += ret;
1193 }
1194 /* Check whether the address range is reserved */
1195 if (ftrace_text_reserved(src, src + len - 1) ||
1196 alternatives_text_reserved(src, src + len - 1))
1197 return -EBUSY;
1198
1199 return len;
1200}
1201
 1202/* Check whether insn is an indirect jump */
1203static int __kprobes insn_is_indirect_jump(struct insn *insn)
1204{
1205 return ((insn->opcode.bytes[0] == 0xff &&
1206 (X86_MODRM_REG(insn->modrm.value) & 6) == 4) || /* Jump */
1207 insn->opcode.bytes[0] == 0xea); /* Segment based jump */
1208}
1209
1210/* Check whether insn jumps into specified address range */
1211static int insn_jump_into_range(struct insn *insn, unsigned long start, int len)
1212{
1213 unsigned long target = 0;
1214
1215 switch (insn->opcode.bytes[0]) {
1216 case 0xe0: /* loopne */
1217 case 0xe1: /* loope */
1218 case 0xe2: /* loop */
1219 case 0xe3: /* jcxz */
1220 case 0xe9: /* near relative jump */
1221 case 0xeb: /* short relative jump */
1222 break;
1223 case 0x0f:
1224 if ((insn->opcode.bytes[1] & 0xf0) == 0x80) /* jcc near */
1225 break;
1226 return 0;
1227 default:
1228 if ((insn->opcode.bytes[0] & 0xf0) == 0x70) /* jcc short */
1229 break;
1230 return 0;
1231 }
1232 target = (unsigned long)insn->next_byte + insn->immediate.value;
1233
1234 return (start <= target && target <= start + len);
1235}
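A small standalone sketch (not the kernel code; values invented) of the target computation used above: a relative jump's target is the address of the next instruction plus the sign-extended immediate, and the function then checks whether that target lands in the protected range.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t next_byte = 0xffffffff81000105ULL;	/* addr right after the jump */
	int32_t  imm       = -0x20;			/* signed rel32 immediate */
	uint64_t start     = 0xffffffff810000e0ULL;	/* protected range start */
	int      len       = 4;

	uint64_t target = next_byte + (int64_t)imm;

	printf("target = 0x%llx, jumps into range: %d\n",
	       (unsigned long long)target,
	       start <= target && target <= start + len);
	return 0;
}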
1236
 1237/* Decode the whole function to ensure no instruction jumps into the target */
1238static int __kprobes can_optimize(unsigned long paddr)
1239{
1240 int ret;
1241 unsigned long addr, size = 0, offset = 0;
1242 struct insn insn;
1243 kprobe_opcode_t buf[MAX_INSN_SIZE];
1244 /* Dummy buffers for lookup_symbol_attrs */
1245 static char __dummy_buf[KSYM_NAME_LEN];
1246
1247 /* Lookup symbol including addr */
1248 if (!kallsyms_lookup(paddr, &size, &offset, NULL, __dummy_buf))
1249 return 0;
1250
1251 /* Check there is enough space for a relative jump. */
1252 if (size - offset < RELATIVEJUMP_SIZE)
1253 return 0;
1254
1255 /* Decode instructions */
1256 addr = paddr - offset;
1257 while (addr < paddr - offset + size) { /* Decode until function end */
1258 if (search_exception_tables(addr))
1259 /*
 1260	 * Since some fixup code will jump into this function,
 1261	 * we can't optimize kprobes in this function.
1262 */
1263 return 0;
1264 kernel_insn_init(&insn, (void *)addr);
1265 insn_get_opcode(&insn);
1266 if (insn.opcode.bytes[0] == BREAKPOINT_INSTRUCTION) {
1267 ret = recover_probed_instruction(buf, addr);
1268 if (ret)
1269 return 0;
1270 kernel_insn_init(&insn, buf);
1271 }
1272 insn_get_length(&insn);
1273 /* Recover address */
1274 insn.kaddr = (void *)addr;
1275 insn.next_byte = (void *)(addr + insn.length);
 1276	 /* Check that no instruction jumps into the target */
1277 if (insn_is_indirect_jump(&insn) ||
1278 insn_jump_into_range(&insn, paddr + INT3_SIZE,
1279 RELATIVE_ADDR_SIZE))
1280 return 0;
1281 addr += insn.length;
1282 }
1283
1284 return 1;
1285}
1286
1287/* Check optimized_kprobe can actually be optimized. */
1288int __kprobes arch_check_optimized_kprobe(struct optimized_kprobe *op)
1289{
1290 int i;
1291 struct kprobe *p;
1292
1293 for (i = 1; i < op->optinsn.size; i++) {
1294 p = get_kprobe(op->kp.addr + i);
1295 if (p && !kprobe_disabled(p))
1296 return -EEXIST;
1297 }
1298
1299 return 0;
1300}
1301
1302/* Check the addr is within the optimized instructions. */
1303int __kprobes arch_within_optimized_kprobe(struct optimized_kprobe *op,
1304 unsigned long addr)
1305{
1306 return ((unsigned long)op->kp.addr <= addr &&
1307 (unsigned long)op->kp.addr + op->optinsn.size > addr);
1308}
1309
1310/* Free optimized instruction slot */
1311static __kprobes
1312void __arch_remove_optimized_kprobe(struct optimized_kprobe *op, int dirty)
1313{
1314 if (op->optinsn.insn) {
1315 free_optinsn_slot(op->optinsn.insn, dirty);
1316 op->optinsn.insn = NULL;
1317 op->optinsn.size = 0;
1318 }
1319}
1320
1321void __kprobes arch_remove_optimized_kprobe(struct optimized_kprobe *op)
1322{
1323 __arch_remove_optimized_kprobe(op, 1);
1324}
1325
1326/*
1327 * Copy replacing target instructions
1328 * Target instructions MUST be relocatable (checked inside)
1329 */
1330int __kprobes arch_prepare_optimized_kprobe(struct optimized_kprobe *op)
1331{
1332 u8 *buf;
1333 int ret;
1334 long rel;
1335
1336 if (!can_optimize((unsigned long)op->kp.addr))
1337 return -EILSEQ;
1338
1339 op->optinsn.insn = get_optinsn_slot();
1340 if (!op->optinsn.insn)
1341 return -ENOMEM;
1342
1343 /*
 1344	 * Verify that the address gap is within the 2GB range reachable
 1345	 * by a relative jump.
1346 */
1347 rel = (long)op->optinsn.insn - (long)op->kp.addr + RELATIVEJUMP_SIZE;
1348 if (abs(rel) > 0x7fffffff)
1349 return -ERANGE;
1350
1351 buf = (u8 *)op->optinsn.insn;
1352
1353 /* Copy instructions into the out-of-line buffer */
1354 ret = copy_optimized_instructions(buf + TMPL_END_IDX, op->kp.addr);
1355 if (ret < 0) {
1356 __arch_remove_optimized_kprobe(op, 0);
1357 return ret;
1358 }
1359 op->optinsn.size = ret;
1360
1361 /* Copy arch-dep-instance from template */
1362 memcpy(buf, &optprobe_template_entry, TMPL_END_IDX);
1363
1364 /* Set probe information */
1365 synthesize_set_arg1(buf + TMPL_MOVE_IDX, (unsigned long)op);
1366
1367 /* Set probe function call */
1368 synthesize_relcall(buf + TMPL_CALL_IDX, optimized_callback);
1369
1370 /* Set returning jmp instruction at the tail of out-of-line buffer */
1371 synthesize_reljump(buf + TMPL_END_IDX + op->optinsn.size,
1372 (u8 *)op->kp.addr + op->optinsn.size);
1373
1374 flush_icache_range((unsigned long) buf,
1375 (unsigned long) buf + TMPL_END_IDX +
1376 op->optinsn.size + RELATIVEJUMP_SIZE);
1377 return 0;
1378}
1379
1380/* Replace a breakpoint (int3) with a relative jump. */
1381int __kprobes arch_optimize_kprobe(struct optimized_kprobe *op)
1382{
1383 unsigned char jmp_code[RELATIVEJUMP_SIZE];
1384 s32 rel = (s32)((long)op->optinsn.insn -
1385 ((long)op->kp.addr + RELATIVEJUMP_SIZE));
1386
1387 /* Backup instructions which will be replaced by jump address */
1388 memcpy(op->optinsn.copied_insn, op->kp.addr + INT3_SIZE,
1389 RELATIVE_ADDR_SIZE);
1390
1391 jmp_code[0] = RELATIVEJUMP_OPCODE;
1392 *(s32 *)(&jmp_code[1]) = rel;
1393
1394 /*
1395 * text_poke_smp doesn't support NMI/MCE code modifying.
1396 * However, since kprobes itself also doesn't support NMI/MCE
1397 * code probing, it's not a problem.
1398 */
1399 text_poke_smp(op->kp.addr, jmp_code, RELATIVEJUMP_SIZE);
1400 return 0;
1401}
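As a rough illustration of the jump encoding above (a standalone sketch, not the kernel code; addresses invented): the rel32 field of the 5-byte 0xe9 jump is relative to the end of the jump instruction itself.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

#define RELATIVEJUMP_OPCODE 0xe9
#define RELATIVEJUMP_SIZE   5

int main(void)
{
	uint64_t kp_addr = 0xffffffff81234560ULL;	/* probed instruction */
	uint64_t optinsn = 0xffffffffa0010000ULL;	/* detour buffer */
	uint8_t  jmp_code[RELATIVEJUMP_SIZE];
	int      i;

	/* rel32 counts from the byte after the 5-byte jmp. */
	int32_t rel = (int32_t)(optinsn - (kp_addr + RELATIVEJUMP_SIZE));

	jmp_code[0] = RELATIVEJUMP_OPCODE;
	memcpy(&jmp_code[1], &rel, sizeof(rel));

	for (i = 0; i < RELATIVEJUMP_SIZE; i++)
		printf("%02x ", jmp_code[i]);
	printf("\n");
	return 0;
}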
1402
1403/* Replace a relative jump with a breakpoint (int3). */
1404void __kprobes arch_unoptimize_kprobe(struct optimized_kprobe *op)
1405{
1406 u8 buf[RELATIVEJUMP_SIZE];
1407
1408 /* Set int3 to first byte for kprobes */
1409 buf[0] = BREAKPOINT_INSTRUCTION;
1410 memcpy(buf + 1, op->optinsn.copied_insn, RELATIVE_ADDR_SIZE);
1411 text_poke_smp(op->kp.addr, buf, RELATIVEJUMP_SIZE);
1412}
1413
1414static int __kprobes setup_detour_execution(struct kprobe *p,
1415 struct pt_regs *regs,
1416 int reenter)
1417{
1418 struct optimized_kprobe *op;
1419
1420 if (p->flags & KPROBE_FLAG_OPTIMIZED) {
1421 /* This kprobe is really able to run optimized path. */
1422 op = container_of(p, struct optimized_kprobe, kp);
1423 /* Detour through copied instructions */
1424 regs->ip = (unsigned long)op->optinsn.insn + TMPL_END_IDX;
1425 if (!reenter)
1426 reset_current_kprobe();
1427 preempt_enable_no_resched();
1428 return 1;
1429 }
1430 return 0;
1431}
1432#endif
1433
1060int __init arch_init_kprobes(void) 1434int __init arch_init_kprobes(void)
1061{ 1435{
1062 return 0; 1436 return 0;
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index ec6ef60cbd17..ea697263b373 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -7,6 +7,7 @@
7 */ 7 */
8 8
9#include <linux/errno.h> 9#include <linux/errno.h>
10#include <linux/gfp.h>
10#include <linux/sched.h> 11#include <linux/sched.h>
11#include <linux/string.h> 12#include <linux/string.h>
12#include <linux/mm.h> 13#include <linux/mm.h>
diff --git a/arch/x86/kernel/machine_kexec_32.c b/arch/x86/kernel/machine_kexec_32.c
index c1c429d00130..a3fa43ba5d3b 100644
--- a/arch/x86/kernel/machine_kexec_32.c
+++ b/arch/x86/kernel/machine_kexec_32.c
@@ -25,6 +25,7 @@
25#include <asm/desc.h> 25#include <asm/desc.h>
26#include <asm/system.h> 26#include <asm/system.h>
27#include <asm/cacheflush.h> 27#include <asm/cacheflush.h>
28#include <asm/debugreg.h>
28 29
29static void set_idt(void *newidt, __u16 limit) 30static void set_idt(void *newidt, __u16 limit)
30{ 31{
@@ -157,8 +158,7 @@ int machine_kexec_prepare(struct kimage *image)
157{ 158{
158 int error; 159 int error;
159 160
160 if (nx_enabled) 161 set_pages_x(image->control_code_page, 1);
161 set_pages_x(image->control_code_page, 1);
162 error = machine_kexec_alloc_page_tables(image); 162 error = machine_kexec_alloc_page_tables(image);
163 if (error) 163 if (error)
164 return error; 164 return error;
@@ -172,8 +172,7 @@ int machine_kexec_prepare(struct kimage *image)
172 */ 172 */
173void machine_kexec_cleanup(struct kimage *image) 173void machine_kexec_cleanup(struct kimage *image)
174{ 174{
175 if (nx_enabled) 175 set_pages_nx(image->control_code_page, 1);
176 set_pages_nx(image->control_code_page, 1);
177 machine_kexec_free_page_tables(image); 176 machine_kexec_free_page_tables(image);
178} 177}
179 178
@@ -202,6 +201,7 @@ void machine_kexec(struct kimage *image)
202 201
203 /* Interrupts aren't acceptable while we reboot */ 202 /* Interrupts aren't acceptable while we reboot */
204 local_irq_disable(); 203 local_irq_disable();
204 hw_breakpoint_disable();
205 205
206 if (image->preserve_context) { 206 if (image->preserve_context) {
207#ifdef CONFIG_X86_IO_APIC 207#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c
index 84c3bf209e98..035c8c529181 100644
--- a/arch/x86/kernel/machine_kexec_64.c
+++ b/arch/x86/kernel/machine_kexec_64.c
@@ -9,6 +9,7 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/kexec.h> 10#include <linux/kexec.h>
11#include <linux/string.h> 11#include <linux/string.h>
12#include <linux/gfp.h>
12#include <linux/reboot.h> 13#include <linux/reboot.h>
13#include <linux/numa.h> 14#include <linux/numa.h>
14#include <linux/ftrace.h> 15#include <linux/ftrace.h>
@@ -18,6 +19,7 @@
18#include <asm/pgtable.h> 19#include <asm/pgtable.h>
19#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
20#include <asm/mmu_context.h> 21#include <asm/mmu_context.h>
22#include <asm/debugreg.h>
21 23
22static int init_one_level2_page(struct kimage *image, pgd_t *pgd, 24static int init_one_level2_page(struct kimage *image, pgd_t *pgd,
23 unsigned long addr) 25 unsigned long addr)
@@ -282,6 +284,7 @@ void machine_kexec(struct kimage *image)
282 284
283 /* Interrupts aren't acceptable while we reboot */ 285 /* Interrupts aren't acceptable while we reboot */
284 local_irq_disable(); 286 local_irq_disable();
287 hw_breakpoint_disable();
285 288
286 if (image->preserve_context) { 289 if (image->preserve_context) {
287#ifdef CONFIG_X86_IO_APIC 290#ifdef CONFIG_X86_IO_APIC
diff --git a/arch/x86/kernel/mca_32.c b/arch/x86/kernel/mca_32.c
index 845d80ce1ef1..63eaf6596233 100644
--- a/arch/x86/kernel/mca_32.c
+++ b/arch/x86/kernel/mca_32.c
@@ -42,6 +42,7 @@
42#include <linux/kernel.h> 42#include <linux/kernel.h>
43#include <linux/mca.h> 43#include <linux/mca.h>
44#include <linux/kprobes.h> 44#include <linux/kprobes.h>
45#include <linux/slab.h>
45#include <asm/system.h> 46#include <asm/system.h>
46#include <asm/io.h> 47#include <asm/io.h>
47#include <linux/proc_fs.h> 48#include <linux/proc_fs.h>
diff --git a/arch/x86/kernel/mfgpt_32.c b/arch/x86/kernel/mfgpt_32.c
deleted file mode 100644
index 2a62d843f015..000000000000
--- a/arch/x86/kernel/mfgpt_32.c
+++ /dev/null
@@ -1,410 +0,0 @@
1/*
2 * Driver/API for AMD Geode Multi-Function General Purpose Timers (MFGPT)
3 *
4 * Copyright (C) 2006, Advanced Micro Devices, Inc.
5 * Copyright (C) 2007, Andres Salomon <dilinger@debian.org>
6 *
7 * This program is free software; you can redistribute it and/or
8 * modify it under the terms of version 2 of the GNU General Public License
9 * as published by the Free Software Foundation.
10 *
11 * The MFGPTs are documented in AMD Geode CS5536 Companion Device Data Book.
12 */
13
14/*
15 * We are using the 32.768kHz input clock - it's the only one that has the
16 * ranges we find desirable. The following table lists the suitable
17 * divisors and the associated Hz, minimum interval and the maximum interval:
18 *
19 * Divisor Hz Min Delta (s) Max Delta (s)
20 * 1 32768 .00048828125 2.000
21 * 2 16384 .0009765625 4.000
22 * 4 8192 .001953125 8.000
23 * 8 4096 .00390625 16.000
24 * 16 2048 .0078125 32.000
25 * 32 1024 .015625 64.000
26 * 64 512 .03125 128.000
27 * 128 256 .0625 256.000
28 * 256 128 .125 512.000
29 */
30
31#include <linux/kernel.h>
32#include <linux/interrupt.h>
33#include <linux/module.h>
34#include <asm/geode.h>
35
36#define MFGPT_DEFAULT_IRQ 7
37
38static struct mfgpt_timer_t {
39 unsigned int avail:1;
40} mfgpt_timers[MFGPT_MAX_TIMERS];
41
42/* Selected from the table above */
43
44#define MFGPT_DIVISOR 16
45#define MFGPT_SCALE 4 /* divisor = 2^(scale) */
46#define MFGPT_HZ (32768 / MFGPT_DIVISOR)
47#define MFGPT_PERIODIC (MFGPT_HZ / HZ)
48
49/* Allow for disabling of MFGPTs */
50static int disable;
51static int __init mfgpt_disable(char *s)
52{
53 disable = 1;
54 return 1;
55}
56__setup("nomfgpt", mfgpt_disable);
57
58/* Reset the MFGPT timers. This is required by some broken BIOSes which already
59 * do the same and leave the system in an unstable state. TinyBIOS 0.98 is
60 * affected at least (0.99 is OK with MFGPT workaround left to off).
61 */
62static int __init mfgpt_fix(char *s)
63{
64 u32 val, dummy;
65
66 /* The following undocumented bit resets the MFGPT timers */
67 val = 0xFF; dummy = 0;
68 wrmsr(MSR_MFGPT_SETUP, val, dummy);
69 return 1;
70}
71__setup("mfgptfix", mfgpt_fix);
72
73/*
74 * Check whether any MFGPTs are available for the kernel to use. In most
75 * cases, firmware that uses AMD's VSA code will claim all timers during
76 * bootup; we certainly don't want to take them if they're already in use.
77 * In other cases (such as with VSAless OpenFirmware), the system firmware
78 * leaves timers available for us to use.
79 */
80
81
82static int timers = -1;
83
84static void geode_mfgpt_detect(void)
85{
86 int i;
87 u16 val;
88
89 timers = 0;
90
91 if (disable) {
92 printk(KERN_INFO "geode-mfgpt: MFGPT support is disabled\n");
93 goto done;
94 }
95
96 if (!geode_get_dev_base(GEODE_DEV_MFGPT)) {
97 printk(KERN_INFO "geode-mfgpt: MFGPT LBAR is not set up\n");
98 goto done;
99 }
100
101 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
102 val = geode_mfgpt_read(i, MFGPT_REG_SETUP);
103 if (!(val & MFGPT_SETUP_SETUP)) {
104 mfgpt_timers[i].avail = 1;
105 timers++;
106 }
107 }
108
109done:
110 printk(KERN_INFO "geode-mfgpt: %d MFGPT timers available.\n", timers);
111}
112
113int geode_mfgpt_toggle_event(int timer, int cmp, int event, int enable)
114{
115 u32 msr, mask, value, dummy;
116 int shift = (cmp == MFGPT_CMP1) ? 0 : 8;
117
118 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
119 return -EIO;
120
121 /*
122 * The register maps for these are described in sections 6.17.1.x of
123 * the AMD Geode CS5536 Companion Device Data Book.
124 */
125 switch (event) {
126 case MFGPT_EVENT_RESET:
127 /*
128 * XXX: According to the docs, we cannot reset timers above
129 * 6; that is, resets for 7 and 8 will be ignored. Is this
130 * a problem? -dilinger
131 */
132 msr = MSR_MFGPT_NR;
133 mask = 1 << (timer + 24);
134 break;
135
136 case MFGPT_EVENT_NMI:
137 msr = MSR_MFGPT_NR;
138 mask = 1 << (timer + shift);
139 break;
140
141 case MFGPT_EVENT_IRQ:
142 msr = MSR_MFGPT_IRQ;
143 mask = 1 << (timer + shift);
144 break;
145
146 default:
147 return -EIO;
148 }
149
150 rdmsr(msr, value, dummy);
151
152 if (enable)
153 value |= mask;
154 else
155 value &= ~mask;
156
157 wrmsr(msr, value, dummy);
158 return 0;
159}
160EXPORT_SYMBOL_GPL(geode_mfgpt_toggle_event);
161
162int geode_mfgpt_set_irq(int timer, int cmp, int *irq, int enable)
163{
164 u32 zsel, lpc, dummy;
165 int shift;
166
167 if (timer < 0 || timer >= MFGPT_MAX_TIMERS)
168 return -EIO;
169
170 /*
171 * Unfortunately, MFGPTs come in pairs sharing their IRQ lines. If VSA
172 * is using the same CMP of the timer's Siamese twin, the IRQ is set to
173 * 2, and we mustn't use nor change it.
174 * XXX: Likewise, 2 Linux drivers might clash if the 2nd overwrites the
175 * IRQ of the 1st. This can only happen if forcing an IRQ, calling this
176 * with *irq==0 is safe. Currently there _are_ no 2 drivers.
177 */
178 rdmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
179 shift = ((cmp == MFGPT_CMP1 ? 0 : 4) + timer % 4) * 4;
180 if (((zsel >> shift) & 0xF) == 2)
181 return -EIO;
182
183 /* Choose IRQ: if none supplied, keep IRQ already set or use default */
184 if (!*irq)
185 *irq = (zsel >> shift) & 0xF;
186 if (!*irq)
187 *irq = MFGPT_DEFAULT_IRQ;
188
189 /* Can't use IRQ if it's 0 (=disabled), 2, or routed to LPC */
190 if (*irq < 1 || *irq == 2 || *irq > 15)
191 return -EIO;
192 rdmsr(MSR_PIC_IRQM_LPC, lpc, dummy);
193 if (lpc & (1 << *irq))
194 return -EIO;
195
196 /* All chosen and checked - go for it */
197 if (geode_mfgpt_toggle_event(timer, cmp, MFGPT_EVENT_IRQ, enable))
198 return -EIO;
199 if (enable) {
200 zsel = (zsel & ~(0xF << shift)) | (*irq << shift);
201 wrmsr(MSR_PIC_ZSEL_LOW, zsel, dummy);
202 }
203
204 return 0;
205}
206
207static int mfgpt_get(int timer)
208{
209 mfgpt_timers[timer].avail = 0;
210 printk(KERN_INFO "geode-mfgpt: Registered timer %d\n", timer);
211 return timer;
212}
213
214int geode_mfgpt_alloc_timer(int timer, int domain)
215{
216 int i;
217
218 if (timers == -1) {
219 /* timers haven't been detected yet */
220 geode_mfgpt_detect();
221 }
222
223 if (!timers)
224 return -1;
225
226 if (timer >= MFGPT_MAX_TIMERS)
227 return -1;
228
229 if (timer < 0) {
230 /* Try to find an available timer */
231 for (i = 0; i < MFGPT_MAX_TIMERS; i++) {
232 if (mfgpt_timers[i].avail)
233 return mfgpt_get(i);
234
235 if (i == 5 && domain == MFGPT_DOMAIN_WORKING)
236 break;
237 }
238 } else {
239 /* If they requested a specific timer, try to honor that */
240 if (mfgpt_timers[timer].avail)
241 return mfgpt_get(timer);
242 }
243
244 /* No timers available - too bad */
245 return -1;
246}
247EXPORT_SYMBOL_GPL(geode_mfgpt_alloc_timer);
248
249
250#ifdef CONFIG_GEODE_MFGPT_TIMER
251
252/*
253 * The MFGPT timers on the CS5536 provide us with suitable timers to use
254 * as clock event sources - not as good as a HPET or APIC, but certainly
255 * better than the PIT. This isn't a general purpose MFGPT driver, but
256 * a simplified one designed specifically to act as a clock event source.
257 * For full details about the MFGPT, please consult the CS5536 data sheet.
258 */
259
260#include <linux/clocksource.h>
261#include <linux/clockchips.h>
262
263static unsigned int mfgpt_tick_mode = CLOCK_EVT_MODE_SHUTDOWN;
264static u16 mfgpt_event_clock;
265
266static int irq;
267static int __init mfgpt_setup(char *str)
268{
269 get_option(&str, &irq);
270 return 1;
271}
272__setup("mfgpt_irq=", mfgpt_setup);
273
274static void mfgpt_disable_timer(u16 clock)
275{
276 /* avoid races by clearing CMP1 and CMP2 unconditionally */
277 geode_mfgpt_write(clock, MFGPT_REG_SETUP, (u16) ~MFGPT_SETUP_CNTEN |
278 MFGPT_SETUP_CMP1 | MFGPT_SETUP_CMP2);
279}
280
281static int mfgpt_next_event(unsigned long, struct clock_event_device *);
282static void mfgpt_set_mode(enum clock_event_mode, struct clock_event_device *);
283
284static struct clock_event_device mfgpt_clockevent = {
285 .name = "mfgpt-timer",
286 .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT,
287 .set_mode = mfgpt_set_mode,
288 .set_next_event = mfgpt_next_event,
289 .rating = 250,
290 .cpumask = cpu_all_mask,
291 .shift = 32
292};
293
294static void mfgpt_start_timer(u16 delta)
295{
296 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_CMP2, (u16) delta);
297 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
298
299 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
300 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
301}
302
303static void mfgpt_set_mode(enum clock_event_mode mode,
304 struct clock_event_device *evt)
305{
306 mfgpt_disable_timer(mfgpt_event_clock);
307
308 if (mode == CLOCK_EVT_MODE_PERIODIC)
309 mfgpt_start_timer(MFGPT_PERIODIC);
310
311 mfgpt_tick_mode = mode;
312}
313
314static int mfgpt_next_event(unsigned long delta, struct clock_event_device *evt)
315{
316 mfgpt_start_timer(delta);
317 return 0;
318}
319
320static irqreturn_t mfgpt_tick(int irq, void *dev_id)
321{
322 u16 val = geode_mfgpt_read(mfgpt_event_clock, MFGPT_REG_SETUP);
323
324 /* See if the interrupt was for us */
325 if (!(val & (MFGPT_SETUP_SETUP | MFGPT_SETUP_CMP2 | MFGPT_SETUP_CMP1)))
326 return IRQ_NONE;
327
328 /* Turn off the clock (and clear the event) */
329 mfgpt_disable_timer(mfgpt_event_clock);
330
331 if (mfgpt_tick_mode == CLOCK_EVT_MODE_SHUTDOWN)
332 return IRQ_HANDLED;
333
334 /* Clear the counter */
335 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_COUNTER, 0);
336
337 /* Restart the clock in periodic mode */
338
339 if (mfgpt_tick_mode == CLOCK_EVT_MODE_PERIODIC) {
340 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP,
341 MFGPT_SETUP_CNTEN | MFGPT_SETUP_CMP2);
342 }
343
344 mfgpt_clockevent.event_handler(&mfgpt_clockevent);
345 return IRQ_HANDLED;
346}
347
348static struct irqaction mfgptirq = {
349 .handler = mfgpt_tick,
350 .flags = IRQF_DISABLED | IRQF_NOBALANCING | IRQF_TIMER,
351 .name = "mfgpt-timer"
352};
353
354int __init mfgpt_timer_setup(void)
355{
356 int timer, ret;
357 u16 val;
358
359 timer = geode_mfgpt_alloc_timer(MFGPT_TIMER_ANY, MFGPT_DOMAIN_WORKING);
360 if (timer < 0) {
361 printk(KERN_ERR
362 "mfgpt-timer: Could not allocate a MFPGT timer\n");
363 return -ENODEV;
364 }
365
366 mfgpt_event_clock = timer;
367
368 /* Set up the IRQ on the MFGPT side */
369 if (geode_mfgpt_setup_irq(mfgpt_event_clock, MFGPT_CMP2, &irq)) {
370 printk(KERN_ERR "mfgpt-timer: Could not set up IRQ %d\n", irq);
371 return -EIO;
372 }
373
374 /* And register it with the kernel */
375 ret = setup_irq(irq, &mfgptirq);
376
377 if (ret) {
378 printk(KERN_ERR
379 "mfgpt-timer: Unable to set up the interrupt.\n");
380 goto err;
381 }
382
383 /* Set the clock scale and enable the event mode for CMP2 */
384 val = MFGPT_SCALE | (3 << 8);
385
386 geode_mfgpt_write(mfgpt_event_clock, MFGPT_REG_SETUP, val);
387
388 /* Set up the clock event */
389 mfgpt_clockevent.mult = div_sc(MFGPT_HZ, NSEC_PER_SEC,
390 mfgpt_clockevent.shift);
391 mfgpt_clockevent.min_delta_ns = clockevent_delta2ns(0xF,
392 &mfgpt_clockevent);
393 mfgpt_clockevent.max_delta_ns = clockevent_delta2ns(0xFFFE,
394 &mfgpt_clockevent);
395
396 printk(KERN_INFO
397 "mfgpt-timer: Registering MFGPT timer %d as a clock event, using IRQ %d\n",
398 timer, irq);
399 clockevents_register_device(&mfgpt_clockevent);
400
401 return 0;
402
403err:
404 geode_mfgpt_release_irq(mfgpt_event_clock, MFGPT_CMP2, &irq);
405 printk(KERN_ERR
406 "mfgpt-timer: Unable to set up the MFGPT clock source\n");
407 return -EIO;
408}
409
410#endif
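For the record, the divisor table and the clockevent registration in the driver deleted above are tied together by the standard mult/shift conversion. A small sketch of that arithmetic, assuming MFGPT_HZ = 2048 (divisor 16) and shift = 32 as in the code above; the function name is illustrative:

	#include <linux/clockchips.h>

	/* Hedged sketch: how the deleted driver derived its clockevent scaling. */
	static void mfgpt_scaling_sketch(struct clock_event_device *evt)
	{
		evt->shift = 32;
		/* mult ~= (2048 << 32) / NSEC_PER_SEC, as computed by div_sc() */
		evt->mult = div_sc(2048, NSEC_PER_SEC, evt->shift);
		/* 0xF ticks at 2048 Hz is ~7.3 ms, the shortest programmable delta */
		evt->min_delta_ns = clockevent_delta2ns(0xF, evt);
		/* 0xFFFE ticks is ~32 s, matching the table's maximum for divisor 16 */
		evt->max_delta_ns = clockevent_delta2ns(0xFFFE, evt);
	}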
diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c
index f4c538b681ca..e1af7c055c7d 100644
--- a/arch/x86/kernel/microcode_amd.c
+++ b/arch/x86/kernel/microcode_amd.c
@@ -13,6 +13,9 @@
13 * Licensed under the terms of the GNU General Public 13 * Licensed under the terms of the GNU General Public
14 * License version 2. See file COPYING for details. 14 * License version 2. See file COPYING for details.
15 */ 15 */
16
17#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
16#include <linux/firmware.h> 19#include <linux/firmware.h>
17#include <linux/pci_ids.h> 20#include <linux/pci_ids.h>
18#include <linux/uaccess.h> 21#include <linux/uaccess.h>
@@ -76,12 +79,12 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig)
76 79
77 memset(csig, 0, sizeof(*csig)); 80 memset(csig, 0, sizeof(*csig));
78 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) { 81 if (c->x86_vendor != X86_VENDOR_AMD || c->x86 < 0x10) {
79 printk(KERN_WARNING "microcode: CPU%d: AMD CPU family 0x%x not " 82 pr_warning("microcode: CPU%d: AMD CPU family 0x%x not "
80 "supported\n", cpu, c->x86); 83 "supported\n", cpu, c->x86);
81 return -1; 84 return -1;
82 } 85 }
83 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy); 86 rdmsr(MSR_AMD64_PATCH_LEVEL, csig->rev, dummy);
84 printk(KERN_INFO "microcode: CPU%d: patch_level=0x%x\n", cpu, csig->rev); 87 pr_info("CPU%d: patch_level=0x%x\n", cpu, csig->rev);
85 return 0; 88 return 0;
86} 89}
87 90
@@ -103,23 +106,16 @@ static int get_matching_microcode(int cpu, void *mc, int rev)
103 i++; 106 i++;
104 } 107 }
105 108
106 if (!equiv_cpu_id) { 109 if (!equiv_cpu_id)
107 printk(KERN_WARNING "microcode: CPU%d: cpu revision "
108 "not listed in equivalent cpu table\n", cpu);
109 return 0; 110 return 0;
110 }
111 111
112 if (mc_header->processor_rev_id != equiv_cpu_id) { 112 if (mc_header->processor_rev_id != equiv_cpu_id)
113 printk(KERN_ERR "microcode: CPU%d: patch mismatch "
114 "(processor_rev_id: %x, equiv_cpu_id: %x)\n",
115 cpu, mc_header->processor_rev_id, equiv_cpu_id);
116 return 0; 113 return 0;
117 }
118 114
119 /* ucode might be chipset specific -- currently we don't support this */ 115 /* ucode might be chipset specific -- currently we don't support this */
120 if (mc_header->nb_dev_id || mc_header->sb_dev_id) { 116 if (mc_header->nb_dev_id || mc_header->sb_dev_id) {
121 printk(KERN_ERR "microcode: CPU%d: loading of chipset " 117 pr_err("CPU%d: loading of chipset specific code not yet supported\n",
122 "specific code not yet supported\n", cpu); 118 cpu);
123 return 0; 119 return 0;
124 } 120 }
125 121
@@ -148,14 +144,12 @@ static int apply_microcode_amd(int cpu)
148 144
149 /* check current patch id and patch's id for match */ 145 /* check current patch id and patch's id for match */
150 if (rev != mc_amd->hdr.patch_id) { 146 if (rev != mc_amd->hdr.patch_id) {
151 printk(KERN_ERR "microcode: CPU%d: update failed " 147 pr_err("CPU%d: update failed (for patch_level=0x%x)\n",
152 "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); 148 cpu, mc_amd->hdr.patch_id);
153 return -1; 149 return -1;
154 } 150 }
155 151
156 printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", 152 pr_info("CPU%d: updated (new patch_level=0x%x)\n", cpu, rev);
157 cpu, rev);
158
159 uci->cpu_sig.rev = rev; 153 uci->cpu_sig.rev = rev;
160 154
161 return 0; 155 return 0;
@@ -178,18 +172,14 @@ get_next_ucode(const u8 *buf, unsigned int size, unsigned int *mc_size)
178 return NULL; 172 return NULL;
179 173
180 if (section_hdr[0] != UCODE_UCODE_TYPE) { 174 if (section_hdr[0] != UCODE_UCODE_TYPE) {
181 printk(KERN_ERR "microcode: error: invalid type field in " 175 pr_err("error: invalid type field in container file section header\n");
182 "container file section header\n");
183 return NULL; 176 return NULL;
184 } 177 }
185 178
186 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8)); 179 total_size = (unsigned long) (section_hdr[4] + (section_hdr[5] << 8));
187 180
188 printk(KERN_DEBUG "microcode: size %u, total_size %u\n",
189 size, total_size);
190
191 if (total_size > size || total_size > UCODE_MAX_SIZE) { 181 if (total_size > size || total_size > UCODE_MAX_SIZE) {
192 printk(KERN_ERR "microcode: error: size mismatch\n"); 182 pr_err("error: size mismatch\n");
193 return NULL; 183 return NULL;
194 } 184 }
195 185
@@ -218,15 +208,13 @@ static int install_equiv_cpu_table(const u8 *buf)
218 size = buf_pos[2]; 208 size = buf_pos[2];
219 209
220 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) { 210 if (buf_pos[1] != UCODE_EQUIV_CPU_TABLE_TYPE || !size) {
221 printk(KERN_ERR "microcode: error: invalid type field in " 211 pr_err("error: invalid type field in container file section header\n");
222 "container file section header\n");
223 return 0; 212 return 0;
224 } 213 }
225 214
226 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size); 215 equiv_cpu_table = (struct equiv_cpu_entry *) vmalloc(size);
227 if (!equiv_cpu_table) { 216 if (!equiv_cpu_table) {
228 printk(KERN_ERR "microcode: failed to allocate " 217 pr_err("failed to allocate equivalent CPU table\n");
229 "equivalent CPU table\n");
230 return 0; 218 return 0;
231 } 219 }
232 220
@@ -259,8 +247,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
259 247
260 offset = install_equiv_cpu_table(ucode_ptr); 248 offset = install_equiv_cpu_table(ucode_ptr);
261 if (!offset) { 249 if (!offset) {
262 printk(KERN_ERR "microcode: failed to create " 250 pr_err("failed to create equivalent cpu table\n");
263 "equivalent cpu table\n");
264 return UCODE_ERROR; 251 return UCODE_ERROR;
265 } 252 }
266 253
@@ -291,8 +278,7 @@ generic_load_microcode(int cpu, const u8 *data, size_t size)
291 if (!leftover) { 278 if (!leftover) {
292 vfree(uci->mc); 279 vfree(uci->mc);
293 uci->mc = new_mc; 280 uci->mc = new_mc;
294 pr_debug("microcode: CPU%d found a matching microcode " 281 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
295 "update with version 0x%x (current=0x%x)\n",
296 cpu, new_rev, uci->cpu_sig.rev); 282 cpu, new_rev, uci->cpu_sig.rev);
297 } else { 283 } else {
298 vfree(new_mc); 284 vfree(new_mc);
@@ -318,7 +304,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
318 } 304 }
319 305
320 if (*(u32 *)firmware->data != UCODE_MAGIC) { 306 if (*(u32 *)firmware->data != UCODE_MAGIC) {
321 printk(KERN_ERR "microcode: invalid UCODE_MAGIC (0x%08x)\n", 307 pr_err("invalid UCODE_MAGIC (0x%08x)\n",
322 *(u32 *)firmware->data); 308 *(u32 *)firmware->data);
323 return UCODE_ERROR; 309 return UCODE_ERROR;
324 } 310 }
@@ -333,8 +319,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
333static enum ucode_state 319static enum ucode_state
334request_microcode_user(int cpu, const void __user *buf, size_t size) 320request_microcode_user(int cpu, const void __user *buf, size_t size)
335{ 321{
336 printk(KERN_INFO "microcode: AMD microcode update via " 322 pr_info("AMD microcode update via /dev/cpu/microcode not supported\n");
337 "/dev/cpu/microcode not supported\n");
338 return UCODE_ERROR; 323 return UCODE_ERROR;
339} 324}
340 325
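The microcode_amd.c conversion above hinges on the pr_fmt() convention: because pr_fmt() is defined before the first include, every pr_err()/pr_info()/pr_debug() in the file automatically prepends the module name, which is why the literal "microcode: " prefixes can be dropped from the format strings. A minimal sketch of the pattern (the helper below is illustrative):

	/* Must be defined before any header that pulls in printk.h. */
	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

	#include <linux/kernel.h>
	#include <linux/module.h>

	static void report_patch_level(int cpu, u32 rev)
	{
		/* built into microcode.ko this prints "microcode: CPU0: patch_level=0x..." */
		pr_info("CPU%d: patch_level=0x%x\n", cpu, rev);
	}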
diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c
index 378e9a8f1bf8..cceb5bc3c3c2 100644
--- a/arch/x86/kernel/microcode_core.c
+++ b/arch/x86/kernel/microcode_core.c
@@ -70,10 +70,12 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/platform_device.h> 76#include <linux/platform_device.h>
74#include <linux/miscdevice.h> 77#include <linux/miscdevice.h>
75#include <linux/capability.h> 78#include <linux/capability.h>
76#include <linux/smp_lock.h>
77#include <linux/kernel.h> 79#include <linux/kernel.h>
78#include <linux/module.h> 80#include <linux/module.h>
79#include <linux/mutex.h> 81#include <linux/mutex.h>
@@ -201,7 +203,6 @@ static int do_microcode_update(const void __user *buf, size_t size)
201 203
202static int microcode_open(struct inode *unused1, struct file *unused2) 204static int microcode_open(struct inode *unused1, struct file *unused2)
203{ 205{
204 cycle_kernel_lock();
205 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM; 206 return capable(CAP_SYS_RAWIO) ? 0 : -EPERM;
206} 207}
207 208
@@ -211,7 +212,7 @@ static ssize_t microcode_write(struct file *file, const char __user *buf,
211 ssize_t ret = -EINVAL; 212 ssize_t ret = -EINVAL;
212 213
213 if ((len >> PAGE_SHIFT) > totalram_pages) { 214 if ((len >> PAGE_SHIFT) > totalram_pages) {
214 pr_err("microcode: too much data (max %ld pages)\n", totalram_pages); 215 pr_err("too much data (max %ld pages)\n", totalram_pages);
215 return ret; 216 return ret;
216 } 217 }
217 218
@@ -246,7 +247,7 @@ static int __init microcode_dev_init(void)
246 247
247 error = misc_register(&microcode_dev); 248 error = misc_register(&microcode_dev);
248 if (error) { 249 if (error) {
249 pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); 250 pr_err("can't misc_register on minor=%d\n", MICROCODE_MINOR);
250 return error; 251 return error;
251 } 252 }
252 253
@@ -361,7 +362,7 @@ static enum ucode_state microcode_resume_cpu(int cpu)
361 if (!uci->mc) 362 if (!uci->mc)
362 return UCODE_NFOUND; 363 return UCODE_NFOUND;
363 364
364 pr_debug("microcode: CPU%d updated upon resume\n", cpu); 365 pr_debug("CPU%d updated upon resume\n", cpu);
365 apply_microcode_on_target(cpu); 366 apply_microcode_on_target(cpu);
366 367
367 return UCODE_OK; 368 return UCODE_OK;
@@ -381,7 +382,7 @@ static enum ucode_state microcode_init_cpu(int cpu)
381 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev); 382 ustate = microcode_ops->request_microcode_fw(cpu, &microcode_pdev->dev);
382 383
383 if (ustate == UCODE_OK) { 384 if (ustate == UCODE_OK) {
384 pr_debug("microcode: CPU%d updated upon init\n", cpu); 385 pr_debug("CPU%d updated upon init\n", cpu);
385 apply_microcode_on_target(cpu); 386 apply_microcode_on_target(cpu);
386 } 387 }
387 388
@@ -408,7 +409,7 @@ static int mc_sysdev_add(struct sys_device *sys_dev)
408 if (!cpu_online(cpu)) 409 if (!cpu_online(cpu))
409 return 0; 410 return 0;
410 411
411 pr_debug("microcode: CPU%d added\n", cpu); 412 pr_debug("CPU%d added\n", cpu);
412 413
413 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); 414 err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group);
414 if (err) 415 if (err)
@@ -427,7 +428,7 @@ static int mc_sysdev_remove(struct sys_device *sys_dev)
427 if (!cpu_online(cpu)) 428 if (!cpu_online(cpu))
428 return 0; 429 return 0;
429 430
430 pr_debug("microcode: CPU%d removed\n", cpu); 431 pr_debug("CPU%d removed\n", cpu);
431 microcode_fini_cpu(cpu); 432 microcode_fini_cpu(cpu);
432 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 433 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
433 return 0; 434 return 0;
@@ -475,15 +476,15 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu)
475 microcode_update_cpu(cpu); 476 microcode_update_cpu(cpu);
476 case CPU_DOWN_FAILED: 477 case CPU_DOWN_FAILED:
477 case CPU_DOWN_FAILED_FROZEN: 478 case CPU_DOWN_FAILED_FROZEN:
478 pr_debug("microcode: CPU%d added\n", cpu); 479 pr_debug("CPU%d added\n", cpu);
479 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) 480 if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group))
480 pr_err("microcode: Failed to create group for CPU%d\n", cpu); 481 pr_err("Failed to create group for CPU%d\n", cpu);
481 break; 482 break;
482 case CPU_DOWN_PREPARE: 483 case CPU_DOWN_PREPARE:
483 case CPU_DOWN_PREPARE_FROZEN: 484 case CPU_DOWN_PREPARE_FROZEN:
484 /* Suspend is in progress, only remove the interface */ 485 /* Suspend is in progress, only remove the interface */
485 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); 486 sysfs_remove_group(&sys_dev->kobj, &mc_attr_group);
486 pr_debug("microcode: CPU%d removed\n", cpu); 487 pr_debug("CPU%d removed\n", cpu);
487 break; 488 break;
488 case CPU_DEAD: 489 case CPU_DEAD:
489 case CPU_UP_CANCELED_FROZEN: 490 case CPU_UP_CANCELED_FROZEN:
@@ -509,7 +510,7 @@ static int __init microcode_init(void)
509 microcode_ops = init_amd_microcode(); 510 microcode_ops = init_amd_microcode();
510 511
511 if (!microcode_ops) { 512 if (!microcode_ops) {
512 pr_err("microcode: no support for this CPU vendor\n"); 513 pr_err("no support for this CPU vendor\n");
513 return -ENODEV; 514 return -ENODEV;
514 } 515 }
515 516
@@ -540,8 +541,7 @@ static int __init microcode_init(void)
540 register_hotcpu_notifier(&mc_cpu_notifier); 541 register_hotcpu_notifier(&mc_cpu_notifier);
541 542
542 pr_info("Microcode Update Driver: v" MICROCODE_VERSION 543 pr_info("Microcode Update Driver: v" MICROCODE_VERSION
543 " <tigran@aivazian.fsnet.co.uk>," 544 " <tigran@aivazian.fsnet.co.uk>, Peter Oruba\n");
544 " Peter Oruba\n");
545 545
546 return 0; 546 return 0;
547} 547}
diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c
index 0d334ddd0a96..85a343e28937 100644
--- a/arch/x86/kernel/microcode_intel.c
+++ b/arch/x86/kernel/microcode_intel.c
@@ -70,6 +70,9 @@
70 * Fix sigmatch() macro to handle old CPUs with pf == 0. 70 * Fix sigmatch() macro to handle old CPUs with pf == 0.
71 * Thanks to Stuart Swales for pointing out this bug. 71 * Thanks to Stuart Swales for pointing out this bug.
72 */ 72 */
73
74#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
75
73#include <linux/firmware.h> 76#include <linux/firmware.h>
74#include <linux/uaccess.h> 77#include <linux/uaccess.h>
75#include <linux/kernel.h> 78#include <linux/kernel.h>
@@ -146,8 +149,7 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
146 149
147 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 || 150 if (c->x86_vendor != X86_VENDOR_INTEL || c->x86 < 6 ||
148 cpu_has(c, X86_FEATURE_IA64)) { 151 cpu_has(c, X86_FEATURE_IA64)) {
149 printk(KERN_ERR "microcode: CPU%d not a capable Intel " 152 pr_err("CPU%d not a capable Intel processor\n", cpu_num);
150 "processor\n", cpu_num);
151 return -1; 153 return -1;
152 } 154 }
153 155
@@ -165,8 +167,8 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig)
165 /* get the current revision from MSR 0x8B */ 167 /* get the current revision from MSR 0x8B */
166 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); 168 rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev);
167 169
168 printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", 170 pr_info("CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n",
169 cpu_num, csig->sig, csig->pf, csig->rev); 171 cpu_num, csig->sig, csig->pf, csig->rev);
170 172
171 return 0; 173 return 0;
172} 174}
@@ -194,28 +196,24 @@ static int microcode_sanity_check(void *mc)
194 data_size = get_datasize(mc_header); 196 data_size = get_datasize(mc_header);
195 197
196 if (data_size + MC_HEADER_SIZE > total_size) { 198 if (data_size + MC_HEADER_SIZE > total_size) {
197 printk(KERN_ERR "microcode: error! " 199 pr_err("error! Bad data size in microcode data file\n");
198 "Bad data size in microcode data file\n");
199 return -EINVAL; 200 return -EINVAL;
200 } 201 }
201 202
202 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) { 203 if (mc_header->ldrver != 1 || mc_header->hdrver != 1) {
203 printk(KERN_ERR "microcode: error! " 204 pr_err("error! Unknown microcode update format\n");
204 "Unknown microcode update format\n");
205 return -EINVAL; 205 return -EINVAL;
206 } 206 }
207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size); 207 ext_table_size = total_size - (MC_HEADER_SIZE + data_size);
208 if (ext_table_size) { 208 if (ext_table_size) {
209 if ((ext_table_size < EXT_HEADER_SIZE) 209 if ((ext_table_size < EXT_HEADER_SIZE)
210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) { 210 || ((ext_table_size - EXT_HEADER_SIZE) % EXT_SIGNATURE_SIZE)) {
211 printk(KERN_ERR "microcode: error! " 211 pr_err("error! Small exttable size in microcode data file\n");
212 "Small exttable size in microcode data file\n");
213 return -EINVAL; 212 return -EINVAL;
214 } 213 }
215 ext_header = mc + MC_HEADER_SIZE + data_size; 214 ext_header = mc + MC_HEADER_SIZE + data_size;
216 if (ext_table_size != exttable_size(ext_header)) { 215 if (ext_table_size != exttable_size(ext_header)) {
217 printk(KERN_ERR "microcode: error! " 216 pr_err("error! Bad exttable size in microcode data file\n");
218 "Bad exttable size in microcode data file\n");
219 return -EFAULT; 217 return -EFAULT;
220 } 218 }
221 ext_sigcount = ext_header->count; 219 ext_sigcount = ext_header->count;
@@ -230,8 +228,7 @@ static int microcode_sanity_check(void *mc)
230 while (i--) 228 while (i--)
231 ext_table_sum += ext_tablep[i]; 229 ext_table_sum += ext_tablep[i];
232 if (ext_table_sum) { 230 if (ext_table_sum) {
233 printk(KERN_WARNING "microcode: aborting, " 231 pr_warning("aborting, bad extended signature table checksum\n");
234 "bad extended signature table checksum\n");
235 return -EINVAL; 232 return -EINVAL;
236 } 233 }
237 } 234 }
@@ -242,7 +239,7 @@ static int microcode_sanity_check(void *mc)
242 while (i--) 239 while (i--)
243 orig_sum += ((int *)mc)[i]; 240 orig_sum += ((int *)mc)[i];
244 if (orig_sum) { 241 if (orig_sum) {
245 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 242 pr_err("aborting, bad checksum\n");
246 return -EINVAL; 243 return -EINVAL;
247 } 244 }
248 if (!ext_table_size) 245 if (!ext_table_size)
@@ -255,7 +252,7 @@ static int microcode_sanity_check(void *mc)
255 - (mc_header->sig + mc_header->pf + mc_header->cksum) 252 - (mc_header->sig + mc_header->pf + mc_header->cksum)
256 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum); 253 + (ext_sig->sig + ext_sig->pf + ext_sig->cksum);
257 if (sum) { 254 if (sum) {
258 printk(KERN_ERR "microcode: aborting, bad checksum\n"); 255 pr_err("aborting, bad checksum\n");
259 return -EINVAL; 256 return -EINVAL;
260 } 257 }
261 } 258 }
@@ -327,13 +324,11 @@ static int apply_microcode(int cpu)
327 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); 324 rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]);
328 325
329 if (val[1] != mc_intel->hdr.rev) { 326 if (val[1] != mc_intel->hdr.rev) {
330 printk(KERN_ERR "microcode: CPU%d update " 327 pr_err("CPU%d update to revision 0x%x failed\n",
331 "to revision 0x%x failed\n", 328 cpu_num, mc_intel->hdr.rev);
332 cpu_num, mc_intel->hdr.rev);
333 return -1; 329 return -1;
334 } 330 }
335 printk(KERN_INFO "microcode: CPU%d updated to revision " 331 pr_info("CPU%d updated to revision 0x%x, date = %04x-%02x-%02x\n",
336 "0x%x, date = %04x-%02x-%02x \n",
337 cpu_num, val[1], 332 cpu_num, val[1],
338 mc_intel->hdr.date & 0xffff, 333 mc_intel->hdr.date & 0xffff,
339 mc_intel->hdr.date >> 24, 334 mc_intel->hdr.date >> 24,
@@ -362,8 +357,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
362 357
363 mc_size = get_totalsize(&mc_header); 358 mc_size = get_totalsize(&mc_header);
364 if (!mc_size || mc_size > leftover) { 359 if (!mc_size || mc_size > leftover) {
365 printk(KERN_ERR "microcode: error!" 360 pr_err("error! Bad data in microcode data file\n");
366 "Bad data in microcode data file\n");
367 break; 361 break;
368 } 362 }
369 363
@@ -405,9 +399,8 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
405 vfree(uci->mc); 399 vfree(uci->mc);
406 uci->mc = (struct microcode_intel *)new_mc; 400 uci->mc = (struct microcode_intel *)new_mc;
407 401
408 pr_debug("microcode: CPU%d found a matching microcode update with" 402 pr_debug("CPU%d found a matching microcode update with version 0x%x (current=0x%x)\n",
409 " version 0x%x (current=0x%x)\n", 403 cpu, new_rev, uci->cpu_sig.rev);
410 cpu, new_rev, uci->cpu_sig.rev);
411out: 404out:
412 return state; 405 return state;
413} 406}
@@ -429,7 +422,7 @@ static enum ucode_state request_microcode_fw(int cpu, struct device *device)
429 c->x86, c->x86_model, c->x86_mask); 422 c->x86, c->x86_model, c->x86_mask);
430 423
431 if (request_firmware(&firmware, name, device)) { 424 if (request_firmware(&firmware, name, device)) {
432 pr_debug("microcode: data file %s load failed\n", name); 425 pr_debug("data file %s load failed\n", name);
433 return UCODE_NFOUND; 426 return UCODE_NFOUND;
434 } 427 }
435 428
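The Intel-side cleanups above leave the sanity logic unchanged: an update is accepted only if the whole image sums to zero when read as 32-bit words, and the same rule is re-checked per extended signature with the header fields swapped in. A hedged sketch of the basic whole-image check, with an illustrative helper name:

	#include <linux/types.h>

	/* Returns 1 only if the 32-bit word sum over the image is zero. */
	static int ucode_image_checksum_ok(const u32 *mc, unsigned int total_size)
	{
		u32 sum = 0;
		unsigned int i;

		for (i = 0; i < total_size / sizeof(u32); i++)
			sum += mc[i];

		return sum == 0;
	}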
diff --git a/arch/x86/kernel/mmconf-fam10h_64.c b/arch/x86/kernel/mmconf-fam10h_64.c
index 712d15fdc416..71825806cd44 100644
--- a/arch/x86/kernel/mmconf-fam10h_64.c
+++ b/arch/x86/kernel/mmconf-fam10h_64.c
@@ -7,6 +7,8 @@
7#include <linux/string.h> 7#include <linux/string.h>
8#include <linux/pci.h> 8#include <linux/pci.h>
9#include <linux/dmi.h> 9#include <linux/dmi.h>
10#include <linux/range.h>
11
10#include <asm/pci-direct.h> 12#include <asm/pci-direct.h>
11#include <linux/sort.h> 13#include <linux/sort.h>
12#include <asm/io.h> 14#include <asm/io.h>
@@ -30,11 +32,6 @@ static struct pci_hostbridge_probe pci_probes[] __cpuinitdata = {
30 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 }, 32 { 0xff, 0, PCI_VENDOR_ID_AMD, 0x1200 },
31}; 33};
32 34
33struct range {
34 u64 start;
35 u64 end;
36};
37
38static int __cpuinit cmp_range(const void *x1, const void *x2) 35static int __cpuinit cmp_range(const void *x1, const void *x2)
39{ 36{
40 const struct range *r1 = x1; 37 const struct range *r1 = x1;
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index 89f386f044e4..e0bc186d7501 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -23,6 +23,7 @@
23#include <linux/kernel.h> 23#include <linux/kernel.h>
24#include <linux/bug.h> 24#include <linux/bug.h>
25#include <linux/mm.h> 25#include <linux/mm.h>
26#include <linux/gfp.h>
26 27
27#include <asm/system.h> 28#include <asm/system.h>
28#include <asm/page.h> 29#include <asm/page.h>
diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c
index 5be95ef4ffec..e81030f71a8f 100644
--- a/arch/x86/kernel/mpparse.c
+++ b/arch/x86/kernel/mpparse.c
@@ -359,13 +359,6 @@ static int __init smp_read_mpc(struct mpc_table *mpc, unsigned early)
359 x86_init.mpparse.mpc_record(1); 359 x86_init.mpparse.mpc_record(1);
360 } 360 }
361 361
362#ifdef CONFIG_X86_BIGSMP
363 generic_bigsmp_probe();
364#endif
365
366 if (apic->setup_apic_routing)
367 apic->setup_apic_routing();
368
369 if (!num_processors) 362 if (!num_processors)
370 printk(KERN_ERR "MPTABLE: no processors registered!\n"); 363 printk(KERN_ERR "MPTABLE: no processors registered!\n");
371 return num_processors; 364 return num_processors;
@@ -667,36 +660,18 @@ void __init default_get_smp_config(unsigned int early)
667 */ 660 */
668} 661}
669 662
670static void __init smp_reserve_bootmem(struct mpf_intel *mpf) 663static void __init smp_reserve_memory(struct mpf_intel *mpf)
671{ 664{
672 unsigned long size = get_mpc_size(mpf->physptr); 665 unsigned long size = get_mpc_size(mpf->physptr);
673#ifdef CONFIG_X86_32
674 /*
675 * We cannot access to MPC table to compute table size yet,
676 * as only few megabytes from the bottom is mapped now.
677 * PC-9800's MPC table places on the very last of physical
678 * memory; so that simply reserving PAGE_SIZE from mpf->physptr
679 * yields BUG() in reserve_bootmem.
680 * also need to make sure physptr is below than max_low_pfn
681 * we don't need reserve the area above max_low_pfn
682 */
683 unsigned long end = max_low_pfn * PAGE_SIZE;
684 666
685 if (mpf->physptr < end) { 667 reserve_early_overlap_ok(mpf->physptr, mpf->physptr+size, "MP-table mpc");
686 if (mpf->physptr + size > end)
687 size = end - mpf->physptr;
688 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
689 }
690#else
691 reserve_bootmem_generic(mpf->physptr, size, BOOTMEM_DEFAULT);
692#endif
693} 668}
694 669
695static int __init smp_scan_config(unsigned long base, unsigned long length, 670static int __init smp_scan_config(unsigned long base, unsigned long length)
696 unsigned reserve)
697{ 671{
698 unsigned int *bp = phys_to_virt(base); 672 unsigned int *bp = phys_to_virt(base);
699 struct mpf_intel *mpf; 673 struct mpf_intel *mpf;
674 unsigned long mem;
700 675
701 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n", 676 apic_printk(APIC_VERBOSE, "Scan SMP from %p for %ld bytes.\n",
702 bp, length); 677 bp, length);
@@ -717,12 +692,10 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
717 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n", 692 printk(KERN_INFO "found SMP MP-table at [%p] %llx\n",
718 mpf, (u64)virt_to_phys(mpf)); 693 mpf, (u64)virt_to_phys(mpf));
719 694
720 if (!reserve) 695 mem = virt_to_phys(mpf);
721 return 1; 696 reserve_early_overlap_ok(mem, mem + sizeof(*mpf), "MP-table mpf");
722 reserve_bootmem_generic(virt_to_phys(mpf), sizeof(*mpf),
723 BOOTMEM_DEFAULT);
724 if (mpf->physptr) 697 if (mpf->physptr)
725 smp_reserve_bootmem(mpf); 698 smp_reserve_memory(mpf);
726 699
727 return 1; 700 return 1;
728 } 701 }
@@ -732,7 +705,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length,
732 return 0; 705 return 0;
733} 706}
734 707
735void __init default_find_smp_config(unsigned int reserve) 708void __init default_find_smp_config(void)
736{ 709{
737 unsigned int address; 710 unsigned int address;
738 711
@@ -744,9 +717,9 @@ void __init default_find_smp_config(unsigned int reserve)
744 * 2) Scan the top 1K of base RAM 717 * 2) Scan the top 1K of base RAM
745 * 3) Scan the 64K of bios 718 * 3) Scan the 64K of bios
746 */ 719 */
747 if (smp_scan_config(0x0, 0x400, reserve) || 720 if (smp_scan_config(0x0, 0x400) ||
748 smp_scan_config(639 * 0x400, 0x400, reserve) || 721 smp_scan_config(639 * 0x400, 0x400) ||
749 smp_scan_config(0xF0000, 0x10000, reserve)) 722 smp_scan_config(0xF0000, 0x10000))
750 return; 723 return;
751 /* 724 /*
752 * If it is an SMP machine we should know now, unless the 725 * If it is an SMP machine we should know now, unless the
@@ -767,7 +740,7 @@ void __init default_find_smp_config(unsigned int reserve)
767 740
768 address = get_bios_ebda(); 741 address = get_bios_ebda();
769 if (address) 742 if (address)
770 smp_scan_config(address, 0x400, reserve); 743 smp_scan_config(address, 0x400);
771} 744}
772 745
773#ifdef CONFIG_X86_IO_APIC 746#ifdef CONFIG_X86_IO_APIC
@@ -965,9 +938,6 @@ void __init early_reserve_e820_mpc_new(void)
965{ 938{
966 if (enable_update_mptable && alloc_mptable) { 939 if (enable_update_mptable && alloc_mptable) {
967 u64 startt = 0; 940 u64 startt = 0;
968#ifdef CONFIG_X86_TRAMPOLINE
969 startt = TRAMPOLINE_BASE;
970#endif
971 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); 941 mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4);
972 } 942 }
973} 943}
diff --git a/arch/x86/kernel/mrst.c b/arch/x86/kernel/mrst.c
index 3b7078abc871..0aad8670858e 100644
--- a/arch/x86/kernel/mrst.c
+++ b/arch/x86/kernel/mrst.c
@@ -10,8 +10,211 @@
10 * of the License. 10 * of the License.
11 */ 11 */
12#include <linux/init.h> 12#include <linux/init.h>
13#include <linux/kernel.h>
14#include <linux/sfi.h>
15#include <linux/irq.h>
16#include <linux/module.h>
13 17
14#include <asm/setup.h> 18#include <asm/setup.h>
19#include <asm/mpspec_def.h>
20#include <asm/hw_irq.h>
21#include <asm/apic.h>
22#include <asm/io_apic.h>
23#include <asm/mrst.h>
24#include <asm/io.h>
25#include <asm/i8259.h>
26#include <asm/apb_timer.h>
27
28static u32 sfi_mtimer_usage[SFI_MTMR_MAX_NUM];
29static struct sfi_timer_table_entry sfi_mtimer_array[SFI_MTMR_MAX_NUM];
30int sfi_mtimer_num;
31
32struct sfi_rtc_table_entry sfi_mrtc_array[SFI_MRTC_MAX];
33EXPORT_SYMBOL_GPL(sfi_mrtc_array);
34int sfi_mrtc_num;
35
36static inline void assign_to_mp_irq(struct mpc_intsrc *m,
37 struct mpc_intsrc *mp_irq)
38{
39 memcpy(mp_irq, m, sizeof(struct mpc_intsrc));
40}
41
42static inline int mp_irq_cmp(struct mpc_intsrc *mp_irq,
43 struct mpc_intsrc *m)
44{
45 return memcmp(mp_irq, m, sizeof(struct mpc_intsrc));
46}
47
48static void save_mp_irq(struct mpc_intsrc *m)
49{
50 int i;
51
52 for (i = 0; i < mp_irq_entries; i++) {
53 if (!mp_irq_cmp(&mp_irqs[i], m))
54 return;
55 }
56
57 assign_to_mp_irq(m, &mp_irqs[mp_irq_entries]);
58 if (++mp_irq_entries == MAX_IRQ_SOURCES)
59 panic("Max # of irq sources exceeded!!\n");
60}
61
62/* parse all the mtimer info to a static mtimer array */
63static int __init sfi_parse_mtmr(struct sfi_table_header *table)
64{
65 struct sfi_table_simple *sb;
66 struct sfi_timer_table_entry *pentry;
67 struct mpc_intsrc mp_irq;
68 int totallen;
69
70 sb = (struct sfi_table_simple *)table;
71 if (!sfi_mtimer_num) {
72 sfi_mtimer_num = SFI_GET_NUM_ENTRIES(sb,
73 struct sfi_timer_table_entry);
74 pentry = (struct sfi_timer_table_entry *) sb->pentry;
75 totallen = sfi_mtimer_num * sizeof(*pentry);
76 memcpy(sfi_mtimer_array, pentry, totallen);
77 }
78
79 printk(KERN_INFO "SFI: MTIMER info (num = %d):\n", sfi_mtimer_num);
80 pentry = sfi_mtimer_array;
81 for (totallen = 0; totallen < sfi_mtimer_num; totallen++, pentry++) {
82 printk(KERN_INFO "timer[%d]: paddr = 0x%08x, freq = %dHz,"
83 " irq = %d\n", totallen, (u32)pentry->phys_addr,
84 pentry->freq_hz, pentry->irq);
85 if (!pentry->irq)
86 continue;
87 mp_irq.type = MP_IOAPIC;
88 mp_irq.irqtype = mp_INT;
89/* triggering mode edge bit 2-3, active high polarity bit 0-1 */
90 mp_irq.irqflag = 5;
91 mp_irq.srcbus = 0;
92 mp_irq.srcbusirq = pentry->irq; /* IRQ */
93 mp_irq.dstapic = MP_APIC_ALL;
94 mp_irq.dstirq = pentry->irq;
95 save_mp_irq(&mp_irq);
96 }
97
98 return 0;
99}
100
101struct sfi_timer_table_entry *sfi_get_mtmr(int hint)
102{
103 int i;
104 if (hint < sfi_mtimer_num) {
105 if (!sfi_mtimer_usage[hint]) {
106 pr_debug("hint taken for timer %d irq %d\n",\
107 hint, sfi_mtimer_array[hint].irq);
108 sfi_mtimer_usage[hint] = 1;
109 return &sfi_mtimer_array[hint];
110 }
111 }
112 /* take the first timer available */
113 for (i = 0; i < sfi_mtimer_num;) {
114 if (!sfi_mtimer_usage[i]) {
115 sfi_mtimer_usage[i] = 1;
116 return &sfi_mtimer_array[i];
117 }
118 i++;
119 }
120 return NULL;
121}
122
123void sfi_free_mtmr(struct sfi_timer_table_entry *mtmr)
124{
125 int i;
126 for (i = 0; i < sfi_mtimer_num;) {
127 if (mtmr->irq == sfi_mtimer_array[i].irq) {
128 sfi_mtimer_usage[i] = 0;
129 return;
130 }
131 i++;
132 }
133}
134
135/* parse all the mrtc info to a global mrtc array */
136int __init sfi_parse_mrtc(struct sfi_table_header *table)
137{
138 struct sfi_table_simple *sb;
139 struct sfi_rtc_table_entry *pentry;
140 struct mpc_intsrc mp_irq;
141
142 int totallen;
143
144 sb = (struct sfi_table_simple *)table;
145 if (!sfi_mrtc_num) {
146 sfi_mrtc_num = SFI_GET_NUM_ENTRIES(sb,
147 struct sfi_rtc_table_entry);
148 pentry = (struct sfi_rtc_table_entry *)sb->pentry;
149 totallen = sfi_mrtc_num * sizeof(*pentry);
150 memcpy(sfi_mrtc_array, pentry, totallen);
151 }
152
153 printk(KERN_INFO "SFI: RTC info (num = %d):\n", sfi_mrtc_num);
154 pentry = sfi_mrtc_array;
155 for (totallen = 0; totallen < sfi_mrtc_num; totallen++, pentry++) {
156 printk(KERN_INFO "RTC[%d]: paddr = 0x%08x, irq = %d\n",
157 totallen, (u32)pentry->phys_addr, pentry->irq);
158 mp_irq.type = MP_IOAPIC;
159 mp_irq.irqtype = mp_INT;
160 mp_irq.irqflag = 0;
161 mp_irq.srcbus = 0;
162 mp_irq.srcbusirq = pentry->irq; /* IRQ */
163 mp_irq.dstapic = MP_APIC_ALL;
164 mp_irq.dstirq = pentry->irq;
165 save_mp_irq(&mp_irq);
166 }
167 return 0;
168}
169
170/*
171 * the secondary clock in Moorestown can be APBT or LAPIC clock, default to
172 * APBT but cmdline option can also override it.
173 */
174static void __cpuinit mrst_setup_secondary_clock(void)
175{
176 /* restore default lapic clock if disabled by cmdline */
177 if (disable_apbt_percpu)
178 return setup_secondary_APIC_clock();
179 apbt_setup_secondary_clock();
180}
181
182static unsigned long __init mrst_calibrate_tsc(void)
183{
184 unsigned long flags, fast_calibrate;
185
186 local_irq_save(flags);
187 fast_calibrate = apbt_quick_calibrate();
188 local_irq_restore(flags);
189
190 if (fast_calibrate)
191 return fast_calibrate;
192
193 return 0;
194}
195
196void __init mrst_time_init(void)
197{
198 sfi_table_parse(SFI_SIG_MTMR, NULL, NULL, sfi_parse_mtmr);
199 pre_init_apic_IRQ0();
200 apbt_time_init();
201}
202
203void __init mrst_rtc_init(void)
204{
205 sfi_table_parse(SFI_SIG_MRTC, NULL, NULL, sfi_parse_mrtc);
206}
207
208/*
209 * If we use the per-cpu APB timer, the boot clock is already set up. If we use
210 * the LAPIC timer plus one APBT timer for broadcast, we need to set up the LAPIC boot clock.
211 */
212static void __init mrst_setup_boot_clock(void)
213{
214 pr_info("%s: per cpu apbt flag %d \n", __func__, disable_apbt_percpu);
215 if (disable_apbt_percpu)
216 setup_boot_APIC_clock();
217};
15 218
16/* 219/*
17 * Moorestown specific x86_init function overrides and early setup 220 * Moorestown specific x86_init function overrides and early setup
@@ -21,4 +224,17 @@ void __init x86_mrst_early_setup(void)
21{ 224{
22 x86_init.resources.probe_roms = x86_init_noop; 225 x86_init.resources.probe_roms = x86_init_noop;
23 x86_init.resources.reserve_resources = x86_init_noop; 226 x86_init.resources.reserve_resources = x86_init_noop;
227
228 x86_init.timers.timer_init = mrst_time_init;
229 x86_init.timers.setup_percpu_clockev = mrst_setup_boot_clock;
230
231 x86_init.irqs.pre_vector_init = x86_init_noop;
232
233 x86_cpuinit.setup_percpu_clockev = mrst_setup_secondary_clock;
234
235 x86_platform.calibrate_tsc = mrst_calibrate_tsc;
236 x86_init.pci.init = pci_mrst_init;
237 x86_init.pci.fixup_irqs = x86_init_noop;
238
239 legacy_pic = &null_legacy_pic;
24} 240}
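The Moorestown bring-up above is mostly an exercise in the x86_init/x86_platform override pattern: the platform's early setup repoints the relevant function pointers before generic setup runs, and everything else follows from that. A condensed sketch of the pattern, reusing names added in this diff:

	/* Hedged sketch: how a platform takes over timer, IRQ and TSC setup. */
	void __init example_platform_early_setup(void)
	{
		x86_init.timers.timer_init    = mrst_time_init;      /* SFI tables + APB timer */
		x86_init.irqs.pre_vector_init = x86_init_noop;       /* no legacy IRQ0 setup */
		x86_platform.calibrate_tsc    = mrst_calibrate_tsc;  /* TSC via apbt_quick_calibrate() */
		legacy_pic                    = &null_legacy_pic;    /* no 8259 on this SoC */
	}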
diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 6a3cefc7dda1..4d4468e9f47c 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -37,6 +37,7 @@
37#include <linux/cpu.h> 37#include <linux/cpu.h>
38#include <linux/notifier.h> 38#include <linux/notifier.h>
39#include <linux/uaccess.h> 39#include <linux/uaccess.h>
40#include <linux/gfp.h>
40 41
41#include <asm/processor.h> 42#include <asm/processor.h>
42#include <asm/msr.h> 43#include <asm/msr.h>
@@ -172,23 +173,18 @@ static long msr_ioctl(struct file *file, unsigned int ioc, unsigned long arg)
172 173
173static int msr_open(struct inode *inode, struct file *file) 174static int msr_open(struct inode *inode, struct file *file)
174{ 175{
175 unsigned int cpu = iminor(file->f_path.dentry->d_inode); 176 unsigned int cpu;
176 struct cpuinfo_x86 *c = &cpu_data(cpu); 177 struct cpuinfo_x86 *c;
177 int ret = 0;
178 178
179 lock_kernel();
180 cpu = iminor(file->f_path.dentry->d_inode); 179 cpu = iminor(file->f_path.dentry->d_inode);
180 if (cpu >= nr_cpu_ids || !cpu_online(cpu))
181 return -ENXIO; /* No such CPU */
181 182
182 if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
183 ret = -ENXIO; /* No such CPU */
184 goto out;
185 }
186 c = &cpu_data(cpu); 183 c = &cpu_data(cpu);
187 if (!cpu_has(c, X86_FEATURE_MSR)) 184 if (!cpu_has(c, X86_FEATURE_MSR))
188 ret = -EIO; /* MSR not supported */ 185 return -EIO; /* MSR not supported */
189out: 186
190 unlock_kernel(); 187 return 0;
191 return ret;
192} 188}
193 189
194/* 190/*
@@ -251,7 +247,7 @@ static int __init msr_init(void)
251 int i, err = 0; 247 int i, err = 0;
252 i = 0; 248 i = 0;
253 249
254 if (register_chrdev(MSR_MAJOR, "cpu/msr", &msr_fops)) { 250 if (__register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", &msr_fops)) {
255 printk(KERN_ERR "msr: unable to get major %d for msr\n", 251 printk(KERN_ERR "msr: unable to get major %d for msr\n",
256 MSR_MAJOR); 252 MSR_MAJOR);
257 err = -EBUSY; 253 err = -EBUSY;
@@ -279,7 +275,7 @@ out_class:
279 msr_device_destroy(i); 275 msr_device_destroy(i);
280 class_destroy(msr_class); 276 class_destroy(msr_class);
281out_chrdev: 277out_chrdev:
282 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 278 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
283out: 279out:
284 return err; 280 return err;
285} 281}
@@ -290,7 +286,7 @@ static void __exit msr_exit(void)
290 for_each_online_cpu(cpu) 286 for_each_online_cpu(cpu)
291 msr_device_destroy(cpu); 287 msr_device_destroy(cpu);
292 class_destroy(msr_class); 288 class_destroy(msr_class);
293 unregister_chrdev(MSR_MAJOR, "cpu/msr"); 289 __unregister_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr");
294 unregister_hotcpu_notifier(&msr_class_cpu_notifier); 290 unregister_hotcpu_notifier(&msr_class_cpu_notifier);
295} 291}
296 292
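The msr.c change above swaps register_chrdev(), which claims every minor of the major, for __register_chrdev(), which takes an explicit base minor and count, so the driver now owns only minors 0..NR_CPUS-1. A short sketch of the call, assuming the standard <linux/fs.h> signature; the wrapper name is illustrative:

	#include <linux/fs.h>

	/* Hedged sketch: register a bounded minor range for the msr device. */
	static int msr_chrdev_register_sketch(const struct file_operations *fops)
	{
		int err = __register_chrdev(MSR_MAJOR, 0, NR_CPUS, "cpu/msr", fops);

		return err < 0 ? err : 0;	/* negative errno on failure */
	}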
diff --git a/arch/x86/kernel/olpc.c b/arch/x86/kernel/olpc.c
index 4006c522adc7..8297160c41b3 100644
--- a/arch/x86/kernel/olpc.c
+++ b/arch/x86/kernel/olpc.c
@@ -17,7 +17,9 @@
17#include <linux/spinlock.h> 17#include <linux/spinlock.h>
18#include <linux/io.h> 18#include <linux/io.h>
19#include <linux/string.h> 19#include <linux/string.h>
20
20#include <asm/geode.h> 21#include <asm/geode.h>
22#include <asm/setup.h>
21#include <asm/olpc.h> 23#include <asm/olpc.h>
22 24
23#ifdef CONFIG_OPEN_FIRMWARE 25#ifdef CONFIG_OPEN_FIRMWARE
@@ -212,7 +214,7 @@ static int __init olpc_init(void)
212 unsigned char *romsig; 214 unsigned char *romsig;
213 215
214 /* The ioremap check is dangerous; limit what we run it on */ 216 /* The ioremap check is dangerous; limit what we run it on */
215 if (!is_geode() || geode_has_vsa2()) 217 if (!is_geode() || cs5535_has_vsa2())
216 return 0; 218 return 0;
217 219
218 spin_lock_init(&ec_lock); 220 spin_lock_init(&ec_lock);
@@ -243,9 +245,11 @@ static int __init olpc_init(void)
243 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0, 245 olpc_ec_cmd(EC_FIRMWARE_REV, NULL, 0,
244 (unsigned char *) &olpc_platform_info.ecver, 1); 246 (unsigned char *) &olpc_platform_info.ecver, 1);
245 247
246 /* check to see if the VSA exists */ 248#ifdef CONFIG_PCI_OLPC
247 if (geode_has_vsa2()) 249 /* If the VSA exists let it emulate PCI, if not emulate in kernel */
248 olpc_platform_info.flags |= OLPC_F_VSA; 250 if (!cs5535_has_vsa2())
251 x86_init.pci.arch_init = pci_olpc_init;
252#endif
249 253
250 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n", 254 printk(KERN_INFO "OLPC board revision %s%X (EC=%x)\n",
251 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "", 255 ((olpc_platform_info.boardrev & 0xf) < 8) ? "pre" : "",
diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c
index 3a7c5a44082e..676b8c77a976 100644
--- a/arch/x86/kernel/paravirt-spinlocks.c
+++ b/arch/x86/kernel/paravirt-spinlocks.c
@@ -8,9 +8,9 @@
8#include <asm/paravirt.h> 8#include <asm/paravirt.h>
9 9
10static inline void 10static inline void
11default_spin_lock_flags(raw_spinlock_t *lock, unsigned long flags) 11default_spin_lock_flags(arch_spinlock_t *lock, unsigned long flags)
12{ 12{
13 __raw_spin_lock(lock); 13 arch_spin_lock(lock);
14} 14}
15 15
16struct pv_lock_ops pv_lock_ops = { 16struct pv_lock_ops pv_lock_ops = {
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 1b1739d16310..1db183ed7c01 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x86/kernel/paravirt.c
@@ -428,10 +428,6 @@ struct pv_mmu_ops pv_mmu_ops = {
428 .ptep_modify_prot_start = __ptep_modify_prot_start, 428 .ptep_modify_prot_start = __ptep_modify_prot_start,
429 .ptep_modify_prot_commit = __ptep_modify_prot_commit, 429 .ptep_modify_prot_commit = __ptep_modify_prot_commit,
430 430
431#ifdef CONFIG_HIGHPTE
432 .kmap_atomic_pte = kmap_atomic,
433#endif
434
435#if PAGETABLE_LEVELS >= 3 431#if PAGETABLE_LEVELS >= 3
436#ifdef CONFIG_X86_PAE 432#ifdef CONFIG_X86_PAE
437 .set_pte_atomic = native_set_pte_atomic, 433 .set_pte_atomic = native_set_pte_atomic,
diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c
index 971a3bec47a8..fb99f7edb341 100644
--- a/arch/x86/kernel/pci-calgary_64.c
+++ b/arch/x86/kernel/pci-calgary_64.c
@@ -31,7 +31,7 @@
31#include <linux/string.h> 31#include <linux/string.h>
32#include <linux/crash_dump.h> 32#include <linux/crash_dump.h>
33#include <linux/dma-mapping.h> 33#include <linux/dma-mapping.h>
34#include <linux/bitops.h> 34#include <linux/bitmap.h>
35#include <linux/pci_ids.h> 35#include <linux/pci_ids.h>
36#include <linux/pci.h> 36#include <linux/pci.h>
37#include <linux/delay.h> 37#include <linux/delay.h>
@@ -46,6 +46,7 @@
46#include <asm/dma.h> 46#include <asm/dma.h>
47#include <asm/rio.h> 47#include <asm/rio.h>
48#include <asm/bios_ebda.h> 48#include <asm/bios_ebda.h>
49#include <asm/x86_init.h>
49 50
50#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT 51#ifdef CONFIG_CALGARY_IOMMU_ENABLED_BY_DEFAULT
51int use_calgary __read_mostly = 1; 52int use_calgary __read_mostly = 1;
@@ -211,7 +212,7 @@ static void iommu_range_reserve(struct iommu_table *tbl,
211 212
212 spin_lock_irqsave(&tbl->it_lock, flags); 213 spin_lock_irqsave(&tbl->it_lock, flags);
213 214
214 iommu_area_reserve(tbl->it_map, index, npages); 215 bitmap_set(tbl->it_map, index, npages);
215 216
216 spin_unlock_irqrestore(&tbl->it_lock, flags); 217 spin_unlock_irqrestore(&tbl->it_lock, flags);
217} 218}
@@ -244,7 +245,7 @@ static unsigned long iommu_range_alloc(struct device *dev,
244 if (panic_on_overflow) 245 if (panic_on_overflow)
245 panic("Calgary: fix the allocator.\n"); 246 panic("Calgary: fix the allocator.\n");
246 else 247 else
247 return bad_dma_address; 248 return DMA_ERROR_CODE;
248 } 249 }
249 } 250 }
250 251
@@ -260,12 +261,15 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
260 void *vaddr, unsigned int npages, int direction) 261 void *vaddr, unsigned int npages, int direction)
261{ 262{
262 unsigned long entry; 263 unsigned long entry;
263 dma_addr_t ret = bad_dma_address; 264 dma_addr_t ret;
264 265
265 entry = iommu_range_alloc(dev, tbl, npages); 266 entry = iommu_range_alloc(dev, tbl, npages);
266 267
267 if (unlikely(entry == bad_dma_address)) 268 if (unlikely(entry == DMA_ERROR_CODE)) {
268 goto error; 269 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
270 "iommu %p\n", npages, tbl);
271 return DMA_ERROR_CODE;
272 }
269 273
270 /* set the return dma address */ 274 /* set the return dma address */
271 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK); 275 ret = (entry << PAGE_SHIFT) | ((unsigned long)vaddr & ~PAGE_MASK);
@@ -273,13 +277,7 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
273 /* put the TCEs in the HW table */ 277 /* put the TCEs in the HW table */
274 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK, 278 tce_build(tbl, entry, npages, (unsigned long)vaddr & PAGE_MASK,
275 direction); 279 direction);
276
277 return ret; 280 return ret;
278
279error:
280 printk(KERN_WARNING "Calgary: failed to allocate %u pages in "
281 "iommu %p\n", npages, tbl);
282 return bad_dma_address;
283} 281}
284 282
285static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 283static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
@@ -290,8 +288,8 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
290 unsigned long flags; 288 unsigned long flags;
291 289
292 /* were we called with bad_dma_address? */ 290 /* were we called with bad_dma_address? */
293 badend = bad_dma_address + (EMERGENCY_PAGES * PAGE_SIZE); 291 badend = DMA_ERROR_CODE + (EMERGENCY_PAGES * PAGE_SIZE);
294 if (unlikely((dma_addr >= bad_dma_address) && (dma_addr < badend))) { 292 if (unlikely((dma_addr >= DMA_ERROR_CODE) && (dma_addr < badend))) {
295 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA " 293 WARN(1, KERN_ERR "Calgary: driver tried unmapping bad DMA "
296 "address 0x%Lx\n", dma_addr); 294 "address 0x%Lx\n", dma_addr);
297 return; 295 return;
@@ -305,7 +303,7 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
305 303
306 spin_lock_irqsave(&tbl->it_lock, flags); 304 spin_lock_irqsave(&tbl->it_lock, flags);
307 305
308 iommu_area_free(tbl->it_map, entry, npages); 306 bitmap_clear(tbl->it_map, entry, npages);
309 307
310 spin_unlock_irqrestore(&tbl->it_lock, flags); 308 spin_unlock_irqrestore(&tbl->it_lock, flags);
311} 309}
@@ -318,13 +316,15 @@ static inline struct iommu_table *find_iommu_table(struct device *dev)
318 316
319 pdev = to_pci_dev(dev); 317 pdev = to_pci_dev(dev);
320 318
319 /* search up the device tree for an iommu */
321 pbus = pdev->bus; 320 pbus = pdev->bus;
322 321 do {
323 /* is the device behind a bridge? Look for the root bus */ 322 tbl = pci_iommu(pbus);
324 while (pbus->parent) 323 if (tbl && tbl->it_busno == pbus->number)
324 break;
325 tbl = NULL;
325 pbus = pbus->parent; 326 pbus = pbus->parent;
326 327 } while (pbus);
327 tbl = pci_iommu(pbus);
328 328
329 BUG_ON(tbl && (tbl->it_busno != pbus->number)); 329 BUG_ON(tbl && (tbl->it_busno != pbus->number));
330 330
@@ -373,7 +373,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE); 373 npages = iommu_num_pages(vaddr, s->length, PAGE_SIZE);
374 374
375 entry = iommu_range_alloc(dev, tbl, npages); 375 entry = iommu_range_alloc(dev, tbl, npages);
376 if (entry == bad_dma_address) { 376 if (entry == DMA_ERROR_CODE) {
377 /* makes sure unmap knows to stop */ 377 /* makes sure unmap knows to stop */
378 s->dma_length = 0; 378 s->dma_length = 0;
379 goto error; 379 goto error;
@@ -391,7 +391,7 @@ static int calgary_map_sg(struct device *dev, struct scatterlist *sg,
391error: 391error:
392 calgary_unmap_sg(dev, sg, nelems, dir, NULL); 392 calgary_unmap_sg(dev, sg, nelems, dir, NULL);
393 for_each_sg(sg, s, nelems, i) { 393 for_each_sg(sg, s, nelems, i) {
394 sg->dma_address = bad_dma_address; 394 sg->dma_address = DMA_ERROR_CODE;
395 sg->dma_length = 0; 395 sg->dma_length = 0;
396 } 396 }
397 return 0; 397 return 0;
@@ -446,7 +446,7 @@ static void* calgary_alloc_coherent(struct device *dev, size_t size,
446 446
447 /* set up tces to cover the allocated range */ 447 /* set up tces to cover the allocated range */
448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL); 448 mapping = iommu_alloc(dev, tbl, ret, npages, DMA_BIDIRECTIONAL);
449 if (mapping == bad_dma_address) 449 if (mapping == DMA_ERROR_CODE)
450 goto free; 450 goto free;
451 *dma_handle = mapping; 451 *dma_handle = mapping;
452 return ret; 452 return ret;
@@ -727,7 +727,7 @@ static void __init calgary_reserve_regions(struct pci_dev *dev)
727 struct iommu_table *tbl = pci_iommu(dev->bus); 727 struct iommu_table *tbl = pci_iommu(dev->bus);
728 728
729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */ 729 /* reserve EMERGENCY_PAGES from bad_dma_address and up */
730 iommu_range_reserve(tbl, bad_dma_address, EMERGENCY_PAGES); 730 iommu_range_reserve(tbl, DMA_ERROR_CODE, EMERGENCY_PAGES);
731 731
732 /* avoid the BIOS/VGA first 640KB-1MB region */ 732 /* avoid the BIOS/VGA first 640KB-1MB region */
733 /* for CalIOC2 - avoid the entire first MB */ 733 /* for CalIOC2 - avoid the entire first MB */
@@ -1309,7 +1309,7 @@ static void calgary_init_bitmap_from_tce_table(struct iommu_table *tbl)
1309/* 1309/*
1310 * get_tce_space_from_tar(): 1310 * get_tce_space_from_tar():
1311 * Function for kdump case. Get the tce tables from first kernel 1311 * Function for kdump case. Get the tce tables from first kernel
1312 * by reading the contents of the base adress register of calgary iommu 1312 * by reading the contents of the base address register of calgary iommu
1313 */ 1313 */
1314static void __init get_tce_space_from_tar(void) 1314static void __init get_tce_space_from_tar(void)
1315{ 1315{
@@ -1344,6 +1344,23 @@ static void __init get_tce_space_from_tar(void)
1344 return; 1344 return;
1345} 1345}
1346 1346
1347static int __init calgary_iommu_init(void)
1348{
1349 int ret;
1350
1351 /* ok, we're trying to use Calgary - let's roll */
1352 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1353
1354 ret = calgary_init();
1355 if (ret) {
1356 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1357 "falling back to no_iommu\n", ret);
1358 return ret;
1359 }
1360
1361 return 0;
1362}
1363
1347void __init detect_calgary(void) 1364void __init detect_calgary(void)
1348{ 1365{
1349 int bus; 1366 int bus;
@@ -1357,7 +1374,7 @@ void __init detect_calgary(void)
1357 * if the user specified iommu=off or iommu=soft or we found 1374 * if the user specified iommu=off or iommu=soft or we found
1358 * another HW IOMMU already, bail out. 1375 * another HW IOMMU already, bail out.
1359 */ 1376 */
1360 if (swiotlb || no_iommu || iommu_detected) 1377 if (no_iommu || iommu_detected)
1361 return; 1378 return;
1362 1379
1363 if (!use_calgary) 1380 if (!use_calgary)
@@ -1442,9 +1459,7 @@ void __init detect_calgary(void)
1442 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", 1459 printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n",
1443 specified_table_size); 1460 specified_table_size);
1444 1461
1445 /* swiotlb for devices that aren't behind the Calgary. */ 1462 x86_init.iommu.iommu_init = calgary_iommu_init;
1446 if (max_pfn > MAX_DMA32_PFN)
1447 swiotlb = 1;
1448 } 1463 }
1449 return; 1464 return;
1450 1465
@@ -1457,35 +1472,6 @@ cleanup:
1457 } 1472 }
1458} 1473}
1459 1474
1460int __init calgary_iommu_init(void)
1461{
1462 int ret;
1463
1464 if (no_iommu || (swiotlb && !calgary_detected))
1465 return -ENODEV;
1466
1467 if (!calgary_detected)
1468 return -ENODEV;
1469
1470 /* ok, we're trying to use Calgary - let's roll */
1471 printk(KERN_INFO "PCI-DMA: Using Calgary IOMMU\n");
1472
1473 ret = calgary_init();
1474 if (ret) {
1475 printk(KERN_ERR "PCI-DMA: Calgary init failed %d, "
1476 "falling back to no_iommu\n", ret);
1477 return ret;
1478 }
1479
1480 force_iommu = 1;
1481 bad_dma_address = 0x0;
1482 /* dma_ops is set to swiotlb or nommu */
1483 if (!dma_ops)
1484 dma_ops = &nommu_dma_ops;
1485
1486 return 0;
1487}
1488
1489static int __init calgary_parse_options(char *p) 1475static int __init calgary_parse_options(char *p)
1490{ 1476{
1491 unsigned int bridge; 1477 unsigned int bridge;
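With the global bad_dma_address gone, the Calgary code signals a failed mapping through DMA_ERROR_CODE and leaves the check to the common DMA API. A minimal sketch of the portable test a driver is expected to use instead of comparing against a magic address (example_map_buffer is a hypothetical helper, not part of this patch):

#include <linux/dma-mapping.h>
#include <linux/errno.h>
#include <linux/pci.h>

static int example_map_buffer(struct pci_dev *pdev, void *buf, size_t len,
                              dma_addr_t *handle)
{
        *handle = dma_map_single(&pdev->dev, buf, len, DMA_TO_DEVICE);
        if (dma_mapping_error(&pdev->dev, *handle))
                return -ENOMEM;         /* no magic-address comparison */
        return 0;
}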
diff --git a/arch/x86/kernel/pci-dma.c b/arch/x86/kernel/pci-dma.c
index a6e804d16c35..4b7e3d8b01dd 100644
--- a/arch/x86/kernel/pci-dma.c
+++ b/arch/x86/kernel/pci-dma.c
@@ -2,6 +2,7 @@
2#include <linux/dma-debug.h> 2#include <linux/dma-debug.h>
3#include <linux/dmar.h> 3#include <linux/dmar.h>
4#include <linux/bootmem.h> 4#include <linux/bootmem.h>
5#include <linux/gfp.h>
5#include <linux/pci.h> 6#include <linux/pci.h>
6#include <linux/kmemleak.h> 7#include <linux/kmemleak.h>
7 8
@@ -11,10 +12,11 @@
11#include <asm/gart.h> 12#include <asm/gart.h>
12#include <asm/calgary.h> 13#include <asm/calgary.h>
13#include <asm/amd_iommu.h> 14#include <asm/amd_iommu.h>
15#include <asm/x86_init.h>
14 16
15static int forbid_dac __read_mostly; 17static int forbid_dac __read_mostly;
16 18
17struct dma_map_ops *dma_ops; 19struct dma_map_ops *dma_ops = &nommu_dma_ops;
18EXPORT_SYMBOL(dma_ops); 20EXPORT_SYMBOL(dma_ops);
19 21
20static int iommu_sac_force __read_mostly; 22static int iommu_sac_force __read_mostly;
@@ -37,14 +39,11 @@ int iommu_detected __read_mostly = 0;
37 * This variable becomes 1 if iommu=pt is passed on the kernel command line. 39 * This variable becomes 1 if iommu=pt is passed on the kernel command line.
38 * If this variable is 1, IOMMU implementations do no DMA translation for 40 * If this variable is 1, IOMMU implementations do no DMA translation for
39 * devices and allow every device to access to whole physical memory. This is 41 * devices and allow every device to access to whole physical memory. This is
40 * useful if a user want to use an IOMMU only for KVM device assignment to 42 * useful if a user wants to use an IOMMU only for KVM device assignment to
41 * guests and not for driver dma translation. 43 * guests and not for driver dma translation.
42 */ 44 */
43int iommu_pass_through __read_mostly; 45int iommu_pass_through __read_mostly;
44 46
45dma_addr_t bad_dma_address __read_mostly = 0;
46EXPORT_SYMBOL(bad_dma_address);
47
48/* Dummy device used for NULL arguments (normally ISA). */ 47/* Dummy device used for NULL arguments (normally ISA). */
49struct device x86_dma_fallback_dev = { 48struct device x86_dma_fallback_dev = {
50 .init_name = "fallback device", 49 .init_name = "fallback device",
@@ -67,7 +66,7 @@ int dma_set_mask(struct device *dev, u64 mask)
67} 66}
68EXPORT_SYMBOL(dma_set_mask); 67EXPORT_SYMBOL(dma_set_mask);
69 68
70#ifdef CONFIG_X86_64 69#if defined(CONFIG_X86_64) && !defined(CONFIG_NUMA)
71static __initdata void *dma32_bootmem_ptr; 70static __initdata void *dma32_bootmem_ptr;
72static unsigned long dma32_bootmem_size __initdata = (128ULL<<20); 71static unsigned long dma32_bootmem_size __initdata = (128ULL<<20);
73 72
@@ -118,27 +117,33 @@ static void __init dma32_free_bootmem(void)
118 dma32_bootmem_ptr = NULL; 117 dma32_bootmem_ptr = NULL;
119 dma32_bootmem_size = 0; 118 dma32_bootmem_size = 0;
120} 119}
120#else
121void __init dma32_reserve_bootmem(void)
122{
123}
124static void __init dma32_free_bootmem(void)
125{
126}
127
121#endif 128#endif
122 129
123void __init pci_iommu_alloc(void) 130void __init pci_iommu_alloc(void)
124{ 131{
125#ifdef CONFIG_X86_64
126 /* free the range so iommu could get some range less than 4G */ 132 /* free the range so iommu could get some range less than 4G */
127 dma32_free_bootmem(); 133 dma32_free_bootmem();
128#endif
129 134
130 /* 135 if (pci_swiotlb_detect())
131 * The order of these functions is important for 136 goto out;
132 * fall-back/fail-over reasons 137
133 */
134 gart_iommu_hole_init(); 138 gart_iommu_hole_init();
135 139
136 detect_calgary(); 140 detect_calgary();
137 141
138 detect_intel_iommu(); 142 detect_intel_iommu();
139 143
144 /* needs to be called after gart_iommu_hole_init */
140 amd_iommu_detect(); 145 amd_iommu_detect();
141 146out:
142 pci_swiotlb_init(); 147 pci_swiotlb_init();
143} 148}
144 149
@@ -214,7 +219,7 @@ static __init int iommu_setup(char *p)
214 if (!strncmp(p, "allowdac", 8)) 219 if (!strncmp(p, "allowdac", 8))
215 forbid_dac = 0; 220 forbid_dac = 0;
216 if (!strncmp(p, "nodac", 5)) 221 if (!strncmp(p, "nodac", 5))
217 forbid_dac = -1; 222 forbid_dac = 1;
218 if (!strncmp(p, "usedac", 6)) { 223 if (!strncmp(p, "usedac", 6)) {
219 forbid_dac = -1; 224 forbid_dac = -1;
220 return 1; 225 return 1;
@@ -289,25 +294,17 @@ static int __init pci_iommu_init(void)
289#ifdef CONFIG_PCI 294#ifdef CONFIG_PCI
290 dma_debug_add_bus(&pci_bus_type); 295 dma_debug_add_bus(&pci_bus_type);
291#endif 296#endif
297 x86_init.iommu.iommu_init();
292 298
293 calgary_iommu_init(); 299 if (swiotlb) {
294 300 printk(KERN_INFO "PCI-DMA: "
295 intel_iommu_init(); 301 "Using software bounce buffering for IO (SWIOTLB)\n");
296 302 swiotlb_print_info();
297 amd_iommu_init(); 303 } else
304 swiotlb_free();
298 305
299 gart_iommu_init();
300
301 no_iommu_init();
302 return 0; 306 return 0;
303} 307}
304
305void pci_iommu_shutdown(void)
306{
307 gart_iommu_shutdown();
308
309 amd_iommu_shutdown();
310}
311/* Must execute after PCI subsystem */ 308/* Must execute after PCI subsystem */
312rootfs_initcall(pci_iommu_init); 309rootfs_initcall(pci_iommu_init);
313 310
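pci-dma.c now funnels initialization through a single hook: the detector that finds its hardware sets x86_init.iommu.iommu_init (as detect_calgary() does above), and pci_iommu_init() calls exactly one backend instead of running every IOMMU driver in a fixed order. A condensed sketch of that pattern; the struct and names below are illustrative, only the hook assignment mirrors the patch:

struct example_iommu_init_ops {
        int (*iommu_init)(void);        /* set by whichever detector wins */
};

static int example_nommu_init(void)
{
        return 0;                       /* default: nothing to do */
}

static struct example_iommu_init_ops example_x86_iommu = {
        .iommu_init = example_nommu_init,
};

static int example_pci_iommu_init(void)
{
        /* detect_calgary() etc. may have replaced the hook by now */
        return example_x86_iommu.iommu_init();
}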
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index a7f1b64f86e0..0f7f130caa67 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -23,12 +23,13 @@
23#include <linux/module.h> 23#include <linux/module.h>
24#include <linux/topology.h> 24#include <linux/topology.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/bitops.h> 26#include <linux/bitmap.h>
27#include <linux/kdebug.h> 27#include <linux/kdebug.h>
28#include <linux/scatterlist.h> 28#include <linux/scatterlist.h>
29#include <linux/iommu-helper.h> 29#include <linux/iommu-helper.h>
30#include <linux/sysdev.h> 30#include <linux/sysdev.h>
31#include <linux/io.h> 31#include <linux/io.h>
32#include <linux/gfp.h>
32#include <asm/atomic.h> 33#include <asm/atomic.h>
33#include <asm/mtrr.h> 34#include <asm/mtrr.h>
34#include <asm/pgtable.h> 35#include <asm/pgtable.h>
@@ -39,6 +40,7 @@
39#include <asm/swiotlb.h> 40#include <asm/swiotlb.h>
40#include <asm/dma.h> 41#include <asm/dma.h>
41#include <asm/k8.h> 42#include <asm/k8.h>
43#include <asm/x86_init.h>
42 44
43static unsigned long iommu_bus_base; /* GART remapping area (physical) */ 45static unsigned long iommu_bus_base; /* GART remapping area (physical) */
44static unsigned long iommu_size; /* size of remapping area bytes */ 46static unsigned long iommu_size; /* size of remapping area bytes */
@@ -46,6 +48,8 @@ static unsigned long iommu_pages; /* .. and in pages */
46 48
47static u32 *iommu_gatt_base; /* Remapping table */ 49static u32 *iommu_gatt_base; /* Remapping table */
48 50
51static dma_addr_t bad_dma_addr;
52
49/* 53/*
50 * If this is disabled the IOMMU will use an optimized flushing strategy 54 * If this is disabled the IOMMU will use an optimized flushing strategy
51 * of only flushing when an mapping is reused. With it true the GART is 55 * of only flushing when an mapping is reused. With it true the GART is
@@ -92,7 +96,7 @@ static unsigned long alloc_iommu(struct device *dev, int size,
92 96
93 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev), 97 base_index = ALIGN(iommu_bus_base & dma_get_seg_boundary(dev),
94 PAGE_SIZE) >> PAGE_SHIFT; 98 PAGE_SIZE) >> PAGE_SHIFT;
95 boundary_size = ALIGN((unsigned long long)dma_get_seg_boundary(dev) + 1, 99 boundary_size = ALIGN((u64)dma_get_seg_boundary(dev) + 1,
96 PAGE_SIZE) >> PAGE_SHIFT; 100 PAGE_SIZE) >> PAGE_SHIFT;
97 101
98 spin_lock_irqsave(&iommu_bitmap_lock, flags); 102 spin_lock_irqsave(&iommu_bitmap_lock, flags);
@@ -123,7 +127,7 @@ static void free_iommu(unsigned long offset, int size)
123 unsigned long flags; 127 unsigned long flags;
124 128
125 spin_lock_irqsave(&iommu_bitmap_lock, flags); 129 spin_lock_irqsave(&iommu_bitmap_lock, flags);
126 iommu_area_free(iommu_gart_bitmap, offset, size); 130 bitmap_clear(iommu_gart_bitmap, offset, size);
127 if (offset >= next_bit) 131 if (offset >= next_bit)
128 next_bit = offset + size; 132 next_bit = offset + size;
129 spin_unlock_irqrestore(&iommu_bitmap_lock, flags); 133 spin_unlock_irqrestore(&iommu_bitmap_lock, flags);
@@ -216,7 +220,7 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem,
216 if (panic_on_overflow) 220 if (panic_on_overflow)
217 panic("dma_map_area overflow %lu bytes\n", size); 221 panic("dma_map_area overflow %lu bytes\n", size);
218 iommu_full(dev, size, dir); 222 iommu_full(dev, size, dir);
219 return bad_dma_address; 223 return bad_dma_addr;
220 } 224 }
221 225
222 for (i = 0; i < npages; i++) { 226 for (i = 0; i < npages; i++) {
@@ -294,7 +298,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
294 int i; 298 int i;
295 299
296#ifdef CONFIG_IOMMU_DEBUG 300#ifdef CONFIG_IOMMU_DEBUG
297 printk(KERN_DEBUG "dma_map_sg overflow\n"); 301 pr_debug("dma_map_sg overflow\n");
298#endif 302#endif
299 303
300 for_each_sg(sg, s, nents, i) { 304 for_each_sg(sg, s, nents, i) {
@@ -302,7 +306,7 @@ static int dma_map_sg_nonforce(struct device *dev, struct scatterlist *sg,
302 306
303 if (nonforced_iommu(dev, addr, s->length)) { 307 if (nonforced_iommu(dev, addr, s->length)) {
304 addr = dma_map_area(dev, addr, s->length, dir, 0); 308 addr = dma_map_area(dev, addr, s->length, dir, 0);
305 if (addr == bad_dma_address) { 309 if (addr == bad_dma_addr) {
306 if (i > 0) 310 if (i > 0)
307 gart_unmap_sg(dev, sg, i, dir, NULL); 311 gart_unmap_sg(dev, sg, i, dir, NULL);
308 nents = 0; 312 nents = 0;
@@ -389,12 +393,14 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
389 if (!dev) 393 if (!dev)
390 dev = &x86_dma_fallback_dev; 394 dev = &x86_dma_fallback_dev;
391 395
392 out = 0; 396 out = 0;
393 start = 0; 397 start = 0;
394 start_sg = sgmap = sg; 398 start_sg = sg;
395 seg_size = 0; 399 sgmap = sg;
396 max_seg_size = dma_get_max_seg_size(dev); 400 seg_size = 0;
397 ps = NULL; /* shut up gcc */ 401 max_seg_size = dma_get_max_seg_size(dev);
402 ps = NULL; /* shut up gcc */
403
398 for_each_sg(sg, s, nents, i) { 404 for_each_sg(sg, s, nents, i) {
399 dma_addr_t addr = sg_phys(s); 405 dma_addr_t addr = sg_phys(s);
400 406
@@ -417,11 +423,12 @@ static int gart_map_sg(struct device *dev, struct scatterlist *sg, int nents,
417 sgmap, pages, need) < 0) 423 sgmap, pages, need) < 0)
418 goto error; 424 goto error;
419 out++; 425 out++;
420 seg_size = 0; 426
421 sgmap = sg_next(sgmap); 427 seg_size = 0;
422 pages = 0; 428 sgmap = sg_next(sgmap);
423 start = i; 429 pages = 0;
424 start_sg = s; 430 start = i;
431 start_sg = s;
425 } 432 }
426 } 433 }
427 434
@@ -455,7 +462,7 @@ error:
455 462
456 iommu_full(dev, pages << PAGE_SHIFT, dir); 463 iommu_full(dev, pages << PAGE_SHIFT, dir);
457 for_each_sg(sg, s, nents, i) 464 for_each_sg(sg, s, nents, i)
458 s->dma_address = bad_dma_address; 465 s->dma_address = bad_dma_addr;
459 return 0; 466 return 0;
460} 467}
461 468
@@ -479,7 +486,7 @@ gart_alloc_coherent(struct device *dev, size_t size, dma_addr_t *dma_addr,
479 DMA_BIDIRECTIONAL, align_mask); 486 DMA_BIDIRECTIONAL, align_mask);
480 487
481 flush_gart(); 488 flush_gart();
482 if (paddr != bad_dma_address) { 489 if (paddr != bad_dma_addr) {
483 *dma_addr = paddr; 490 *dma_addr = paddr;
484 return page_address(page); 491 return page_address(page);
485 } 492 }
@@ -499,6 +506,11 @@ gart_free_coherent(struct device *dev, size_t size, void *vaddr,
499 free_pages((unsigned long)vaddr, get_order(size)); 506 free_pages((unsigned long)vaddr, get_order(size));
500} 507}
501 508
509static int gart_mapping_error(struct device *dev, dma_addr_t dma_addr)
510{
511 return (dma_addr == bad_dma_addr);
512}
513
502static int no_agp; 514static int no_agp;
503 515
504static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size) 516static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
@@ -515,7 +527,7 @@ static __init unsigned long check_iommu_size(unsigned long aper, u64 aper_size)
515 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a; 527 iommu_size -= round_up(a, PMD_PAGE_SIZE) - a;
516 528
517 if (iommu_size < 64*1024*1024) { 529 if (iommu_size < 64*1024*1024) {
518 printk(KERN_WARNING 530 pr_warning(
519 "PCI-DMA: Warning: Small IOMMU %luMB." 531 "PCI-DMA: Warning: Small IOMMU %luMB."
520 " Consider increasing the AGP aperture in BIOS\n", 532 " Consider increasing the AGP aperture in BIOS\n",
521 iommu_size >> 20); 533 iommu_size >> 20);
@@ -553,6 +565,9 @@ static void enable_gart_translations(void)
553 565
554 enable_gart_translation(dev, __pa(agp_gatt_table)); 566 enable_gart_translation(dev, __pa(agp_gatt_table));
555 } 567 }
568
569 /* Flush the GART-TLB to remove stale entries */
570 k8_flush_garts();
556} 571}
557 572
558/* 573/*
@@ -570,28 +585,32 @@ void set_up_gart_resume(u32 aper_order, u32 aper_alloc)
570 aperture_alloc = aper_alloc; 585 aperture_alloc = aper_alloc;
571} 586}
572 587
573static int gart_resume(struct sys_device *dev) 588static void gart_fixup_northbridges(struct sys_device *dev)
574{ 589{
575 printk(KERN_INFO "PCI-DMA: Resuming GART IOMMU\n"); 590 int i;
576 591
577 if (fix_up_north_bridges) { 592 if (!fix_up_north_bridges)
578 int i; 593 return;
579 594
580 printk(KERN_INFO "PCI-DMA: Restoring GART aperture settings\n"); 595 pr_info("PCI-DMA: Restoring GART aperture settings\n");
581 596
582 for (i = 0; i < num_k8_northbridges; i++) { 597 for (i = 0; i < num_k8_northbridges; i++) {
583 struct pci_dev *dev = k8_northbridges[i]; 598 struct pci_dev *dev = k8_northbridges[i];
584 599
585 /* 600 /*
586 * Don't enable translations just yet. That is the next 601 * Don't enable translations just yet. That is the next
587 * step. Restore the pre-suspend aperture settings. 602 * step. Restore the pre-suspend aperture settings.
588 */ 603 */
589 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, 604 pci_write_config_dword(dev, AMD64_GARTAPERTURECTL, aperture_order << 1);
590 aperture_order << 1); 605 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE, aperture_alloc >> 25);
591 pci_write_config_dword(dev, AMD64_GARTAPERTUREBASE,
592 aperture_alloc >> 25);
593 }
594 } 606 }
607}
608
609static int gart_resume(struct sys_device *dev)
610{
611 pr_info("PCI-DMA: Resuming GART IOMMU\n");
612
613 gart_fixup_northbridges(dev);
595 614
596 enable_gart_translations(); 615 enable_gart_translations();
597 616
@@ -604,15 +623,14 @@ static int gart_suspend(struct sys_device *dev, pm_message_t state)
604} 623}
605 624
606static struct sysdev_class gart_sysdev_class = { 625static struct sysdev_class gart_sysdev_class = {
607 .name = "gart", 626 .name = "gart",
608 .suspend = gart_suspend, 627 .suspend = gart_suspend,
609 .resume = gart_resume, 628 .resume = gart_resume,
610 629
611}; 630};
612 631
613static struct sys_device device_gart = { 632static struct sys_device device_gart = {
614 .id = 0, 633 .cls = &gart_sysdev_class,
615 .cls = &gart_sysdev_class,
616}; 634};
617 635
618/* 636/*
@@ -627,7 +645,8 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
627 void *gatt; 645 void *gatt;
628 int i, error; 646 int i, error;
629 647
630 printk(KERN_INFO "PCI-DMA: Disabling AGP.\n"); 648 pr_info("PCI-DMA: Disabling AGP.\n");
649
631 aper_size = aper_base = info->aper_size = 0; 650 aper_size = aper_base = info->aper_size = 0;
632 dev = NULL; 651 dev = NULL;
633 for (i = 0; i < num_k8_northbridges; i++) { 652 for (i = 0; i < num_k8_northbridges; i++) {
@@ -645,6 +664,7 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
645 } 664 }
646 if (!aper_base) 665 if (!aper_base)
647 goto nommu; 666 goto nommu;
667
648 info->aper_base = aper_base; 668 info->aper_base = aper_base;
649 info->aper_size = aper_size >> 20; 669 info->aper_size = aper_size >> 20;
650 670
@@ -667,14 +687,14 @@ static __init int init_k8_gatt(struct agp_kern_info *info)
667 687
668 flush_gart(); 688 flush_gart();
669 689
670 printk(KERN_INFO "PCI-DMA: aperture base @ %x size %u KB\n", 690 pr_info("PCI-DMA: aperture base @ %x size %u KB\n",
671 aper_base, aper_size>>10); 691 aper_base, aper_size>>10);
672 692
673 return 0; 693 return 0;
674 694
675 nommu: 695 nommu:
676 /* Should not happen anymore */ 696 /* Should not happen anymore */
677 printk(KERN_WARNING "PCI-DMA: More than 4GB of RAM and no IOMMU\n" 697 pr_warning("PCI-DMA: More than 4GB of RAM and no IOMMU\n"
678 "falling back to iommu=soft.\n"); 698 "falling back to iommu=soft.\n");
679 return -1; 699 return -1;
680} 700}
@@ -686,14 +706,16 @@ static struct dma_map_ops gart_dma_ops = {
686 .unmap_page = gart_unmap_page, 706 .unmap_page = gart_unmap_page,
687 .alloc_coherent = gart_alloc_coherent, 707 .alloc_coherent = gart_alloc_coherent,
688 .free_coherent = gart_free_coherent, 708 .free_coherent = gart_free_coherent,
709 .mapping_error = gart_mapping_error,
689}; 710};
690 711
691void gart_iommu_shutdown(void) 712static void gart_iommu_shutdown(void)
692{ 713{
693 struct pci_dev *dev; 714 struct pci_dev *dev;
694 int i; 715 int i;
695 716
696 if (no_agp && (dma_ops != &gart_dma_ops)) 717 /* don't shutdown it if there is AGP installed */
718 if (!no_agp)
697 return; 719 return;
698 720
699 for (i = 0; i < num_k8_northbridges; i++) { 721 for (i = 0; i < num_k8_northbridges; i++) {
@@ -708,7 +730,7 @@ void gart_iommu_shutdown(void)
708 } 730 }
709} 731}
710 732
711void __init gart_iommu_init(void) 733int __init gart_iommu_init(void)
712{ 734{
713 struct agp_kern_info info; 735 struct agp_kern_info info;
714 unsigned long iommu_start; 736 unsigned long iommu_start;
@@ -717,8 +739,8 @@ void __init gart_iommu_init(void)
717 unsigned long scratch; 739 unsigned long scratch;
718 long i; 740 long i;
719 741
720 if (cache_k8_northbridges() < 0 || num_k8_northbridges == 0) 742 if (num_k8_northbridges == 0)
721 return; 743 return 0;
722 744
723#ifndef CONFIG_AGP_AMD64 745#ifndef CONFIG_AGP_AMD64
724 no_agp = 1; 746 no_agp = 1;
@@ -730,35 +752,28 @@ void __init gart_iommu_init(void)
730 (agp_copy_info(agp_bridge, &info) < 0); 752 (agp_copy_info(agp_bridge, &info) < 0);
731#endif 753#endif
732 754
733 if (swiotlb)
734 return;
735
736 /* Did we detect a different HW IOMMU? */
737 if (iommu_detected && !gart_iommu_aperture)
738 return;
739
740 if (no_iommu || 755 if (no_iommu ||
741 (!force_iommu && max_pfn <= MAX_DMA32_PFN) || 756 (!force_iommu && max_pfn <= MAX_DMA32_PFN) ||
742 !gart_iommu_aperture || 757 !gart_iommu_aperture ||
743 (no_agp && init_k8_gatt(&info) < 0)) { 758 (no_agp && init_k8_gatt(&info) < 0)) {
744 if (max_pfn > MAX_DMA32_PFN) { 759 if (max_pfn > MAX_DMA32_PFN) {
745 printk(KERN_WARNING "More than 4GB of memory " 760 pr_warning("More than 4GB of memory but GART IOMMU not available.\n");
746 "but GART IOMMU not available.\n"); 761 pr_warning("falling back to iommu=soft.\n");
747 printk(KERN_WARNING "falling back to iommu=soft.\n");
748 } 762 }
749 return; 763 return 0;
750 } 764 }
751 765
752 /* need to map that range */ 766 /* need to map that range */
753 aper_size = info.aper_size << 20; 767 aper_size = info.aper_size << 20;
754 aper_base = info.aper_base; 768 aper_base = info.aper_base;
755 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT); 769 end_pfn = (aper_base>>PAGE_SHIFT) + (aper_size>>PAGE_SHIFT);
770
756 if (end_pfn > max_low_pfn_mapped) { 771 if (end_pfn > max_low_pfn_mapped) {
757 start_pfn = (aper_base>>PAGE_SHIFT); 772 start_pfn = (aper_base>>PAGE_SHIFT);
758 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT); 773 init_memory_mapping(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
759 } 774 }
760 775
761 printk(KERN_INFO "PCI-DMA: using GART IOMMU.\n"); 776 pr_info("PCI-DMA: using GART IOMMU.\n");
762 iommu_size = check_iommu_size(info.aper_base, aper_size); 777 iommu_size = check_iommu_size(info.aper_base, aper_size);
763 iommu_pages = iommu_size >> PAGE_SHIFT; 778 iommu_pages = iommu_size >> PAGE_SHIFT;
764 779
@@ -773,8 +788,7 @@ void __init gart_iommu_init(void)
773 788
774 ret = dma_debug_resize_entries(iommu_pages); 789 ret = dma_debug_resize_entries(iommu_pages);
775 if (ret) 790 if (ret)
776 printk(KERN_DEBUG 791 pr_debug("PCI-DMA: Cannot trace all the entries\n");
777 "PCI-DMA: Cannot trace all the entries\n");
778 } 792 }
779#endif 793#endif
780 794
@@ -782,17 +796,16 @@ void __init gart_iommu_init(void)
782 * Out of IOMMU space handling. 796 * Out of IOMMU space handling.
783 * Reserve some invalid pages at the beginning of the GART. 797 * Reserve some invalid pages at the beginning of the GART.
784 */ 798 */
785 iommu_area_reserve(iommu_gart_bitmap, 0, EMERGENCY_PAGES); 799 bitmap_set(iommu_gart_bitmap, 0, EMERGENCY_PAGES);
786 800
787 agp_memory_reserved = iommu_size; 801 pr_info("PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
788 printk(KERN_INFO
789 "PCI-DMA: Reserving %luMB of IOMMU area in the AGP aperture\n",
790 iommu_size >> 20); 802 iommu_size >> 20);
791 803
792 iommu_start = aper_size - iommu_size; 804 agp_memory_reserved = iommu_size;
793 iommu_bus_base = info.aper_base + iommu_start; 805 iommu_start = aper_size - iommu_size;
794 bad_dma_address = iommu_bus_base; 806 iommu_bus_base = info.aper_base + iommu_start;
795 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT); 807 bad_dma_addr = iommu_bus_base;
808 iommu_gatt_base = agp_gatt_table + (iommu_start>>PAGE_SHIFT);
796 809
797 /* 810 /*
798 * Unmap the IOMMU part of the GART. The alias of the page is 811 * Unmap the IOMMU part of the GART. The alias of the page is
@@ -814,7 +827,7 @@ void __init gart_iommu_init(void)
814 * the pages as Not-Present: 827 * the pages as Not-Present:
815 */ 828 */
816 wbinvd(); 829 wbinvd();
817 830
818 /* 831 /*
819 * Now all caches are flushed and we can safely enable 832 * Now all caches are flushed and we can safely enable
820 * GART hardware. Doing it early leaves the possibility 833 * GART hardware. Doing it early leaves the possibility
@@ -838,6 +851,10 @@ void __init gart_iommu_init(void)
838 851
839 flush_gart(); 852 flush_gart();
840 dma_ops = &gart_dma_ops; 853 dma_ops = &gart_dma_ops;
854 x86_platform.iommu_shutdown = gart_iommu_shutdown;
855 swiotlb = 0;
856
857 return 0;
841} 858}
842 859
843void __init gart_parse_options(char *p) 860void __init gart_parse_options(char *p)
@@ -856,7 +873,7 @@ void __init gart_parse_options(char *p)
856#endif 873#endif
857 if (isdigit(*p) && get_option(&p, &arg)) 874 if (isdigit(*p) && get_option(&p, &arg))
858 iommu_size = arg; 875 iommu_size = arg;
859 if (!strncmp(p, "fullflush", 8)) 876 if (!strncmp(p, "fullflush", 9))
860 iommu_fullflush = 1; 877 iommu_fullflush = 1;
861 if (!strncmp(p, "nofullflush", 11)) 878 if (!strncmp(p, "nofullflush", 11))
862 iommu_fullflush = 0; 879 iommu_fullflush = 0;
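Both GART and Calgary drop the iommu-helper wrappers in favour of the generic bitmap routines, and the conversion is mechanical. An illustrative pair of helpers (hypothetical names; callers still hold the relevant bitmap lock, as the code above does):

#include <linux/bitmap.h>

static void example_reserve_range(unsigned long *map, unsigned long start,
                                  unsigned long npages)
{
        bitmap_set(map, start, npages);         /* was iommu_area_reserve() */
}

static void example_release_range(unsigned long *map, unsigned long start,
                                  unsigned long npages)
{
        bitmap_clear(map, start, npages);       /* was iommu_area_free() */
}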
diff --git a/arch/x86/kernel/pci-nommu.c b/arch/x86/kernel/pci-nommu.c
index a3933d4330cd..3af4af810c07 100644
--- a/arch/x86/kernel/pci-nommu.c
+++ b/arch/x86/kernel/pci-nommu.c
@@ -4,6 +4,7 @@
4#include <linux/scatterlist.h> 4#include <linux/scatterlist.h>
5#include <linux/string.h> 5#include <linux/string.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/gfp.h>
7#include <linux/pci.h> 8#include <linux/pci.h>
8#include <linux/mm.h> 9#include <linux/mm.h>
9 10
@@ -33,7 +34,7 @@ static dma_addr_t nommu_map_page(struct device *dev, struct page *page,
33 dma_addr_t bus = page_to_phys(page) + offset; 34 dma_addr_t bus = page_to_phys(page) + offset;
34 WARN_ON(size == 0); 35 WARN_ON(size == 0);
35 if (!check_addr("map_single", dev, bus, size)) 36 if (!check_addr("map_single", dev, bus, size))
36 return bad_dma_address; 37 return DMA_ERROR_CODE;
37 flush_write_buffers(); 38 flush_write_buffers();
38 return bus; 39 return bus;
39} 40}
@@ -103,12 +104,3 @@ struct dma_map_ops nommu_dma_ops = {
103 .sync_sg_for_device = nommu_sync_sg_for_device, 104 .sync_sg_for_device = nommu_sync_sg_for_device,
104 .is_phys = 1, 105 .is_phys = 1,
105}; 106};
106
107void __init no_iommu_init(void)
108{
109 if (dma_ops)
110 return;
111
112 force_iommu = 0; /* no HW IOMMU */
113 dma_ops = &nommu_dma_ops;
114}
diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c
index aaa6b7839f1e..7d2829dde20e 100644
--- a/arch/x86/kernel/pci-swiotlb.c
+++ b/arch/x86/kernel/pci-swiotlb.c
@@ -42,18 +42,31 @@ static struct dma_map_ops swiotlb_dma_ops = {
42 .dma_supported = NULL, 42 .dma_supported = NULL,
43}; 43};
44 44
45void __init pci_swiotlb_init(void) 45/*
46 * pci_swiotlb_detect - set swiotlb to 1 if necessary
47 *
48 * This returns non-zero if we are forced to use swiotlb (by the boot
49 * option).
50 */
51int __init pci_swiotlb_detect(void)
46{ 52{
53 int use_swiotlb = swiotlb | swiotlb_force;
54
47 /* don't initialize swiotlb if iommu=off (no_iommu=1) */ 55 /* don't initialize swiotlb if iommu=off (no_iommu=1) */
48#ifdef CONFIG_X86_64 56#ifdef CONFIG_X86_64
49 if ((!iommu_detected && !no_iommu && max_pfn > MAX_DMA32_PFN)) 57 if (!no_iommu && max_pfn > MAX_DMA32_PFN)
50 swiotlb = 1; 58 swiotlb = 1;
51#endif 59#endif
52 if (swiotlb_force) 60 if (swiotlb_force)
53 swiotlb = 1; 61 swiotlb = 1;
62
63 return use_swiotlb;
64}
65
66void __init pci_swiotlb_init(void)
67{
54 if (swiotlb) { 68 if (swiotlb) {
55 printk(KERN_INFO "PCI-DMA: Using software bounce buffering for IO (SWIOTLB)\n"); 69 swiotlb_init(0);
56 swiotlb_init();
57 dma_ops = &swiotlb_dma_ops; 70 dma_ops = &swiotlb_dma_ops;
58 } 71 }
59} 72}
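The swiotlb setup is now split: pci_swiotlb_detect() only decides whether bounce buffering is needed or forced, while pci_swiotlb_init() allocates the bounce pool and installs swiotlb_dma_ops. Restated in one place (condensed from the new pci_iommu_alloc() earlier in this series; bodies elided), the intended call order is:

void example_pci_iommu_alloc(void)
{
        dma32_free_bootmem();           /* give the sub-4G bootmem back first */

        if (pci_swiotlb_detect())       /* swiotlb forced: skip HW detectors */
                goto out;

        gart_iommu_hole_init();
        detect_calgary();
        detect_intel_iommu();
        amd_iommu_detect();             /* must follow gart_iommu_hole_init() */
out:
        pci_swiotlb_init();             /* allocates buffers iff swiotlb != 0 */
}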
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 5284cd2b5776..0415c3ef91b5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,11 @@
9#include <linux/pm.h> 9#include <linux/pm.h>
10#include <linux/clockchips.h> 10#include <linux/clockchips.h>
11#include <linux/random.h> 11#include <linux/random.h>
12#include <linux/user-return-notifier.h>
13#include <linux/dmi.h>
14#include <linux/utsname.h>
12#include <trace/events/power.h> 15#include <trace/events/power.h>
16#include <linux/hw_breakpoint.h>
13#include <asm/system.h> 17#include <asm/system.h>
14#include <asm/apic.h> 18#include <asm/apic.h>
15#include <asm/syscalls.h> 19#include <asm/syscalls.h>
@@ -17,6 +21,7 @@
17#include <asm/uaccess.h> 21#include <asm/uaccess.h>
18#include <asm/i387.h> 22#include <asm/i387.h>
19#include <asm/ds.h> 23#include <asm/ds.h>
24#include <asm/debugreg.h>
20 25
21unsigned long idle_halt; 26unsigned long idle_halt;
22EXPORT_SYMBOL(idle_halt); 27EXPORT_SYMBOL(idle_halt);
@@ -87,30 +92,37 @@ void exit_thread(void)
87 } 92 }
88} 93}
89 94
90void flush_thread(void) 95void show_regs(struct pt_regs *regs)
91{ 96{
92 struct task_struct *tsk = current; 97 show_registers(regs);
98 show_trace(NULL, regs, (unsigned long *)kernel_stack_pointer(regs),
99 regs->bp);
100}
93 101
94#ifdef CONFIG_X86_64 102void show_regs_common(void)
95 if (test_tsk_thread_flag(tsk, TIF_ABI_PENDING)) { 103{
96 clear_tsk_thread_flag(tsk, TIF_ABI_PENDING); 104 const char *board, *product;
97 if (test_tsk_thread_flag(tsk, TIF_IA32)) {
98 clear_tsk_thread_flag(tsk, TIF_IA32);
99 } else {
100 set_tsk_thread_flag(tsk, TIF_IA32);
101 current_thread_info()->status |= TS_COMPAT;
102 }
103 }
104#endif
105 105
106 clear_tsk_thread_flag(tsk, TIF_DEBUG); 106 board = dmi_get_system_info(DMI_BOARD_NAME);
107 if (!board)
108 board = "";
109 product = dmi_get_system_info(DMI_PRODUCT_NAME);
110 if (!product)
111 product = "";
107 112
108 tsk->thread.debugreg0 = 0; 113 printk(KERN_CONT "\n");
109 tsk->thread.debugreg1 = 0; 114 printk(KERN_DEFAULT "Pid: %d, comm: %.20s %s %s %.*s %s/%s\n",
110 tsk->thread.debugreg2 = 0; 115 current->pid, current->comm, print_tainted(),
111 tsk->thread.debugreg3 = 0; 116 init_utsname()->release,
112 tsk->thread.debugreg6 = 0; 117 (int)strcspn(init_utsname()->version, " "),
113 tsk->thread.debugreg7 = 0; 118 init_utsname()->version, board, product);
119}
120
121void flush_thread(void)
122{
123 struct task_struct *tsk = current;
124
125 flush_ptrace_hw_breakpoint(tsk);
114 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); 126 memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
115 /* 127 /*
116 * Forget coprocessor state.. 128 * Forget coprocessor state..
@@ -192,16 +204,6 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
192 else if (next->debugctlmsr != prev->debugctlmsr) 204 else if (next->debugctlmsr != prev->debugctlmsr)
193 update_debugctlmsr(next->debugctlmsr); 205 update_debugctlmsr(next->debugctlmsr);
194 206
195 if (test_tsk_thread_flag(next_p, TIF_DEBUG)) {
196 set_debugreg(next->debugreg0, 0);
197 set_debugreg(next->debugreg1, 1);
198 set_debugreg(next->debugreg2, 2);
199 set_debugreg(next->debugreg3, 3);
200 /* no 4 and 5 */
201 set_debugreg(next->debugreg6, 6);
202 set_debugreg(next->debugreg7, 7);
203 }
204
205 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^ 207 if (test_tsk_thread_flag(prev_p, TIF_NOTSC) ^
206 test_tsk_thread_flag(next_p, TIF_NOTSC)) { 208 test_tsk_thread_flag(next_p, TIF_NOTSC)) {
207 /* prev and next are different */ 209 /* prev and next are different */
@@ -224,6 +226,7 @@ void __switch_to_xtra(struct task_struct *prev_p, struct task_struct *next_p,
224 */ 226 */
225 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max); 227 memset(tss->io_bitmap, 0xff, prev->io_bitmap_max);
226 } 228 }
229 propagate_user_return_notify(prev_p, next_p);
227} 230}
228 231
229int sys_fork(struct pt_regs *regs) 232int sys_fork(struct pt_regs *regs)
@@ -247,6 +250,78 @@ int sys_vfork(struct pt_regs *regs)
247 NULL, NULL); 250 NULL, NULL);
248} 251}
249 252
253long
254sys_clone(unsigned long clone_flags, unsigned long newsp,
255 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
256{
257 if (!newsp)
258 newsp = regs->sp;
259 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid);
260}
261
262/*
263 * This gets run with %si containing the
264 * function to call, and %di containing
265 * the "args".
266 */
267extern void kernel_thread_helper(void);
268
269/*
270 * Create a kernel thread
271 */
272int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
273{
274 struct pt_regs regs;
275
276 memset(&regs, 0, sizeof(regs));
277
278 regs.si = (unsigned long) fn;
279 regs.di = (unsigned long) arg;
280
281#ifdef CONFIG_X86_32
282 regs.ds = __USER_DS;
283 regs.es = __USER_DS;
284 regs.fs = __KERNEL_PERCPU;
285 regs.gs = __KERNEL_STACK_CANARY;
286#else
287 regs.ss = __KERNEL_DS;
288#endif
289
290 regs.orig_ax = -1;
291 regs.ip = (unsigned long) kernel_thread_helper;
292 regs.cs = __KERNEL_CS | get_kernel_rpl();
293 regs.flags = X86_EFLAGS_IF | 0x2;
294
295 /* Ok, create the new process.. */
296 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
297}
298EXPORT_SYMBOL(kernel_thread);
299
300/*
301 * sys_execve() executes a new program.
302 */
303long sys_execve(char __user *name, char __user * __user *argv,
304 char __user * __user *envp, struct pt_regs *regs)
305{
306 long error;
307 char *filename;
308
309 filename = getname(name);
310 error = PTR_ERR(filename);
311 if (IS_ERR(filename))
312 return error;
313 error = do_execve(filename, argv, envp, regs);
314
315#ifdef CONFIG_X86_32
316 if (error == 0) {
317 /* Make sure we don't return using sysenter.. */
318 set_thread_flag(TIF_IRET);
319 }
320#endif
321
322 putname(filename);
323 return error;
324}
250 325
251/* 326/*
252 * Idle related variables and functions 327 * Idle related variables and functions
@@ -451,21 +526,39 @@ static int __cpuinit mwait_usable(const struct cpuinfo_x86 *c)
451} 526}
452 527
453/* 528/*
454 * Check for AMD CPUs, which have potentially C1E support 529 * Check for AMD CPUs, where APIC timer interrupt does not wake up CPU from C1e.
530 * For more information see
531 * - Erratum #400 for NPT family 0xf and family 0x10 CPUs
532 * - Erratum #365 for family 0x11 (not affected because C1e not in use)
455 */ 533 */
456static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c) 534static int __cpuinit check_c1e_idle(const struct cpuinfo_x86 *c)
457{ 535{
536 u64 val;
458 if (c->x86_vendor != X86_VENDOR_AMD) 537 if (c->x86_vendor != X86_VENDOR_AMD)
459 return 0; 538 goto no_c1e_idle;
460
461 if (c->x86 < 0x0F)
462 return 0;
463 539
464 /* Family 0x0f models < rev F do not have C1E */ 540 /* Family 0x0f models < rev F do not have C1E */
465 if (c->x86 == 0x0f && c->x86_model < 0x40) 541 if (c->x86 == 0x0F && c->x86_model >= 0x40)
466 return 0; 542 return 1;
467 543
468 return 1; 544 if (c->x86 == 0x10) {
545 /*
546 * check OSVW bit for CPUs that are not affected
547 * by erratum #400
548 */
549 if (cpu_has(c, X86_FEATURE_OSVW)) {
550 rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, val);
551 if (val >= 2) {
552 rdmsrl(MSR_AMD64_OSVW_STATUS, val);
553 if (!(val & BIT(1)))
554 goto no_c1e_idle;
555 }
556 }
557 return 1;
558 }
559
560no_c1e_idle:
561 return 0;
469} 562}
470 563
471static cpumask_var_t c1e_mask; 564static cpumask_var_t c1e_mask;
@@ -532,7 +625,7 @@ void __cpuinit select_idle_routine(const struct cpuinfo_x86 *c)
532{ 625{
533#ifdef CONFIG_SMP 626#ifdef CONFIG_SMP
534 if (pm_idle == poll_idle && smp_num_siblings > 1) { 627 if (pm_idle == poll_idle && smp_num_siblings > 1) {
535 printk(KERN_WARNING "WARNING: polling idle and HT enabled," 628 printk_once(KERN_WARNING "WARNING: polling idle and HT enabled,"
536 " performance may degrade.\n"); 629 " performance may degrade.\n");
537 } 630 }
538#endif 631#endif
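check_c1e_idle() now consults the OSVW (OS Visible Workaround) MSRs on family 0x10, so parts that advertise a fix for erratum #400 keep using the APIC timer in C1E instead of being forced into the broadcast workaround. The query pulled out into a standalone sketch (hypothetical helper; MSR names and bit meaning as in the hunk above):

static bool example_needs_c1e_workaround(const struct cpuinfo_x86 *c)
{
        u64 len, status;

        if (!cpu_has(c, X86_FEATURE_OSVW))
                return true;            /* no OSVW: assume affected */

        rdmsrl(MSR_AMD64_OSVW_ID_LENGTH, len);
        if (len < 2)
                return true;            /* erratum 400 (id 1) not described */

        rdmsrl(MSR_AMD64_OSVW_STATUS, status);
        return status & BIT(1);         /* bit 1 set: workaround still needed */
}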
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 4cf79567cdab..f6c62667e30c 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -23,7 +23,6 @@
23#include <linux/vmalloc.h> 23#include <linux/vmalloc.h>
24#include <linux/user.h> 24#include <linux/user.h>
25#include <linux/interrupt.h> 25#include <linux/interrupt.h>
26#include <linux/utsname.h>
27#include <linux/delay.h> 26#include <linux/delay.h>
28#include <linux/reboot.h> 27#include <linux/reboot.h>
29#include <linux/init.h> 28#include <linux/init.h>
@@ -35,7 +34,6 @@
35#include <linux/tick.h> 34#include <linux/tick.h>
36#include <linux/percpu.h> 35#include <linux/percpu.h>
37#include <linux/prctl.h> 36#include <linux/prctl.h>
38#include <linux/dmi.h>
39#include <linux/ftrace.h> 37#include <linux/ftrace.h>
40#include <linux/uaccess.h> 38#include <linux/uaccess.h>
41#include <linux/io.h> 39#include <linux/io.h>
@@ -58,6 +56,7 @@
58#include <asm/idle.h> 56#include <asm/idle.h>
59#include <asm/syscalls.h> 57#include <asm/syscalls.h>
60#include <asm/ds.h> 58#include <asm/ds.h>
59#include <asm/debugreg.h>
61 60
62asmlinkage void ret_from_fork(void) __asm__("ret_from_fork"); 61asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
63 62
@@ -127,39 +126,29 @@ void __show_regs(struct pt_regs *regs, int all)
127 unsigned long d0, d1, d2, d3, d6, d7; 126 unsigned long d0, d1, d2, d3, d6, d7;
128 unsigned long sp; 127 unsigned long sp;
129 unsigned short ss, gs; 128 unsigned short ss, gs;
130 const char *board;
131 129
132 if (user_mode_vm(regs)) { 130 if (user_mode_vm(regs)) {
133 sp = regs->sp; 131 sp = regs->sp;
134 ss = regs->ss & 0xffff; 132 ss = regs->ss & 0xffff;
135 gs = get_user_gs(regs); 133 gs = get_user_gs(regs);
136 } else { 134 } else {
137 sp = (unsigned long) (&regs->sp); 135 sp = kernel_stack_pointer(regs);
138 savesegment(ss, ss); 136 savesegment(ss, ss);
139 savesegment(gs, gs); 137 savesegment(gs, gs);
140 } 138 }
141 139
142 printk("\n"); 140 show_regs_common();
143 141
144 board = dmi_get_system_info(DMI_PRODUCT_NAME); 142 printk(KERN_DEFAULT "EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
145 if (!board)
146 board = "";
147 printk("Pid: %d, comm: %s %s (%s %.*s) %s\n",
148 task_pid_nr(current), current->comm,
149 print_tainted(), init_utsname()->release,
150 (int)strcspn(init_utsname()->version, " "),
151 init_utsname()->version, board);
152
153 printk("EIP: %04x:[<%08lx>] EFLAGS: %08lx CPU: %d\n",
154 (u16)regs->cs, regs->ip, regs->flags, 143 (u16)regs->cs, regs->ip, regs->flags,
155 smp_processor_id()); 144 smp_processor_id());
156 print_symbol("EIP is at %s\n", regs->ip); 145 print_symbol("EIP is at %s\n", regs->ip);
157 146
158 printk("EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n", 147 printk(KERN_DEFAULT "EAX: %08lx EBX: %08lx ECX: %08lx EDX: %08lx\n",
159 regs->ax, regs->bx, regs->cx, regs->dx); 148 regs->ax, regs->bx, regs->cx, regs->dx);
160 printk("ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n", 149 printk(KERN_DEFAULT "ESI: %08lx EDI: %08lx EBP: %08lx ESP: %08lx\n",
161 regs->si, regs->di, regs->bp, sp); 150 regs->si, regs->di, regs->bp, sp);
162 printk(" DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n", 151 printk(KERN_DEFAULT " DS: %04x ES: %04x FS: %04x GS: %04x SS: %04x\n",
163 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss); 152 (u16)regs->ds, (u16)regs->es, (u16)regs->fs, gs, ss);
164 153
165 if (!all) 154 if (!all)
@@ -169,61 +158,22 @@ void __show_regs(struct pt_regs *regs, int all)
169 cr2 = read_cr2(); 158 cr2 = read_cr2();
170 cr3 = read_cr3(); 159 cr3 = read_cr3();
171 cr4 = read_cr4_safe(); 160 cr4 = read_cr4_safe();
172 printk("CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n", 161 printk(KERN_DEFAULT "CR0: %08lx CR2: %08lx CR3: %08lx CR4: %08lx\n",
173 cr0, cr2, cr3, cr4); 162 cr0, cr2, cr3, cr4);
174 163
175 get_debugreg(d0, 0); 164 get_debugreg(d0, 0);
176 get_debugreg(d1, 1); 165 get_debugreg(d1, 1);
177 get_debugreg(d2, 2); 166 get_debugreg(d2, 2);
178 get_debugreg(d3, 3); 167 get_debugreg(d3, 3);
179 printk("DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n", 168 printk(KERN_DEFAULT "DR0: %08lx DR1: %08lx DR2: %08lx DR3: %08lx\n",
180 d0, d1, d2, d3); 169 d0, d1, d2, d3);
181 170
182 get_debugreg(d6, 6); 171 get_debugreg(d6, 6);
183 get_debugreg(d7, 7); 172 get_debugreg(d7, 7);
184 printk("DR6: %08lx DR7: %08lx\n", 173 printk(KERN_DEFAULT "DR6: %08lx DR7: %08lx\n",
185 d6, d7); 174 d6, d7);
186} 175}
187 176
188void show_regs(struct pt_regs *regs)
189{
190 __show_regs(regs, 1);
191 show_trace(NULL, regs, &regs->sp, regs->bp);
192}
193
194/*
195 * This gets run with %bx containing the
196 * function to call, and %dx containing
197 * the "args".
198 */
199extern void kernel_thread_helper(void);
200
201/*
202 * Create a kernel thread
203 */
204int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags)
205{
206 struct pt_regs regs;
207
208 memset(&regs, 0, sizeof(regs));
209
210 regs.bx = (unsigned long) fn;
211 regs.dx = (unsigned long) arg;
212
213 regs.ds = __USER_DS;
214 regs.es = __USER_DS;
215 regs.fs = __KERNEL_PERCPU;
216 regs.gs = __KERNEL_STACK_CANARY;
217 regs.orig_ax = -1;
218 regs.ip = (unsigned long) kernel_thread_helper;
219 regs.cs = __KERNEL_CS | get_kernel_rpl();
220 regs.flags = X86_EFLAGS_IF | X86_EFLAGS_SF | X86_EFLAGS_PF | 0x2;
221
222 /* Ok, create the new process.. */
223 return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL);
224}
225EXPORT_SYMBOL(kernel_thread);
226
227void release_thread(struct task_struct *dead_task) 177void release_thread(struct task_struct *dead_task)
228{ 178{
229 BUG_ON(dead_task->mm); 179 BUG_ON(dead_task->mm);
@@ -259,7 +209,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
259 209
260 task_user_gs(p) = get_user_gs(regs); 210 task_user_gs(p) = get_user_gs(regs);
261 211
212 p->thread.io_bitmap_ptr = NULL;
262 tsk = current; 213 tsk = current;
214 err = -ENOMEM;
215
216 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
217
263 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) { 218 if (unlikely(test_tsk_thread_flag(tsk, TIF_IO_BITMAP))) {
264 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr, 219 p->thread.io_bitmap_ptr = kmemdup(tsk->thread.io_bitmap_ptr,
265 IO_BITMAP_BYTES, GFP_KERNEL); 220 IO_BITMAP_BYTES, GFP_KERNEL);
@@ -430,46 +385,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
430 return prev_p; 385 return prev_p;
431} 386}
432 387
433int sys_clone(struct pt_regs *regs)
434{
435 unsigned long clone_flags;
436 unsigned long newsp;
437 int __user *parent_tidptr, *child_tidptr;
438
439 clone_flags = regs->bx;
440 newsp = regs->cx;
441 parent_tidptr = (int __user *)regs->dx;
442 child_tidptr = (int __user *)regs->di;
443 if (!newsp)
444 newsp = regs->sp;
445 return do_fork(clone_flags, newsp, regs, 0, parent_tidptr, child_tidptr);
446}
447
448/*
449 * sys_execve() executes a new program.
450 */
451int sys_execve(struct pt_regs *regs)
452{
453 int error;
454 char *filename;
455
456 filename = getname((char __user *) regs->bx);
457 error = PTR_ERR(filename);
458 if (IS_ERR(filename))
459 goto out;
460 error = do_execve(filename,
461 (char __user * __user *) regs->cx,
462 (char __user * __user *) regs->dx,
463 regs);
464 if (error == 0) {
465 /* Make sure we don't return using sysenter.. */
466 set_thread_flag(TIF_IRET);
467 }
468 putname(filename);
469out:
470 return error;
471}
472
473#define top_esp (THREAD_SIZE - sizeof(unsigned long)) 388#define top_esp (THREAD_SIZE - sizeof(unsigned long))
474#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long)) 389#define top_ebp (THREAD_SIZE - 2*sizeof(unsigned long))
475 390
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index eb62cbcaa490..17cb3295cbf7 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -26,7 +26,6 @@
26#include <linux/slab.h> 26#include <linux/slab.h>
27#include <linux/user.h> 27#include <linux/user.h>
28#include <linux/interrupt.h> 28#include <linux/interrupt.h>
29#include <linux/utsname.h>
30#include <linux/delay.h> 29#include <linux/delay.h>
31#include <linux/module.h> 30#include <linux/module.h>
32#include <linux/ptrace.h> 31#include <linux/ptrace.h>
@@ -38,7 +37,6 @@
38#include <linux/uaccess.h> 37#include <linux/uaccess.h>
39#include <linux/io.h> 38#include <linux/io.h>
40#include <linux/ftrace.h> 39#include <linux/ftrace.h>
41#include <linux/dmi.h>
42 40
43#include <asm/pgtable.h> 41#include <asm/pgtable.h>
44#include <asm/system.h> 42#include <asm/system.h>
@@ -52,14 +50,13 @@
52#include <asm/idle.h> 50#include <asm/idle.h>
53#include <asm/syscalls.h> 51#include <asm/syscalls.h>
54#include <asm/ds.h> 52#include <asm/ds.h>
53#include <asm/debugreg.h>
55 54
56asmlinkage extern void ret_from_fork(void); 55asmlinkage extern void ret_from_fork(void);
57 56
58DEFINE_PER_CPU(unsigned long, old_rsp); 57DEFINE_PER_CPU(unsigned long, old_rsp);
59static DEFINE_PER_CPU(unsigned char, is_idle); 58static DEFINE_PER_CPU(unsigned char, is_idle);
60 59
61unsigned long kernel_thread_flags = CLONE_VM | CLONE_UNTRACED;
62
63static ATOMIC_NOTIFIER_HEAD(idle_notifier); 60static ATOMIC_NOTIFIER_HEAD(idle_notifier);
64 61
65void idle_notifier_register(struct notifier_block *n) 62void idle_notifier_register(struct notifier_block *n)
@@ -162,31 +159,21 @@ void __show_regs(struct pt_regs *regs, int all)
162 unsigned long d0, d1, d2, d3, d6, d7; 159 unsigned long d0, d1, d2, d3, d6, d7;
163 unsigned int fsindex, gsindex; 160 unsigned int fsindex, gsindex;
164 unsigned int ds, cs, es; 161 unsigned int ds, cs, es;
165 const char *board; 162
166 163 show_regs_common();
167 printk("\n"); 164 printk(KERN_DEFAULT "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
168 print_modules();
169 board = dmi_get_system_info(DMI_PRODUCT_NAME);
170 if (!board)
171 board = "";
172 printk(KERN_INFO "Pid: %d, comm: %.20s %s %s %.*s %s\n",
173 current->pid, current->comm, print_tainted(),
174 init_utsname()->release,
175 (int)strcspn(init_utsname()->version, " "),
176 init_utsname()->version, board);
177 printk(KERN_INFO "RIP: %04lx:[<%016lx>] ", regs->cs & 0xffff, regs->ip);
178 printk_address(regs->ip, 1); 165 printk_address(regs->ip, 1);
179 printk(KERN_INFO "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss, 166 printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx\n", regs->ss,
180 regs->sp, regs->flags); 167 regs->sp, regs->flags);
181 printk(KERN_INFO "RAX: %016lx RBX: %016lx RCX: %016lx\n", 168 printk(KERN_DEFAULT "RAX: %016lx RBX: %016lx RCX: %016lx\n",
182 regs->ax, regs->bx, regs->cx); 169 regs->ax, regs->bx, regs->cx);
183 printk(KERN_INFO "RDX: %016lx RSI: %016lx RDI: %016lx\n", 170 printk(KERN_DEFAULT "RDX: %016lx RSI: %016lx RDI: %016lx\n",
184 regs->dx, regs->si, regs->di); 171 regs->dx, regs->si, regs->di);
185 printk(KERN_INFO "RBP: %016lx R08: %016lx R09: %016lx\n", 172 printk(KERN_DEFAULT "RBP: %016lx R08: %016lx R09: %016lx\n",
186 regs->bp, regs->r8, regs->r9); 173 regs->bp, regs->r8, regs->r9);
187 printk(KERN_INFO "R10: %016lx R11: %016lx R12: %016lx\n", 174 printk(KERN_DEFAULT "R10: %016lx R11: %016lx R12: %016lx\n",
188 regs->r10, regs->r11, regs->r12); 175 regs->r10, regs->r11, regs->r12);
189 printk(KERN_INFO "R13: %016lx R14: %016lx R15: %016lx\n", 176 printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n",
190 regs->r13, regs->r14, regs->r15); 177 regs->r13, regs->r14, regs->r15);
191 178
192 asm("movl %%ds,%0" : "=r" (ds)); 179 asm("movl %%ds,%0" : "=r" (ds));
@@ -207,28 +194,21 @@ void __show_regs(struct pt_regs *regs, int all)
207 cr3 = read_cr3(); 194 cr3 = read_cr3();
208 cr4 = read_cr4(); 195 cr4 = read_cr4();
209 196
210 printk(KERN_INFO "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n", 197 printk(KERN_DEFAULT "FS: %016lx(%04x) GS:%016lx(%04x) knlGS:%016lx\n",
211 fs, fsindex, gs, gsindex, shadowgs); 198 fs, fsindex, gs, gsindex, shadowgs);
212 printk(KERN_INFO "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds, 199 printk(KERN_DEFAULT "CS: %04x DS: %04x ES: %04x CR0: %016lx\n", cs, ds,
213 es, cr0); 200 es, cr0);
214 printk(KERN_INFO "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3, 201 printk(KERN_DEFAULT "CR2: %016lx CR3: %016lx CR4: %016lx\n", cr2, cr3,
215 cr4); 202 cr4);
216 203
217 get_debugreg(d0, 0); 204 get_debugreg(d0, 0);
218 get_debugreg(d1, 1); 205 get_debugreg(d1, 1);
219 get_debugreg(d2, 2); 206 get_debugreg(d2, 2);
220 printk(KERN_INFO "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2); 207 printk(KERN_DEFAULT "DR0: %016lx DR1: %016lx DR2: %016lx\n", d0, d1, d2);
221 get_debugreg(d3, 3); 208 get_debugreg(d3, 3);
222 get_debugreg(d6, 6); 209 get_debugreg(d6, 6);
223 get_debugreg(d7, 7); 210 get_debugreg(d7, 7);
224 printk(KERN_INFO "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7); 211 printk(KERN_DEFAULT "DR3: %016lx DR6: %016lx DR7: %016lx\n", d3, d6, d7);
225}
226
227void show_regs(struct pt_regs *regs)
228{
229 printk(KERN_INFO "CPU %d:", smp_processor_id());
230 __show_regs(regs, 1);
231 show_trace(NULL, regs, (void *)(regs + 1), regs->bp);
232} 212}
233 213
234void release_thread(struct task_struct *dead_task) 214void release_thread(struct task_struct *dead_task)
@@ -285,8 +265,9 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
285 *childregs = *regs; 265 *childregs = *regs;
286 266
287 childregs->ax = 0; 267 childregs->ax = 0;
288 childregs->sp = sp; 268 if (user_mode(regs))
289 if (sp == ~0UL) 269 childregs->sp = sp;
270 else
290 childregs->sp = (unsigned long)childregs; 271 childregs->sp = (unsigned long)childregs;
291 272
292 p->thread.sp = (unsigned long) childregs; 273 p->thread.sp = (unsigned long) childregs;
@@ -295,14 +276,18 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
295 276
296 set_tsk_thread_flag(p, TIF_FORK); 277 set_tsk_thread_flag(p, TIF_FORK);
297 278
298 p->thread.fs = me->thread.fs; 279 p->thread.io_bitmap_ptr = NULL;
299 p->thread.gs = me->thread.gs;
300 280
301 savesegment(gs, p->thread.gsindex); 281 savesegment(gs, p->thread.gsindex);
282 p->thread.gs = p->thread.gsindex ? 0 : me->thread.gs;
302 savesegment(fs, p->thread.fsindex); 283 savesegment(fs, p->thread.fsindex);
284 p->thread.fs = p->thread.fsindex ? 0 : me->thread.fs;
303 savesegment(es, p->thread.es); 285 savesegment(es, p->thread.es);
304 savesegment(ds, p->thread.ds); 286 savesegment(ds, p->thread.ds);
305 287
288 err = -ENOMEM;
289 memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps));
290
306 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { 291 if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) {
307 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); 292 p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL);
308 if (!p->thread.io_bitmap_ptr) { 293 if (!p->thread.io_bitmap_ptr) {
@@ -341,29 +326,46 @@ out:
341 kfree(p->thread.io_bitmap_ptr); 326 kfree(p->thread.io_bitmap_ptr);
342 p->thread.io_bitmap_max = 0; 327 p->thread.io_bitmap_max = 0;
343 } 328 }
329
344 return err; 330 return err;
345} 331}
346 332
347void 333static void
348start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) 334start_thread_common(struct pt_regs *regs, unsigned long new_ip,
335 unsigned long new_sp,
336 unsigned int _cs, unsigned int _ss, unsigned int _ds)
349{ 337{
350 loadsegment(fs, 0); 338 loadsegment(fs, 0);
351 loadsegment(es, 0); 339 loadsegment(es, _ds);
352 loadsegment(ds, 0); 340 loadsegment(ds, _ds);
353 load_gs_index(0); 341 load_gs_index(0);
354 regs->ip = new_ip; 342 regs->ip = new_ip;
355 regs->sp = new_sp; 343 regs->sp = new_sp;
356 percpu_write(old_rsp, new_sp); 344 percpu_write(old_rsp, new_sp);
357 regs->cs = __USER_CS; 345 regs->cs = _cs;
358 regs->ss = __USER_DS; 346 regs->ss = _ss;
359 regs->flags = 0x200; 347 regs->flags = X86_EFLAGS_IF;
360 set_fs(USER_DS); 348 set_fs(USER_DS);
361 /* 349 /*
362 * Free the old FP and other extended state 350 * Free the old FP and other extended state
363 */ 351 */
364 free_thread_xstate(current); 352 free_thread_xstate(current);
365} 353}
366EXPORT_SYMBOL_GPL(start_thread); 354
355void
356start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
357{
358 start_thread_common(regs, new_ip, new_sp,
359 __USER_CS, __USER_DS, 0);
360}
361
362#ifdef CONFIG_IA32_EMULATION
363void start_thread_ia32(struct pt_regs *regs, u32 new_ip, u32 new_sp)
364{
365 start_thread_common(regs, new_ip, new_sp,
366 __USER32_CS, __USER32_DS, __USER32_DS);
367}
368#endif
367 369
368/* 370/*
369 * switch_to(x,y) should switch tasks from x to y. 371 * switch_to(x,y) should switch tasks from x to y.
@@ -495,26 +497,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
495 */ 497 */
496 if (preload_fpu) 498 if (preload_fpu)
497 __math_state_restore(); 499 __math_state_restore();
498 return prev_p;
499}
500 500
501/* 501 return prev_p;
502 * sys_execve() executes a new program.
503 */
504asmlinkage
505long sys_execve(char __user *name, char __user * __user *argv,
506 char __user * __user *envp, struct pt_regs *regs)
507{
508 long error;
509 char *filename;
510
511 filename = getname(name);
512 error = PTR_ERR(filename);
513 if (IS_ERR(filename))
514 return error;
515 error = do_execve(filename, argv, envp, regs);
516 putname(filename);
517 return error;
518} 502}
519 503
520void set_personality_64bit(void) 504void set_personality_64bit(void)
@@ -531,13 +515,16 @@ void set_personality_64bit(void)
531 current->personality &= ~READ_IMPLIES_EXEC; 515 current->personality &= ~READ_IMPLIES_EXEC;
532} 516}
533 517
534asmlinkage long 518void set_personality_ia32(void)
535sys_clone(unsigned long clone_flags, unsigned long newsp,
536 void __user *parent_tid, void __user *child_tid, struct pt_regs *regs)
537{ 519{
538 if (!newsp) 520 /* inherit personality from parent */
539 newsp = regs->sp; 521
540 return do_fork(clone_flags, newsp, regs, 0, parent_tid, child_tid); 522 /* Make sure to be in 32bit mode */
523 set_thread_flag(TIF_IA32);
524 current->personality |= force_personality32;
525
526 /* Prepare the first "return" to user space */
527 current_thread_info()->status |= TS_COMPAT;
541} 528}
542 529
543unsigned long get_wchan(struct task_struct *p) 530unsigned long get_wchan(struct task_struct *p)
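With TIF_ABI_PENDING removed from flush_thread(), the word size of the new image is settled at exec time: set_personality_ia32() marks the task TIF_IA32/TS_COMPAT and start_thread_ia32() hands the 32-bit selectors to the shared start_thread_common(). A sketch of how a binary loader is expected to choose between the two entry points (illustrative helper, not taken from this patch):

static void example_finish_exec(int ia32_binary)
{
        if (ia32_binary)
                set_personality_ia32();         /* TIF_IA32 + TS_COMPAT */
        else
                set_personality_64bit();        /* plain 64-bit personality */
}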
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 7b058a2dc66a..2e9b55027b7e 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -12,6 +12,7 @@
12#include <linux/mm.h> 12#include <linux/mm.h>
13#include <linux/smp.h> 13#include <linux/smp.h>
14#include <linux/errno.h> 14#include <linux/errno.h>
15#include <linux/slab.h>
15#include <linux/ptrace.h> 16#include <linux/ptrace.h>
16#include <linux/regset.h> 17#include <linux/regset.h>
17#include <linux/tracehook.h> 18#include <linux/tracehook.h>
@@ -22,6 +23,8 @@
22#include <linux/seccomp.h> 23#include <linux/seccomp.h>
23#include <linux/signal.h> 24#include <linux/signal.h>
24#include <linux/workqueue.h> 25#include <linux/workqueue.h>
26#include <linux/perf_event.h>
27#include <linux/hw_breakpoint.h>
25 28
26#include <asm/uaccess.h> 29#include <asm/uaccess.h>
27#include <asm/pgtable.h> 30#include <asm/pgtable.h>
@@ -34,6 +37,7 @@
34#include <asm/prctl.h> 37#include <asm/prctl.h>
35#include <asm/proto.h> 38#include <asm/proto.h>
36#include <asm/ds.h> 39#include <asm/ds.h>
40#include <asm/hw_breakpoint.h>
37 41
38#include "tls.h" 42#include "tls.h"
39 43
@@ -45,10 +49,99 @@ enum x86_regset {
45 REGSET_FP, 49 REGSET_FP,
46 REGSET_XFP, 50 REGSET_XFP,
47 REGSET_IOPERM64 = REGSET_XFP, 51 REGSET_IOPERM64 = REGSET_XFP,
52 REGSET_XSTATE,
48 REGSET_TLS, 53 REGSET_TLS,
49 REGSET_IOPERM32, 54 REGSET_IOPERM32,
50}; 55};
51 56
57struct pt_regs_offset {
58 const char *name;
59 int offset;
60};
61
62#define REG_OFFSET_NAME(r) {.name = #r, .offset = offsetof(struct pt_regs, r)}
63#define REG_OFFSET_END {.name = NULL, .offset = 0}
64
65static const struct pt_regs_offset regoffset_table[] = {
66#ifdef CONFIG_X86_64
67 REG_OFFSET_NAME(r15),
68 REG_OFFSET_NAME(r14),
69 REG_OFFSET_NAME(r13),
70 REG_OFFSET_NAME(r12),
71 REG_OFFSET_NAME(r11),
72 REG_OFFSET_NAME(r10),
73 REG_OFFSET_NAME(r9),
74 REG_OFFSET_NAME(r8),
75#endif
76 REG_OFFSET_NAME(bx),
77 REG_OFFSET_NAME(cx),
78 REG_OFFSET_NAME(dx),
79 REG_OFFSET_NAME(si),
80 REG_OFFSET_NAME(di),
81 REG_OFFSET_NAME(bp),
82 REG_OFFSET_NAME(ax),
83#ifdef CONFIG_X86_32
84 REG_OFFSET_NAME(ds),
85 REG_OFFSET_NAME(es),
86 REG_OFFSET_NAME(fs),
87 REG_OFFSET_NAME(gs),
88#endif
89 REG_OFFSET_NAME(orig_ax),
90 REG_OFFSET_NAME(ip),
91 REG_OFFSET_NAME(cs),
92 REG_OFFSET_NAME(flags),
93 REG_OFFSET_NAME(sp),
94 REG_OFFSET_NAME(ss),
95 REG_OFFSET_END,
96};
97
98/**
99 * regs_query_register_offset() - query register offset from its name
100 * @name: the name of a register
101 *
102 * regs_query_register_offset() returns the offset of a register in struct
103 * pt_regs from its name. If the name is invalid, this returns -EINVAL;
104 */
105int regs_query_register_offset(const char *name)
106{
107 const struct pt_regs_offset *roff;
108 for (roff = regoffset_table; roff->name != NULL; roff++)
109 if (!strcmp(roff->name, name))
110 return roff->offset;
111 return -EINVAL;
112}
113
114/**
115 * regs_query_register_name() - query register name from its offset
116 * @offset: the offset of a register in struct pt_regs.
117 *
118 * regs_query_register_name() returns the name of a register from its
119 * offset in struct pt_regs. If the @offset is invalid, this returns NULL;
120 */
121const char *regs_query_register_name(unsigned int offset)
122{
123 const struct pt_regs_offset *roff;
124 for (roff = regoffset_table; roff->name != NULL; roff++)
125 if (roff->offset == offset)
126 return roff->name;
127 return NULL;
128}
129
130static const int arg_offs_table[] = {
131#ifdef CONFIG_X86_32
132 [0] = offsetof(struct pt_regs, ax),
133 [1] = offsetof(struct pt_regs, dx),
134 [2] = offsetof(struct pt_regs, cx)
135#else /* CONFIG_X86_64 */
136 [0] = offsetof(struct pt_regs, di),
137 [1] = offsetof(struct pt_regs, si),
138 [2] = offsetof(struct pt_regs, dx),
139 [3] = offsetof(struct pt_regs, cx),
140 [4] = offsetof(struct pt_regs, r8),
141 [5] = offsetof(struct pt_regs, r9)
142#endif
143};
144
52/* 145/*
53 * does not yet catch signals sent when the child dies. 146 * does not yet catch signals sent when the child dies.
54 * in exit.c or in signal.c. 147 * in exit.c or in signal.c.
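
For reference, a minimal sketch of how a tracer might consume the name-to-offset lookup added above; the wrapper name and the raw pointer arithmetic are illustrative only (any in-tree accessor for this is not part of this hunk):

/* Illustrative only: fetch a register value by name via the new helper. */
static unsigned long fetch_reg_by_name(struct pt_regs *regs, const char *name)
{
        int offs = regs_query_register_offset(name);    /* e.g. "ip", "sp", "ax" */

        if (offs < 0)
                return 0;       /* unknown register name */
        return *(unsigned long *)((char *)regs + offs);
}
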
@@ -137,11 +230,6 @@ static int set_segment_reg(struct task_struct *task,
137 return 0; 230 return 0;
138} 231}
139 232
140static unsigned long debugreg_addr_limit(struct task_struct *task)
141{
142 return TASK_SIZE - 3;
143}
144
145#else /* CONFIG_X86_64 */ 233#else /* CONFIG_X86_64 */
146 234
147#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT) 235#define FLAG_MASK (FLAG_MASK_32 | X86_EFLAGS_NT)
@@ -266,15 +354,6 @@ static int set_segment_reg(struct task_struct *task,
266 return 0; 354 return 0;
267} 355}
268 356
269static unsigned long debugreg_addr_limit(struct task_struct *task)
270{
271#ifdef CONFIG_IA32_EMULATION
272 if (test_tsk_thread_flag(task, TIF_IA32))
273 return IA32_PAGE_OFFSET - 3;
274#endif
275 return TASK_SIZE_MAX - 7;
276}
277
278#endif /* CONFIG_X86_32 */ 357#endif /* CONFIG_X86_32 */
279 358
280static unsigned long get_flags(struct task_struct *task) 359static unsigned long get_flags(struct task_struct *task)
@@ -408,14 +487,14 @@ static int genregs_get(struct task_struct *target,
408{ 487{
409 if (kbuf) { 488 if (kbuf) {
410 unsigned long *k = kbuf; 489 unsigned long *k = kbuf;
411 while (count > 0) { 490 while (count >= sizeof(*k)) {
412 *k++ = getreg(target, pos); 491 *k++ = getreg(target, pos);
413 count -= sizeof(*k); 492 count -= sizeof(*k);
414 pos += sizeof(*k); 493 pos += sizeof(*k);
415 } 494 }
416 } else { 495 } else {
417 unsigned long __user *u = ubuf; 496 unsigned long __user *u = ubuf;
418 while (count > 0) { 497 while (count >= sizeof(*u)) {
419 if (__put_user(getreg(target, pos), u++)) 498 if (__put_user(getreg(target, pos), u++))
420 return -EFAULT; 499 return -EFAULT;
421 count -= sizeof(*u); 500 count -= sizeof(*u);
@@ -434,14 +513,14 @@ static int genregs_set(struct task_struct *target,
434 int ret = 0; 513 int ret = 0;
435 if (kbuf) { 514 if (kbuf) {
436 const unsigned long *k = kbuf; 515 const unsigned long *k = kbuf;
437 while (count > 0 && !ret) { 516 while (count >= sizeof(*k) && !ret) {
438 ret = putreg(target, pos, *k++); 517 ret = putreg(target, pos, *k++);
439 count -= sizeof(*k); 518 count -= sizeof(*k);
440 pos += sizeof(*k); 519 pos += sizeof(*k);
441 } 520 }
442 } else { 521 } else {
443 const unsigned long __user *u = ubuf; 522 const unsigned long __user *u = ubuf;
444 while (count > 0 && !ret) { 523 while (count >= sizeof(*u) && !ret) {
445 unsigned long word; 524 unsigned long word;
446 ret = __get_user(word, u++); 525 ret = __get_user(word, u++);
447 if (ret) 526 if (ret)
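
The switch from "count > 0" to "count >= sizeof(*k)" in these hunks matters because the regset core can pass a byte count that is not a multiple of the word size; the old test would then copy a trailing partial word. A standalone sketch of the guarded loop, with hypothetical names:

/* Illustrative only: copy whole words and ignore a trailing partial word. */
static void copy_whole_words(unsigned long *dst, const unsigned long *src,
                             unsigned int count /* in bytes */)
{
        while (count >= sizeof(*dst)) {         /* not "count > 0" */
                *dst++ = *src++;
                count -= sizeof(*dst);
        }
        /* any leftover 0 < count < sizeof(long) bytes are deliberately skipped */
}
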
@@ -454,99 +533,240 @@ static int genregs_set(struct task_struct *target,
454 return ret; 533 return ret;
455} 534}
456 535
536static void ptrace_triggered(struct perf_event *bp, int nmi,
537 struct perf_sample_data *data,
538 struct pt_regs *regs)
539{
540 int i;
541 struct thread_struct *thread = &(current->thread);
542
543 /*
544 * Store in the virtual DR6 register the fact that the breakpoint
545 * was hit so the thread's debugger will see it.
546 */
547 for (i = 0; i < HBP_NUM; i++) {
548 if (thread->ptrace_bps[i] == bp)
549 break;
550 }
551
552 thread->debugreg6 |= (DR_TRAP0 << i);
553}
554
457/* 555/*
458 * This function is trivial and will be inlined by the compiler. 556 * Walk through every ptrace breakpoints for this thread and
459 * Having it separates the implementation details of debug 557 * build the dr7 value on top of their attributes.
460 * registers from the interface details of ptrace. 558 *
461 */ 559 */
462static unsigned long ptrace_get_debugreg(struct task_struct *child, int n) 560static unsigned long ptrace_get_dr7(struct perf_event *bp[])
463{ 561{
464 switch (n) { 562 int i;
465 case 0: return child->thread.debugreg0; 563 int dr7 = 0;
466 case 1: return child->thread.debugreg1; 564 struct arch_hw_breakpoint *info;
467 case 2: return child->thread.debugreg2; 565
468 case 3: return child->thread.debugreg3; 566 for (i = 0; i < HBP_NUM; i++) {
469 case 6: return child->thread.debugreg6; 567 if (bp[i] && !bp[i]->attr.disabled) {
470 case 7: return child->thread.debugreg7; 568 info = counter_arch_bp(bp[i]);
569 dr7 |= encode_dr7(i, info->len, info->type);
570 }
471 } 571 }
472 return 0; 572
573 return dr7;
473} 574}
474 575
475static int ptrace_set_debugreg(struct task_struct *child, 576static int
476 int n, unsigned long data) 577ptrace_modify_breakpoint(struct perf_event *bp, int len, int type,
578 struct task_struct *tsk, int disabled)
477{ 579{
478 int i; 580 int err;
581 int gen_len, gen_type;
582 struct perf_event_attr attr;
479 583
480 if (unlikely(n == 4 || n == 5)) 584 /*
481 return -EIO; 585 * We should have at least an inactive breakpoint at this
586 * slot. It means the user is writing dr7 without having
587 * written the address register first
588 */
589 if (!bp)
590 return -EINVAL;
482 591
483 if (n < 4 && unlikely(data >= debugreg_addr_limit(child))) 592 err = arch_bp_generic_fields(len, type, &gen_len, &gen_type);
484 return -EIO; 593 if (err)
594 return err;
485 595
486 switch (n) { 596 attr = bp->attr;
487 case 0: child->thread.debugreg0 = data; break; 597 attr.bp_len = gen_len;
488 case 1: child->thread.debugreg1 = data; break; 598 attr.bp_type = gen_type;
489 case 2: child->thread.debugreg2 = data; break; 599 attr.disabled = disabled;
490 case 3: child->thread.debugreg3 = data; break;
491 600
492 case 6: 601 return modify_user_hw_breakpoint(bp, &attr);
493 if ((data & ~0xffffffffUL) != 0) 602}
494 return -EIO;
495 child->thread.debugreg6 = data;
496 break;
497 603
498 case 7: 604/*
605 * Handle ptrace writes to debug register 7.
606 */
607static int ptrace_write_dr7(struct task_struct *tsk, unsigned long data)
608{
609 struct thread_struct *thread = &(tsk->thread);
610 unsigned long old_dr7;
611 int i, orig_ret = 0, rc = 0;
612 int enabled, second_pass = 0;
613 unsigned len, type;
614 struct perf_event *bp;
615
616 data &= ~DR_CONTROL_RESERVED;
617 old_dr7 = ptrace_get_dr7(thread->ptrace_bps);
618restore:
619 /*
620 * Loop through all the hardware breakpoints, making the
621 * appropriate changes to each.
622 */
623 for (i = 0; i < HBP_NUM; i++) {
624 enabled = decode_dr7(data, i, &len, &type);
625 bp = thread->ptrace_bps[i];
626
627 if (!enabled) {
628 if (bp) {
629 /*
630 * Don't unregister the breakpoints right-away,
631 * unless all register_user_hw_breakpoint()
632 * requests have succeeded. This prevents
633 * any window of opportunity for debug
634 * register grabbing by other users.
635 */
636 if (!second_pass)
637 continue;
638
639 rc = ptrace_modify_breakpoint(bp, len, type,
640 tsk, 1);
641 if (rc)
642 break;
643 }
644 continue;
645 }
646
647 rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0);
648 if (rc)
649 break;
650 }
651 /*
652 * Make a second pass to free the remaining unused breakpoints
653 * or to restore the original breakpoints if an error occurred.
654 */
655 if (!second_pass) {
656 second_pass = 1;
657 if (rc < 0) {
658 orig_ret = rc;
659 data = old_dr7;
660 }
661 goto restore;
662 }
663 return ((orig_ret < 0) ? orig_ret : rc);
664}
665
666/*
667 * Handle PTRACE_PEEKUSR calls for the debug register area.
668 */
669static unsigned long ptrace_get_debugreg(struct task_struct *tsk, int n)
670{
671 struct thread_struct *thread = &(tsk->thread);
672 unsigned long val = 0;
673
674 if (n < HBP_NUM) {
675 struct perf_event *bp;
676 bp = thread->ptrace_bps[n];
677 if (!bp)
678 return 0;
679 val = bp->hw.info.address;
680 } else if (n == 6) {
681 val = thread->debugreg6;
682 } else if (n == 7) {
683 val = thread->ptrace_dr7;
684 }
685 return val;
686}
687
688static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr,
689 unsigned long addr)
690{
691 struct perf_event *bp;
692 struct thread_struct *t = &tsk->thread;
693 struct perf_event_attr attr;
694
695 if (!t->ptrace_bps[nr]) {
696 hw_breakpoint_init(&attr);
499 /* 697 /*
500 * Sanity-check data. Take one half-byte at once with 698 * Put stub len and type to register (reserve) an inactive but
501 * check = (val >> (16 + 4*i)) & 0xf. It contains the 699 * correct bp
502 * R/Wi and LENi bits; bits 0 and 1 are R/Wi, and bits
503 * 2 and 3 are LENi. Given a list of invalid values,
504 * we do mask |= 1 << invalid_value, so that
505 * (mask >> check) & 1 is a correct test for invalid
506 * values.
507 *
508 * R/Wi contains the type of the breakpoint /
509 * watchpoint, LENi contains the length of the watched
510 * data in the watchpoint case.
511 *
512 * The invalid values are:
513 * - LENi == 0x10 (undefined), so mask |= 0x0f00. [32-bit]
514 * - R/Wi == 0x10 (break on I/O reads or writes), so
515 * mask |= 0x4444.
516 * - R/Wi == 0x00 && LENi != 0x00, so we have mask |=
517 * 0x1110.
518 *
519 * Finally, mask = 0x0f00 | 0x4444 | 0x1110 == 0x5f54.
520 *
521 * See the Intel Manual "System Programming Guide",
522 * 15.2.4
523 *
524 * Note that LENi == 0x10 is defined on x86_64 in long
525 * mode (i.e. even for 32-bit userspace software, but
526 * 64-bit kernel), so the x86_64 mask value is 0x5454.
527 * See the AMD manual no. 24593 (AMD64 System Programming)
528 */ 700 */
529#ifdef CONFIG_X86_32 701 attr.bp_addr = addr;
530#define DR7_MASK 0x5f54 702 attr.bp_len = HW_BREAKPOINT_LEN_1;
531#else 703 attr.bp_type = HW_BREAKPOINT_W;
532#define DR7_MASK 0x5554 704 attr.disabled = 1;
533#endif 705
534 data &= ~DR_CONTROL_RESERVED; 706 bp = register_user_hw_breakpoint(&attr, ptrace_triggered, tsk);
535 for (i = 0; i < 4; i++) 707
536 if ((DR7_MASK >> ((data >> (16 + 4*i)) & 0xf)) & 1) 708 /*
537 return -EIO; 709 * CHECKME: the previous code returned -EIO if the addr wasn't
538 child->thread.debugreg7 = data; 710 * a valid task virtual addr. The new one will return -EINVAL in
539 if (data) 711 * this case.
540 set_tsk_thread_flag(child, TIF_DEBUG); 712 * -EINVAL may be what we want for in-kernel breakpoints users,
541 else 713 * but -EIO looks better for ptrace, since we refuse a register
542 clear_tsk_thread_flag(child, TIF_DEBUG); 714 * writing for the user. And anyway this is the previous
543 break; 715 * behaviour.
716 */
717 if (IS_ERR(bp))
718 return PTR_ERR(bp);
719
720 t->ptrace_bps[nr] = bp;
721 } else {
722 int err;
723
724 bp = t->ptrace_bps[nr];
725
726 attr = bp->attr;
727 attr.bp_addr = addr;
728 err = modify_user_hw_breakpoint(bp, &attr);
729 if (err)
730 return err;
544 } 731 }
545 732
733
546 return 0; 734 return 0;
547} 735}
548 736
549/* 737/*
738 * Handle PTRACE_POKEUSR calls for the debug register area.
739 */
740int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val)
741{
742 struct thread_struct *thread = &(tsk->thread);
743 int rc = 0;
744
745 /* There are no DR4 or DR5 registers */
746 if (n == 4 || n == 5)
747 return -EIO;
748
749 if (n == 6) {
750 thread->debugreg6 = val;
751 goto ret_path;
752 }
753 if (n < HBP_NUM) {
754 rc = ptrace_set_breakpoint_addr(tsk, n, val);
755 if (rc)
756 return rc;
757 }
758 /* All that's left is DR7 */
759 if (n == 7) {
760 rc = ptrace_write_dr7(tsk, val);
761 if (!rc)
762 thread->ptrace_dr7 = val;
763 }
764
765ret_path:
766 return rc;
767}
768
769/*
550 * These access the current or another (stopped) task's io permission 770 * These access the current or another (stopped) task's io permission
551 * bitmap for debugging or core dump. 771 * bitmap for debugging or core dump.
552 */ 772 */
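
The encode_dr7()/decode_dr7() helpers used above replace the open-coded mask test documented in the comment that this hunk removes. Architecturally, each breakpoint slot i owns an enable-bit pair at bits 2*i and 2*i+1 and a 4-bit R/W+LEN field at bits 16 + 4*i; a simplified sketch of that layout (the real helpers also deal with global-enable and reserved bits, so treat this as an approximation, not the in-tree implementation):

#define SKETCH_DR7_ENABLE(i)            (0x1UL << ((i) * 2))    /* local enable */
#define SKETCH_DR7_RW_LEN(i, rw, len)   ((((len) << 2) | (rw)) << (16 + (i) * 4))

static unsigned long sketch_encode_dr7(int i, unsigned int rw, unsigned int len)
{
        return SKETCH_DR7_ENABLE(i) | SKETCH_DR7_RW_LEN(i, rw, len);
}

static int sketch_decode_dr7(unsigned long dr7, int i,
                             unsigned int *rw, unsigned int *len)
{
        unsigned int field = (dr7 >> (16 + i * 4)) & 0xf;

        *rw  = field & 0x3;             /* breakpoint type */
        *len = field >> 2;              /* watched length */
        return (dr7 >> (i * 2)) & 0x3;  /* non-zero if slot i is enabled */
}
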
@@ -1219,14 +1439,14 @@ static int genregs32_get(struct task_struct *target,
1219{ 1439{
1220 if (kbuf) { 1440 if (kbuf) {
1221 compat_ulong_t *k = kbuf; 1441 compat_ulong_t *k = kbuf;
1222 while (count > 0) { 1442 while (count >= sizeof(*k)) {
1223 getreg32(target, pos, k++); 1443 getreg32(target, pos, k++);
1224 count -= sizeof(*k); 1444 count -= sizeof(*k);
1225 pos += sizeof(*k); 1445 pos += sizeof(*k);
1226 } 1446 }
1227 } else { 1447 } else {
1228 compat_ulong_t __user *u = ubuf; 1448 compat_ulong_t __user *u = ubuf;
1229 while (count > 0) { 1449 while (count >= sizeof(*u)) {
1230 compat_ulong_t word; 1450 compat_ulong_t word;
1231 getreg32(target, pos, &word); 1451 getreg32(target, pos, &word);
1232 if (__put_user(word, u++)) 1452 if (__put_user(word, u++))
@@ -1247,14 +1467,14 @@ static int genregs32_set(struct task_struct *target,
1247 int ret = 0; 1467 int ret = 0;
1248 if (kbuf) { 1468 if (kbuf) {
1249 const compat_ulong_t *k = kbuf; 1469 const compat_ulong_t *k = kbuf;
1250 while (count > 0 && !ret) { 1470 while (count >= sizeof(*k) && !ret) {
1251 ret = putreg32(target, pos, *k++); 1471 ret = putreg32(target, pos, *k++);
1252 count -= sizeof(*k); 1472 count -= sizeof(*k);
1253 pos += sizeof(*k); 1473 pos += sizeof(*k);
1254 } 1474 }
1255 } else { 1475 } else {
1256 const compat_ulong_t __user *u = ubuf; 1476 const compat_ulong_t __user *u = ubuf;
1257 while (count > 0 && !ret) { 1477 while (count >= sizeof(*u) && !ret) {
1258 compat_ulong_t word; 1478 compat_ulong_t word;
1259 ret = __get_user(word, u++); 1479 ret = __get_user(word, u++);
1260 if (ret) 1480 if (ret)
@@ -1345,7 +1565,7 @@ long compat_arch_ptrace(struct task_struct *child, compat_long_t request,
1345 1565
1346#ifdef CONFIG_X86_64 1566#ifdef CONFIG_X86_64
1347 1567
1348static const struct user_regset x86_64_regsets[] = { 1568static struct user_regset x86_64_regsets[] __read_mostly = {
1349 [REGSET_GENERAL] = { 1569 [REGSET_GENERAL] = {
1350 .core_note_type = NT_PRSTATUS, 1570 .core_note_type = NT_PRSTATUS,
1351 .n = sizeof(struct user_regs_struct) / sizeof(long), 1571 .n = sizeof(struct user_regs_struct) / sizeof(long),
@@ -1358,6 +1578,12 @@ static const struct user_regset x86_64_regsets[] = {
1358 .size = sizeof(long), .align = sizeof(long), 1578 .size = sizeof(long), .align = sizeof(long),
1359 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1579 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1360 }, 1580 },
1581 [REGSET_XSTATE] = {
1582 .core_note_type = NT_X86_XSTATE,
1583 .size = sizeof(u64), .align = sizeof(u64),
1584 .active = xstateregs_active, .get = xstateregs_get,
1585 .set = xstateregs_set
1586 },
1361 [REGSET_IOPERM64] = { 1587 [REGSET_IOPERM64] = {
1362 .core_note_type = NT_386_IOPERM, 1588 .core_note_type = NT_386_IOPERM,
1363 .n = IO_BITMAP_LONGS, 1589 .n = IO_BITMAP_LONGS,
@@ -1383,7 +1609,7 @@ static const struct user_regset_view user_x86_64_view = {
1383#endif /* CONFIG_X86_64 */ 1609#endif /* CONFIG_X86_64 */
1384 1610
1385#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION 1611#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1386static const struct user_regset x86_32_regsets[] = { 1612static struct user_regset x86_32_regsets[] __read_mostly = {
1387 [REGSET_GENERAL] = { 1613 [REGSET_GENERAL] = {
1388 .core_note_type = NT_PRSTATUS, 1614 .core_note_type = NT_PRSTATUS,
1389 .n = sizeof(struct user_regs_struct32) / sizeof(u32), 1615 .n = sizeof(struct user_regs_struct32) / sizeof(u32),
@@ -1402,6 +1628,12 @@ static const struct user_regset x86_32_regsets[] = {
1402 .size = sizeof(u32), .align = sizeof(u32), 1628 .size = sizeof(u32), .align = sizeof(u32),
1403 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set 1629 .active = xfpregs_active, .get = xfpregs_get, .set = xfpregs_set
1404 }, 1630 },
1631 [REGSET_XSTATE] = {
1632 .core_note_type = NT_X86_XSTATE,
1633 .size = sizeof(u64), .align = sizeof(u64),
1634 .active = xstateregs_active, .get = xstateregs_get,
1635 .set = xstateregs_set
1636 },
1405 [REGSET_TLS] = { 1637 [REGSET_TLS] = {
1406 .core_note_type = NT_386_TLS, 1638 .core_note_type = NT_386_TLS,
1407 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN, 1639 .n = GDT_ENTRY_TLS_ENTRIES, .bias = GDT_ENTRY_TLS_MIN,
@@ -1424,6 +1656,23 @@ static const struct user_regset_view user_x86_32_view = {
1424}; 1656};
1425#endif 1657#endif
1426 1658
1659/*
1660 * This represents bytes 464..511 in the memory layout exported through
1661 * the REGSET_XSTATE interface.
1662 */
1663u64 xstate_fx_sw_bytes[USER_XSTATE_FX_SW_WORDS];
1664
1665void update_regset_xstate_info(unsigned int size, u64 xstate_mask)
1666{
1667#ifdef CONFIG_X86_64
1668 x86_64_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1669#endif
1670#if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION
1671 x86_32_regsets[REGSET_XSTATE].n = size / sizeof(u64);
1672#endif
1673 xstate_fx_sw_bytes[USER_XSTATE_XCR0_WORD] = xstate_mask;
1674}
1675
1427const struct user_regset_view *task_user_regset_view(struct task_struct *task) 1676const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1428{ 1677{
1429#ifdef CONFIG_IA32_EMULATION 1678#ifdef CONFIG_IA32_EMULATION
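
The .n field of the new REGSET_XSTATE entries is deliberately left for update_regset_xstate_info(), added above, to fill in once the xsave area size and feature mask are known. A hedged usage sketch; the caller and its argument names are assumptions about the xsave init path, not part of this patch:

/*
 * Hypothetical boot-time caller: once CPUID reports the total xsave area
 * size and the enabled-feature mask, size the REGSET_XSTATE view and
 * record the mask for core dumps.
 */
static void __init sketch_init_xstate_regset(unsigned int xstate_size,
                                             u64 pcntxt_mask)
{
        update_regset_xstate_info(xstate_size, pcntxt_mask);
        /* After this, the regset exports xstate_size / sizeof(u64) words. */
}
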
@@ -1437,21 +1686,33 @@ const struct user_regset_view *task_user_regset_view(struct task_struct *task)
1437#endif 1686#endif
1438} 1687}
1439 1688
1440void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs, 1689static void fill_sigtrap_info(struct task_struct *tsk,
1441 int error_code, int si_code) 1690 struct pt_regs *regs,
1691 int error_code, int si_code,
1692 struct siginfo *info)
1442{ 1693{
1443 struct siginfo info;
1444
1445 tsk->thread.trap_no = 1; 1694 tsk->thread.trap_no = 1;
1446 tsk->thread.error_code = error_code; 1695 tsk->thread.error_code = error_code;
1447 1696
1448 memset(&info, 0, sizeof(info)); 1697 memset(info, 0, sizeof(*info));
1449 info.si_signo = SIGTRAP; 1698 info->si_signo = SIGTRAP;
1450 info.si_code = si_code; 1699 info->si_code = si_code;
1700 info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
1701}
1451 1702
1452 /* User-mode ip? */ 1703void user_single_step_siginfo(struct task_struct *tsk,
1453 info.si_addr = user_mode_vm(regs) ? (void __user *) regs->ip : NULL; 1704 struct pt_regs *regs,
1705 struct siginfo *info)
1706{
1707 fill_sigtrap_info(tsk, regs, 0, TRAP_BRKPT, info);
1708}
1454 1709
1710void send_sigtrap(struct task_struct *tsk, struct pt_regs *regs,
1711 int error_code, int si_code)
1712{
1713 struct siginfo info;
1714
1715 fill_sigtrap_info(tsk, regs, error_code, si_code, &info);
1455 /* Send us the fake SIGTRAP */ 1716 /* Send us the fake SIGTRAP */
1456 force_sig_info(SIGTRAP, &info, tsk); 1717 force_sig_info(SIGTRAP, &info, tsk);
1457} 1718}
@@ -1516,29 +1777,22 @@ asmregparm long syscall_trace_enter(struct pt_regs *regs)
1516 1777
1517asmregparm void syscall_trace_leave(struct pt_regs *regs) 1778asmregparm void syscall_trace_leave(struct pt_regs *regs)
1518{ 1779{
1780 bool step;
1781
1519 if (unlikely(current->audit_context)) 1782 if (unlikely(current->audit_context))
1520 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax); 1783 audit_syscall_exit(AUDITSC_RESULT(regs->ax), regs->ax);
1521 1784
1522 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT))) 1785 if (unlikely(test_thread_flag(TIF_SYSCALL_TRACEPOINT)))
1523 trace_sys_exit(regs, regs->ax); 1786 trace_sys_exit(regs, regs->ax);
1524 1787
1525 if (test_thread_flag(TIF_SYSCALL_TRACE))
1526 tracehook_report_syscall_exit(regs, 0);
1527
1528 /* 1788 /*
1529 * If TIF_SYSCALL_EMU is set, we only get here because of 1789 * If TIF_SYSCALL_EMU is set, we only get here because of
1530 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP). 1790 * TIF_SINGLESTEP (i.e. this is PTRACE_SYSEMU_SINGLESTEP).
1531 * We already reported this syscall instruction in 1791 * We already reported this syscall instruction in
1532 * syscall_trace_enter(), so don't do any more now. 1792 * syscall_trace_enter().
1533 */
1534 if (unlikely(test_thread_flag(TIF_SYSCALL_EMU)))
1535 return;
1536
1537 /*
1538 * If we are single-stepping, synthesize a trap to follow the
1539 * system call instruction.
1540 */ 1793 */
1541 if (test_thread_flag(TIF_SINGLESTEP) && 1794 step = unlikely(test_thread_flag(TIF_SINGLESTEP)) &&
1542 tracehook_consider_fatal_signal(current, SIGTRAP)) 1795 !test_thread_flag(TIF_SYSCALL_EMU);
1543 send_sigtrap(current, regs, 0, TRAP_BRKPT); 1796 if (step || test_thread_flag(TIF_SYSCALL_TRACE))
1797 tracehook_report_syscall_exit(regs, step);
1544} 1798}
diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c
index 6c3b2c6fd772..12e9feaa2f7a 100644
--- a/arch/x86/kernel/quirks.c
+++ b/arch/x86/kernel/quirks.c
@@ -491,6 +491,19 @@ void force_hpet_resume(void)
491 break; 491 break;
492 } 492 }
493} 493}
494
495/*
496 * HPET MSI on some boards (ATI SB700/SB800) has side effect on
497 * floppy DMA. Disable HPET MSI on such platforms.
498 */
499static void force_disable_hpet_msi(struct pci_dev *unused)
500{
501 hpet_msi_disable = 1;
502}
503
504DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, PCI_DEVICE_ID_ATI_SBX00_SMBUS,
505 force_disable_hpet_msi);
506
494#endif 507#endif
495 508
496#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) 509#if defined(CONFIG_PCI) && defined(CONFIG_NUMA)
@@ -499,6 +512,7 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
499{ 512{
500 struct pci_dev *nb_ht; 513 struct pci_dev *nb_ht;
501 unsigned int devfn; 514 unsigned int devfn;
515 u32 node;
502 u32 val; 516 u32 val;
503 517
504 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); 518 devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0);
@@ -507,7 +521,13 @@ static void __init quirk_amd_nb_node(struct pci_dev *dev)
507 return; 521 return;
508 522
509 pci_read_config_dword(nb_ht, 0x60, &val); 523 pci_read_config_dword(nb_ht, 0x60, &val);
510 set_dev_node(&dev->dev, val & 7); 524 node = val & 7;
525 /*
526 * Some hardware may return an invalid node ID,
527 * so check it first:
528 */
529 if (node_online(node))
530 set_dev_node(&dev->dev, node);
511 pci_dev_put(nb_ht); 531 pci_dev_put(nb_ht);
512} 532}
513 533
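
Both quirks added here follow the usual PCI header-fixup pattern: a small hook keyed on vendor/device ID that runs while the matching device is scanned. A minimal sketch of the same pattern with a hypothetical device ID and flag (placeholders, not real hardware):

#include <linux/pci.h>

static int example_feature_disabled;    /* placeholder flag */

/* Runs once for every matching device found during PCI header scan. */
static void example_disable_feature(struct pci_dev *dev)
{
        example_feature_disabled = 1;
        dev_info(&dev->dev, "example quirk applied\n");
}
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_ATI, 0x1234 /* hypothetical device ID */,
                         example_disable_feature);
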
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index f93078746e00..8e1aac86b50c 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -23,7 +23,7 @@
23# include <linux/ctype.h> 23# include <linux/ctype.h>
24# include <linux/mc146818rtc.h> 24# include <linux/mc146818rtc.h>
25#else 25#else
26# include <asm/iommu.h> 26# include <asm/x86_init.h>
27#endif 27#endif
28 28
29/* 29/*
@@ -203,6 +203,15 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"), 203 DMI_MATCH(DMI_BOARD_NAME, "0T656F"),
204 }, 204 },
205 }, 205 },
 206 { /* Handle problems with rebooting on Dell OptiPlex 760 with 0G919G */
207 .callback = set_bios_reboot,
208 .ident = "Dell OptiPlex 760",
209 .matches = {
210 DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."),
211 DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 760"),
212 DMI_MATCH(DMI_BOARD_NAME, "0G919G"),
213 },
214 },
206 { /* Handle problems with rebooting on Dell 2400's */ 215 { /* Handle problems with rebooting on Dell 2400's */
207 .callback = set_bios_reboot, 216 .callback = set_bios_reboot,
208 .ident = "Dell PowerEdge 2400", 217 .ident = "Dell PowerEdge 2400",
@@ -259,6 +268,14 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
259 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"), 268 DMI_MATCH(DMI_PRODUCT_NAME, "SBC-FITPC2"),
260 }, 269 },
261 }, 270 },
271 { /* Handle problems with rebooting on ASUS P4S800 */
272 .callback = set_bios_reboot,
273 .ident = "ASUS P4S800",
274 .matches = {
275 DMI_MATCH(DMI_BOARD_VENDOR, "ASUSTeK Computer INC."),
276 DMI_MATCH(DMI_BOARD_NAME, "P4S800"),
277 },
278 },
262 { } 279 { }
263}; 280};
264 281
@@ -444,6 +461,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = {
444 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"), 461 DMI_MATCH(DMI_PRODUCT_NAME, "Macmini3,1"),
445 }, 462 },
446 }, 463 },
464 { /* Handle problems with rebooting on the iMac9,1. */
465 .callback = set_pci_reboot,
466 .ident = "Apple iMac9,1",
467 .matches = {
468 DMI_MATCH(DMI_SYS_VENDOR, "Apple Inc."),
469 DMI_MATCH(DMI_PRODUCT_NAME, "iMac9,1"),
470 },
471 },
447 { } 472 { }
448}; 473};
449 474
@@ -622,7 +647,7 @@ void native_machine_shutdown(void)
622#endif 647#endif
623 648
624#ifdef CONFIG_X86_64 649#ifdef CONFIG_X86_64
625 pci_iommu_shutdown(); 650 x86_platform.iommu_shutdown();
626#endif 651#endif
627} 652}
628 653
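
The new reboot entries extend the DMI-keyed quirk table pattern this file already uses: match vendor/board strings, then run a callback that switches the reboot method. A self-contained sketch of that pattern with placeholder strings:

#include <linux/dmi.h>

static int __init example_set_bios_reboot(const struct dmi_system_id *d)
{
        pr_info("%s detected: switching reboot method\n", d->ident);
        return 0;
}

static struct dmi_system_id __initdata example_reboot_dmi_table[] = {
        {       /* Placeholder entry, not a real board */
                .callback = example_set_bios_reboot,
                .ident = "Example Board",
                .matches = {
                        DMI_MATCH(DMI_SYS_VENDOR, "Example Vendor"),
                        DMI_MATCH(DMI_BOARD_NAME, "EX-1"),
                },
        },
        { }
};

/* Typically walked once at init: dmi_check_system(example_reboot_dmi_table); */
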
diff --git a/arch/x86/kernel/reboot_fixups_32.c b/arch/x86/kernel/reboot_fixups_32.c
index 61a837743fe5..fda313ebbb03 100644
--- a/arch/x86/kernel/reboot_fixups_32.c
+++ b/arch/x86/kernel/reboot_fixups_32.c
@@ -12,7 +12,7 @@
12#include <linux/interrupt.h> 12#include <linux/interrupt.h>
13#include <asm/reboot_fixups.h> 13#include <asm/reboot_fixups.h>
14#include <asm/msr.h> 14#include <asm/msr.h>
15#include <asm/geode.h> 15#include <linux/cs5535.h>
16 16
17static void cs5530a_warm_reset(struct pci_dev *dev) 17static void cs5530a_warm_reset(struct pci_dev *dev)
18{ 18{
@@ -80,6 +80,7 @@ void mach_reboot_fixups(void)
80 continue; 80 continue;
81 81
82 cur->reboot_fixup(dev); 82 cur->reboot_fixup(dev);
83 pci_dev_put(dev);
83 } 84 }
84} 85}
85 86
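
The added pci_dev_put() closes a reference leak: pci_get_device() hands back the device with its refcount raised, so every successful lookup must be balanced with a put once the caller is done. A minimal sketch of the pairing:

#include <linux/pci.h>

/* Look up a device by IDs, use it, and drop the reference we were given. */
static void example_touch_device(unsigned int vendor, unsigned int device)
{
        struct pci_dev *dev = pci_get_device(vendor, device, NULL);

        if (!dev)
                return;
        /* ... poke the device here ... */
        pci_dev_put(dev);       /* balances the reference from pci_get_device() */
}
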
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 2a34f9c5be21..c4851eff57b3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -55,7 +55,6 @@
55#include <linux/stddef.h> 55#include <linux/stddef.h>
56#include <linux/unistd.h> 56#include <linux/unistd.h>
57#include <linux/ptrace.h> 57#include <linux/ptrace.h>
58#include <linux/slab.h>
59#include <linux/user.h> 58#include <linux/user.h>
60#include <linux/delay.h> 59#include <linux/delay.h>
61 60
@@ -73,6 +72,7 @@
73 72
74#include <asm/mtrr.h> 73#include <asm/mtrr.h>
75#include <asm/apic.h> 74#include <asm/apic.h>
75#include <asm/trampoline.h>
76#include <asm/e820.h> 76#include <asm/e820.h>
77#include <asm/mpspec.h> 77#include <asm/mpspec.h>
78#include <asm/setup.h> 78#include <asm/setup.h>
@@ -106,9 +106,11 @@
106#include <asm/percpu.h> 106#include <asm/percpu.h>
107#include <asm/topology.h> 107#include <asm/topology.h>
108#include <asm/apicdef.h> 108#include <asm/apicdef.h>
109#include <asm/k8.h>
109#ifdef CONFIG_X86_64 110#ifdef CONFIG_X86_64
110#include <asm/numa_64.h> 111#include <asm/numa_64.h>
111#endif 112#endif
113#include <asm/mce.h>
112 114
113/* 115/*
114 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. 116 * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries.
@@ -118,7 +120,9 @@
118unsigned long max_low_pfn_mapped; 120unsigned long max_low_pfn_mapped;
119unsigned long max_pfn_mapped; 121unsigned long max_pfn_mapped;
120 122
123#ifdef CONFIG_DMI
121RESERVE_BRK(dmi_alloc, 65536); 124RESERVE_BRK(dmi_alloc, 65536);
125#endif
122 126
123unsigned int boot_cpu_id __read_mostly; 127unsigned int boot_cpu_id __read_mostly;
124 128
@@ -247,7 +251,7 @@ EXPORT_SYMBOL(edd);
247 * from boot_params into a safe place. 251 * from boot_params into a safe place.
248 * 252 *
249 */ 253 */
250static inline void copy_edd(void) 254static inline void __init copy_edd(void)
251{ 255{
252 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer, 256 memcpy(edd.mbr_signature, boot_params.edd_mbr_sig_buffer,
253 sizeof(edd.mbr_signature)); 257 sizeof(edd.mbr_signature));
@@ -256,7 +260,7 @@ static inline void copy_edd(void)
256 edd.edd_info_nr = boot_params.eddbuf_entries; 260 edd.edd_info_nr = boot_params.eddbuf_entries;
257} 261}
258#else 262#else
259static inline void copy_edd(void) 263static inline void __init copy_edd(void)
260{ 264{
261} 265}
262#endif 266#endif
@@ -309,16 +313,17 @@ static void __init reserve_brk(void)
309#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT) 313#define MAX_MAP_CHUNK (NR_FIX_BTMAPS << PAGE_SHIFT)
310static void __init relocate_initrd(void) 314static void __init relocate_initrd(void)
311{ 315{
312 316 /* Assume only end is not page aligned */
313 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 317 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
314 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 318 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
319 u64 area_size = PAGE_ALIGN(ramdisk_size);
315 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 320 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
316 u64 ramdisk_here; 321 u64 ramdisk_here;
317 unsigned long slop, clen, mapaddr; 322 unsigned long slop, clen, mapaddr;
318 char *p, *q; 323 char *p, *q;
319 324
320 /* We need to move the initrd down into lowmem */ 325 /* We need to move the initrd down into lowmem */
321 ramdisk_here = find_e820_area(0, end_of_lowmem, ramdisk_size, 326 ramdisk_here = find_e820_area(0, end_of_lowmem, area_size,
322 PAGE_SIZE); 327 PAGE_SIZE);
323 328
324 if (ramdisk_here == -1ULL) 329 if (ramdisk_here == -1ULL)
@@ -327,7 +332,7 @@ static void __init relocate_initrd(void)
327 332
328 /* Note: this includes all the lowmem currently occupied by 333 /* Note: this includes all the lowmem currently occupied by
329 the initrd, we rely on that fact to keep the data intact. */ 334 the initrd, we rely on that fact to keep the data intact. */
330 reserve_early(ramdisk_here, ramdisk_here + ramdisk_size, 335 reserve_early(ramdisk_here, ramdisk_here + area_size,
331 "NEW RAMDISK"); 336 "NEW RAMDISK");
332 initrd_start = ramdisk_here + PAGE_OFFSET; 337 initrd_start = ramdisk_here + PAGE_OFFSET;
333 initrd_end = initrd_start + ramdisk_size; 338 initrd_end = initrd_start + ramdisk_size;
@@ -371,9 +376,10 @@ static void __init relocate_initrd(void)
371 376
372static void __init reserve_initrd(void) 377static void __init reserve_initrd(void)
373{ 378{
379 /* Assume only end is not page aligned */
374 u64 ramdisk_image = boot_params.hdr.ramdisk_image; 380 u64 ramdisk_image = boot_params.hdr.ramdisk_image;
375 u64 ramdisk_size = boot_params.hdr.ramdisk_size; 381 u64 ramdisk_size = boot_params.hdr.ramdisk_size;
376 u64 ramdisk_end = ramdisk_image + ramdisk_size; 382 u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size);
377 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT; 383 u64 end_of_lowmem = max_low_pfn_mapped << PAGE_SHIFT;
378 384
379 if (!boot_params.hdr.type_of_loader || 385 if (!boot_params.hdr.type_of_loader ||
@@ -486,42 +492,11 @@ static void __init reserve_early_setup_data(void)
486 492
487#ifdef CONFIG_KEXEC 493#ifdef CONFIG_KEXEC
488 494
489/**
490 * Reserve @size bytes of crashkernel memory at any suitable offset.
491 *
492 * @size: Size of the crashkernel memory to reserve.
493 * Returns the base address on success, and -1ULL on failure.
494 */
495static
496unsigned long long __init find_and_reserve_crashkernel(unsigned long long size)
497{
498 const unsigned long long alignment = 16<<20; /* 16M */
499 unsigned long long start = 0LL;
500
501 while (1) {
502 int ret;
503
504 start = find_e820_area(start, ULONG_MAX, size, alignment);
505 if (start == -1ULL)
506 return start;
507
508 /* try to reserve it */
509 ret = reserve_bootmem_generic(start, size, BOOTMEM_EXCLUSIVE);
510 if (ret >= 0)
511 return start;
512
513 start += alignment;
514 }
515}
516
517static inline unsigned long long get_total_mem(void) 495static inline unsigned long long get_total_mem(void)
518{ 496{
519 unsigned long long total; 497 unsigned long long total;
520 498
521 total = max_low_pfn - min_low_pfn; 499 total = max_pfn - min_low_pfn;
522#ifdef CONFIG_HIGHMEM
523 total += highend_pfn - highstart_pfn;
524#endif
525 500
526 return total << PAGE_SHIFT; 501 return total << PAGE_SHIFT;
527} 502}
@@ -541,21 +516,25 @@ static void __init reserve_crashkernel(void)
541 516
542 /* 0 means: find the address automatically */ 517 /* 0 means: find the address automatically */
543 if (crash_base <= 0) { 518 if (crash_base <= 0) {
544 crash_base = find_and_reserve_crashkernel(crash_size); 519 const unsigned long long alignment = 16<<20; /* 16M */
520
521 crash_base = find_e820_area(alignment, ULONG_MAX, crash_size,
522 alignment);
545 if (crash_base == -1ULL) { 523 if (crash_base == -1ULL) {
546 pr_info("crashkernel reservation failed. " 524 pr_info("crashkernel reservation failed - No suitable area found.\n");
547 "No suitable area found.\n");
548 return; 525 return;
549 } 526 }
550 } else { 527 } else {
551 ret = reserve_bootmem_generic(crash_base, crash_size, 528 unsigned long long start;
552 BOOTMEM_EXCLUSIVE); 529
553 if (ret < 0) { 530 start = find_e820_area(crash_base, ULONG_MAX, crash_size,
554 pr_info("crashkernel reservation failed - " 531 1<<20);
555 "memory is in use\n"); 532 if (start != crash_base) {
533 pr_info("crashkernel reservation failed - memory is in use.\n");
556 return; 534 return;
557 } 535 }
558 } 536 }
537 reserve_early(crash_base, crash_base + crash_size, "CRASH KERNEL");
559 538
560 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " 539 printk(KERN_INFO "Reserving %ldMB of memory at %ldMB "
561 "for crashkernel (System RAM: %ldMB)\n", 540 "for crashkernel (System RAM: %ldMB)\n",
@@ -628,6 +607,16 @@ static int __init setup_elfcorehdr(char *arg)
628early_param("elfcorehdr", setup_elfcorehdr); 607early_param("elfcorehdr", setup_elfcorehdr);
629#endif 608#endif
630 609
610static __init void reserve_ibft_region(void)
611{
612 unsigned long addr, size = 0;
613
614 addr = find_ibft_region(&size);
615
616 if (size)
617 reserve_early_overlap_ok(addr, addr + size, "ibft");
618}
619
631#ifdef CONFIG_X86_RESERVE_LOW_64K 620#ifdef CONFIG_X86_RESERVE_LOW_64K
632static int __init dmi_low_memory_corruption(const struct dmi_system_id *d) 621static int __init dmi_low_memory_corruption(const struct dmi_system_id *d)
633{ 622{
@@ -666,23 +655,48 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
666 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"), 655 DMI_MATCH(DMI_BIOS_VENDOR, "Phoenix/MSC"),
667 }, 656 },
668 }, 657 },
669 {
670 /* 658 /*
671 * AMI BIOS with low memory corruption was found on Intel DG45ID board. 659 * AMI BIOS with low memory corruption was found on Intel DG45ID and
672 * It hase different DMI_BIOS_VENDOR = "Intel Corp.", for now we will 660 * DG45FC boards.
661 * It has a different DMI_BIOS_VENDOR = "Intel Corp.", for now we will
673 * match only DMI_BOARD_NAME and see if there is more bad products 662 * match only DMI_BOARD_NAME and see if there is more bad products
674 * with this vendor. 663 * with this vendor.
675 */ 664 */
665 {
676 .callback = dmi_low_memory_corruption, 666 .callback = dmi_low_memory_corruption,
677 .ident = "AMI BIOS", 667 .ident = "AMI BIOS",
678 .matches = { 668 .matches = {
679 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"), 669 DMI_MATCH(DMI_BOARD_NAME, "DG45ID"),
680 }, 670 },
681 }, 671 },
672 {
673 .callback = dmi_low_memory_corruption,
674 .ident = "AMI BIOS",
675 .matches = {
676 DMI_MATCH(DMI_BOARD_NAME, "DG45FC"),
677 },
678 },
682#endif 679#endif
683 {} 680 {}
684}; 681};
685 682
683static void __init trim_bios_range(void)
684{
685 /*
686 * A special case is the first 4Kb of memory;
687 * This is a BIOS owned area, not kernel ram, but generally
688 * not listed as such in the E820 table.
689 */
690 e820_update_range(0, PAGE_SIZE, E820_RAM, E820_RESERVED);
691 /*
692 * special case: Some BIOSen report the PC BIOS
693 * area (640->1Mb) as ram even though it is not.
694 * take them out.
695 */
696 e820_remove_range(BIOS_BEGIN, BIOS_END - BIOS_BEGIN, E820_RAM, 1);
697 sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
698}
699
686/* 700/*
687 * Determine if we were loaded by an EFI loader. If so, then we have also been 701 * Determine if we were loaded by an EFI loader. If so, then we have also been
688 * passed the efi memmap, systab, etc., so we should use these data structures 702 * passed the efi memmap, systab, etc., so we should use these data structures
@@ -698,6 +712,9 @@ static struct dmi_system_id __initdata bad_bios_dmi_table[] = {
698 712
699void __init setup_arch(char **cmdline_p) 713void __init setup_arch(char **cmdline_p)
700{ 714{
715 int acpi = 0;
716 int k8 = 0;
717
701#ifdef CONFIG_X86_32 718#ifdef CONFIG_X86_32
702 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); 719 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data));
703 visws_early_detect(); 720 visws_early_detect();
@@ -790,21 +807,18 @@ void __init setup_arch(char **cmdline_p)
790 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE); 807 strlcpy(command_line, boot_command_line, COMMAND_LINE_SIZE);
791 *cmdline_p = command_line; 808 *cmdline_p = command_line;
792 809
793#ifdef CONFIG_X86_64
794 /* 810 /*
795 * Must call this twice: Once just to detect whether hardware doesn't 811 * x86_configure_nx() is called before parse_early_param() to detect
796 * support NX (so that the early EHCI debug console setup can safely 812 * whether hardware doesn't support NX (so that the early EHCI debug
797 * call set_fixmap(), and then again after parsing early parameters to 813 * console setup can safely call set_fixmap()). It may then be called
798 * honor the respective command line option. 814 * again from within noexec_setup() during parsing early parameters
815 * to honor the respective command line option.
799 */ 816 */
800 check_efer(); 817 x86_configure_nx();
801#endif
802 818
803 parse_early_param(); 819 parse_early_param();
804 820
805#ifdef CONFIG_X86_64 821 x86_report_nx();
806 check_efer();
807#endif
808 822
809 /* Must be before kernel pagetables are setup */ 823 /* Must be before kernel pagetables are setup */
810 vmi_activate(); 824 vmi_activate();
@@ -846,7 +860,7 @@ void __init setup_arch(char **cmdline_p)
846 insert_resource(&iomem_resource, &data_resource); 860 insert_resource(&iomem_resource, &data_resource);
847 insert_resource(&iomem_resource, &bss_resource); 861 insert_resource(&iomem_resource, &bss_resource);
848 862
849 863 trim_bios_range();
850#ifdef CONFIG_X86_32 864#ifdef CONFIG_X86_32
851 if (ppro_with_ram_bug()) { 865 if (ppro_with_ram_bug()) {
852 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM, 866 e820_update_range(0x70000000ULL, 0x40000ULL, E820_RAM,
@@ -900,6 +914,22 @@ void __init setup_arch(char **cmdline_p)
900 914
901 reserve_brk(); 915 reserve_brk();
902 916
917 /*
918 * Find and reserve possible boot-time SMP configuration:
919 */
920 find_smp_config();
921
922 reserve_ibft_region();
923
924 reserve_trampoline_memory();
925
926#ifdef CONFIG_ACPI_SLEEP
927 /*
928 * Reserve low memory region for sleep support.
929 * even before init_memory_mapping
930 */
931 acpi_reserve_wakeup_memory();
932#endif
903 init_gbpages(); 933 init_gbpages();
904 934
905 /* max_pfn_mapped is updated here */ 935 /* max_pfn_mapped is updated here */
@@ -926,6 +956,8 @@ void __init setup_arch(char **cmdline_p)
926 956
927 reserve_initrd(); 957 reserve_initrd();
928 958
959 reserve_crashkernel();
960
929 vsmp_init(); 961 vsmp_init();
930 962
931 io_delay_init(); 963 io_delay_init();
@@ -941,34 +973,20 @@ void __init setup_arch(char **cmdline_p)
941 /* 973 /*
942 * Parse SRAT to discover nodes. 974 * Parse SRAT to discover nodes.
943 */ 975 */
944 acpi_numa_init(); 976 acpi = acpi_numa_init();
945#endif 977#endif
946 978
947 initmem_init(0, max_pfn); 979#ifdef CONFIG_K8_NUMA
948 980 if (!acpi)
949#ifdef CONFIG_ACPI_SLEEP 981 k8 = !k8_numa_init(0, max_pfn);
950 /*
951 * Reserve low memory region for sleep support.
952 */
953 acpi_reserve_bootmem();
954#endif 982#endif
955 /*
956 * Find and reserve possible boot-time SMP configuration:
957 */
958 find_smp_config();
959 983
960 reserve_crashkernel(); 984 initmem_init(0, max_pfn, acpi, k8);
961 985#ifndef CONFIG_NO_BOOTMEM
962#ifdef CONFIG_X86_64 986 early_res_to_bootmem(0, max_low_pfn<<PAGE_SHIFT);
963 /*
964 * dma32_reserve_bootmem() allocates bootmem which may conflict
965 * with the crashkernel command line, so do that after
966 * reserve_crashkernel()
967 */
968 dma32_reserve_bootmem();
969#endif 987#endif
970 988
971 reserve_ibft_region(); 989 dma32_reserve_bootmem();
972 990
973#ifdef CONFIG_KVM_CLOCK 991#ifdef CONFIG_KVM_CLOCK
974 kvmclock_init(); 992 kvmclock_init();
@@ -1031,6 +1049,8 @@ void __init setup_arch(char **cmdline_p)
1031#endif 1049#endif
1032#endif 1050#endif
1033 x86_init.oem.banner(); 1051 x86_init.oem.banner();
1052
1053 mcheck_init();
1034} 1054}
1035 1055
1036#ifdef CONFIG_X86_32 1056#ifdef CONFIG_X86_32
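
The reworked reserve_crashkernel() above drops the old find-and-reserve-bootmem loop in favor of the early-reservation calls used elsewhere in this file: probe the e820 map for a suitably aligned hole, then mark it with reserve_early(). A condensed sketch of that flow, error handling trimmed:

/* Illustrative only: reserve "size" bytes for a crash kernel, 16 MB aligned. */
static u64 __init sketch_reserve_crash_area(u64 size)
{
        const u64 alignment = 16 << 20;                 /* 16 MB */
        u64 base = find_e820_area(alignment, ULONG_MAX, size, alignment);

        if (base == -1ULL)
                return -1ULL;                           /* no suitable hole */
        reserve_early(base, base + size, "CRASH KERNEL");
        return base;
}
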
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index d559af913e1f..ef6370b00e70 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -1,3 +1,5 @@
1#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
2
1#include <linux/kernel.h> 3#include <linux/kernel.h>
2#include <linux/module.h> 4#include <linux/module.h>
3#include <linux/init.h> 5#include <linux/init.h>
@@ -20,9 +22,9 @@
20#include <asm/stackprotector.h> 22#include <asm/stackprotector.h>
21 23
22#ifdef CONFIG_DEBUG_PER_CPU_MAPS 24#ifdef CONFIG_DEBUG_PER_CPU_MAPS
23# define DBG(x...) printk(KERN_DEBUG x) 25# define DBG(fmt, ...) pr_dbg(fmt, ##__VA_ARGS__)
24#else 26#else
25# define DBG(x...) 27# define DBG(fmt, ...) do { if (0) pr_dbg(fmt, ##__VA_ARGS__); } while (0)
26#endif 28#endif
27 29
28DEFINE_PER_CPU(int, cpu_number); 30DEFINE_PER_CPU(int, cpu_number);
@@ -116,8 +118,8 @@ static void * __init pcpu_alloc_bootmem(unsigned int cpu, unsigned long size,
116 } else { 118 } else {
117 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node), 119 ptr = __alloc_bootmem_node_nopanic(NODE_DATA(node),
118 size, align, goal); 120 size, align, goal);
119 pr_debug("per cpu data for cpu%d %lu bytes on node%d at " 121 pr_debug("per cpu data for cpu%d %lu bytes on node%d at %016lx\n",
120 "%016lx\n", cpu, size, node, __pa(ptr)); 122 cpu, size, node, __pa(ptr));
121 } 123 }
122 return ptr; 124 return ptr;
123#else 125#else
@@ -135,7 +137,13 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align)
135 137
136static void __init pcpu_fc_free(void *ptr, size_t size) 138static void __init pcpu_fc_free(void *ptr, size_t size)
137{ 139{
140#ifdef CONFIG_NO_BOOTMEM
141 u64 start = __pa(ptr);
142 u64 end = start + size;
143 free_early_partial(start, end);
144#else
138 free_bootmem(__pa(ptr), size); 145 free_bootmem(__pa(ptr), size);
146#endif
139} 147}
140 148
141static int __init pcpu_cpu_distance(unsigned int from, unsigned int to) 149static int __init pcpu_cpu_distance(unsigned int from, unsigned int to)
@@ -198,8 +206,7 @@ void __init setup_per_cpu_areas(void)
198 pcpu_cpu_distance, 206 pcpu_cpu_distance,
199 pcpu_fc_alloc, pcpu_fc_free); 207 pcpu_fc_alloc, pcpu_fc_free);
200 if (rc < 0) 208 if (rc < 0)
201 pr_warning("PERCPU: %s allocator failed (%d), " 209 pr_warning("%s allocator failed (%d), falling back to page size\n",
202 "falling back to page size\n",
203 pcpu_fc_names[pcpu_chosen_fc], rc); 210 pcpu_fc_names[pcpu_chosen_fc], rc);
204 } 211 }
205 if (rc < 0) 212 if (rc < 0)
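
The rewritten DBG() above keeps the compiled-out-but-still-checked idiom: with per-cpu map debugging off, the call sits inside do { if (0) ... } while (0), so the format string and arguments are still type-checked by the compiler yet generate no object code. A standalone sketch of the idiom using pr_debug() (the pr_dbg spelling above is this patch's choice; pr_debug is the common helper):

#include <linux/kernel.h>

#ifdef EXAMPLE_DEBUG
# define EX_DBG(fmt, ...)       pr_debug(fmt, ##__VA_ARGS__)
#else
/* Still format-checked at compile time, but emits no code. */
# define EX_DBG(fmt, ...)       do { if (0) pr_debug(fmt, ##__VA_ARGS__); } while (0)
#endif

static void example(void)
{
        EX_DBG("cpu %d came up\n", 3);  /* checked either way */
}
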
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index 6a44a76055ad..4fd173cd8e57 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -19,6 +19,7 @@
19#include <linux/stddef.h> 19#include <linux/stddef.h>
20#include <linux/personality.h> 20#include <linux/personality.h>
21#include <linux/uaccess.h> 21#include <linux/uaccess.h>
22#include <linux/user-return-notifier.h>
22 23
23#include <asm/processor.h> 24#include <asm/processor.h>
24#include <asm/ucontext.h> 25#include <asm/ucontext.h>
@@ -544,22 +545,12 @@ sys_sigaction(int sig, const struct old_sigaction __user *act,
544} 545}
545#endif /* CONFIG_X86_32 */ 546#endif /* CONFIG_X86_32 */
546 547
547#ifdef CONFIG_X86_32 548long
548int sys_sigaltstack(struct pt_regs *regs)
549{
550 const stack_t __user *uss = (const stack_t __user *)regs->bx;
551 stack_t __user *uoss = (stack_t __user *)regs->cx;
552
553 return do_sigaltstack(uss, uoss, regs->sp);
554}
555#else /* !CONFIG_X86_32 */
556asmlinkage long
557sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss, 549sys_sigaltstack(const stack_t __user *uss, stack_t __user *uoss,
558 struct pt_regs *regs) 550 struct pt_regs *regs)
559{ 551{
560 return do_sigaltstack(uss, uoss, regs->sp); 552 return do_sigaltstack(uss, uoss, regs->sp);
561} 553}
562#endif /* CONFIG_X86_32 */
563 554
564/* 555/*
565 * Do a signal return; undo the signal stack. 556 * Do a signal return; undo the signal stack.
@@ -799,15 +790,6 @@ static void do_signal(struct pt_regs *regs)
799 790
800 signr = get_signal_to_deliver(&info, &ka, regs, NULL); 791 signr = get_signal_to_deliver(&info, &ka, regs, NULL);
801 if (signr > 0) { 792 if (signr > 0) {
802 /*
803 * Re-enable any watchpoints before delivering the
804 * signal to user space. The processor register will
805 * have been cleared if the watchpoint triggered
806 * inside the kernel.
807 */
808 if (current->thread.debugreg7)
809 set_debugreg(current->thread.debugreg7, 7);
810
811 /* Whee! Actually deliver the signal. */ 793 /* Whee! Actually deliver the signal. */
812 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) { 794 if (handle_signal(signr, &info, &ka, oldset, regs) == 0) {
813 /* 795 /*
@@ -872,6 +854,8 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
872 if (current->replacement_session_keyring) 854 if (current->replacement_session_keyring)
873 key_replace_session_keyring(); 855 key_replace_session_keyring();
874 } 856 }
857 if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
858 fire_user_return_notifiers();
875 859
876#ifdef CONFIG_X86_32 860#ifdef CONFIG_X86_32
877 clear_thread_flag(TIF_IRET); 861 clear_thread_flag(TIF_IRET);
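
The new _TIF_USER_RETURN_NOTIFY handling fires callbacks registered through the user-return-notifier API just before a task returns to user space (KVM uses this to defer MSR restores). A hedged sketch of a registration; the struct layout and helper names are assumed from <linux/user-return-notifier.h> and should be checked against that header:

#include <linux/user-return-notifier.h>

static void example_on_user_return(struct user_return_notifier *urn)
{
        /* Runs on this CPU right before the current task returns to user space. */
}

static struct user_return_notifier example_urn = {
        .on_user_return = example_on_user_return,
};

static void example_arm_notifier(void)
{
        user_return_notifier_register(&example_urn);    /* arms it for this CPU */
}
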
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index a93528bc16e9..97af589a5c0c 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -21,6 +21,7 @@
21#include <linux/cache.h> 21#include <linux/cache.h>
22#include <linux/interrupt.h> 22#include <linux/interrupt.h>
23#include <linux/cpu.h> 23#include <linux/cpu.h>
24#include <linux/gfp.h>
24 25
25#include <litmus/litmus.h> 26#include <litmus/litmus.h>
26#include <litmus/trace.h> 27#include <litmus/trace.h>
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 565ebc65920e..763d815e27a0 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -48,6 +48,8 @@
48#include <linux/err.h> 48#include <linux/err.h>
49#include <linux/nmi.h> 49#include <linux/nmi.h>
50#include <linux/tboot.h> 50#include <linux/tboot.h>
51#include <linux/stackprotector.h>
52#include <linux/gfp.h>
51 53
52#include <asm/acpi.h> 54#include <asm/acpi.h>
53#include <asm/desc.h> 55#include <asm/desc.h>
@@ -67,6 +69,7 @@
67#include <linux/mc146818rtc.h> 69#include <linux/mc146818rtc.h>
68 70
69#include <asm/smpboot_hooks.h> 71#include <asm/smpboot_hooks.h>
72#include <asm/i8259.h>
70 73
71#ifdef CONFIG_X86_32 74#ifdef CONFIG_X86_32
72u8 apicid_2_node[MAX_APICID]; 75u8 apicid_2_node[MAX_APICID];
@@ -240,7 +243,10 @@ static void __cpuinit smp_callin(void)
240 end_local_APIC_setup(); 243 end_local_APIC_setup();
241 map_cpu_to_logical_apicid(); 244 map_cpu_to_logical_apicid();
242 245
243 notify_cpu_starting(cpuid); 246 /*
247 * Need to setup vector mappings before we enable interrupts.
248 */
249 setup_vector_irq(smp_processor_id());
244 /* 250 /*
245 * Get our bogomips. 251 * Get our bogomips.
246 * 252 *
@@ -257,6 +263,8 @@ static void __cpuinit smp_callin(void)
257 */ 263 */
258 smp_store_cpu_info(cpuid); 264 smp_store_cpu_info(cpuid);
259 265
266 notify_cpu_starting(cpuid);
267
260 /* 268 /*
261 * Allow the master to continue. 269 * Allow the master to continue.
262 */ 270 */
@@ -286,9 +294,9 @@ notrace static void __cpuinit start_secondary(void *unused)
286 check_tsc_sync_target(); 294 check_tsc_sync_target();
287 295
288 if (nmi_watchdog == NMI_IO_APIC) { 296 if (nmi_watchdog == NMI_IO_APIC) {
289 disable_8259A_irq(0); 297 legacy_pic->chip->mask(0);
290 enable_NMI_through_LVT0(); 298 enable_NMI_through_LVT0();
291 enable_8259A_irq(0); 299 legacy_pic->chip->unmask(0);
292 } 300 }
293 301
294#ifdef CONFIG_X86_32 302#ifdef CONFIG_X86_32
@@ -315,15 +323,18 @@ notrace static void __cpuinit start_secondary(void *unused)
315 */ 323 */
316 ipi_call_lock(); 324 ipi_call_lock();
317 lock_vector_lock(); 325 lock_vector_lock();
318 __setup_vector_irq(smp_processor_id());
319 set_cpu_online(smp_processor_id(), true); 326 set_cpu_online(smp_processor_id(), true);
320 unlock_vector_lock(); 327 unlock_vector_lock();
321 ipi_call_unlock(); 328 ipi_call_unlock();
322 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; 329 per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
330 x86_platform.nmi_init();
323 331
324 /* enable local interrupts */ 332 /* enable local interrupts */
325 local_irq_enable(); 333 local_irq_enable();
326 334
335 /* to prevent fake stack check failure in clock setup */
336 boot_init_stack_canary();
337
327 x86_cpuinit.setup_percpu_clockev(); 338 x86_cpuinit.setup_percpu_clockev();
328 339
329 wmb(); 340 wmb();
@@ -671,6 +682,26 @@ static void __cpuinit do_fork_idle(struct work_struct *work)
671 complete(&c_idle->done); 682 complete(&c_idle->done);
672} 683}
673 684
685/* reduce the number of lines printed when booting a large cpu count system */
686static void __cpuinit announce_cpu(int cpu, int apicid)
687{
688 static int current_node = -1;
689 int node = cpu_to_node(cpu);
690
691 if (system_state == SYSTEM_BOOTING) {
692 if (node != current_node) {
693 if (current_node > (-1))
694 pr_cont(" Ok.\n");
695 current_node = node;
696 pr_info("Booting Node %3d, Processors ", node);
697 }
698 pr_cont(" #%d%s", cpu, cpu == (nr_cpu_ids - 1) ? " Ok.\n" : "");
699 return;
700 } else
701 pr_info("Booting Node %d Processor %d APIC 0x%x\n",
702 node, cpu, apicid);
703}
704
674/* 705/*
675 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad 706 * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
676 * (ie clustered apic addressing mode), this is a LOGICAL apic ID. 707 * (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -687,7 +718,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
687 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done), 718 .done = COMPLETION_INITIALIZER_ONSTACK(c_idle.done),
688 }; 719 };
689 720
690 INIT_WORK(&c_idle.work, do_fork_idle); 721 INIT_WORK_ON_STACK(&c_idle.work, do_fork_idle);
691 722
692 alternatives_smp_switch(1); 723 alternatives_smp_switch(1);
693 724
@@ -713,6 +744,7 @@ static int __cpuinit do_boot_cpu(int apicid, int cpu)
713 744
714 if (IS_ERR(c_idle.idle)) { 745 if (IS_ERR(c_idle.idle)) {
715 printk("failed fork for CPU %d\n", cpu); 746 printk("failed fork for CPU %d\n", cpu);
747 destroy_work_on_stack(&c_idle.work);
716 return PTR_ERR(c_idle.idle); 748 return PTR_ERR(c_idle.idle);
717 } 749 }
718 750
@@ -736,9 +768,8 @@ do_rest:
736 /* start_ip had better be page-aligned! */ 768 /* start_ip had better be page-aligned! */
737 start_ip = setup_trampoline(); 769 start_ip = setup_trampoline();
738 770
739 /* So we see what's up */ 771 /* So we see what's up */
740 printk(KERN_INFO "Booting processor %d APIC 0x%x ip 0x%lx\n", 772 announce_cpu(cpu, apicid);
741 cpu, apicid, start_ip);
742 773
743 /* 774 /*
744 * This grunge runs the startup process for 775 * This grunge runs the startup process for
@@ -787,21 +818,17 @@ do_rest:
787 udelay(100); 818 udelay(100);
788 } 819 }
789 820
790 if (cpumask_test_cpu(cpu, cpu_callin_mask)) { 821 if (cpumask_test_cpu(cpu, cpu_callin_mask))
791 /* number CPUs logically, starting from 1 (BSP is 0) */ 822 pr_debug("CPU%d: has booted.\n", cpu);
792 pr_debug("OK.\n"); 823 else {
793 printk(KERN_INFO "CPU%d: ", cpu);
794 print_cpu_info(&cpu_data(cpu));
795 pr_debug("CPU has booted.\n");
796 } else {
797 boot_error = 1; 824 boot_error = 1;
798 if (*((volatile unsigned char *)trampoline_base) 825 if (*((volatile unsigned char *)trampoline_base)
799 == 0xA5) 826 == 0xA5)
800 /* trampoline started but...? */ 827 /* trampoline started but...? */
801 printk(KERN_ERR "Stuck ??\n"); 828 pr_err("CPU%d: Stuck ??\n", cpu);
802 else 829 else
803 /* trampoline code not run */ 830 /* trampoline code not run */
804 printk(KERN_ERR "Not responding.\n"); 831 pr_err("CPU%d: Not responding.\n", cpu);
805 if (apic->inquire_remote_apic) 832 if (apic->inquire_remote_apic)
806 apic->inquire_remote_apic(apicid); 833 apic->inquire_remote_apic(apicid);
807 } 834 }
@@ -831,6 +858,7 @@ do_rest:
831 smpboot_restore_warm_reset_vector(); 858 smpboot_restore_warm_reset_vector();
832 } 859 }
833 860
861 destroy_work_on_stack(&c_idle.work);
834 return boot_error; 862 return boot_error;
835} 863}
836 864
@@ -1066,9 +1094,7 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus)
1066 set_cpu_sibling_map(0); 1094 set_cpu_sibling_map(0);
1067 1095
1068 enable_IR_x2apic(); 1096 enable_IR_x2apic();
1069#ifdef CONFIG_X86_64
1070 default_setup_apic_routing(); 1097 default_setup_apic_routing();
1071#endif
1072 1098
1073 if (smp_sanity_check(max_cpus) < 0) { 1099 if (smp_sanity_check(max_cpus) < 0) {
1074 printk(KERN_INFO "SMP disabled\n"); 1100 printk(KERN_INFO "SMP disabled\n");
@@ -1196,11 +1222,12 @@ __init void prefill_possible_map(void)
1196 1222
1197 total_cpus = max_t(int, possible, num_processors + disabled_cpus); 1223 total_cpus = max_t(int, possible, num_processors + disabled_cpus);
1198 1224
1199 if (possible > CONFIG_NR_CPUS) { 1225 /* nr_cpu_ids could be reduced via nr_cpus= */
1226 if (possible > nr_cpu_ids) {
1200 printk(KERN_WARNING 1227 printk(KERN_WARNING
1201 "%d Processors exceeds NR_CPUS limit of %d\n", 1228 "%d Processors exceeds NR_CPUS limit of %d\n",
1202 possible, CONFIG_NR_CPUS); 1229 possible, nr_cpu_ids);
1203 possible = CONFIG_NR_CPUS; 1230 possible = nr_cpu_ids;
1204 } 1231 }
1205 1232
1206 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n", 1233 printk(KERN_INFO "SMP: Allowing %d CPUs, %d hotplug CPUs\n",
@@ -1250,16 +1277,7 @@ static void __ref remove_cpu_from_maps(int cpu)
1250void cpu_disable_common(void) 1277void cpu_disable_common(void)
1251{ 1278{
1252 int cpu = smp_processor_id(); 1279 int cpu = smp_processor_id();
1253 /*
1254 * HACK:
1255 * Allow any queued timer interrupts to get serviced
1256 * This is only a temporary solution until we cleanup
1257 * fixup_irqs as we do for IA64.
1258 */
1259 local_irq_enable();
1260 mdelay(1);
1261 1280
1262 local_irq_disable();
1263 remove_siblinginfo(cpu); 1281 remove_siblinginfo(cpu);
1264 1282
1265 /* It's now safe to remove this processor from the online map */ 1283 /* It's now safe to remove this processor from the online map */
@@ -1300,14 +1318,16 @@ void native_cpu_die(unsigned int cpu)
1300 for (i = 0; i < 10; i++) { 1318 for (i = 0; i < 10; i++) {
1301 /* They ack this in play_dead by setting CPU_DEAD */ 1319 /* They ack this in play_dead by setting CPU_DEAD */
1302 if (per_cpu(cpu_state, cpu) == CPU_DEAD) { 1320 if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
1303 printk(KERN_INFO "CPU %d is now offline\n", cpu); 1321 if (system_state == SYSTEM_RUNNING)
1322 pr_info("CPU %u is now offline\n", cpu);
1323
1304 if (1 == num_online_cpus()) 1324 if (1 == num_online_cpus())
1305 alternatives_smp_switch(0); 1325 alternatives_smp_switch(0);
1306 return; 1326 return;
1307 } 1327 }
1308 msleep(100); 1328 msleep(100);
1309 } 1329 }
1310 printk(KERN_ERR "CPU %u didn't die...\n", cpu); 1330 pr_err("CPU %u didn't die...\n", cpu);
1311} 1331}
1312 1332
1313void play_dead_common(void) 1333void play_dead_common(void)
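
The INIT_WORK_ON_STACK()/destroy_work_on_stack() pair added around c_idle above is the required discipline for work items that live on the stack: the on-stack init sets up the debug-object state, and the destroy call must run on every exit path before the frame disappears. A minimal sketch of the pattern with a hypothetical payload:

#include <linux/workqueue.h>
#include <linux/completion.h>

struct example_ctx {
        struct work_struct work;
        struct completion done;
};

static void example_worker(struct work_struct *work)
{
        struct example_ctx *ctx = container_of(work, struct example_ctx, work);

        /* ... do the deferred work ... */
        complete(&ctx->done);
}

static void example_run_on_stack_work(void)
{
        struct example_ctx ctx = {
                .done = COMPLETION_INITIALIZER_ONSTACK(ctx.done),
        };

        INIT_WORK_ON_STACK(&ctx.work, example_worker);
        schedule_work(&ctx.work);
        wait_for_completion(&ctx.done);
        destroy_work_on_stack(&ctx.work);       /* before ctx goes out of scope */
}
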
diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c
index c3eb207181fe..922eefbb3f6c 100644
--- a/arch/x86/kernel/stacktrace.c
+++ b/arch/x86/kernel/stacktrace.c
@@ -53,17 +53,19 @@ save_stack_address_nosched(void *data, unsigned long addr, int reliable)
53} 53}
54 54
55static const struct stacktrace_ops save_stack_ops = { 55static const struct stacktrace_ops save_stack_ops = {
56 .warning = save_stack_warning, 56 .warning = save_stack_warning,
57 .warning_symbol = save_stack_warning_symbol, 57 .warning_symbol = save_stack_warning_symbol,
58 .stack = save_stack_stack, 58 .stack = save_stack_stack,
59 .address = save_stack_address, 59 .address = save_stack_address,
60 .walk_stack = print_context_stack,
60}; 61};
61 62
62static const struct stacktrace_ops save_stack_ops_nosched = { 63static const struct stacktrace_ops save_stack_ops_nosched = {
63 .warning = save_stack_warning, 64 .warning = save_stack_warning,
64 .warning_symbol = save_stack_warning_symbol, 65 .warning_symbol = save_stack_warning_symbol,
65 .stack = save_stack_stack, 66 .stack = save_stack_stack,
66 .address = save_stack_address_nosched, 67 .address = save_stack_address_nosched,
68 .walk_stack = print_context_stack,
67}; 69};
68 70
69/* 71/*
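
Both stacktrace ops tables above gain an explicit .walk_stack hook. After this series the generic walker has to be named in every ops table, so a minimal table looks like the sketch below (fields copied from the hunk, nothing new added):

static const struct stacktrace_ops example_stack_ops = {
	.warning	= save_stack_warning,
	.warning_symbol	= save_stack_warning_symbol,
	.stack		= save_stack_stack,
	.address	= save_stack_address,
	.walk_stack	= print_context_stack,	/* now a required callback */
};
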
diff --git a/arch/x86/kernel/sys_i386_32.c b/arch/x86/kernel/sys_i386_32.c
index 1884a8d12bfa..196552bb412c 100644
--- a/arch/x86/kernel/sys_i386_32.c
+++ b/arch/x86/kernel/sys_i386_32.c
@@ -24,216 +24,6 @@
24 24
25#include <asm/syscalls.h> 25#include <asm/syscalls.h>
26 26
27asmlinkage long sys_mmap2(unsigned long addr, unsigned long len,
28 unsigned long prot, unsigned long flags,
29 unsigned long fd, unsigned long pgoff)
30{
31 int error = -EBADF;
32 struct file *file = NULL;
33 struct mm_struct *mm = current->mm;
34
35 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
36 if (!(flags & MAP_ANONYMOUS)) {
37 file = fget(fd);
38 if (!file)
39 goto out;
40 }
41
42 down_write(&mm->mmap_sem);
43 error = do_mmap_pgoff(file, addr, len, prot, flags, pgoff);
44 up_write(&mm->mmap_sem);
45
46 if (file)
47 fput(file);
48out:
49 return error;
50}
51
52/*
53 * Perform the select(nd, in, out, ex, tv) and mmap() system
54 * calls. Linux/i386 didn't use to be able to handle more than
55 * 4 system call parameters, so these system calls used a memory
56 * block for parameter passing..
57 */
58
59struct mmap_arg_struct {
60 unsigned long addr;
61 unsigned long len;
62 unsigned long prot;
63 unsigned long flags;
64 unsigned long fd;
65 unsigned long offset;
66};
67
68asmlinkage int old_mmap(struct mmap_arg_struct __user *arg)
69{
70 struct mmap_arg_struct a;
71 int err = -EFAULT;
72
73 if (copy_from_user(&a, arg, sizeof(a)))
74 goto out;
75
76 err = -EINVAL;
77 if (a.offset & ~PAGE_MASK)
78 goto out;
79
80 err = sys_mmap2(a.addr, a.len, a.prot, a.flags,
81 a.fd, a.offset >> PAGE_SHIFT);
82out:
83 return err;
84}
85
86
87struct sel_arg_struct {
88 unsigned long n;
89 fd_set __user *inp, *outp, *exp;
90 struct timeval __user *tvp;
91};
92
93asmlinkage int old_select(struct sel_arg_struct __user *arg)
94{
95 struct sel_arg_struct a;
96
97 if (copy_from_user(&a, arg, sizeof(a)))
98 return -EFAULT;
99 /* sys_select() does the appropriate kernel locking */
100 return sys_select(a.n, a.inp, a.outp, a.exp, a.tvp);
101}
102
103/*
104 * sys_ipc() is the de-multiplexer for the SysV IPC calls..
105 *
106 * This is really horribly ugly.
107 */
108asmlinkage int sys_ipc(uint call, int first, int second,
109 int third, void __user *ptr, long fifth)
110{
111 int version, ret;
112
113 version = call >> 16; /* hack for backward compatibility */
114 call &= 0xffff;
115
116 switch (call) {
117 case SEMOP:
118 return sys_semtimedop(first, (struct sembuf __user *)ptr, second, NULL);
119 case SEMTIMEDOP:
120 return sys_semtimedop(first, (struct sembuf __user *)ptr, second,
121 (const struct timespec __user *)fifth);
122
123 case SEMGET:
124 return sys_semget(first, second, third);
125 case SEMCTL: {
126 union semun fourth;
127 if (!ptr)
128 return -EINVAL;
129 if (get_user(fourth.__pad, (void __user * __user *) ptr))
130 return -EFAULT;
131 return sys_semctl(first, second, third, fourth);
132 }
133
134 case MSGSND:
135 return sys_msgsnd(first, (struct msgbuf __user *) ptr,
136 second, third);
137 case MSGRCV:
138 switch (version) {
139 case 0: {
140 struct ipc_kludge tmp;
141 if (!ptr)
142 return -EINVAL;
143
144 if (copy_from_user(&tmp,
145 (struct ipc_kludge __user *) ptr,
146 sizeof(tmp)))
147 return -EFAULT;
148 return sys_msgrcv(first, tmp.msgp, second,
149 tmp.msgtyp, third);
150 }
151 default:
152 return sys_msgrcv(first,
153 (struct msgbuf __user *) ptr,
154 second, fifth, third);
155 }
156 case MSGGET:
157 return sys_msgget((key_t) first, second);
158 case MSGCTL:
159 return sys_msgctl(first, second, (struct msqid_ds __user *) ptr);
160
161 case SHMAT:
162 switch (version) {
163 default: {
164 ulong raddr;
165 ret = do_shmat(first, (char __user *) ptr, second, &raddr);
166 if (ret)
167 return ret;
168 return put_user(raddr, (ulong __user *) third);
169 }
170 case 1: /* iBCS2 emulator entry point */
171 if (!segment_eq(get_fs(), get_ds()))
172 return -EINVAL;
173 /* The "(ulong *) third" is valid _only_ because of the kernel segment thing */
174 return do_shmat(first, (char __user *) ptr, second, (ulong *) third);
175 }
176 case SHMDT:
177 return sys_shmdt((char __user *)ptr);
178 case SHMGET:
179 return sys_shmget(first, second, third);
180 case SHMCTL:
181 return sys_shmctl(first, second,
182 (struct shmid_ds __user *) ptr);
183 default:
184 return -ENOSYS;
185 }
186}
187
188/*
189 * Old cruft
190 */
191asmlinkage int sys_uname(struct old_utsname __user *name)
192{
193 int err;
194 if (!name)
195 return -EFAULT;
196 down_read(&uts_sem);
197 err = copy_to_user(name, utsname(), sizeof(*name));
198 up_read(&uts_sem);
199 return err? -EFAULT:0;
200}
201
202asmlinkage int sys_olduname(struct oldold_utsname __user *name)
203{
204 int error;
205
206 if (!name)
207 return -EFAULT;
208 if (!access_ok(VERIFY_WRITE, name, sizeof(struct oldold_utsname)))
209 return -EFAULT;
210
211 down_read(&uts_sem);
212
213 error = __copy_to_user(&name->sysname, &utsname()->sysname,
214 __OLD_UTS_LEN);
215 error |= __put_user(0, name->sysname + __OLD_UTS_LEN);
216 error |= __copy_to_user(&name->nodename, &utsname()->nodename,
217 __OLD_UTS_LEN);
218 error |= __put_user(0, name->nodename + __OLD_UTS_LEN);
219 error |= __copy_to_user(&name->release, &utsname()->release,
220 __OLD_UTS_LEN);
221 error |= __put_user(0, name->release + __OLD_UTS_LEN);
222 error |= __copy_to_user(&name->version, &utsname()->version,
223 __OLD_UTS_LEN);
224 error |= __put_user(0, name->version + __OLD_UTS_LEN);
225 error |= __copy_to_user(&name->machine, &utsname()->machine,
226 __OLD_UTS_LEN);
227 error |= __put_user(0, name->machine + __OLD_UTS_LEN);
228
229 up_read(&uts_sem);
230
231 error = error ? -EFAULT : 0;
232
233 return error;
234}
235
236
237/* 27/*
238 * Do a system call from kernel instead of calling sys_execve so we 28 * Do a system call from kernel instead of calling sys_execve so we
239 * end up with proper pt_regs. 29 * end up with proper pt_regs.
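
The i386 wrappers removed above (sys_mmap2, old_mmap, old_select, sys_ipc, sys_uname, sys_olduname) move to common code; the syscall_table_32.S hunk further down repoints the corresponding slots at sys_mmap_pgoff, sys_old_mmap and sys_old_select. A hedged sketch of what the struct-based mmap wrapper reduces to once a common sys_mmap_pgoff() exists (the wrapper name is illustrative; the struct layout matches the block deleted above):

#include <linux/uaccess.h>
#include <asm/page.h>

/* Same layout as the mmap_arg_struct removed above. */
struct example_mmap_arg_struct {
	unsigned long addr;
	unsigned long len;
	unsigned long prot;
	unsigned long flags;
	unsigned long fd;
	unsigned long offset;		/* byte offset, must be page aligned */
};

static long example_old_mmap(struct example_mmap_arg_struct __user *arg)
{
	struct example_mmap_arg_struct a;

	if (copy_from_user(&a, arg, sizeof(a)))
		return -EFAULT;
	if (a.offset & ~PAGE_MASK)
		return -EINVAL;

	/* fget(), mmap_sem and do_mmap_pgoff() now live behind this call. */
	return sys_mmap_pgoff(a.addr, a.len, a.prot, a.flags,
			      a.fd, a.offset >> PAGE_SHIFT);
}
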
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 45e00eb09c3a..ff14a5044ce6 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -23,26 +23,11 @@ SYSCALL_DEFINE6(mmap, unsigned long, addr, unsigned long, len,
23 unsigned long, fd, unsigned long, off) 23 unsigned long, fd, unsigned long, off)
24{ 24{
25 long error; 25 long error;
26 struct file *file;
27
28 error = -EINVAL; 26 error = -EINVAL;
29 if (off & ~PAGE_MASK) 27 if (off & ~PAGE_MASK)
30 goto out; 28 goto out;
31 29
32 error = -EBADF; 30 error = sys_mmap_pgoff(addr, len, prot, flags, fd, off >> PAGE_SHIFT);
33 file = NULL;
34 flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
35 if (!(flags & MAP_ANONYMOUS)) {
36 file = fget(fd);
37 if (!file)
38 goto out;
39 }
40 down_write(&current->mm->mmap_sem);
41 error = do_mmap_pgoff(file, addr, len, prot, flags, off >> PAGE_SHIFT);
42 up_write(&current->mm->mmap_sem);
43
44 if (file)
45 fput(file);
46out: 31out:
47 return error; 32 return error;
48} 33}
@@ -224,15 +209,3 @@ bottomup:
224 209
225 return addr; 210 return addr;
226} 211}
227
228
229SYSCALL_DEFINE1(uname, struct new_utsname __user *, name)
230{
231 int err;
232 down_read(&uts_sem);
233 err = copy_to_user(name, utsname(), sizeof(*name));
234 up_read(&uts_sem);
235 if (personality(current->personality) == PER_LINUX32)
236 err |= copy_to_user(&name->machine, "i686", 5);
237 return err ? -EFAULT : 0;
238}
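
The 64-bit sys_mmap above keeps only the alignment check and delegates everything else to sys_mmap_pgoff(). The check has to come before the shift; a tiny sketch of why (illustrative helper):

/* A byte offset with any of the low PAGE_SHIFT bits set cannot be
 * expressed as a page offset, so it must be rejected rather than
 * silently truncated by the shift. */
static long example_byte_to_page_offset(unsigned long off)
{
	if (off & ~PAGE_MASK)
		return -EINVAL;
	return off >> PAGE_SHIFT;
}
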
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index 17fcb3abe236..5da9a68546b7 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -81,7 +81,7 @@ ENTRY(sys_call_table)
81 .long sys_settimeofday 81 .long sys_settimeofday
82 .long sys_getgroups16 /* 80 */ 82 .long sys_getgroups16 /* 80 */
83 .long sys_setgroups16 83 .long sys_setgroups16
84 .long old_select 84 .long sys_old_select
85 .long sys_symlink 85 .long sys_symlink
86 .long sys_lstat 86 .long sys_lstat
87 .long sys_readlink /* 85 */ 87 .long sys_readlink /* 85 */
@@ -89,7 +89,7 @@ ENTRY(sys_call_table)
89 .long sys_swapon 89 .long sys_swapon
90 .long sys_reboot 90 .long sys_reboot
91 .long sys_old_readdir 91 .long sys_old_readdir
92 .long old_mmap /* 90 */ 92 .long sys_old_mmap /* 90 */
93 .long sys_munmap 93 .long sys_munmap
94 .long sys_truncate 94 .long sys_truncate
95 .long sys_ftruncate 95 .long sys_ftruncate
@@ -191,7 +191,7 @@ ENTRY(sys_call_table)
191 .long sys_ni_syscall /* reserved for streams2 */ 191 .long sys_ni_syscall /* reserved for streams2 */
192 .long ptregs_vfork /* 190 */ 192 .long ptregs_vfork /* 190 */
193 .long sys_getrlimit 193 .long sys_getrlimit
194 .long sys_mmap2 194 .long sys_mmap_pgoff
195 .long sys_truncate64 195 .long sys_truncate64
196 .long sys_ftruncate64 196 .long sys_ftruncate64
197 .long sys_stat64 /* 195 */ 197 .long sys_stat64 /* 195 */
@@ -336,7 +336,8 @@ ENTRY(sys_call_table)
336 .long sys_pwritev 336 .long sys_pwritev
337 .long sys_rt_tgsigqueueinfo /* 335 */ 337 .long sys_rt_tgsigqueueinfo /* 335 */
338 .long sys_perf_event_open 338 .long sys_perf_event_open
339 .long sys_set_rt_task_param /* LITMUS^RT 337 */ 339 .long sys_recvmmsg
340 .long sys_set_rt_task_param /* LITMUS^RT 338 */
340 .long sys_get_rt_task_param 341 .long sys_get_rt_task_param
341 .long sys_complete_job 342 .long sys_complete_job
342 .long sys_od_open 343 .long sys_od_open
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index be2573448ed9..fb5cc5e14cfa 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -70,11 +70,11 @@ static irqreturn_t timer_interrupt(int irq, void *dev_id)
70 * manually to deassert NMI lines for the watchdog if run 70 * manually to deassert NMI lines for the watchdog if run
71 * on an 82489DX-based system. 71 * on an 82489DX-based system.
72 */ 72 */
73 spin_lock(&i8259A_lock); 73 raw_spin_lock(&i8259A_lock);
74 outb(0x0c, PIC_MASTER_OCW3); 74 outb(0x0c, PIC_MASTER_OCW3);
75 /* Ack the IRQ; AEOI will end it automatically. */ 75 /* Ack the IRQ; AEOI will end it automatically. */
76 inb(PIC_MASTER_POLL); 76 inb(PIC_MASTER_POLL);
77 spin_unlock(&i8259A_lock); 77 raw_spin_unlock(&i8259A_lock);
78 } 78 }
79 79
80 global_clock_event->event_handler(global_clock_event); 80 global_clock_event->event_handler(global_clock_event);
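
i8259A_lock becomes a raw_spinlock_t in this series, so every taker switches to the raw_spin_lock*() API, as the timer-interrupt hunk above and the PIIX4 handler further down do. The usual motivation for the raw_ variant is that it never turns into a sleeping lock, which matters for code poking the PIC with interrupts off. A minimal sketch of the pattern, using an illustrative lock of its own:

#include <linux/spinlock.h>
#include <asm/io.h>

static DEFINE_RAW_SPINLOCK(example_pic_lock);	/* stand-in for i8259A_lock */

static void example_poll_master_pic(void)
{
	unsigned long flags;

	raw_spin_lock_irqsave(&example_pic_lock, flags);
	outb(0x0c, 0x20);	/* OCW3 poll command to the master PIC */
	(void)inb(0x20);	/* read the poll result, acking the IRQ */
	raw_spin_unlock_irqrestore(&example_pic_lock, flags);
}
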
diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c
index 1740c85e24bb..17b03dd3a6b5 100644
--- a/arch/x86/kernel/tlb_uv.c
+++ b/arch/x86/kernel/tlb_uv.c
@@ -9,6 +9,7 @@
9#include <linux/seq_file.h> 9#include <linux/seq_file.h>
10#include <linux/proc_fs.h> 10#include <linux/proc_fs.h>
11#include <linux/kernel.h> 11#include <linux/kernel.h>
12#include <linux/slab.h>
12 13
13#include <asm/mmu_context.h> 14#include <asm/mmu_context.h>
14#include <asm/uv/uv.h> 15#include <asm/uv/uv.h>
@@ -817,10 +818,8 @@ static int __init uv_init_blade(int blade)
817 */ 818 */
818 apicid = blade_to_first_apicid(blade); 819 apicid = blade_to_first_apicid(blade);
819 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG); 820 pa = uv_read_global_mmr64(pnode, UVH_BAU_DATA_CONFIG);
820 if ((pa & 0xff) != UV_BAU_MESSAGE) { 821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
821 uv_write_global_mmr64(pnode, UVH_BAU_DATA_CONFIG,
822 ((apicid << 32) | UV_BAU_MESSAGE)); 822 ((apicid << 32) | UV_BAU_MESSAGE));
823 }
824 return 0; 823 return 0;
825} 824}
826 825
diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c
index cd022121cab6..c652ef62742d 100644
--- a/arch/x86/kernel/trampoline.c
+++ b/arch/x86/kernel/trampoline.c
@@ -12,21 +12,19 @@
12#endif 12#endif
13 13
14/* ready for x86_64 and x86 */ 14/* ready for x86_64 and x86 */
15unsigned char *__trampinitdata trampoline_base = __va(TRAMPOLINE_BASE); 15unsigned char *__trampinitdata trampoline_base;
16 16
17void __init reserve_trampoline_memory(void) 17void __init reserve_trampoline_memory(void)
18{ 18{
19#ifdef CONFIG_X86_32 19 unsigned long mem;
20 /* 20
21 * But first pinch a few for the stack/trampoline stuff
22 * FIXME: Don't need the extra page at 4K, but need to fix
23 * trampoline before removing it. (see the GDT stuff)
24 */
25 reserve_early(PAGE_SIZE, PAGE_SIZE + PAGE_SIZE, "EX TRAMPOLINE");
26#endif
27 /* Has to be in very low memory so we can execute real-mode AP code. */ 21 /* Has to be in very low memory so we can execute real-mode AP code. */
28 reserve_early(TRAMPOLINE_BASE, TRAMPOLINE_BASE + TRAMPOLINE_SIZE, 22 mem = find_e820_area(0, 1<<20, TRAMPOLINE_SIZE, PAGE_SIZE);
29 "TRAMPOLINE"); 23 if (mem == -1L)
24 panic("Cannot allocate trampoline\n");
25
26 trampoline_base = __va(mem);
27 reserve_early(mem, mem + TRAMPOLINE_SIZE, "TRAMPOLINE");
30} 28}
31 29
32/* 30/*
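
reserve_trampoline_memory() now allocates the trampoline below 1 MB at runtime instead of assuming a fixed TRAMPOLINE_BASE. A sketch of the reservation pattern, using the same find_e820_area()/reserve_early() calls and error handling as the hunk above:

/* Sketch of the low-memory reservation from the hunk above. */
static void __init example_reserve_realmode_area(unsigned long size)
{
	unsigned long mem;

	/* Real-mode AP startup code has to execute below 1 MB. */
	mem = find_e820_area(0, 1 << 20, size, PAGE_SIZE);
	if (mem == -1L)
		panic("Cannot allocate trampoline\n");

	trampoline_base = __va(mem);	/* keep the kernel virtual alias */
	reserve_early(mem, mem + size, "TRAMPOLINE");
}
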
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 7e37dcee0cc3..1168e4454188 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -529,77 +529,59 @@ asmlinkage __kprobes struct pt_regs *sync_regs(struct pt_regs *eregs)
529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) 529dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code)
530{ 530{
531 struct task_struct *tsk = current; 531 struct task_struct *tsk = current;
532 unsigned long condition; 532 unsigned long dr6;
533 int si_code; 533 int si_code;
534 534
535 get_debugreg(condition, 6); 535 get_debugreg(dr6, 6);
536
537 /* Filter out all the reserved bits which are preset to 1 */
538 dr6 &= ~DR6_RESERVED;
536 539
537 /* Catch kmemcheck conditions first of all! */ 540 /* Catch kmemcheck conditions first of all! */
538 if (condition & DR_STEP && kmemcheck_trap(regs)) 541 if ((dr6 & DR_STEP) && kmemcheck_trap(regs))
539 return; 542 return;
540 543
544 /* DR6 may or may not be cleared by the CPU */
545 set_debugreg(0, 6);
541 /* 546 /*
542 * The processor cleared BTF, so don't mark that we need it set. 547 * The processor cleared BTF, so don't mark that we need it set.
543 */ 548 */
544 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR); 549 clear_tsk_thread_flag(tsk, TIF_DEBUGCTLMSR);
545 tsk->thread.debugctlmsr = 0; 550 tsk->thread.debugctlmsr = 0;
546 551
547 if (notify_die(DIE_DEBUG, "debug", regs, condition, error_code, 552 /* Store the virtualized DR6 value */
548 SIGTRAP) == NOTIFY_STOP) 553 tsk->thread.debugreg6 = dr6;
554
555 if (notify_die(DIE_DEBUG, "debug", regs, PTR_ERR(&dr6), error_code,
556 SIGTRAP) == NOTIFY_STOP)
549 return; 557 return;
550 558
551 /* It's safe to allow irq's after DR6 has been saved */ 559 /* It's safe to allow irq's after DR6 has been saved */
552 preempt_conditional_sti(regs); 560 preempt_conditional_sti(regs);
553 561
554 /* Mask out spurious debug traps due to lazy DR7 setting */ 562 if (regs->flags & X86_VM_MASK) {
555 if (condition & (DR_TRAP0|DR_TRAP1|DR_TRAP2|DR_TRAP3)) { 563 handle_vm86_trap((struct kernel_vm86_regs *) regs,
556 if (!tsk->thread.debugreg7) 564 error_code, 1);
557 goto clear_dr7; 565 return;
558 } 566 }
559 567
560#ifdef CONFIG_X86_32
561 if (regs->flags & X86_VM_MASK)
562 goto debug_vm86;
563#endif
564
565 /* Save debug status register where ptrace can see it */
566 tsk->thread.debugreg6 = condition;
567
568 /* 568 /*
569 * Single-stepping through TF: make sure we ignore any events in 569 * Single-stepping through system calls: ignore any exceptions in
570 * kernel space (but re-enable TF when returning to user mode). 570 * kernel space, but re-enable TF when returning to user mode.
571 *
572 * We already checked v86 mode above, so we can check for kernel mode
573 * by just checking the CPL of CS.
571 */ 574 */
572 if (condition & DR_STEP) { 575 if ((dr6 & DR_STEP) && !user_mode(regs)) {
573 if (!user_mode(regs)) 576 tsk->thread.debugreg6 &= ~DR_STEP;
574 goto clear_TF_reenable; 577 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
578 regs->flags &= ~X86_EFLAGS_TF;
575 } 579 }
576 580 si_code = get_si_code(tsk->thread.debugreg6);
577 si_code = get_si_code(condition); 581 if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
578 /* Ok, finally something we can handle */ 582 send_sigtrap(tsk, regs, error_code, si_code);
579 send_sigtrap(tsk, regs, error_code, si_code);
580
581 /*
582 * Disable additional traps. They'll be re-enabled when
583 * the signal is delivered.
584 */
585clear_dr7:
586 set_debugreg(0, 7);
587 preempt_conditional_cli(regs); 583 preempt_conditional_cli(regs);
588 return;
589
590#ifdef CONFIG_X86_32
591debug_vm86:
592 /* reenable preemption: handle_vm86_trap() might sleep */
593 dec_preempt_count();
594 handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1);
595 conditional_cli(regs);
596 return;
597#endif
598 584
599clear_TF_reenable:
600 set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
601 regs->flags &= ~X86_EFLAGS_TF;
602 preempt_conditional_cli(regs);
603 return; 585 return;
604} 586}
605 587
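
The do_debug() rewrite above is part of the hw-breakpoint rework: reserved DR6 bits are masked off, DR6 is cleared in hardware but kept in thread.debugreg6 for ptrace, vm86 traps are handed off early, and a single-step that lands in kernel mode just clears TF instead of going through the old clear_TF_reenable label. A condensed sketch of the new ordering (names from the hunk; error_code handling omitted):

/* Condensed sketch of the do_debug() flow shown above. */
static void example_handle_debug(struct pt_regs *regs, struct task_struct *tsk)
{
	unsigned long dr6;

	get_debugreg(dr6, 6);
	dr6 &= ~DR6_RESERVED;		/* reserved bits are preset to 1 */

	set_debugreg(0, 6);		/* the CPU may or may not clear DR6 */
	tsk->thread.debugreg6 = dr6;	/* virtualized copy for ptrace */

	if ((dr6 & DR_STEP) && !user_mode(regs)) {
		/* Single-step hit kernel code: swallow it, re-arm TF later. */
		tsk->thread.debugreg6 &= ~DR_STEP;
		set_tsk_thread_flag(tsk, TIF_SINGLESTEP);
		regs->flags &= ~X86_EFLAGS_TF;
	}

	if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS))
		send_sigtrap(tsk, regs, 0, get_si_code(tsk->thread.debugreg6));
}
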
diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index cd982f48e23e..9faf91ae1841 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -50,7 +50,7 @@ u64 native_sched_clock(void)
50 * unstable. We do this because unlike Time Of Day, 50 * unstable. We do this because unlike Time Of Day,
51 * the scheduler clock tolerates small errors and it's 51 * the scheduler clock tolerates small errors and it's
52 * very important for it to be as fast as the platform 52 * very important for it to be as fast as the platform
53 * can achive it. ) 53 * can achieve it. )
54 */ 54 */
55 if (unlikely(tsc_disabled)) { 55 if (unlikely(tsc_disabled)) {
56 /* No locking but a rare wrong value is not a big deal: */ 56 /* No locking but a rare wrong value is not a big deal: */
@@ -740,7 +740,7 @@ static cycle_t __vsyscall_fn vread_tsc(void)
740} 740}
741#endif 741#endif
742 742
743static void resume_tsc(void) 743static void resume_tsc(struct clocksource *cs)
744{ 744{
745 clocksource_tsc.cycle_last = 0; 745 clocksource_tsc.cycle_last = 0;
746} 746}
@@ -763,6 +763,7 @@ void mark_tsc_unstable(char *reason)
763{ 763{
764 if (!tsc_unstable) { 764 if (!tsc_unstable) {
765 tsc_unstable = 1; 765 tsc_unstable = 1;
766 sched_clock_stable = 0;
766 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason); 767 printk(KERN_INFO "Marking TSC unstable due to %s\n", reason);
767 /* Change only the rating, when not registered */ 768 /* Change only the rating, when not registered */
768 if (clocksource_tsc.mult) 769 if (clocksource_tsc.mult)
@@ -805,7 +806,7 @@ static void __init check_system_tsc_reliable(void)
805 unsigned long res_low, res_high; 806 unsigned long res_low, res_high;
806 807
807 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); 808 rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high);
808 /* Geode_LX - the OLPC CPU has a possibly a very reliable TSC */ 809 /* Geode_LX - the OLPC CPU has a very reliable TSC */
809 if (res_low & RTSC_SUSP) 810 if (res_low & RTSC_SUSP)
810 tsc_clocksource_reliable = 1; 811 tsc_clocksource_reliable = 1;
811#endif 812#endif
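
Two small tsc.c changes ride along here: resume_tsc() picks up the struct clocksource argument the core now passes to every resume hook, and mark_tsc_unstable() also clears sched_clock_stable so the scheduler stops trusting sched_clock(). A hedged sketch of a clocksource using the new-style callbacks (every field except .resume and .read is illustrative):

#include <linux/clocksource.h>
#include <linux/timex.h>

static cycle_t example_read(struct clocksource *cs)
{
	return (cycle_t)get_cycles();
}

static void example_resume(struct clocksource *cs)
{
	cs->cycle_last = 0;		/* same reset resume_tsc() does above */
}

static struct clocksource example_clocksource = {
	.name	= "example-tsc",
	.rating	= 300,
	.read	= example_read,
	.resume	= example_resume,
	.mask	= CLOCKSOURCE_MASK(64),
	.flags	= CLOCK_SOURCE_IS_CONTINUOUS,
};
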
diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c
index f37930954d15..0aa5fed8b9e6 100644
--- a/arch/x86/kernel/tsc_sync.c
+++ b/arch/x86/kernel/tsc_sync.c
@@ -33,7 +33,7 @@ static __cpuinitdata atomic_t stop_count;
33 * we want to have the fastest, inlined, non-debug version 33 * we want to have the fastest, inlined, non-debug version
34 * of a critical section, to be able to prove TSC time-warps: 34 * of a critical section, to be able to prove TSC time-warps:
35 */ 35 */
36static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; 36static __cpuinitdata arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
37 37
38static __cpuinitdata cycles_t last_tsc; 38static __cpuinitdata cycles_t last_tsc;
39static __cpuinitdata cycles_t max_warp; 39static __cpuinitdata cycles_t max_warp;
@@ -62,13 +62,13 @@ static __cpuinit void check_tsc_warp(void)
62 * previous TSC that was measured (possibly on 62 * previous TSC that was measured (possibly on
63 * another CPU) and update the previous TSC timestamp. 63 * another CPU) and update the previous TSC timestamp.
64 */ 64 */
65 __raw_spin_lock(&sync_lock); 65 arch_spin_lock(&sync_lock);
66 prev = last_tsc; 66 prev = last_tsc;
67 rdtsc_barrier(); 67 rdtsc_barrier();
68 now = get_cycles(); 68 now = get_cycles();
69 rdtsc_barrier(); 69 rdtsc_barrier();
70 last_tsc = now; 70 last_tsc = now;
71 __raw_spin_unlock(&sync_lock); 71 arch_spin_unlock(&sync_lock);
72 72
73 /* 73 /*
74 * Be nice every now and then (and also check whether 74 * Be nice every now and then (and also check whether
@@ -87,10 +87,10 @@ static __cpuinit void check_tsc_warp(void)
87 * we saw a time-warp of the TSC going backwards: 87 * we saw a time-warp of the TSC going backwards:
88 */ 88 */
89 if (unlikely(prev > now)) { 89 if (unlikely(prev > now)) {
90 __raw_spin_lock(&sync_lock); 90 arch_spin_lock(&sync_lock);
91 max_warp = max(max_warp, prev - now); 91 max_warp = max(max_warp, prev - now);
92 nr_warps++; 92 nr_warps++;
93 __raw_spin_unlock(&sync_lock); 93 arch_spin_unlock(&sync_lock);
94 } 94 }
95 } 95 }
96 WARN(!(now-start), 96 WARN(!(now-start),
@@ -114,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu)
114 return; 114 return;
115 115
116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { 116 if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) {
117 printk_once(KERN_INFO "Skipping synchronization checks as TSC is reliable.\n"); 117 if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING)
118 pr_info(
119 "Skipped synchronization checks as TSC is reliable.\n");
118 return; 120 return;
119 } 121 }
120 122
121 pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:",
122 smp_processor_id(), cpu);
123
124 /* 123 /*
125 * Reset it - in case this is a second bootup: 124 * Reset it - in case this is a second bootup:
126 */ 125 */
@@ -142,12 +141,14 @@ void __cpuinit check_tsc_sync_source(int cpu)
142 cpu_relax(); 141 cpu_relax();
143 142
144 if (nr_warps) { 143 if (nr_warps) {
145 printk("\n"); 144 pr_warning("TSC synchronization [CPU#%d -> CPU#%d]:\n",
145 smp_processor_id(), cpu);
146 pr_warning("Measured %Ld cycles TSC warp between CPUs, " 146 pr_warning("Measured %Ld cycles TSC warp between CPUs, "
147 "turning off TSC clock.\n", max_warp); 147 "turning off TSC clock.\n", max_warp);
148 mark_tsc_unstable("check_tsc_sync_source failed"); 148 mark_tsc_unstable("check_tsc_sync_source failed");
149 } else { 149 } else {
150 printk(" passed.\n"); 150 pr_debug("TSC synchronization [CPU#%d -> CPU#%d]: passed\n",
151 smp_processor_id(), cpu);
151 } 152 }
152 153
153 /* 154 /*
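
tsc_sync.c moves from the old __raw_spin_* names to the arch_spinlock_t API. The warp check deliberately wants the most primitive lock available, as its own comment says, because it needs the fastest possible critical section on two CPUs at once. A minimal sketch of the locking pattern used above:

static arch_spinlock_t example_sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;
static cycles_t example_last_tsc;

static void example_record_tsc(void)
{
	cycles_t now;

	arch_spin_lock(&example_sync_lock);	/* lowest-level lock, no debug hooks */
	rdtsc_barrier();
	now = get_cycles();
	rdtsc_barrier();
	example_last_tsc = now;
	arch_spin_unlock(&example_sync_lock);
}
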
diff --git a/arch/x86/kernel/uv_irq.c b/arch/x86/kernel/uv_irq.c
index aeef529917e4..1d40336b030a 100644
--- a/arch/x86/kernel/uv_irq.c
+++ b/arch/x86/kernel/uv_irq.c
@@ -9,10 +9,26 @@
9 */ 9 */
10 10
11#include <linux/module.h> 11#include <linux/module.h>
12#include <linux/rbtree.h>
13#include <linux/slab.h>
12#include <linux/irq.h> 14#include <linux/irq.h>
13 15
14#include <asm/apic.h> 16#include <asm/apic.h>
15#include <asm/uv/uv_irq.h> 17#include <asm/uv/uv_irq.h>
18#include <asm/uv/uv_hub.h>
19
20/* MMR offset and pnode of hub sourcing interrupts for a given irq */
21struct uv_irq_2_mmr_pnode{
22 struct rb_node list;
23 unsigned long offset;
24 int pnode;
25 int irq;
26};
27
28static spinlock_t uv_irq_lock;
29static struct rb_root uv_irq_root;
30
31static int uv_set_irq_affinity(unsigned int, const struct cpumask *);
16 32
17static void uv_noop(unsigned int irq) 33static void uv_noop(unsigned int irq)
18{ 34{
@@ -39,25 +55,213 @@ struct irq_chip uv_irq_chip = {
39 .unmask = uv_noop, 55 .unmask = uv_noop,
40 .eoi = uv_ack_apic, 56 .eoi = uv_ack_apic,
41 .end = uv_noop, 57 .end = uv_noop,
58 .set_affinity = uv_set_irq_affinity,
42}; 59};
43 60
44/* 61/*
62 * Add offset and pnode information of the hub sourcing interrupts to the
63 * rb tree for a specific irq.
64 */
65static int uv_set_irq_2_mmr_info(int irq, unsigned long offset, unsigned blade)
66{
67 struct rb_node **link = &uv_irq_root.rb_node;
68 struct rb_node *parent = NULL;
69 struct uv_irq_2_mmr_pnode *n;
70 struct uv_irq_2_mmr_pnode *e;
71 unsigned long irqflags;
72
73 n = kmalloc_node(sizeof(struct uv_irq_2_mmr_pnode), GFP_KERNEL,
74 uv_blade_to_memory_nid(blade));
75 if (!n)
76 return -ENOMEM;
77
78 n->irq = irq;
79 n->offset = offset;
80 n->pnode = uv_blade_to_pnode(blade);
81 spin_lock_irqsave(&uv_irq_lock, irqflags);
82 /* Find the right place in the rbtree: */
83 while (*link) {
84 parent = *link;
85 e = rb_entry(parent, struct uv_irq_2_mmr_pnode, list);
86
87 if (unlikely(irq == e->irq)) {
88 /* irq entry exists */
89 e->pnode = uv_blade_to_pnode(blade);
90 e->offset = offset;
91 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
92 kfree(n);
93 return 0;
94 }
95
96 if (irq < e->irq)
97 link = &(*link)->rb_left;
98 else
99 link = &(*link)->rb_right;
100 }
101
102 /* Insert the node into the rbtree. */
103 rb_link_node(&n->list, parent, link);
104 rb_insert_color(&n->list, &uv_irq_root);
105
106 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
107 return 0;
108}
109
110/* Retrieve offset and pnode information from the rb tree for a specific irq */
111int uv_irq_2_mmr_info(int irq, unsigned long *offset, int *pnode)
112{
113 struct uv_irq_2_mmr_pnode *e;
114 struct rb_node *n;
115 unsigned long irqflags;
116
117 spin_lock_irqsave(&uv_irq_lock, irqflags);
118 n = uv_irq_root.rb_node;
119 while (n) {
120 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
121
122 if (e->irq == irq) {
123 *offset = e->offset;
124 *pnode = e->pnode;
125 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
126 return 0;
127 }
128
129 if (irq < e->irq)
130 n = n->rb_left;
131 else
132 n = n->rb_right;
133 }
134 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
135 return -1;
136}
137
138/*
139 * Re-target the irq to the specified CPU and enable the specified MMR located
140 * on the specified blade to allow the sending of MSIs to the specified CPU.
141 */
142static int
143arch_enable_uv_irq(char *irq_name, unsigned int irq, int cpu, int mmr_blade,
144 unsigned long mmr_offset, int restrict)
145{
146 const struct cpumask *eligible_cpu = cpumask_of(cpu);
147 struct irq_desc *desc = irq_to_desc(irq);
148 struct irq_cfg *cfg;
149 int mmr_pnode;
150 unsigned long mmr_value;
151 struct uv_IO_APIC_route_entry *entry;
152 int err;
153
154 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
155 sizeof(unsigned long));
156
157 cfg = irq_cfg(irq);
158
159 err = assign_irq_vector(irq, cfg, eligible_cpu);
160 if (err != 0)
161 return err;
162
163 if (restrict == UV_AFFINITY_CPU)
164 desc->status |= IRQ_NO_BALANCING;
165 else
166 desc->status |= IRQ_MOVE_PCNTXT;
167
168 set_irq_chip_and_handler_name(irq, &uv_irq_chip, handle_percpu_irq,
169 irq_name);
170
171 mmr_value = 0;
172 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
173 entry->vector = cfg->vector;
174 entry->delivery_mode = apic->irq_delivery_mode;
175 entry->dest_mode = apic->irq_dest_mode;
176 entry->polarity = 0;
177 entry->trigger = 0;
178 entry->mask = 0;
179 entry->dest = apic->cpu_mask_to_apicid(eligible_cpu);
180
181 mmr_pnode = uv_blade_to_pnode(mmr_blade);
182 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
183
184 if (cfg->move_in_progress)
185 send_cleanup_vector(cfg);
186
187 return irq;
188}
189
190/*
191 * Disable the specified MMR located on the specified blade so that MSIs are
192 * longer allowed to be sent.
193 */
194static void arch_disable_uv_irq(int mmr_pnode, unsigned long mmr_offset)
195{
196 unsigned long mmr_value;
197 struct uv_IO_APIC_route_entry *entry;
198
199 BUILD_BUG_ON(sizeof(struct uv_IO_APIC_route_entry) !=
200 sizeof(unsigned long));
201
202 mmr_value = 0;
203 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
204 entry->mask = 1;
205
206 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
207}
208
209static int uv_set_irq_affinity(unsigned int irq, const struct cpumask *mask)
210{
211 struct irq_desc *desc = irq_to_desc(irq);
212 struct irq_cfg *cfg = desc->chip_data;
213 unsigned int dest;
214 unsigned long mmr_value;
215 struct uv_IO_APIC_route_entry *entry;
216 unsigned long mmr_offset;
217 unsigned mmr_pnode;
218
219 if (set_desc_affinity(desc, mask, &dest))
220 return -1;
221
222 mmr_value = 0;
223 entry = (struct uv_IO_APIC_route_entry *)&mmr_value;
224
225 entry->vector = cfg->vector;
226 entry->delivery_mode = apic->irq_delivery_mode;
227 entry->dest_mode = apic->irq_dest_mode;
228 entry->polarity = 0;
229 entry->trigger = 0;
230 entry->mask = 0;
231 entry->dest = dest;
232
233 /* Get previously stored MMR and pnode of hub sourcing interrupts */
234 if (uv_irq_2_mmr_info(irq, &mmr_offset, &mmr_pnode))
235 return -1;
236
237 uv_write_global_mmr64(mmr_pnode, mmr_offset, mmr_value);
238
239 if (cfg->move_in_progress)
240 send_cleanup_vector(cfg);
241
242 return 0;
243}
244
245/*
45 * Set up a mapping of an available irq and vector, and enable the specified 246 * Set up a mapping of an available irq and vector, and enable the specified
46 * MMR that defines the MSI that is to be sent to the specified CPU when an 247 * MMR that defines the MSI that is to be sent to the specified CPU when an
47 * interrupt is raised. 248 * interrupt is raised.
48 */ 249 */
49int uv_setup_irq(char *irq_name, int cpu, int mmr_blade, 250int uv_setup_irq(char *irq_name, int cpu, int mmr_blade,
50 unsigned long mmr_offset) 251 unsigned long mmr_offset, int restrict)
51{ 252{
52 int irq; 253 int irq, ret;
53 int ret; 254
255 irq = create_irq_nr(NR_IRQS_LEGACY, uv_blade_to_memory_nid(mmr_blade));
54 256
55 irq = create_irq();
56 if (irq <= 0) 257 if (irq <= 0)
57 return -EBUSY; 258 return -EBUSY;
58 259
59 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset); 260 ret = arch_enable_uv_irq(irq_name, irq, cpu, mmr_blade, mmr_offset,
60 if (ret != irq) 261 restrict);
262 if (ret == irq)
263 uv_set_irq_2_mmr_info(irq, mmr_offset, mmr_blade);
264 else
61 destroy_irq(irq); 265 destroy_irq(irq);
62 266
63 return ret; 267 return ret;
@@ -71,9 +275,28 @@ EXPORT_SYMBOL_GPL(uv_setup_irq);
71 * 275 *
72 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq(). 276 * Set mmr_blade and mmr_offset to what was passed in on uv_setup_irq().
73 */ 277 */
74void uv_teardown_irq(unsigned int irq, int mmr_blade, unsigned long mmr_offset) 278void uv_teardown_irq(unsigned int irq)
75{ 279{
76 arch_disable_uv_irq(mmr_blade, mmr_offset); 280 struct uv_irq_2_mmr_pnode *e;
281 struct rb_node *n;
282 unsigned long irqflags;
283
284 spin_lock_irqsave(&uv_irq_lock, irqflags);
285 n = uv_irq_root.rb_node;
286 while (n) {
287 e = rb_entry(n, struct uv_irq_2_mmr_pnode, list);
288 if (e->irq == irq) {
289 arch_disable_uv_irq(e->pnode, e->offset);
290 rb_erase(n, &uv_irq_root);
291 kfree(e);
292 break;
293 }
294 if (irq < e->irq)
295 n = n->rb_left;
296 else
297 n = n->rb_right;
298 }
299 spin_unlock_irqrestore(&uv_irq_lock, irqflags);
77 destroy_irq(irq); 300 destroy_irq(irq);
78} 301}
79EXPORT_SYMBOL_GPL(uv_teardown_irq); 302EXPORT_SYMBOL_GPL(uv_teardown_irq);
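
uv_irq.c now records the (MMR offset, pnode) pair for every UV irq in an rbtree keyed by irq number, so uv_set_irq_affinity() and uv_teardown_irq() can find where the MSI is configured. A trimmed sketch of the lookup side, with the same node layout as struct uv_irq_2_mmr_pnode above (names prefixed example_ are illustrative):

#include <linux/rbtree.h>
#include <linux/spinlock.h>

struct example_irq_node {
	struct rb_node	list;
	unsigned long	offset;
	int		pnode;
	int		irq;
};

static struct rb_root example_irq_root;
static DEFINE_SPINLOCK(example_irq_lock);

/* Return 0 and fill *offset/*pnode if @irq is in the tree, else -1. */
static int example_irq_lookup(int irq, unsigned long *offset, int *pnode)
{
	struct example_irq_node *e;
	struct rb_node *n;
	unsigned long flags;
	int ret = -1;

	spin_lock_irqsave(&example_irq_lock, flags);
	for (n = example_irq_root.rb_node; n; ) {
		e = rb_entry(n, struct example_irq_node, list);
		if (e->irq == irq) {
			*offset = e->offset;
			*pnode = e->pnode;
			ret = 0;
			break;
		}
		n = irq < e->irq ? n->rb_left : n->rb_right;
	}
	spin_unlock_irqrestore(&example_irq_lock, flags);
	return ret;
}
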
diff --git a/arch/x86/kernel/uv_sysfs.c b/arch/x86/kernel/uv_sysfs.c
index 36afb98675a4..309c70fb7759 100644
--- a/arch/x86/kernel/uv_sysfs.c
+++ b/arch/x86/kernel/uv_sysfs.c
@@ -54,19 +54,19 @@ static int __init sgi_uv_sysfs_init(void)
54 if (!sgi_uv_kobj) 54 if (!sgi_uv_kobj)
55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj); 55 sgi_uv_kobj = kobject_create_and_add("sgi_uv", firmware_kobj);
56 if (!sgi_uv_kobj) { 56 if (!sgi_uv_kobj) {
57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed \n"); 57 printk(KERN_WARNING "kobject_create_and_add sgi_uv failed\n");
58 return -EINVAL; 58 return -EINVAL;
59 } 59 }
60 60
61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr); 61 ret = sysfs_create_file(sgi_uv_kobj, &partition_id_attr.attr);
62 if (ret) { 62 if (ret) {
63 printk(KERN_WARNING "sysfs_create_file partition_id failed \n"); 63 printk(KERN_WARNING "sysfs_create_file partition_id failed\n");
64 return ret; 64 return ret;
65 } 65 }
66 66
67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr); 67 ret = sysfs_create_file(sgi_uv_kobj, &coherence_id_attr.attr);
68 if (ret) { 68 if (ret) {
69 printk(KERN_WARNING "sysfs_create_file coherence_id failed \n"); 69 printk(KERN_WARNING "sysfs_create_file coherence_id failed\n");
70 return ret; 70 return ret;
71 } 71 }
72 72
diff --git a/arch/x86/kernel/uv_time.c b/arch/x86/kernel/uv_time.c
index 583f11d5c480..56e421bc379b 100644
--- a/arch/x86/kernel/uv_time.c
+++ b/arch/x86/kernel/uv_time.c
@@ -19,6 +19,7 @@
19 * Copyright (c) Dimitri Sivanich 19 * Copyright (c) Dimitri Sivanich
20 */ 20 */
21#include <linux/clockchips.h> 21#include <linux/clockchips.h>
22#include <linux/slab.h>
22 23
23#include <asm/uv/uv_mmrs.h> 24#include <asm/uv/uv_mmrs.h>
24#include <asm/uv/uv_hub.h> 25#include <asm/uv/uv_hub.h>
@@ -74,7 +75,7 @@ struct uv_rtc_timer_head {
74 */ 75 */
75static struct uv_rtc_timer_head **blade_info __read_mostly; 76static struct uv_rtc_timer_head **blade_info __read_mostly;
76 77
77static int uv_rtc_enable; 78static int uv_rtc_evt_enable;
78 79
79/* 80/*
80 * Hardware interface routines 81 * Hardware interface routines
@@ -90,7 +91,7 @@ static void uv_rtc_send_IPI(int cpu)
90 pnode = uv_apicid_to_pnode(apicid); 91 pnode = uv_apicid_to_pnode(apicid);
91 val = (1UL << UVH_IPI_INT_SEND_SHFT) | 92 val = (1UL << UVH_IPI_INT_SEND_SHFT) |
92 (apicid << UVH_IPI_INT_APIC_ID_SHFT) | 93 (apicid << UVH_IPI_INT_APIC_ID_SHFT) |
93 (GENERIC_INTERRUPT_VECTOR << UVH_IPI_INT_VECTOR_SHFT); 94 (X86_PLATFORM_IPI_VECTOR << UVH_IPI_INT_VECTOR_SHFT);
94 95
95 uv_write_global_mmr64(pnode, UVH_IPI_INT, val); 96 uv_write_global_mmr64(pnode, UVH_IPI_INT, val);
96} 97}
@@ -115,7 +116,7 @@ static int uv_setup_intr(int cpu, u64 expires)
115 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS, 116 uv_write_global_mmr64(pnode, UVH_EVENT_OCCURRED0_ALIAS,
116 UVH_EVENT_OCCURRED0_RTC1_MASK); 117 UVH_EVENT_OCCURRED0_RTC1_MASK);
117 118
118 val = (GENERIC_INTERRUPT_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) | 119 val = (X86_PLATFORM_IPI_VECTOR << UVH_RTC1_INT_CONFIG_VECTOR_SHFT) |
119 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT); 120 ((u64)cpu_physical_id(cpu) << UVH_RTC1_INT_CONFIG_APIC_ID_SHFT);
120 121
121 /* Set configuration */ 122 /* Set configuration */
@@ -123,7 +124,10 @@ static int uv_setup_intr(int cpu, u64 expires)
123 /* Initialize comparator value */ 124 /* Initialize comparator value */
124 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires); 125 uv_write_global_mmr64(pnode, UVH_INT_CMPB, expires);
125 126
126 return (expires < uv_read_rtc(NULL) && !uv_intr_pending(pnode)); 127 if (uv_read_rtc(NULL) <= expires)
128 return 0;
129
130 return !uv_intr_pending(pnode);
127} 131}
128 132
129/* 133/*
@@ -223,6 +227,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
223 227
224 next_cpu = head->next_cpu; 228 next_cpu = head->next_cpu;
225 *t = expires; 229 *t = expires;
230
226 /* Will this one be next to go off? */ 231 /* Will this one be next to go off? */
227 if (next_cpu < 0 || bcpu == next_cpu || 232 if (next_cpu < 0 || bcpu == next_cpu ||
228 expires < head->cpu[next_cpu].expires) { 233 expires < head->cpu[next_cpu].expires) {
@@ -231,7 +236,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
231 *t = ULLONG_MAX; 236 *t = ULLONG_MAX;
232 uv_rtc_find_next_timer(head, pnode); 237 uv_rtc_find_next_timer(head, pnode);
233 spin_unlock_irqrestore(&head->lock, flags); 238 spin_unlock_irqrestore(&head->lock, flags);
234 return 1; 239 return -ETIME;
235 } 240 }
236 } 241 }
237 242
@@ -244,7 +249,7 @@ static int uv_rtc_set_timer(int cpu, u64 expires)
244 * 249 *
245 * Returns 1 if this timer was pending. 250 * Returns 1 if this timer was pending.
246 */ 251 */
247static int uv_rtc_unset_timer(int cpu) 252static int uv_rtc_unset_timer(int cpu, int force)
248{ 253{
249 int pnode = uv_cpu_to_pnode(cpu); 254 int pnode = uv_cpu_to_pnode(cpu);
250 int bid = uv_cpu_to_blade_id(cpu); 255 int bid = uv_cpu_to_blade_id(cpu);
@@ -256,14 +261,15 @@ static int uv_rtc_unset_timer(int cpu)
256 261
257 spin_lock_irqsave(&head->lock, flags); 262 spin_lock_irqsave(&head->lock, flags);
258 263
259 if (head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) 264 if ((head->next_cpu == bcpu && uv_read_rtc(NULL) >= *t) || force)
260 rc = 1; 265 rc = 1;
261 266
262 *t = ULLONG_MAX; 267 if (rc) {
263 268 *t = ULLONG_MAX;
264 /* Was the hardware setup for this timer? */ 269 /* Was the hardware setup for this timer? */
265 if (head->next_cpu == bcpu) 270 if (head->next_cpu == bcpu)
266 uv_rtc_find_next_timer(head, pnode); 271 uv_rtc_find_next_timer(head, pnode);
272 }
267 273
268 spin_unlock_irqrestore(&head->lock, flags); 274 spin_unlock_irqrestore(&head->lock, flags);
269 275
@@ -277,10 +283,21 @@ static int uv_rtc_unset_timer(int cpu)
277 283
278/* 284/*
279 * Read the RTC. 285 * Read the RTC.
286 *
287 * Starting with HUB rev 2.0, the UV RTC register is replicated across all
288 * cachelines of it's own page. This allows faster simultaneous reads
289 * from a given socket.
280 */ 290 */
281static cycle_t uv_read_rtc(struct clocksource *cs) 291static cycle_t uv_read_rtc(struct clocksource *cs)
282{ 292{
283 return (cycle_t)uv_read_local_mmr(UVH_RTC); 293 unsigned long offset;
294
295 if (uv_get_min_hub_revision_id() == 1)
296 offset = 0;
297 else
298 offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;
299
300 return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
284} 301}
285 302
286/* 303/*
@@ -310,32 +327,32 @@ static void uv_rtc_timer_setup(enum clock_event_mode mode,
310 break; 327 break;
311 case CLOCK_EVT_MODE_UNUSED: 328 case CLOCK_EVT_MODE_UNUSED:
312 case CLOCK_EVT_MODE_SHUTDOWN: 329 case CLOCK_EVT_MODE_SHUTDOWN:
313 uv_rtc_unset_timer(ced_cpu); 330 uv_rtc_unset_timer(ced_cpu, 1);
314 break; 331 break;
315 } 332 }
316} 333}
317 334
318static void uv_rtc_interrupt(void) 335static void uv_rtc_interrupt(void)
319{ 336{
320 struct clock_event_device *ced = &__get_cpu_var(cpu_ced);
321 int cpu = smp_processor_id(); 337 int cpu = smp_processor_id();
338 struct clock_event_device *ced = &per_cpu(cpu_ced, cpu);
322 339
323 if (!ced || !ced->event_handler) 340 if (!ced || !ced->event_handler)
324 return; 341 return;
325 342
326 if (uv_rtc_unset_timer(cpu) != 1) 343 if (uv_rtc_unset_timer(cpu, 0) != 1)
327 return; 344 return;
328 345
329 ced->event_handler(ced); 346 ced->event_handler(ced);
330} 347}
331 348
332static int __init uv_enable_rtc(char *str) 349static int __init uv_enable_evt_rtc(char *str)
333{ 350{
334 uv_rtc_enable = 1; 351 uv_rtc_evt_enable = 1;
335 352
336 return 1; 353 return 1;
337} 354}
338__setup("uvrtc", uv_enable_rtc); 355__setup("uvrtcevt", uv_enable_evt_rtc);
339 356
340static __init void uv_rtc_register_clockevents(struct work_struct *dummy) 357static __init void uv_rtc_register_clockevents(struct work_struct *dummy)
341{ 358{
@@ -350,27 +367,32 @@ static __init int uv_rtc_setup_clock(void)
350{ 367{
351 int rc; 368 int rc;
352 369
353 if (!uv_rtc_enable || !is_uv_system() || generic_interrupt_extension) 370 if (!is_uv_system())
354 return -ENODEV; 371 return -ENODEV;
355 372
356 generic_interrupt_extension = uv_rtc_interrupt;
357
358 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second, 373 clocksource_uv.mult = clocksource_hz2mult(sn_rtc_cycles_per_second,
359 clocksource_uv.shift); 374 clocksource_uv.shift);
360 375
376 /* If single blade, prefer tsc */
377 if (uv_num_possible_blades() == 1)
378 clocksource_uv.rating = 250;
379
361 rc = clocksource_register(&clocksource_uv); 380 rc = clocksource_register(&clocksource_uv);
362 if (rc) { 381 if (rc)
363 generic_interrupt_extension = NULL; 382 printk(KERN_INFO "UV RTC clocksource failed rc %d\n", rc);
383 else
384 printk(KERN_INFO "UV RTC clocksource registered freq %lu MHz\n",
385 sn_rtc_cycles_per_second/(unsigned long)1E6);
386
387 if (rc || !uv_rtc_evt_enable || x86_platform_ipi_callback)
364 return rc; 388 return rc;
365 }
366 389
367 /* Setup and register clockevents */ 390 /* Setup and register clockevents */
368 rc = uv_rtc_allocate_timers(); 391 rc = uv_rtc_allocate_timers();
369 if (rc) { 392 if (rc)
370 clocksource_unregister(&clocksource_uv); 393 goto error;
371 generic_interrupt_extension = NULL; 394
372 return rc; 395 x86_platform_ipi_callback = uv_rtc_interrupt;
373 }
374 396
375 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second, 397 clock_event_device_uv.mult = div_sc(sn_rtc_cycles_per_second,
376 NSEC_PER_SEC, clock_event_device_uv.shift); 398 NSEC_PER_SEC, clock_event_device_uv.shift);
@@ -383,11 +405,19 @@ static __init int uv_rtc_setup_clock(void)
383 405
384 rc = schedule_on_each_cpu(uv_rtc_register_clockevents); 406 rc = schedule_on_each_cpu(uv_rtc_register_clockevents);
385 if (rc) { 407 if (rc) {
386 clocksource_unregister(&clocksource_uv); 408 x86_platform_ipi_callback = NULL;
387 generic_interrupt_extension = NULL;
388 uv_rtc_deallocate_timers(); 409 uv_rtc_deallocate_timers();
410 goto error;
389 } 411 }
390 412
413 printk(KERN_INFO "UV RTC clockevents registered\n");
414
415 return 0;
416
417error:
418 clocksource_unregister(&clocksource_uv);
419 printk(KERN_INFO "UV RTC clockevents failed rc %d\n", rc);
420
391 return rc; 421 return rc;
392} 422}
393arch_initcall(uv_rtc_setup_clock); 423arch_initcall(uv_rtc_setup_clock);
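
The uv_time.c hunk documents that, from HUB rev 2.0 on, the RTC register is replicated across every cacheline of its page, so concurrent readers on one socket can each hit a different alias. A sketch of the offset selection used in uv_read_rtc() above (uv_* helpers as in the hunk):

/* Sketch of the replicated-register read from uv_read_rtc() above. */
static cycle_t example_read_uv_rtc(void)
{
	unsigned long offset;

	if (uv_get_min_hub_revision_id() == 1)
		offset = 0;	/* rev 1: only the base register exists */
	else			/* spread readers across the page's cachelines */
		offset = (uv_blade_processor_id() * L1_CACHE_BYTES) % PAGE_SIZE;

	return (cycle_t)uv_read_local_mmr(UVH_RTC | offset);
}
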
diff --git a/arch/x86/kernel/visws_quirks.c b/arch/x86/kernel/visws_quirks.c
index f068553a1b17..e680ea52db9b 100644
--- a/arch/x86/kernel/visws_quirks.c
+++ b/arch/x86/kernel/visws_quirks.c
@@ -49,11 +49,6 @@ extern int no_broadcast;
49char visws_board_type = -1; 49char visws_board_type = -1;
50char visws_board_rev = -1; 50char visws_board_rev = -1;
51 51
52int is_visws_box(void)
53{
54 return visws_board_type >= 0;
55}
56
57static void __init visws_time_init(void) 52static void __init visws_time_init(void)
58{ 53{
59 printk(KERN_INFO "Starting Cobalt Timer system clock\n"); 54 printk(KERN_INFO "Starting Cobalt Timer system clock\n");
@@ -183,7 +178,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
183 return; 178 return;
184 } 179 }
185 180
186 apic_cpus = apic->apicid_to_cpu_present(m->apicid); 181 apic->apicid_to_cpu_present(m->apicid, &apic_cpus);
187 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus); 182 physids_or(phys_cpu_present_map, phys_cpu_present_map, apic_cpus);
188 /* 183 /*
189 * Validate version 184 * Validate version
@@ -197,7 +192,7 @@ static void __init MP_processor_info(struct mpc_cpu *m)
197 apic_version[m->apicid] = ver; 192 apic_version[m->apicid] = ver;
198} 193}
199 194
200static void __init visws_find_smp_config(unsigned int reserve) 195static void __init visws_find_smp_config(void)
201{ 196{
202 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS); 197 struct mpc_cpu *mp = phys_to_virt(CO_CPU_TAB_PHYS);
203 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS)); 198 unsigned short ncpus = readw(phys_to_virt(CO_CPU_NUM_PHYS));
@@ -242,6 +237,8 @@ void __init visws_early_detect(void)
242 x86_init.irqs.pre_vector_init = visws_pre_intr_init; 237 x86_init.irqs.pre_vector_init = visws_pre_intr_init;
243 x86_init.irqs.trap_init = visws_trap_init; 238 x86_init.irqs.trap_init = visws_trap_init;
244 x86_init.timers.timer_init = visws_time_init; 239 x86_init.timers.timer_init = visws_time_init;
240 x86_init.pci.init = pci_visws_init;
241 x86_init.pci.init_irq = x86_init_noop;
245 242
246 /* 243 /*
247 * Install reboot quirks: 244 * Install reboot quirks:
@@ -486,7 +483,7 @@ static void end_cobalt_irq(unsigned int irq)
486} 483}
487 484
488static struct irq_chip cobalt_irq_type = { 485static struct irq_chip cobalt_irq_type = {
489 .typename = "Cobalt-APIC", 486 .name = "Cobalt-APIC",
490 .startup = startup_cobalt_irq, 487 .startup = startup_cobalt_irq,
491 .shutdown = disable_cobalt_irq, 488 .shutdown = disable_cobalt_irq,
492 .enable = enable_cobalt_irq, 489 .enable = enable_cobalt_irq,
@@ -508,7 +505,7 @@ static struct irq_chip cobalt_irq_type = {
508 */ 505 */
509static unsigned int startup_piix4_master_irq(unsigned int irq) 506static unsigned int startup_piix4_master_irq(unsigned int irq)
510{ 507{
511 init_8259A(0); 508 legacy_pic->init(0);
512 509
513 return startup_cobalt_irq(irq); 510 return startup_cobalt_irq(irq);
514} 511}
@@ -523,7 +520,7 @@ static void end_piix4_master_irq(unsigned int irq)
523} 520}
524 521
525static struct irq_chip piix4_master_irq_type = { 522static struct irq_chip piix4_master_irq_type = {
526 .typename = "PIIX4-master", 523 .name = "PIIX4-master",
527 .startup = startup_piix4_master_irq, 524 .startup = startup_piix4_master_irq,
528 .ack = ack_cobalt_irq, 525 .ack = ack_cobalt_irq,
529 .end = end_piix4_master_irq, 526 .end = end_piix4_master_irq,
@@ -531,10 +528,7 @@ static struct irq_chip piix4_master_irq_type = {
531 528
532 529
533static struct irq_chip piix4_virtual_irq_type = { 530static struct irq_chip piix4_virtual_irq_type = {
534 .typename = "PIIX4-virtual", 531 .name = "PIIX4-virtual",
535 .shutdown = disable_8259A_irq,
536 .enable = enable_8259A_irq,
537 .disable = disable_8259A_irq,
538}; 532};
539 533
540 534
@@ -559,7 +553,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
559 struct irq_desc *desc; 553 struct irq_desc *desc;
560 unsigned long flags; 554 unsigned long flags;
561 555
562 spin_lock_irqsave(&i8259A_lock, flags); 556 raw_spin_lock_irqsave(&i8259A_lock, flags);
563 557
564 /* Find out what's interrupting in the PIIX4 master 8259 */ 558 /* Find out what's interrupting in the PIIX4 master 8259 */
565 outb(0x0c, 0x20); /* OCW3 Poll command */ 559 outb(0x0c, 0x20); /* OCW3 Poll command */
@@ -596,7 +590,7 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
596 outb(0x60 + realirq, 0x20); 590 outb(0x60 + realirq, 0x20);
597 } 591 }
598 592
599 spin_unlock_irqrestore(&i8259A_lock, flags); 593 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
600 594
601 desc = irq_to_desc(realirq); 595 desc = irq_to_desc(realirq);
602 596
@@ -609,12 +603,12 @@ static irqreturn_t piix4_master_intr(int irq, void *dev_id)
609 handle_IRQ_event(realirq, desc->action); 603 handle_IRQ_event(realirq, desc->action);
610 604
611 if (!(desc->status & IRQ_DISABLED)) 605 if (!(desc->status & IRQ_DISABLED))
612 enable_8259A_irq(realirq); 606 legacy_pic->chip->unmask(realirq);
613 607
614 return IRQ_HANDLED; 608 return IRQ_HANDLED;
615 609
616out_unlock: 610out_unlock:
617 spin_unlock_irqrestore(&i8259A_lock, flags); 611 raw_spin_unlock_irqrestore(&i8259A_lock, flags);
618 return IRQ_NONE; 612 return IRQ_NONE;
619} 613}
620 614
@@ -628,6 +622,12 @@ static struct irqaction cascade_action = {
628 .name = "cascade", 622 .name = "cascade",
629}; 623};
630 624
625static inline void set_piix4_virtual_irq_type(void)
626{
627 piix4_virtual_irq_type.shutdown = i8259A_chip.mask;
628 piix4_virtual_irq_type.enable = i8259A_chip.unmask;
629 piix4_virtual_irq_type.disable = i8259A_chip.mask;
630}
631 631
632void init_VISWS_APIC_irqs(void) 632void init_VISWS_APIC_irqs(void)
633{ 633{
@@ -653,6 +653,7 @@ void init_VISWS_APIC_irqs(void)
653 desc->chip = &piix4_master_irq_type; 653 desc->chip = &piix4_master_irq_type;
654 } 654 }
655 else if (i < CO_IRQ_APIC0) { 655 else if (i < CO_IRQ_APIC0) {
656 set_piix4_virtual_irq_type();
656 desc->chip = &piix4_virtual_irq_type; 657 desc->chip = &piix4_virtual_irq_type;
657 } 658 }
658 else if (IS_CO_APIC(i)) { 659 else if (IS_CO_APIC(i)) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 9c4e62539058..5ffb5622f793 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -197,9 +197,8 @@ out:
197static int do_vm86_irq_handling(int subfunction, int irqnumber); 197static int do_vm86_irq_handling(int subfunction, int irqnumber);
198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk); 198static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk);
199 199
200int sys_vm86old(struct pt_regs *regs) 200int sys_vm86old(struct vm86_struct __user *v86, struct pt_regs *regs)
201{ 201{
202 struct vm86_struct __user *v86 = (struct vm86_struct __user *)regs->bx;
203 struct kernel_vm86_struct info; /* declare this _on top_, 202 struct kernel_vm86_struct info; /* declare this _on top_,
204 * this avoids wasting of stack space. 203 * this avoids wasting of stack space.
205 * This remains on the stack until we 204 * This remains on the stack until we
@@ -227,7 +226,7 @@ out:
227} 226}
228 227
229 228
230int sys_vm86(struct pt_regs *regs) 229int sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
231{ 230{
232 struct kernel_vm86_struct info; /* declare this _on top_, 231 struct kernel_vm86_struct info; /* declare this _on top_,
233 * this avoids wasting of stack space. 232 * this avoids wasting of stack space.
@@ -239,12 +238,12 @@ int sys_vm86(struct pt_regs *regs)
239 struct vm86plus_struct __user *v86; 238 struct vm86plus_struct __user *v86;
240 239
241 tsk = current; 240 tsk = current;
242 switch (regs->bx) { 241 switch (cmd) {
243 case VM86_REQUEST_IRQ: 242 case VM86_REQUEST_IRQ:
244 case VM86_FREE_IRQ: 243 case VM86_FREE_IRQ:
245 case VM86_GET_IRQ_BITS: 244 case VM86_GET_IRQ_BITS:
246 case VM86_GET_AND_RESET_IRQ: 245 case VM86_GET_AND_RESET_IRQ:
247 ret = do_vm86_irq_handling(regs->bx, (int)regs->cx); 246 ret = do_vm86_irq_handling(cmd, (int)arg);
248 goto out; 247 goto out;
249 case VM86_PLUS_INSTALL_CHECK: 248 case VM86_PLUS_INSTALL_CHECK:
250 /* 249 /*
@@ -261,7 +260,7 @@ int sys_vm86(struct pt_regs *regs)
261 ret = -EPERM; 260 ret = -EPERM;
262 if (tsk->thread.saved_sp0) 261 if (tsk->thread.saved_sp0)
263 goto out; 262 goto out;
264 v86 = (struct vm86plus_struct __user *)regs->cx; 263 v86 = (struct vm86plus_struct __user *)arg;
265 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs, 264 tmp = copy_vm86_regs_from_user(&info.regs, &v86->regs,
266 offsetof(struct kernel_vm86_struct, regs32) - 265 offsetof(struct kernel_vm86_struct, regs32) -
267 sizeof(info.regs)); 266 sizeof(info.regs));
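
sys_vm86old() and sys_vm86() now receive their arguments explicitly instead of fishing them out of regs->bx and regs->cx, so the 32-bit entry code can pass a normal argument list. An abbreviated, hedged sketch of the resulting dispatch (cmd/arg as in the hunk; example_enter_vm86() is an illustrative stand-in for the rest of the function):

int example_sys_vm86(unsigned long cmd, unsigned long arg, struct pt_regs *regs)
{
	struct vm86plus_struct __user *v86;

	switch (cmd) {
	case VM86_REQUEST_IRQ:
	case VM86_FREE_IRQ:
	case VM86_GET_IRQ_BITS:
	case VM86_GET_AND_RESET_IRQ:
		return do_vm86_irq_handling(cmd, (int)arg);
	case VM86_PLUS_INSTALL_CHECK:
		return 0;	/* vm86plus is always present in this kernel */
	default:
		/* The user pointer arrives as a plain argument now. */
		v86 = (struct vm86plus_struct __user *)arg;
		return example_enter_vm86(v86, regs);
	}
}
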
diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c
index d430e4c30193..ce9fbacb7526 100644
--- a/arch/x86/kernel/vmi_32.c
+++ b/arch/x86/kernel/vmi_32.c
@@ -28,11 +28,13 @@
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/highmem.h> 29#include <linux/highmem.h>
30#include <linux/sched.h> 30#include <linux/sched.h>
31#include <linux/gfp.h>
31#include <asm/vmi.h> 32#include <asm/vmi.h>
32#include <asm/io.h> 33#include <asm/io.h>
33#include <asm/fixmap.h> 34#include <asm/fixmap.h>
34#include <asm/apicdef.h> 35#include <asm/apicdef.h>
35#include <asm/apic.h> 36#include <asm/apic.h>
37#include <asm/pgalloc.h>
36#include <asm/processor.h> 38#include <asm/processor.h>
37#include <asm/timer.h> 39#include <asm/timer.h>
38#include <asm/vmi_time.h> 40#include <asm/vmi_time.h>
@@ -266,30 +268,6 @@ static void vmi_nop(void)
266{ 268{
267} 269}
268 270
269#ifdef CONFIG_HIGHPTE
270static void *vmi_kmap_atomic_pte(struct page *page, enum km_type type)
271{
272 void *va = kmap_atomic(page, type);
273
274 /*
275 * Internally, the VMI ROM must map virtual addresses to physical
276 * addresses for processing MMU updates. By the time MMU updates
277 * are issued, this information is typically already lost.
278 * Fortunately, the VMI provides a cache of mapping slots for active
279 * page tables.
280 *
281 * We use slot zero for the linear mapping of physical memory, and
282 * in HIGHPTE kernels, slot 1 and 2 for KM_PTE0 and KM_PTE1.
283 *
284 * args: SLOT VA COUNT PFN
285 */
286 BUG_ON(type != KM_PTE0 && type != KM_PTE1);
287 vmi_ops.set_linear_mapping((type - KM_PTE0)+1, va, 1, page_to_pfn(page));
288
289 return va;
290}
291#endif
292
293static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn) 271static void vmi_allocate_pte(struct mm_struct *mm, unsigned long pfn)
294{ 272{
295 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0); 273 vmi_ops.allocate_page(pfn, VMI_PAGE_L1, 0, 0, 0);
@@ -640,6 +618,12 @@ static inline int __init activate_vmi(void)
640 u64 reloc; 618 u64 reloc;
641 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc; 619 const struct vmi_relocation_info *rel = (struct vmi_relocation_info *)&reloc;
642 620
621 /*
622 * Prevent page tables from being allocated in highmem, even if
623 * CONFIG_HIGHPTE is enabled.
624 */
625 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
626
643 if (call_vrom_func(vmi_rom, vmi_init) != 0) { 627 if (call_vrom_func(vmi_rom, vmi_init) != 0) {
644 printk(KERN_ERR "VMI ROM failed to initialize!"); 628 printk(KERN_ERR "VMI ROM failed to initialize!");
645 return 0; 629 return 0;
@@ -778,10 +762,6 @@ static inline int __init activate_vmi(void)
778 762
779 /* Set linear is needed in all cases */ 763 /* Set linear is needed in all cases */
780 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping); 764 vmi_ops.set_linear_mapping = vmi_get_function(VMI_CALL_SetLinearMapping);
781#ifdef CONFIG_HIGHPTE
782 if (vmi_ops.set_linear_mapping)
783 pv_mmu_ops.kmap_atomic_pte = vmi_kmap_atomic_pte;
784#endif
785 765
786 /* 766 /*
787 * These MUST always be patched. Don't support indirect jumps 767 * These MUST always be patched. Don't support indirect jumps
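
With the kmap_atomic_pte hook gone, VMI simply forbids highmem user page tables: clearing __GFP_HIGHMEM from __userpte_alloc_gfp before any PTE pages are allocated keeps them in lowmem even with CONFIG_HIGHPTE=y. The pattern in the hunk boils down to a one-liner:

#include <linux/gfp.h>
#include <asm/pgalloc.h>

/* Sketch: force user PTE pages into lowmem even with CONFIG_HIGHPTE=y. */
static void __init example_disable_highpte(void)
{
	__userpte_alloc_gfp &= ~__GFP_HIGHMEM;
}
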
diff --git a/arch/x86/kernel/vmiclock_32.c b/arch/x86/kernel/vmiclock_32.c
index 611b9e2360d3..5e1ff66ecd73 100644
--- a/arch/x86/kernel/vmiclock_32.c
+++ b/arch/x86/kernel/vmiclock_32.c
@@ -79,11 +79,7 @@ unsigned long vmi_tsc_khz(void)
79 79
80static inline unsigned int vmi_get_timer_vector(void) 80static inline unsigned int vmi_get_timer_vector(void)
81{ 81{
82#ifdef CONFIG_X86_IO_APIC 82 return IRQ0_VECTOR;
83 return FIRST_DEVICE_VECTOR;
84#else
85 return FIRST_EXTERNAL_VECTOR;
86#endif
87} 83}
88 84
89/** vmi clockchip */ 85/** vmi clockchip */
@@ -171,7 +167,7 @@ static int vmi_timer_next_event(unsigned long delta,
171{ 167{
172 /* Unfortunately, set_next_event interface only passes relative 168 /* Unfortunately, set_next_event interface only passes relative
173 * expiry, but we want absolute expiry. It'd be better if were 169 * expiry, but we want absolute expiry. It'd be better if were
174 * were passed an aboslute expiry, since a bunch of time may 170 * were passed an absolute expiry, since a bunch of time may
175 * have been stolen between the time the delta is computed and 171 * have been stolen between the time the delta is computed and
176 * when we set the alarm below. */ 172 * when we set the alarm below. */
177 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT)); 173 cycle_t now = vmi_timer_ops.get_cycle_counter(vmi_counter(VMI_ONESHOT));
@@ -226,7 +222,7 @@ static void __devinit vmi_time_init_clockevent(void)
226 evt->min_delta_ns = clockevent_delta2ns(1, evt); 222 evt->min_delta_ns = clockevent_delta2ns(1, evt);
227 evt->cpumask = cpumask_of(cpu); 223 evt->cpumask = cpumask_of(cpu);
228 224
229 printk(KERN_WARNING "vmi: registering clock event %s. mult=%lu shift=%u\n", 225 printk(KERN_WARNING "vmi: registering clock event %s. mult=%u shift=%u\n",
230 evt->name, evt->mult, evt->shift); 226 evt->name, evt->mult, evt->shift);
231 clockevents_register_device(evt); 227 clockevents_register_device(evt);
232} 228}
diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S
index 3c68fe2d46cf..2cc249718c46 100644
--- a/arch/x86/kernel/vmlinux.lds.S
+++ b/arch/x86/kernel/vmlinux.lds.S
@@ -41,6 +41,32 @@ ENTRY(phys_startup_64)
41jiffies_64 = jiffies; 41jiffies_64 = jiffies;
42#endif 42#endif
43 43
44#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
45/*
46 * On 64-bit, align RODATA to 2MB so that even with CONFIG_DEBUG_RODATA
47 * we retain large page mappings for boundaries spanning kernel text, rodata
48 * and data sections.
49 *
50 * However, kernel identity mappings will have different RWX permissions
51 * to the pages mapping to text and to the pages padding (which are freed) the
52 * text section. Hence kernel identity mappings will be broken to smaller
53 * pages. For 64-bit, kernel text and kernel identity mappings are different,
54 * so we can enable protection checks that come with CONFIG_DEBUG_RODATA,
55 * as well as retain 2MB large page mappings for kernel text.
56 */
57#define X64_ALIGN_DEBUG_RODATA_BEGIN . = ALIGN(HPAGE_SIZE);
58
59#define X64_ALIGN_DEBUG_RODATA_END \
60 . = ALIGN(HPAGE_SIZE); \
61 __end_rodata_hpage_align = .;
62
63#else
64
65#define X64_ALIGN_DEBUG_RODATA_BEGIN
66#define X64_ALIGN_DEBUG_RODATA_END
67
68#endif
69
44PHDRS { 70PHDRS {
45 text PT_LOAD FLAGS(5); /* R_E */ 71 text PT_LOAD FLAGS(5); /* R_E */
46 data PT_LOAD FLAGS(7); /* RWE */ 72 data PT_LOAD FLAGS(7); /* RWE */
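[Editor's note: the X64_ALIGN_DEBUG_RODATA_* macros added above rely on the linker's ALIGN() rounding the location counter up to the next 2MB boundary so rodata keeps its huge-page mapping. A small sketch of that round-up arithmetic; HPAGE_SIZE is taken as 2MB for x86-64 and the sample address is arbitrary.]

#include <stdint.h>
#include <stdio.h>

#define HPAGE_SIZE	(2UL * 1024 * 1024)	/* 2MB huge page on x86-64 */

/* Equivalent of the linker's ". = ALIGN(HPAGE_SIZE)" applied to an address. */
static uint64_t align_up(uint64_t addr, uint64_t align)
{
	return (addr + align - 1) & ~(align - 1);
}

int main(void)
{
	uint64_t rodata_start = 0xffffffff817a3000ULL;	/* arbitrary example */
	printf("aligned start: %#llx\n",
	       (unsigned long long)align_up(rodata_start, HPAGE_SIZE));
	return 0;
}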
@@ -90,7 +116,9 @@ SECTIONS
90 116
91 EXCEPTION_TABLE(16) :text = 0x9090 117 EXCEPTION_TABLE(16) :text = 0x9090
92 118
119 X64_ALIGN_DEBUG_RODATA_BEGIN
93 RO_DATA(PAGE_SIZE) 120 RO_DATA(PAGE_SIZE)
121 X64_ALIGN_DEBUG_RODATA_END
94 122
95 /* Data */ 123 /* Data */
96 .data : AT(ADDR(.data) - LOAD_OFFSET) { 124 .data : AT(ADDR(.data) - LOAD_OFFSET) {
@@ -107,13 +135,13 @@ SECTIONS
107 135
108 PAGE_ALIGNED_DATA(PAGE_SIZE) 136 PAGE_ALIGNED_DATA(PAGE_SIZE)
109 137
110 CACHELINE_ALIGNED_DATA(CONFIG_X86_L1_CACHE_BYTES) 138 CACHELINE_ALIGNED_DATA(L1_CACHE_BYTES)
111 139
112 DATA_DATA 140 DATA_DATA
113 CONSTRUCTORS 141 CONSTRUCTORS
114 142
115 /* rarely changed data like cpu maps */ 143 /* rarely changed data like cpu maps */
116 READ_MOSTLY_DATA(CONFIG_X86_INTERNODE_CACHE_BYTES) 144 READ_MOSTLY_DATA(INTERNODE_CACHE_BYTES)
117 145
118 /* End of data section */ 146 /* End of data section */
119 _edata = .; 147 _edata = .;
@@ -137,12 +165,12 @@ SECTIONS
137 *(.vsyscall_0) 165 *(.vsyscall_0)
138 } :user 166 } :user
139 167
140 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 168 . = ALIGN(L1_CACHE_BYTES);
141 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { 169 .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) {
142 *(.vsyscall_fn) 170 *(.vsyscall_fn)
143 } 171 }
144 172
145 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 173 . = ALIGN(L1_CACHE_BYTES);
146 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { 174 .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) {
147 *(.vsyscall_gtod_data) 175 *(.vsyscall_gtod_data)
148 } 176 }
@@ -166,7 +194,7 @@ SECTIONS
166 } 194 }
167 vgetcpu_mode = VVIRT(.vgetcpu_mode); 195 vgetcpu_mode = VVIRT(.vgetcpu_mode);
168 196
169 . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); 197 . = ALIGN(L1_CACHE_BYTES);
170 .jiffies : AT(VLOAD(.jiffies)) { 198 .jiffies : AT(VLOAD(.jiffies)) {
171 *(.jiffies) 199 *(.jiffies)
172 } 200 }
@@ -263,8 +291,8 @@ SECTIONS
263 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { 291 .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) {
264 __smp_locks = .; 292 __smp_locks = .;
265 *(.smp_locks) 293 *(.smp_locks)
266 __smp_locks_end = .;
267 . = ALIGN(PAGE_SIZE); 294 . = ALIGN(PAGE_SIZE);
295 __smp_locks_end = .;
268 } 296 }
269 297
270#ifdef CONFIG_X86_64 298#ifdef CONFIG_X86_64
@@ -291,9 +319,7 @@ SECTIONS
291 __brk_limit = .; 319 __brk_limit = .;
292 } 320 }
293 321
294 .end : AT(ADDR(.end) - LOAD_OFFSET) { 322 _end = .;
295 _end = .;
296 }
297 323
298 STABS_DEBUG 324 STABS_DEBUG
299 DWARF_DEBUG 325 DWARF_DEBUG
@@ -315,7 +341,7 @@ SECTIONS
315 * Per-cpu symbols which need to be offset from __per_cpu_load 341 * Per-cpu symbols which need to be offset from __per_cpu_load
316 * for the boot processor. 342 * for the boot processor.
317 */ 343 */
318#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load 344#define INIT_PER_CPU(x) init_per_cpu__##x = x + __per_cpu_load
319INIT_PER_CPU(gdt_page); 345INIT_PER_CPU(gdt_page);
320INIT_PER_CPU(irq_stack_union); 346INIT_PER_CPU(irq_stack_union);
321 347
@@ -326,7 +352,7 @@ INIT_PER_CPU(irq_stack_union);
326 "kernel image bigger than KERNEL_IMAGE_SIZE"); 352 "kernel image bigger than KERNEL_IMAGE_SIZE");
327 353
328#ifdef CONFIG_SMP 354#ifdef CONFIG_SMP
329. = ASSERT((per_cpu__irq_stack_union == 0), 355. = ASSERT((irq_stack_union == 0),
330 "irq_stack_union is not at start of per-cpu area"); 356 "irq_stack_union is not at start of per-cpu area");
331#endif 357#endif
332 358
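[Editor's note: the INIT_PER_CPU() change above tracks the per-cpu symbols losing their per_cpu__ prefix, so the macro now pastes the plain name. A tiny illustration of the ## token pasting involved; the names are made up, and in the real linker script the result is a symbol assignment rather than a C variable.]

#include <stdio.h>

#define PER_CPU_LOAD	0x1000UL

/* The linker-script macro builds "init_per_cpu__<x> = x + __per_cpu_load";
 * here ## is used the same way to build the pasted identifier. */
#define INIT_PER_CPU(x)	unsigned long init_per_cpu__##x

unsigned long gdt_page;		/* stand-in for the real per-cpu symbol */
INIT_PER_CPU(gdt_page);		/* declares init_per_cpu__gdt_page */

int main(void)
{
	init_per_cpu__gdt_page = (unsigned long)&gdt_page + PER_CPU_LOAD;
	printf("init_per_cpu__gdt_page = %#lx\n", init_per_cpu__gdt_page);
	return 0;
}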
diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c
index 8cb4974ff599..1c0c6ab9c60f 100644
--- a/arch/x86/kernel/vsyscall_64.c
+++ b/arch/x86/kernel/vsyscall_64.c
@@ -73,7 +73,8 @@ void update_vsyscall_tz(void)
73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags); 73 write_sequnlock_irqrestore(&vsyscall_gtod_data.lock, flags);
74} 74}
75 75
76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock) 76void update_vsyscall(struct timespec *wall_time, struct clocksource *clock,
77 u32 mult)
77{ 78{
78 unsigned long flags; 79 unsigned long flags;
79 80
@@ -82,7 +83,7 @@ void update_vsyscall(struct timespec *wall_time, struct clocksource *clock)
82 vsyscall_gtod_data.clock.vread = clock->vread; 83 vsyscall_gtod_data.clock.vread = clock->vread;
83 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last; 84 vsyscall_gtod_data.clock.cycle_last = clock->cycle_last;
84 vsyscall_gtod_data.clock.mask = clock->mask; 85 vsyscall_gtod_data.clock.mask = clock->mask;
85 vsyscall_gtod_data.clock.mult = clock->mult; 86 vsyscall_gtod_data.clock.mult = mult;
86 vsyscall_gtod_data.clock.shift = clock->shift; 87 vsyscall_gtod_data.clock.shift = clock->shift;
87 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec; 88 vsyscall_gtod_data.wall_time_sec = wall_time->tv_sec;
88 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec; 89 vsyscall_gtod_data.wall_time_nsec = wall_time->tv_nsec;
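[Editor's note: update_vsyscall() now takes the multiplier as an explicit argument instead of reading clock->mult, so the caller can pass an adjusted value. The underlying conversion is the usual scaled math, ns = (cycles * mult) >> shift; a standalone sketch with arbitrary numbers, not a real TSC calibration.]

#include <stdint.h>
#include <stdio.h>

/* Clocksource-style scaled conversion: ns = (cycles * mult) >> shift. */
static uint64_t cyc2ns(uint64_t cycles, uint32_t mult, uint32_t shift)
{
	return (cycles * mult) >> shift;
}

int main(void)
{
	uint32_t shift = 22;
	uint32_t base_mult = 4194938;		/* arbitrary example value */
	uint32_t adj_mult  = base_mult + 120;	/* e.g. after a frequency adjustment */
	uint64_t cycles = 1000000;

	printf("base: %llu ns, adjusted: %llu ns\n",
	       (unsigned long long)cyc2ns(cycles, base_mult, shift),
	       (unsigned long long)cyc2ns(cycles, adj_mult, shift));
	return 0;
}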
@@ -237,7 +238,7 @@ static ctl_table kernel_table2[] = {
237}; 238};
238 239
239static ctl_table kernel_root_table2[] = { 240static ctl_table kernel_root_table2[] = {
240 { .ctl_name = CTL_KERN, .procname = "kernel", .mode = 0555, 241 { .procname = "kernel", .mode = 0555,
241 .child = kernel_table2 }, 242 .child = kernel_table2 },
242 {} 243 {}
243}; 244};
@@ -300,7 +301,8 @@ static int __init vsyscall_init(void)
300 register_sysctl_table(kernel_root_table2); 301 register_sysctl_table(kernel_root_table2);
301#endif 302#endif
302 on_each_cpu(cpu_vsyscall_init, NULL, 1); 303 on_each_cpu(cpu_vsyscall_init, NULL, 1);
303 hotcpu_notifier(cpu_vsyscall_notifier, 0); 304 /* notifier priority > KVM */
305 hotcpu_notifier(cpu_vsyscall_notifier, 30);
304 return 0; 306 return 0;
305} 307}
306 308
diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c
index 3909e3ba5ce3..693920b22496 100644
--- a/arch/x86/kernel/x8664_ksyms_64.c
+++ b/arch/x86/kernel/x8664_ksyms_64.c
@@ -17,8 +17,6 @@
17EXPORT_SYMBOL(mcount); 17EXPORT_SYMBOL(mcount);
18#endif 18#endif
19 19
20EXPORT_SYMBOL(kernel_thread);
21
22EXPORT_SYMBOL(__get_user_1); 20EXPORT_SYMBOL(__get_user_1);
23EXPORT_SYMBOL(__get_user_2); 21EXPORT_SYMBOL(__get_user_2);
24EXPORT_SYMBOL(__get_user_4); 22EXPORT_SYMBOL(__get_user_4);
@@ -28,11 +26,11 @@ EXPORT_SYMBOL(__put_user_2);
28EXPORT_SYMBOL(__put_user_4); 26EXPORT_SYMBOL(__put_user_4);
29EXPORT_SYMBOL(__put_user_8); 27EXPORT_SYMBOL(__put_user_8);
30 28
31EXPORT_SYMBOL(copy_user_generic); 29EXPORT_SYMBOL(copy_user_generic_string);
30EXPORT_SYMBOL(copy_user_generic_unrolled);
32EXPORT_SYMBOL(__copy_user_nocache); 31EXPORT_SYMBOL(__copy_user_nocache);
33EXPORT_SYMBOL(copy_from_user); 32EXPORT_SYMBOL(_copy_from_user);
34EXPORT_SYMBOL(copy_to_user); 33EXPORT_SYMBOL(_copy_to_user);
35EXPORT_SYMBOL(__copy_from_user_inatomic);
36 34
37EXPORT_SYMBOL(copy_page); 35EXPORT_SYMBOL(copy_page);
38EXPORT_SYMBOL(clear_page); 36EXPORT_SYMBOL(clear_page);
@@ -57,4 +55,6 @@ EXPORT_SYMBOL(__memcpy);
57 55
58EXPORT_SYMBOL(empty_zero_page); 56EXPORT_SYMBOL(empty_zero_page);
59EXPORT_SYMBOL(init_level4_pgt); 57EXPORT_SYMBOL(init_level4_pgt);
60EXPORT_SYMBOL(load_gs_index); 58#ifndef CONFIG_PARAVIRT
59EXPORT_SYMBOL(native_load_gs_index);
60#endif
diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c
index 4449a4a2c2ed..61a1e8c7e19f 100644
--- a/arch/x86/kernel/x86_init.c
+++ b/arch/x86/kernel/x86_init.c
@@ -4,20 +4,26 @@
4 * For licencing details see kernel-base/COPYING 4 * For licencing details see kernel-base/COPYING
5 */ 5 */
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/ioport.h>
7 8
8#include <asm/bios_ebda.h> 9#include <asm/bios_ebda.h>
9#include <asm/paravirt.h> 10#include <asm/paravirt.h>
11#include <asm/pci_x86.h>
10#include <asm/mpspec.h> 12#include <asm/mpspec.h>
11#include <asm/setup.h> 13#include <asm/setup.h>
12#include <asm/apic.h> 14#include <asm/apic.h>
13#include <asm/e820.h> 15#include <asm/e820.h>
14#include <asm/time.h> 16#include <asm/time.h>
15#include <asm/irq.h> 17#include <asm/irq.h>
18#include <asm/pat.h>
16#include <asm/tsc.h> 19#include <asm/tsc.h>
20#include <asm/iommu.h>
17 21
18void __cpuinit x86_init_noop(void) { } 22void __cpuinit x86_init_noop(void) { }
19void __init x86_init_uint_noop(unsigned int unused) { } 23void __init x86_init_uint_noop(unsigned int unused) { }
20void __init x86_init_pgd_noop(pgd_t *unused) { } 24void __init x86_init_pgd_noop(pgd_t *unused) { }
25int __init iommu_init_noop(void) { return 0; }
26void iommu_shutdown_noop(void) { }
21 27
22/* 28/*
23 * The platform setup functions are preset with the default functions 29 * The platform setup functions are preset with the default functions
@@ -62,14 +68,29 @@ struct x86_init_ops x86_init __initdata = {
62 .tsc_pre_init = x86_init_noop, 68 .tsc_pre_init = x86_init_noop,
63 .timer_init = hpet_time_init, 69 .timer_init = hpet_time_init,
64 }, 70 },
71
72 .iommu = {
73 .iommu_init = iommu_init_noop,
74 },
75
76 .pci = {
77 .init = x86_default_pci_init,
78 .init_irq = x86_default_pci_init_irq,
79 .fixup_irqs = x86_default_pci_fixup_irqs,
80 },
65}; 81};
66 82
67struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { 83struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = {
68 .setup_percpu_clockev = setup_secondary_APIC_clock, 84 .setup_percpu_clockev = setup_secondary_APIC_clock,
69}; 85};
70 86
87static void default_nmi_init(void) { };
88
71struct x86_platform_ops x86_platform = { 89struct x86_platform_ops x86_platform = {
72 .calibrate_tsc = native_calibrate_tsc, 90 .calibrate_tsc = native_calibrate_tsc,
73 .get_wallclock = mach_get_cmos_time, 91 .get_wallclock = mach_get_cmos_time,
74 .set_wallclock = mach_set_rtc_mmss, 92 .set_wallclock = mach_set_rtc_mmss,
93 .iommu_shutdown = iommu_shutdown_noop,
94 .is_untracked_pat_range = is_ISA_range,
95 .nmi_init = default_nmi_init
75}; 96};
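[Editor's note: x86_init and x86_platform grow more hooks here, each preset to a noop or native default so platform code only overrides what it actually needs. A minimal sketch of that function-pointer-table pattern; the structure and function names are illustrative, not the kernel's.]

#include <stdio.h>

struct platform_ops {
	void (*nmi_init)(void);
	void (*iommu_shutdown)(void);
};

static void default_nmi_init(void) { }
static void iommu_shutdown_noop(void) { }

/* Defaults do nothing; a platform overrides only the hooks it cares about. */
static struct platform_ops platform = {
	.nmi_init	= default_nmi_init,
	.iommu_shutdown	= iommu_shutdown_noop,
};

static void my_platform_nmi_init(void)
{
	printf("platform-specific NMI setup\n");
}

int main(void)
{
	platform.nmi_init = my_platform_nmi_init;	/* platform override */
	platform.nmi_init();
	platform.iommu_shutdown();			/* still the harmless default */
	return 0;
}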
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index c5ee17e8c6d9..782c3a362ec6 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -337,6 +337,7 @@ void __ref xsave_cntxt_init(void)
337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); 337 cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
338 xstate_size = ebx; 338 xstate_size = ebx;
339 339
340 update_regset_xstate_info(xstate_size, pcntxt_mask);
340 prepare_fx_sw_frame(); 341 prepare_fx_sw_frame();
341 342
342 setup_xstate_init(); 343 setup_xstate_init();
diff --git a/arch/x86/kvm/Kconfig b/arch/x86/kvm/Kconfig
index b84e571f4175..970bbd479516 100644
--- a/arch/x86/kvm/Kconfig
+++ b/arch/x86/kvm/Kconfig
@@ -28,6 +28,8 @@ config KVM
28 select HAVE_KVM_IRQCHIP 28 select HAVE_KVM_IRQCHIP
29 select HAVE_KVM_EVENTFD 29 select HAVE_KVM_EVENTFD
30 select KVM_APIC_ARCHITECTURE 30 select KVM_APIC_ARCHITECTURE
31 select USER_RETURN_NOTIFIER
32 select KVM_MMIO
31 ---help--- 33 ---help---
32 Support hosting fully virtualized guest machines using hardware 34 Support hosting fully virtualized guest machines using hardware
33 virtualization extensions. You will need a fairly recent 35 virtualization extensions. You will need a fairly recent
@@ -64,6 +66,7 @@ config KVM_AMD
64 66
65# OK, it's a little counter-intuitive to do this, but it puts it neatly under 67# OK, it's a little counter-intuitive to do this, but it puts it neatly under
66# the virtualization menu. 68# the virtualization menu.
69source drivers/vhost/Kconfig
67source drivers/lguest/Kconfig 70source drivers/lguest/Kconfig
68source drivers/virtio/Kconfig 71source drivers/virtio/Kconfig
69 72
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 0e7fe78d0f74..31a7035c4bd9 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -6,7 +6,8 @@ CFLAGS_svm.o := -I.
6CFLAGS_vmx.o := -I. 6CFLAGS_vmx.o := -I.
7 7
8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \ 8kvm-y += $(addprefix ../../../virt/kvm/, kvm_main.o ioapic.o \
9 coalesced_mmio.o irq_comm.o eventfd.o) 9 coalesced_mmio.o irq_comm.o eventfd.o \
10 assigned-dev.o)
10kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o) 11kvm-$(CONFIG_IOMMU_API) += $(addprefix ../../../virt/kvm/, iommu.o)
11 12
12kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ 13kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 1be5cd640e93..4dade6ac0827 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -32,7 +32,7 @@
32#include <linux/module.h> 32#include <linux/module.h>
33#include <asm/kvm_emulate.h> 33#include <asm/kvm_emulate.h>
34 34
35#include "mmu.h" /* for is_long_mode() */ 35#include "x86.h"
36 36
37/* 37/*
38 * Opcode effective-address decode tables. 38 * Opcode effective-address decode tables.
@@ -75,6 +75,10 @@
75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */ 75#define Group (1<<14) /* Bits 3:5 of modrm byte extend opcode */
76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */ 76#define GroupDual (1<<15) /* Alternate decoding of mod == 3 */
77#define GroupMask 0xff /* Group number stored in bits 0:7 */ 77#define GroupMask 0xff /* Group number stored in bits 0:7 */
78/* Misc flags */
79#define Lock (1<<26) /* lock prefix is allowed for the instruction */
80#define Priv (1<<27) /* instruction generates #GP if current CPL != 0 */
81#define No64 (1<<28)
78/* Source 2 operand type */ 82/* Source 2 operand type */
79#define Src2None (0<<29) 83#define Src2None (0<<29)
80#define Src2CL (1<<29) 84#define Src2CL (1<<29)
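[Editor's note: the new Lock, Priv and No64 bits are decode-table attributes. The emulator ORs them into each opcode's descriptor and, later in this patch, rejects a LOCK prefix on instructions without Lock, injects #GP for Priv instructions at CPL > 0, and refuses No64 opcodes in 64-bit mode. A compressed sketch of those checks; the constants and fault codes are illustrative, not the kernel's.]

#include <stdio.h>

#define ATTR_LOCK	(1u << 26)	/* LOCK prefix allowed */
#define ATTR_PRIV	(1u << 27)	/* requires CPL 0 */
#define ATTR_NO64	(1u << 28)	/* invalid in 64-bit mode */

enum fault { OK, FAULT_UD, FAULT_GP };

static enum fault check_insn(unsigned attrs, int has_lock_prefix,
			     int cpl, int long_mode)
{
	if (long_mode && (attrs & ATTR_NO64))
		return FAULT_UD;		/* opcode does not exist in 64-bit mode */
	if (has_lock_prefix && !(attrs & ATTR_LOCK))
		return FAULT_UD;		/* LOCK only on lockable instructions */
	if ((attrs & ATTR_PRIV) && cpl != 0)
		return FAULT_GP;		/* privileged, but not ring 0 */
	return OK;
}

int main(void)
{
	printf("%d\n", check_insn(ATTR_LOCK, 1, 3, 1));	/* OK */
	printf("%d\n", check_insn(0, 1, 3, 1));		/* FAULT_UD */
	printf("%d\n", check_insn(ATTR_PRIV, 0, 3, 1));	/* FAULT_GP */
	return 0;
}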
@@ -86,35 +90,40 @@
86enum { 90enum {
87 Group1_80, Group1_81, Group1_82, Group1_83, 91 Group1_80, Group1_81, Group1_82, Group1_83,
88 Group1A, Group3_Byte, Group3, Group4, Group5, Group7, 92 Group1A, Group3_Byte, Group3, Group4, Group5, Group7,
93 Group8, Group9,
89}; 94};
90 95
91static u32 opcode_table[256] = { 96static u32 opcode_table[256] = {
92 /* 0x00 - 0x07 */ 97 /* 0x00 - 0x07 */
93 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 98 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
94 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 99 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
95 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 100 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
101 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
96 /* 0x08 - 0x0F */ 102 /* 0x08 - 0x0F */
97 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 103 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
98 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 104 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
99 0, 0, 0, 0, 105 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
106 ImplicitOps | Stack | No64, 0,
100 /* 0x10 - 0x17 */ 107 /* 0x10 - 0x17 */
101 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 108 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
102 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 109 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
103 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 110 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
111 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
104 /* 0x18 - 0x1F */ 112 /* 0x18 - 0x1F */
105 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 113 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
106 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
107 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm, 0, 0, 115 ByteOp | DstAcc | SrcImm, DstAcc | SrcImm,
116 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
108 /* 0x20 - 0x27 */ 117 /* 0x20 - 0x27 */
109 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 118 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
110 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 119 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
111 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0, 120 DstAcc | SrcImmByte, DstAcc | SrcImm, 0, 0,
112 /* 0x28 - 0x2F */ 121 /* 0x28 - 0x2F */
113 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 122 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
114 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 123 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
115 0, 0, 0, 0, 124 0, 0, 0, 0,
116 /* 0x30 - 0x37 */ 125 /* 0x30 - 0x37 */
117 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 126 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
118 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM, 127 ByteOp | DstReg | SrcMem | ModRM, DstReg | SrcMem | ModRM,
119 0, 0, 0, 0, 128 0, 0, 0, 0,
120 /* 0x38 - 0x3F */ 129 /* 0x38 - 0x3F */
@@ -133,7 +142,8 @@ static u32 opcode_table[256] = {
133 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 142 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
134 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack, 143 DstReg | Stack, DstReg | Stack, DstReg | Stack, DstReg | Stack,
135 /* 0x60 - 0x67 */ 144 /* 0x60 - 0x67 */
136 0, 0, 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ , 145 ImplicitOps | Stack | No64, ImplicitOps | Stack | No64,
146 0, DstReg | SrcMem32 | ModRM | Mov /* movsxd (x86/64) */ ,
137 0, 0, 0, 0, 147 0, 0, 0, 0,
138 /* 0x68 - 0x6F */ 148 /* 0x68 - 0x6F */
139 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0, 149 SrcImm | Mov | Stack, 0, SrcImmByte | Mov | Stack, 0,
@@ -149,7 +159,7 @@ static u32 opcode_table[256] = {
149 Group | Group1_80, Group | Group1_81, 159 Group | Group1_80, Group | Group1_81,
150 Group | Group1_82, Group | Group1_83, 160 Group | Group1_82, Group | Group1_83,
151 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 161 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM,
152 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 162 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
153 /* 0x88 - 0x8F */ 163 /* 0x88 - 0x8F */
154 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov, 164 ByteOp | DstMem | SrcReg | ModRM | Mov, DstMem | SrcReg | ModRM | Mov,
155 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 165 ByteOp | DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -158,7 +168,7 @@ static u32 opcode_table[256] = {
158 /* 0x90 - 0x97 */ 168 /* 0x90 - 0x97 */
159 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, 169 DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg, DstReg,
160 /* 0x98 - 0x9F */ 170 /* 0x98 - 0x9F */
161 0, 0, SrcImm | Src2Imm16, 0, 171 0, 0, SrcImm | Src2Imm16 | No64, 0,
162 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0, 172 ImplicitOps | Stack, ImplicitOps | Stack, 0, 0,
163 /* 0xA0 - 0xA7 */ 173 /* 0xA0 - 0xA7 */
164 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs, 174 ByteOp | DstReg | SrcMem | Mov | MemAbs, DstReg | SrcMem | Mov | MemAbs,
@@ -185,7 +195,7 @@ static u32 opcode_table[256] = {
185 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov, 195 ByteOp | DstMem | SrcImm | ModRM | Mov, DstMem | SrcImm | ModRM | Mov,
186 /* 0xC8 - 0xCF */ 196 /* 0xC8 - 0xCF */
187 0, 0, 0, ImplicitOps | Stack, 197 0, 0, 0, ImplicitOps | Stack,
188 ImplicitOps, SrcImmByte, ImplicitOps, ImplicitOps, 198 ImplicitOps, SrcImmByte, ImplicitOps | No64, ImplicitOps,
189 /* 0xD0 - 0xD7 */ 199 /* 0xD0 - 0xD7 */
190 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 200 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
191 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM, 201 ByteOp | DstMem | SrcImplicit | ModRM, DstMem | SrcImplicit | ModRM,
@@ -198,12 +208,12 @@ static u32 opcode_table[256] = {
198 ByteOp | SrcImmUByte, SrcImmUByte, 208 ByteOp | SrcImmUByte, SrcImmUByte,
199 /* 0xE8 - 0xEF */ 209 /* 0xE8 - 0xEF */
200 SrcImm | Stack, SrcImm | ImplicitOps, 210 SrcImm | Stack, SrcImm | ImplicitOps,
201 SrcImmU | Src2Imm16, SrcImmByte | ImplicitOps, 211 SrcImmU | Src2Imm16 | No64, SrcImmByte | ImplicitOps,
202 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 212 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
203 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps, 213 SrcNone | ByteOp | ImplicitOps, SrcNone | ImplicitOps,
204 /* 0xF0 - 0xF7 */ 214 /* 0xF0 - 0xF7 */
205 0, 0, 0, 0, 215 0, 0, 0, 0,
206 ImplicitOps, ImplicitOps, Group | Group3_Byte, Group | Group3, 216 ImplicitOps | Priv, ImplicitOps, Group | Group3_Byte, Group | Group3,
207 /* 0xF8 - 0xFF */ 217 /* 0xF8 - 0xFF */
208 ImplicitOps, 0, ImplicitOps, ImplicitOps, 218 ImplicitOps, 0, ImplicitOps, ImplicitOps,
209 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5, 219 ImplicitOps, ImplicitOps, Group | Group4, Group | Group5,
@@ -211,16 +221,20 @@ static u32 opcode_table[256] = {
211 221
212static u32 twobyte_table[256] = { 222static u32 twobyte_table[256] = {
213 /* 0x00 - 0x0F */ 223 /* 0x00 - 0x0F */
214 0, Group | GroupDual | Group7, 0, 0, 0, ImplicitOps, ImplicitOps, 0, 224 0, Group | GroupDual | Group7, 0, 0,
215 ImplicitOps, ImplicitOps, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 225 0, ImplicitOps, ImplicitOps | Priv, 0,
226 ImplicitOps | Priv, ImplicitOps | Priv, 0, 0,
227 0, ImplicitOps | ModRM, 0, 0,
216 /* 0x10 - 0x1F */ 228 /* 0x10 - 0x1F */
217 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0, 229 0, 0, 0, 0, 0, 0, 0, 0, ImplicitOps | ModRM, 0, 0, 0, 0, 0, 0, 0,
218 /* 0x20 - 0x2F */ 230 /* 0x20 - 0x2F */
219 ModRM | ImplicitOps, ModRM, ModRM | ImplicitOps, ModRM, 0, 0, 0, 0, 231 ModRM | ImplicitOps | Priv, ModRM | Priv,
232 ModRM | ImplicitOps | Priv, ModRM | Priv,
233 0, 0, 0, 0,
220 0, 0, 0, 0, 0, 0, 0, 0, 234 0, 0, 0, 0, 0, 0, 0, 0,
221 /* 0x30 - 0x3F */ 235 /* 0x30 - 0x3F */
222 ImplicitOps, 0, ImplicitOps, 0, 236 ImplicitOps | Priv, 0, ImplicitOps | Priv, 0,
223 ImplicitOps, ImplicitOps, 0, 0, 237 ImplicitOps, ImplicitOps | Priv, 0, 0,
224 0, 0, 0, 0, 0, 0, 0, 0, 238 0, 0, 0, 0, 0, 0, 0, 0,
225 /* 0x40 - 0x47 */ 239 /* 0x40 - 0x47 */
226 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov, 240 DstReg | SrcMem | ModRM | Mov, DstReg | SrcMem | ModRM | Mov,
@@ -244,25 +258,29 @@ static u32 twobyte_table[256] = {
244 /* 0x90 - 0x9F */ 258 /* 0x90 - 0x9F */
245 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 259 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
246 /* 0xA0 - 0xA7 */ 260 /* 0xA0 - 0xA7 */
247 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 261 ImplicitOps | Stack, ImplicitOps | Stack,
262 0, DstMem | SrcReg | ModRM | BitOp,
248 DstMem | SrcReg | Src2ImmByte | ModRM, 263 DstMem | SrcReg | Src2ImmByte | ModRM,
249 DstMem | SrcReg | Src2CL | ModRM, 0, 0, 264 DstMem | SrcReg | Src2CL | ModRM, 0, 0,
250 /* 0xA8 - 0xAF */ 265 /* 0xA8 - 0xAF */
251 0, 0, 0, DstMem | SrcReg | ModRM | BitOp, 266 ImplicitOps | Stack, ImplicitOps | Stack,
267 0, DstMem | SrcReg | ModRM | BitOp | Lock,
252 DstMem | SrcReg | Src2ImmByte | ModRM, 268 DstMem | SrcReg | Src2ImmByte | ModRM,
253 DstMem | SrcReg | Src2CL | ModRM, 269 DstMem | SrcReg | Src2CL | ModRM,
254 ModRM, 0, 270 ModRM, 0,
255 /* 0xB0 - 0xB7 */ 271 /* 0xB0 - 0xB7 */
256 ByteOp | DstMem | SrcReg | ModRM, DstMem | SrcReg | ModRM, 0, 272 ByteOp | DstMem | SrcReg | ModRM | Lock, DstMem | SrcReg | ModRM | Lock,
257 DstMem | SrcReg | ModRM | BitOp, 273 0, DstMem | SrcReg | ModRM | BitOp | Lock,
258 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 274 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
259 DstReg | SrcMem16 | ModRM | Mov, 275 DstReg | SrcMem16 | ModRM | Mov,
260 /* 0xB8 - 0xBF */ 276 /* 0xB8 - 0xBF */
261 0, 0, DstMem | SrcImmByte | ModRM, DstMem | SrcReg | ModRM | BitOp, 277 0, 0,
278 Group | Group8, DstMem | SrcReg | ModRM | BitOp | Lock,
262 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov, 279 0, 0, ByteOp | DstReg | SrcMem | ModRM | Mov,
263 DstReg | SrcMem16 | ModRM | Mov, 280 DstReg | SrcMem16 | ModRM | Mov,
264 /* 0xC0 - 0xCF */ 281 /* 0xC0 - 0xCF */
265 0, 0, 0, DstMem | SrcReg | ModRM | Mov, 0, 0, 0, ImplicitOps | ModRM, 282 0, 0, 0, DstMem | SrcReg | ModRM | Mov,
283 0, 0, 0, Group | GroupDual | Group9,
266 0, 0, 0, 0, 0, 0, 0, 0, 284 0, 0, 0, 0, 0, 0, 0, 0,
267 /* 0xD0 - 0xDF */ 285 /* 0xD0 - 0xDF */
268 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 286 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
@@ -274,25 +292,41 @@ static u32 twobyte_table[256] = {
274 292
275static u32 group_table[] = { 293static u32 group_table[] = {
276 [Group1_80*8] = 294 [Group1_80*8] =
277 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 295 ByteOp | DstMem | SrcImm | ModRM | Lock,
278 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 296 ByteOp | DstMem | SrcImm | ModRM | Lock,
279 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 297 ByteOp | DstMem | SrcImm | ModRM | Lock,
280 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 298 ByteOp | DstMem | SrcImm | ModRM | Lock,
299 ByteOp | DstMem | SrcImm | ModRM | Lock,
300 ByteOp | DstMem | SrcImm | ModRM | Lock,
301 ByteOp | DstMem | SrcImm | ModRM | Lock,
302 ByteOp | DstMem | SrcImm | ModRM,
281 [Group1_81*8] = 303 [Group1_81*8] =
282 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 304 DstMem | SrcImm | ModRM | Lock,
283 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 305 DstMem | SrcImm | ModRM | Lock,
284 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 306 DstMem | SrcImm | ModRM | Lock,
285 DstMem | SrcImm | ModRM, DstMem | SrcImm | ModRM, 307 DstMem | SrcImm | ModRM | Lock,
308 DstMem | SrcImm | ModRM | Lock,
309 DstMem | SrcImm | ModRM | Lock,
310 DstMem | SrcImm | ModRM | Lock,
311 DstMem | SrcImm | ModRM,
286 [Group1_82*8] = 312 [Group1_82*8] =
287 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 313 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
288 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 314 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
289 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 315 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
290 ByteOp | DstMem | SrcImm | ModRM, ByteOp | DstMem | SrcImm | ModRM, 316 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
317 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
318 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
319 ByteOp | DstMem | SrcImm | ModRM | No64 | Lock,
320 ByteOp | DstMem | SrcImm | ModRM | No64,
291 [Group1_83*8] = 321 [Group1_83*8] =
292 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 322 DstMem | SrcImmByte | ModRM | Lock,
293 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 323 DstMem | SrcImmByte | ModRM | Lock,
294 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 324 DstMem | SrcImmByte | ModRM | Lock,
295 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM, 325 DstMem | SrcImmByte | ModRM | Lock,
326 DstMem | SrcImmByte | ModRM | Lock,
327 DstMem | SrcImmByte | ModRM | Lock,
328 DstMem | SrcImmByte | ModRM | Lock,
329 DstMem | SrcImmByte | ModRM,
296 [Group1A*8] = 330 [Group1A*8] =
297 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0, 331 DstMem | SrcNone | ModRM | Mov | Stack, 0, 0, 0, 0, 0, 0, 0,
298 [Group3_Byte*8] = 332 [Group3_Byte*8] =
@@ -311,24 +345,39 @@ static u32 group_table[] = {
311 SrcMem | ModRM | Stack, 0, 345 SrcMem | ModRM | Stack, 0,
312 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0, 346 SrcMem | ModRM | Stack, 0, SrcMem | ModRM | Stack, 0,
313 [Group7*8] = 347 [Group7*8] =
314 0, 0, ModRM | SrcMem, ModRM | SrcMem, 348 0, 0, ModRM | SrcMem | Priv, ModRM | SrcMem | Priv,
315 SrcNone | ModRM | DstMem | Mov, 0, 349 SrcNone | ModRM | DstMem | Mov, 0,
316 SrcMem16 | ModRM | Mov, SrcMem | ModRM | ByteOp, 350 SrcMem16 | ModRM | Mov | Priv, SrcMem | ModRM | ByteOp | Priv,
351 [Group8*8] =
352 0, 0, 0, 0,
353 DstMem | SrcImmByte | ModRM, DstMem | SrcImmByte | ModRM | Lock,
354 DstMem | SrcImmByte | ModRM | Lock, DstMem | SrcImmByte | ModRM | Lock,
355 [Group9*8] =
356 0, ImplicitOps | ModRM | Lock, 0, 0, 0, 0, 0, 0,
317}; 357};
318 358
319static u32 group2_table[] = { 359static u32 group2_table[] = {
320 [Group7*8] = 360 [Group7*8] =
321 SrcNone | ModRM, 0, 0, SrcNone | ModRM, 361 SrcNone | ModRM | Priv, 0, 0, SrcNone | ModRM,
322 SrcNone | ModRM | DstMem | Mov, 0, 362 SrcNone | ModRM | DstMem | Mov, 0,
323 SrcMem16 | ModRM | Mov, 0, 363 SrcMem16 | ModRM | Mov, 0,
364 [Group9*8] =
365 0, 0, 0, 0, 0, 0, 0, 0,
324}; 366};
325 367
326/* EFLAGS bit definitions. */ 368/* EFLAGS bit definitions. */
369#define EFLG_ID (1<<21)
370#define EFLG_VIP (1<<20)
371#define EFLG_VIF (1<<19)
372#define EFLG_AC (1<<18)
327#define EFLG_VM (1<<17) 373#define EFLG_VM (1<<17)
328#define EFLG_RF (1<<16) 374#define EFLG_RF (1<<16)
375#define EFLG_IOPL (3<<12)
376#define EFLG_NT (1<<14)
329#define EFLG_OF (1<<11) 377#define EFLG_OF (1<<11)
330#define EFLG_DF (1<<10) 378#define EFLG_DF (1<<10)
331#define EFLG_IF (1<<9) 379#define EFLG_IF (1<<9)
380#define EFLG_TF (1<<8)
332#define EFLG_SF (1<<7) 381#define EFLG_SF (1<<7)
333#define EFLG_ZF (1<<6) 382#define EFLG_ZF (1<<6)
334#define EFLG_AF (1<<4) 383#define EFLG_AF (1<<4)
@@ -597,7 +646,7 @@ static int do_fetch_insn_byte(struct x86_emulate_ctxt *ctxt,
597 646
598 if (linear < fc->start || linear >= fc->end) { 647 if (linear < fc->start || linear >= fc->end) {
599 size = min(15UL, PAGE_SIZE - offset_in_page(linear)); 648 size = min(15UL, PAGE_SIZE - offset_in_page(linear));
600 rc = ops->read_std(linear, fc->data, size, ctxt->vcpu); 649 rc = ops->fetch(linear, fc->data, size, ctxt->vcpu, NULL);
601 if (rc) 650 if (rc)
602 return rc; 651 return rc;
603 fc->start = linear; 652 fc->start = linear;
@@ -613,6 +662,9 @@ static int do_insn_fetch(struct x86_emulate_ctxt *ctxt,
613{ 662{
614 int rc = 0; 663 int rc = 0;
615 664
665 /* x86 instructions are limited to 15 bytes. */
666 if (eip + size - ctxt->decode.eip_orig > 15)
667 return X86EMUL_UNHANDLEABLE;
616 eip += ctxt->cs_base; 668 eip += ctxt->cs_base;
617 while (size--) { 669 while (size--) {
618 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++); 670 rc = do_fetch_insn_byte(ctxt, ops, eip++, dest++);
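[Editor's note: the added check caps a decode at the architectural 15-byte instruction-length limit by comparing against the EIP the decode started from. The same guard in isolation; eip_orig here is simply the saved start address.]

#include <stdio.h>

#define X86_MAX_INSN_LEN	15

/* Reject a fetch that would take the decoded instruction past 15 bytes. */
static int fetch_ok(unsigned long eip, unsigned long eip_orig, int size)
{
	return (eip + size - eip_orig) <= X86_MAX_INSN_LEN;
}

int main(void)
{
	unsigned long start = 0x1000;
	printf("%d\n", fetch_ok(start + 12, start, 3));	/* 1: exactly 15 bytes */
	printf("%d\n", fetch_ok(start + 14, start, 2));	/* 0: would be 16 bytes */
	return 0;
}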
@@ -649,11 +701,11 @@ static int read_descriptor(struct x86_emulate_ctxt *ctxt,
649 op_bytes = 3; 701 op_bytes = 3;
650 *address = 0; 702 *address = 0;
651 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2, 703 rc = ops->read_std((unsigned long)ptr, (unsigned long *)size, 2,
652 ctxt->vcpu); 704 ctxt->vcpu, NULL);
653 if (rc) 705 if (rc)
654 return rc; 706 return rc;
655 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes, 707 rc = ops->read_std((unsigned long)ptr + 2, address, op_bytes,
656 ctxt->vcpu); 708 ctxt->vcpu, NULL);
657 return rc; 709 return rc;
658} 710}
659 711
@@ -871,12 +923,13 @@ x86_decode_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
871 /* Shadow copy of register state. Committed on successful emulation. */ 923 /* Shadow copy of register state. Committed on successful emulation. */
872 924
873 memset(c, 0, sizeof(struct decode_cache)); 925 memset(c, 0, sizeof(struct decode_cache));
874 c->eip = kvm_rip_read(ctxt->vcpu); 926 c->eip = c->eip_orig = kvm_rip_read(ctxt->vcpu);
875 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS); 927 ctxt->cs_base = seg_base(ctxt, VCPU_SREG_CS);
876 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 928 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
877 929
878 switch (mode) { 930 switch (mode) {
879 case X86EMUL_MODE_REAL: 931 case X86EMUL_MODE_REAL:
932 case X86EMUL_MODE_VM86:
880 case X86EMUL_MODE_PROT16: 933 case X86EMUL_MODE_PROT16:
881 def_op_bytes = def_ad_bytes = 2; 934 def_op_bytes = def_ad_bytes = 2;
882 break; 935 break;
@@ -962,6 +1015,11 @@ done_prefixes:
962 } 1015 }
963 } 1016 }
964 1017
1018 if (mode == X86EMUL_MODE_PROT64 && (c->d & No64)) {
1019 kvm_report_emulation_failure(ctxt->vcpu, "invalid x86/64 instruction");
1020 return -1;
1021 }
1022
965 if (c->d & Group) { 1023 if (c->d & Group) {
966 group = c->d & GroupMask; 1024 group = c->d & GroupMask;
967 c->modrm = insn_fetch(u8, 1, c->eip); 1025 c->modrm = insn_fetch(u8, 1, c->eip);
@@ -1179,13 +1237,119 @@ static int emulate_pop(struct x86_emulate_ctxt *ctxt,
1179 rc = ops->read_emulated(register_address(c, ss_base(ctxt), 1237 rc = ops->read_emulated(register_address(c, ss_base(ctxt),
1180 c->regs[VCPU_REGS_RSP]), 1238 c->regs[VCPU_REGS_RSP]),
1181 dest, len, ctxt->vcpu); 1239 dest, len, ctxt->vcpu);
1182 if (rc != 0) 1240 if (rc != X86EMUL_CONTINUE)
1183 return rc; 1241 return rc;
1184 1242
1185 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len); 1243 register_address_increment(c, &c->regs[VCPU_REGS_RSP], len);
1186 return rc; 1244 return rc;
1187} 1245}
1188 1246
1247static int emulate_popf(struct x86_emulate_ctxt *ctxt,
1248 struct x86_emulate_ops *ops,
1249 void *dest, int len)
1250{
1251 int rc;
1252 unsigned long val, change_mask;
1253 int iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1254 int cpl = kvm_x86_ops->get_cpl(ctxt->vcpu);
1255
1256 rc = emulate_pop(ctxt, ops, &val, len);
1257 if (rc != X86EMUL_CONTINUE)
1258 return rc;
1259
1260 change_mask = EFLG_CF | EFLG_PF | EFLG_AF | EFLG_ZF | EFLG_SF | EFLG_OF
1261 | EFLG_TF | EFLG_DF | EFLG_NT | EFLG_RF | EFLG_AC | EFLG_ID;
1262
1263 switch(ctxt->mode) {
1264 case X86EMUL_MODE_PROT64:
1265 case X86EMUL_MODE_PROT32:
1266 case X86EMUL_MODE_PROT16:
1267 if (cpl == 0)
1268 change_mask |= EFLG_IOPL;
1269 if (cpl <= iopl)
1270 change_mask |= EFLG_IF;
1271 break;
1272 case X86EMUL_MODE_VM86:
1273 if (iopl < 3) {
1274 kvm_inject_gp(ctxt->vcpu, 0);
1275 return X86EMUL_PROPAGATE_FAULT;
1276 }
1277 change_mask |= EFLG_IF;
1278 break;
1279 default: /* real mode */
1280 change_mask |= (EFLG_IOPL | EFLG_IF);
1281 break;
1282 }
1283
1284 *(unsigned long *)dest =
1285 (ctxt->eflags & ~change_mask) | (val & change_mask);
1286
1287 return rc;
1288}
1289
1290static void emulate_push_sreg(struct x86_emulate_ctxt *ctxt, int seg)
1291{
1292 struct decode_cache *c = &ctxt->decode;
1293 struct kvm_segment segment;
1294
1295 kvm_x86_ops->get_segment(ctxt->vcpu, &segment, seg);
1296
1297 c->src.val = segment.selector;
1298 emulate_push(ctxt);
1299}
1300
1301static int emulate_pop_sreg(struct x86_emulate_ctxt *ctxt,
1302 struct x86_emulate_ops *ops, int seg)
1303{
1304 struct decode_cache *c = &ctxt->decode;
1305 unsigned long selector;
1306 int rc;
1307
1308 rc = emulate_pop(ctxt, ops, &selector, c->op_bytes);
1309 if (rc != 0)
1310 return rc;
1311
1312 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)selector, seg);
1313 return rc;
1314}
1315
1316static void emulate_pusha(struct x86_emulate_ctxt *ctxt)
1317{
1318 struct decode_cache *c = &ctxt->decode;
1319 unsigned long old_esp = c->regs[VCPU_REGS_RSP];
1320 int reg = VCPU_REGS_RAX;
1321
1322 while (reg <= VCPU_REGS_RDI) {
1323 (reg == VCPU_REGS_RSP) ?
1324 (c->src.val = old_esp) : (c->src.val = c->regs[reg]);
1325
1326 emulate_push(ctxt);
1327 ++reg;
1328 }
1329}
1330
1331static int emulate_popa(struct x86_emulate_ctxt *ctxt,
1332 struct x86_emulate_ops *ops)
1333{
1334 struct decode_cache *c = &ctxt->decode;
1335 int rc = 0;
1336 int reg = VCPU_REGS_RDI;
1337
1338 while (reg >= VCPU_REGS_RAX) {
1339 if (reg == VCPU_REGS_RSP) {
1340 register_address_increment(c, &c->regs[VCPU_REGS_RSP],
1341 c->op_bytes);
1342 --reg;
1343 }
1344
1345 rc = emulate_pop(ctxt, ops, &c->regs[reg], c->op_bytes);
1346 if (rc != 0)
1347 break;
1348 --reg;
1349 }
1350 return rc;
1351}
1352
1189static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt, 1353static inline int emulate_grp1a(struct x86_emulate_ctxt *ctxt,
1190 struct x86_emulate_ops *ops) 1354 struct x86_emulate_ops *ops)
1191{ 1355{
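[Editor's note: emulate_popf() above only lets the guest change the EFLAGS bits it is privileged to touch: IF is writable only when CPL <= IOPL, IOPL only at CPL 0, and the rest of change_mask is always writable. A userspace sketch of the masked merge; bit positions are the architectural ones, the always-writable mask is trimmed for brevity, and the privilege rules follow the protected-mode branch above.]

#include <stdio.h>

#define EFLG_IF		(1u << 9)
#define EFLG_IOPL	(3u << 12)
#define IOPL_SHIFT	12

/* Merge a popped value into EFLAGS, restricted by CPL/IOPL as POPF is
 * in protected mode. */
static unsigned popf(unsigned eflags, unsigned popped, int cpl)
{
	unsigned iopl = (eflags & EFLG_IOPL) >> IOPL_SHIFT;
	unsigned mask = 0x0dd5;		/* CF PF AF ZF SF TF DF OF, kept small here */

	if (cpl == 0)
		mask |= EFLG_IOPL;	/* only ring 0 may change IOPL */
	if (cpl <= (int)iopl)
		mask |= EFLG_IF;	/* IF needs CPL <= IOPL */

	return (eflags & ~mask) | (popped & mask);
}

int main(void)
{
	unsigned eflags = 0x0002;	/* reserved bit 1 set, IOPL = 0 */

	/* CPL 3 tries to set IF: silently dropped, as on real hardware. */
	printf("%#x\n", popf(eflags, eflags | EFLG_IF, 3));
	/* CPL 0 may set both IF and IOPL. */
	printf("%#x\n", popf(eflags, eflags | EFLG_IF | EFLG_IOPL, 0));
	return 0;
}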
@@ -1290,7 +1454,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1290 int rc; 1454 int rc;
1291 1455
1292 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu); 1456 rc = ops->read_emulated(memop, &old, 8, ctxt->vcpu);
1293 if (rc != 0) 1457 if (rc != X86EMUL_CONTINUE)
1294 return rc; 1458 return rc;
1295 1459
1296 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) || 1460 if (((u32) (old >> 0) != (u32) c->regs[VCPU_REGS_RAX]) ||
@@ -1305,7 +1469,7 @@ static inline int emulate_grp9(struct x86_emulate_ctxt *ctxt,
1305 (u32) c->regs[VCPU_REGS_RBX]; 1469 (u32) c->regs[VCPU_REGS_RBX];
1306 1470
1307 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu); 1471 rc = ops->cmpxchg_emulated(memop, &old, &new, 8, ctxt->vcpu);
1308 if (rc != 0) 1472 if (rc != X86EMUL_CONTINUE)
1309 return rc; 1473 return rc;
1310 ctxt->eflags |= EFLG_ZF; 1474 ctxt->eflags |= EFLG_ZF;
1311 } 1475 }
@@ -1327,7 +1491,7 @@ static int emulate_ret_far(struct x86_emulate_ctxt *ctxt,
1327 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes); 1491 rc = emulate_pop(ctxt, ops, &cs, c->op_bytes);
1328 if (rc) 1492 if (rc)
1329 return rc; 1493 return rc;
1330 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, 1, VCPU_SREG_CS); 1494 rc = kvm_load_segment_descriptor(ctxt->vcpu, (u16)cs, VCPU_SREG_CS);
1331 return rc; 1495 return rc;
1332} 1496}
1333 1497
@@ -1371,7 +1535,7 @@ static inline int writeback(struct x86_emulate_ctxt *ctxt,
1371 &c->dst.val, 1535 &c->dst.val,
1372 c->dst.bytes, 1536 c->dst.bytes,
1373 ctxt->vcpu); 1537 ctxt->vcpu);
1374 if (rc != 0) 1538 if (rc != X86EMUL_CONTINUE)
1375 return rc; 1539 return rc;
1376 break; 1540 break;
1377 case OP_NONE: 1541 case OP_NONE:
@@ -1434,9 +1598,8 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1434 u64 msr_data; 1598 u64 msr_data;
1435 1599
1436 /* syscall is not available in real mode */ 1600 /* syscall is not available in real mode */
1437 if (c->lock_prefix || ctxt->mode == X86EMUL_MODE_REAL 1601 if (ctxt->mode == X86EMUL_MODE_REAL || ctxt->mode == X86EMUL_MODE_VM86)
1438 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) 1602 return X86EMUL_UNHANDLEABLE;
1439 return -1;
1440 1603
1441 setup_syscalls_segments(ctxt, &cs, &ss); 1604 setup_syscalls_segments(ctxt, &cs, &ss);
1442 1605
@@ -1473,7 +1636,7 @@ emulate_syscall(struct x86_emulate_ctxt *ctxt)
1473 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF); 1636 ctxt->eflags &= ~(EFLG_VM | EFLG_IF | EFLG_RF);
1474 } 1637 }
1475 1638
1476 return 0; 1639 return X86EMUL_CONTINUE;
1477} 1640}
1478 1641
1479static int 1642static int
@@ -1483,22 +1646,17 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1483 struct kvm_segment cs, ss; 1646 struct kvm_segment cs, ss;
1484 u64 msr_data; 1647 u64 msr_data;
1485 1648
1486 /* inject #UD if LOCK prefix is used */ 1649 /* inject #GP if in real mode */
1487 if (c->lock_prefix) 1650 if (ctxt->mode == X86EMUL_MODE_REAL) {
1488 return -1;
1489
1490 /* inject #GP if in real mode or paging is disabled */
1491 if (ctxt->mode == X86EMUL_MODE_REAL ||
1492 !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1493 kvm_inject_gp(ctxt->vcpu, 0); 1651 kvm_inject_gp(ctxt->vcpu, 0);
1494 return -1; 1652 return X86EMUL_UNHANDLEABLE;
1495 } 1653 }
1496 1654
1497 /* XXX sysenter/sysexit have not been tested in 64bit mode. 1655 /* XXX sysenter/sysexit have not been tested in 64bit mode.
1498 * Therefore, we inject an #UD. 1656 * Therefore, we inject an #UD.
1499 */ 1657 */
1500 if (ctxt->mode == X86EMUL_MODE_PROT64) 1658 if (ctxt->mode == X86EMUL_MODE_PROT64)
1501 return -1; 1659 return X86EMUL_UNHANDLEABLE;
1502 1660
1503 setup_syscalls_segments(ctxt, &cs, &ss); 1661 setup_syscalls_segments(ctxt, &cs, &ss);
1504 1662
@@ -1507,13 +1665,13 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1507 case X86EMUL_MODE_PROT32: 1665 case X86EMUL_MODE_PROT32:
1508 if ((msr_data & 0xfffc) == 0x0) { 1666 if ((msr_data & 0xfffc) == 0x0) {
1509 kvm_inject_gp(ctxt->vcpu, 0); 1667 kvm_inject_gp(ctxt->vcpu, 0);
1510 return -1; 1668 return X86EMUL_PROPAGATE_FAULT;
1511 } 1669 }
1512 break; 1670 break;
1513 case X86EMUL_MODE_PROT64: 1671 case X86EMUL_MODE_PROT64:
1514 if (msr_data == 0x0) { 1672 if (msr_data == 0x0) {
1515 kvm_inject_gp(ctxt->vcpu, 0); 1673 kvm_inject_gp(ctxt->vcpu, 0);
1516 return -1; 1674 return X86EMUL_PROPAGATE_FAULT;
1517 } 1675 }
1518 break; 1676 break;
1519 } 1677 }
@@ -1538,7 +1696,7 @@ emulate_sysenter(struct x86_emulate_ctxt *ctxt)
1538 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data); 1696 kvm_x86_ops->get_msr(ctxt->vcpu, MSR_IA32_SYSENTER_ESP, &msr_data);
1539 c->regs[VCPU_REGS_RSP] = msr_data; 1697 c->regs[VCPU_REGS_RSP] = msr_data;
1540 1698
1541 return 0; 1699 return X86EMUL_CONTINUE;
1542} 1700}
1543 1701
1544static int 1702static int
@@ -1549,21 +1707,11 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1549 u64 msr_data; 1707 u64 msr_data;
1550 int usermode; 1708 int usermode;
1551 1709
1552 /* inject #UD if LOCK prefix is used */ 1710 /* inject #GP if in real mode or Virtual 8086 mode */
1553 if (c->lock_prefix) 1711 if (ctxt->mode == X86EMUL_MODE_REAL ||
1554 return -1; 1712 ctxt->mode == X86EMUL_MODE_VM86) {
1555
1556 /* inject #GP if in real mode or paging is disabled */
1557 if (ctxt->mode == X86EMUL_MODE_REAL
1558 || !(ctxt->vcpu->arch.cr0 & X86_CR0_PE)) {
1559 kvm_inject_gp(ctxt->vcpu, 0);
1560 return -1;
1561 }
1562
1563 /* sysexit must be called from CPL 0 */
1564 if (kvm_x86_ops->get_cpl(ctxt->vcpu) != 0) {
1565 kvm_inject_gp(ctxt->vcpu, 0); 1713 kvm_inject_gp(ctxt->vcpu, 0);
1566 return -1; 1714 return X86EMUL_UNHANDLEABLE;
1567 } 1715 }
1568 1716
1569 setup_syscalls_segments(ctxt, &cs, &ss); 1717 setup_syscalls_segments(ctxt, &cs, &ss);
@@ -1581,7 +1729,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1581 cs.selector = (u16)(msr_data + 16); 1729 cs.selector = (u16)(msr_data + 16);
1582 if ((msr_data & 0xfffc) == 0x0) { 1730 if ((msr_data & 0xfffc) == 0x0) {
1583 kvm_inject_gp(ctxt->vcpu, 0); 1731 kvm_inject_gp(ctxt->vcpu, 0);
1584 return -1; 1732 return X86EMUL_PROPAGATE_FAULT;
1585 } 1733 }
1586 ss.selector = (u16)(msr_data + 24); 1734 ss.selector = (u16)(msr_data + 24);
1587 break; 1735 break;
@@ -1589,7 +1737,7 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1589 cs.selector = (u16)(msr_data + 32); 1737 cs.selector = (u16)(msr_data + 32);
1590 if (msr_data == 0x0) { 1738 if (msr_data == 0x0) {
1591 kvm_inject_gp(ctxt->vcpu, 0); 1739 kvm_inject_gp(ctxt->vcpu, 0);
1592 return -1; 1740 return X86EMUL_PROPAGATE_FAULT;
1593 } 1741 }
1594 ss.selector = cs.selector + 8; 1742 ss.selector = cs.selector + 8;
1595 cs.db = 0; 1743 cs.db = 0;
@@ -1605,7 +1753,58 @@ emulate_sysexit(struct x86_emulate_ctxt *ctxt)
1605 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX]; 1753 c->eip = ctxt->vcpu->arch.regs[VCPU_REGS_RDX];
1606 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX]; 1754 c->regs[VCPU_REGS_RSP] = ctxt->vcpu->arch.regs[VCPU_REGS_RCX];
1607 1755
1608 return 0; 1756 return X86EMUL_CONTINUE;
1757}
1758
1759static bool emulator_bad_iopl(struct x86_emulate_ctxt *ctxt)
1760{
1761 int iopl;
1762 if (ctxt->mode == X86EMUL_MODE_REAL)
1763 return false;
1764 if (ctxt->mode == X86EMUL_MODE_VM86)
1765 return true;
1766 iopl = (ctxt->eflags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1767 return kvm_x86_ops->get_cpl(ctxt->vcpu) > iopl;
1768}
1769
1770static bool emulator_io_port_access_allowed(struct x86_emulate_ctxt *ctxt,
1771 struct x86_emulate_ops *ops,
1772 u16 port, u16 len)
1773{
1774 struct kvm_segment tr_seg;
1775 int r;
1776 u16 io_bitmap_ptr;
1777 u8 perm, bit_idx = port & 0x7;
1778 unsigned mask = (1 << len) - 1;
1779
1780 kvm_get_segment(ctxt->vcpu, &tr_seg, VCPU_SREG_TR);
1781 if (tr_seg.unusable)
1782 return false;
1783 if (tr_seg.limit < 103)
1784 return false;
1785 r = ops->read_std(tr_seg.base + 102, &io_bitmap_ptr, 2, ctxt->vcpu,
1786 NULL);
1787 if (r != X86EMUL_CONTINUE)
1788 return false;
1789 if (io_bitmap_ptr + port/8 > tr_seg.limit)
1790 return false;
1791 r = ops->read_std(tr_seg.base + io_bitmap_ptr + port/8, &perm, 1,
1792 ctxt->vcpu, NULL);
1793 if (r != X86EMUL_CONTINUE)
1794 return false;
1795 if ((perm >> bit_idx) & mask)
1796 return false;
1797 return true;
1798}
1799
1800static bool emulator_io_permited(struct x86_emulate_ctxt *ctxt,
1801 struct x86_emulate_ops *ops,
1802 u16 port, u16 len)
1803{
1804 if (emulator_bad_iopl(ctxt))
1805 if (!emulator_io_port_access_allowed(ctxt, ops, port, len))
1806 return false;
1807 return true;
1609} 1808}
1610 1809
1611int 1810int
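[Editor's note: emulator_io_port_access_allowed() above walks the I/O permission bitmap in the 32-bit TSS: the bitmap offset lives at TSS+102, each port is one bit, and an access of len consecutive ports is permitted only if every corresponding bit is clear. A hedged sketch with the TSS replaced by a flat byte array; the kernel version reads guest memory through ops->read_std and also checks the segment's unusable bit.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define TSS_IOBITMAP_BASE_OFF	102	/* 16-bit I/O map base in the 32-bit TSS */

/* Allow the access only if every bit for ports [port, port+len) is 0. */
static int io_permitted(const uint8_t *tss, size_t tss_limit,
			uint16_t port, int len)
{
	uint16_t bitmap_off;
	int i;

	if (tss_limit < 103)
		return 0;			/* TSS too small to hold the base */
	memcpy(&bitmap_off, tss + TSS_IOBITMAP_BASE_OFF, 2);

	for (i = 0; i < len; i++) {
		uint16_t p = port + i;
		size_t byte = bitmap_off + p / 8;

		if (byte > tss_limit)
			return 0;		/* beyond the segment limit: deny */
		if (tss[byte] & (1u << (p % 8)))
			return 0;		/* bit set: port is forbidden */
	}
	return 1;
}

int main(void)
{
	uint8_t tss[256] = { 0 };
	uint16_t bitmap_off = 104;

	memcpy(tss + TSS_IOBITMAP_BASE_OFF, &bitmap_off, 2);
	tss[bitmap_off + 0x60 / 8] |= 1u << (0x60 % 8);	/* forbid port 0x60 */

	printf("port 0x60:  %d\n", io_permitted(tss, sizeof(tss) - 1, 0x60, 1));
	printf("port 0x3f8: %d\n", io_permitted(tss, sizeof(tss) - 1, 0x3f8, 1));
	return 0;
}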
@@ -1629,6 +1828,18 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1629 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs); 1828 memcpy(c->regs, ctxt->vcpu->arch.regs, sizeof c->regs);
1630 saved_eip = c->eip; 1829 saved_eip = c->eip;
1631 1830
1831 /* LOCK prefix is allowed only with some instructions */
1832 if (c->lock_prefix && !(c->d & Lock)) {
1833 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1834 goto done;
1835 }
1836
1837 /* Privileged instruction can be executed only in CPL=0 */
1838 if ((c->d & Priv) && kvm_x86_ops->get_cpl(ctxt->vcpu)) {
1839 kvm_inject_gp(ctxt->vcpu, 0);
1840 goto done;
1841 }
1842
1632 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs)) 1843 if (((c->d & ModRM) && (c->modrm_mod != 3)) || (c->d & MemAbs))
1633 memop = c->modrm_ea; 1844 memop = c->modrm_ea;
1634 1845
@@ -1669,7 +1880,7 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1669 &c->src.val, 1880 &c->src.val,
1670 c->src.bytes, 1881 c->src.bytes,
1671 ctxt->vcpu); 1882 ctxt->vcpu);
1672 if (rc != 0) 1883 if (rc != X86EMUL_CONTINUE)
1673 goto done; 1884 goto done;
1674 c->src.orig_val = c->src.val; 1885 c->src.orig_val = c->src.val;
1675 } 1886 }
@@ -1688,12 +1899,15 @@ x86_emulate_insn(struct x86_emulate_ctxt *ctxt, struct x86_emulate_ops *ops)
1688 c->dst.ptr = (void *)c->dst.ptr + 1899 c->dst.ptr = (void *)c->dst.ptr +
1689 (c->src.val & mask) / 8; 1900 (c->src.val & mask) / 8;
1690 } 1901 }
1691 if (!(c->d & Mov) && 1902 if (!(c->d & Mov)) {
1692 /* optimisation - avoid slow emulated read */ 1903 /* optimisation - avoid slow emulated read */
1693 ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 1904 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1694 &c->dst.val, 1905 &c->dst.val,
1695 c->dst.bytes, ctxt->vcpu)) != 0)) 1906 c->dst.bytes,
1696 goto done; 1907 ctxt->vcpu);
1908 if (rc != X86EMUL_CONTINUE)
1909 goto done;
1910 }
1697 } 1911 }
1698 c->dst.orig_val = c->dst.val; 1912 c->dst.orig_val = c->dst.val;
1699 1913
@@ -1707,18 +1921,45 @@ special_insn:
1707 add: /* add */ 1921 add: /* add */
1708 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags); 1922 emulate_2op_SrcV("add", c->src, c->dst, ctxt->eflags);
1709 break; 1923 break;
1924 case 0x06: /* push es */
1925 emulate_push_sreg(ctxt, VCPU_SREG_ES);
1926 break;
1927 case 0x07: /* pop es */
1928 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_ES);
1929 if (rc != 0)
1930 goto done;
1931 break;
1710 case 0x08 ... 0x0d: 1932 case 0x08 ... 0x0d:
1711 or: /* or */ 1933 or: /* or */
1712 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags); 1934 emulate_2op_SrcV("or", c->src, c->dst, ctxt->eflags);
1713 break; 1935 break;
1936 case 0x0e: /* push cs */
1937 emulate_push_sreg(ctxt, VCPU_SREG_CS);
1938 break;
1714 case 0x10 ... 0x15: 1939 case 0x10 ... 0x15:
1715 adc: /* adc */ 1940 adc: /* adc */
1716 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags); 1941 emulate_2op_SrcV("adc", c->src, c->dst, ctxt->eflags);
1717 break; 1942 break;
1943 case 0x16: /* push ss */
1944 emulate_push_sreg(ctxt, VCPU_SREG_SS);
1945 break;
1946 case 0x17: /* pop ss */
1947 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_SS);
1948 if (rc != 0)
1949 goto done;
1950 break;
1718 case 0x18 ... 0x1d: 1951 case 0x18 ... 0x1d:
1719 sbb: /* sbb */ 1952 sbb: /* sbb */
1720 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags); 1953 emulate_2op_SrcV("sbb", c->src, c->dst, ctxt->eflags);
1721 break; 1954 break;
1955 case 0x1e: /* push ds */
1956 emulate_push_sreg(ctxt, VCPU_SREG_DS);
1957 break;
1958 case 0x1f: /* pop ds */
1959 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_DS);
1960 if (rc != 0)
1961 goto done;
1962 break;
1722 case 0x20 ... 0x25: 1963 case 0x20 ... 0x25:
1723 and: /* and */ 1964 and: /* and */
1724 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags); 1965 emulate_2op_SrcV("and", c->src, c->dst, ctxt->eflags);
@@ -1750,6 +1991,14 @@ special_insn:
1750 if (rc != 0) 1991 if (rc != 0)
1751 goto done; 1992 goto done;
1752 break; 1993 break;
1994 case 0x60: /* pusha */
1995 emulate_pusha(ctxt);
1996 break;
1997 case 0x61: /* popa */
1998 rc = emulate_popa(ctxt, ops);
1999 if (rc != 0)
2000 goto done;
2001 break;
1753 case 0x63: /* movsxd */ 2002 case 0x63: /* movsxd */
1754 if (ctxt->mode != X86EMUL_MODE_PROT64) 2003 if (ctxt->mode != X86EMUL_MODE_PROT64)
1755 goto cannot_emulate; 2004 goto cannot_emulate;
@@ -1761,7 +2010,12 @@ special_insn:
1761 break; 2010 break;
1762 case 0x6c: /* insb */ 2011 case 0x6c: /* insb */
1763 case 0x6d: /* insw/insd */ 2012 case 0x6d: /* insw/insd */
1764 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 2013 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2014 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2015 kvm_inject_gp(ctxt->vcpu, 0);
2016 goto done;
2017 }
2018 if (kvm_emulate_pio_string(ctxt->vcpu,
1765 1, 2019 1,
1766 (c->d & ByteOp) ? 1 : c->op_bytes, 2020 (c->d & ByteOp) ? 1 : c->op_bytes,
1767 c->rep_prefix ? 2021 c->rep_prefix ?
@@ -1777,7 +2031,12 @@ special_insn:
1777 return 0; 2031 return 0;
1778 case 0x6e: /* outsb */ 2032 case 0x6e: /* outsb */
1779 case 0x6f: /* outsw/outsd */ 2033 case 0x6f: /* outsw/outsd */
1780 if (kvm_emulate_pio_string(ctxt->vcpu, NULL, 2034 if (!emulator_io_permited(ctxt, ops, c->regs[VCPU_REGS_RDX],
2035 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2036 kvm_inject_gp(ctxt->vcpu, 0);
2037 goto done;
2038 }
2039 if (kvm_emulate_pio_string(ctxt->vcpu,
1781 0, 2040 0,
1782 (c->d & ByteOp) ? 1 : c->op_bytes, 2041 (c->d & ByteOp) ? 1 : c->op_bytes,
1783 c->rep_prefix ? 2042 c->rep_prefix ?
@@ -1863,25 +2122,19 @@ special_insn:
1863 break; 2122 break;
1864 case 0x8e: { /* mov seg, r/m16 */ 2123 case 0x8e: { /* mov seg, r/m16 */
1865 uint16_t sel; 2124 uint16_t sel;
1866 int type_bits;
1867 int err;
1868 2125
1869 sel = c->src.val; 2126 sel = c->src.val;
1870 if (c->modrm_reg == VCPU_SREG_SS)
1871 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
1872 2127
1873 if (c->modrm_reg <= 5) { 2128 if (c->modrm_reg == VCPU_SREG_CS ||
1874 type_bits = (c->modrm_reg == 1) ? 9 : 1; 2129 c->modrm_reg > VCPU_SREG_GS) {
1875 err = kvm_load_segment_descriptor(ctxt->vcpu, sel, 2130 kvm_queue_exception(ctxt->vcpu, UD_VECTOR);
1876 type_bits, c->modrm_reg); 2131 goto done;
1877 } else {
1878 printk(KERN_INFO "Invalid segreg in modrm byte 0x%02x\n",
1879 c->modrm);
1880 goto cannot_emulate;
1881 } 2132 }
1882 2133
1883 if (err < 0) 2134 if (c->modrm_reg == VCPU_SREG_SS)
1884 goto cannot_emulate; 2135 toggle_interruptibility(ctxt, X86_SHADOW_INT_MOV_SS);
2136
2137 rc = kvm_load_segment_descriptor(ctxt->vcpu, sel, c->modrm_reg);
1885 2138
1886 c->dst.type = OP_NONE; /* Disable writeback. */ 2139 c->dst.type = OP_NONE; /* Disable writeback. */
1887 break; 2140 break;
@@ -1910,7 +2163,10 @@ special_insn:
1910 c->dst.type = OP_REG; 2163 c->dst.type = OP_REG;
1911 c->dst.ptr = (unsigned long *) &ctxt->eflags; 2164 c->dst.ptr = (unsigned long *) &ctxt->eflags;
1912 c->dst.bytes = c->op_bytes; 2165 c->dst.bytes = c->op_bytes;
1913 goto pop_instruction; 2166 rc = emulate_popf(ctxt, ops, &c->dst.val, c->op_bytes);
2167 if (rc != X86EMUL_CONTINUE)
2168 goto done;
2169 break;
1914 case 0xa0 ... 0xa1: /* mov */ 2170 case 0xa0 ... 0xa1: /* mov */
1915 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2171 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1916 c->dst.val = c->src.val; 2172 c->dst.val = c->src.val;
@@ -1924,11 +2180,12 @@ special_insn:
1924 c->dst.ptr = (unsigned long *)register_address(c, 2180 c->dst.ptr = (unsigned long *)register_address(c,
1925 es_base(ctxt), 2181 es_base(ctxt),
1926 c->regs[VCPU_REGS_RDI]); 2182 c->regs[VCPU_REGS_RDI]);
1927 if ((rc = ops->read_emulated(register_address(c, 2183 rc = ops->read_emulated(register_address(c,
1928 seg_override_base(ctxt, c), 2184 seg_override_base(ctxt, c),
1929 c->regs[VCPU_REGS_RSI]), 2185 c->regs[VCPU_REGS_RSI]),
1930 &c->dst.val, 2186 &c->dst.val,
1931 c->dst.bytes, ctxt->vcpu)) != 0) 2187 c->dst.bytes, ctxt->vcpu);
2188 if (rc != X86EMUL_CONTINUE)
1932 goto done; 2189 goto done;
1933 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2190 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1934 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2191 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -1943,10 +2200,11 @@ special_insn:
1943 c->src.ptr = (unsigned long *)register_address(c, 2200 c->src.ptr = (unsigned long *)register_address(c,
1944 seg_override_base(ctxt, c), 2201 seg_override_base(ctxt, c),
1945 c->regs[VCPU_REGS_RSI]); 2202 c->regs[VCPU_REGS_RSI]);
1946 if ((rc = ops->read_emulated((unsigned long)c->src.ptr, 2203 rc = ops->read_emulated((unsigned long)c->src.ptr,
1947 &c->src.val, 2204 &c->src.val,
1948 c->src.bytes, 2205 c->src.bytes,
1949 ctxt->vcpu)) != 0) 2206 ctxt->vcpu);
2207 if (rc != X86EMUL_CONTINUE)
1950 goto done; 2208 goto done;
1951 2209
1952 c->dst.type = OP_NONE; /* Disable writeback. */ 2210 c->dst.type = OP_NONE; /* Disable writeback. */
@@ -1954,10 +2212,11 @@ special_insn:
1954 c->dst.ptr = (unsigned long *)register_address(c, 2212 c->dst.ptr = (unsigned long *)register_address(c,
1955 es_base(ctxt), 2213 es_base(ctxt),
1956 c->regs[VCPU_REGS_RDI]); 2214 c->regs[VCPU_REGS_RDI]);
1957 if ((rc = ops->read_emulated((unsigned long)c->dst.ptr, 2215 rc = ops->read_emulated((unsigned long)c->dst.ptr,
1958 &c->dst.val, 2216 &c->dst.val,
1959 c->dst.bytes, 2217 c->dst.bytes,
1960 ctxt->vcpu)) != 0) 2218 ctxt->vcpu);
2219 if (rc != X86EMUL_CONTINUE)
1961 goto done; 2220 goto done;
1962 2221
1963 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr); 2222 DPRINTF("cmps: mem1=0x%p mem2=0x%p\n", c->src.ptr, c->dst.ptr);
@@ -1987,12 +2246,13 @@ special_insn:
1987 c->dst.type = OP_REG; 2246 c->dst.type = OP_REG;
1988 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes; 2247 c->dst.bytes = (c->d & ByteOp) ? 1 : c->op_bytes;
1989 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX]; 2248 c->dst.ptr = (unsigned long *)&c->regs[VCPU_REGS_RAX];
1990 if ((rc = ops->read_emulated(register_address(c, 2249 rc = ops->read_emulated(register_address(c,
1991 seg_override_base(ctxt, c), 2250 seg_override_base(ctxt, c),
1992 c->regs[VCPU_REGS_RSI]), 2251 c->regs[VCPU_REGS_RSI]),
1993 &c->dst.val, 2252 &c->dst.val,
1994 c->dst.bytes, 2253 c->dst.bytes,
1995 ctxt->vcpu)) != 0) 2254 ctxt->vcpu);
2255 if (rc != X86EMUL_CONTINUE)
1996 goto done; 2256 goto done;
1997 register_address_increment(c, &c->regs[VCPU_REGS_RSI], 2257 register_address_increment(c, &c->regs[VCPU_REGS_RSI],
1998 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes 2258 (ctxt->eflags & EFLG_DF) ? -c->dst.bytes
@@ -2048,11 +2308,9 @@ special_insn:
2048 case 0xe9: /* jmp rel */ 2308 case 0xe9: /* jmp rel */
2049 goto jmp; 2309 goto jmp;
2050 case 0xea: /* jmp far */ 2310 case 0xea: /* jmp far */
2051 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val, 9, 2311 if (kvm_load_segment_descriptor(ctxt->vcpu, c->src2.val,
2052 VCPU_SREG_CS) < 0) { 2312 VCPU_SREG_CS))
2053 DPRINTF("jmp far: Failed to load CS descriptor\n"); 2313 goto done;
2054 goto cannot_emulate;
2055 }
2056 2314
2057 c->eip = c->src.val; 2315 c->eip = c->src.val;
2058 break; 2316 break;
@@ -2070,7 +2328,13 @@ special_insn:
2070 case 0xef: /* out (e/r)ax,dx */ 2328 case 0xef: /* out (e/r)ax,dx */
2071 port = c->regs[VCPU_REGS_RDX]; 2329 port = c->regs[VCPU_REGS_RDX];
2072 io_dir_in = 0; 2330 io_dir_in = 0;
2073 do_io: if (kvm_emulate_pio(ctxt->vcpu, NULL, io_dir_in, 2331 do_io:
2332 if (!emulator_io_permited(ctxt, ops, port,
2333 (c->d & ByteOp) ? 1 : c->op_bytes)) {
2334 kvm_inject_gp(ctxt->vcpu, 0);
2335 goto done;
2336 }
2337 if (kvm_emulate_pio(ctxt->vcpu, io_dir_in,
2074 (c->d & ByteOp) ? 1 : c->op_bytes, 2338 (c->d & ByteOp) ? 1 : c->op_bytes,
2075 port) != 0) { 2339 port) != 0) {
2076 c->eip = saved_eip; 2340 c->eip = saved_eip;
@@ -2095,13 +2359,21 @@ special_insn:
2095 c->dst.type = OP_NONE; /* Disable writeback. */ 2359 c->dst.type = OP_NONE; /* Disable writeback. */
2096 break; 2360 break;
2097 case 0xfa: /* cli */ 2361 case 0xfa: /* cli */
2098 ctxt->eflags &= ~X86_EFLAGS_IF; 2362 if (emulator_bad_iopl(ctxt))
2099 c->dst.type = OP_NONE; /* Disable writeback. */ 2363 kvm_inject_gp(ctxt->vcpu, 0);
2364 else {
2365 ctxt->eflags &= ~X86_EFLAGS_IF;
2366 c->dst.type = OP_NONE; /* Disable writeback. */
2367 }
2100 break; 2368 break;
2101 case 0xfb: /* sti */ 2369 case 0xfb: /* sti */
2102 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI); 2370 if (emulator_bad_iopl(ctxt))
2103 ctxt->eflags |= X86_EFLAGS_IF; 2371 kvm_inject_gp(ctxt->vcpu, 0);
2104 c->dst.type = OP_NONE; /* Disable writeback. */ 2372 else {
2373 toggle_interruptibility(ctxt, X86_SHADOW_INT_STI);
2374 ctxt->eflags |= X86_EFLAGS_IF;
2375 c->dst.type = OP_NONE; /* Disable writeback. */
2376 }
2105 break; 2377 break;
2106 case 0xfc: /* cld */ 2378 case 0xfc: /* cld */
2107 ctxt->eflags &= ~EFLG_DF; 2379 ctxt->eflags &= ~EFLG_DF;
@@ -2204,8 +2476,9 @@ twobyte_insn:
2204 } 2476 }
2205 break; 2477 break;
2206 case 0x05: /* syscall */ 2478 case 0x05: /* syscall */
2207 if (emulate_syscall(ctxt) == -1) 2479 rc = emulate_syscall(ctxt);
2208 goto cannot_emulate; 2480 if (rc != X86EMUL_CONTINUE)
2481 goto done;
2209 else 2482 else
2210 goto writeback; 2483 goto writeback;
2211 break; 2484 break;
@@ -2276,14 +2549,16 @@ twobyte_insn:
2276 c->dst.type = OP_NONE; 2549 c->dst.type = OP_NONE;
2277 break; 2550 break;
2278 case 0x34: /* sysenter */ 2551 case 0x34: /* sysenter */
2279 if (emulate_sysenter(ctxt) == -1) 2552 rc = emulate_sysenter(ctxt);
2280 goto cannot_emulate; 2553 if (rc != X86EMUL_CONTINUE)
2554 goto done;
2281 else 2555 else
2282 goto writeback; 2556 goto writeback;
2283 break; 2557 break;
2284 case 0x35: /* sysexit */ 2558 case 0x35: /* sysexit */
2285 if (emulate_sysexit(ctxt) == -1) 2559 rc = emulate_sysexit(ctxt);
2286 goto cannot_emulate; 2560 if (rc != X86EMUL_CONTINUE)
2561 goto done;
2287 else 2562 else
2288 goto writeback; 2563 goto writeback;
2289 break; 2564 break;
@@ -2297,6 +2572,14 @@ twobyte_insn:
2297 jmp_rel(c, c->src.val); 2572 jmp_rel(c, c->src.val);
2298 c->dst.type = OP_NONE; 2573 c->dst.type = OP_NONE;
2299 break; 2574 break;
2575 case 0xa0: /* push fs */
2576 emulate_push_sreg(ctxt, VCPU_SREG_FS);
2577 break;
2578 case 0xa1: /* pop fs */
2579 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_FS);
2580 if (rc != 0)
2581 goto done;
2582 break;
2300 case 0xa3: 2583 case 0xa3:
2301 bt: /* bt */ 2584 bt: /* bt */
2302 c->dst.type = OP_NONE; 2585 c->dst.type = OP_NONE;
@@ -2308,6 +2591,14 @@ twobyte_insn:
2308 case 0xa5: /* shld cl, r, r/m */ 2591 case 0xa5: /* shld cl, r, r/m */
2309 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags); 2592 emulate_2op_cl("shld", c->src2, c->src, c->dst, ctxt->eflags);
2310 break; 2593 break;
2594 case 0xa8: /* push gs */
2595 emulate_push_sreg(ctxt, VCPU_SREG_GS);
2596 break;
2597 case 0xa9: /* pop gs */
2598 rc = emulate_pop_sreg(ctxt, ops, VCPU_SREG_GS);
2599 if (rc != 0)
2600 goto done;
2601 break;
2311 case 0xab: 2602 case 0xab:
2312 bts: /* bts */ 2603 bts: /* bts */
2313 /* only subword offset */ 2604 /* only subword offset */
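Editorial aside on the emulator hunks above: failures from read_emulated and the syscall/sysenter/sysexit helpers are now routed through X86EMUL_CONTINUE and the done label, and the privilege-sensitive paths (cli, sti, and the shared do_io label) first check the caller's privilege, injecting #GP(0) on a violation. The sketch below is a minimal user-space model of the IOPL comparison those checks reduce to; the port-I/O case additionally consults the TSS I/O-permission bitmap, which is omitted here, and the helper names are illustrative rather than the emulator's own.

/*
 * Minimal sketch of the IOPL gate: cli/sti (and, absent a permitting
 * I/O bitmap, IN/OUT) are privileged whenever CPL > IOPL.  The EFLAGS
 * layout (IOPL in bits 12-13) is architectural.
 */
#include <stdbool.h>
#include <stdio.h>

#define EFLAGS_IOPL_SHIFT 12
#define EFLAGS_IOPL_MASK  (3u << EFLAGS_IOPL_SHIFT)

static unsigned int iopl(unsigned int eflags)
{
	return (eflags & EFLAGS_IOPL_MASK) >> EFLAGS_IOPL_SHIFT;
}

/* Mirror of the "bad iopl" test: true means the access must #GP. */
static bool bad_iopl(unsigned int eflags, unsigned int cpl)
{
	return cpl > iopl(eflags);
}

int main(void)
{
	unsigned int eflags = 0;	/* IOPL == 0 */
	unsigned int guest_cpl = 3;	/* user-mode guest code */

	if (bad_iopl(eflags, guest_cpl))
		printf("cli at CPL3/IOPL0: inject #GP(0)\n");
	else
		printf("cli allowed\n");
	return 0;
}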
diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c
index 144e7f60b5e2..0150affad25d 100644
--- a/arch/x86/kvm/i8254.c
+++ b/arch/x86/kvm/i8254.c
@@ -29,7 +29,10 @@
29 * Based on QEMU and Xen. 29 * Based on QEMU and Xen.
30 */ 30 */
31 31
32#define pr_fmt(fmt) "pit: " fmt
33
32#include <linux/kvm_host.h> 34#include <linux/kvm_host.h>
35#include <linux/slab.h>
33 36
34#include "irq.h" 37#include "irq.h"
35#include "i8254.h" 38#include "i8254.h"
@@ -240,11 +243,11 @@ static void kvm_pit_ack_irq(struct kvm_irq_ack_notifier *kian)
240{ 243{
241 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state, 244 struct kvm_kpit_state *ps = container_of(kian, struct kvm_kpit_state,
242 irq_ack_notifier); 245 irq_ack_notifier);
243 spin_lock(&ps->inject_lock); 246 raw_spin_lock(&ps->inject_lock);
244 if (atomic_dec_return(&ps->pit_timer.pending) < 0) 247 if (atomic_dec_return(&ps->pit_timer.pending) < 0)
245 atomic_inc(&ps->pit_timer.pending); 248 atomic_inc(&ps->pit_timer.pending);
246 ps->irq_ack = 1; 249 ps->irq_ack = 1;
247 spin_unlock(&ps->inject_lock); 250 raw_spin_unlock(&ps->inject_lock);
248} 251}
249 252
250void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) 253void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
@@ -262,7 +265,7 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu)
262 265
263static void destroy_pit_timer(struct kvm_timer *pt) 266static void destroy_pit_timer(struct kvm_timer *pt)
264{ 267{
265 pr_debug("pit: execute del timer!\n"); 268 pr_debug("execute del timer!\n");
266 hrtimer_cancel(&pt->timer); 269 hrtimer_cancel(&pt->timer);
267} 270}
268 271
@@ -284,7 +287,7 @@ static void create_pit_timer(struct kvm_kpit_state *ps, u32 val, int is_period)
284 287
285 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ); 288 interval = muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ);
286 289
287 pr_debug("pit: create pit timer, interval is %llu nsec\n", interval); 290 pr_debug("create pit timer, interval is %llu nsec\n", interval);
288 291
289 /* TODO The new value only affected after the retriggered */ 292 /* TODO The new value only affected after the retriggered */
290 hrtimer_cancel(&pt->timer); 293 hrtimer_cancel(&pt->timer);
@@ -309,7 +312,7 @@ static void pit_load_count(struct kvm *kvm, int channel, u32 val)
309 312
310 WARN_ON(!mutex_is_locked(&ps->lock)); 313 WARN_ON(!mutex_is_locked(&ps->lock));
311 314
312 pr_debug("pit: load_count val is %d, channel is %d\n", val, channel); 315 pr_debug("load_count val is %d, channel is %d\n", val, channel);
313 316
314 /* 317 /*
315 * The largest possible initial count is 0; this is equivalent 318 * The largest possible initial count is 0; this is equivalent
@@ -395,8 +398,8 @@ static int pit_ioport_write(struct kvm_io_device *this,
395 mutex_lock(&pit_state->lock); 398 mutex_lock(&pit_state->lock);
396 399
397 if (val != 0) 400 if (val != 0)
398 pr_debug("pit: write addr is 0x%x, len is %d, val is 0x%x\n", 401 pr_debug("write addr is 0x%x, len is %d, val is 0x%x\n",
399 (unsigned int)addr, len, val); 402 (unsigned int)addr, len, val);
400 403
401 if (addr == 3) { 404 if (addr == 3) {
402 channel = val >> 6; 405 channel = val >> 6;
@@ -465,6 +468,9 @@ static int pit_ioport_read(struct kvm_io_device *this,
465 return -EOPNOTSUPP; 468 return -EOPNOTSUPP;
466 469
467 addr &= KVM_PIT_CHANNEL_MASK; 470 addr &= KVM_PIT_CHANNEL_MASK;
471 if (addr == 3)
472 return 0;
473
468 s = &pit_state->channels[addr]; 474 s = &pit_state->channels[addr];
469 475
470 mutex_lock(&pit_state->lock); 476 mutex_lock(&pit_state->lock);
@@ -600,7 +606,7 @@ static const struct kvm_io_device_ops speaker_dev_ops = {
600 .write = speaker_ioport_write, 606 .write = speaker_ioport_write,
601}; 607};
602 608
603/* Caller must have writers lock on slots_lock */ 609/* Caller must hold slots_lock */
604struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags) 610struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
605{ 611{
606 struct kvm_pit *pit; 612 struct kvm_pit *pit;
@@ -619,7 +625,7 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
619 625
620 mutex_init(&pit->pit_state.lock); 626 mutex_init(&pit->pit_state.lock);
621 mutex_lock(&pit->pit_state.lock); 627 mutex_lock(&pit->pit_state.lock);
622 spin_lock_init(&pit->pit_state.inject_lock); 628 raw_spin_lock_init(&pit->pit_state.inject_lock);
623 629
624 kvm->arch.vpit = pit; 630 kvm->arch.vpit = pit;
625 pit->kvm = kvm; 631 pit->kvm = kvm;
@@ -640,13 +646,13 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
640 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier); 646 kvm_register_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
641 647
642 kvm_iodevice_init(&pit->dev, &pit_dev_ops); 648 kvm_iodevice_init(&pit->dev, &pit_dev_ops);
643 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, &pit->dev); 649 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &pit->dev);
644 if (ret < 0) 650 if (ret < 0)
645 goto fail; 651 goto fail;
646 652
647 if (flags & KVM_PIT_SPEAKER_DUMMY) { 653 if (flags & KVM_PIT_SPEAKER_DUMMY) {
648 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops); 654 kvm_iodevice_init(&pit->speaker_dev, &speaker_dev_ops);
649 ret = __kvm_io_bus_register_dev(&kvm->pio_bus, 655 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS,
650 &pit->speaker_dev); 656 &pit->speaker_dev);
651 if (ret < 0) 657 if (ret < 0)
652 goto fail_unregister; 658 goto fail_unregister;
@@ -655,11 +661,12 @@ struct kvm_pit *kvm_create_pit(struct kvm *kvm, u32 flags)
655 return pit; 661 return pit;
656 662
657fail_unregister: 663fail_unregister:
658 __kvm_io_bus_unregister_dev(&kvm->pio_bus, &pit->dev); 664 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &pit->dev);
659 665
660fail: 666fail:
661 if (pit->irq_source_id >= 0) 667 kvm_unregister_irq_mask_notifier(kvm, 0, &pit->mask_notifier);
662 kvm_free_irq_source_id(kvm, pit->irq_source_id); 668 kvm_unregister_irq_ack_notifier(kvm, &pit_state->irq_ack_notifier);
669 kvm_free_irq_source_id(kvm, pit->irq_source_id);
663 670
664 kfree(pit); 671 kfree(pit);
665 return NULL; 672 return NULL;
@@ -688,10 +695,8 @@ static void __inject_pit_timer_intr(struct kvm *kvm)
688 struct kvm_vcpu *vcpu; 695 struct kvm_vcpu *vcpu;
689 int i; 696 int i;
690 697
691 mutex_lock(&kvm->irq_lock);
692 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1); 698 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 1);
693 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0); 699 kvm_set_irq(kvm, kvm->arch.vpit->irq_source_id, 0, 0);
694 mutex_unlock(&kvm->irq_lock);
695 700
696 /* 701 /*
697 * Provides NMI watchdog support via Virtual Wire mode. 702 * Provides NMI watchdog support via Virtual Wire mode.
@@ -720,12 +725,12 @@ void kvm_inject_pit_timer_irqs(struct kvm_vcpu *vcpu)
720 /* Try to inject pending interrupts when 725 /* Try to inject pending interrupts when
721 * last one has been acked. 726 * last one has been acked.
722 */ 727 */
723 spin_lock(&ps->inject_lock); 728 raw_spin_lock(&ps->inject_lock);
724 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) { 729 if (atomic_read(&ps->pit_timer.pending) && ps->irq_ack) {
725 ps->irq_ack = 0; 730 ps->irq_ack = 0;
726 inject = 1; 731 inject = 1;
727 } 732 }
728 spin_unlock(&ps->inject_lock); 733 raw_spin_unlock(&ps->inject_lock);
729 if (inject) 734 if (inject)
730 __inject_pit_timer_intr(kvm); 735 __inject_pit_timer_intr(kvm);
731 } 736 }
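Editorial aside on the i8254.c hunks: the injection path moves to a raw spinlock, PIO registration goes through kvm_io_bus_register_dev(), and the debug prefix becomes a pr_fmt. The timer arithmetic itself is unchanged: create_pit_timer() converts the programmed count into nanoseconds with muldiv64(val, NSEC_PER_SEC, KVM_PIT_FREQ), and pit_load_count() treats a count of 0 as the maximum. Below is a standalone sketch of that conversion, under the assumption that KVM_PIT_FREQ is the usual 1.193182 MHz i8254 input clock.

/*
 * interval_ns = count * NSEC_PER_SEC / PIT_FREQ, with count 0 meaning
 * the largest possible initial count (65536).
 */
#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_SEC 1000000000ULL
#define PIT_FREQ     1193182ULL	/* assumed i8254 input clock */

static uint64_t pit_interval_ns(uint32_t count)
{
	if (count == 0)
		count = 0x10000;	/* largest initial count */
	return (uint64_t)count * NSEC_PER_SEC / PIT_FREQ;
}

int main(void)
{
	/* Channel 0 reloaded with 11932 ticks is roughly a 10 ms period. */
	printf("count=11932 -> %llu ns\n",
	       (unsigned long long)pit_interval_ns(11932));
	printf("count=0     -> %llu ns\n",
	       (unsigned long long)pit_interval_ns(0));
	return 0;
}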
diff --git a/arch/x86/kvm/i8254.h b/arch/x86/kvm/i8254.h
index d4c1c7ffdc09..900d6b0ba7c2 100644
--- a/arch/x86/kvm/i8254.h
+++ b/arch/x86/kvm/i8254.h
@@ -27,7 +27,7 @@ struct kvm_kpit_state {
27 u32 speaker_data_on; 27 u32 speaker_data_on;
28 struct mutex lock; 28 struct mutex lock;
29 struct kvm_pit *pit; 29 struct kvm_pit *pit;
30 spinlock_t inject_lock; 30 raw_spinlock_t inject_lock;
31 unsigned long irq_ack; 31 unsigned long irq_ack;
32 struct kvm_irq_ack_notifier irq_ack_notifier; 32 struct kvm_irq_ack_notifier irq_ack_notifier;
33}; 33};
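Editorial aside on the header change: inject_lock becomes a raw_spinlock_t so the tick-injection handshake can run in contexts where a sleeping lock is not acceptable. The handshake that lock protects is small enough to model in user space; in the sketch below a pthread mutex stands in for the raw spinlock and the field names mirror kvm_kpit_state, but the code is only an illustration.

/*
 * Pending ticks are counted atomically by the timer; injection is gated
 * on the previous tick having been acked; the ack consumes one pending
 * tick and re-arms injection.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct pit_state {
	atomic_int pending;		/* timer ticks not yet injected */
	bool irq_ack;			/* last injected tick was acked */
	pthread_mutex_t inject_lock;
};

/* hrtimer side: one more tick is due. */
static void timer_fired(struct pit_state *ps)
{
	atomic_fetch_add(&ps->pending, 1);
}

/* ack notifier: consume one pending tick and re-arm injection. */
static void irq_acked(struct pit_state *ps)
{
	pthread_mutex_lock(&ps->inject_lock);
	if (atomic_fetch_sub(&ps->pending, 1) <= 0)
		atomic_fetch_add(&ps->pending, 1);	/* don't go negative */
	ps->irq_ack = true;
	pthread_mutex_unlock(&ps->inject_lock);
}

/* vcpu side: inject only when a tick is pending and the last was acked. */
static bool try_inject(struct pit_state *ps)
{
	bool inject = false;

	pthread_mutex_lock(&ps->inject_lock);
	if (atomic_load(&ps->pending) && ps->irq_ack) {
		ps->irq_ack = false;
		inject = true;
	}
	pthread_mutex_unlock(&ps->inject_lock);
	return inject;
}

int main(void)
{
	struct pit_state ps = { .irq_ack = true };

	pthread_mutex_init(&ps.inject_lock, NULL);
	timer_fired(&ps);
	printf("inject: %d\n", try_inject(&ps));	/* 1 */
	printf("inject: %d\n", try_inject(&ps));	/* 0, not acked yet */
	irq_acked(&ps);
	timer_fired(&ps);
	printf("inject: %d\n", try_inject(&ps));	/* 1 */
	return 0;
}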
diff --git a/arch/x86/kvm/i8259.c b/arch/x86/kvm/i8259.c
index 01f151682802..a790fa128a9f 100644
--- a/arch/x86/kvm/i8259.c
+++ b/arch/x86/kvm/i8259.c
@@ -26,6 +26,7 @@
26 * Port from Qemu. 26 * Port from Qemu.
27 */ 27 */
28#include <linux/mm.h> 28#include <linux/mm.h>
29#include <linux/slab.h>
29#include <linux/bitops.h> 30#include <linux/bitops.h>
30#include "irq.h" 31#include "irq.h"
31 32
@@ -38,16 +39,25 @@ static void pic_clear_isr(struct kvm_kpic_state *s, int irq)
38 s->isr_ack |= (1 << irq); 39 s->isr_ack |= (1 << irq);
39 if (s != &s->pics_state->pics[0]) 40 if (s != &s->pics_state->pics[0])
40 irq += 8; 41 irq += 8;
42 /*
 43 * We are dropping the lock while calling the ack notifiers, since ack
 44 * notifier callbacks for assigned devices call back into the PIC recursively.
 45 * Other interrupts may be delivered to the PIC while the lock is dropped,
 46 * but that should be safe since the PIC state is already updated at this stage.
47 */
48 raw_spin_unlock(&s->pics_state->lock);
41 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq); 49 kvm_notify_acked_irq(s->pics_state->kvm, SELECT_PIC(irq), irq);
50 raw_spin_lock(&s->pics_state->lock);
42} 51}
43 52
44void kvm_pic_clear_isr_ack(struct kvm *kvm) 53void kvm_pic_clear_isr_ack(struct kvm *kvm)
45{ 54{
46 struct kvm_pic *s = pic_irqchip(kvm); 55 struct kvm_pic *s = pic_irqchip(kvm);
47 spin_lock(&s->lock); 56
57 raw_spin_lock(&s->lock);
48 s->pics[0].isr_ack = 0xff; 58 s->pics[0].isr_ack = 0xff;
49 s->pics[1].isr_ack = 0xff; 59 s->pics[1].isr_ack = 0xff;
50 spin_unlock(&s->lock); 60 raw_spin_unlock(&s->lock);
51} 61}
52 62
53/* 63/*
@@ -148,9 +158,9 @@ static void pic_update_irq(struct kvm_pic *s)
148 158
149void kvm_pic_update_irq(struct kvm_pic *s) 159void kvm_pic_update_irq(struct kvm_pic *s)
150{ 160{
151 spin_lock(&s->lock); 161 raw_spin_lock(&s->lock);
152 pic_update_irq(s); 162 pic_update_irq(s);
153 spin_unlock(&s->lock); 163 raw_spin_unlock(&s->lock);
154} 164}
155 165
156int kvm_pic_set_irq(void *opaque, int irq, int level) 166int kvm_pic_set_irq(void *opaque, int irq, int level)
@@ -158,14 +168,14 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
158 struct kvm_pic *s = opaque; 168 struct kvm_pic *s = opaque;
159 int ret = -1; 169 int ret = -1;
160 170
161 spin_lock(&s->lock); 171 raw_spin_lock(&s->lock);
162 if (irq >= 0 && irq < PIC_NUM_PINS) { 172 if (irq >= 0 && irq < PIC_NUM_PINS) {
163 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level); 173 ret = pic_set_irq1(&s->pics[irq >> 3], irq & 7, level);
164 pic_update_irq(s); 174 pic_update_irq(s);
165 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr, 175 trace_kvm_pic_set_irq(irq >> 3, irq & 7, s->pics[irq >> 3].elcr,
166 s->pics[irq >> 3].imr, ret == 0); 176 s->pics[irq >> 3].imr, ret == 0);
167 } 177 }
168 spin_unlock(&s->lock); 178 raw_spin_unlock(&s->lock);
169 179
170 return ret; 180 return ret;
171} 181}
@@ -176,16 +186,18 @@ int kvm_pic_set_irq(void *opaque, int irq, int level)
176static inline void pic_intack(struct kvm_kpic_state *s, int irq) 186static inline void pic_intack(struct kvm_kpic_state *s, int irq)
177{ 187{
178 s->isr |= 1 << irq; 188 s->isr |= 1 << irq;
179 if (s->auto_eoi) {
180 if (s->rotate_on_auto_eoi)
181 s->priority_add = (irq + 1) & 7;
182 pic_clear_isr(s, irq);
183 }
184 /* 189 /*
185 * We don't clear a level sensitive interrupt here 190 * We don't clear a level sensitive interrupt here
186 */ 191 */
187 if (!(s->elcr & (1 << irq))) 192 if (!(s->elcr & (1 << irq)))
188 s->irr &= ~(1 << irq); 193 s->irr &= ~(1 << irq);
194
195 if (s->auto_eoi) {
196 if (s->rotate_on_auto_eoi)
197 s->priority_add = (irq + 1) & 7;
198 pic_clear_isr(s, irq);
199 }
200
189} 201}
190 202
191int kvm_pic_read_irq(struct kvm *kvm) 203int kvm_pic_read_irq(struct kvm *kvm)
@@ -193,7 +205,7 @@ int kvm_pic_read_irq(struct kvm *kvm)
193 int irq, irq2, intno; 205 int irq, irq2, intno;
194 struct kvm_pic *s = pic_irqchip(kvm); 206 struct kvm_pic *s = pic_irqchip(kvm);
195 207
196 spin_lock(&s->lock); 208 raw_spin_lock(&s->lock);
197 irq = pic_get_irq(&s->pics[0]); 209 irq = pic_get_irq(&s->pics[0]);
198 if (irq >= 0) { 210 if (irq >= 0) {
199 pic_intack(&s->pics[0], irq); 211 pic_intack(&s->pics[0], irq);
@@ -218,29 +230,18 @@ int kvm_pic_read_irq(struct kvm *kvm)
218 intno = s->pics[0].irq_base + irq; 230 intno = s->pics[0].irq_base + irq;
219 } 231 }
220 pic_update_irq(s); 232 pic_update_irq(s);
221 spin_unlock(&s->lock); 233 raw_spin_unlock(&s->lock);
222 234
223 return intno; 235 return intno;
224} 236}
225 237
226void kvm_pic_reset(struct kvm_kpic_state *s) 238void kvm_pic_reset(struct kvm_kpic_state *s)
227{ 239{
228 int irq, irqbase, n; 240 int irq;
229 struct kvm *kvm = s->pics_state->irq_request_opaque; 241 struct kvm *kvm = s->pics_state->irq_request_opaque;
230 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu; 242 struct kvm_vcpu *vcpu0 = kvm->bsp_vcpu;
243 u8 irr = s->irr, isr = s->imr;
231 244
232 if (s == &s->pics_state->pics[0])
233 irqbase = 0;
234 else
235 irqbase = 8;
236
237 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
238 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
239 if (s->irr & (1 << irq) || s->isr & (1 << irq)) {
240 n = irq + irqbase;
241 kvm_notify_acked_irq(kvm, SELECT_PIC(n), n);
242 }
243 }
244 s->last_irr = 0; 245 s->last_irr = 0;
245 s->irr = 0; 246 s->irr = 0;
246 s->imr = 0; 247 s->imr = 0;
@@ -256,6 +257,13 @@ void kvm_pic_reset(struct kvm_kpic_state *s)
256 s->rotate_on_auto_eoi = 0; 257 s->rotate_on_auto_eoi = 0;
257 s->special_fully_nested_mode = 0; 258 s->special_fully_nested_mode = 0;
258 s->init4 = 0; 259 s->init4 = 0;
260
261 for (irq = 0; irq < PIC_NUM_PINS/2; irq++) {
262 if (vcpu0 && kvm_apic_accept_pic_intr(vcpu0))
263 if (irr & (1 << irq) || isr & (1 << irq)) {
264 pic_clear_isr(s, irq);
265 }
266 }
259} 267}
260 268
261static void pic_ioport_write(void *opaque, u32 addr, u32 val) 269static void pic_ioport_write(void *opaque, u32 addr, u32 val)
@@ -298,9 +306,9 @@ static void pic_ioport_write(void *opaque, u32 addr, u32 val)
298 priority = get_priority(s, s->isr); 306 priority = get_priority(s, s->isr);
299 if (priority != 8) { 307 if (priority != 8) {
300 irq = (priority + s->priority_add) & 7; 308 irq = (priority + s->priority_add) & 7;
301 pic_clear_isr(s, irq);
302 if (cmd == 5) 309 if (cmd == 5)
303 s->priority_add = (irq + 1) & 7; 310 s->priority_add = (irq + 1) & 7;
311 pic_clear_isr(s, irq);
304 pic_update_irq(s->pics_state); 312 pic_update_irq(s->pics_state);
305 } 313 }
306 break; 314 break;
@@ -436,7 +444,7 @@ static int picdev_write(struct kvm_io_device *this,
436 printk(KERN_ERR "PIC: non byte write\n"); 444 printk(KERN_ERR "PIC: non byte write\n");
437 return 0; 445 return 0;
438 } 446 }
439 spin_lock(&s->lock); 447 raw_spin_lock(&s->lock);
440 switch (addr) { 448 switch (addr) {
441 case 0x20: 449 case 0x20:
442 case 0x21: 450 case 0x21:
@@ -449,7 +457,7 @@ static int picdev_write(struct kvm_io_device *this,
449 elcr_ioport_write(&s->pics[addr & 1], addr, data); 457 elcr_ioport_write(&s->pics[addr & 1], addr, data);
450 break; 458 break;
451 } 459 }
452 spin_unlock(&s->lock); 460 raw_spin_unlock(&s->lock);
453 return 0; 461 return 0;
454} 462}
455 463
@@ -466,7 +474,7 @@ static int picdev_read(struct kvm_io_device *this,
466 printk(KERN_ERR "PIC: non byte read\n"); 474 printk(KERN_ERR "PIC: non byte read\n");
467 return 0; 475 return 0;
468 } 476 }
469 spin_lock(&s->lock); 477 raw_spin_lock(&s->lock);
470 switch (addr) { 478 switch (addr) {
471 case 0x20: 479 case 0x20:
472 case 0x21: 480 case 0x21:
@@ -480,7 +488,7 @@ static int picdev_read(struct kvm_io_device *this,
480 break; 488 break;
481 } 489 }
482 *(unsigned char *)val = data; 490 *(unsigned char *)val = data;
483 spin_unlock(&s->lock); 491 raw_spin_unlock(&s->lock);
484 return 0; 492 return 0;
485} 493}
486 494
@@ -514,7 +522,7 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
514 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL); 522 s = kzalloc(sizeof(struct kvm_pic), GFP_KERNEL);
515 if (!s) 523 if (!s)
516 return NULL; 524 return NULL;
517 spin_lock_init(&s->lock); 525 raw_spin_lock_init(&s->lock);
518 s->kvm = kvm; 526 s->kvm = kvm;
519 s->pics[0].elcr_mask = 0xf8; 527 s->pics[0].elcr_mask = 0xf8;
520 s->pics[1].elcr_mask = 0xde; 528 s->pics[1].elcr_mask = 0xde;
@@ -527,7 +535,9 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
527 * Initialize PIO device 535 * Initialize PIO device
528 */ 536 */
529 kvm_iodevice_init(&s->dev, &picdev_ops); 537 kvm_iodevice_init(&s->dev, &picdev_ops);
530 ret = kvm_io_bus_register_dev(kvm, &kvm->pio_bus, &s->dev); 538 mutex_lock(&kvm->slots_lock);
539 ret = kvm_io_bus_register_dev(kvm, KVM_PIO_BUS, &s->dev);
540 mutex_unlock(&kvm->slots_lock);
531 if (ret < 0) { 541 if (ret < 0) {
532 kfree(s); 542 kfree(s);
533 return NULL; 543 return NULL;
@@ -535,3 +545,14 @@ struct kvm_pic *kvm_create_pic(struct kvm *kvm)
535 545
536 return s; 546 return s;
537} 547}
548
549void kvm_destroy_pic(struct kvm *kvm)
550{
551 struct kvm_pic *vpic = kvm->arch.vpic;
552
553 if (vpic) {
554 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS, &vpic->dev);
555 kvm->arch.vpic = NULL;
556 kfree(vpic);
557 }
558}
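Editorial aside on the i8259.c hunks: the PIC lock becomes raw and is dropped around kvm_notify_acked_irq(), pic_intack() now clears the edge-triggered IRR bit before the auto-EOI handling, and a rotating EOI updates priority_add before clearing the ISR bit. The rotation logic itself is compact enough to show standalone; the sketch below resolves the next IRQ from IRR/IMR/ISR with a rotation offset and ignores special mask mode and the master/slave cascade, so treat it as an illustration rather than the PIC model itself.

/*
 * priority_add rotates which pin is considered highest priority; an IRQ
 * is delivered only if it outranks everything currently in service.
 */
#include <stdint.h>
#include <stdio.h>

struct pic {
	uint8_t irr;		/* interrupt request register */
	uint8_t imr;		/* interrupt mask register    */
	uint8_t isr;		/* in-service register        */
	int priority_add;	/* rotation offset            */
};

/* Return 0..7 for the highest-priority bit set in mask, 8 if none. */
static int get_priority(const struct pic *s, uint8_t mask)
{
	int p = 0;

	if (!mask)
		return 8;
	while (!(mask & (1 << ((p + s->priority_add) & 7))))
		p++;
	return p;
}

/* Pending IRQ with higher priority than anything in service, or -1. */
static int pic_get_irq(const struct pic *s)
{
	int p = get_priority(s, s->irr & ~s->imr);

	if (p == 8)
		return -1;
	if (get_priority(s, s->isr) <= p)
		return -1;
	return (p + s->priority_add) & 7;
}

int main(void)
{
	struct pic s = { .irr = 0x28, .imr = 0x00, .isr = 0x00,
			 .priority_add = 0 };

	printf("next irq: %d\n", pic_get_irq(&s));	/* IRQ3 beats IRQ5 */
	s.priority_add = 4;	/* rotate: priority order now starts at IRQ4 */
	printf("next irq: %d\n", pic_get_irq(&s));	/* now IRQ5 wins */
	return 0;
}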
diff --git a/arch/x86/kvm/irq.h b/arch/x86/kvm/irq.h
index 7d6058a2fd38..34b15915754d 100644
--- a/arch/x86/kvm/irq.h
+++ b/arch/x86/kvm/irq.h
@@ -62,7 +62,7 @@ struct kvm_kpic_state {
62}; 62};
63 63
64struct kvm_pic { 64struct kvm_pic {
65 spinlock_t lock; 65 raw_spinlock_t lock;
66 unsigned pending_acks; 66 unsigned pending_acks;
67 struct kvm *kvm; 67 struct kvm *kvm;
68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */ 68 struct kvm_kpic_state pics[2]; /* 0 is master pic, 1 is slave pic */
@@ -71,9 +71,11 @@ struct kvm_pic {
71 int output; /* intr from master PIC */ 71 int output; /* intr from master PIC */
72 struct kvm_io_device dev; 72 struct kvm_io_device dev;
73 void (*ack_notifier)(void *opaque, int irq); 73 void (*ack_notifier)(void *opaque, int irq);
74 unsigned long irq_states[16];
74}; 75};
75 76
76struct kvm_pic *kvm_create_pic(struct kvm *kvm); 77struct kvm_pic *kvm_create_pic(struct kvm *kvm);
78void kvm_destroy_pic(struct kvm *kvm);
77int kvm_pic_read_irq(struct kvm *kvm); 79int kvm_pic_read_irq(struct kvm *kvm);
78void kvm_pic_update_irq(struct kvm_pic *s); 80void kvm_pic_update_irq(struct kvm_pic *s);
79void kvm_pic_clear_isr_ack(struct kvm *kvm); 81void kvm_pic_clear_isr_ack(struct kvm *kvm);
@@ -85,7 +87,11 @@ static inline struct kvm_pic *pic_irqchip(struct kvm *kvm)
85 87
86static inline int irqchip_in_kernel(struct kvm *kvm) 88static inline int irqchip_in_kernel(struct kvm *kvm)
87{ 89{
88 return pic_irqchip(kvm) != NULL; 90 int ret;
91
92 ret = (pic_irqchip(kvm) != NULL);
93 smp_rmb();
94 return ret;
89} 95}
90 96
91void kvm_pic_reset(struct kvm_kpic_state *s); 97void kvm_pic_reset(struct kvm_kpic_state *s);
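Editorial aside on the irq.h change: irqchip_in_kernel() now issues smp_rmb() between loading the vpic pointer and letting callers use the state it guards, presumably pairing with a write-side barrier on the creation path (not shown in this hunk), so a non-NULL pointer implies a fully initialized PIC. A C11 user-space sketch of that publication pattern, with acquire/release standing in for the kernel barriers:

/*
 * Publish-then-read: the writer initializes the object before the
 * release store of the pointer; the acquire load guarantees the reader
 * sees the initialized contents whenever it sees a non-NULL pointer.
 */
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

struct pic {
	int initialized;
};

static _Atomic(struct pic *) vpic;	/* stands in for kvm->arch.vpic */

static void create_pic(void)
{
	struct pic *s = malloc(sizeof(*s));

	if (!s)
		return;
	s->initialized = 1;
	/* Release: everything written above is visible before the pointer. */
	atomic_store_explicit(&vpic, s, memory_order_release);
}

static int irqchip_in_kernel(void)
{
	/* Acquire: pairs with the release store above (the rmb side). */
	struct pic *s = atomic_load_explicit(&vpic, memory_order_acquire);

	return s != NULL;
}

int main(void)
{
	printf("irqchip: %d\n", irqchip_in_kernel());	/* 0 */
	create_pic();
	printf("irqchip: %d\n", irqchip_in_kernel());	/* 1 */
	return 0;
}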
diff --git a/arch/x86/kvm/kvm_cache_regs.h b/arch/x86/kvm/kvm_cache_regs.h
index 7bcc5b6a4403..cff851cf5322 100644
--- a/arch/x86/kvm/kvm_cache_regs.h
+++ b/arch/x86/kvm/kvm_cache_regs.h
@@ -1,6 +1,11 @@
1#ifndef ASM_KVM_CACHE_REGS_H 1#ifndef ASM_KVM_CACHE_REGS_H
2#define ASM_KVM_CACHE_REGS_H 2#define ASM_KVM_CACHE_REGS_H
3 3
4#define KVM_POSSIBLE_CR0_GUEST_BITS X86_CR0_TS
5#define KVM_POSSIBLE_CR4_GUEST_BITS \
6 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
7 | X86_CR4_OSXMMEXCPT | X86_CR4_PGE)
8
4static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu, 9static inline unsigned long kvm_register_read(struct kvm_vcpu *vcpu,
5 enum kvm_reg reg) 10 enum kvm_reg reg)
6{ 11{
@@ -38,4 +43,30 @@ static inline u64 kvm_pdptr_read(struct kvm_vcpu *vcpu, int index)
38 return vcpu->arch.pdptrs[index]; 43 return vcpu->arch.pdptrs[index];
39} 44}
40 45
46static inline ulong kvm_read_cr0_bits(struct kvm_vcpu *vcpu, ulong mask)
47{
48 ulong tmask = mask & KVM_POSSIBLE_CR0_GUEST_BITS;
49 if (tmask & vcpu->arch.cr0_guest_owned_bits)
50 kvm_x86_ops->decache_cr0_guest_bits(vcpu);
51 return vcpu->arch.cr0 & mask;
52}
53
54static inline ulong kvm_read_cr0(struct kvm_vcpu *vcpu)
55{
56 return kvm_read_cr0_bits(vcpu, ~0UL);
57}
58
59static inline ulong kvm_read_cr4_bits(struct kvm_vcpu *vcpu, ulong mask)
60{
61 ulong tmask = mask & KVM_POSSIBLE_CR4_GUEST_BITS;
62 if (tmask & vcpu->arch.cr4_guest_owned_bits)
63 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
64 return vcpu->arch.cr4 & mask;
65}
66
67static inline ulong kvm_read_cr4(struct kvm_vcpu *vcpu)
68{
69 return kvm_read_cr4_bits(vcpu, ~0UL);
70}
71
41#endif 72#endif
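Editorial aside on the kvm_cache_regs.h additions: kvm_read_cr0_bits() and kvm_read_cr4_bits() only fall back to the expensive decache hook when the requested bits intersect the bits the guest is allowed to own (KVM_POSSIBLE_CR0_GUEST_BITS is just X86_CR0_TS). A user-space sketch of that lazy refresh follows; the decache callback and field names are stand-ins for the kvm_x86_ops hook, not the real structures.

/*
 * Bits the guest may own can be stale in the cache; refresh them from
 * "hardware" only when a caller actually asks for one of those bits.
 */
#include <stdio.h>

#define CR0_TS (1ul << 3)
#define CR0_WP (1ul << 16)

struct vcpu {
	unsigned long cr0;			/* cached value       */
	unsigned long cr0_guest_owned_bits;	/* possibly stale bits */
	unsigned long hw_cr0;			/* "hardware" copy    */
};

static void decache_cr0_guest_bits(struct vcpu *v)
{
	v->cr0 &= ~v->cr0_guest_owned_bits;
	v->cr0 |= v->hw_cr0 & v->cr0_guest_owned_bits;
}

static unsigned long read_cr0_bits(struct vcpu *v, unsigned long mask)
{
	/* Only TS can be guest-owned, mirroring KVM_POSSIBLE_CR0_GUEST_BITS. */
	unsigned long tmask = mask & CR0_TS;

	if (tmask & v->cr0_guest_owned_bits)
		decache_cr0_guest_bits(v);	/* expensive: touches hardware */
	return v->cr0 & mask;
}

int main(void)
{
	struct vcpu v = { .cr0 = CR0_WP, .cr0_guest_owned_bits = CR0_TS,
			  .hw_cr0 = CR0_WP | CR0_TS };

	/* WP is never guest-owned: served from the cache, no decache. */
	printf("WP set: %d\n", read_cr0_bits(&v, CR0_WP) != 0);
	/* TS is guest-owned here: forces a decache first. */
	printf("TS set: %d\n", read_cr0_bits(&v, CR0_TS) != 0);
	return 0;
}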
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 23c217692ea9..1eb7a4ae0c9c 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -26,13 +26,13 @@
26#include <linux/io.h> 26#include <linux/io.h>
27#include <linux/module.h> 27#include <linux/module.h>
28#include <linux/math64.h> 28#include <linux/math64.h>
29#include <linux/slab.h>
29#include <asm/processor.h> 30#include <asm/processor.h>
30#include <asm/msr.h> 31#include <asm/msr.h>
31#include <asm/page.h> 32#include <asm/page.h>
32#include <asm/current.h> 33#include <asm/current.h>
33#include <asm/apicdef.h> 34#include <asm/apicdef.h>
34#include <asm/atomic.h> 35#include <asm/atomic.h>
35#include <asm/apicdef.h>
36#include "kvm_cache_regs.h" 36#include "kvm_cache_regs.h"
37#include "irq.h" 37#include "irq.h"
38#include "trace.h" 38#include "trace.h"
@@ -374,6 +374,12 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
374 if (unlikely(!apic_enabled(apic))) 374 if (unlikely(!apic_enabled(apic)))
375 break; 375 break;
376 376
377 if (trig_mode) {
378 apic_debug("level trig mode for vector %d", vector);
379 apic_set_vector(vector, apic->regs + APIC_TMR);
380 } else
381 apic_clear_vector(vector, apic->regs + APIC_TMR);
382
377 result = !apic_test_and_set_irr(vector, apic); 383 result = !apic_test_and_set_irr(vector, apic);
378 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode, 384 trace_kvm_apic_accept_irq(vcpu->vcpu_id, delivery_mode,
379 trig_mode, vector, !result); 385 trig_mode, vector, !result);
@@ -384,11 +390,6 @@ static int __apic_accept_irq(struct kvm_lapic *apic, int delivery_mode,
384 break; 390 break;
385 } 391 }
386 392
387 if (trig_mode) {
388 apic_debug("level trig mode for vector %d", vector);
389 apic_set_vector(vector, apic->regs + APIC_TMR);
390 } else
391 apic_clear_vector(vector, apic->regs + APIC_TMR);
392 kvm_vcpu_kick(vcpu); 393 kvm_vcpu_kick(vcpu);
393 break; 394 break;
394 395
@@ -471,11 +472,8 @@ static void apic_set_eoi(struct kvm_lapic *apic)
471 trigger_mode = IOAPIC_LEVEL_TRIG; 472 trigger_mode = IOAPIC_LEVEL_TRIG;
472 else 473 else
473 trigger_mode = IOAPIC_EDGE_TRIG; 474 trigger_mode = IOAPIC_EDGE_TRIG;
474 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI)) { 475 if (!(apic_get_reg(apic, APIC_SPIV) & APIC_SPIV_DIRECTED_EOI))
475 mutex_lock(&apic->vcpu->kvm->irq_lock);
476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode); 476 kvm_ioapic_update_eoi(apic->vcpu->kvm, vector, trigger_mode);
477 mutex_unlock(&apic->vcpu->kvm->irq_lock);
478 }
479} 477}
480 478
481static void apic_send_ipi(struct kvm_lapic *apic) 479static void apic_send_ipi(struct kvm_lapic *apic)
@@ -504,9 +502,7 @@ static void apic_send_ipi(struct kvm_lapic *apic)
504 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode, 502 irq.trig_mode, irq.level, irq.dest_mode, irq.delivery_mode,
505 irq.vector); 503 irq.vector);
506 504
507 mutex_lock(&apic->vcpu->kvm->irq_lock);
508 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq); 505 kvm_irq_delivery_to_apic(apic->vcpu->kvm, apic, &irq);
509 mutex_unlock(&apic->vcpu->kvm->irq_lock);
510} 506}
511 507
512static u32 apic_get_tmcct(struct kvm_lapic *apic) 508static u32 apic_get_tmcct(struct kvm_lapic *apic)
@@ -1156,6 +1152,7 @@ void kvm_apic_post_state_restore(struct kvm_vcpu *vcpu)
1156 hrtimer_cancel(&apic->lapic_timer.timer); 1152 hrtimer_cancel(&apic->lapic_timer.timer);
1157 update_divide_count(apic); 1153 update_divide_count(apic);
1158 start_apic_timer(apic); 1154 start_apic_timer(apic);
1155 apic->irr_pending = true;
1159} 1156}
1160 1157
1161void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu) 1158void __kvm_migrate_apic_timer(struct kvm_vcpu *vcpu)
@@ -1250,3 +1247,34 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data)
1250 1247
1251 return 0; 1248 return 0;
1252} 1249}
1250
1251int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 reg, u64 data)
1252{
1253 struct kvm_lapic *apic = vcpu->arch.apic;
1254
1255 if (!irqchip_in_kernel(vcpu->kvm))
1256 return 1;
1257
1258 /* if this is ICR write vector before command */
1259 if (reg == APIC_ICR)
1260 apic_reg_write(apic, APIC_ICR2, (u32)(data >> 32));
1261 return apic_reg_write(apic, reg, (u32)data);
1262}
1263
1264int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data)
1265{
1266 struct kvm_lapic *apic = vcpu->arch.apic;
1267 u32 low, high = 0;
1268
1269 if (!irqchip_in_kernel(vcpu->kvm))
1270 return 1;
1271
1272 if (apic_reg_read(apic, reg, 4, &low))
1273 return 1;
1274 if (reg == APIC_ICR)
1275 apic_reg_read(apic, APIC_ICR2, 4, &high);
1276
1277 *data = (((u64)high) << 32) | low;
1278
1279 return 0;
1280}
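Editorial aside on the lapic.c hunks: besides moving the TMR update ahead of the IRR test and dropping irq_lock around EOI and IPI delivery, the patch adds kvm_hv_vapic_msr_read()/write(), which present the 32-bit ICR/ICR2 pair as one 64-bit MSR: the high half is written to ICR2 before the low half triggers the command, and a read recombines the two. A standalone sketch of that packing, where the register array is only a stand-in for the APIC page:

/*
 * 64-bit MSR view of ICR: high 32 bits map to ICR2 (destination),
 * low 32 bits to ICR (command word).  Writing ICR2 first means the
 * destination is in place before the command takes effect.
 */
#include <stdint.h>
#include <stdio.h>

enum { ICR, ICR2, NR_REGS };

static uint32_t apic_regs[NR_REGS];

static void hv_icr_write(uint64_t data)
{
	apic_regs[ICR2] = (uint32_t)(data >> 32);	/* destination first   */
	apic_regs[ICR]  = (uint32_t)data;		/* command triggers IPI */
}

static uint64_t hv_icr_read(void)
{
	return ((uint64_t)apic_regs[ICR2] << 32) | apic_regs[ICR];
}

int main(void)
{
	hv_icr_write(0x0000000100000032ULL);	/* ICR2 = 0x1, ICR = 0x32 */
	printf("ICR=%#x ICR2=%#x back=%#llx\n",
	       apic_regs[ICR], apic_regs[ICR2],
	       (unsigned long long)hv_icr_read());
	return 0;
}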
diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h
index 40010b09c4aa..f5fe32c5edad 100644
--- a/arch/x86/kvm/lapic.h
+++ b/arch/x86/kvm/lapic.h
@@ -48,4 +48,12 @@ void kvm_lapic_sync_to_vapic(struct kvm_vcpu *vcpu);
48 48
49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); 49int kvm_x2apic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); 50int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
51
52int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data);
53int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data);
54
55static inline bool kvm_hv_vapic_assist_page_enabled(struct kvm_vcpu *vcpu)
56{
57 return vcpu->arch.hv_vapic & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE;
58}
51#endif 59#endif
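Editorial aside on the lapic.h additions: kvm_hv_vapic_assist_page_enabled() only tests the enable flag of the APIC-assist MSR. The sketch below also pulls out the page address, assuming the usual Hyper-V layout (enable flag in bit 0, page frame number from bit 12 up); verify that against the hyperv.h definitions before relying on it.

/*
 * Decode the assist-page MSR into its enable flag and the guest
 * physical address of the assist page (layout assumed, see above).
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define ASSIST_PAGE_ENABLE	(1ULL << 0)
#define ASSIST_PAGE_ADDR_SHIFT	12

static bool assist_page_enabled(uint64_t msr)
{
	return msr & ASSIST_PAGE_ENABLE;
}

static uint64_t assist_page_gpa(uint64_t msr)
{
	return (msr >> ASSIST_PAGE_ADDR_SHIFT) << ASSIST_PAGE_ADDR_SHIFT;
}

int main(void)
{
	uint64_t msr = 0x12345000ULL | ASSIST_PAGE_ENABLE;

	printf("enabled=%d gpa=%#llx\n", assist_page_enabled(msr),
	       (unsigned long long)assist_page_gpa(msr));
	return 0;
}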
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 818b92ad82cf..19a8906bcaa2 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -18,6 +18,7 @@
18 */ 18 */
19 19
20#include "mmu.h" 20#include "mmu.h"
21#include "x86.h"
21#include "kvm_cache_regs.h" 22#include "kvm_cache_regs.h"
22 23
23#include <linux/kvm_host.h> 24#include <linux/kvm_host.h>
@@ -29,6 +30,8 @@
29#include <linux/swap.h> 30#include <linux/swap.h>
30#include <linux/hugetlb.h> 31#include <linux/hugetlb.h>
31#include <linux/compiler.h> 32#include <linux/compiler.h>
33#include <linux/srcu.h>
34#include <linux/slab.h>
32 35
33#include <asm/page.h> 36#include <asm/page.h>
34#include <asm/cmpxchg.h> 37#include <asm/cmpxchg.h>
@@ -136,16 +139,6 @@ module_param(oos_shadow, bool, 0644);
136#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \ 139#define PT64_PERM_MASK (PT_PRESENT_MASK | PT_WRITABLE_MASK | PT_USER_MASK \
137 | PT64_NX_MASK) 140 | PT64_NX_MASK)
138 141
139#define PFERR_PRESENT_MASK (1U << 0)
140#define PFERR_WRITE_MASK (1U << 1)
141#define PFERR_USER_MASK (1U << 2)
142#define PFERR_RSVD_MASK (1U << 3)
143#define PFERR_FETCH_MASK (1U << 4)
144
145#define PT_PDPE_LEVEL 3
146#define PT_DIRECTORY_LEVEL 2
147#define PT_PAGE_TABLE_LEVEL 1
148
149#define RMAP_EXT 4 142#define RMAP_EXT 4
150 143
151#define ACC_EXEC_MASK 1 144#define ACC_EXEC_MASK 1
@@ -153,6 +146,9 @@ module_param(oos_shadow, bool, 0644);
153#define ACC_USER_MASK PT_USER_MASK 146#define ACC_USER_MASK PT_USER_MASK
154#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK) 147#define ACC_ALL (ACC_EXEC_MASK | ACC_WRITE_MASK | ACC_USER_MASK)
155 148
149#include <trace/events/kvm.h>
150
151#undef TRACE_INCLUDE_FILE
156#define CREATE_TRACE_POINTS 152#define CREATE_TRACE_POINTS
157#include "mmutrace.h" 153#include "mmutrace.h"
158 154
@@ -229,7 +225,7 @@ EXPORT_SYMBOL_GPL(kvm_mmu_set_mask_ptes);
229 225
230static int is_write_protection(struct kvm_vcpu *vcpu) 226static int is_write_protection(struct kvm_vcpu *vcpu)
231{ 227{
232 return vcpu->arch.cr0 & X86_CR0_WP; 228 return kvm_read_cr0_bits(vcpu, X86_CR0_WP);
233} 229}
234 230
235static int is_cpuid_PSE36(void) 231static int is_cpuid_PSE36(void)
@@ -239,7 +235,7 @@ static int is_cpuid_PSE36(void)
239 235
240static int is_nx(struct kvm_vcpu *vcpu) 236static int is_nx(struct kvm_vcpu *vcpu)
241{ 237{
242 return vcpu->arch.shadow_efer & EFER_NX; 238 return vcpu->arch.efer & EFER_NX;
243} 239}
244 240
245static int is_shadow_present_pte(u64 pte) 241static int is_shadow_present_pte(u64 pte)
@@ -253,7 +249,7 @@ static int is_large_pte(u64 pte)
253 return pte & PT_PAGE_SIZE_MASK; 249 return pte & PT_PAGE_SIZE_MASK;
254} 250}
255 251
256static int is_writeble_pte(unsigned long pte) 252static int is_writable_pte(unsigned long pte)
257{ 253{
258 return pte & PT_WRITABLE_MASK; 254 return pte & PT_WRITABLE_MASK;
259} 255}
@@ -470,24 +466,10 @@ static int has_wrprotected_page(struct kvm *kvm,
470 466
471static int host_mapping_level(struct kvm *kvm, gfn_t gfn) 467static int host_mapping_level(struct kvm *kvm, gfn_t gfn)
472{ 468{
473 unsigned long page_size = PAGE_SIZE; 469 unsigned long page_size;
474 struct vm_area_struct *vma;
475 unsigned long addr;
476 int i, ret = 0; 470 int i, ret = 0;
477 471
478 addr = gfn_to_hva(kvm, gfn); 472 page_size = kvm_host_page_size(kvm, gfn);
479 if (kvm_is_error_hva(addr))
480 return page_size;
481
482 down_read(&current->mm->mmap_sem);
483 vma = find_vma(current->mm, addr);
484 if (!vma)
485 goto out;
486
487 page_size = vma_kernel_pagesize(vma);
488
489out:
490 up_read(&current->mm->mmap_sem);
491 473
492 for (i = PT_PAGE_TABLE_LEVEL; 474 for (i = PT_PAGE_TABLE_LEVEL;
493 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) { 475 i < (PT_PAGE_TABLE_LEVEL + KVM_NR_PAGE_SIZES); ++i) {
@@ -503,8 +485,7 @@ out:
503static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn) 485static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
504{ 486{
505 struct kvm_memory_slot *slot; 487 struct kvm_memory_slot *slot;
506 int host_level; 488 int host_level, level, max_level;
507 int level = PT_PAGE_TABLE_LEVEL;
508 489
509 slot = gfn_to_memslot(vcpu->kvm, large_gfn); 490 slot = gfn_to_memslot(vcpu->kvm, large_gfn);
510 if (slot && slot->dirty_bitmap) 491 if (slot && slot->dirty_bitmap)
@@ -515,11 +496,12 @@ static int mapping_level(struct kvm_vcpu *vcpu, gfn_t large_gfn)
515 if (host_level == PT_PAGE_TABLE_LEVEL) 496 if (host_level == PT_PAGE_TABLE_LEVEL)
516 return host_level; 497 return host_level;
517 498
518 for (level = PT_DIRECTORY_LEVEL; level <= host_level; ++level) { 499 max_level = kvm_x86_ops->get_lpage_level() < host_level ?
500 kvm_x86_ops->get_lpage_level() : host_level;
519 501
502 for (level = PT_DIRECTORY_LEVEL; level <= max_level; ++level)
520 if (has_wrprotected_page(vcpu->kvm, large_gfn, level)) 503 if (has_wrprotected_page(vcpu->kvm, large_gfn, level))
521 break; 504 break;
522 }
523 505
524 return level - 1; 506 return level - 1;
525} 507}
@@ -635,7 +617,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
635 pfn = spte_to_pfn(*spte); 617 pfn = spte_to_pfn(*spte);
636 if (*spte & shadow_accessed_mask) 618 if (*spte & shadow_accessed_mask)
637 kvm_set_pfn_accessed(pfn); 619 kvm_set_pfn_accessed(pfn);
638 if (is_writeble_pte(*spte)) 620 if (is_writable_pte(*spte))
639 kvm_set_pfn_dirty(pfn); 621 kvm_set_pfn_dirty(pfn);
640 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level); 622 rmapp = gfn_to_rmap(kvm, sp->gfns[spte - sp->spt], sp->role.level);
641 if (!*rmapp) { 623 if (!*rmapp) {
@@ -664,6 +646,7 @@ static void rmap_remove(struct kvm *kvm, u64 *spte)
664 prev_desc = desc; 646 prev_desc = desc;
665 desc = desc->more; 647 desc = desc->more;
666 } 648 }
649 pr_err("rmap_remove: %p %llx many->many\n", spte, *spte);
667 BUG(); 650 BUG();
668 } 651 }
669} 652}
@@ -710,7 +693,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
710 BUG_ON(!spte); 693 BUG_ON(!spte);
711 BUG_ON(!(*spte & PT_PRESENT_MASK)); 694 BUG_ON(!(*spte & PT_PRESENT_MASK));
712 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte); 695 rmap_printk("rmap_write_protect: spte %p %llx\n", spte, *spte);
713 if (is_writeble_pte(*spte)) { 696 if (is_writable_pte(*spte)) {
714 __set_spte(spte, *spte & ~PT_WRITABLE_MASK); 697 __set_spte(spte, *spte & ~PT_WRITABLE_MASK);
715 write_protected = 1; 698 write_protected = 1;
716 } 699 }
@@ -734,7 +717,7 @@ static int rmap_write_protect(struct kvm *kvm, u64 gfn)
734 BUG_ON(!(*spte & PT_PRESENT_MASK)); 717 BUG_ON(!(*spte & PT_PRESENT_MASK));
735 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)); 718 BUG_ON((*spte & (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK)) != (PT_PAGE_SIZE_MASK|PT_PRESENT_MASK));
736 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn); 719 pgprintk("rmap_write_protect(large): spte %p %llx %lld\n", spte, *spte, gfn);
737 if (is_writeble_pte(*spte)) { 720 if (is_writable_pte(*spte)) {
738 rmap_remove(kvm, spte); 721 rmap_remove(kvm, spte);
739 --kvm->stat.lpages; 722 --kvm->stat.lpages;
740 __set_spte(spte, shadow_trap_nonpresent_pte); 723 __set_spte(spte, shadow_trap_nonpresent_pte);
@@ -789,7 +772,7 @@ static int kvm_set_pte_rmapp(struct kvm *kvm, unsigned long *rmapp,
789 772
790 new_spte &= ~PT_WRITABLE_MASK; 773 new_spte &= ~PT_WRITABLE_MASK;
791 new_spte &= ~SPTE_HOST_WRITEABLE; 774 new_spte &= ~SPTE_HOST_WRITEABLE;
792 if (is_writeble_pte(*spte)) 775 if (is_writable_pte(*spte))
793 kvm_set_pfn_dirty(spte_to_pfn(*spte)); 776 kvm_set_pfn_dirty(spte_to_pfn(*spte));
794 __set_spte(spte, new_spte); 777 __set_spte(spte, new_spte);
795 spte = rmap_next(kvm, rmapp, spte); 778 spte = rmap_next(kvm, rmapp, spte);
@@ -807,35 +790,32 @@ static int kvm_handle_hva(struct kvm *kvm, unsigned long hva,
807 unsigned long data)) 790 unsigned long data))
808{ 791{
809 int i, j; 792 int i, j;
793 int ret;
810 int retval = 0; 794 int retval = 0;
795 struct kvm_memslots *slots;
811 796
812 /* 797 slots = rcu_dereference(kvm->memslots);
813 * If mmap_sem isn't taken, we can look the memslots with only 798
814 * the mmu_lock by skipping over the slots with userspace_addr == 0. 799 for (i = 0; i < slots->nmemslots; i++) {
815 */ 800 struct kvm_memory_slot *memslot = &slots->memslots[i];
816 for (i = 0; i < kvm->nmemslots; i++) {
817 struct kvm_memory_slot *memslot = &kvm->memslots[i];
818 unsigned long start = memslot->userspace_addr; 801 unsigned long start = memslot->userspace_addr;
819 unsigned long end; 802 unsigned long end;
820 803
821 /* mmu_lock protects userspace_addr */
822 if (!start)
823 continue;
824
825 end = start + (memslot->npages << PAGE_SHIFT); 804 end = start + (memslot->npages << PAGE_SHIFT);
826 if (hva >= start && hva < end) { 805 if (hva >= start && hva < end) {
827 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT; 806 gfn_t gfn_offset = (hva - start) >> PAGE_SHIFT;
828 807
829 retval |= handler(kvm, &memslot->rmap[gfn_offset], 808 ret = handler(kvm, &memslot->rmap[gfn_offset], data);
830 data);
831 809
832 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) { 810 for (j = 0; j < KVM_NR_PAGE_SIZES - 1; ++j) {
833 int idx = gfn_offset; 811 int idx = gfn_offset;
834 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j); 812 idx /= KVM_PAGES_PER_HPAGE(PT_DIRECTORY_LEVEL + j);
835 retval |= handler(kvm, 813 ret |= handler(kvm,
836 &memslot->lpage_info[j][idx].rmap_pde, 814 &memslot->lpage_info[j][idx].rmap_pde,
837 data); 815 data);
838 } 816 }
817 trace_kvm_age_page(hva, memslot, ret);
818 retval |= ret;
839 } 819 }
840 } 820 }
841 821
@@ -858,9 +838,15 @@ static int kvm_age_rmapp(struct kvm *kvm, unsigned long *rmapp,
858 u64 *spte; 838 u64 *spte;
859 int young = 0; 839 int young = 0;
860 840
861 /* always return old for EPT */ 841 /*
842 * Emulate the accessed bit for EPT, by checking if this page has
843 * an EPT mapping, and clearing it if it does. On the next access,
844 * a new EPT mapping will be established.
845 * This has some overhead, but not as much as the cost of swapping
846 * out actively used pages or breaking up actively used hugepages.
847 */
862 if (!shadow_accessed_mask) 848 if (!shadow_accessed_mask)
863 return 0; 849 return kvm_unmap_rmapp(kvm, rmapp, data);
864 850
865 spte = rmap_next(kvm, rmapp, NULL); 851 spte = rmap_next(kvm, rmapp, NULL);
866 while (spte) { 852 while (spte) {
@@ -1504,8 +1490,8 @@ static int mmu_zap_unsync_children(struct kvm *kvm,
1504 for_each_sp(pages, sp, parents, i) { 1490 for_each_sp(pages, sp, parents, i) {
1505 kvm_mmu_zap_page(kvm, sp); 1491 kvm_mmu_zap_page(kvm, sp);
1506 mmu_pages_clear_parents(&parents); 1492 mmu_pages_clear_parents(&parents);
1493 zapped++;
1507 } 1494 }
1508 zapped += pages.nr;
1509 kvm_mmu_pages_init(parent, &parents, &pages); 1495 kvm_mmu_pages_init(parent, &parents, &pages);
1510 } 1496 }
1511 1497
@@ -1556,14 +1542,16 @@ void kvm_mmu_change_mmu_pages(struct kvm *kvm, unsigned int kvm_nr_mmu_pages)
1556 */ 1542 */
1557 1543
1558 if (used_pages > kvm_nr_mmu_pages) { 1544 if (used_pages > kvm_nr_mmu_pages) {
1559 while (used_pages > kvm_nr_mmu_pages) { 1545 while (used_pages > kvm_nr_mmu_pages &&
1546 !list_empty(&kvm->arch.active_mmu_pages)) {
1560 struct kvm_mmu_page *page; 1547 struct kvm_mmu_page *page;
1561 1548
1562 page = container_of(kvm->arch.active_mmu_pages.prev, 1549 page = container_of(kvm->arch.active_mmu_pages.prev,
1563 struct kvm_mmu_page, link); 1550 struct kvm_mmu_page, link);
1564 kvm_mmu_zap_page(kvm, page); 1551 used_pages -= kvm_mmu_zap_page(kvm, page);
1565 used_pages--; 1552 used_pages--;
1566 } 1553 }
1554 kvm_nr_mmu_pages = used_pages;
1567 kvm->arch.n_free_mmu_pages = 0; 1555 kvm->arch.n_free_mmu_pages = 0;
1568 } 1556 }
1569 else 1557 else
@@ -1610,14 +1598,15 @@ static void mmu_unshadow(struct kvm *kvm, gfn_t gfn)
1610 && !sp->role.invalid) { 1598 && !sp->role.invalid) {
1611 pgprintk("%s: zap %lx %x\n", 1599 pgprintk("%s: zap %lx %x\n",
1612 __func__, gfn, sp->role.word); 1600 __func__, gfn, sp->role.word);
1613 kvm_mmu_zap_page(kvm, sp); 1601 if (kvm_mmu_zap_page(kvm, sp))
1602 nn = bucket->first;
1614 } 1603 }
1615 } 1604 }
1616} 1605}
1617 1606
1618static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn) 1607static void page_header_update_slot(struct kvm *kvm, void *pte, gfn_t gfn)
1619{ 1608{
1620 int slot = memslot_id(kvm, gfn_to_memslot(kvm, gfn)); 1609 int slot = memslot_id(kvm, gfn);
1621 struct kvm_mmu_page *sp = page_header(__pa(pte)); 1610 struct kvm_mmu_page *sp = page_header(__pa(pte));
1622 1611
1623 __set_bit(slot, sp->slot_bitmap); 1612 __set_bit(slot, sp->slot_bitmap);
@@ -1641,7 +1630,7 @@ struct page *gva_to_page(struct kvm_vcpu *vcpu, gva_t gva)
1641{ 1630{
1642 struct page *page; 1631 struct page *page;
1643 1632
1644 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 1633 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
1645 1634
1646 if (gpa == UNMAPPED_GVA) 1635 if (gpa == UNMAPPED_GVA)
1647 return NULL; 1636 return NULL;
@@ -1854,7 +1843,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1854 * is responsibility of mmu_get_page / kvm_sync_page. 1843 * is responsibility of mmu_get_page / kvm_sync_page.
1855 * Same reasoning can be applied to dirty page accounting. 1844 * Same reasoning can be applied to dirty page accounting.
1856 */ 1845 */
1857 if (!can_unsync && is_writeble_pte(*sptep)) 1846 if (!can_unsync && is_writable_pte(*sptep))
1858 goto set_pte; 1847 goto set_pte;
1859 1848
1860 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) { 1849 if (mmu_need_write_protect(vcpu, gfn, can_unsync)) {
@@ -1862,7 +1851,7 @@ static int set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1862 __func__, gfn); 1851 __func__, gfn);
1863 ret = 1; 1852 ret = 1;
1864 pte_access &= ~ACC_WRITE_MASK; 1853 pte_access &= ~ACC_WRITE_MASK;
1865 if (is_writeble_pte(spte)) 1854 if (is_writable_pte(spte))
1866 spte &= ~PT_WRITABLE_MASK; 1855 spte &= ~PT_WRITABLE_MASK;
1867 } 1856 }
1868 } 1857 }
@@ -1883,7 +1872,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1883 bool reset_host_protection) 1872 bool reset_host_protection)
1884{ 1873{
1885 int was_rmapped = 0; 1874 int was_rmapped = 0;
1886 int was_writeble = is_writeble_pte(*sptep); 1875 int was_writable = is_writable_pte(*sptep);
1887 int rmap_count; 1876 int rmap_count;
1888 1877
1889 pgprintk("%s: spte %llx access %x write_fault %d" 1878 pgprintk("%s: spte %llx access %x write_fault %d"
@@ -1934,7 +1923,7 @@ static void mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep,
1934 if (rmap_count > RMAP_RECYCLE_THRESHOLD) 1923 if (rmap_count > RMAP_RECYCLE_THRESHOLD)
1935 rmap_recycle(vcpu, sptep, gfn); 1924 rmap_recycle(vcpu, sptep, gfn);
1936 } else { 1925 } else {
1937 if (was_writeble) 1926 if (was_writable)
1938 kvm_release_pfn_dirty(pfn); 1927 kvm_release_pfn_dirty(pfn);
1939 else 1928 else
1940 kvm_release_pfn_clean(pfn); 1929 kvm_release_pfn_clean(pfn);
@@ -2164,8 +2153,11 @@ void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
2164 spin_unlock(&vcpu->kvm->mmu_lock); 2153 spin_unlock(&vcpu->kvm->mmu_lock);
2165} 2154}
2166 2155
2167static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr) 2156static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu *vcpu, gva_t vaddr,
2157 u32 access, u32 *error)
2168{ 2158{
2159 if (error)
2160 *error = 0;
2169 return vaddr; 2161 return vaddr;
2170} 2162}
2171 2163
@@ -2749,7 +2741,7 @@ int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2749 if (tdp_enabled) 2741 if (tdp_enabled)
2750 return 0; 2742 return 0;
2751 2743
2752 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, gva); 2744 gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2753 2745
2754 spin_lock(&vcpu->kvm->mmu_lock); 2746 spin_lock(&vcpu->kvm->mmu_lock);
2755 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT); 2747 r = kvm_mmu_unprotect_page(vcpu->kvm, gpa >> PAGE_SHIFT);
@@ -2789,7 +2781,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2789 if (r) 2781 if (r)
2790 goto out; 2782 goto out;
2791 2783
2792 er = emulate_instruction(vcpu, vcpu->run, cr2, error_code, 0); 2784 er = emulate_instruction(vcpu, cr2, error_code, 0);
2793 2785
2794 switch (er) { 2786 switch (er) {
2795 case EMULATE_DONE: 2787 case EMULATE_DONE:
@@ -2800,6 +2792,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u32 error_code)
2800 case EMULATE_FAIL: 2792 case EMULATE_FAIL:
2801 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; 2793 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2802 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; 2794 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
2795 vcpu->run->internal.ndata = 0;
2803 return 0; 2796 return 0;
2804 default: 2797 default:
2805 BUG(); 2798 BUG();
@@ -2848,16 +2841,13 @@ static int alloc_mmu_pages(struct kvm_vcpu *vcpu)
2848 */ 2841 */
2849 page = alloc_page(GFP_KERNEL | __GFP_DMA32); 2842 page = alloc_page(GFP_KERNEL | __GFP_DMA32);
2850 if (!page) 2843 if (!page)
2851 goto error_1; 2844 return -ENOMEM;
2845
2852 vcpu->arch.mmu.pae_root = page_address(page); 2846 vcpu->arch.mmu.pae_root = page_address(page);
2853 for (i = 0; i < 4; ++i) 2847 for (i = 0; i < 4; ++i)
2854 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE; 2848 vcpu->arch.mmu.pae_root[i] = INVALID_PAGE;
2855 2849
2856 return 0; 2850 return 0;
2857
2858error_1:
2859 free_mmu_pages(vcpu);
2860 return -ENOMEM;
2861} 2851}
2862 2852
2863int kvm_mmu_create(struct kvm_vcpu *vcpu) 2853int kvm_mmu_create(struct kvm_vcpu *vcpu)
@@ -2937,10 +2927,9 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2937 spin_lock(&kvm_lock); 2927 spin_lock(&kvm_lock);
2938 2928
2939 list_for_each_entry(kvm, &vm_list, vm_list) { 2929 list_for_each_entry(kvm, &vm_list, vm_list) {
2940 int npages; 2930 int npages, idx;
2941 2931
2942 if (!down_read_trylock(&kvm->slots_lock)) 2932 idx = srcu_read_lock(&kvm->srcu);
2943 continue;
2944 spin_lock(&kvm->mmu_lock); 2933 spin_lock(&kvm->mmu_lock);
2945 npages = kvm->arch.n_alloc_mmu_pages - 2934 npages = kvm->arch.n_alloc_mmu_pages -
2946 kvm->arch.n_free_mmu_pages; 2935 kvm->arch.n_free_mmu_pages;
@@ -2953,7 +2942,7 @@ static int mmu_shrink(int nr_to_scan, gfp_t gfp_mask)
2953 nr_to_scan--; 2942 nr_to_scan--;
2954 2943
2955 spin_unlock(&kvm->mmu_lock); 2944 spin_unlock(&kvm->mmu_lock);
2956 up_read(&kvm->slots_lock); 2945 srcu_read_unlock(&kvm->srcu, idx);
2957 } 2946 }
2958 if (kvm_freed) 2947 if (kvm_freed)
2959 list_move_tail(&kvm_freed->vm_list, &vm_list); 2948 list_move_tail(&kvm_freed->vm_list, &vm_list);
@@ -3020,9 +3009,11 @@ unsigned int kvm_mmu_calculate_mmu_pages(struct kvm *kvm)
3020 int i; 3009 int i;
3021 unsigned int nr_mmu_pages; 3010 unsigned int nr_mmu_pages;
3022 unsigned int nr_pages = 0; 3011 unsigned int nr_pages = 0;
3012 struct kvm_memslots *slots;
3023 3013
3024 for (i = 0; i < kvm->nmemslots; i++) 3014 slots = rcu_dereference(kvm->memslots);
3025 nr_pages += kvm->memslots[i].npages; 3015 for (i = 0; i < slots->nmemslots; i++)
3016 nr_pages += slots->memslots[i].npages;
3026 3017
3027 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000; 3018 nr_mmu_pages = nr_pages * KVM_PERMILLE_MMU_PAGES / 1000;
3028 nr_mmu_pages = max(nr_mmu_pages, 3019 nr_mmu_pages = max(nr_mmu_pages,
@@ -3247,7 +3238,7 @@ static void audit_mappings_page(struct kvm_vcpu *vcpu, u64 page_pte,
3247 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level)) 3238 if (is_shadow_present_pte(ent) && !is_last_spte(ent, level))
3248 audit_mappings_page(vcpu, ent, va, level - 1); 3239 audit_mappings_page(vcpu, ent, va, level - 1);
3249 else { 3240 else {
3250 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, va); 3241 gpa_t gpa = kvm_mmu_gva_to_gpa_read(vcpu, va, NULL);
3251 gfn_t gfn = gpa >> PAGE_SHIFT; 3242 gfn_t gfn = gpa >> PAGE_SHIFT;
3252 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn); 3243 pfn_t pfn = gfn_to_pfn(vcpu->kvm, gfn);
3253 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT; 3244 hpa_t hpa = (hpa_t)pfn << PAGE_SHIFT;
@@ -3292,10 +3283,12 @@ static void audit_mappings(struct kvm_vcpu *vcpu)
3292static int count_rmaps(struct kvm_vcpu *vcpu) 3283static int count_rmaps(struct kvm_vcpu *vcpu)
3293{ 3284{
3294 int nmaps = 0; 3285 int nmaps = 0;
3295 int i, j, k; 3286 int i, j, k, idx;
3296 3287
3288 idx = srcu_read_lock(&kvm->srcu);
3289 slots = rcu_dereference(kvm->memslots);
3297 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) { 3290 for (i = 0; i < KVM_MEMORY_SLOTS; ++i) {
3298 struct kvm_memory_slot *m = &vcpu->kvm->memslots[i]; 3291 struct kvm_memory_slot *m = &slots->memslots[i];
3299 struct kvm_rmap_desc *d; 3292 struct kvm_rmap_desc *d;
3300 3293
3301 for (j = 0; j < m->npages; ++j) { 3294 for (j = 0; j < m->npages; ++j) {
@@ -3318,6 +3311,7 @@ static int count_rmaps(struct kvm_vcpu *vcpu)
3318 } 3311 }
3319 } 3312 }
3320 } 3313 }
3314 srcu_read_unlock(&kvm->srcu, idx);
3321 return nmaps; 3315 return nmaps;
3322} 3316}
3323 3317
diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h
index 61a1b3884b49..be66759321a5 100644
--- a/arch/x86/kvm/mmu.h
+++ b/arch/x86/kvm/mmu.h
@@ -2,6 +2,7 @@
2#define __KVM_X86_MMU_H 2#define __KVM_X86_MMU_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6#define PT64_PT_BITS 9 7#define PT64_PT_BITS 9
7#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS) 8#define PT64_ENT_PER_PAGE (1 << PT64_PT_BITS)
@@ -37,6 +38,16 @@
37#define PT32_ROOT_LEVEL 2 38#define PT32_ROOT_LEVEL 2
38#define PT32E_ROOT_LEVEL 3 39#define PT32E_ROOT_LEVEL 3
39 40
41#define PT_PDPE_LEVEL 3
42#define PT_DIRECTORY_LEVEL 2
43#define PT_PAGE_TABLE_LEVEL 1
44
45#define PFERR_PRESENT_MASK (1U << 0)
46#define PFERR_WRITE_MASK (1U << 1)
47#define PFERR_USER_MASK (1U << 2)
48#define PFERR_RSVD_MASK (1U << 3)
49#define PFERR_FETCH_MASK (1U << 4)
50
40int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]); 51int kvm_mmu_get_spte_hierarchy(struct kvm_vcpu *vcpu, u64 addr, u64 sptes[4]);
41 52
42static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu) 53static inline void kvm_mmu_free_some_pages(struct kvm_vcpu *vcpu)
@@ -53,30 +64,6 @@ static inline int kvm_mmu_reload(struct kvm_vcpu *vcpu)
53 return kvm_mmu_load(vcpu); 64 return kvm_mmu_load(vcpu);
54} 65}
55 66
56static inline int is_long_mode(struct kvm_vcpu *vcpu)
57{
58#ifdef CONFIG_X86_64
59 return vcpu->arch.shadow_efer & EFER_LMA;
60#else
61 return 0;
62#endif
63}
64
65static inline int is_pae(struct kvm_vcpu *vcpu)
66{
67 return vcpu->arch.cr4 & X86_CR4_PAE;
68}
69
70static inline int is_pse(struct kvm_vcpu *vcpu)
71{
72 return vcpu->arch.cr4 & X86_CR4_PSE;
73}
74
75static inline int is_paging(struct kvm_vcpu *vcpu)
76{
77 return vcpu->arch.cr0 & X86_CR0_PG;
78}
79
80static inline int is_present_gpte(unsigned long pte) 67static inline int is_present_gpte(unsigned long pte)
81{ 68{
82 return pte & PT_PRESENT_MASK; 69 return pte & PT_PRESENT_MASK;
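Editorial aside on the mmu.h changes: the PFERR_* bits and PT_*_LEVEL constants move here because the gva_to_gpa() helpers and walk_addr() now take an access mask built from them (see the paging_tmpl.h hunk below). A trivial standalone sketch of composing and decoding such a mask, using the same bit positions:

/*
 * Build a page-fault style access mask from write/user/fetch flags and
 * test individual bits back out of it.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PFERR_PRESENT_MASK (1U << 0)
#define PFERR_WRITE_MASK   (1U << 1)
#define PFERR_USER_MASK    (1U << 2)
#define PFERR_RSVD_MASK    (1U << 3)
#define PFERR_FETCH_MASK   (1U << 4)

static uint32_t build_access(bool write, bool user, bool fetch)
{
	return (write ? PFERR_WRITE_MASK : 0) |
	       (user  ? PFERR_USER_MASK  : 0) |
	       (fetch ? PFERR_FETCH_MASK : 0);
}

int main(void)
{
	uint32_t access = build_access(true, true, false);

	printf("user write access mask: %#x\n", access);
	printf("is write: %d, is fetch: %d\n",
	       !!(access & PFERR_WRITE_MASK), !!(access & PFERR_FETCH_MASK));
	return 0;
}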
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 72558f8ff3f5..81eab9a50e6a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -150,7 +150,9 @@ walk:
150 walker->table_gfn[walker->level - 1] = table_gfn; 150 walker->table_gfn[walker->level - 1] = table_gfn;
151 walker->pte_gpa[walker->level - 1] = pte_gpa; 151 walker->pte_gpa[walker->level - 1] = pte_gpa;
152 152
153 kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)); 153 if (kvm_read_guest(vcpu->kvm, pte_gpa, &pte, sizeof(pte)))
154 goto not_present;
155
154 trace_kvm_mmu_paging_element(pte, walker->level); 156 trace_kvm_mmu_paging_element(pte, walker->level);
155 157
156 if (!is_present_gpte(pte)) 158 if (!is_present_gpte(pte))
@@ -160,7 +162,7 @@ walk:
160 if (rsvd_fault) 162 if (rsvd_fault)
161 goto access_error; 163 goto access_error;
162 164
163 if (write_fault && !is_writeble_pte(pte)) 165 if (write_fault && !is_writable_pte(pte))
164 if (user_fault || is_write_protection(vcpu)) 166 if (user_fault || is_write_protection(vcpu))
165 goto access_error; 167 goto access_error;
166 168
@@ -455,8 +457,6 @@ out_unlock:
455static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva) 457static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
456{ 458{
457 struct kvm_shadow_walk_iterator iterator; 459 struct kvm_shadow_walk_iterator iterator;
458 pt_element_t gpte;
459 gpa_t pte_gpa = -1;
460 int level; 460 int level;
461 u64 *sptep; 461 u64 *sptep;
462 int need_flush = 0; 462 int need_flush = 0;
@@ -467,14 +467,9 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
467 level = iterator.level; 467 level = iterator.level;
468 sptep = iterator.sptep; 468 sptep = iterator.sptep;
469 469
470 /* FIXME: properly handle invlpg on large guest pages */
471 if (level == PT_PAGE_TABLE_LEVEL || 470 if (level == PT_PAGE_TABLE_LEVEL ||
472 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) || 471 ((level == PT_DIRECTORY_LEVEL && is_large_pte(*sptep))) ||
473 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) { 472 ((level == PT_PDPE_LEVEL && is_large_pte(*sptep)))) {
474 struct kvm_mmu_page *sp = page_header(__pa(sptep));
475
476 pte_gpa = (sp->gfn << PAGE_SHIFT);
477 pte_gpa += (sptep - sp->spt) * sizeof(pt_element_t);
478 473
479 if (is_shadow_present_pte(*sptep)) { 474 if (is_shadow_present_pte(*sptep)) {
480 rmap_remove(vcpu->kvm, sptep); 475 rmap_remove(vcpu->kvm, sptep);
@@ -493,32 +488,25 @@ static void FNAME(invlpg)(struct kvm_vcpu *vcpu, gva_t gva)
493 if (need_flush) 488 if (need_flush)
494 kvm_flush_remote_tlbs(vcpu->kvm); 489 kvm_flush_remote_tlbs(vcpu->kvm);
495 spin_unlock(&vcpu->kvm->mmu_lock); 490 spin_unlock(&vcpu->kvm->mmu_lock);
496
497 if (pte_gpa == -1)
498 return;
499 if (kvm_read_guest_atomic(vcpu->kvm, pte_gpa, &gpte,
500 sizeof(pt_element_t)))
501 return;
502 if (is_present_gpte(gpte) && (gpte & PT_ACCESSED_MASK)) {
503 if (mmu_topup_memory_caches(vcpu))
504 return;
505 kvm_mmu_pte_write(vcpu, pte_gpa, (const u8 *)&gpte,
506 sizeof(pt_element_t), 0);
507 }
508} 491}
509 492
510static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr) 493static gpa_t FNAME(gva_to_gpa)(struct kvm_vcpu *vcpu, gva_t vaddr, u32 access,
494 u32 *error)
511{ 495{
512 struct guest_walker walker; 496 struct guest_walker walker;
513 gpa_t gpa = UNMAPPED_GVA; 497 gpa_t gpa = UNMAPPED_GVA;
514 int r; 498 int r;
515 499
516 r = FNAME(walk_addr)(&walker, vcpu, vaddr, 0, 0, 0); 500 r = FNAME(walk_addr)(&walker, vcpu, vaddr,
501 !!(access & PFERR_WRITE_MASK),
502 !!(access & PFERR_USER_MASK),
503 !!(access & PFERR_FETCH_MASK));
517 504
518 if (r) { 505 if (r) {
519 gpa = gfn_to_gpa(walker.gfn); 506 gpa = gfn_to_gpa(walker.gfn);
520 gpa |= vaddr & ~PAGE_MASK; 507 gpa |= vaddr & ~PAGE_MASK;
521 } 508 } else if (error)
509 *error = walker.error_code;
522 510
523 return gpa; 511 return gpa;
524} 512}
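Editorial aside on the paging_tmpl.h hunks: walk_addr() keeps its write-permission rule while gva_to_gpa() gains the access mask and error-code reporting: a write to a non-writable guest PTE faults for user accesses unconditionally, and for supervisor accesses only when CR0.WP is set (the is_write_protection() test). The rule as plain logic, detached from any paging structures:

/*
 * Fault decision for a write through a read-only PTE: user writes
 * always fault; supervisor writes fault only when CR0.WP is set.
 */
#include <stdbool.h>
#include <stdio.h>

static bool write_faults(bool pte_writable, bool user_access, bool cr0_wp)
{
	if (pte_writable)
		return false;
	return user_access || cr0_wp;	/* supervisor may ignore RO if !WP */
}

int main(void)
{
	printf("user   write, RO pte, WP=0: fault=%d\n",
	       write_faults(false, true, false));	/* 1 */
	printf("kernel write, RO pte, WP=0: fault=%d\n",
	       write_faults(false, false, false));	/* 0 */
	printf("kernel write, RO pte, WP=1: fault=%d\n",
	       write_faults(false, false, true));	/* 1 */
	return 0;
}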
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index c17404add91f..737361fcd503 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -26,6 +26,7 @@
26#include <linux/highmem.h> 26#include <linux/highmem.h>
27#include <linux/sched.h> 27#include <linux/sched.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29 30
30#include <asm/desc.h> 31#include <asm/desc.h>
31 32
@@ -46,6 +47,7 @@ MODULE_LICENSE("GPL");
46#define SVM_FEATURE_NPT (1 << 0) 47#define SVM_FEATURE_NPT (1 << 0)
47#define SVM_FEATURE_LBRV (1 << 1) 48#define SVM_FEATURE_LBRV (1 << 1)
48#define SVM_FEATURE_SVML (1 << 2) 49#define SVM_FEATURE_SVML (1 << 2)
50#define SVM_FEATURE_PAUSE_FILTER (1 << 10)
49 51
50#define NESTED_EXIT_HOST 0 /* Exit handled on host level */ 52#define NESTED_EXIT_HOST 0 /* Exit handled on host level */
51#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */ 53#define NESTED_EXIT_DONE 1 /* Exit caused nested vmexit */
@@ -53,15 +55,6 @@ MODULE_LICENSE("GPL");
53 55
54#define DEBUGCTL_RESERVED_BITS (~(0x3fULL)) 56#define DEBUGCTL_RESERVED_BITS (~(0x3fULL))
55 57
56/* Turn on to get debugging output*/
57/* #define NESTED_DEBUG */
58
59#ifdef NESTED_DEBUG
60#define nsvm_printk(fmt, args...) printk(KERN_INFO fmt, ## args)
61#else
62#define nsvm_printk(fmt, args...) do {} while(0)
63#endif
64
65static const u32 host_save_user_msrs[] = { 58static const u32 host_save_user_msrs[] = {
66#ifdef CONFIG_X86_64 59#ifdef CONFIG_X86_64
67 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE, 60 MSR_STAR, MSR_LSTAR, MSR_CSTAR, MSR_SYSCALL_MASK, MSR_KERNEL_GS_BASE,
@@ -85,6 +78,9 @@ struct nested_state {
85 /* gpa pointers to the real vectors */ 78 /* gpa pointers to the real vectors */
86 u64 vmcb_msrpm; 79 u64 vmcb_msrpm;
87 80
81 /* A VMEXIT is required but not yet emulated */
82 bool exit_required;
83
88 /* cache for intercepts of the guest */ 84 /* cache for intercepts of the guest */
89 u16 intercept_cr_read; 85 u16 intercept_cr_read;
90 u16 intercept_cr_write; 86 u16 intercept_cr_write;
@@ -112,6 +108,8 @@ struct vcpu_svm {
112 u32 *msrpm; 108 u32 *msrpm;
113 109
114 struct nested_state nested; 110 struct nested_state nested;
111
112 bool nmi_singlestep;
115}; 113};
116 114
117/* enable NPT for AMD64 and X86 with PAE */ 115/* enable NPT for AMD64 and X86 with PAE */
@@ -234,7 +232,7 @@ static void svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
234 efer &= ~EFER_LME; 232 efer &= ~EFER_LME;
235 233
236 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME; 234 to_svm(vcpu)->vmcb->save.efer = efer | EFER_SVME;
237 vcpu->arch.shadow_efer = efer; 235 vcpu->arch.efer = efer;
238} 236}
239 237
240static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr, 238static void svm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
@@ -286,7 +284,7 @@ static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
286 struct vcpu_svm *svm = to_svm(vcpu); 284 struct vcpu_svm *svm = to_svm(vcpu);
287 285
288 if (!svm->next_rip) { 286 if (!svm->next_rip) {
289 if (emulate_instruction(vcpu, vcpu->run, 0, 0, EMULTYPE_SKIP) != 287 if (emulate_instruction(vcpu, 0, 0, EMULTYPE_SKIP) !=
290 EMULATE_DONE) 288 EMULATE_DONE)
291 printk(KERN_DEBUG "%s: NOP\n", __func__); 289 printk(KERN_DEBUG "%s: NOP\n", __func__);
292 return; 290 return;
@@ -316,75 +314,79 @@ static void svm_hardware_disable(void *garbage)
316 cpu_svm_disable(); 314 cpu_svm_disable();
317} 315}
318 316
319static void svm_hardware_enable(void *garbage) 317static int svm_hardware_enable(void *garbage)
320{ 318{
321 319
322 struct svm_cpu_data *svm_data; 320 struct svm_cpu_data *sd;
323 uint64_t efer; 321 uint64_t efer;
324 struct descriptor_table gdt_descr; 322 struct descriptor_table gdt_descr;
325 struct desc_struct *gdt; 323 struct desc_struct *gdt;
326 int me = raw_smp_processor_id(); 324 int me = raw_smp_processor_id();
327 325
326 rdmsrl(MSR_EFER, efer);
327 if (efer & EFER_SVME)
328 return -EBUSY;
329
328 if (!has_svm()) { 330 if (!has_svm()) {
329 printk(KERN_ERR "svm_cpu_init: err EOPNOTSUPP on %d\n", me); 331 printk(KERN_ERR "svm_hardware_enable: err EOPNOTSUPP on %d\n",
330 return; 332 me);
333 return -EINVAL;
331 } 334 }
332 svm_data = per_cpu(svm_data, me); 335 sd = per_cpu(svm_data, me);
333 336
334 if (!svm_data) { 337 if (!sd) {
335 printk(KERN_ERR "svm_cpu_init: svm_data is NULL on %d\n", 338 printk(KERN_ERR "svm_hardware_enable: svm_data is NULL on %d\n",
336 me); 339 me);
337 return; 340 return -EINVAL;
338 } 341 }
339 342
340 svm_data->asid_generation = 1; 343 sd->asid_generation = 1;
341 svm_data->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1; 344 sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
342 svm_data->next_asid = svm_data->max_asid + 1; 345 sd->next_asid = sd->max_asid + 1;
343 346
344 kvm_get_gdt(&gdt_descr); 347 kvm_get_gdt(&gdt_descr);
345 gdt = (struct desc_struct *)gdt_descr.base; 348 gdt = (struct desc_struct *)gdt_descr.base;
346 svm_data->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS); 349 sd->tss_desc = (struct kvm_ldttss_desc *)(gdt + GDT_ENTRY_TSS);
347 350
348 rdmsrl(MSR_EFER, efer);
349 wrmsrl(MSR_EFER, efer | EFER_SVME); 351 wrmsrl(MSR_EFER, efer | EFER_SVME);
350 352
351 wrmsrl(MSR_VM_HSAVE_PA, 353 wrmsrl(MSR_VM_HSAVE_PA, page_to_pfn(sd->save_area) << PAGE_SHIFT);
352 page_to_pfn(svm_data->save_area) << PAGE_SHIFT); 354
355 return 0;
353} 356}
354 357
355static void svm_cpu_uninit(int cpu) 358static void svm_cpu_uninit(int cpu)
356{ 359{
357 struct svm_cpu_data *svm_data 360 struct svm_cpu_data *sd = per_cpu(svm_data, raw_smp_processor_id());
358 = per_cpu(svm_data, raw_smp_processor_id());
359 361
360 if (!svm_data) 362 if (!sd)
361 return; 363 return;
362 364
363 per_cpu(svm_data, raw_smp_processor_id()) = NULL; 365 per_cpu(svm_data, raw_smp_processor_id()) = NULL;
364 __free_page(svm_data->save_area); 366 __free_page(sd->save_area);
365 kfree(svm_data); 367 kfree(sd);
366} 368}
367 369
368static int svm_cpu_init(int cpu) 370static int svm_cpu_init(int cpu)
369{ 371{
370 struct svm_cpu_data *svm_data; 372 struct svm_cpu_data *sd;
371 int r; 373 int r;
372 374
373 svm_data = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL); 375 sd = kzalloc(sizeof(struct svm_cpu_data), GFP_KERNEL);
374 if (!svm_data) 376 if (!sd)
375 return -ENOMEM; 377 return -ENOMEM;
376 svm_data->cpu = cpu; 378 sd->cpu = cpu;
377 svm_data->save_area = alloc_page(GFP_KERNEL); 379 sd->save_area = alloc_page(GFP_KERNEL);
378 r = -ENOMEM; 380 r = -ENOMEM;
379 if (!svm_data->save_area) 381 if (!sd->save_area)
380 goto err_1; 382 goto err_1;
381 383
382 per_cpu(svm_data, cpu) = svm_data; 384 per_cpu(svm_data, cpu) = sd;
383 385
384 return 0; 386 return 0;
385 387
386err_1: 388err_1:
387 kfree(svm_data); 389 kfree(sd);
388 return r; 390 return r;
389 391
390} 392}
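
svm_hardware_enable() now returns an error instead of void, and the first thing it does is read EFER and bail out with -EBUSY if SVME is already set, since that means SVM is already owned on this CPU. A minimal user-space sketch of that guard, with a stubbed MSR read standing in for rdmsrl(); the stub and its return value are assumptions for illustration only:

#include <stdint.h>
#include <errno.h>

#define EFER_SVME (1ULL << 12)          /* EFER.SVME: SVM enable bit */

/* Stub for rdmsrl(MSR_EFER, efer); pretend SVM is not yet enabled. */
static uint64_t read_efer_stub(void)
{
        return 0;
}

/* Mirrors the new early check in svm_hardware_enable(). */
static int svm_enable_guard(void)
{
        uint64_t efer = read_efer_stub();

        if (efer & EFER_SVME)
                return -EBUSY;          /* someone else already enabled SVM */

        return 0;
}
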
@@ -476,7 +478,7 @@ static __init int svm_hardware_setup(void)
476 kvm_enable_efer_bits(EFER_SVME); 478 kvm_enable_efer_bits(EFER_SVME);
477 } 479 }
478 480
479 for_each_online_cpu(cpu) { 481 for_each_possible_cpu(cpu) {
480 r = svm_cpu_init(cpu); 482 r = svm_cpu_init(cpu);
481 if (r) 483 if (r)
482 goto err; 484 goto err;
@@ -510,7 +512,7 @@ static __exit void svm_hardware_unsetup(void)
510{ 512{
511 int cpu; 513 int cpu;
512 514
513 for_each_online_cpu(cpu) 515 for_each_possible_cpu(cpu)
514 svm_cpu_uninit(cpu); 516 svm_cpu_uninit(cpu);
515 517
516 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER); 518 __free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT), IOPM_ALLOC_ORDER);
@@ -539,6 +541,8 @@ static void init_vmcb(struct vcpu_svm *svm)
539 struct vmcb_control_area *control = &svm->vmcb->control; 541 struct vmcb_control_area *control = &svm->vmcb->control;
540 struct vmcb_save_area *save = &svm->vmcb->save; 542 struct vmcb_save_area *save = &svm->vmcb->save;
541 543
544 svm->vcpu.fpu_active = 1;
545
542 control->intercept_cr_read = INTERCEPT_CR0_MASK | 546 control->intercept_cr_read = INTERCEPT_CR0_MASK |
543 INTERCEPT_CR3_MASK | 547 INTERCEPT_CR3_MASK |
544 INTERCEPT_CR4_MASK; 548 INTERCEPT_CR4_MASK;
@@ -551,13 +555,19 @@ static void init_vmcb(struct vcpu_svm *svm)
551 control->intercept_dr_read = INTERCEPT_DR0_MASK | 555 control->intercept_dr_read = INTERCEPT_DR0_MASK |
552 INTERCEPT_DR1_MASK | 556 INTERCEPT_DR1_MASK |
553 INTERCEPT_DR2_MASK | 557 INTERCEPT_DR2_MASK |
554 INTERCEPT_DR3_MASK; 558 INTERCEPT_DR3_MASK |
559 INTERCEPT_DR4_MASK |
560 INTERCEPT_DR5_MASK |
561 INTERCEPT_DR6_MASK |
562 INTERCEPT_DR7_MASK;
555 563
556 control->intercept_dr_write = INTERCEPT_DR0_MASK | 564 control->intercept_dr_write = INTERCEPT_DR0_MASK |
557 INTERCEPT_DR1_MASK | 565 INTERCEPT_DR1_MASK |
558 INTERCEPT_DR2_MASK | 566 INTERCEPT_DR2_MASK |
559 INTERCEPT_DR3_MASK | 567 INTERCEPT_DR3_MASK |
568 INTERCEPT_DR4_MASK |
560 INTERCEPT_DR5_MASK | 569 INTERCEPT_DR5_MASK |
570 INTERCEPT_DR6_MASK |
561 INTERCEPT_DR7_MASK; 571 INTERCEPT_DR7_MASK;
562 572
563 control->intercept_exceptions = (1 << PF_VECTOR) | 573 control->intercept_exceptions = (1 << PF_VECTOR) |
@@ -568,6 +578,7 @@ static void init_vmcb(struct vcpu_svm *svm)
568 control->intercept = (1ULL << INTERCEPT_INTR) | 578 control->intercept = (1ULL << INTERCEPT_INTR) |
569 (1ULL << INTERCEPT_NMI) | 579 (1ULL << INTERCEPT_NMI) |
570 (1ULL << INTERCEPT_SMI) | 580 (1ULL << INTERCEPT_SMI) |
581 (1ULL << INTERCEPT_SELECTIVE_CR0) |
571 (1ULL << INTERCEPT_CPUID) | 582 (1ULL << INTERCEPT_CPUID) |
572 (1ULL << INTERCEPT_INVD) | 583 (1ULL << INTERCEPT_INVD) |
573 (1ULL << INTERCEPT_HLT) | 584 (1ULL << INTERCEPT_HLT) |
@@ -625,11 +636,12 @@ static void init_vmcb(struct vcpu_svm *svm)
625 save->rip = 0x0000fff0; 636 save->rip = 0x0000fff0;
626 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip; 637 svm->vcpu.arch.regs[VCPU_REGS_RIP] = save->rip;
627 638
628 /* 639 /* This is the guest-visible cr0 value.
629 * cr0 val on cpu init should be 0x60000010, we enable cpu 640 * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0.
630 * cache by default. the orderly way is to enable cache in bios.
631 */ 641 */
632 save->cr0 = 0x00000010 | X86_CR0_PG | X86_CR0_WP; 642 svm->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
643 kvm_set_cr0(&svm->vcpu, svm->vcpu.arch.cr0);
644
633 save->cr4 = X86_CR4_PAE; 645 save->cr4 = X86_CR4_PAE;
634 /* rdx = ?? */ 646 /* rdx = ?? */
635 647
@@ -639,13 +651,9 @@ static void init_vmcb(struct vcpu_svm *svm)
639 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) | 651 control->intercept &= ~((1ULL << INTERCEPT_TASK_SWITCH) |
640 (1ULL << INTERCEPT_INVLPG)); 652 (1ULL << INTERCEPT_INVLPG));
641 control->intercept_exceptions &= ~(1 << PF_VECTOR); 653 control->intercept_exceptions &= ~(1 << PF_VECTOR);
642 control->intercept_cr_read &= ~(INTERCEPT_CR0_MASK| 654 control->intercept_cr_read &= ~INTERCEPT_CR3_MASK;
643 INTERCEPT_CR3_MASK); 655 control->intercept_cr_write &= ~INTERCEPT_CR3_MASK;
644 control->intercept_cr_write &= ~(INTERCEPT_CR0_MASK|
645 INTERCEPT_CR3_MASK);
646 save->g_pat = 0x0007040600070406ULL; 656 save->g_pat = 0x0007040600070406ULL;
647 /* enable caching because the QEMU Bios doesn't enable it */
648 save->cr0 = X86_CR0_ET;
649 save->cr3 = 0; 657 save->cr3 = 0;
650 save->cr4 = 0; 658 save->cr4 = 0;
651 } 659 }
@@ -654,6 +662,11 @@ static void init_vmcb(struct vcpu_svm *svm)
654 svm->nested.vmcb = 0; 662 svm->nested.vmcb = 0;
655 svm->vcpu.arch.hflags = 0; 663 svm->vcpu.arch.hflags = 0;
656 664
665 if (svm_has(SVM_FEATURE_PAUSE_FILTER)) {
666 control->pause_filter_count = 3000;
667 control->intercept |= (1ULL << INTERCEPT_PAUSE);
668 }
669
657 enable_gif(svm); 670 enable_gif(svm);
658} 671}
659 672
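
init_vmcb() now programs a PAUSE filter when the CPU advertises SVM_FEATURE_PAUSE_FILTER: the guest may execute up to pause_filter_count PAUSE instructions before the hardware raises SVM_EXIT_PAUSE, which pause_interception() later turns into kvm_vcpu_on_spin(). The toy model below only illustrates the counting idea and is not the VMCB encoding:

/* Toy model: one #VMEXIT per pause_filter_count PAUSE instructions. */
struct pause_filter_model {
        unsigned int reload;            /* e.g. 3000, as programmed above */
        unsigned int remaining;
};

static int model_guest_pause(struct pause_filter_model *pf)
{
        if (--pf->remaining == 0) {
                pf->remaining = pf->reload;
                return 1;               /* would raise SVM_EXIT_PAUSE */
        }
        return 0;
}
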
@@ -693,29 +706,28 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
693 if (err) 706 if (err)
694 goto free_svm; 707 goto free_svm;
695 708
709 err = -ENOMEM;
696 page = alloc_page(GFP_KERNEL); 710 page = alloc_page(GFP_KERNEL);
697 if (!page) { 711 if (!page)
698 err = -ENOMEM;
699 goto uninit; 712 goto uninit;
700 }
701 713
702 err = -ENOMEM;
703 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 714 msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
704 if (!msrpm_pages) 715 if (!msrpm_pages)
705 goto uninit; 716 goto free_page1;
706 717
707 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER); 718 nested_msrpm_pages = alloc_pages(GFP_KERNEL, MSRPM_ALLOC_ORDER);
708 if (!nested_msrpm_pages) 719 if (!nested_msrpm_pages)
709 goto uninit; 720 goto free_page2;
710
711 svm->msrpm = page_address(msrpm_pages);
712 svm_vcpu_init_msrpm(svm->msrpm);
713 721
714 hsave_page = alloc_page(GFP_KERNEL); 722 hsave_page = alloc_page(GFP_KERNEL);
715 if (!hsave_page) 723 if (!hsave_page)
716 goto uninit; 724 goto free_page3;
725
717 svm->nested.hsave = page_address(hsave_page); 726 svm->nested.hsave = page_address(hsave_page);
718 727
728 svm->msrpm = page_address(msrpm_pages);
729 svm_vcpu_init_msrpm(svm->msrpm);
730
719 svm->nested.msrpm = page_address(nested_msrpm_pages); 731 svm->nested.msrpm = page_address(nested_msrpm_pages);
720 732
721 svm->vmcb = page_address(page); 733 svm->vmcb = page_address(page);
@@ -725,13 +737,18 @@ static struct kvm_vcpu *svm_create_vcpu(struct kvm *kvm, unsigned int id)
725 init_vmcb(svm); 737 init_vmcb(svm);
726 738
727 fx_init(&svm->vcpu); 739 fx_init(&svm->vcpu);
728 svm->vcpu.fpu_active = 1;
729 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE; 740 svm->vcpu.arch.apic_base = 0xfee00000 | MSR_IA32_APICBASE_ENABLE;
730 if (kvm_vcpu_is_bsp(&svm->vcpu)) 741 if (kvm_vcpu_is_bsp(&svm->vcpu))
731 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP; 742 svm->vcpu.arch.apic_base |= MSR_IA32_APICBASE_BSP;
732 743
733 return &svm->vcpu; 744 return &svm->vcpu;
734 745
746free_page3:
747 __free_pages(nested_msrpm_pages, MSRPM_ALLOC_ORDER);
748free_page2:
749 __free_pages(msrpm_pages, MSRPM_ALLOC_ORDER);
750free_page1:
751 __free_page(page);
735uninit: 752uninit:
736 kvm_vcpu_uninit(&svm->vcpu); 753 kvm_vcpu_uninit(&svm->vcpu);
737free_svm: 754free_svm:
@@ -758,17 +775,18 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
758 int i; 775 int i;
759 776
760 if (unlikely(cpu != vcpu->cpu)) { 777 if (unlikely(cpu != vcpu->cpu)) {
761 u64 tsc_this, delta; 778 u64 delta;
762 779
763 /* 780 if (check_tsc_unstable()) {
764 * Make sure that the guest sees a monotonically 781 /*
765 * increasing TSC. 782 * Make sure that the guest sees a monotonically
766 */ 783 * increasing TSC.
767 rdtscll(tsc_this); 784 */
768 delta = vcpu->arch.host_tsc - tsc_this; 785 delta = vcpu->arch.host_tsc - native_read_tsc();
769 svm->vmcb->control.tsc_offset += delta; 786 svm->vmcb->control.tsc_offset += delta;
770 if (is_nested(svm)) 787 if (is_nested(svm))
771 svm->nested.hsave->control.tsc_offset += delta; 788 svm->nested.hsave->control.tsc_offset += delta;
789 }
772 vcpu->cpu = cpu; 790 vcpu->cpu = cpu;
773 kvm_migrate_timers(vcpu); 791 kvm_migrate_timers(vcpu);
774 svm->asid_generation = 0; 792 svm->asid_generation = 0;
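
The svm_vcpu_load() hunk above now compensates the TSC offset only when check_tsc_unstable() says the host TSC is not synchronized across CPUs: the offset grows by the difference between the TSC value saved on the old CPU and the current reading on the new one, so the guest never sees its TSC jump backwards. A small arithmetic sketch of that adjustment, with plain integers standing in for the MSR reads:

#include <stdint.h>

/* Keep the guest-visible TSC monotonic when migrating to another CPU. */
static uint64_t adjusted_tsc_offset(uint64_t saved_host_tsc,
                                    uint64_t new_cpu_tsc,
                                    uint64_t tsc_offset)
{
        uint64_t delta = saved_host_tsc - new_cpu_tsc;

        return tsc_offset + delta;      /* guest TSC = host TSC + offset */
}
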
@@ -787,7 +805,7 @@ static void svm_vcpu_put(struct kvm_vcpu *vcpu)
787 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++) 805 for (i = 0; i < NR_HOST_SAVE_USER_MSRS; i++)
788 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]); 806 wrmsrl(host_save_user_msrs[i], svm->host_user_msrs[i]);
789 807
790 rdtscll(vcpu->arch.host_tsc); 808 vcpu->arch.host_tsc = native_read_tsc();
791} 809}
792 810
793static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) 811static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu)
@@ -950,42 +968,59 @@ static void svm_set_gdt(struct kvm_vcpu *vcpu, struct descriptor_table *dt)
950 svm->vmcb->save.gdtr.base = dt->base ; 968 svm->vmcb->save.gdtr.base = dt->base ;
951} 969}
952 970
971static void svm_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
972{
973}
974
953static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 975static void svm_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
954{ 976{
955} 977}
956 978
979static void update_cr0_intercept(struct vcpu_svm *svm)
980{
981 ulong gcr0 = svm->vcpu.arch.cr0;
982 u64 *hcr0 = &svm->vmcb->save.cr0;
983
984 if (!svm->vcpu.fpu_active)
985 *hcr0 |= SVM_CR0_SELECTIVE_MASK;
986 else
987 *hcr0 = (*hcr0 & ~SVM_CR0_SELECTIVE_MASK)
988 | (gcr0 & SVM_CR0_SELECTIVE_MASK);
989
990
991 if (gcr0 == *hcr0 && svm->vcpu.fpu_active) {
992 svm->vmcb->control.intercept_cr_read &= ~INTERCEPT_CR0_MASK;
993 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR0_MASK;
994 } else {
995 svm->vmcb->control.intercept_cr_read |= INTERCEPT_CR0_MASK;
996 svm->vmcb->control.intercept_cr_write |= INTERCEPT_CR0_MASK;
997 }
998}
999
957static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1000static void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
958{ 1001{
959 struct vcpu_svm *svm = to_svm(vcpu); 1002 struct vcpu_svm *svm = to_svm(vcpu);
960 1003
961#ifdef CONFIG_X86_64 1004#ifdef CONFIG_X86_64
962 if (vcpu->arch.shadow_efer & EFER_LME) { 1005 if (vcpu->arch.efer & EFER_LME) {
963 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 1006 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
964 vcpu->arch.shadow_efer |= EFER_LMA; 1007 vcpu->arch.efer |= EFER_LMA;
965 svm->vmcb->save.efer |= EFER_LMA | EFER_LME; 1008 svm->vmcb->save.efer |= EFER_LMA | EFER_LME;
966 } 1009 }
967 1010
968 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { 1011 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) {
969 vcpu->arch.shadow_efer &= ~EFER_LMA; 1012 vcpu->arch.efer &= ~EFER_LMA;
970 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); 1013 svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME);
971 } 1014 }
972 } 1015 }
973#endif 1016#endif
974 if (npt_enabled) 1017 vcpu->arch.cr0 = cr0;
975 goto set;
976 1018
977 if ((vcpu->arch.cr0 & X86_CR0_TS) && !(cr0 & X86_CR0_TS)) { 1019 if (!npt_enabled)
978 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1020 cr0 |= X86_CR0_PG | X86_CR0_WP;
979 vcpu->fpu_active = 1;
980 }
981 1021
982 vcpu->arch.cr0 = cr0; 1022 if (!vcpu->fpu_active)
983 cr0 |= X86_CR0_PG | X86_CR0_WP;
984 if (!vcpu->fpu_active) {
985 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
986 cr0 |= X86_CR0_TS; 1023 cr0 |= X86_CR0_TS;
987 }
988set:
989 /* 1024 /*
990 * re-enable caching here because the QEMU bios 1025 * re-enable caching here because the QEMU bios
991 * does not do it - this results in some delay at 1026 * does not do it - this results in some delay at
@@ -993,6 +1028,7 @@ set:
993 */ 1028 */
994 cr0 &= ~(X86_CR0_CD | X86_CR0_NW); 1029 cr0 &= ~(X86_CR0_CD | X86_CR0_NW);
995 svm->vmcb->save.cr0 = cr0; 1030 svm->vmcb->save.cr0 = cr0;
1031 update_cr0_intercept(svm);
996} 1032}
997 1033
998static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1034static void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
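
The combined effect of the svm_set_cr0()/update_cr0_intercept() changes is that the guest-visible CR0 (vcpu->arch.cr0) and the CR0 the hardware actually runs with (vmcb->save.cr0) are tracked separately, and CR0 accesses are only intercepted while the two disagree in the selectively handled bits. The standalone sketch below reproduces just that merge decision; the selective mask is assumed here to be CR0.TS | CR0.MP, which is what SVM_CR0_SELECTIVE_MASK is understood to cover:

#include <stdint.h>
#include <stdbool.h>

#define X86_CR0_MP (1UL << 1)
#define X86_CR0_TS (1UL << 3)
/* Assumption: the selectively handled CR0 bits are TS and MP. */
#define CR0_SELECTIVE_MASK (X86_CR0_TS | X86_CR0_MP)

/* Returns true if CR0 reads/writes still need to be intercepted. */
static bool cr0_intercept_needed(unsigned long gcr0, uint64_t *hcr0,
                                 bool fpu_active)
{
        if (!fpu_active)
                *hcr0 |= CR0_SELECTIVE_MASK;
        else
                *hcr0 = (*hcr0 & ~CR0_SELECTIVE_MASK)
                        | (gcr0 & CR0_SELECTIVE_MASK);

        return !(gcr0 == *hcr0 && fpu_active);
}
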
@@ -1045,7 +1081,7 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1045 svm->vmcb->control.intercept_exceptions &= 1081 svm->vmcb->control.intercept_exceptions &=
1046 ~((1 << DB_VECTOR) | (1 << BP_VECTOR)); 1082 ~((1 << DB_VECTOR) | (1 << BP_VECTOR));
1047 1083
1048 if (vcpu->arch.singlestep) 1084 if (svm->nmi_singlestep)
1049 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR); 1085 svm->vmcb->control.intercept_exceptions |= (1 << DB_VECTOR);
1050 1086
1051 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { 1087 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
@@ -1060,26 +1096,16 @@ static void update_db_intercept(struct kvm_vcpu *vcpu)
1060 vcpu->guest_debug = 0; 1096 vcpu->guest_debug = 0;
1061} 1097}
1062 1098
1063static int svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1099static void svm_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1064{ 1100{
1065 int old_debug = vcpu->guest_debug;
1066 struct vcpu_svm *svm = to_svm(vcpu); 1101 struct vcpu_svm *svm = to_svm(vcpu);
1067 1102
1068 vcpu->guest_debug = dbg->control;
1069
1070 update_db_intercept(vcpu);
1071
1072 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1103 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1073 svm->vmcb->save.dr7 = dbg->arch.debugreg[7]; 1104 svm->vmcb->save.dr7 = dbg->arch.debugreg[7];
1074 else 1105 else
1075 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1106 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1076 1107
1077 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) 1108 update_db_intercept(vcpu);
1078 svm->vmcb->save.rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1079 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1080 svm->vmcb->save.rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1081
1082 return 0;
1083} 1109}
1084 1110
1085static void load_host_msrs(struct kvm_vcpu *vcpu) 1111static void load_host_msrs(struct kvm_vcpu *vcpu)
@@ -1096,91 +1122,85 @@ static void save_host_msrs(struct kvm_vcpu *vcpu)
1096#endif 1122#endif
1097} 1123}
1098 1124
1099static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *svm_data) 1125static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd)
1100{ 1126{
1101 if (svm_data->next_asid > svm_data->max_asid) { 1127 if (sd->next_asid > sd->max_asid) {
1102 ++svm_data->asid_generation; 1128 ++sd->asid_generation;
1103 svm_data->next_asid = 1; 1129 sd->next_asid = 1;
1104 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; 1130 svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID;
1105 } 1131 }
1106 1132
1107 svm->asid_generation = svm_data->asid_generation; 1133 svm->asid_generation = sd->asid_generation;
1108 svm->vmcb->control.asid = svm_data->next_asid++; 1134 svm->vmcb->control.asid = sd->next_asid++;
1109} 1135}
1110 1136
1111static unsigned long svm_get_dr(struct kvm_vcpu *vcpu, int dr) 1137static int svm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *dest)
1112{ 1138{
1113 struct vcpu_svm *svm = to_svm(vcpu); 1139 struct vcpu_svm *svm = to_svm(vcpu);
1114 unsigned long val;
1115 1140
1116 switch (dr) { 1141 switch (dr) {
1117 case 0 ... 3: 1142 case 0 ... 3:
1118 val = vcpu->arch.db[dr]; 1143 *dest = vcpu->arch.db[dr];
1119 break; 1144 break;
1145 case 4:
1146 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1147 return EMULATE_FAIL; /* will re-inject UD */
1148 /* fall through */
1120 case 6: 1149 case 6:
1121 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1150 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1122 val = vcpu->arch.dr6; 1151 *dest = vcpu->arch.dr6;
1123 else 1152 else
1124 val = svm->vmcb->save.dr6; 1153 *dest = svm->vmcb->save.dr6;
1125 break; 1154 break;
1155 case 5:
1156 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1157 return EMULATE_FAIL; /* will re-inject UD */
1158 /* fall through */
1126 case 7: 1159 case 7:
1127 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1160 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1128 val = vcpu->arch.dr7; 1161 *dest = vcpu->arch.dr7;
1129 else 1162 else
1130 val = svm->vmcb->save.dr7; 1163 *dest = svm->vmcb->save.dr7;
1131 break; 1164 break;
1132 default:
1133 val = 0;
1134 } 1165 }
1135 1166
1136 return val; 1167 return EMULATE_DONE;
1137} 1168}
1138 1169
1139static void svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value, 1170static int svm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long value)
1140 int *exception)
1141{ 1171{
1142 struct vcpu_svm *svm = to_svm(vcpu); 1172 struct vcpu_svm *svm = to_svm(vcpu);
1143 1173
1144 *exception = 0;
1145
1146 switch (dr) { 1174 switch (dr) {
1147 case 0 ... 3: 1175 case 0 ... 3:
1148 vcpu->arch.db[dr] = value; 1176 vcpu->arch.db[dr] = value;
1149 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 1177 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
1150 vcpu->arch.eff_db[dr] = value; 1178 vcpu->arch.eff_db[dr] = value;
1151 return; 1179 break;
1152 case 4 ... 5: 1180 case 4:
1153 if (vcpu->arch.cr4 & X86_CR4_DE) 1181 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1154 *exception = UD_VECTOR; 1182 return EMULATE_FAIL; /* will re-inject UD */
1155 return; 1183 /* fall through */
1156 case 6: 1184 case 6:
1157 if (value & 0xffffffff00000000ULL) {
1158 *exception = GP_VECTOR;
1159 return;
1160 }
1161 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1; 1185 vcpu->arch.dr6 = (value & DR6_VOLATILE) | DR6_FIXED_1;
1162 return; 1186 break;
1187 case 5:
1188 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
1189 return EMULATE_FAIL; /* will re-inject UD */
1190 /* fall through */
1163 case 7: 1191 case 7:
1164 if (value & 0xffffffff00000000ULL) {
1165 *exception = GP_VECTOR;
1166 return;
1167 }
1168 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1; 1192 vcpu->arch.dr7 = (value & DR7_VOLATILE) | DR7_FIXED_1;
1169 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 1193 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
1170 svm->vmcb->save.dr7 = vcpu->arch.dr7; 1194 svm->vmcb->save.dr7 = vcpu->arch.dr7;
1171 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK); 1195 vcpu->arch.switch_db_regs = (value & DR7_BP_EN_MASK);
1172 } 1196 }
1173 return; 1197 break;
1174 default:
1175 /* FIXME: Possible case? */
1176 printk(KERN_DEBUG "%s: unexpected dr %u\n",
1177 __func__, dr);
1178 *exception = UD_VECTOR;
1179 return;
1180 } 1198 }
1199
1200 return EMULATE_DONE;
1181} 1201}
1182 1202
1183static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1203static int pf_interception(struct vcpu_svm *svm)
1184{ 1204{
1185 u64 fault_address; 1205 u64 fault_address;
1186 u32 error_code; 1206 u32 error_code;
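
The rewritten svm_get_dr()/svm_set_dr() above treat DR4 and DR5 as legacy aliases of DR6 and DR7 that are only reachable while CR4.DE is clear; with CR4.DE set the access must fail so the caller re-injects #UD, and the EMULATE_DONE/EMULATE_FAIL return codes replace the old exception out-parameter. A hedged sketch of just that dispatch rule (the enum values and helper name are placeholders, not the KVM API):

#include <stdbool.h>

enum { EMULATE_DONE, EMULATE_FAIL };

/* Map a requested debug register to the one actually accessed. */
static int resolve_dr(int dr, bool cr4_de, int *effective_dr)
{
        switch (dr) {
        case 0 ... 3:                   /* DR0-DR3: always direct */
                *effective_dr = dr;
                return EMULATE_DONE;
        case 4:                         /* alias of DR6 when CR4.DE == 0 */
        case 5:                         /* alias of DR7 when CR4.DE == 0 */
                if (cr4_de)
                        return EMULATE_FAIL;    /* caller re-injects #UD */
                *effective_dr = dr + 2;
                return EMULATE_DONE;
        case 6:
        case 7:
                *effective_dr = dr;
                return EMULATE_DONE;
        default:
                return EMULATE_FAIL;
        }
}
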
@@ -1194,17 +1214,19 @@ static int pf_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1194 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code); 1214 return kvm_mmu_page_fault(&svm->vcpu, fault_address, error_code);
1195} 1215}
1196 1216
1197static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1217static int db_interception(struct vcpu_svm *svm)
1198{ 1218{
1219 struct kvm_run *kvm_run = svm->vcpu.run;
1220
1199 if (!(svm->vcpu.guest_debug & 1221 if (!(svm->vcpu.guest_debug &
1200 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && 1222 (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) &&
1201 !svm->vcpu.arch.singlestep) { 1223 !svm->nmi_singlestep) {
1202 kvm_queue_exception(&svm->vcpu, DB_VECTOR); 1224 kvm_queue_exception(&svm->vcpu, DB_VECTOR);
1203 return 1; 1225 return 1;
1204 } 1226 }
1205 1227
1206 if (svm->vcpu.arch.singlestep) { 1228 if (svm->nmi_singlestep) {
1207 svm->vcpu.arch.singlestep = false; 1229 svm->nmi_singlestep = false;
1208 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) 1230 if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP))
1209 svm->vmcb->save.rflags &= 1231 svm->vmcb->save.rflags &=
1210 ~(X86_EFLAGS_TF | X86_EFLAGS_RF); 1232 ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
@@ -1223,35 +1245,41 @@ static int db_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1223 return 1; 1245 return 1;
1224} 1246}
1225 1247
1226static int bp_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1248static int bp_interception(struct vcpu_svm *svm)
1227{ 1249{
1250 struct kvm_run *kvm_run = svm->vcpu.run;
1251
1228 kvm_run->exit_reason = KVM_EXIT_DEBUG; 1252 kvm_run->exit_reason = KVM_EXIT_DEBUG;
1229 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; 1253 kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip;
1230 kvm_run->debug.arch.exception = BP_VECTOR; 1254 kvm_run->debug.arch.exception = BP_VECTOR;
1231 return 0; 1255 return 0;
1232} 1256}
1233 1257
1234static int ud_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1258static int ud_interception(struct vcpu_svm *svm)
1235{ 1259{
1236 int er; 1260 int er;
1237 1261
1238 er = emulate_instruction(&svm->vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 1262 er = emulate_instruction(&svm->vcpu, 0, 0, EMULTYPE_TRAP_UD);
1239 if (er != EMULATE_DONE) 1263 if (er != EMULATE_DONE)
1240 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1264 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1241 return 1; 1265 return 1;
1242} 1266}
1243 1267
1244static int nm_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1268static void svm_fpu_activate(struct kvm_vcpu *vcpu)
1245{ 1269{
1270 struct vcpu_svm *svm = to_svm(vcpu);
1246 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR); 1271 svm->vmcb->control.intercept_exceptions &= ~(1 << NM_VECTOR);
1247 if (!(svm->vcpu.arch.cr0 & X86_CR0_TS))
1248 svm->vmcb->save.cr0 &= ~X86_CR0_TS;
1249 svm->vcpu.fpu_active = 1; 1272 svm->vcpu.fpu_active = 1;
1273 update_cr0_intercept(svm);
1274}
1250 1275
1276static int nm_interception(struct vcpu_svm *svm)
1277{
1278 svm_fpu_activate(&svm->vcpu);
1251 return 1; 1279 return 1;
1252} 1280}
1253 1281
1254static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1282static int mc_interception(struct vcpu_svm *svm)
1255{ 1283{
1256 /* 1284 /*
1257 * On an #MC intercept the MCE handler is not called automatically in 1285 * On an #MC intercept the MCE handler is not called automatically in
@@ -1264,8 +1292,10 @@ static int mc_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1264 return 1; 1292 return 1;
1265} 1293}
1266 1294
1267static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1295static int shutdown_interception(struct vcpu_svm *svm)
1268{ 1296{
1297 struct kvm_run *kvm_run = svm->vcpu.run;
1298
1269 /* 1299 /*
1270 * VMCB is undefined after a SHUTDOWN intercept 1300 * VMCB is undefined after a SHUTDOWN intercept
1271 * so reinitialize it. 1301 * so reinitialize it.
@@ -1277,7 +1307,7 @@ static int shutdown_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1277 return 0; 1307 return 0;
1278} 1308}
1279 1309
1280static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1310static int io_interception(struct vcpu_svm *svm)
1281{ 1311{
1282 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ 1312 u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */
1283 int size, in, string; 1313 int size, in, string;
@@ -1291,7 +1321,7 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1291 1321
1292 if (string) { 1322 if (string) {
1293 if (emulate_instruction(&svm->vcpu, 1323 if (emulate_instruction(&svm->vcpu,
1294 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO) 1324 0, 0, 0) == EMULATE_DO_MMIO)
1295 return 0; 1325 return 0;
1296 return 1; 1326 return 1;
1297 } 1327 }
@@ -1301,33 +1331,33 @@ static int io_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1301 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; 1331 size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT;
1302 1332
1303 skip_emulated_instruction(&svm->vcpu); 1333 skip_emulated_instruction(&svm->vcpu);
1304 return kvm_emulate_pio(&svm->vcpu, kvm_run, in, size, port); 1334 return kvm_emulate_pio(&svm->vcpu, in, size, port);
1305} 1335}
1306 1336
1307static int nmi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1337static int nmi_interception(struct vcpu_svm *svm)
1308{ 1338{
1309 return 1; 1339 return 1;
1310} 1340}
1311 1341
1312static int intr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1342static int intr_interception(struct vcpu_svm *svm)
1313{ 1343{
1314 ++svm->vcpu.stat.irq_exits; 1344 ++svm->vcpu.stat.irq_exits;
1315 return 1; 1345 return 1;
1316} 1346}
1317 1347
1318static int nop_on_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1348static int nop_on_interception(struct vcpu_svm *svm)
1319{ 1349{
1320 return 1; 1350 return 1;
1321} 1351}
1322 1352
1323static int halt_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1353static int halt_interception(struct vcpu_svm *svm)
1324{ 1354{
1325 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1; 1355 svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
1326 skip_emulated_instruction(&svm->vcpu); 1356 skip_emulated_instruction(&svm->vcpu);
1327 return kvm_emulate_halt(&svm->vcpu); 1357 return kvm_emulate_halt(&svm->vcpu);
1328} 1358}
1329 1359
1330static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1360static int vmmcall_interception(struct vcpu_svm *svm)
1331{ 1361{
1332 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3; 1362 svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
1333 skip_emulated_instruction(&svm->vcpu); 1363 skip_emulated_instruction(&svm->vcpu);
@@ -1337,7 +1367,7 @@ static int vmmcall_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1337 1367
1338static int nested_svm_check_permissions(struct vcpu_svm *svm) 1368static int nested_svm_check_permissions(struct vcpu_svm *svm)
1339{ 1369{
1340 if (!(svm->vcpu.arch.shadow_efer & EFER_SVME) 1370 if (!(svm->vcpu.arch.efer & EFER_SVME)
1341 || !is_paging(&svm->vcpu)) { 1371 || !is_paging(&svm->vcpu)) {
1342 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 1372 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1343 return 1; 1373 return 1;
@@ -1378,8 +1408,15 @@ static inline int nested_svm_intr(struct vcpu_svm *svm)
1378 1408
1379 svm->vmcb->control.exit_code = SVM_EXIT_INTR; 1409 svm->vmcb->control.exit_code = SVM_EXIT_INTR;
1380 1410
1381 if (nested_svm_exit_handled(svm)) { 1411 if (svm->nested.intercept & 1ULL) {
1382 nsvm_printk("VMexit -> INTR\n"); 1412 /*
1413 * The #vmexit can't be emulated here directly because this
1414 * code path runs with irqs and preemption disabled. A
1415 * #vmexit emulation might sleep. Only signal request for
1416 * the #vmexit here.
1417 */
1418 svm->nested.exit_required = true;
1419 trace_kvm_nested_intr_vmexit(svm->vmcb->save.rip);
1383 return 1; 1420 return 1;
1384 } 1421 }
1385 1422
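
Because the path above runs with interrupts and preemption disabled while the #VMEXIT emulation may sleep, the intercepted interrupt only sets nested.exit_required; handle_exit() and svm_vcpu_run() later notice the flag and perform (or skip the vcpu run for) the actual emulation. A minimal sketch of this defer-then-handle pattern, with illustrative names rather than the real KVM structures:

#include <stdbool.h>

struct nested_sketch {
        bool exit_required;             /* #VMEXIT pending, not yet emulated */
};

/* Atomic context: must not sleep, so only record the request. */
static void request_vmexit(struct nested_sketch *n)
{
        n->exit_required = true;
}

/* Sleepable context (exit handler): emulate the deferred #VMEXIT. */
static bool handle_pending_vmexit(struct nested_sketch *n,
                                  void (*emulate_vmexit)(void))
{
        if (!n->exit_required)
                return false;

        emulate_vmexit();               /* may sleep safely here */
        n->exit_required = false;
        return true;
}
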
@@ -1390,10 +1427,7 @@ static void *nested_svm_map(struct vcpu_svm *svm, u64 gpa, enum km_type idx)
1390{ 1427{
1391 struct page *page; 1428 struct page *page;
1392 1429
1393 down_read(&current->mm->mmap_sem);
1394 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT); 1430 page = gfn_to_page(svm->vcpu.kvm, gpa >> PAGE_SHIFT);
1395 up_read(&current->mm->mmap_sem);
1396
1397 if (is_error_page(page)) 1431 if (is_error_page(page))
1398 goto error; 1432 goto error;
1399 1433
@@ -1532,14 +1566,12 @@ static int nested_svm_exit_handled(struct vcpu_svm *svm)
1532 } 1566 }
1533 default: { 1567 default: {
1534 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR); 1568 u64 exit_bits = 1ULL << (exit_code - SVM_EXIT_INTR);
1535 nsvm_printk("exit code: 0x%x\n", exit_code);
1536 if (svm->nested.intercept & exit_bits) 1569 if (svm->nested.intercept & exit_bits)
1537 vmexit = NESTED_EXIT_DONE; 1570 vmexit = NESTED_EXIT_DONE;
1538 } 1571 }
1539 } 1572 }
1540 1573
1541 if (vmexit == NESTED_EXIT_DONE) { 1574 if (vmexit == NESTED_EXIT_DONE) {
1542 nsvm_printk("#VMEXIT reason=%04x\n", exit_code);
1543 nested_svm_vmexit(svm); 1575 nested_svm_vmexit(svm);
1544 } 1576 }
1545 1577
@@ -1584,6 +1616,12 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1584 struct vmcb *hsave = svm->nested.hsave; 1616 struct vmcb *hsave = svm->nested.hsave;
1585 struct vmcb *vmcb = svm->vmcb; 1617 struct vmcb *vmcb = svm->vmcb;
1586 1618
1619 trace_kvm_nested_vmexit_inject(vmcb->control.exit_code,
1620 vmcb->control.exit_info_1,
1621 vmcb->control.exit_info_2,
1622 vmcb->control.exit_int_info,
1623 vmcb->control.exit_int_info_err);
1624
1587 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0); 1625 nested_vmcb = nested_svm_map(svm, svm->nested.vmcb, KM_USER0);
1588 if (!nested_vmcb) 1626 if (!nested_vmcb)
1589 return 1; 1627 return 1;
@@ -1617,6 +1655,22 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1617 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2; 1655 nested_vmcb->control.exit_info_2 = vmcb->control.exit_info_2;
1618 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info; 1656 nested_vmcb->control.exit_int_info = vmcb->control.exit_int_info;
1619 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err; 1657 nested_vmcb->control.exit_int_info_err = vmcb->control.exit_int_info_err;
1658
1659 /*
1660 * If we emulate a VMRUN/#VMEXIT in the same host #vmexit cycle we have
1661 * to make sure that we do not lose injected events. So check event_inj
1662 * here and copy it to exit_int_info if it is valid.
1663 * Exit_int_info and event_inj can't both be valid because the case
1664 * below only happens on a VMRUN instruction intercept which has
1665 * no valid exit_int_info set.
1666 */
1667 if (vmcb->control.event_inj & SVM_EVTINJ_VALID) {
1668 struct vmcb_control_area *nc = &nested_vmcb->control;
1669
1670 nc->exit_int_info = vmcb->control.event_inj;
1671 nc->exit_int_info_err = vmcb->control.event_inj_err;
1672 }
1673
1620 nested_vmcb->control.tlb_ctl = 0; 1674 nested_vmcb->control.tlb_ctl = 0;
1621 nested_vmcb->control.event_inj = 0; 1675 nested_vmcb->control.event_inj = 0;
1622 nested_vmcb->control.event_inj_err = 0; 1676 nested_vmcb->control.event_inj_err = 0;
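
The comment block above motivates the copy: when a VMRUN intercept is immediately turned into an emulated #VMEXIT, a still-valid event_inj would otherwise be lost, and on a VMRUN intercept exit_int_info is known to be empty, so forwarding is safe. A trimmed-down sketch of that rule, with the VALID bit position taken as an assumption matching SVM_EVTINJ_VALID:

#include <stdint.h>

#define EVTINJ_VALID (1U << 31)         /* assumed SVM_EVTINJ_VALID bit */

struct vmcb_ctl_sketch {
        uint32_t event_inj, event_inj_err;
        uint32_t exit_int_info, exit_int_info_err;
};

/* Forward a pending injected event into the nested VMCB's exit info. */
static void preserve_injected_event(const struct vmcb_ctl_sketch *vmcb,
                                    struct vmcb_ctl_sketch *nested_vmcb)
{
        if (vmcb->event_inj & EVTINJ_VALID) {
                nested_vmcb->exit_int_info     = vmcb->event_inj;
                nested_vmcb->exit_int_info_err = vmcb->event_inj_err;
        }
}
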
@@ -1628,10 +1682,6 @@ static int nested_svm_vmexit(struct vcpu_svm *svm)
1628 /* Restore the original control entries */ 1682 /* Restore the original control entries */
1629 copy_vmcb_control_area(vmcb, hsave); 1683 copy_vmcb_control_area(vmcb, hsave);
1630 1684
1631 /* Kill any pending exceptions */
1632 if (svm->vcpu.arch.exception.pending == true)
1633 nsvm_printk("WARNING: Pending Exception\n");
1634
1635 kvm_clear_exception_queue(&svm->vcpu); 1685 kvm_clear_exception_queue(&svm->vcpu);
1636 kvm_clear_interrupt_queue(&svm->vcpu); 1686 kvm_clear_interrupt_queue(&svm->vcpu);
1637 1687
@@ -1702,6 +1752,12 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1702 /* nested_vmcb is our indicator if nested SVM is activated */ 1752 /* nested_vmcb is our indicator if nested SVM is activated */
1703 svm->nested.vmcb = svm->vmcb->save.rax; 1753 svm->nested.vmcb = svm->vmcb->save.rax;
1704 1754
1755 trace_kvm_nested_vmrun(svm->vmcb->save.rip - 3, svm->nested.vmcb,
1756 nested_vmcb->save.rip,
1757 nested_vmcb->control.int_ctl,
1758 nested_vmcb->control.event_inj,
1759 nested_vmcb->control.nested_ctl);
1760
1705 /* Clear internal status */ 1761 /* Clear internal status */
1706 kvm_clear_exception_queue(&svm->vcpu); 1762 kvm_clear_exception_queue(&svm->vcpu);
1707 kvm_clear_interrupt_queue(&svm->vcpu); 1763 kvm_clear_interrupt_queue(&svm->vcpu);
@@ -1714,8 +1770,8 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1714 hsave->save.ds = vmcb->save.ds; 1770 hsave->save.ds = vmcb->save.ds;
1715 hsave->save.gdtr = vmcb->save.gdtr; 1771 hsave->save.gdtr = vmcb->save.gdtr;
1716 hsave->save.idtr = vmcb->save.idtr; 1772 hsave->save.idtr = vmcb->save.idtr;
1717 hsave->save.efer = svm->vcpu.arch.shadow_efer; 1773 hsave->save.efer = svm->vcpu.arch.efer;
1718 hsave->save.cr0 = svm->vcpu.arch.cr0; 1774 hsave->save.cr0 = kvm_read_cr0(&svm->vcpu);
1719 hsave->save.cr4 = svm->vcpu.arch.cr4; 1775 hsave->save.cr4 = svm->vcpu.arch.cr4;
1720 hsave->save.rflags = vmcb->save.rflags; 1776 hsave->save.rflags = vmcb->save.rflags;
1721 hsave->save.rip = svm->next_rip; 1777 hsave->save.rip = svm->next_rip;
@@ -1789,28 +1845,15 @@ static bool nested_svm_vmrun(struct vcpu_svm *svm)
1789 svm->nested.intercept = nested_vmcb->control.intercept; 1845 svm->nested.intercept = nested_vmcb->control.intercept;
1790 1846
1791 force_new_asid(&svm->vcpu); 1847 force_new_asid(&svm->vcpu);
1792 svm->vmcb->control.exit_int_info = nested_vmcb->control.exit_int_info;
1793 svm->vmcb->control.exit_int_info_err = nested_vmcb->control.exit_int_info_err;
1794 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK; 1848 svm->vmcb->control.int_ctl = nested_vmcb->control.int_ctl | V_INTR_MASKING_MASK;
1795 if (nested_vmcb->control.int_ctl & V_IRQ_MASK) {
1796 nsvm_printk("nSVM Injecting Interrupt: 0x%x\n",
1797 nested_vmcb->control.int_ctl);
1798 }
1799 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK) 1849 if (nested_vmcb->control.int_ctl & V_INTR_MASKING_MASK)
1800 svm->vcpu.arch.hflags |= HF_VINTR_MASK; 1850 svm->vcpu.arch.hflags |= HF_VINTR_MASK;
1801 else 1851 else
1802 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK; 1852 svm->vcpu.arch.hflags &= ~HF_VINTR_MASK;
1803 1853
1804 nsvm_printk("nSVM exit_int_info: 0x%x | int_state: 0x%x\n",
1805 nested_vmcb->control.exit_int_info,
1806 nested_vmcb->control.int_state);
1807
1808 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector; 1854 svm->vmcb->control.int_vector = nested_vmcb->control.int_vector;
1809 svm->vmcb->control.int_state = nested_vmcb->control.int_state; 1855 svm->vmcb->control.int_state = nested_vmcb->control.int_state;
1810 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset; 1856 svm->vmcb->control.tsc_offset += nested_vmcb->control.tsc_offset;
1811 if (nested_vmcb->control.event_inj & SVM_EVTINJ_VALID)
1812 nsvm_printk("Injecting Event: 0x%x\n",
1813 nested_vmcb->control.event_inj);
1814 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj; 1857 svm->vmcb->control.event_inj = nested_vmcb->control.event_inj;
1815 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err; 1858 svm->vmcb->control.event_inj_err = nested_vmcb->control.event_inj_err;
1816 1859
@@ -1837,7 +1880,7 @@ static void nested_svm_vmloadsave(struct vmcb *from_vmcb, struct vmcb *to_vmcb)
1837 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip; 1880 to_vmcb->save.sysenter_eip = from_vmcb->save.sysenter_eip;
1838} 1881}
1839 1882
1840static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1883static int vmload_interception(struct vcpu_svm *svm)
1841{ 1884{
1842 struct vmcb *nested_vmcb; 1885 struct vmcb *nested_vmcb;
1843 1886
@@ -1857,7 +1900,7 @@ static int vmload_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1857 return 1; 1900 return 1;
1858} 1901}
1859 1902
1860static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1903static int vmsave_interception(struct vcpu_svm *svm)
1861{ 1904{
1862 struct vmcb *nested_vmcb; 1905 struct vmcb *nested_vmcb;
1863 1906
@@ -1877,10 +1920,8 @@ static int vmsave_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1877 return 1; 1920 return 1;
1878} 1921}
1879 1922
1880static int vmrun_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1923static int vmrun_interception(struct vcpu_svm *svm)
1881{ 1924{
1882 nsvm_printk("VMrun\n");
1883
1884 if (nested_svm_check_permissions(svm)) 1925 if (nested_svm_check_permissions(svm))
1885 return 1; 1926 return 1;
1886 1927
@@ -1907,7 +1948,7 @@ failed:
1907 return 1; 1948 return 1;
1908} 1949}
1909 1950
1910static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1951static int stgi_interception(struct vcpu_svm *svm)
1911{ 1952{
1912 if (nested_svm_check_permissions(svm)) 1953 if (nested_svm_check_permissions(svm))
1913 return 1; 1954 return 1;
@@ -1920,7 +1961,7 @@ static int stgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1920 return 1; 1961 return 1;
1921} 1962}
1922 1963
1923static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1964static int clgi_interception(struct vcpu_svm *svm)
1924{ 1965{
1925 if (nested_svm_check_permissions(svm)) 1966 if (nested_svm_check_permissions(svm))
1926 return 1; 1967 return 1;
@@ -1937,10 +1978,12 @@ static int clgi_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1937 return 1; 1978 return 1;
1938} 1979}
1939 1980
1940static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 1981static int invlpga_interception(struct vcpu_svm *svm)
1941{ 1982{
1942 struct kvm_vcpu *vcpu = &svm->vcpu; 1983 struct kvm_vcpu *vcpu = &svm->vcpu;
1943 nsvm_printk("INVLPGA\n"); 1984
1985 trace_kvm_invlpga(svm->vmcb->save.rip, vcpu->arch.regs[VCPU_REGS_RCX],
1986 vcpu->arch.regs[VCPU_REGS_RAX]);
1944 1987
1945 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ 1988 /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */
1946 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]); 1989 kvm_mmu_invlpg(vcpu, vcpu->arch.regs[VCPU_REGS_RAX]);
@@ -1950,15 +1993,21 @@ static int invlpga_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run)
1950 return 1; 1993 return 1;
1951} 1994}
1952 1995
1953static int invalid_op_interception(struct vcpu_svm *svm, 1996static int skinit_interception(struct vcpu_svm *svm)
1954 struct kvm_run *kvm_run) 1997{
1998 trace_kvm_skinit(svm->vmcb->save.rip, svm->vcpu.arch.regs[VCPU_REGS_RAX]);
1999
2000 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
2001 return 1;
2002}
2003
2004static int invalid_op_interception(struct vcpu_svm *svm)
1955{ 2005{
1956 kvm_queue_exception(&svm->vcpu, UD_VECTOR); 2006 kvm_queue_exception(&svm->vcpu, UD_VECTOR);
1957 return 1; 2007 return 1;
1958} 2008}
1959 2009
1960static int task_switch_interception(struct vcpu_svm *svm, 2010static int task_switch_interception(struct vcpu_svm *svm)
1961 struct kvm_run *kvm_run)
1962{ 2011{
1963 u16 tss_selector; 2012 u16 tss_selector;
1964 int reason; 2013 int reason;
@@ -2008,41 +2057,42 @@ static int task_switch_interception(struct vcpu_svm *svm,
2008 return kvm_task_switch(&svm->vcpu, tss_selector, reason); 2057 return kvm_task_switch(&svm->vcpu, tss_selector, reason);
2009} 2058}
2010 2059
2011static int cpuid_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2060static int cpuid_interception(struct vcpu_svm *svm)
2012{ 2061{
2013 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2062 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2014 kvm_emulate_cpuid(&svm->vcpu); 2063 kvm_emulate_cpuid(&svm->vcpu);
2015 return 1; 2064 return 1;
2016} 2065}
2017 2066
2018static int iret_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2067static int iret_interception(struct vcpu_svm *svm)
2019{ 2068{
2020 ++svm->vcpu.stat.nmi_window_exits; 2069 ++svm->vcpu.stat.nmi_window_exits;
2021 svm->vmcb->control.intercept &= ~(1UL << INTERCEPT_IRET); 2070 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2022 svm->vcpu.arch.hflags |= HF_IRET_MASK; 2071 svm->vcpu.arch.hflags |= HF_IRET_MASK;
2023 return 1; 2072 return 1;
2024} 2073}
2025 2074
2026static int invlpg_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2075static int invlpg_interception(struct vcpu_svm *svm)
2027{ 2076{
2028 if (emulate_instruction(&svm->vcpu, kvm_run, 0, 0, 0) != EMULATE_DONE) 2077 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2029 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2078 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2030 return 1; 2079 return 1;
2031} 2080}
2032 2081
2033static int emulate_on_interception(struct vcpu_svm *svm, 2082static int emulate_on_interception(struct vcpu_svm *svm)
2034 struct kvm_run *kvm_run)
2035{ 2083{
2036 if (emulate_instruction(&svm->vcpu, NULL, 0, 0, 0) != EMULATE_DONE) 2084 if (emulate_instruction(&svm->vcpu, 0, 0, 0) != EMULATE_DONE)
2037 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__); 2085 pr_unimpl(&svm->vcpu, "%s: failed\n", __func__);
2038 return 1; 2086 return 1;
2039} 2087}
2040 2088
2041static int cr8_write_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2089static int cr8_write_interception(struct vcpu_svm *svm)
2042{ 2090{
2091 struct kvm_run *kvm_run = svm->vcpu.run;
2092
2043 u8 cr8_prev = kvm_get_cr8(&svm->vcpu); 2093 u8 cr8_prev = kvm_get_cr8(&svm->vcpu);
2044 /* instruction emulation calls kvm_set_cr8() */ 2094 /* instruction emulation calls kvm_set_cr8() */
2045 emulate_instruction(&svm->vcpu, NULL, 0, 0, 0); 2095 emulate_instruction(&svm->vcpu, 0, 0, 0);
2046 if (irqchip_in_kernel(svm->vcpu.kvm)) { 2096 if (irqchip_in_kernel(svm->vcpu.kvm)) {
2047 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK; 2097 svm->vmcb->control.intercept_cr_write &= ~INTERCEPT_CR8_MASK;
2048 return 1; 2098 return 1;
@@ -2128,14 +2178,15 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 *data)
2128 return 0; 2178 return 0;
2129} 2179}
2130 2180
2131static int rdmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2181static int rdmsr_interception(struct vcpu_svm *svm)
2132{ 2182{
2133 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2183 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2134 u64 data; 2184 u64 data;
2135 2185
2136 if (svm_get_msr(&svm->vcpu, ecx, &data)) 2186 if (svm_get_msr(&svm->vcpu, ecx, &data)) {
2187 trace_kvm_msr_read_ex(ecx);
2137 kvm_inject_gp(&svm->vcpu, 0); 2188 kvm_inject_gp(&svm->vcpu, 0);
2138 else { 2189 } else {
2139 trace_kvm_msr_read(ecx, data); 2190 trace_kvm_msr_read(ecx, data);
2140 2191
2141 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff; 2192 svm->vcpu.arch.regs[VCPU_REGS_RAX] = data & 0xffffffff;
@@ -2221,33 +2272,36 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, unsigned ecx, u64 data)
2221 return 0; 2272 return 0;
2222} 2273}
2223 2274
2224static int wrmsr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2275static int wrmsr_interception(struct vcpu_svm *svm)
2225{ 2276{
2226 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX]; 2277 u32 ecx = svm->vcpu.arch.regs[VCPU_REGS_RCX];
2227 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u) 2278 u64 data = (svm->vcpu.arch.regs[VCPU_REGS_RAX] & -1u)
2228 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32); 2279 | ((u64)(svm->vcpu.arch.regs[VCPU_REGS_RDX] & -1u) << 32);
2229 2280
2230 trace_kvm_msr_write(ecx, data);
2231 2281
2232 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2; 2282 svm->next_rip = kvm_rip_read(&svm->vcpu) + 2;
2233 if (svm_set_msr(&svm->vcpu, ecx, data)) 2283 if (svm_set_msr(&svm->vcpu, ecx, data)) {
2284 trace_kvm_msr_write_ex(ecx, data);
2234 kvm_inject_gp(&svm->vcpu, 0); 2285 kvm_inject_gp(&svm->vcpu, 0);
2235 else 2286 } else {
2287 trace_kvm_msr_write(ecx, data);
2236 skip_emulated_instruction(&svm->vcpu); 2288 skip_emulated_instruction(&svm->vcpu);
2289 }
2237 return 1; 2290 return 1;
2238} 2291}
2239 2292
2240static int msr_interception(struct vcpu_svm *svm, struct kvm_run *kvm_run) 2293static int msr_interception(struct vcpu_svm *svm)
2241{ 2294{
2242 if (svm->vmcb->control.exit_info_1) 2295 if (svm->vmcb->control.exit_info_1)
2243 return wrmsr_interception(svm, kvm_run); 2296 return wrmsr_interception(svm);
2244 else 2297 else
2245 return rdmsr_interception(svm, kvm_run); 2298 return rdmsr_interception(svm);
2246} 2299}
2247 2300
2248static int interrupt_window_interception(struct vcpu_svm *svm, 2301static int interrupt_window_interception(struct vcpu_svm *svm)
2249 struct kvm_run *kvm_run)
2250{ 2302{
2303 struct kvm_run *kvm_run = svm->vcpu.run;
2304
2251 svm_clear_vintr(svm); 2305 svm_clear_vintr(svm);
2252 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK; 2306 svm->vmcb->control.int_ctl &= ~V_IRQ_MASK;
2253 /* 2307 /*
@@ -2265,13 +2319,18 @@ static int interrupt_window_interception(struct vcpu_svm *svm,
2265 return 1; 2319 return 1;
2266} 2320}
2267 2321
2268static int (*svm_exit_handlers[])(struct vcpu_svm *svm, 2322static int pause_interception(struct vcpu_svm *svm)
2269 struct kvm_run *kvm_run) = { 2323{
2324 kvm_vcpu_on_spin(&(svm->vcpu));
2325 return 1;
2326}
2327
2328static int (*svm_exit_handlers[])(struct vcpu_svm *svm) = {
2270 [SVM_EXIT_READ_CR0] = emulate_on_interception, 2329 [SVM_EXIT_READ_CR0] = emulate_on_interception,
2271 [SVM_EXIT_READ_CR3] = emulate_on_interception, 2330 [SVM_EXIT_READ_CR3] = emulate_on_interception,
2272 [SVM_EXIT_READ_CR4] = emulate_on_interception, 2331 [SVM_EXIT_READ_CR4] = emulate_on_interception,
2273 [SVM_EXIT_READ_CR8] = emulate_on_interception, 2332 [SVM_EXIT_READ_CR8] = emulate_on_interception,
2274 /* for now: */ 2333 [SVM_EXIT_CR0_SEL_WRITE] = emulate_on_interception,
2275 [SVM_EXIT_WRITE_CR0] = emulate_on_interception, 2334 [SVM_EXIT_WRITE_CR0] = emulate_on_interception,
2276 [SVM_EXIT_WRITE_CR3] = emulate_on_interception, 2335 [SVM_EXIT_WRITE_CR3] = emulate_on_interception,
2277 [SVM_EXIT_WRITE_CR4] = emulate_on_interception, 2336 [SVM_EXIT_WRITE_CR4] = emulate_on_interception,
@@ -2280,11 +2339,17 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2280 [SVM_EXIT_READ_DR1] = emulate_on_interception, 2339 [SVM_EXIT_READ_DR1] = emulate_on_interception,
2281 [SVM_EXIT_READ_DR2] = emulate_on_interception, 2340 [SVM_EXIT_READ_DR2] = emulate_on_interception,
2282 [SVM_EXIT_READ_DR3] = emulate_on_interception, 2341 [SVM_EXIT_READ_DR3] = emulate_on_interception,
2342 [SVM_EXIT_READ_DR4] = emulate_on_interception,
2343 [SVM_EXIT_READ_DR5] = emulate_on_interception,
2344 [SVM_EXIT_READ_DR6] = emulate_on_interception,
2345 [SVM_EXIT_READ_DR7] = emulate_on_interception,
2283 [SVM_EXIT_WRITE_DR0] = emulate_on_interception, 2346 [SVM_EXIT_WRITE_DR0] = emulate_on_interception,
2284 [SVM_EXIT_WRITE_DR1] = emulate_on_interception, 2347 [SVM_EXIT_WRITE_DR1] = emulate_on_interception,
2285 [SVM_EXIT_WRITE_DR2] = emulate_on_interception, 2348 [SVM_EXIT_WRITE_DR2] = emulate_on_interception,
2286 [SVM_EXIT_WRITE_DR3] = emulate_on_interception, 2349 [SVM_EXIT_WRITE_DR3] = emulate_on_interception,
2350 [SVM_EXIT_WRITE_DR4] = emulate_on_interception,
2287 [SVM_EXIT_WRITE_DR5] = emulate_on_interception, 2351 [SVM_EXIT_WRITE_DR5] = emulate_on_interception,
2352 [SVM_EXIT_WRITE_DR6] = emulate_on_interception,
2288 [SVM_EXIT_WRITE_DR7] = emulate_on_interception, 2353 [SVM_EXIT_WRITE_DR7] = emulate_on_interception,
2289 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, 2354 [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception,
2290 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, 2355 [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception,
@@ -2301,6 +2366,7 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2301 [SVM_EXIT_CPUID] = cpuid_interception, 2366 [SVM_EXIT_CPUID] = cpuid_interception,
2302 [SVM_EXIT_IRET] = iret_interception, 2367 [SVM_EXIT_IRET] = iret_interception,
2303 [SVM_EXIT_INVD] = emulate_on_interception, 2368 [SVM_EXIT_INVD] = emulate_on_interception,
2369 [SVM_EXIT_PAUSE] = pause_interception,
2304 [SVM_EXIT_HLT] = halt_interception, 2370 [SVM_EXIT_HLT] = halt_interception,
2305 [SVM_EXIT_INVLPG] = invlpg_interception, 2371 [SVM_EXIT_INVLPG] = invlpg_interception,
2306 [SVM_EXIT_INVLPGA] = invlpga_interception, 2372 [SVM_EXIT_INVLPGA] = invlpga_interception,
@@ -2314,26 +2380,36 @@ static int (*svm_exit_handlers[])(struct vcpu_svm *svm,
2314 [SVM_EXIT_VMSAVE] = vmsave_interception, 2380 [SVM_EXIT_VMSAVE] = vmsave_interception,
2315 [SVM_EXIT_STGI] = stgi_interception, 2381 [SVM_EXIT_STGI] = stgi_interception,
2316 [SVM_EXIT_CLGI] = clgi_interception, 2382 [SVM_EXIT_CLGI] = clgi_interception,
2317 [SVM_EXIT_SKINIT] = invalid_op_interception, 2383 [SVM_EXIT_SKINIT] = skinit_interception,
2318 [SVM_EXIT_WBINVD] = emulate_on_interception, 2384 [SVM_EXIT_WBINVD] = emulate_on_interception,
2319 [SVM_EXIT_MONITOR] = invalid_op_interception, 2385 [SVM_EXIT_MONITOR] = invalid_op_interception,
2320 [SVM_EXIT_MWAIT] = invalid_op_interception, 2386 [SVM_EXIT_MWAIT] = invalid_op_interception,
2321 [SVM_EXIT_NPF] = pf_interception, 2387 [SVM_EXIT_NPF] = pf_interception,
2322}; 2388};
2323 2389
2324static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 2390static int handle_exit(struct kvm_vcpu *vcpu)
2325{ 2391{
2326 struct vcpu_svm *svm = to_svm(vcpu); 2392 struct vcpu_svm *svm = to_svm(vcpu);
2393 struct kvm_run *kvm_run = vcpu->run;
2327 u32 exit_code = svm->vmcb->control.exit_code; 2394 u32 exit_code = svm->vmcb->control.exit_code;
2328 2395
2329 trace_kvm_exit(exit_code, svm->vmcb->save.rip); 2396 trace_kvm_exit(exit_code, svm->vmcb->save.rip);
2330 2397
2398 if (unlikely(svm->nested.exit_required)) {
2399 nested_svm_vmexit(svm);
2400 svm->nested.exit_required = false;
2401
2402 return 1;
2403 }
2404
2331 if (is_nested(svm)) { 2405 if (is_nested(svm)) {
2332 int vmexit; 2406 int vmexit;
2333 2407
2334 nsvm_printk("nested handle_exit: 0x%x | 0x%lx | 0x%lx | 0x%lx\n", 2408 trace_kvm_nested_vmexit(svm->vmcb->save.rip, exit_code,
2335 exit_code, svm->vmcb->control.exit_info_1, 2409 svm->vmcb->control.exit_info_1,
2336 svm->vmcb->control.exit_info_2, svm->vmcb->save.rip); 2410 svm->vmcb->control.exit_info_2,
2411 svm->vmcb->control.exit_int_info,
2412 svm->vmcb->control.exit_int_info_err);
2337 2413
2338 vmexit = nested_svm_exit_special(svm); 2414 vmexit = nested_svm_exit_special(svm);
2339 2415
@@ -2346,20 +2422,10 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2346 2422
2347 svm_complete_interrupts(svm); 2423 svm_complete_interrupts(svm);
2348 2424
2349 if (npt_enabled) { 2425 if (!(svm->vmcb->control.intercept_cr_write & INTERCEPT_CR0_MASK))
2350 int mmu_reload = 0;
2351 if ((vcpu->arch.cr0 ^ svm->vmcb->save.cr0) & X86_CR0_PG) {
2352 svm_set_cr0(vcpu, svm->vmcb->save.cr0);
2353 mmu_reload = 1;
2354 }
2355 vcpu->arch.cr0 = svm->vmcb->save.cr0; 2426 vcpu->arch.cr0 = svm->vmcb->save.cr0;
2427 if (npt_enabled)
2356 vcpu->arch.cr3 = svm->vmcb->save.cr3; 2428 vcpu->arch.cr3 = svm->vmcb->save.cr3;
2357 if (mmu_reload) {
2358 kvm_mmu_reset_context(vcpu);
2359 kvm_mmu_load(vcpu);
2360 }
2361 }
2362
2363 2429
2364 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { 2430 if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) {
2365 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 2431 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY;
@@ -2383,15 +2449,15 @@ static int handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
2383 return 0; 2449 return 0;
2384 } 2450 }
2385 2451
2386 return svm_exit_handlers[exit_code](svm, kvm_run); 2452 return svm_exit_handlers[exit_code](svm);
2387} 2453}
2388 2454
2389static void reload_tss(struct kvm_vcpu *vcpu) 2455static void reload_tss(struct kvm_vcpu *vcpu)
2390{ 2456{
2391 int cpu = raw_smp_processor_id(); 2457 int cpu = raw_smp_processor_id();
2392 2458
2393 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2459 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2394 svm_data->tss_desc->type = 9; /* available 32/64-bit TSS */ 2460 sd->tss_desc->type = 9; /* available 32/64-bit TSS */
2395 load_TR_desc(); 2461 load_TR_desc();
2396} 2462}
2397 2463
@@ -2399,12 +2465,12 @@ static void pre_svm_run(struct vcpu_svm *svm)
2399{ 2465{
2400 int cpu = raw_smp_processor_id(); 2466 int cpu = raw_smp_processor_id();
2401 2467
2402 struct svm_cpu_data *svm_data = per_cpu(svm_data, cpu); 2468 struct svm_cpu_data *sd = per_cpu(svm_data, cpu);
2403 2469
2404 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; 2470 svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING;
2405 /* FIXME: handle wraparound of asid_generation */ 2471 /* FIXME: handle wraparound of asid_generation */
2406 if (svm->asid_generation != svm_data->asid_generation) 2472 if (svm->asid_generation != sd->asid_generation)
2407 new_asid(svm, svm_data); 2473 new_asid(svm, sd);
2408} 2474}
2409 2475
2410static void svm_inject_nmi(struct kvm_vcpu *vcpu) 2476static void svm_inject_nmi(struct kvm_vcpu *vcpu)
@@ -2413,7 +2479,7 @@ static void svm_inject_nmi(struct kvm_vcpu *vcpu)
2413 2479
2414 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; 2480 svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI;
2415 vcpu->arch.hflags |= HF_NMI_MASK; 2481 vcpu->arch.hflags |= HF_NMI_MASK;
2416 svm->vmcb->control.intercept |= (1UL << INTERCEPT_IRET); 2482 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2417 ++vcpu->stat.nmi_injections; 2483 ++vcpu->stat.nmi_injections;
2418} 2484}
2419 2485
@@ -2460,20 +2526,47 @@ static int svm_nmi_allowed(struct kvm_vcpu *vcpu)
2460 !(svm->vcpu.arch.hflags & HF_NMI_MASK); 2526 !(svm->vcpu.arch.hflags & HF_NMI_MASK);
2461} 2527}
2462 2528
2529static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu)
2530{
2531 struct vcpu_svm *svm = to_svm(vcpu);
2532
2533 return !!(svm->vcpu.arch.hflags & HF_NMI_MASK);
2534}
2535
2536static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2537{
2538 struct vcpu_svm *svm = to_svm(vcpu);
2539
2540 if (masked) {
2541 svm->vcpu.arch.hflags |= HF_NMI_MASK;
2542 svm->vmcb->control.intercept |= (1ULL << INTERCEPT_IRET);
2543 } else {
2544 svm->vcpu.arch.hflags &= ~HF_NMI_MASK;
2545 svm->vmcb->control.intercept &= ~(1ULL << INTERCEPT_IRET);
2546 }
2547}
2548
2463static int svm_interrupt_allowed(struct kvm_vcpu *vcpu) 2549static int svm_interrupt_allowed(struct kvm_vcpu *vcpu)
2464{ 2550{
2465 struct vcpu_svm *svm = to_svm(vcpu); 2551 struct vcpu_svm *svm = to_svm(vcpu);
2466 struct vmcb *vmcb = svm->vmcb; 2552 struct vmcb *vmcb = svm->vmcb;
2467 return (vmcb->save.rflags & X86_EFLAGS_IF) && 2553 int ret;
2468 !(vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK) && 2554
2469 gif_set(svm) && 2555 if (!gif_set(svm) ||
2470 !(is_nested(svm) && (svm->vcpu.arch.hflags & HF_VINTR_MASK)); 2556 (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK))
2557 return 0;
2558
2559 ret = !!(vmcb->save.rflags & X86_EFLAGS_IF);
2560
2561 if (is_nested(svm))
2562 return ret && !(svm->vcpu.arch.hflags & HF_VINTR_MASK);
2563
2564 return ret;
2471} 2565}
2472 2566
2473static void enable_irq_window(struct kvm_vcpu *vcpu) 2567static void enable_irq_window(struct kvm_vcpu *vcpu)
2474{ 2568{
2475 struct vcpu_svm *svm = to_svm(vcpu); 2569 struct vcpu_svm *svm = to_svm(vcpu);
2476 nsvm_printk("Trying to open IRQ window\n");
2477 2570
2478 nested_svm_intr(svm); 2571 nested_svm_intr(svm);
2479 2572
@@ -2498,7 +2591,7 @@ static void enable_nmi_window(struct kvm_vcpu *vcpu)
2498 /* Something prevents NMI from being injected. Single step over 2591 /* Something prevents NMI from being injected. Single step over
2499 possible problem (IRET or exception injection or interrupt 2592 possible problem (IRET or exception injection or interrupt
2500 shadow) */ 2593 shadow) */
2501 vcpu->arch.singlestep = true; 2594 svm->nmi_singlestep = true;
2502 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); 2595 svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF);
2503 update_db_intercept(vcpu); 2596 update_db_intercept(vcpu);
2504} 2597}
@@ -2588,13 +2681,20 @@ static void svm_complete_interrupts(struct vcpu_svm *svm)
2588#define R "e" 2681#define R "e"
2589#endif 2682#endif
2590 2683
2591static void svm_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2684static void svm_vcpu_run(struct kvm_vcpu *vcpu)
2592{ 2685{
2593 struct vcpu_svm *svm = to_svm(vcpu); 2686 struct vcpu_svm *svm = to_svm(vcpu);
2594 u16 fs_selector; 2687 u16 fs_selector;
2595 u16 gs_selector; 2688 u16 gs_selector;
2596 u16 ldt_selector; 2689 u16 ldt_selector;
2597 2690
2691 /*
2692 * A vmexit emulation is required before the vcpu can be executed
2693 * again.
2694 */
2695 if (unlikely(svm->nested.exit_required))
2696 return;
2697
2598 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; 2698 svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX];
2599 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; 2699 svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP];
2600 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; 2700 svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP];
@@ -2727,12 +2827,6 @@ static void svm_set_cr3(struct kvm_vcpu *vcpu, unsigned long root)
2727 2827
2728 svm->vmcb->save.cr3 = root; 2828 svm->vmcb->save.cr3 = root;
2729 force_new_asid(vcpu); 2829 force_new_asid(vcpu);
2730
2731 if (vcpu->fpu_active) {
2732 svm->vmcb->control.intercept_exceptions |= (1 << NM_VECTOR);
2733 svm->vmcb->save.cr0 |= X86_CR0_TS;
2734 vcpu->fpu_active = 0;
2735 }
2736} 2830}
2737 2831
2738static int is_disabled(void) 2832static int is_disabled(void)
@@ -2781,6 +2875,10 @@ static u64 svm_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
2781 return 0; 2875 return 0;
2782} 2876}
2783 2877
2878static void svm_cpuid_update(struct kvm_vcpu *vcpu)
2879{
2880}
2881
2784static const struct trace_print_flags svm_exit_reasons_str[] = { 2882static const struct trace_print_flags svm_exit_reasons_str[] = {
2785 { SVM_EXIT_READ_CR0, "read_cr0" }, 2883 { SVM_EXIT_READ_CR0, "read_cr0" },
2786 { SVM_EXIT_READ_CR3, "read_cr3" }, 2884 { SVM_EXIT_READ_CR3, "read_cr3" },
@@ -2834,9 +2932,22 @@ static const struct trace_print_flags svm_exit_reasons_str[] = {
2834 { -1, NULL } 2932 { -1, NULL }
2835}; 2933};
2836 2934
2837static bool svm_gb_page_enable(void) 2935static int svm_get_lpage_level(void)
2838{ 2936{
2839 return true; 2937 return PT_PDPE_LEVEL;
2938}
2939
2940static bool svm_rdtscp_supported(void)
2941{
2942 return false;
2943}
2944
2945static void svm_fpu_deactivate(struct kvm_vcpu *vcpu)
2946{
2947 struct vcpu_svm *svm = to_svm(vcpu);
2948
2949 update_cr0_intercept(svm);
2950 svm->vmcb->control.intercept_exceptions |= 1 << NM_VECTOR;
2840} 2951}
2841 2952
2842static struct kvm_x86_ops svm_x86_ops = { 2953static struct kvm_x86_ops svm_x86_ops = {
@@ -2865,6 +2976,7 @@ static struct kvm_x86_ops svm_x86_ops = {
2865 .set_segment = svm_set_segment, 2976 .set_segment = svm_set_segment,
2866 .get_cpl = svm_get_cpl, 2977 .get_cpl = svm_get_cpl,
2867 .get_cs_db_l_bits = kvm_get_cs_db_l_bits, 2978 .get_cs_db_l_bits = kvm_get_cs_db_l_bits,
2979 .decache_cr0_guest_bits = svm_decache_cr0_guest_bits,
2868 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits, 2980 .decache_cr4_guest_bits = svm_decache_cr4_guest_bits,
2869 .set_cr0 = svm_set_cr0, 2981 .set_cr0 = svm_set_cr0,
2870 .set_cr3 = svm_set_cr3, 2982 .set_cr3 = svm_set_cr3,
@@ -2879,6 +2991,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2879 .cache_reg = svm_cache_reg, 2991 .cache_reg = svm_cache_reg,
2880 .get_rflags = svm_get_rflags, 2992 .get_rflags = svm_get_rflags,
2881 .set_rflags = svm_set_rflags, 2993 .set_rflags = svm_set_rflags,
2994 .fpu_activate = svm_fpu_activate,
2995 .fpu_deactivate = svm_fpu_deactivate,
2882 2996
2883 .tlb_flush = svm_flush_tlb, 2997 .tlb_flush = svm_flush_tlb,
2884 2998
@@ -2893,6 +3007,8 @@ static struct kvm_x86_ops svm_x86_ops = {
2893 .queue_exception = svm_queue_exception, 3007 .queue_exception = svm_queue_exception,
2894 .interrupt_allowed = svm_interrupt_allowed, 3008 .interrupt_allowed = svm_interrupt_allowed,
2895 .nmi_allowed = svm_nmi_allowed, 3009 .nmi_allowed = svm_nmi_allowed,
3010 .get_nmi_mask = svm_get_nmi_mask,
3011 .set_nmi_mask = svm_set_nmi_mask,
2896 .enable_nmi_window = enable_nmi_window, 3012 .enable_nmi_window = enable_nmi_window,
2897 .enable_irq_window = enable_irq_window, 3013 .enable_irq_window = enable_irq_window,
2898 .update_cr8_intercept = update_cr8_intercept, 3014 .update_cr8_intercept = update_cr8_intercept,
@@ -2902,7 +3018,11 @@ static struct kvm_x86_ops svm_x86_ops = {
2902 .get_mt_mask = svm_get_mt_mask, 3018 .get_mt_mask = svm_get_mt_mask,
2903 3019
2904 .exit_reasons_str = svm_exit_reasons_str, 3020 .exit_reasons_str = svm_exit_reasons_str,
2905 .gb_page_enable = svm_gb_page_enable, 3021 .get_lpage_level = svm_get_lpage_level,
3022
3023 .cpuid_update = svm_cpuid_update,
3024
3025 .rdtscp_supported = svm_rdtscp_supported,
2906}; 3026};
2907 3027
2908static int __init svm_init(void) 3028static int __init svm_init(void)
diff --git a/arch/x86/kvm/trace.h b/arch/x86/kvm/trace.h
index 0d480e77eacf..6ad30a29f044 100644
--- a/arch/x86/kvm/trace.h
+++ b/arch/x86/kvm/trace.h
@@ -56,6 +56,38 @@ TRACE_EVENT(kvm_hypercall,
56); 56);
57 57
58/* 58/*
59 * Tracepoint for Hyper-V hypercall.
60 */
61TRACE_EVENT(kvm_hv_hypercall,
62 TP_PROTO(__u16 code, bool fast, __u16 rep_cnt, __u16 rep_idx,
63 __u64 ingpa, __u64 outgpa),
64 TP_ARGS(code, fast, rep_cnt, rep_idx, ingpa, outgpa),
65
66 TP_STRUCT__entry(
67 __field( __u16, code )
68 __field( bool, fast )
69 __field( __u16, rep_cnt )
70 __field( __u16, rep_idx )
71 __field( __u64, ingpa )
72 __field( __u64, outgpa )
73 ),
74
75 TP_fast_assign(
76 __entry->code = code;
77 __entry->fast = fast;
78 __entry->rep_cnt = rep_cnt;
79 __entry->rep_idx = rep_idx;
80 __entry->ingpa = ingpa;
81 __entry->outgpa = outgpa;
82 ),
83
84 TP_printk("code 0x%x %s cnt 0x%x idx 0x%x in 0x%llx out 0x%llx",
85 __entry->code, __entry->fast ? "fast" : "slow",
86 __entry->rep_cnt, __entry->rep_idx, __entry->ingpa,
87 __entry->outgpa)
88);
89
90/*
59 * Tracepoint for PIO. 91 * Tracepoint for PIO.
60 */ 92 */
61TRACE_EVENT(kvm_pio, 93TRACE_EVENT(kvm_pio,
@@ -214,28 +246,33 @@ TRACE_EVENT(kvm_page_fault,
214 * Tracepoint for guest MSR access. 246 * Tracepoint for guest MSR access.
215 */ 247 */
216TRACE_EVENT(kvm_msr, 248TRACE_EVENT(kvm_msr,
217 TP_PROTO(unsigned int rw, unsigned int ecx, unsigned long data), 249 TP_PROTO(unsigned write, u32 ecx, u64 data, bool exception),
218 TP_ARGS(rw, ecx, data), 250 TP_ARGS(write, ecx, data, exception),
219 251
220 TP_STRUCT__entry( 252 TP_STRUCT__entry(
221 __field( unsigned int, rw ) 253 __field( unsigned, write )
222 __field( unsigned int, ecx ) 254 __field( u32, ecx )
223 __field( unsigned long, data ) 255 __field( u64, data )
256 __field( u8, exception )
224 ), 257 ),
225 258
226 TP_fast_assign( 259 TP_fast_assign(
227 __entry->rw = rw; 260 __entry->write = write;
228 __entry->ecx = ecx; 261 __entry->ecx = ecx;
229 __entry->data = data; 262 __entry->data = data;
263 __entry->exception = exception;
230 ), 264 ),
231 265
232 TP_printk("msr_%s %x = 0x%lx", 266 TP_printk("msr_%s %x = 0x%llx%s",
233 __entry->rw ? "write" : "read", 267 __entry->write ? "write" : "read",
234 __entry->ecx, __entry->data) 268 __entry->ecx, __entry->data,
269 __entry->exception ? " (#GP)" : "")
235); 270);
236 271
237#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data) 272#define trace_kvm_msr_read(ecx, data) trace_kvm_msr(0, ecx, data, false)
238#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data) 273#define trace_kvm_msr_write(ecx, data) trace_kvm_msr(1, ecx, data, false)
274#define trace_kvm_msr_read_ex(ecx) trace_kvm_msr(0, ecx, 0, true)
275#define trace_kvm_msr_write_ex(ecx, data) trace_kvm_msr(1, ecx, data, true)
239 276
240/* 277/*
241 * Tracepoint for guest CR access. 278 * Tracepoint for guest CR access.
@@ -349,6 +386,171 @@ TRACE_EVENT(kvm_apic_accept_irq,
349 __entry->coalesced ? " (coalesced)" : "") 386 __entry->coalesced ? " (coalesced)" : "")
350); 387);
351 388
389/*
390 * Tracepoint for nested VMRUN
391 */
392TRACE_EVENT(kvm_nested_vmrun,
393 TP_PROTO(__u64 rip, __u64 vmcb, __u64 nested_rip, __u32 int_ctl,
394 __u32 event_inj, bool npt),
395 TP_ARGS(rip, vmcb, nested_rip, int_ctl, event_inj, npt),
396
397 TP_STRUCT__entry(
398 __field( __u64, rip )
399 __field( __u64, vmcb )
400 __field( __u64, nested_rip )
401 __field( __u32, int_ctl )
402 __field( __u32, event_inj )
403 __field( bool, npt )
404 ),
405
406 TP_fast_assign(
407 __entry->rip = rip;
408 __entry->vmcb = vmcb;
409 __entry->nested_rip = nested_rip;
410 __entry->int_ctl = int_ctl;
411 __entry->event_inj = event_inj;
412 __entry->npt = npt;
413 ),
414
415 TP_printk("rip: 0x%016llx vmcb: 0x%016llx nrip: 0x%016llx int_ctl: 0x%08x "
416 "event_inj: 0x%08x npt: %s\n",
417 __entry->rip, __entry->vmcb, __entry->nested_rip,
418 __entry->int_ctl, __entry->event_inj,
419 __entry->npt ? "on" : "off")
420);
421
422/*
423 * Tracepoint for #VMEXIT while nested
424 */
425TRACE_EVENT(kvm_nested_vmexit,
426 TP_PROTO(__u64 rip, __u32 exit_code,
427 __u64 exit_info1, __u64 exit_info2,
428 __u32 exit_int_info, __u32 exit_int_info_err),
429 TP_ARGS(rip, exit_code, exit_info1, exit_info2,
430 exit_int_info, exit_int_info_err),
431
432 TP_STRUCT__entry(
433 __field( __u64, rip )
434 __field( __u32, exit_code )
435 __field( __u64, exit_info1 )
436 __field( __u64, exit_info2 )
437 __field( __u32, exit_int_info )
438 __field( __u32, exit_int_info_err )
439 ),
440
441 TP_fast_assign(
442 __entry->rip = rip;
443 __entry->exit_code = exit_code;
444 __entry->exit_info1 = exit_info1;
445 __entry->exit_info2 = exit_info2;
446 __entry->exit_int_info = exit_int_info;
447 __entry->exit_int_info_err = exit_int_info_err;
448 ),
449 TP_printk("rip: 0x%016llx reason: %s ext_inf1: 0x%016llx "
450 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
451 __entry->rip,
452 ftrace_print_symbols_seq(p, __entry->exit_code,
453 kvm_x86_ops->exit_reasons_str),
454 __entry->exit_info1, __entry->exit_info2,
455 __entry->exit_int_info, __entry->exit_int_info_err)
456);
457
458/*
459 * Tracepoint for #VMEXIT reinjected to the guest
460 */
461TRACE_EVENT(kvm_nested_vmexit_inject,
462 TP_PROTO(__u32 exit_code,
463 __u64 exit_info1, __u64 exit_info2,
464 __u32 exit_int_info, __u32 exit_int_info_err),
465 TP_ARGS(exit_code, exit_info1, exit_info2,
466 exit_int_info, exit_int_info_err),
467
468 TP_STRUCT__entry(
469 __field( __u32, exit_code )
470 __field( __u64, exit_info1 )
471 __field( __u64, exit_info2 )
472 __field( __u32, exit_int_info )
473 __field( __u32, exit_int_info_err )
474 ),
475
476 TP_fast_assign(
477 __entry->exit_code = exit_code;
478 __entry->exit_info1 = exit_info1;
479 __entry->exit_info2 = exit_info2;
480 __entry->exit_int_info = exit_int_info;
481 __entry->exit_int_info_err = exit_int_info_err;
482 ),
483
484 TP_printk("reason: %s ext_inf1: 0x%016llx "
485 "ext_inf2: 0x%016llx ext_int: 0x%08x ext_int_err: 0x%08x\n",
486 ftrace_print_symbols_seq(p, __entry->exit_code,
487 kvm_x86_ops->exit_reasons_str),
488 __entry->exit_info1, __entry->exit_info2,
489 __entry->exit_int_info, __entry->exit_int_info_err)
490);
491
492/*
493 * Tracepoint for nested #vmexit because of interrupt pending
494 */
495TRACE_EVENT(kvm_nested_intr_vmexit,
496 TP_PROTO(__u64 rip),
497 TP_ARGS(rip),
498
499 TP_STRUCT__entry(
500 __field( __u64, rip )
501 ),
502
503 TP_fast_assign(
504 __entry->rip = rip
505 ),
506
507 TP_printk("rip: 0x%016llx\n", __entry->rip)
508);
509
510/*
511 * Tracepoint for the INVLPGA instruction
512 */
513TRACE_EVENT(kvm_invlpga,
514 TP_PROTO(__u64 rip, int asid, u64 address),
515 TP_ARGS(rip, asid, address),
516
517 TP_STRUCT__entry(
518 __field( __u64, rip )
519 __field( int, asid )
520 __field( __u64, address )
521 ),
522
523 TP_fast_assign(
524 __entry->rip = rip;
525 __entry->asid = asid;
526 __entry->address = address;
527 ),
528
529 TP_printk("rip: 0x%016llx asid: %d address: 0x%016llx\n",
530 __entry->rip, __entry->asid, __entry->address)
531);
532
533/*
534 * Tracepoint for the SKINIT instruction
535 */
536TRACE_EVENT(kvm_skinit,
537 TP_PROTO(__u64 rip, __u32 slb),
538 TP_ARGS(rip, slb),
539
540 TP_STRUCT__entry(
541 __field( __u64, rip )
542 __field( __u32, slb )
543 ),
544
545 TP_fast_assign(
546 __entry->rip = rip;
547 __entry->slb = slb;
548 ),
549
550 TP_printk("rip: 0x%016llx slb: 0x%08x\n",
551 __entry->rip, __entry->slb)
552);
553
352#endif /* _TRACE_KVM_H */ 554#endif /* _TRACE_KVM_H */
353 555
354/* This part must be outside protection */ 556/* This part must be outside protection */
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index ed53b42caba1..2f8db0ec8ae4 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -26,6 +26,7 @@
26#include <linux/sched.h> 26#include <linux/sched.h>
27#include <linux/moduleparam.h> 27#include <linux/moduleparam.h>
28#include <linux/ftrace_event.h> 28#include <linux/ftrace_event.h>
29#include <linux/slab.h>
29#include "kvm_cache_regs.h" 30#include "kvm_cache_regs.h"
30#include "x86.h" 31#include "x86.h"
31 32
@@ -61,12 +62,54 @@ module_param_named(unrestricted_guest,
61static int __read_mostly emulate_invalid_guest_state = 0; 62static int __read_mostly emulate_invalid_guest_state = 0;
62module_param(emulate_invalid_guest_state, bool, S_IRUGO); 63module_param(emulate_invalid_guest_state, bool, S_IRUGO);
63 64
65#define KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST \
66 (X86_CR0_WP | X86_CR0_NE | X86_CR0_NW | X86_CR0_CD)
67#define KVM_GUEST_CR0_MASK \
68 (KVM_GUEST_CR0_MASK_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
69#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST \
70 (X86_CR0_WP | X86_CR0_NE)
71#define KVM_VM_CR0_ALWAYS_ON \
72 (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE)
73#define KVM_CR4_GUEST_OWNED_BITS \
74 (X86_CR4_PVI | X86_CR4_DE | X86_CR4_PCE | X86_CR4_OSFXSR \
75 | X86_CR4_OSXMMEXCPT)
76
77#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
78#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)
79
80#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))
81
82/*
 83 * These two parameters are used to configure the controls for Pause-Loop Exiting:
 84 * ple_gap: upper bound on the amount of time between two successive
 85 * executions of PAUSE in a loop. Also indicates whether PLE is enabled.
 86 * According to tests, this time is usually smaller than 41 cycles.
 87 * ple_window: upper bound on the amount of time a guest is allowed to execute
 88 * in a PAUSE loop. Tests indicate that most spinlocks are held for
 89 * less than 2^12 cycles.
 90 * Time is measured based on a counter that runs at the same rate as the TSC;
 91 * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
92 */
93#define KVM_VMX_DEFAULT_PLE_GAP 41
94#define KVM_VMX_DEFAULT_PLE_WINDOW 4096
95static int ple_gap = KVM_VMX_DEFAULT_PLE_GAP;
96module_param(ple_gap, int, S_IRUGO);
97
98static int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
99module_param(ple_window, int, S_IRUGO);
100
64struct vmcs { 101struct vmcs {
65 u32 revision_id; 102 u32 revision_id;
66 u32 abort; 103 u32 abort;
67 char data[0]; 104 char data[0];
68}; 105};
69 106
107struct shared_msr_entry {
108 unsigned index;
109 u64 data;
110 u64 mask;
111};
112
70struct vcpu_vmx { 113struct vcpu_vmx {
71 struct kvm_vcpu vcpu; 114 struct kvm_vcpu vcpu;
72 struct list_head local_vcpus_link; 115 struct list_head local_vcpus_link;
@@ -74,13 +117,12 @@ struct vcpu_vmx {
74 int launched; 117 int launched;
75 u8 fail; 118 u8 fail;
76 u32 idt_vectoring_info; 119 u32 idt_vectoring_info;
77 struct kvm_msr_entry *guest_msrs; 120 struct shared_msr_entry *guest_msrs;
78 struct kvm_msr_entry *host_msrs;
79 int nmsrs; 121 int nmsrs;
80 int save_nmsrs; 122 int save_nmsrs;
81 int msr_offset_efer;
82#ifdef CONFIG_X86_64 123#ifdef CONFIG_X86_64
83 int msr_offset_kernel_gs_base; 124 u64 msr_host_kernel_gs_base;
125 u64 msr_guest_kernel_gs_base;
84#endif 126#endif
85 struct vmcs *vmcs; 127 struct vmcs *vmcs;
86 struct { 128 struct {
@@ -88,11 +130,10 @@ struct vcpu_vmx {
88 u16 fs_sel, gs_sel, ldt_sel; 130 u16 fs_sel, gs_sel, ldt_sel;
89 int gs_ldt_reload_needed; 131 int gs_ldt_reload_needed;
90 int fs_reload_needed; 132 int fs_reload_needed;
91 int guest_efer_loaded;
92 } host_state; 133 } host_state;
93 struct { 134 struct {
94 int vm86_active; 135 int vm86_active;
95 u8 save_iopl; 136 ulong save_rflags;
96 struct kvm_save_segment { 137 struct kvm_save_segment {
97 u16 selector; 138 u16 selector;
98 unsigned long base; 139 unsigned long base;
@@ -107,13 +148,14 @@ struct vcpu_vmx {
107 } rmode; 148 } rmode;
108 int vpid; 149 int vpid;
109 bool emulation_required; 150 bool emulation_required;
110 enum emulation_result invalid_state_emulation_result;
111 151
112 /* Support for vnmi-less CPUs */ 152 /* Support for vnmi-less CPUs */
113 int soft_vnmi_blocked; 153 int soft_vnmi_blocked;
114 ktime_t entry_time; 154 ktime_t entry_time;
115 s64 vnmi_blocked_time; 155 s64 vnmi_blocked_time;
116 u32 exit_reason; 156 u32 exit_reason;
157
158 bool rdtscp_enabled;
117}; 159};
118 160
119static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu) 161static inline struct vcpu_vmx *to_vmx(struct kvm_vcpu *vcpu)
@@ -176,6 +218,8 @@ static struct kvm_vmx_segment_field {
176 VMX_SEGMENT_FIELD(LDTR), 218 VMX_SEGMENT_FIELD(LDTR),
177}; 219};
178 220
221static u64 host_efer;
222
179static void ept_save_pdptrs(struct kvm_vcpu *vcpu); 223static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
180 224
181/* 225/*
@@ -184,28 +228,12 @@ static void ept_save_pdptrs(struct kvm_vcpu *vcpu);
184 */ 228 */
185static const u32 vmx_msr_index[] = { 229static const u32 vmx_msr_index[] = {
186#ifdef CONFIG_X86_64 230#ifdef CONFIG_X86_64
187 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR, MSR_KERNEL_GS_BASE, 231 MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
188#endif 232#endif
189 MSR_EFER, MSR_K6_STAR, 233 MSR_EFER, MSR_TSC_AUX, MSR_K6_STAR,
190}; 234};
191#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index) 235#define NR_VMX_MSR ARRAY_SIZE(vmx_msr_index)
192 236
193static void load_msrs(struct kvm_msr_entry *e, int n)
194{
195 int i;
196
197 for (i = 0; i < n; ++i)
198 wrmsrl(e[i].index, e[i].data);
199}
200
201static void save_msrs(struct kvm_msr_entry *e, int n)
202{
203 int i;
204
205 for (i = 0; i < n; ++i)
206 rdmsrl(e[i].index, e[i].data);
207}
208
209static inline int is_page_fault(u32 intr_info) 237static inline int is_page_fault(u32 intr_info)
210{ 238{
211 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK | 239 return (intr_info & (INTR_INFO_INTR_TYPE_MASK | INTR_INFO_VECTOR_MASK |
@@ -293,6 +321,11 @@ static inline bool cpu_has_vmx_ept_2m_page(void)
293 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT); 321 return !!(vmx_capability.ept & VMX_EPT_2MB_PAGE_BIT);
294} 322}
295 323
324static inline bool cpu_has_vmx_ept_1g_page(void)
325{
326 return !!(vmx_capability.ept & VMX_EPT_1GB_PAGE_BIT);
327}
328
296static inline int cpu_has_vmx_invept_individual_addr(void) 329static inline int cpu_has_vmx_invept_individual_addr(void)
297{ 330{
298 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT); 331 return !!(vmx_capability.ept & VMX_EPT_EXTENT_INDIVIDUAL_BIT);
@@ -320,11 +353,15 @@ static inline int cpu_has_vmx_unrestricted_guest(void)
320 SECONDARY_EXEC_UNRESTRICTED_GUEST; 353 SECONDARY_EXEC_UNRESTRICTED_GUEST;
321} 354}
322 355
356static inline int cpu_has_vmx_ple(void)
357{
358 return vmcs_config.cpu_based_2nd_exec_ctrl &
359 SECONDARY_EXEC_PAUSE_LOOP_EXITING;
360}
361
323static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm) 362static inline int vm_need_virtualize_apic_accesses(struct kvm *kvm)
324{ 363{
325 return flexpriority_enabled && 364 return flexpriority_enabled && irqchip_in_kernel(kvm);
326 (cpu_has_vmx_virtualize_apic_accesses()) &&
327 (irqchip_in_kernel(kvm));
328} 365}
329 366
330static inline int cpu_has_vmx_vpid(void) 367static inline int cpu_has_vmx_vpid(void)
@@ -333,6 +370,12 @@ static inline int cpu_has_vmx_vpid(void)
333 SECONDARY_EXEC_ENABLE_VPID; 370 SECONDARY_EXEC_ENABLE_VPID;
334} 371}
335 372
373static inline int cpu_has_vmx_rdtscp(void)
374{
375 return vmcs_config.cpu_based_2nd_exec_ctrl &
376 SECONDARY_EXEC_RDTSCP;
377}
378
336static inline int cpu_has_virtual_nmis(void) 379static inline int cpu_has_virtual_nmis(void)
337{ 380{
338 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS; 381 return vmcs_config.pin_based_exec_ctrl & PIN_BASED_VIRTUAL_NMIS;
@@ -348,7 +391,7 @@ static int __find_msr_index(struct vcpu_vmx *vmx, u32 msr)
348 int i; 391 int i;
349 392
350 for (i = 0; i < vmx->nmsrs; ++i) 393 for (i = 0; i < vmx->nmsrs; ++i)
351 if (vmx->guest_msrs[i].index == msr) 394 if (vmx_msr_index[vmx->guest_msrs[i].index] == msr)
352 return i; 395 return i;
353 return -1; 396 return -1;
354} 397}
@@ -379,7 +422,7 @@ static inline void __invept(int ext, u64 eptp, gpa_t gpa)
379 : : "a" (&operand), "c" (ext) : "cc", "memory"); 422 : : "a" (&operand), "c" (ext) : "cc", "memory");
380} 423}
381 424
382static struct kvm_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr) 425static struct shared_msr_entry *find_msr_entry(struct vcpu_vmx *vmx, u32 msr)
383{ 426{
384 int i; 427 int i;
385 428
@@ -537,22 +580,18 @@ static void update_exception_bitmap(struct kvm_vcpu *vcpu)
537{ 580{
538 u32 eb; 581 u32 eb;
539 582
540 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR); 583 eb = (1u << PF_VECTOR) | (1u << UD_VECTOR) | (1u << MC_VECTOR) |
541 if (!vcpu->fpu_active) 584 (1u << NM_VECTOR) | (1u << DB_VECTOR);
542 eb |= 1u << NM_VECTOR; 585 if ((vcpu->guest_debug &
543 /* 586 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP)) ==
544 * Unconditionally intercept #DB so we can maintain dr6 without 587 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_SW_BP))
545 * reading it every exit. 588 eb |= 1u << BP_VECTOR;
546 */
547 eb |= 1u << DB_VECTOR;
548 if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) {
549 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
550 eb |= 1u << BP_VECTOR;
551 }
552 if (to_vmx(vcpu)->rmode.vm86_active) 589 if (to_vmx(vcpu)->rmode.vm86_active)
553 eb = ~0; 590 eb = ~0;
554 if (enable_ept) 591 if (enable_ept)
555 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */ 592 eb &= ~(1u << PF_VECTOR); /* bypass_guest_pf = 0 */
593 if (vcpu->fpu_active)
594 eb &= ~(1u << NM_VECTOR);
556 vmcs_write32(EXCEPTION_BITMAP, eb); 595 vmcs_write32(EXCEPTION_BITMAP, eb);
557} 596}
558 597
@@ -570,17 +609,12 @@ static void reload_tss(void)
570 load_TR_desc(); 609 load_TR_desc();
571} 610}
572 611
573static void load_transition_efer(struct vcpu_vmx *vmx) 612static bool update_transition_efer(struct vcpu_vmx *vmx, int efer_offset)
574{ 613{
575 int efer_offset = vmx->msr_offset_efer;
576 u64 host_efer;
577 u64 guest_efer; 614 u64 guest_efer;
578 u64 ignore_bits; 615 u64 ignore_bits;
579 616
580 if (efer_offset < 0) 617 guest_efer = vmx->vcpu.arch.efer;
581 return;
582 host_efer = vmx->host_msrs[efer_offset].data;
583 guest_efer = vmx->guest_msrs[efer_offset].data;
584 618
585 /* 619 /*
586 * NX is emulated; LMA and LME handled by hardware; SCE meaningless 620
@@ -593,27 +627,17 @@ static void load_transition_efer(struct vcpu_vmx *vmx)
593 if (guest_efer & EFER_LMA) 627 if (guest_efer & EFER_LMA)
594 ignore_bits &= ~(u64)EFER_SCE; 628 ignore_bits &= ~(u64)EFER_SCE;
595#endif 629#endif
596 if ((guest_efer & ~ignore_bits) == (host_efer & ~ignore_bits))
597 return;
598
599 vmx->host_state.guest_efer_loaded = 1;
600 guest_efer &= ~ignore_bits; 630 guest_efer &= ~ignore_bits;
601 guest_efer |= host_efer & ignore_bits; 631 guest_efer |= host_efer & ignore_bits;
602 wrmsrl(MSR_EFER, guest_efer); 632 vmx->guest_msrs[efer_offset].data = guest_efer;
603 vmx->vcpu.stat.efer_reload++; 633 vmx->guest_msrs[efer_offset].mask = ~ignore_bits;
604} 634 return true;
605
606static void reload_host_efer(struct vcpu_vmx *vmx)
607{
608 if (vmx->host_state.guest_efer_loaded) {
609 vmx->host_state.guest_efer_loaded = 0;
610 load_msrs(vmx->host_msrs + vmx->msr_offset_efer, 1);
611 }
612} 635}
613 636
614static void vmx_save_host_state(struct kvm_vcpu *vcpu) 637static void vmx_save_host_state(struct kvm_vcpu *vcpu)
615{ 638{
616 struct vcpu_vmx *vmx = to_vmx(vcpu); 639 struct vcpu_vmx *vmx = to_vmx(vcpu);
640 int i;
617 641
618 if (vmx->host_state.loaded) 642 if (vmx->host_state.loaded)
619 return; 643 return;
@@ -650,13 +674,15 @@ static void vmx_save_host_state(struct kvm_vcpu *vcpu)
650#endif 674#endif
651 675
652#ifdef CONFIG_X86_64 676#ifdef CONFIG_X86_64
653 if (is_long_mode(&vmx->vcpu)) 677 if (is_long_mode(&vmx->vcpu)) {
654 save_msrs(vmx->host_msrs + 678 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
655 vmx->msr_offset_kernel_gs_base, 1); 679 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
656 680 }
657#endif 681#endif
658 load_msrs(vmx->guest_msrs, vmx->save_nmsrs); 682 for (i = 0; i < vmx->save_nmsrs; ++i)
659 load_transition_efer(vmx); 683 kvm_set_shared_msr(vmx->guest_msrs[i].index,
684 vmx->guest_msrs[i].data,
685 vmx->guest_msrs[i].mask);
660} 686}
661 687
662static void __vmx_load_host_state(struct vcpu_vmx *vmx) 688static void __vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -684,9 +710,12 @@ static void __vmx_load_host_state(struct vcpu_vmx *vmx)
684 local_irq_restore(flags); 710 local_irq_restore(flags);
685 } 711 }
686 reload_tss(); 712 reload_tss();
687 save_msrs(vmx->guest_msrs, vmx->save_nmsrs); 713#ifdef CONFIG_X86_64
688 load_msrs(vmx->host_msrs, vmx->save_nmsrs); 714 if (is_long_mode(&vmx->vcpu)) {
689 reload_host_efer(vmx); 715 rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
716 wrmsrl(MSR_KERNEL_GS_BASE, vmx->msr_host_kernel_gs_base);
717 }
718#endif
690} 719}
691 720
692static void vmx_load_host_state(struct vcpu_vmx *vmx) 721static void vmx_load_host_state(struct vcpu_vmx *vmx)
@@ -763,38 +792,51 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
763 792
764static void vmx_fpu_activate(struct kvm_vcpu *vcpu) 793static void vmx_fpu_activate(struct kvm_vcpu *vcpu)
765{ 794{
795 ulong cr0;
796
766 if (vcpu->fpu_active) 797 if (vcpu->fpu_active)
767 return; 798 return;
768 vcpu->fpu_active = 1; 799 vcpu->fpu_active = 1;
769 vmcs_clear_bits(GUEST_CR0, X86_CR0_TS); 800 cr0 = vmcs_readl(GUEST_CR0);
770 if (vcpu->arch.cr0 & X86_CR0_TS) 801 cr0 &= ~(X86_CR0_TS | X86_CR0_MP);
771 vmcs_set_bits(GUEST_CR0, X86_CR0_TS); 802 cr0 |= kvm_read_cr0_bits(vcpu, X86_CR0_TS | X86_CR0_MP);
803 vmcs_writel(GUEST_CR0, cr0);
772 update_exception_bitmap(vcpu); 804 update_exception_bitmap(vcpu);
805 vcpu->arch.cr0_guest_owned_bits = X86_CR0_TS;
806 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
773} 807}
774 808
809static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu);
810
775static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu) 811static void vmx_fpu_deactivate(struct kvm_vcpu *vcpu)
776{ 812{
777 if (!vcpu->fpu_active) 813 vmx_decache_cr0_guest_bits(vcpu);
778 return; 814 vmcs_set_bits(GUEST_CR0, X86_CR0_TS | X86_CR0_MP);
779 vcpu->fpu_active = 0;
780 vmcs_set_bits(GUEST_CR0, X86_CR0_TS);
781 update_exception_bitmap(vcpu); 815 update_exception_bitmap(vcpu);
816 vcpu->arch.cr0_guest_owned_bits = 0;
817 vmcs_writel(CR0_GUEST_HOST_MASK, ~vcpu->arch.cr0_guest_owned_bits);
818 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
782} 819}
783 820
784static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu) 821static unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
785{ 822{
786 unsigned long rflags; 823 unsigned long rflags, save_rflags;
787 824
788 rflags = vmcs_readl(GUEST_RFLAGS); 825 rflags = vmcs_readl(GUEST_RFLAGS);
789 if (to_vmx(vcpu)->rmode.vm86_active) 826 if (to_vmx(vcpu)->rmode.vm86_active) {
790 rflags &= ~(unsigned long)(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 827 rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
828 save_rflags = to_vmx(vcpu)->rmode.save_rflags;
829 rflags |= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
830 }
791 return rflags; 831 return rflags;
792} 832}
793 833
794static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) 834static void vmx_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
795{ 835{
796 if (to_vmx(vcpu)->rmode.vm86_active) 836 if (to_vmx(vcpu)->rmode.vm86_active) {
837 to_vmx(vcpu)->rmode.save_rflags = rflags;
797 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 838 rflags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
839 }
798 vmcs_writel(GUEST_RFLAGS, rflags); 840 vmcs_writel(GUEST_RFLAGS, rflags);
799} 841}
800 842
@@ -874,22 +916,22 @@ static void vmx_queue_exception(struct kvm_vcpu *vcpu, unsigned nr,
874 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info); 916 vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, intr_info);
875} 917}
876 918
919static bool vmx_rdtscp_supported(void)
920{
921 return cpu_has_vmx_rdtscp();
922}
923
877/* 924/*
878 * Swap MSR entry in host/guest MSR entry array. 925 * Swap MSR entry in host/guest MSR entry array.
879 */ 926 */
880#ifdef CONFIG_X86_64
881static void move_msr_up(struct vcpu_vmx *vmx, int from, int to) 927static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
882{ 928{
883 struct kvm_msr_entry tmp; 929 struct shared_msr_entry tmp;
884 930
885 tmp = vmx->guest_msrs[to]; 931 tmp = vmx->guest_msrs[to];
886 vmx->guest_msrs[to] = vmx->guest_msrs[from]; 932 vmx->guest_msrs[to] = vmx->guest_msrs[from];
887 vmx->guest_msrs[from] = tmp; 933 vmx->guest_msrs[from] = tmp;
888 tmp = vmx->host_msrs[to];
889 vmx->host_msrs[to] = vmx->host_msrs[from];
890 vmx->host_msrs[from] = tmp;
891} 934}
892#endif
893 935
894/* 936/*
895 * Set up the vmcs to automatically save and restore system 937 * Set up the vmcs to automatically save and restore system
@@ -898,15 +940,13 @@ static void move_msr_up(struct vcpu_vmx *vmx, int from, int to)
898 */ 940 */
899static void setup_msrs(struct vcpu_vmx *vmx) 941static void setup_msrs(struct vcpu_vmx *vmx)
900{ 942{
901 int save_nmsrs; 943 int save_nmsrs, index;
902 unsigned long *msr_bitmap; 944 unsigned long *msr_bitmap;
903 945
904 vmx_load_host_state(vmx); 946 vmx_load_host_state(vmx);
905 save_nmsrs = 0; 947 save_nmsrs = 0;
906#ifdef CONFIG_X86_64 948#ifdef CONFIG_X86_64
907 if (is_long_mode(&vmx->vcpu)) { 949 if (is_long_mode(&vmx->vcpu)) {
908 int index;
909
910 index = __find_msr_index(vmx, MSR_SYSCALL_MASK); 950 index = __find_msr_index(vmx, MSR_SYSCALL_MASK);
911 if (index >= 0) 951 if (index >= 0)
912 move_msr_up(vmx, index, save_nmsrs++); 952 move_msr_up(vmx, index, save_nmsrs++);
@@ -916,25 +956,23 @@ static void setup_msrs(struct vcpu_vmx *vmx)
916 index = __find_msr_index(vmx, MSR_CSTAR); 956 index = __find_msr_index(vmx, MSR_CSTAR);
917 if (index >= 0) 957 if (index >= 0)
918 move_msr_up(vmx, index, save_nmsrs++); 958 move_msr_up(vmx, index, save_nmsrs++);
919 index = __find_msr_index(vmx, MSR_KERNEL_GS_BASE); 959 index = __find_msr_index(vmx, MSR_TSC_AUX);
920 if (index >= 0) 960 if (index >= 0 && vmx->rdtscp_enabled)
921 move_msr_up(vmx, index, save_nmsrs++); 961 move_msr_up(vmx, index, save_nmsrs++);
922 /* 962 /*
923 * MSR_K6_STAR is only needed on long mode guests, and only 963 * MSR_K6_STAR is only needed on long mode guests, and only
924 * if efer.sce is enabled. 964 * if efer.sce is enabled.
925 */ 965 */
926 index = __find_msr_index(vmx, MSR_K6_STAR); 966 index = __find_msr_index(vmx, MSR_K6_STAR);
927 if ((index >= 0) && (vmx->vcpu.arch.shadow_efer & EFER_SCE)) 967 if ((index >= 0) && (vmx->vcpu.arch.efer & EFER_SCE))
928 move_msr_up(vmx, index, save_nmsrs++); 968 move_msr_up(vmx, index, save_nmsrs++);
929 } 969 }
930#endif 970#endif
931 vmx->save_nmsrs = save_nmsrs; 971 index = __find_msr_index(vmx, MSR_EFER);
972 if (index >= 0 && update_transition_efer(vmx, index))
973 move_msr_up(vmx, index, save_nmsrs++);
932 974
933#ifdef CONFIG_X86_64 975 vmx->save_nmsrs = save_nmsrs;
934 vmx->msr_offset_kernel_gs_base =
935 __find_msr_index(vmx, MSR_KERNEL_GS_BASE);
936#endif
937 vmx->msr_offset_efer = __find_msr_index(vmx, MSR_EFER);
938 976
939 if (cpu_has_vmx_msr_bitmap()) { 977 if (cpu_has_vmx_msr_bitmap()) {
940 if (is_long_mode(&vmx->vcpu)) 978 if (is_long_mode(&vmx->vcpu))
@@ -976,7 +1014,7 @@ static void guest_write_tsc(u64 guest_tsc, u64 host_tsc)
976static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata) 1014static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
977{ 1015{
978 u64 data; 1016 u64 data;
979 struct kvm_msr_entry *msr; 1017 struct shared_msr_entry *msr;
980 1018
981 if (!pdata) { 1019 if (!pdata) {
982 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n"); 1020 printk(KERN_ERR "BUG: get_msr called with NULL pdata\n");
@@ -991,9 +1029,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
991 case MSR_GS_BASE: 1029 case MSR_GS_BASE:
992 data = vmcs_readl(GUEST_GS_BASE); 1030 data = vmcs_readl(GUEST_GS_BASE);
993 break; 1031 break;
1032 case MSR_KERNEL_GS_BASE:
1033 vmx_load_host_state(to_vmx(vcpu));
1034 data = to_vmx(vcpu)->msr_guest_kernel_gs_base;
1035 break;
1036#endif
994 case MSR_EFER: 1037 case MSR_EFER:
995 return kvm_get_msr_common(vcpu, msr_index, pdata); 1038 return kvm_get_msr_common(vcpu, msr_index, pdata);
996#endif
997 case MSR_IA32_TSC: 1039 case MSR_IA32_TSC:
998 data = guest_read_tsc(); 1040 data = guest_read_tsc();
999 break; 1041 break;
@@ -1006,7 +1048,12 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1006 case MSR_IA32_SYSENTER_ESP: 1048 case MSR_IA32_SYSENTER_ESP:
1007 data = vmcs_readl(GUEST_SYSENTER_ESP); 1049 data = vmcs_readl(GUEST_SYSENTER_ESP);
1008 break; 1050 break;
1051 case MSR_TSC_AUX:
1052 if (!to_vmx(vcpu)->rdtscp_enabled)
1053 return 1;
1054 /* Otherwise falls through */
1009 default: 1055 default:
1056 vmx_load_host_state(to_vmx(vcpu));
1010 msr = find_msr_entry(to_vmx(vcpu), msr_index); 1057 msr = find_msr_entry(to_vmx(vcpu), msr_index);
1011 if (msr) { 1058 if (msr) {
1012 vmx_load_host_state(to_vmx(vcpu)); 1059 vmx_load_host_state(to_vmx(vcpu));
@@ -1028,7 +1075,7 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 *pdata)
1028static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data) 1075static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1029{ 1076{
1030 struct vcpu_vmx *vmx = to_vmx(vcpu); 1077 struct vcpu_vmx *vmx = to_vmx(vcpu);
1031 struct kvm_msr_entry *msr; 1078 struct shared_msr_entry *msr;
1032 u64 host_tsc; 1079 u64 host_tsc;
1033 int ret = 0; 1080 int ret = 0;
1034 1081
@@ -1044,6 +1091,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1044 case MSR_GS_BASE: 1091 case MSR_GS_BASE:
1045 vmcs_writel(GUEST_GS_BASE, data); 1092 vmcs_writel(GUEST_GS_BASE, data);
1046 break; 1093 break;
1094 case MSR_KERNEL_GS_BASE:
1095 vmx_load_host_state(vmx);
1096 vmx->msr_guest_kernel_gs_base = data;
1097 break;
1047#endif 1098#endif
1048 case MSR_IA32_SYSENTER_CS: 1099 case MSR_IA32_SYSENTER_CS:
1049 vmcs_write32(GUEST_SYSENTER_CS, data); 1100 vmcs_write32(GUEST_SYSENTER_CS, data);
@@ -1064,7 +1115,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, u32 msr_index, u64 data)
1064 vcpu->arch.pat = data; 1115 vcpu->arch.pat = data;
1065 break; 1116 break;
1066 } 1117 }
1067 /* Otherwise falls through to kvm_set_msr_common */ 1118 ret = kvm_set_msr_common(vcpu, msr_index, data);
1119 break;
1120 case MSR_TSC_AUX:
1121 if (!vmx->rdtscp_enabled)
1122 return 1;
1123 /* Check reserved bit, higher 32 bits should be zero */
1124 if ((data >> 32) != 0)
1125 return 1;
1126 /* Otherwise falls through */
1068 default: 1127 default:
1069 msr = find_msr_entry(vmx, msr_index); 1128 msr = find_msr_entry(vmx, msr_index);
1070 if (msr) { 1129 if (msr) {
@@ -1097,30 +1156,14 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg)
1097 } 1156 }
1098} 1157}
1099 1158
1100static int set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) 1159static void set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg)
1101{ 1160{
1102 int old_debug = vcpu->guest_debug;
1103 unsigned long flags;
1104
1105 vcpu->guest_debug = dbg->control;
1106 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
1107 vcpu->guest_debug = 0;
1108
1109 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) 1161 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
1110 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]); 1162 vmcs_writel(GUEST_DR7, dbg->arch.debugreg[7]);
1111 else 1163 else
1112 vmcs_writel(GUEST_DR7, vcpu->arch.dr7); 1164 vmcs_writel(GUEST_DR7, vcpu->arch.dr7);
1113 1165
1114 flags = vmcs_readl(GUEST_RFLAGS);
1115 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
1116 flags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
1117 else if (old_debug & KVM_GUESTDBG_SINGLESTEP)
1118 flags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
1119 vmcs_writel(GUEST_RFLAGS, flags);
1120
1121 update_exception_bitmap(vcpu); 1166 update_exception_bitmap(vcpu);
1122
1123 return 0;
1124} 1167}
1125 1168
1126static __init int cpu_has_kvm_support(void) 1169static __init int cpu_has_kvm_support(void)
@@ -1139,12 +1182,15 @@ static __init int vmx_disabled_by_bios(void)
1139 /* locked but not enabled */ 1182 /* locked but not enabled */
1140} 1183}
1141 1184
1142static void hardware_enable(void *garbage) 1185static int hardware_enable(void *garbage)
1143{ 1186{
1144 int cpu = raw_smp_processor_id(); 1187 int cpu = raw_smp_processor_id();
1145 u64 phys_addr = __pa(per_cpu(vmxarea, cpu)); 1188 u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
1146 u64 old; 1189 u64 old;
1147 1190
1191 if (read_cr4() & X86_CR4_VMXE)
1192 return -EBUSY;
1193
1148 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu)); 1194 INIT_LIST_HEAD(&per_cpu(vcpus_on_cpu, cpu));
1149 rdmsrl(MSR_IA32_FEATURE_CONTROL, old); 1195 rdmsrl(MSR_IA32_FEATURE_CONTROL, old);
1150 if ((old & (FEATURE_CONTROL_LOCKED | 1196 if ((old & (FEATURE_CONTROL_LOCKED |
@@ -1159,6 +1205,10 @@ static void hardware_enable(void *garbage)
1159 asm volatile (ASM_VMX_VMXON_RAX 1205 asm volatile (ASM_VMX_VMXON_RAX
1160 : : "a"(&phys_addr), "m"(phys_addr) 1206 : : "a"(&phys_addr), "m"(phys_addr)
1161 : "memory", "cc"); 1207 : "memory", "cc");
1208
1209 ept_sync_global();
1210
1211 return 0;
1162} 1212}
1163 1213
1164static void vmclear_local_vcpus(void) 1214static void vmclear_local_vcpus(void)
@@ -1232,6 +1282,8 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1232 CPU_BASED_USE_IO_BITMAPS | 1282 CPU_BASED_USE_IO_BITMAPS |
1233 CPU_BASED_MOV_DR_EXITING | 1283 CPU_BASED_MOV_DR_EXITING |
1234 CPU_BASED_USE_TSC_OFFSETING | 1284 CPU_BASED_USE_TSC_OFFSETING |
1285 CPU_BASED_MWAIT_EXITING |
1286 CPU_BASED_MONITOR_EXITING |
1235 CPU_BASED_INVLPG_EXITING; 1287 CPU_BASED_INVLPG_EXITING;
1236 opt = CPU_BASED_TPR_SHADOW | 1288 opt = CPU_BASED_TPR_SHADOW |
1237 CPU_BASED_USE_MSR_BITMAPS | 1289 CPU_BASED_USE_MSR_BITMAPS |
@@ -1250,7 +1302,9 @@ static __init int setup_vmcs_config(struct vmcs_config *vmcs_conf)
1250 SECONDARY_EXEC_WBINVD_EXITING | 1302 SECONDARY_EXEC_WBINVD_EXITING |
1251 SECONDARY_EXEC_ENABLE_VPID | 1303 SECONDARY_EXEC_ENABLE_VPID |
1252 SECONDARY_EXEC_ENABLE_EPT | 1304 SECONDARY_EXEC_ENABLE_EPT |
1253 SECONDARY_EXEC_UNRESTRICTED_GUEST; 1305 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1306 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
1307 SECONDARY_EXEC_RDTSCP;
1254 if (adjust_vmx_controls(min2, opt2, 1308 if (adjust_vmx_controls(min2, opt2,
1255 MSR_IA32_VMX_PROCBASED_CTLS2, 1309 MSR_IA32_VMX_PROCBASED_CTLS2,
1256 &_cpu_based_2nd_exec_control) < 0) 1310 &_cpu_based_2nd_exec_control) < 0)
@@ -1344,15 +1398,17 @@ static void free_kvm_area(void)
1344{ 1398{
1345 int cpu; 1399 int cpu;
1346 1400
1347 for_each_online_cpu(cpu) 1401 for_each_possible_cpu(cpu) {
1348 free_vmcs(per_cpu(vmxarea, cpu)); 1402 free_vmcs(per_cpu(vmxarea, cpu));
1403 per_cpu(vmxarea, cpu) = NULL;
1404 }
1349} 1405}
1350 1406
1351static __init int alloc_kvm_area(void) 1407static __init int alloc_kvm_area(void)
1352{ 1408{
1353 int cpu; 1409 int cpu;
1354 1410
1355 for_each_online_cpu(cpu) { 1411 for_each_possible_cpu(cpu) {
1356 struct vmcs *vmcs; 1412 struct vmcs *vmcs;
1357 1413
1358 vmcs = alloc_vmcs_cpu(cpu); 1414 vmcs = alloc_vmcs_cpu(cpu);
@@ -1394,6 +1450,9 @@ static __init int hardware_setup(void)
1394 if (enable_ept && !cpu_has_vmx_ept_2m_page()) 1450 if (enable_ept && !cpu_has_vmx_ept_2m_page())
1395 kvm_disable_largepages(); 1451 kvm_disable_largepages();
1396 1452
1453 if (!cpu_has_vmx_ple())
1454 ple_gap = 0;
1455
1397 return alloc_kvm_area(); 1456 return alloc_kvm_area();
1398} 1457}
1399 1458
@@ -1431,8 +1490,8 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1431 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar); 1490 vmcs_write32(GUEST_TR_AR_BYTES, vmx->rmode.tr.ar);
1432 1491
1433 flags = vmcs_readl(GUEST_RFLAGS); 1492 flags = vmcs_readl(GUEST_RFLAGS);
1434 flags &= ~(X86_EFLAGS_IOPL | X86_EFLAGS_VM); 1493 flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1435 flags |= (vmx->rmode.save_iopl << IOPL_SHIFT); 1494 flags |= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1436 vmcs_writel(GUEST_RFLAGS, flags); 1495 vmcs_writel(GUEST_RFLAGS, flags);
1437 1496
1438 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) | 1497 vmcs_writel(GUEST_CR4, (vmcs_readl(GUEST_CR4) & ~X86_CR4_VME) |
@@ -1459,8 +1518,12 @@ static void enter_pmode(struct kvm_vcpu *vcpu)
1459static gva_t rmode_tss_base(struct kvm *kvm) 1518static gva_t rmode_tss_base(struct kvm *kvm)
1460{ 1519{
1461 if (!kvm->arch.tss_addr) { 1520 if (!kvm->arch.tss_addr) {
1462 gfn_t base_gfn = kvm->memslots[0].base_gfn + 1521 struct kvm_memslots *slots;
1463 kvm->memslots[0].npages - 3; 1522 gfn_t base_gfn;
1523
1524 slots = rcu_dereference(kvm->memslots);
1525 base_gfn = kvm->memslots->memslots[0].base_gfn +
1526 kvm->memslots->memslots[0].npages - 3;
1464 return base_gfn << PAGE_SHIFT; 1527 return base_gfn << PAGE_SHIFT;
1465 } 1528 }
1466 return kvm->arch.tss_addr; 1529 return kvm->arch.tss_addr;
@@ -1501,8 +1564,7 @@ static void enter_rmode(struct kvm_vcpu *vcpu)
1501 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b); 1564 vmcs_write32(GUEST_TR_AR_BYTES, 0x008b);
1502 1565
1503 flags = vmcs_readl(GUEST_RFLAGS); 1566 flags = vmcs_readl(GUEST_RFLAGS);
1504 vmx->rmode.save_iopl 1567 vmx->rmode.save_rflags = flags;
1505 = (flags & X86_EFLAGS_IOPL) >> IOPL_SHIFT;
1506 1568
1507 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM; 1569 flags |= X86_EFLAGS_IOPL | X86_EFLAGS_VM;
1508 1570
@@ -1536,11 +1598,17 @@ continue_rmode:
1536static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) 1598static void vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
1537{ 1599{
1538 struct vcpu_vmx *vmx = to_vmx(vcpu); 1600 struct vcpu_vmx *vmx = to_vmx(vcpu);
1539 struct kvm_msr_entry *msr = find_msr_entry(vmx, MSR_EFER); 1601 struct shared_msr_entry *msr = find_msr_entry(vmx, MSR_EFER);
1540 1602
1541 vcpu->arch.shadow_efer = efer;
1542 if (!msr) 1603 if (!msr)
1543 return; 1604 return;
1605
1606 /*
1607 * Force kernel_gs_base reloading before EFER changes, as control
1608 * of this msr depends on is_long_mode().
1609 */
1610 vmx_load_host_state(to_vmx(vcpu));
1611 vcpu->arch.efer = efer;
1544 if (efer & EFER_LMA) { 1612 if (efer & EFER_LMA) {
1545 vmcs_write32(VM_ENTRY_CONTROLS, 1613 vmcs_write32(VM_ENTRY_CONTROLS,
1546 vmcs_read32(VM_ENTRY_CONTROLS) | 1614 vmcs_read32(VM_ENTRY_CONTROLS) |
@@ -1570,13 +1638,13 @@ static void enter_lmode(struct kvm_vcpu *vcpu)
1570 (guest_tr_ar & ~AR_TYPE_MASK) 1638 (guest_tr_ar & ~AR_TYPE_MASK)
1571 | AR_TYPE_BUSY_64_TSS); 1639 | AR_TYPE_BUSY_64_TSS);
1572 } 1640 }
1573 vcpu->arch.shadow_efer |= EFER_LMA; 1641 vcpu->arch.efer |= EFER_LMA;
1574 vmx_set_efer(vcpu, vcpu->arch.shadow_efer); 1642 vmx_set_efer(vcpu, vcpu->arch.efer);
1575} 1643}
1576 1644
1577static void exit_lmode(struct kvm_vcpu *vcpu) 1645static void exit_lmode(struct kvm_vcpu *vcpu)
1578{ 1646{
1579 vcpu->arch.shadow_efer &= ~EFER_LMA; 1647 vcpu->arch.efer &= ~EFER_LMA;
1580 1648
1581 vmcs_write32(VM_ENTRY_CONTROLS, 1649 vmcs_write32(VM_ENTRY_CONTROLS,
1582 vmcs_read32(VM_ENTRY_CONTROLS) 1650 vmcs_read32(VM_ENTRY_CONTROLS)
@@ -1592,10 +1660,20 @@ static void vmx_flush_tlb(struct kvm_vcpu *vcpu)
1592 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa)); 1660 ept_sync_context(construct_eptp(vcpu->arch.mmu.root_hpa));
1593} 1661}
1594 1662
1663static void vmx_decache_cr0_guest_bits(struct kvm_vcpu *vcpu)
1664{
1665 ulong cr0_guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
1666
1667 vcpu->arch.cr0 &= ~cr0_guest_owned_bits;
1668 vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & cr0_guest_owned_bits;
1669}
1670
1595static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu) 1671static void vmx_decache_cr4_guest_bits(struct kvm_vcpu *vcpu)
1596{ 1672{
1597 vcpu->arch.cr4 &= KVM_GUEST_CR4_MASK; 1673 ulong cr4_guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
1598 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & ~KVM_GUEST_CR4_MASK; 1674
1675 vcpu->arch.cr4 &= ~cr4_guest_owned_bits;
1676 vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & cr4_guest_owned_bits;
1599} 1677}
1600 1678
1601static void ept_load_pdptrs(struct kvm_vcpu *vcpu) 1679static void ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -1640,7 +1718,7 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1640 (CPU_BASED_CR3_LOAD_EXITING | 1718 (CPU_BASED_CR3_LOAD_EXITING |
1641 CPU_BASED_CR3_STORE_EXITING)); 1719 CPU_BASED_CR3_STORE_EXITING));
1642 vcpu->arch.cr0 = cr0; 1720 vcpu->arch.cr0 = cr0;
1643 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1721 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1644 } else if (!is_paging(vcpu)) { 1722 } else if (!is_paging(vcpu)) {
1645 /* From nonpaging to paging */ 1723 /* From nonpaging to paging */
1646 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL, 1724 vmcs_write32(CPU_BASED_VM_EXEC_CONTROL,
@@ -1648,23 +1726,13 @@ static void ept_update_paging_mode_cr0(unsigned long *hw_cr0,
1648 ~(CPU_BASED_CR3_LOAD_EXITING | 1726 ~(CPU_BASED_CR3_LOAD_EXITING |
1649 CPU_BASED_CR3_STORE_EXITING)); 1727 CPU_BASED_CR3_STORE_EXITING));
1650 vcpu->arch.cr0 = cr0; 1728 vcpu->arch.cr0 = cr0;
1651 vmx_set_cr4(vcpu, vcpu->arch.cr4); 1729 vmx_set_cr4(vcpu, kvm_read_cr4(vcpu));
1652 } 1730 }
1653 1731
1654 if (!(cr0 & X86_CR0_WP)) 1732 if (!(cr0 & X86_CR0_WP))
1655 *hw_cr0 &= ~X86_CR0_WP; 1733 *hw_cr0 &= ~X86_CR0_WP;
1656} 1734}
1657 1735
1658static void ept_update_paging_mode_cr4(unsigned long *hw_cr4,
1659 struct kvm_vcpu *vcpu)
1660{
1661 if (!is_paging(vcpu)) {
1662 *hw_cr4 &= ~X86_CR4_PAE;
1663 *hw_cr4 |= X86_CR4_PSE;
1664 } else if (!(vcpu->arch.cr4 & X86_CR4_PAE))
1665 *hw_cr4 &= ~X86_CR4_PAE;
1666}
1667
1668static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 1736static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1669{ 1737{
1670 struct vcpu_vmx *vmx = to_vmx(vcpu); 1738 struct vcpu_vmx *vmx = to_vmx(vcpu);
@@ -1676,8 +1744,6 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1676 else 1744 else
1677 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON; 1745 hw_cr0 = (cr0 & ~KVM_GUEST_CR0_MASK) | KVM_VM_CR0_ALWAYS_ON;
1678 1746
1679 vmx_fpu_deactivate(vcpu);
1680
1681 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) 1747 if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
1682 enter_pmode(vcpu); 1748 enter_pmode(vcpu);
1683 1749
@@ -1685,7 +1751,7 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1685 enter_rmode(vcpu); 1751 enter_rmode(vcpu);
1686 1752
1687#ifdef CONFIG_X86_64 1753#ifdef CONFIG_X86_64
1688 if (vcpu->arch.shadow_efer & EFER_LME) { 1754 if (vcpu->arch.efer & EFER_LME) {
1689 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) 1755 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG))
1690 enter_lmode(vcpu); 1756 enter_lmode(vcpu);
1691 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) 1757 if (is_paging(vcpu) && !(cr0 & X86_CR0_PG))
@@ -1696,12 +1762,12 @@ static void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
1696 if (enable_ept) 1762 if (enable_ept)
1697 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); 1763 ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu);
1698 1764
1765 if (!vcpu->fpu_active)
1766 hw_cr0 |= X86_CR0_TS | X86_CR0_MP;
1767
1699 vmcs_writel(CR0_READ_SHADOW, cr0); 1768 vmcs_writel(CR0_READ_SHADOW, cr0);
1700 vmcs_writel(GUEST_CR0, hw_cr0); 1769 vmcs_writel(GUEST_CR0, hw_cr0);
1701 vcpu->arch.cr0 = cr0; 1770 vcpu->arch.cr0 = cr0;
1702
1703 if (!(cr0 & X86_CR0_TS) || !(cr0 & X86_CR0_PE))
1704 vmx_fpu_activate(vcpu);
1705} 1771}
1706 1772
1707static u64 construct_eptp(unsigned long root_hpa) 1773static u64 construct_eptp(unsigned long root_hpa)
@@ -1727,12 +1793,11 @@ static void vmx_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
1727 vmcs_write64(EPT_POINTER, eptp); 1793 vmcs_write64(EPT_POINTER, eptp);
1728 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 : 1794 guest_cr3 = is_paging(vcpu) ? vcpu->arch.cr3 :
1729 vcpu->kvm->arch.ept_identity_map_addr; 1795 vcpu->kvm->arch.ept_identity_map_addr;
1796 ept_load_pdptrs(vcpu);
1730 } 1797 }
1731 1798
1732 vmx_flush_tlb(vcpu); 1799 vmx_flush_tlb(vcpu);
1733 vmcs_writel(GUEST_CR3, guest_cr3); 1800 vmcs_writel(GUEST_CR3, guest_cr3);
1734 if (vcpu->arch.cr0 & X86_CR0_PE)
1735 vmx_fpu_deactivate(vcpu);
1736} 1801}
1737 1802
1738static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 1803static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
@@ -1741,8 +1806,14 @@ static void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
1741 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON); 1806 KVM_RMODE_VM_CR4_ALWAYS_ON : KVM_PMODE_VM_CR4_ALWAYS_ON);
1742 1807
1743 vcpu->arch.cr4 = cr4; 1808 vcpu->arch.cr4 = cr4;
1744 if (enable_ept) 1809 if (enable_ept) {
1745 ept_update_paging_mode_cr4(&hw_cr4, vcpu); 1810 if (!is_paging(vcpu)) {
1811 hw_cr4 &= ~X86_CR4_PAE;
1812 hw_cr4 |= X86_CR4_PSE;
1813 } else if (!(cr4 & X86_CR4_PAE)) {
1814 hw_cr4 &= ~X86_CR4_PAE;
1815 }
1816 }
1746 1817
1747 vmcs_writel(CR4_READ_SHADOW, cr4); 1818 vmcs_writel(CR4_READ_SHADOW, cr4);
1748 vmcs_writel(GUEST_CR4, hw_cr4); 1819 vmcs_writel(GUEST_CR4, hw_cr4);
@@ -1780,7 +1851,7 @@ static void vmx_get_segment(struct kvm_vcpu *vcpu,
1780 1851
1781static int vmx_get_cpl(struct kvm_vcpu *vcpu) 1852static int vmx_get_cpl(struct kvm_vcpu *vcpu)
1782{ 1853{
1783 if (!(vcpu->arch.cr0 & X86_CR0_PE)) /* if real mode */ 1854 if (!is_protmode(vcpu))
1784 return 0; 1855 return 0;
1785 1856
1786 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */ 1857 if (vmx_get_rflags(vcpu) & X86_EFLAGS_VM) /* if virtual 8086 */
@@ -2035,7 +2106,7 @@ static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
2035static bool guest_state_valid(struct kvm_vcpu *vcpu) 2106static bool guest_state_valid(struct kvm_vcpu *vcpu)
2036{ 2107{
2037 /* real mode guest state checks */ 2108 /* real mode guest state checks */
2038 if (!(vcpu->arch.cr0 & X86_CR0_PE)) { 2109 if (!is_protmode(vcpu)) {
2039 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS)) 2110 if (!rmode_segment_valid(vcpu, VCPU_SREG_CS))
2040 return false; 2111 return false;
2041 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS)) 2112 if (!rmode_segment_valid(vcpu, VCPU_SREG_SS))
@@ -2168,7 +2239,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2168 struct kvm_userspace_memory_region kvm_userspace_mem; 2239 struct kvm_userspace_memory_region kvm_userspace_mem;
2169 int r = 0; 2240 int r = 0;
2170 2241
2171 down_write(&kvm->slots_lock); 2242 mutex_lock(&kvm->slots_lock);
2172 if (kvm->arch.apic_access_page) 2243 if (kvm->arch.apic_access_page)
2173 goto out; 2244 goto out;
2174 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT; 2245 kvm_userspace_mem.slot = APIC_ACCESS_PAGE_PRIVATE_MEMSLOT;
@@ -2181,7 +2252,7 @@ static int alloc_apic_access_page(struct kvm *kvm)
2181 2252
2182 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00); 2253 kvm->arch.apic_access_page = gfn_to_page(kvm, 0xfee00);
2183out: 2254out:
2184 up_write(&kvm->slots_lock); 2255 mutex_unlock(&kvm->slots_lock);
2185 return r; 2256 return r;
2186} 2257}
2187 2258
@@ -2190,7 +2261,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2190 struct kvm_userspace_memory_region kvm_userspace_mem; 2261 struct kvm_userspace_memory_region kvm_userspace_mem;
2191 int r = 0; 2262 int r = 0;
2192 2263
2193 down_write(&kvm->slots_lock); 2264 mutex_lock(&kvm->slots_lock);
2194 if (kvm->arch.ept_identity_pagetable) 2265 if (kvm->arch.ept_identity_pagetable)
2195 goto out; 2266 goto out;
2196 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT; 2267 kvm_userspace_mem.slot = IDENTITY_PAGETABLE_PRIVATE_MEMSLOT;
@@ -2205,7 +2276,7 @@ static int alloc_identity_pagetable(struct kvm *kvm)
2205 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm, 2276 kvm->arch.ept_identity_pagetable = gfn_to_page(kvm,
2206 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT); 2277 kvm->arch.ept_identity_map_addr >> PAGE_SHIFT);
2207out: 2278out:
2208 up_write(&kvm->slots_lock); 2279 mutex_unlock(&kvm->slots_lock);
2209 return r; 2280 return r;
2210} 2281}
2211 2282
@@ -2302,13 +2373,22 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2302 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES; 2373 ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
2303 if (vmx->vpid == 0) 2374 if (vmx->vpid == 0)
2304 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID; 2375 exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
2305 if (!enable_ept) 2376 if (!enable_ept) {
2306 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT; 2377 exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
2378 enable_unrestricted_guest = 0;
2379 }
2307 if (!enable_unrestricted_guest) 2380 if (!enable_unrestricted_guest)
2308 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST; 2381 exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
2382 if (!ple_gap)
2383 exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
2309 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control); 2384 vmcs_write32(SECONDARY_VM_EXEC_CONTROL, exec_control);
2310 } 2385 }
2311 2386
2387 if (ple_gap) {
2388 vmcs_write32(PLE_GAP, ple_gap);
2389 vmcs_write32(PLE_WINDOW, ple_window);
2390 }
2391
2312 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf); 2392 vmcs_write32(PAGE_FAULT_ERROR_CODE_MASK, !!bypass_guest_pf);
2313 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf); 2393 vmcs_write32(PAGE_FAULT_ERROR_CODE_MATCH, !!bypass_guest_pf);
2314 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */ 2394 vmcs_write32(CR3_TARGET_COUNT, 0); /* 22.2.1 */
@@ -2368,18 +2448,15 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2368 for (i = 0; i < NR_VMX_MSR; ++i) { 2448 for (i = 0; i < NR_VMX_MSR; ++i) {
2369 u32 index = vmx_msr_index[i]; 2449 u32 index = vmx_msr_index[i];
2370 u32 data_low, data_high; 2450 u32 data_low, data_high;
2371 u64 data;
2372 int j = vmx->nmsrs; 2451 int j = vmx->nmsrs;
2373 2452
2374 if (rdmsr_safe(index, &data_low, &data_high) < 0) 2453 if (rdmsr_safe(index, &data_low, &data_high) < 0)
2375 continue; 2454 continue;
2376 if (wrmsr_safe(index, data_low, data_high) < 0) 2455 if (wrmsr_safe(index, data_low, data_high) < 0)
2377 continue; 2456 continue;
2378 data = data_low | ((u64)data_high << 32); 2457 vmx->guest_msrs[j].index = i;
2379 vmx->host_msrs[j].index = index; 2458 vmx->guest_msrs[j].data = 0;
2380 vmx->host_msrs[j].reserved = 0; 2459 vmx->guest_msrs[j].mask = -1ull;
2381 vmx->host_msrs[j].data = data;
2382 vmx->guest_msrs[j] = vmx->host_msrs[j];
2383 ++vmx->nmsrs; 2460 ++vmx->nmsrs;
2384 } 2461 }
2385 2462
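
The rewritten loop above no longer mirrors host MSR values into a separate host_msrs array; it only records which MSRs exist (guest value 0, mask -1ull) and leaves the actual switching to the shared-MSR machinery added in x86.c further down in this diff. The probe idiom it relies on is sketched below; msr_present() is a hypothetical helper, not part of the patch:

    /* rdmsr_safe()/wrmsr_safe() return a negative value instead of
     * faulting when the MSR is not implemented on this host, so they
     * can be used to test for an MSR's presence. */
    static bool msr_present(u32 index)
    {
            u32 lo, hi;

            if (rdmsr_safe(index, &lo, &hi) < 0)
                    return false;
            return wrmsr_safe(index, lo, hi) >= 0;
    }
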
@@ -2389,7 +2466,10 @@ static int vmx_vcpu_setup(struct vcpu_vmx *vmx)
2389 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl); 2466 vmcs_write32(VM_ENTRY_CONTROLS, vmcs_config.vmentry_ctrl);
2390 2467
2391 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL); 2468 vmcs_writel(CR0_GUEST_HOST_MASK, ~0UL);
2392 vmcs_writel(CR4_GUEST_HOST_MASK, KVM_GUEST_CR4_MASK); 2469 vmx->vcpu.arch.cr4_guest_owned_bits = KVM_CR4_GUEST_OWNED_BITS;
2470 if (enable_ept)
2471 vmx->vcpu.arch.cr4_guest_owned_bits |= X86_CR4_PGE;
2472 vmcs_writel(CR4_GUEST_HOST_MASK, ~vmx->vcpu.arch.cr4_guest_owned_bits);
2393 2473
2394 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc; 2474 tsc_base = vmx->vcpu.kvm->arch.vm_init_tsc;
2395 rdtscll(tsc_this); 2475 rdtscll(tsc_this);
@@ -2414,10 +2494,10 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2414{ 2494{
2415 struct vcpu_vmx *vmx = to_vmx(vcpu); 2495 struct vcpu_vmx *vmx = to_vmx(vcpu);
2416 u64 msr; 2496 u64 msr;
2417 int ret; 2497 int ret, idx;
2418 2498
2419 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP)); 2499 vcpu->arch.regs_avail = ~((1 << VCPU_REGS_RIP) | (1 << VCPU_REGS_RSP));
2420 down_read(&vcpu->kvm->slots_lock); 2500 idx = srcu_read_lock(&vcpu->kvm->srcu);
2421 if (!init_rmode(vmx->vcpu.kvm)) { 2501 if (!init_rmode(vmx->vcpu.kvm)) {
2422 ret = -ENOMEM; 2502 ret = -ENOMEM;
2423 goto out; 2503 goto out;
@@ -2510,8 +2590,8 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2510 if (vmx->vpid != 0) 2590 if (vmx->vpid != 0)
2511 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid); 2591 vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->vpid);
2512 2592
2513 vmx->vcpu.arch.cr0 = 0x60000010; 2593 vmx->vcpu.arch.cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET;
2514 vmx_set_cr0(&vmx->vcpu, vmx->vcpu.arch.cr0); /* enter rmode */ 2594 vmx_set_cr0(&vmx->vcpu, kvm_read_cr0(vcpu)); /* enter rmode */
2515 vmx_set_cr4(&vmx->vcpu, 0); 2595 vmx_set_cr4(&vmx->vcpu, 0);
2516 vmx_set_efer(&vmx->vcpu, 0); 2596 vmx_set_efer(&vmx->vcpu, 0);
2517 vmx_fpu_activate(&vmx->vcpu); 2597 vmx_fpu_activate(&vmx->vcpu);
@@ -2525,7 +2605,7 @@ static int vmx_vcpu_reset(struct kvm_vcpu *vcpu)
2525 vmx->emulation_required = 0; 2605 vmx->emulation_required = 0;
2526 2606
2527out: 2607out:
2528 up_read(&vcpu->kvm->slots_lock); 2608 srcu_read_unlock(&vcpu->kvm->srcu, idx);
2529 return ret; 2609 return ret;
2530} 2610}
2531 2611
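
vmx_vcpu_reset() now guards the memslot access in init_rmode() with SRCU instead of the old slots_lock rwsem. The read-side pattern that replaces down_read()/up_read() is sketched below; example_with_memslots() is illustrative only, not code from the patch:

    static int example_with_memslots(struct kvm *kvm)
    {
            int idx, ret = 0;

            idx = srcu_read_lock(&kvm->srcu);      /* enter read-side section */
            /* ... walk or copy memslot state here ... */
            srcu_read_unlock(&kvm->srcu, idx);     /* the index must be passed back */
            return ret;
    }
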
@@ -2623,8 +2703,35 @@ static int vmx_nmi_allowed(struct kvm_vcpu *vcpu)
2623 return 0; 2703 return 0;
2624 2704
2625 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) & 2705 return !(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2626 (GUEST_INTR_STATE_STI | GUEST_INTR_STATE_MOV_SS | 2706 (GUEST_INTR_STATE_MOV_SS | GUEST_INTR_STATE_NMI));
2627 GUEST_INTR_STATE_NMI)); 2707}
2708
2709static bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
2710{
2711 if (!cpu_has_virtual_nmis())
2712 return to_vmx(vcpu)->soft_vnmi_blocked;
2713 else
2714 return !!(vmcs_read32(GUEST_INTERRUPTIBILITY_INFO) &
2715 GUEST_INTR_STATE_NMI);
2716}
2717
2718static void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
2719{
2720 struct vcpu_vmx *vmx = to_vmx(vcpu);
2721
2722 if (!cpu_has_virtual_nmis()) {
2723 if (vmx->soft_vnmi_blocked != masked) {
2724 vmx->soft_vnmi_blocked = masked;
2725 vmx->vnmi_blocked_time = 0;
2726 }
2727 } else {
2728 if (masked)
2729 vmcs_set_bits(GUEST_INTERRUPTIBILITY_INFO,
2730 GUEST_INTR_STATE_NMI);
2731 else
2732 vmcs_clear_bits(GUEST_INTERRUPTIBILITY_INFO,
2733 GUEST_INTR_STATE_NMI);
2734 }
2628} 2735}
2629 2736
2630static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu) 2737static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu)
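
The new vmx_get_nmi_mask()/vmx_set_nmi_mask() callbacks expose the guest's NMI-blocking state through kvm_x86_ops, falling back to the software soft_vnmi_blocked flag on CPUs without virtual NMIs. A hypothetical caller (not part of the patch) that snapshots and later reapplies that state could look like:

    static void save_and_restore_nmi_blocking(struct kvm_vcpu *vcpu)
    {
            bool masked = kvm_x86_ops->get_nmi_mask(vcpu);  /* snapshot */

            /* ... reset or migrate other vcpu state ... */

            kvm_x86_ops->set_nmi_mask(vcpu, masked);        /* reapply */
    }
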
@@ -2659,7 +2766,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2659 * Cause the #SS fault with 0 error code in VM86 mode. 2766 * Cause the #SS fault with 0 error code in VM86 mode.
2660 */ 2767 */
2661 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0) 2768 if (((vec == GP_VECTOR) || (vec == SS_VECTOR)) && err_code == 0)
2662 if (emulate_instruction(vcpu, NULL, 0, 0, 0) == EMULATE_DONE) 2769 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DONE)
2663 return 1; 2770 return 1;
2664 /* 2771 /*
2665 * Forward all other exceptions that are valid in real mode. 2772 * Forward all other exceptions that are valid in real mode.
@@ -2674,6 +2781,12 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
2674 kvm_queue_exception(vcpu, vec); 2781 kvm_queue_exception(vcpu, vec);
2675 return 1; 2782 return 1;
2676 case BP_VECTOR: 2783 case BP_VECTOR:
2784 /*
2785 * Update instruction length as we may reinject the exception
2786 * from user space while in guest debugging mode.
2787 */
2788 to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
2789 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2677 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) 2790 if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
2678 return 0; 2791 return 0;
2679 /* fall through */ 2792 /* fall through */
@@ -2710,15 +2823,16 @@ static void kvm_machine_check(void)
2710#endif 2823#endif
2711} 2824}
2712 2825
2713static int handle_machine_check(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2826static int handle_machine_check(struct kvm_vcpu *vcpu)
2714{ 2827{
2715 /* already handled by vcpu_run */ 2828 /* already handled by vcpu_run */
2716 return 1; 2829 return 1;
2717} 2830}
2718 2831
2719static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2832static int handle_exception(struct kvm_vcpu *vcpu)
2720{ 2833{
2721 struct vcpu_vmx *vmx = to_vmx(vcpu); 2834 struct vcpu_vmx *vmx = to_vmx(vcpu);
2835 struct kvm_run *kvm_run = vcpu->run;
2722 u32 intr_info, ex_no, error_code; 2836 u32 intr_info, ex_no, error_code;
2723 unsigned long cr2, rip, dr6; 2837 unsigned long cr2, rip, dr6;
2724 u32 vect_info; 2838 u32 vect_info;
@@ -2728,12 +2842,17 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2728 intr_info = vmcs_read32(VM_EXIT_INTR_INFO); 2842 intr_info = vmcs_read32(VM_EXIT_INTR_INFO);
2729 2843
2730 if (is_machine_check(intr_info)) 2844 if (is_machine_check(intr_info))
2731 return handle_machine_check(vcpu, kvm_run); 2845 return handle_machine_check(vcpu);
2732 2846
2733 if ((vect_info & VECTORING_INFO_VALID_MASK) && 2847 if ((vect_info & VECTORING_INFO_VALID_MASK) &&
2734 !is_page_fault(intr_info)) 2848 !is_page_fault(intr_info)) {
2735 printk(KERN_ERR "%s: unexpected, vectoring info 0x%x " 2849 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
2736 "intr info 0x%x\n", __func__, vect_info, intr_info); 2850 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
2851 vcpu->run->internal.ndata = 2;
2852 vcpu->run->internal.data[0] = vect_info;
2853 vcpu->run->internal.data[1] = intr_info;
2854 return 0;
2855 }
2737 2856
2738 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR) 2857 if ((intr_info & INTR_INFO_INTR_TYPE_MASK) == INTR_TYPE_NMI_INTR)
2739 return 1; /* already handled by vmx_vcpu_run() */ 2858 return 1; /* already handled by vmx_vcpu_run() */
@@ -2744,7 +2863,7 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2744 } 2863 }
2745 2864
2746 if (is_invalid_opcode(intr_info)) { 2865 if (is_invalid_opcode(intr_info)) {
2747 er = emulate_instruction(vcpu, kvm_run, 0, 0, EMULTYPE_TRAP_UD); 2866 er = emulate_instruction(vcpu, 0, 0, EMULTYPE_TRAP_UD);
2748 if (er != EMULATE_DONE) 2867 if (er != EMULATE_DONE)
2749 kvm_queue_exception(vcpu, UD_VECTOR); 2868 kvm_queue_exception(vcpu, UD_VECTOR);
2750 return 1; 2869 return 1;
@@ -2790,6 +2909,13 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2790 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7); 2909 kvm_run->debug.arch.dr7 = vmcs_readl(GUEST_DR7);
2791 /* fall through */ 2910 /* fall through */
2792 case BP_VECTOR: 2911 case BP_VECTOR:
2912 /*
2913 * Update instruction length as we may reinject #BP from
2914 * user space while in guest debugging mode. Reading it for
 2915	 * #DB as well causes no harm; it is not used in that case.
2916 */
2917 vmx->vcpu.arch.event_exit_inst_len =
2918 vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
2793 kvm_run->exit_reason = KVM_EXIT_DEBUG; 2919 kvm_run->exit_reason = KVM_EXIT_DEBUG;
2794 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip; 2920 kvm_run->debug.arch.pc = vmcs_readl(GUEST_CS_BASE) + rip;
2795 kvm_run->debug.arch.exception = ex_no; 2921 kvm_run->debug.arch.exception = ex_no;
@@ -2803,20 +2929,19 @@ static int handle_exception(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2803 return 0; 2929 return 0;
2804} 2930}
2805 2931
2806static int handle_external_interrupt(struct kvm_vcpu *vcpu, 2932static int handle_external_interrupt(struct kvm_vcpu *vcpu)
2807 struct kvm_run *kvm_run)
2808{ 2933{
2809 ++vcpu->stat.irq_exits; 2934 ++vcpu->stat.irq_exits;
2810 return 1; 2935 return 1;
2811} 2936}
2812 2937
2813static int handle_triple_fault(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2938static int handle_triple_fault(struct kvm_vcpu *vcpu)
2814{ 2939{
2815 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 2940 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
2816 return 0; 2941 return 0;
2817} 2942}
2818 2943
2819static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2944static int handle_io(struct kvm_vcpu *vcpu)
2820{ 2945{
2821 unsigned long exit_qualification; 2946 unsigned long exit_qualification;
2822 int size, in, string; 2947 int size, in, string;
@@ -2827,8 +2952,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2827 string = (exit_qualification & 16) != 0; 2952 string = (exit_qualification & 16) != 0;
2828 2953
2829 if (string) { 2954 if (string) {
2830 if (emulate_instruction(vcpu, 2955 if (emulate_instruction(vcpu, 0, 0, 0) == EMULATE_DO_MMIO)
2831 kvm_run, 0, 0, 0) == EMULATE_DO_MMIO)
2832 return 0; 2956 return 0;
2833 return 1; 2957 return 1;
2834 } 2958 }
@@ -2838,7 +2962,7 @@ static int handle_io(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2838 port = exit_qualification >> 16; 2962 port = exit_qualification >> 16;
2839 2963
2840 skip_emulated_instruction(vcpu); 2964 skip_emulated_instruction(vcpu);
2841 return kvm_emulate_pio(vcpu, kvm_run, in, size, port); 2965 return kvm_emulate_pio(vcpu, in, size, port);
2842} 2966}
2843 2967
2844static void 2968static void
@@ -2852,7 +2976,7 @@ vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
2852 hypercall[2] = 0xc1; 2976 hypercall[2] = 0xc1;
2853} 2977}
2854 2978
2855static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 2979static int handle_cr(struct kvm_vcpu *vcpu)
2856{ 2980{
2857 unsigned long exit_qualification, val; 2981 unsigned long exit_qualification, val;
2858 int cr; 2982 int cr;
@@ -2887,17 +3011,16 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2887 return 1; 3011 return 1;
2888 if (cr8_prev <= cr8) 3012 if (cr8_prev <= cr8)
2889 return 1; 3013 return 1;
2890 kvm_run->exit_reason = KVM_EXIT_SET_TPR; 3014 vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
2891 return 0; 3015 return 0;
2892 } 3016 }
2893 }; 3017 };
2894 break; 3018 break;
2895 case 2: /* clts */ 3019 case 2: /* clts */
2896 vmx_fpu_deactivate(vcpu); 3020 vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
2897 vcpu->arch.cr0 &= ~X86_CR0_TS; 3021 trace_kvm_cr_write(0, kvm_read_cr0(vcpu));
2898 vmcs_writel(CR0_READ_SHADOW, vcpu->arch.cr0);
2899 vmx_fpu_activate(vcpu);
2900 skip_emulated_instruction(vcpu); 3022 skip_emulated_instruction(vcpu);
3023 vmx_fpu_activate(vcpu);
2901 return 1; 3024 return 1;
2902 case 1: /*mov from cr*/ 3025 case 1: /*mov from cr*/
2903 switch (cr) { 3026 switch (cr) {
@@ -2915,25 +3038,37 @@ static int handle_cr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2915 } 3038 }
2916 break; 3039 break;
2917 case 3: /* lmsw */ 3040 case 3: /* lmsw */
2918 kvm_lmsw(vcpu, (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f); 3041 val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & 0x0f;
3042 trace_kvm_cr_write(0, (kvm_read_cr0(vcpu) & ~0xful) | val);
3043 kvm_lmsw(vcpu, val);
2919 3044
2920 skip_emulated_instruction(vcpu); 3045 skip_emulated_instruction(vcpu);
2921 return 1; 3046 return 1;
2922 default: 3047 default:
2923 break; 3048 break;
2924 } 3049 }
2925 kvm_run->exit_reason = 0; 3050 vcpu->run->exit_reason = 0;
2926 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n", 3051 pr_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
2927 (int)(exit_qualification >> 4) & 3, cr); 3052 (int)(exit_qualification >> 4) & 3, cr);
2928 return 0; 3053 return 0;
2929} 3054}
2930 3055
2931static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3056static int check_dr_alias(struct kvm_vcpu *vcpu)
3057{
3058 if (kvm_read_cr4_bits(vcpu, X86_CR4_DE)) {
3059 kvm_queue_exception(vcpu, UD_VECTOR);
3060 return -1;
3061 }
3062 return 0;
3063}
3064
3065static int handle_dr(struct kvm_vcpu *vcpu)
2932{ 3066{
2933 unsigned long exit_qualification; 3067 unsigned long exit_qualification;
2934 unsigned long val; 3068 unsigned long val;
2935 int dr, reg; 3069 int dr, reg;
2936 3070
 3071	/* Do not handle if CPL > 0; a #GP will be triggered on re-entry */
2937 if (!kvm_require_cpl(vcpu, 0)) 3072 if (!kvm_require_cpl(vcpu, 0))
2938 return 1; 3073 return 1;
2939 dr = vmcs_readl(GUEST_DR7); 3074 dr = vmcs_readl(GUEST_DR7);
@@ -2944,13 +3079,13 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2944 * guest debugging itself. 3079 * guest debugging itself.
2945 */ 3080 */
2946 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { 3081 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
2947 kvm_run->debug.arch.dr6 = vcpu->arch.dr6; 3082 vcpu->run->debug.arch.dr6 = vcpu->arch.dr6;
2948 kvm_run->debug.arch.dr7 = dr; 3083 vcpu->run->debug.arch.dr7 = dr;
2949 kvm_run->debug.arch.pc = 3084 vcpu->run->debug.arch.pc =
2950 vmcs_readl(GUEST_CS_BASE) + 3085 vmcs_readl(GUEST_CS_BASE) +
2951 vmcs_readl(GUEST_RIP); 3086 vmcs_readl(GUEST_RIP);
2952 kvm_run->debug.arch.exception = DB_VECTOR; 3087 vcpu->run->debug.arch.exception = DB_VECTOR;
2953 kvm_run->exit_reason = KVM_EXIT_DEBUG; 3088 vcpu->run->exit_reason = KVM_EXIT_DEBUG;
2954 return 0; 3089 return 0;
2955 } else { 3090 } else {
2956 vcpu->arch.dr7 &= ~DR7_GD; 3091 vcpu->arch.dr7 &= ~DR7_GD;
@@ -2969,14 +3104,20 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2969 case 0 ... 3: 3104 case 0 ... 3:
2970 val = vcpu->arch.db[dr]; 3105 val = vcpu->arch.db[dr];
2971 break; 3106 break;
3107 case 4:
3108 if (check_dr_alias(vcpu) < 0)
3109 return 1;
3110 /* fall through */
2972 case 6: 3111 case 6:
2973 val = vcpu->arch.dr6; 3112 val = vcpu->arch.dr6;
2974 break; 3113 break;
2975 case 7: 3114 case 5:
3115 if (check_dr_alias(vcpu) < 0)
3116 return 1;
3117 /* fall through */
3118 default: /* 7 */
2976 val = vcpu->arch.dr7; 3119 val = vcpu->arch.dr7;
2977 break; 3120 break;
2978 default:
2979 val = 0;
2980 } 3121 }
2981 kvm_register_write(vcpu, reg, val); 3122 kvm_register_write(vcpu, reg, val);
2982 } else { 3123 } else {
@@ -2987,21 +3128,25 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
2987 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) 3128 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
2988 vcpu->arch.eff_db[dr] = val; 3129 vcpu->arch.eff_db[dr] = val;
2989 break; 3130 break;
2990 case 4 ... 5: 3131 case 4:
2991 if (vcpu->arch.cr4 & X86_CR4_DE) 3132 if (check_dr_alias(vcpu) < 0)
2992 kvm_queue_exception(vcpu, UD_VECTOR); 3133 return 1;
2993 break; 3134 /* fall through */
2994 case 6: 3135 case 6:
2995 if (val & 0xffffffff00000000ULL) { 3136 if (val & 0xffffffff00000000ULL) {
2996 kvm_queue_exception(vcpu, GP_VECTOR); 3137 kvm_inject_gp(vcpu, 0);
2997 break; 3138 return 1;
2998 } 3139 }
2999 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1; 3140 vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
3000 break; 3141 break;
3001 case 7: 3142 case 5:
3143 if (check_dr_alias(vcpu) < 0)
3144 return 1;
3145 /* fall through */
3146 default: /* 7 */
3002 if (val & 0xffffffff00000000ULL) { 3147 if (val & 0xffffffff00000000ULL) {
3003 kvm_queue_exception(vcpu, GP_VECTOR); 3148 kvm_inject_gp(vcpu, 0);
3004 break; 3149 return 1;
3005 } 3150 }
3006 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1; 3151 vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
3007 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { 3152 if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
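
check_dr_alias() encodes the architectural DR4/DR5 rule that both the read and write paths above rely on: with CR4.DE clear, DR4/DR5 alias DR6/DR7; with CR4.DE set, any access to them raises #UD. The same rule can be expressed compactly as below; decode_dr() is a hypothetical helper, not the patch's code:

    static int decode_dr(struct kvm_vcpu *vcpu, int dr)
    {
            if (dr == 4 || dr == 5) {
                    if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
                            return -1;      /* caller injects #UD */
                    dr += 2;                /* alias onto DR6/DR7 */
            }
            return dr;
    }
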
@@ -3016,18 +3161,19 @@ static int handle_dr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3016 return 1; 3161 return 1;
3017} 3162}
3018 3163
3019static int handle_cpuid(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3164static int handle_cpuid(struct kvm_vcpu *vcpu)
3020{ 3165{
3021 kvm_emulate_cpuid(vcpu); 3166 kvm_emulate_cpuid(vcpu);
3022 return 1; 3167 return 1;
3023} 3168}
3024 3169
3025static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3170static int handle_rdmsr(struct kvm_vcpu *vcpu)
3026{ 3171{
3027 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3172 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3028 u64 data; 3173 u64 data;
3029 3174
3030 if (vmx_get_msr(vcpu, ecx, &data)) { 3175 if (vmx_get_msr(vcpu, ecx, &data)) {
3176 trace_kvm_msr_read_ex(ecx);
3031 kvm_inject_gp(vcpu, 0); 3177 kvm_inject_gp(vcpu, 0);
3032 return 1; 3178 return 1;
3033 } 3179 }
@@ -3041,31 +3187,29 @@ static int handle_rdmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3041 return 1; 3187 return 1;
3042} 3188}
3043 3189
3044static int handle_wrmsr(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3190static int handle_wrmsr(struct kvm_vcpu *vcpu)
3045{ 3191{
3046 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX]; 3192 u32 ecx = vcpu->arch.regs[VCPU_REGS_RCX];
3047 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u) 3193 u64 data = (vcpu->arch.regs[VCPU_REGS_RAX] & -1u)
3048 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32); 3194 | ((u64)(vcpu->arch.regs[VCPU_REGS_RDX] & -1u) << 32);
3049 3195
3050 trace_kvm_msr_write(ecx, data);
3051
3052 if (vmx_set_msr(vcpu, ecx, data) != 0) { 3196 if (vmx_set_msr(vcpu, ecx, data) != 0) {
3197 trace_kvm_msr_write_ex(ecx, data);
3053 kvm_inject_gp(vcpu, 0); 3198 kvm_inject_gp(vcpu, 0);
3054 return 1; 3199 return 1;
3055 } 3200 }
3056 3201
3202 trace_kvm_msr_write(ecx, data);
3057 skip_emulated_instruction(vcpu); 3203 skip_emulated_instruction(vcpu);
3058 return 1; 3204 return 1;
3059} 3205}
3060 3206
3061static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu, 3207static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
3062 struct kvm_run *kvm_run)
3063{ 3208{
3064 return 1; 3209 return 1;
3065} 3210}
3066 3211
3067static int handle_interrupt_window(struct kvm_vcpu *vcpu, 3212static int handle_interrupt_window(struct kvm_vcpu *vcpu)
3068 struct kvm_run *kvm_run)
3069{ 3213{
3070 u32 cpu_based_vm_exec_control; 3214 u32 cpu_based_vm_exec_control;
3071 3215
@@ -3081,34 +3225,34 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu,
3081 * possible 3225 * possible
3082 */ 3226 */
3083 if (!irqchip_in_kernel(vcpu->kvm) && 3227 if (!irqchip_in_kernel(vcpu->kvm) &&
3084 kvm_run->request_interrupt_window && 3228 vcpu->run->request_interrupt_window &&
3085 !kvm_cpu_has_interrupt(vcpu)) { 3229 !kvm_cpu_has_interrupt(vcpu)) {
3086 kvm_run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; 3230 vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN;
3087 return 0; 3231 return 0;
3088 } 3232 }
3089 return 1; 3233 return 1;
3090} 3234}
3091 3235
3092static int handle_halt(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3236static int handle_halt(struct kvm_vcpu *vcpu)
3093{ 3237{
3094 skip_emulated_instruction(vcpu); 3238 skip_emulated_instruction(vcpu);
3095 return kvm_emulate_halt(vcpu); 3239 return kvm_emulate_halt(vcpu);
3096} 3240}
3097 3241
3098static int handle_vmcall(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3242static int handle_vmcall(struct kvm_vcpu *vcpu)
3099{ 3243{
3100 skip_emulated_instruction(vcpu); 3244 skip_emulated_instruction(vcpu);
3101 kvm_emulate_hypercall(vcpu); 3245 kvm_emulate_hypercall(vcpu);
3102 return 1; 3246 return 1;
3103} 3247}
3104 3248
3105static int handle_vmx_insn(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3249static int handle_vmx_insn(struct kvm_vcpu *vcpu)
3106{ 3250{
3107 kvm_queue_exception(vcpu, UD_VECTOR); 3251 kvm_queue_exception(vcpu, UD_VECTOR);
3108 return 1; 3252 return 1;
3109} 3253}
3110 3254
3111static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3255static int handle_invlpg(struct kvm_vcpu *vcpu)
3112{ 3256{
3113 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3257 unsigned long exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3114 3258
@@ -3117,14 +3261,14 @@ static int handle_invlpg(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3117 return 1; 3261 return 1;
3118} 3262}
3119 3263
3120static int handle_wbinvd(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3264static int handle_wbinvd(struct kvm_vcpu *vcpu)
3121{ 3265{
3122 skip_emulated_instruction(vcpu); 3266 skip_emulated_instruction(vcpu);
3123 /* TODO: Add support for VT-d/pass-through device */ 3267 /* TODO: Add support for VT-d/pass-through device */
3124 return 1; 3268 return 1;
3125} 3269}
3126 3270
3127static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3271static int handle_apic_access(struct kvm_vcpu *vcpu)
3128{ 3272{
3129 unsigned long exit_qualification; 3273 unsigned long exit_qualification;
3130 enum emulation_result er; 3274 enum emulation_result er;
@@ -3133,7 +3277,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3133 exit_qualification = vmcs_readl(EXIT_QUALIFICATION); 3277 exit_qualification = vmcs_readl(EXIT_QUALIFICATION);
3134 offset = exit_qualification & 0xffful; 3278 offset = exit_qualification & 0xffful;
3135 3279
3136 er = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3280 er = emulate_instruction(vcpu, 0, 0, 0);
3137 3281
3138 if (er != EMULATE_DONE) { 3282 if (er != EMULATE_DONE) {
3139 printk(KERN_ERR 3283 printk(KERN_ERR
@@ -3144,7 +3288,7 @@ static int handle_apic_access(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3144 return 1; 3288 return 1;
3145} 3289}
3146 3290
3147static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3291static int handle_task_switch(struct kvm_vcpu *vcpu)
3148{ 3292{
3149 struct vcpu_vmx *vmx = to_vmx(vcpu); 3293 struct vcpu_vmx *vmx = to_vmx(vcpu);
3150 unsigned long exit_qualification; 3294 unsigned long exit_qualification;
@@ -3198,7 +3342,7 @@ static int handle_task_switch(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3198 return 1; 3342 return 1;
3199} 3343}
3200 3344
3201static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3345static int handle_ept_violation(struct kvm_vcpu *vcpu)
3202{ 3346{
3203 unsigned long exit_qualification; 3347 unsigned long exit_qualification;
3204 gpa_t gpa; 3348 gpa_t gpa;
@@ -3219,8 +3363,8 @@ static int handle_ept_violation(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3219 vmcs_readl(GUEST_LINEAR_ADDRESS)); 3363 vmcs_readl(GUEST_LINEAR_ADDRESS));
3220 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n", 3364 printk(KERN_ERR "EPT: Exit qualification is 0x%lx\n",
3221 (long unsigned int)exit_qualification); 3365 (long unsigned int)exit_qualification);
3222 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3366 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3223 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION; 3367 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_VIOLATION;
3224 return 0; 3368 return 0;
3225 } 3369 }
3226 3370
@@ -3290,7 +3434,7 @@ static void ept_misconfig_inspect_spte(struct kvm_vcpu *vcpu, u64 spte,
3290 } 3434 }
3291} 3435}
3292 3436
3293static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3437static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
3294{ 3438{
3295 u64 sptes[4]; 3439 u64 sptes[4];
3296 int nr_sptes, i; 3440 int nr_sptes, i;
@@ -3306,13 +3450,13 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3306 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i) 3450 for (i = PT64_ROOT_LEVEL; i > PT64_ROOT_LEVEL - nr_sptes; --i)
3307 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i); 3451 ept_misconfig_inspect_spte(vcpu, sptes[i-1], i);
3308 3452
3309 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3453 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3310 kvm_run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG; 3454 vcpu->run->hw.hardware_exit_reason = EXIT_REASON_EPT_MISCONFIG;
3311 3455
3312 return 0; 3456 return 0;
3313} 3457}
3314 3458
3315static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3459static int handle_nmi_window(struct kvm_vcpu *vcpu)
3316{ 3460{
3317 u32 cpu_based_vm_exec_control; 3461 u32 cpu_based_vm_exec_control;
3318 3462
@@ -3325,36 +3469,55 @@ static int handle_nmi_window(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3325 return 1; 3469 return 1;
3326} 3470}
3327 3471
3328static void handle_invalid_guest_state(struct kvm_vcpu *vcpu, 3472static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
3329 struct kvm_run *kvm_run)
3330{ 3473{
3331 struct vcpu_vmx *vmx = to_vmx(vcpu); 3474 struct vcpu_vmx *vmx = to_vmx(vcpu);
3332 enum emulation_result err = EMULATE_DONE; 3475 enum emulation_result err = EMULATE_DONE;
3333 3476 int ret = 1;
3334 local_irq_enable();
3335 preempt_enable();
3336 3477
3337 while (!guest_state_valid(vcpu)) { 3478 while (!guest_state_valid(vcpu)) {
3338 err = emulate_instruction(vcpu, kvm_run, 0, 0, 0); 3479 err = emulate_instruction(vcpu, 0, 0, 0);
3339 3480
3340 if (err == EMULATE_DO_MMIO) 3481 if (err == EMULATE_DO_MMIO) {
3341 break; 3482 ret = 0;
3483 goto out;
3484 }
3342 3485
3343 if (err != EMULATE_DONE) { 3486 if (err != EMULATE_DONE) {
3344 kvm_report_emulation_failure(vcpu, "emulation failure"); 3487 vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
3345 break; 3488 vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
3489 vcpu->run->internal.ndata = 0;
3490 ret = 0;
3491 goto out;
3346 } 3492 }
3347 3493
3348 if (signal_pending(current)) 3494 if (signal_pending(current))
3349 break; 3495 goto out;
3350 if (need_resched()) 3496 if (need_resched())
3351 schedule(); 3497 schedule();
3352 } 3498 }
3353 3499
3354 preempt_disable(); 3500 vmx->emulation_required = 0;
3355 local_irq_disable(); 3501out:
3502 return ret;
3503}
3504
3505/*
 3506 * Indicate a busy-waiting vcpu spinning on a lock. We do not enable plain
 3507 * PAUSE exiting, so we only get here on a CPU with PAUSE-loop exiting.
3508 */
3509static int handle_pause(struct kvm_vcpu *vcpu)
3510{
3511 skip_emulated_instruction(vcpu);
3512 kvm_vcpu_on_spin(vcpu);
3513
3514 return 1;
3515}
3356 3516
3357 vmx->invalid_state_emulation_result = err; 3517static int handle_invalid_op(struct kvm_vcpu *vcpu)
3518{
3519 kvm_queue_exception(vcpu, UD_VECTOR);
3520 return 1;
3358} 3521}
3359 3522
3360/* 3523/*
@@ -3362,8 +3525,7 @@ static void handle_invalid_guest_state(struct kvm_vcpu *vcpu,
3362 * may resume. Otherwise they set the kvm_run parameter to indicate what needs 3525 * may resume. Otherwise they set the kvm_run parameter to indicate what needs
3363 * to be done to userspace and return 0. 3526 * to be done to userspace and return 0.
3364 */ 3527 */
3365static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu, 3528static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu) = {
3366 struct kvm_run *kvm_run) = {
3367 [EXIT_REASON_EXCEPTION_NMI] = handle_exception, 3529 [EXIT_REASON_EXCEPTION_NMI] = handle_exception,
3368 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt, 3530 [EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
3369 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault, 3531 [EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
@@ -3394,6 +3556,9 @@ static int (*kvm_vmx_exit_handlers[])(struct kvm_vcpu *vcpu,
3394 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check, 3556 [EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
3395 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation, 3557 [EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
3396 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig, 3558 [EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
3559 [EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
3560 [EXIT_REASON_MWAIT_INSTRUCTION] = handle_invalid_op,
3561 [EXIT_REASON_MONITOR_INSTRUCTION] = handle_invalid_op,
3397}; 3562};
3398 3563
3399static const int kvm_vmx_max_exit_handlers = 3564static const int kvm_vmx_max_exit_handlers =
@@ -3403,7 +3568,7 @@ static const int kvm_vmx_max_exit_handlers =
3403 * The guest has exited. See if we can fix it or if we need userspace 3568 * The guest has exited. See if we can fix it or if we need userspace
3404 * assistance. 3569 * assistance.
3405 */ 3570 */
3406static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) 3571static int vmx_handle_exit(struct kvm_vcpu *vcpu)
3407{ 3572{
3408 struct vcpu_vmx *vmx = to_vmx(vcpu); 3573 struct vcpu_vmx *vmx = to_vmx(vcpu);
3409 u32 exit_reason = vmx->exit_reason; 3574 u32 exit_reason = vmx->exit_reason;
@@ -3411,13 +3576,9 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3411 3576
3412 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu)); 3577 trace_kvm_exit(exit_reason, kvm_rip_read(vcpu));
3413 3578
3414 /* If we need to emulate an MMIO from handle_invalid_guest_state 3579 /* If guest state is invalid, start emulating */
3415 * we just return 0 */ 3580 if (vmx->emulation_required && emulate_invalid_guest_state)
3416 if (vmx->emulation_required && emulate_invalid_guest_state) { 3581 return handle_invalid_guest_state(vcpu);
3417 if (guest_state_valid(vcpu))
3418 vmx->emulation_required = 0;
3419 return vmx->invalid_state_emulation_result != EMULATE_DO_MMIO;
3420 }
3421 3582
3422	/* CR3 accesses don't cause a VMExit in paging mode, so we need 3583	/* CR3 accesses don't cause a VMExit in paging mode, so we need
3423	 * to sync with the guest's real CR3. */ 3584	 * to sync with the guest's real CR3. */
@@ -3425,8 +3586,8 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3425 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); 3586 vcpu->arch.cr3 = vmcs_readl(GUEST_CR3);
3426 3587
3427 if (unlikely(vmx->fail)) { 3588 if (unlikely(vmx->fail)) {
3428 kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; 3589 vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
3429 kvm_run->fail_entry.hardware_entry_failure_reason 3590 vcpu->run->fail_entry.hardware_entry_failure_reason
3430 = vmcs_read32(VM_INSTRUCTION_ERROR); 3591 = vmcs_read32(VM_INSTRUCTION_ERROR);
3431 return 0; 3592 return 0;
3432 } 3593 }
@@ -3459,10 +3620,10 @@ static int vmx_handle_exit(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu)
3459 3620
3460 if (exit_reason < kvm_vmx_max_exit_handlers 3621 if (exit_reason < kvm_vmx_max_exit_handlers
3461 && kvm_vmx_exit_handlers[exit_reason]) 3622 && kvm_vmx_exit_handlers[exit_reason])
3462 return kvm_vmx_exit_handlers[exit_reason](vcpu, kvm_run); 3623 return kvm_vmx_exit_handlers[exit_reason](vcpu);
3463 else { 3624 else {
3464 kvm_run->exit_reason = KVM_EXIT_UNKNOWN; 3625 vcpu->run->exit_reason = KVM_EXIT_UNKNOWN;
3465 kvm_run->hw.hardware_exit_reason = exit_reason; 3626 vcpu->run->hw.hardware_exit_reason = exit_reason;
3466 } 3627 }
3467 return 0; 3628 return 0;
3468} 3629}
@@ -3600,23 +3761,18 @@ static void fixup_rmode_irq(struct vcpu_vmx *vmx)
3600#define Q "l" 3761#define Q "l"
3601#endif 3762#endif
3602 3763
3603static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 3764static void vmx_vcpu_run(struct kvm_vcpu *vcpu)
3604{ 3765{
3605 struct vcpu_vmx *vmx = to_vmx(vcpu); 3766 struct vcpu_vmx *vmx = to_vmx(vcpu);
3606 3767
3607 if (enable_ept && is_paging(vcpu)) {
3608 vmcs_writel(GUEST_CR3, vcpu->arch.cr3);
3609 ept_load_pdptrs(vcpu);
3610 }
3611 /* Record the guest's net vcpu time for enforced NMI injections. */ 3768 /* Record the guest's net vcpu time for enforced NMI injections. */
3612 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked)) 3769 if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
3613 vmx->entry_time = ktime_get(); 3770 vmx->entry_time = ktime_get();
3614 3771
3615	/* Handle invalid guest state instead of entering VMX */ 3772	/* Don't enter VMX if guest state is invalid; let the exit handler
3616	if (vmx->emulation_required && emulate_invalid_guest_state) { 3773	   keep emulating until we are back in a valid state */
3617 handle_invalid_guest_state(vcpu, kvm_run); 3774 if (vmx->emulation_required && emulate_invalid_guest_state)
3618 return; 3775 return;
3619 }
3620 3776
3621 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty)) 3777 if (test_bit(VCPU_REGS_RSP, (unsigned long *)&vcpu->arch.regs_dirty))
3622 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]); 3778 vmcs_writel(GUEST_RSP, vcpu->arch.regs[VCPU_REGS_RSP]);
@@ -3636,9 +3792,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3636 */ 3792 */
3637 vmcs_writel(HOST_CR0, read_cr0()); 3793 vmcs_writel(HOST_CR0, read_cr0());
3638 3794
3639 if (vcpu->arch.switch_db_regs)
3640 set_debugreg(vcpu->arch.dr6, 6);
3641
3642 asm( 3795 asm(
3643 /* Store host registers */ 3796 /* Store host registers */
3644 "push %%"R"dx; push %%"R"bp;" 3797 "push %%"R"dx; push %%"R"bp;"
@@ -3739,9 +3892,6 @@ static void vmx_vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3739 | (1 << VCPU_EXREG_PDPTR)); 3892 | (1 << VCPU_EXREG_PDPTR));
3740 vcpu->arch.regs_dirty = 0; 3893 vcpu->arch.regs_dirty = 0;
3741 3894
3742 if (vcpu->arch.switch_db_regs)
3743 get_debugreg(vcpu->arch.dr6, 6);
3744
3745 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD); 3895 vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
3746 if (vmx->rmode.irq.pending) 3896 if (vmx->rmode.irq.pending)
3747 fixup_rmode_irq(vmx); 3897 fixup_rmode_irq(vmx);
@@ -3775,7 +3925,6 @@ static void vmx_free_vcpu(struct kvm_vcpu *vcpu)
3775 __clear_bit(vmx->vpid, vmx_vpid_bitmap); 3925 __clear_bit(vmx->vpid, vmx_vpid_bitmap);
3776 spin_unlock(&vmx_vpid_lock); 3926 spin_unlock(&vmx_vpid_lock);
3777 vmx_free_vmcs(vcpu); 3927 vmx_free_vmcs(vcpu);
3778 kfree(vmx->host_msrs);
3779 kfree(vmx->guest_msrs); 3928 kfree(vmx->guest_msrs);
3780 kvm_vcpu_uninit(vcpu); 3929 kvm_vcpu_uninit(vcpu);
3781 kmem_cache_free(kvm_vcpu_cache, vmx); 3930 kmem_cache_free(kvm_vcpu_cache, vmx);
@@ -3802,10 +3951,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3802 goto uninit_vcpu; 3951 goto uninit_vcpu;
3803 } 3952 }
3804 3953
3805 vmx->host_msrs = kmalloc(PAGE_SIZE, GFP_KERNEL);
3806 if (!vmx->host_msrs)
3807 goto free_guest_msrs;
3808
3809 vmx->vmcs = alloc_vmcs(); 3954 vmx->vmcs = alloc_vmcs();
3810 if (!vmx->vmcs) 3955 if (!vmx->vmcs)
3811 goto free_msrs; 3956 goto free_msrs;
@@ -3836,8 +3981,6 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id)
3836free_vmcs: 3981free_vmcs:
3837 free_vmcs(vmx->vmcs); 3982 free_vmcs(vmx->vmcs);
3838free_msrs: 3983free_msrs:
3839 kfree(vmx->host_msrs);
3840free_guest_msrs:
3841 kfree(vmx->guest_msrs); 3984 kfree(vmx->guest_msrs);
3842uninit_vcpu: 3985uninit_vcpu:
3843 kvm_vcpu_uninit(&vmx->vcpu); 3986 kvm_vcpu_uninit(&vmx->vcpu);
@@ -3877,7 +4020,7 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3877 * b. VT-d with snooping control feature: snooping control feature of 4020 * b. VT-d with snooping control feature: snooping control feature of
3878 * VT-d engine can guarantee the cache correctness. Just set it 4021 * VT-d engine can guarantee the cache correctness. Just set it
3879 * to WB to keep consistent with host. So the same as item 3. 4022 * to WB to keep consistent with host. So the same as item 3.
3880 * 3. EPT without VT-d: always map as WB and set IGMT=1 to keep 4023 * 3. EPT without VT-d: always map as WB and set IPAT=1 to keep
3881 * consistent with host MTRR 4024 * consistent with host MTRR
3882 */ 4025 */
3883 if (is_mmio) 4026 if (is_mmio)
@@ -3888,37 +4031,88 @@ static u64 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
3888 VMX_EPT_MT_EPTE_SHIFT; 4031 VMX_EPT_MT_EPTE_SHIFT;
3889 else 4032 else
3890 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) 4033 ret = (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT)
3891 | VMX_EPT_IGMT_BIT; 4034 | VMX_EPT_IPAT_BIT;
3892 4035
3893 return ret; 4036 return ret;
3894} 4037}
3895 4038
4039#define _ER(x) { EXIT_REASON_##x, #x }
4040
3896static const struct trace_print_flags vmx_exit_reasons_str[] = { 4041static const struct trace_print_flags vmx_exit_reasons_str[] = {
3897 { EXIT_REASON_EXCEPTION_NMI, "exception" }, 4042 _ER(EXCEPTION_NMI),
3898 { EXIT_REASON_EXTERNAL_INTERRUPT, "ext_irq" }, 4043 _ER(EXTERNAL_INTERRUPT),
3899 { EXIT_REASON_TRIPLE_FAULT, "triple_fault" }, 4044 _ER(TRIPLE_FAULT),
3900 { EXIT_REASON_NMI_WINDOW, "nmi_window" }, 4045 _ER(PENDING_INTERRUPT),
3901 { EXIT_REASON_IO_INSTRUCTION, "io_instruction" }, 4046 _ER(NMI_WINDOW),
3902 { EXIT_REASON_CR_ACCESS, "cr_access" }, 4047 _ER(TASK_SWITCH),
3903 { EXIT_REASON_DR_ACCESS, "dr_access" }, 4048 _ER(CPUID),
3904 { EXIT_REASON_CPUID, "cpuid" }, 4049 _ER(HLT),
3905 { EXIT_REASON_MSR_READ, "rdmsr" }, 4050 _ER(INVLPG),
3906 { EXIT_REASON_MSR_WRITE, "wrmsr" }, 4051 _ER(RDPMC),
3907 { EXIT_REASON_PENDING_INTERRUPT, "interrupt_window" }, 4052 _ER(RDTSC),
3908 { EXIT_REASON_HLT, "halt" }, 4053 _ER(VMCALL),
3909 { EXIT_REASON_INVLPG, "invlpg" }, 4054 _ER(VMCLEAR),
3910 { EXIT_REASON_VMCALL, "hypercall" }, 4055 _ER(VMLAUNCH),
3911 { EXIT_REASON_TPR_BELOW_THRESHOLD, "tpr_below_thres" }, 4056 _ER(VMPTRLD),
3912 { EXIT_REASON_APIC_ACCESS, "apic_access" }, 4057 _ER(VMPTRST),
3913 { EXIT_REASON_WBINVD, "wbinvd" }, 4058 _ER(VMREAD),
3914 { EXIT_REASON_TASK_SWITCH, "task_switch" }, 4059 _ER(VMRESUME),
3915 { EXIT_REASON_EPT_VIOLATION, "ept_violation" }, 4060 _ER(VMWRITE),
4061 _ER(VMOFF),
4062 _ER(VMON),
4063 _ER(CR_ACCESS),
4064 _ER(DR_ACCESS),
4065 _ER(IO_INSTRUCTION),
4066 _ER(MSR_READ),
4067 _ER(MSR_WRITE),
4068 _ER(MWAIT_INSTRUCTION),
4069 _ER(MONITOR_INSTRUCTION),
4070 _ER(PAUSE_INSTRUCTION),
4071 _ER(MCE_DURING_VMENTRY),
4072 _ER(TPR_BELOW_THRESHOLD),
4073 _ER(APIC_ACCESS),
4074 _ER(EPT_VIOLATION),
4075 _ER(EPT_MISCONFIG),
4076 _ER(WBINVD),
3916 { -1, NULL } 4077 { -1, NULL }
3917}; 4078};
3918 4079
3919static bool vmx_gb_page_enable(void) 4080#undef _ER
4081
4082static int vmx_get_lpage_level(void)
3920{ 4083{
3921 return false; 4084 if (enable_ept && !cpu_has_vmx_ept_1g_page())
4085 return PT_DIRECTORY_LEVEL;
4086 else
4087 /* For shadow and EPT supported 1GB page */
4088 return PT_PDPE_LEVEL;
4089}
4090
4091static inline u32 bit(int bitno)
4092{
4093 return 1 << (bitno & 31);
4094}
4095
4096static void vmx_cpuid_update(struct kvm_vcpu *vcpu)
4097{
4098 struct kvm_cpuid_entry2 *best;
4099 struct vcpu_vmx *vmx = to_vmx(vcpu);
4100 u32 exec_control;
4101
4102 vmx->rdtscp_enabled = false;
4103 if (vmx_rdtscp_supported()) {
4104 exec_control = vmcs_read32(SECONDARY_VM_EXEC_CONTROL);
4105 if (exec_control & SECONDARY_EXEC_RDTSCP) {
4106 best = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
4107 if (best && (best->edx & bit(X86_FEATURE_RDTSCP)))
4108 vmx->rdtscp_enabled = true;
4109 else {
4110 exec_control &= ~SECONDARY_EXEC_RDTSCP;
4111 vmcs_write32(SECONDARY_VM_EXEC_CONTROL,
4112 exec_control);
4113 }
4114 }
4115 }
3922} 4116}
3923 4117
3924static struct kvm_x86_ops vmx_x86_ops = { 4118static struct kvm_x86_ops vmx_x86_ops = {
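
The _ER() macro above combines token pasting and stringizing so that each exit-reason table entry stays in sync with its EXIT_REASON_* constant: _ER(CPUID) expands to { EXIT_REASON_CPUID, "CPUID" }. A hypothetical lookup over the resulting table (not part of the patch) would stop at the { -1, NULL } sentinel:

    static const char *exit_reason_name(unsigned long reason)
    {
            int i;

            for (i = 0; vmx_exit_reasons_str[i].name; i++)
                    if (vmx_exit_reasons_str[i].mask == reason)
                            return vmx_exit_reasons_str[i].name;
            return "UNKNOWN";
    }
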
@@ -3947,6 +4141,7 @@ static struct kvm_x86_ops vmx_x86_ops = {
3947 .set_segment = vmx_set_segment, 4141 .set_segment = vmx_set_segment,
3948 .get_cpl = vmx_get_cpl, 4142 .get_cpl = vmx_get_cpl,
3949 .get_cs_db_l_bits = vmx_get_cs_db_l_bits, 4143 .get_cs_db_l_bits = vmx_get_cs_db_l_bits,
4144 .decache_cr0_guest_bits = vmx_decache_cr0_guest_bits,
3950 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits, 4145 .decache_cr4_guest_bits = vmx_decache_cr4_guest_bits,
3951 .set_cr0 = vmx_set_cr0, 4146 .set_cr0 = vmx_set_cr0,
3952 .set_cr3 = vmx_set_cr3, 4147 .set_cr3 = vmx_set_cr3,
@@ -3959,6 +4154,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3959 .cache_reg = vmx_cache_reg, 4154 .cache_reg = vmx_cache_reg,
3960 .get_rflags = vmx_get_rflags, 4155 .get_rflags = vmx_get_rflags,
3961 .set_rflags = vmx_set_rflags, 4156 .set_rflags = vmx_set_rflags,
4157 .fpu_activate = vmx_fpu_activate,
4158 .fpu_deactivate = vmx_fpu_deactivate,
3962 4159
3963 .tlb_flush = vmx_flush_tlb, 4160 .tlb_flush = vmx_flush_tlb,
3964 4161
@@ -3973,6 +4170,8 @@ static struct kvm_x86_ops vmx_x86_ops = {
3973 .queue_exception = vmx_queue_exception, 4170 .queue_exception = vmx_queue_exception,
3974 .interrupt_allowed = vmx_interrupt_allowed, 4171 .interrupt_allowed = vmx_interrupt_allowed,
3975 .nmi_allowed = vmx_nmi_allowed, 4172 .nmi_allowed = vmx_nmi_allowed,
4173 .get_nmi_mask = vmx_get_nmi_mask,
4174 .set_nmi_mask = vmx_set_nmi_mask,
3976 .enable_nmi_window = enable_nmi_window, 4175 .enable_nmi_window = enable_nmi_window,
3977 .enable_irq_window = enable_irq_window, 4176 .enable_irq_window = enable_irq_window,
3978 .update_cr8_intercept = update_cr8_intercept, 4177 .update_cr8_intercept = update_cr8_intercept,
@@ -3982,12 +4181,21 @@ static struct kvm_x86_ops vmx_x86_ops = {
3982 .get_mt_mask = vmx_get_mt_mask, 4181 .get_mt_mask = vmx_get_mt_mask,
3983 4182
3984 .exit_reasons_str = vmx_exit_reasons_str, 4183 .exit_reasons_str = vmx_exit_reasons_str,
3985 .gb_page_enable = vmx_gb_page_enable, 4184 .get_lpage_level = vmx_get_lpage_level,
4185
4186 .cpuid_update = vmx_cpuid_update,
4187
4188 .rdtscp_supported = vmx_rdtscp_supported,
3986}; 4189};
3987 4190
3988static int __init vmx_init(void) 4191static int __init vmx_init(void)
3989{ 4192{
3990 int r; 4193 int r, i;
4194
4195 rdmsrl_safe(MSR_EFER, &host_efer);
4196
4197 for (i = 0; i < NR_VMX_MSR; ++i)
4198 kvm_define_shared_msr(i, vmx_msr_index[i]);
3991 4199
3992 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL); 4200 vmx_io_bitmap_a = (unsigned long *)__get_free_page(GFP_KERNEL);
3993 if (!vmx_io_bitmap_a) 4201 if (!vmx_io_bitmap_a)
@@ -4049,8 +4257,6 @@ static int __init vmx_init(void)
4049 if (bypass_guest_pf) 4257 if (bypass_guest_pf)
4050 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull); 4258 kvm_mmu_set_nonpresent_ptes(~0xffeull, 0ull);
4051 4259
4052 ept_sync_global();
4053
4054 return 0; 4260 return 0;
4055 4261
4056out3: 4262out3:
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ae07d261527c..c4f35b545c1d 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -37,11 +37,15 @@
37#include <linux/iommu.h> 37#include <linux/iommu.h>
38#include <linux/intel-iommu.h> 38#include <linux/intel-iommu.h>
39#include <linux/cpufreq.h> 39#include <linux/cpufreq.h>
40#include <linux/user-return-notifier.h>
41#include <linux/srcu.h>
42#include <linux/slab.h>
40#include <trace/events/kvm.h> 43#include <trace/events/kvm.h>
41#undef TRACE_INCLUDE_FILE 44#undef TRACE_INCLUDE_FILE
42#define CREATE_TRACE_POINTS 45#define CREATE_TRACE_POINTS
43#include "trace.h" 46#include "trace.h"
44 47
48#include <asm/debugreg.h>
45#include <asm/uaccess.h> 49#include <asm/uaccess.h>
46#include <asm/msr.h> 50#include <asm/msr.h>
47#include <asm/desc.h> 51#include <asm/desc.h>
@@ -87,6 +91,25 @@ EXPORT_SYMBOL_GPL(kvm_x86_ops);
87int ignore_msrs = 0; 91int ignore_msrs = 0;
88module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR); 92module_param_named(ignore_msrs, ignore_msrs, bool, S_IRUGO | S_IWUSR);
89 93
94#define KVM_NR_SHARED_MSRS 16
95
96struct kvm_shared_msrs_global {
97 int nr;
98 u32 msrs[KVM_NR_SHARED_MSRS];
99};
100
101struct kvm_shared_msrs {
102 struct user_return_notifier urn;
103 bool registered;
104 struct kvm_shared_msr_values {
105 u64 host;
106 u64 curr;
107 } values[KVM_NR_SHARED_MSRS];
108};
109
110static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
111static DEFINE_PER_CPU(struct kvm_shared_msrs, shared_msrs);
112
90struct kvm_stats_debugfs_item debugfs_entries[] = { 113struct kvm_stats_debugfs_item debugfs_entries[] = {
91 { "pf_fixed", VCPU_STAT(pf_fixed) }, 114 { "pf_fixed", VCPU_STAT(pf_fixed) },
92 { "pf_guest", VCPU_STAT(pf_guest) }, 115 { "pf_guest", VCPU_STAT(pf_guest) },
@@ -123,6 +146,83 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
123 { NULL } 146 { NULL }
124}; 147};
125 148
149static void kvm_on_user_return(struct user_return_notifier *urn)
150{
151 unsigned slot;
152 struct kvm_shared_msrs *locals
153 = container_of(urn, struct kvm_shared_msrs, urn);
154 struct kvm_shared_msr_values *values;
155
156 for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
157 values = &locals->values[slot];
158 if (values->host != values->curr) {
159 wrmsrl(shared_msrs_global.msrs[slot], values->host);
160 values->curr = values->host;
161 }
162 }
163 locals->registered = false;
164 user_return_notifier_unregister(urn);
165}
166
167static void shared_msr_update(unsigned slot, u32 msr)
168{
169 struct kvm_shared_msrs *smsr;
170 u64 value;
171
172 smsr = &__get_cpu_var(shared_msrs);
 173	/* this is only read, and nobody should modify it at this
 174	 * time, so no locking is needed */
175 if (slot >= shared_msrs_global.nr) {
176 printk(KERN_ERR "kvm: invalid MSR slot!");
177 return;
178 }
179 rdmsrl_safe(msr, &value);
180 smsr->values[slot].host = value;
181 smsr->values[slot].curr = value;
182}
183
184void kvm_define_shared_msr(unsigned slot, u32 msr)
185{
186 if (slot >= shared_msrs_global.nr)
187 shared_msrs_global.nr = slot + 1;
188 shared_msrs_global.msrs[slot] = msr;
 189	/* make sure shared_msrs_global has been updated before it is used */
190 smp_wmb();
191}
192EXPORT_SYMBOL_GPL(kvm_define_shared_msr);
193
194static void kvm_shared_msr_cpu_online(void)
195{
196 unsigned i;
197
198 for (i = 0; i < shared_msrs_global.nr; ++i)
199 shared_msr_update(i, shared_msrs_global.msrs[i]);
200}
201
202void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
203{
204 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
205
206 if (((value ^ smsr->values[slot].curr) & mask) == 0)
207 return;
208 smsr->values[slot].curr = value;
209 wrmsrl(shared_msrs_global.msrs[slot], value);
210 if (!smsr->registered) {
211 smsr->urn.on_user_return = kvm_on_user_return;
212 user_return_notifier_register(&smsr->urn);
213 smsr->registered = true;
214 }
215}
216EXPORT_SYMBOL_GPL(kvm_set_shared_msr);
217
218static void drop_user_return_notifiers(void *ignore)
219{
220 struct kvm_shared_msrs *smsr = &__get_cpu_var(shared_msrs);
221
222 if (smsr->registered)
223 kvm_on_user_return(&smsr->urn);
224}
225
126unsigned long segment_base(u16 selector) 226unsigned long segment_base(u16 selector)
127{ 227{
128 struct descriptor_table gdt; 228 struct descriptor_table gdt;
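
The shared-MSR code above lets vendor modules defer restoring host MSR values until the CPU actually returns to user space: kvm_define_shared_msr() reserves a slot at init time (vmx_init() does this for vmx_msr_index[] earlier in this diff), and kvm_set_shared_msr() writes the hardware MSR only when the requested bits differ from what the CPU currently holds, arming a user-return notifier so the host value is restored lazily. A hedged usage sketch; the slot index and MSR choice below are illustrative only:

    #define EXAMPLE_STAR_SLOT 0     /* illustrative slot index */

    static void example_hardware_setup(void)
    {
            kvm_define_shared_msr(EXAMPLE_STAR_SLOT, MSR_K6_STAR);
    }

    static void example_prepare_guest_switch(u64 guest_star)
    {
            /* no wrmsr and no notifier registration if nothing changed */
            kvm_set_shared_msr(EXAMPLE_STAR_SLOT, guest_star, -1ull);
    }
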
@@ -170,12 +270,68 @@ void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
170} 270}
171EXPORT_SYMBOL_GPL(kvm_set_apic_base); 271EXPORT_SYMBOL_GPL(kvm_set_apic_base);
172 272
273#define EXCPT_BENIGN 0
274#define EXCPT_CONTRIBUTORY 1
275#define EXCPT_PF 2
276
277static int exception_class(int vector)
278{
279 switch (vector) {
280 case PF_VECTOR:
281 return EXCPT_PF;
282 case DE_VECTOR:
283 case TS_VECTOR:
284 case NP_VECTOR:
285 case SS_VECTOR:
286 case GP_VECTOR:
287 return EXCPT_CONTRIBUTORY;
288 default:
289 break;
290 }
291 return EXCPT_BENIGN;
292}
293
294static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
295 unsigned nr, bool has_error, u32 error_code)
296{
297 u32 prev_nr;
298 int class1, class2;
299
300 if (!vcpu->arch.exception.pending) {
301 queue:
302 vcpu->arch.exception.pending = true;
303 vcpu->arch.exception.has_error_code = has_error;
304 vcpu->arch.exception.nr = nr;
305 vcpu->arch.exception.error_code = error_code;
306 return;
307 }
308
 309	/* an exception is already pending; work out how the two combine */
310 prev_nr = vcpu->arch.exception.nr;
311 if (prev_nr == DF_VECTOR) {
312 /* triple fault -> shutdown */
313 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
314 return;
315 }
316 class1 = exception_class(prev_nr);
317 class2 = exception_class(nr);
318 if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
319 || (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
320 /* generate double fault per SDM Table 5-5 */
321 vcpu->arch.exception.pending = true;
322 vcpu->arch.exception.has_error_code = true;
323 vcpu->arch.exception.nr = DF_VECTOR;
324 vcpu->arch.exception.error_code = 0;
325 } else
 326	/* replace the previous exception with the new one, in the hope
 327	   that re-executing the instruction will regenerate the lost
 328	   exception */
329 goto queue;
330}
331
173void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr) 332void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
174{ 333{
175 WARN_ON(vcpu->arch.exception.pending); 334 kvm_multiple_exception(vcpu, nr, false, 0);
176 vcpu->arch.exception.pending = true;
177 vcpu->arch.exception.has_error_code = false;
178 vcpu->arch.exception.nr = nr;
179} 335}
180EXPORT_SYMBOL_GPL(kvm_queue_exception); 336EXPORT_SYMBOL_GPL(kvm_queue_exception);
181 337
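
kvm_multiple_exception() merges a newly raised exception with one that is already pending using the benign/contributory classification from SDM Table 5-5. A few worked cases of the rule, written out as a comment rather than new code:

    /*
     *   pending          new          result
     *   -------          ---          ------
     *   #DB (benign)     #GP          #GP replaces #DB; re-executing the
     *                                 instruction should regenerate #DB
     *   #GP (contrib.)   #NP          #DF with error code 0
     *   #PF              #GP          #DF with error code 0
     *   #DF              anything     triple fault (KVM_REQ_TRIPLE_FAULT)
     */
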
@@ -183,25 +339,6 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, unsigned long addr,
183 u32 error_code) 339 u32 error_code)
184{ 340{
185 ++vcpu->stat.pf_guest; 341 ++vcpu->stat.pf_guest;
186
187 if (vcpu->arch.exception.pending) {
188 switch(vcpu->arch.exception.nr) {
189 case DF_VECTOR:
190 /* triple fault -> shutdown */
191 set_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests);
192 return;
193 case PF_VECTOR:
194 vcpu->arch.exception.nr = DF_VECTOR;
195 vcpu->arch.exception.error_code = 0;
196 return;
197 default:
198 /* replace previous exception with a new one in a hope
199 that instruction re-execution will regenerate lost
200 exception */
201 vcpu->arch.exception.pending = false;
202 break;
203 }
204 }
205 vcpu->arch.cr2 = addr; 342 vcpu->arch.cr2 = addr;
206 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code); 343 kvm_queue_exception_e(vcpu, PF_VECTOR, error_code);
207} 344}
@@ -214,11 +351,7 @@ EXPORT_SYMBOL_GPL(kvm_inject_nmi);
214 351
215void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code) 352void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
216{ 353{
217 WARN_ON(vcpu->arch.exception.pending); 354 kvm_multiple_exception(vcpu, nr, true, error_code);
218 vcpu->arch.exception.pending = true;
219 vcpu->arch.exception.has_error_code = true;
220 vcpu->arch.exception.nr = nr;
221 vcpu->arch.exception.error_code = error_code;
222} 355}
223EXPORT_SYMBOL_GPL(kvm_queue_exception_e); 356EXPORT_SYMBOL_GPL(kvm_queue_exception_e);
224 357
@@ -296,41 +429,38 @@ out:
296 429
297void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) 430void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
298{ 431{
299 if (cr0 & CR0_RESERVED_BITS) { 432 cr0 |= X86_CR0_ET;
300 printk(KERN_DEBUG "set_cr0: 0x%lx #GP, reserved bits 0x%lx\n", 433
301 cr0, vcpu->arch.cr0); 434#ifdef CONFIG_X86_64
435 if (cr0 & 0xffffffff00000000UL) {
302 kvm_inject_gp(vcpu, 0); 436 kvm_inject_gp(vcpu, 0);
303 return; 437 return;
304 } 438 }
439#endif
440
441 cr0 &= ~CR0_RESERVED_BITS;
305 442
306 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) { 443 if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD)) {
307 printk(KERN_DEBUG "set_cr0: #GP, CD == 0 && NW == 1\n");
308 kvm_inject_gp(vcpu, 0); 444 kvm_inject_gp(vcpu, 0);
309 return; 445 return;
310 } 446 }
311 447
312 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) { 448 if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE)) {
313 printk(KERN_DEBUG "set_cr0: #GP, set PG flag "
314 "and a clear PE flag\n");
315 kvm_inject_gp(vcpu, 0); 449 kvm_inject_gp(vcpu, 0);
316 return; 450 return;
317 } 451 }
318 452
319 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { 453 if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
320#ifdef CONFIG_X86_64 454#ifdef CONFIG_X86_64
321 if ((vcpu->arch.shadow_efer & EFER_LME)) { 455 if ((vcpu->arch.efer & EFER_LME)) {
322 int cs_db, cs_l; 456 int cs_db, cs_l;
323 457
324 if (!is_pae(vcpu)) { 458 if (!is_pae(vcpu)) {
325 printk(KERN_DEBUG "set_cr0: #GP, start paging "
326 "in long mode while PAE is disabled\n");
327 kvm_inject_gp(vcpu, 0); 459 kvm_inject_gp(vcpu, 0);
328 return; 460 return;
329 } 461 }
330 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 462 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
331 if (cs_l) { 463 if (cs_l) {
332 printk(KERN_DEBUG "set_cr0: #GP, start paging "
333 "in long mode while CS.L == 1\n");
334 kvm_inject_gp(vcpu, 0); 464 kvm_inject_gp(vcpu, 0);
335 return; 465 return;
336 466
@@ -338,8 +468,6 @@ void kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
338 } else 468 } else
339#endif 469#endif
340 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 470 if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
341 printk(KERN_DEBUG "set_cr0: #GP, pdptrs "
342 "reserved bits\n");
343 kvm_inject_gp(vcpu, 0); 471 kvm_inject_gp(vcpu, 0);
344 return; 472 return;
345 } 473 }
@@ -356,38 +484,33 @@ EXPORT_SYMBOL_GPL(kvm_set_cr0);
356 484
357void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw) 485void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
358{ 486{
359 kvm_set_cr0(vcpu, (vcpu->arch.cr0 & ~0x0ful) | (msw & 0x0f)); 487 kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0ful) | (msw & 0x0f));
360} 488}
361EXPORT_SYMBOL_GPL(kvm_lmsw); 489EXPORT_SYMBOL_GPL(kvm_lmsw);
362 490
363void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) 491void kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
364{ 492{
365 unsigned long old_cr4 = vcpu->arch.cr4; 493 unsigned long old_cr4 = kvm_read_cr4(vcpu);
366 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE; 494 unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE | X86_CR4_PAE;
367 495
368 if (cr4 & CR4_RESERVED_BITS) { 496 if (cr4 & CR4_RESERVED_BITS) {
369 printk(KERN_DEBUG "set_cr4: #GP, reserved bits\n");
370 kvm_inject_gp(vcpu, 0); 497 kvm_inject_gp(vcpu, 0);
371 return; 498 return;
372 } 499 }
373 500
374 if (is_long_mode(vcpu)) { 501 if (is_long_mode(vcpu)) {
375 if (!(cr4 & X86_CR4_PAE)) { 502 if (!(cr4 & X86_CR4_PAE)) {
376 printk(KERN_DEBUG "set_cr4: #GP, clearing PAE while "
377 "in long mode\n");
378 kvm_inject_gp(vcpu, 0); 503 kvm_inject_gp(vcpu, 0);
379 return; 504 return;
380 } 505 }
381 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE) 506 } else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
382 && ((cr4 ^ old_cr4) & pdptr_bits) 507 && ((cr4 ^ old_cr4) & pdptr_bits)
383 && !load_pdptrs(vcpu, vcpu->arch.cr3)) { 508 && !load_pdptrs(vcpu, vcpu->arch.cr3)) {
384 printk(KERN_DEBUG "set_cr4: #GP, pdptrs reserved bits\n");
385 kvm_inject_gp(vcpu, 0); 509 kvm_inject_gp(vcpu, 0);
386 return; 510 return;
387 } 511 }
388 512
389 if (cr4 & X86_CR4_VMXE) { 513 if (cr4 & X86_CR4_VMXE) {
390 printk(KERN_DEBUG "set_cr4: #GP, setting VMXE\n");
391 kvm_inject_gp(vcpu, 0); 514 kvm_inject_gp(vcpu, 0);
392 return; 515 return;
393 } 516 }
@@ -408,21 +531,16 @@ void kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
408 531
409 if (is_long_mode(vcpu)) { 532 if (is_long_mode(vcpu)) {
410 if (cr3 & CR3_L_MODE_RESERVED_BITS) { 533 if (cr3 & CR3_L_MODE_RESERVED_BITS) {
411 printk(KERN_DEBUG "set_cr3: #GP, reserved bits\n");
412 kvm_inject_gp(vcpu, 0); 534 kvm_inject_gp(vcpu, 0);
413 return; 535 return;
414 } 536 }
415 } else { 537 } else {
416 if (is_pae(vcpu)) { 538 if (is_pae(vcpu)) {
417 if (cr3 & CR3_PAE_RESERVED_BITS) { 539 if (cr3 & CR3_PAE_RESERVED_BITS) {
418 printk(KERN_DEBUG
419 "set_cr3: #GP, reserved bits\n");
420 kvm_inject_gp(vcpu, 0); 540 kvm_inject_gp(vcpu, 0);
421 return; 541 return;
422 } 542 }
423 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) { 543 if (is_paging(vcpu) && !load_pdptrs(vcpu, cr3)) {
424 printk(KERN_DEBUG "set_cr3: #GP, pdptrs "
425 "reserved bits\n");
426 kvm_inject_gp(vcpu, 0); 544 kvm_inject_gp(vcpu, 0);
427 return; 545 return;
428 } 546 }
@@ -454,7 +572,6 @@ EXPORT_SYMBOL_GPL(kvm_set_cr3);
454void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8) 572void kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
455{ 573{
456 if (cr8 & CR8_RESERVED_BITS) { 574 if (cr8 & CR8_RESERVED_BITS) {
457 printk(KERN_DEBUG "set_cr8: #GP, reserved bits 0x%lx\n", cr8);
458 kvm_inject_gp(vcpu, 0); 575 kvm_inject_gp(vcpu, 0);
459 return; 576 return;
460 } 577 }
@@ -484,16 +601,21 @@ static inline u32 bit(int bitno)
484 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST. 601 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
485 * 602 *
486 * This list is modified at module load time to reflect the 603 * This list is modified at module load time to reflect the
487 * capabilities of the host cpu. 604 * capabilities of the host cpu. This capabilities test skips MSRs that are
605 * kvm-specific. Those are put in the beginning of the list.
488 */ 606 */
607
608#define KVM_SAVE_MSRS_BEGIN 5
489static u32 msrs_to_save[] = { 609static u32 msrs_to_save[] = {
610 MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
611 HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
612 HV_X64_MSR_APIC_ASSIST_PAGE,
490 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP, 613 MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
491 MSR_K6_STAR, 614 MSR_K6_STAR,
492#ifdef CONFIG_X86_64 615#ifdef CONFIG_X86_64
493 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR, 616 MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
494#endif 617#endif
495 MSR_IA32_TSC, MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK, 618 MSR_IA32_TSC, MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
496 MSR_IA32_PERF_STATUS, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
497}; 619};
498 620
499static unsigned num_msrs_to_save; 621static unsigned num_msrs_to_save;
@@ -505,15 +627,12 @@ static u32 emulated_msrs[] = {
505static void set_efer(struct kvm_vcpu *vcpu, u64 efer) 627static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
506{ 628{
507 if (efer & efer_reserved_bits) { 629 if (efer & efer_reserved_bits) {
508 printk(KERN_DEBUG "set_efer: 0x%llx #GP, reserved bits\n",
509 efer);
510 kvm_inject_gp(vcpu, 0); 630 kvm_inject_gp(vcpu, 0);
511 return; 631 return;
512 } 632 }
513 633
514 if (is_paging(vcpu) 634 if (is_paging(vcpu)
515 && (vcpu->arch.shadow_efer & EFER_LME) != (efer & EFER_LME)) { 635 && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) {
516 printk(KERN_DEBUG "set_efer: #GP, change LME while paging\n");
517 kvm_inject_gp(vcpu, 0); 636 kvm_inject_gp(vcpu, 0);
518 return; 637 return;
519 } 638 }
@@ -523,7 +642,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
523 642
524 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 643 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
525 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) { 644 if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT))) {
526 printk(KERN_DEBUG "set_efer: #GP, enable FFXSR w/o CPUID capability\n");
527 kvm_inject_gp(vcpu, 0); 645 kvm_inject_gp(vcpu, 0);
528 return; 646 return;
529 } 647 }
@@ -534,7 +652,6 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
534 652
535 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0); 653 feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
536 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) { 654 if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM))) {
537 printk(KERN_DEBUG "set_efer: #GP, enable SVM w/o SVM\n");
538 kvm_inject_gp(vcpu, 0); 655 kvm_inject_gp(vcpu, 0);
539 return; 656 return;
540 } 657 }
@@ -543,9 +660,9 @@ static void set_efer(struct kvm_vcpu *vcpu, u64 efer)
543 kvm_x86_ops->set_efer(vcpu, efer); 660 kvm_x86_ops->set_efer(vcpu, efer);
544 661
545 efer &= ~EFER_LMA; 662 efer &= ~EFER_LMA;
546 efer |= vcpu->arch.shadow_efer & EFER_LMA; 663 efer |= vcpu->arch.efer & EFER_LMA;
547 664
548 vcpu->arch.shadow_efer = efer; 665 vcpu->arch.efer = efer;
549 666
550 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled; 667 vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;
551 kvm_mmu_reset_context(vcpu); 668 kvm_mmu_reset_context(vcpu);
@@ -580,7 +697,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
580{ 697{
581 static int version; 698 static int version;
582 struct pvclock_wall_clock wc; 699 struct pvclock_wall_clock wc;
583 struct timespec now, sys, boot; 700 struct timespec boot;
584 701
585 if (!wall_clock) 702 if (!wall_clock)
586 return; 703 return;
@@ -595,9 +712,7 @@ static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
595 * wall clock specified here. guest system time equals host 712 * wall clock specified here. guest system time equals host
596 * system time for us, thus we must fill in host boot time here. 713 * system time for us, thus we must fill in host boot time here.
597 */ 714 */
598 now = current_kernel_time(); 715 getboottime(&boot);
599 ktime_get_ts(&sys);
600 boot = ns_to_timespec(timespec_to_ns(&now) - timespec_to_ns(&sys));
601 716
602 wc.sec = boot.tv_sec; 717 wc.sec = boot.tv_sec;
603 wc.nsec = boot.tv_nsec; 718 wc.nsec = boot.tv_nsec;
@@ -672,12 +787,14 @@ static void kvm_write_guest_time(struct kvm_vcpu *v)
672 local_irq_save(flags); 787 local_irq_save(flags);
673 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp); 788 kvm_get_msr(v, MSR_IA32_TSC, &vcpu->hv_clock.tsc_timestamp);
674 ktime_get_ts(&ts); 789 ktime_get_ts(&ts);
790 monotonic_to_bootbased(&ts);
675 local_irq_restore(flags); 791 local_irq_restore(flags);
676 792
677 /* With all the info we got, fill in the values */ 793 /* With all the info we got, fill in the values */
678 794
679 vcpu->hv_clock.system_time = ts.tv_nsec + 795 vcpu->hv_clock.system_time = ts.tv_nsec +
680 (NSEC_PER_SEC * (u64)ts.tv_sec); 796 (NSEC_PER_SEC * (u64)ts.tv_sec) + v->kvm->arch.kvmclock_offset;
797
681 /* 798 /*
682 * The interface expects us to write an even number signaling that the 799 * The interface expects us to write an even number signaling that the
683 * update is finished. Since the guest won't see the intermediate 800 * update is finished. Since the guest won't see the intermediate
@@ -823,9 +940,13 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
823 if (msr >= MSR_IA32_MC0_CTL && 940 if (msr >= MSR_IA32_MC0_CTL &&
824 msr < MSR_IA32_MC0_CTL + 4 * bank_num) { 941 msr < MSR_IA32_MC0_CTL + 4 * bank_num) {
825 u32 offset = msr - MSR_IA32_MC0_CTL; 942 u32 offset = msr - MSR_IA32_MC0_CTL;
826	/* only 0 or all 1s can be written to IA32_MCi_CTL */ 943	/* only 0 or all 1s can be written to IA32_MCi_CTL;
 944	 * some Linux kernels, though, clear bit 10 in bank 4 to
 945	 * work around a BIOS/GART TBL issue on AMD K8s, so ignore
 946	 * this to avoid an uncaught #GP in the guest
 947	 */
827 if ((offset & 0x3) == 0 && 948 if ((offset & 0x3) == 0 &&
828 data != 0 && data != ~(u64)0) 949 data != 0 && (data | (1 << 10)) != ~(u64)0)
829 return -1; 950 return -1;
830 vcpu->arch.mce_banks[offset] = data; 951 vcpu->arch.mce_banks[offset] = data;
831 break; 952 break;
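
The relaxed check above accepts exactly three values for an IA32_MCi_CTL write. Spelled out as a stand-alone predicate (a sketch only; the helper name is invented here):

	static bool mci_ctl_write_ok(u64 data)
	{
		/* 0, all 1s, or all 1s with just bit 10 (the K8 GART workaround bit) clear */
		return data == 0 || (data | (1ULL << 10)) == ~(u64)0;
	}
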
@@ -835,6 +956,132 @@ static int set_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 data)
835 return 0; 956 return 0;
836} 957}
837 958
959static int xen_hvm_config(struct kvm_vcpu *vcpu, u64 data)
960{
961 struct kvm *kvm = vcpu->kvm;
962 int lm = is_long_mode(vcpu);
963 u8 *blob_addr = lm ? (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_64
964 : (u8 *)(long)kvm->arch.xen_hvm_config.blob_addr_32;
965 u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
966 : kvm->arch.xen_hvm_config.blob_size_32;
967 u32 page_num = data & ~PAGE_MASK;
968 u64 page_addr = data & PAGE_MASK;
969 u8 *page;
970 int r;
971
972 r = -E2BIG;
973 if (page_num >= blob_size)
974 goto out;
975 r = -ENOMEM;
976 page = kzalloc(PAGE_SIZE, GFP_KERNEL);
977 if (!page)
978 goto out;
979 r = -EFAULT;
980 if (copy_from_user(page, blob_addr + (page_num * PAGE_SIZE), PAGE_SIZE))
981 goto out_free;
982 if (kvm_write_guest(kvm, page_addr, page, PAGE_SIZE))
983 goto out_free;
984 r = 0;
985out_free:
986 kfree(page);
987out:
988 return r;
989}
990
991static bool kvm_hv_hypercall_enabled(struct kvm *kvm)
992{
993 return kvm->arch.hv_hypercall & HV_X64_MSR_HYPERCALL_ENABLE;
994}
995
996static bool kvm_hv_msr_partition_wide(u32 msr)
997{
998 bool r = false;
999 switch (msr) {
1000 case HV_X64_MSR_GUEST_OS_ID:
1001 case HV_X64_MSR_HYPERCALL:
1002 r = true;
1003 break;
1004 }
1005
1006 return r;
1007}
1008
1009static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1010{
1011 struct kvm *kvm = vcpu->kvm;
1012
1013 switch (msr) {
1014 case HV_X64_MSR_GUEST_OS_ID:
1015 kvm->arch.hv_guest_os_id = data;
1016 /* setting guest os id to zero disables hypercall page */
1017 if (!kvm->arch.hv_guest_os_id)
1018 kvm->arch.hv_hypercall &= ~HV_X64_MSR_HYPERCALL_ENABLE;
1019 break;
1020 case HV_X64_MSR_HYPERCALL: {
1021 u64 gfn;
1022 unsigned long addr;
1023 u8 instructions[4];
1024
1025 /* if guest os id is not set hypercall should remain disabled */
1026 if (!kvm->arch.hv_guest_os_id)
1027 break;
1028 if (!(data & HV_X64_MSR_HYPERCALL_ENABLE)) {
1029 kvm->arch.hv_hypercall = data;
1030 break;
1031 }
1032 gfn = data >> HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT;
1033 addr = gfn_to_hva(kvm, gfn);
1034 if (kvm_is_error_hva(addr))
1035 return 1;
1036 kvm_x86_ops->patch_hypercall(vcpu, instructions);
1037 ((unsigned char *)instructions)[3] = 0xc3; /* ret */
1038 if (copy_to_user((void __user *)addr, instructions, 4))
1039 return 1;
1040 kvm->arch.hv_hypercall = data;
1041 break;
1042 }
1043 default:
1044 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1045 "data 0x%llx\n", msr, data);
1046 return 1;
1047 }
1048 return 0;
1049}
1050
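
For reference, a Hyper-V enlightened guest would enable the hypercall page handled above with two MSR writes, roughly as follows (a hedged sketch: the helper name and the hc_page allocation are assumptions, and a real guest encodes its OS id per the Hyper-V spec rather than using 1):

	static void hv_enable_hypercall_page(struct page *hc_page)
	{
		u64 gfn = page_to_pfn(hc_page);	/* inside the guest, pfn == gfn */

		wrmsrl(HV_X64_MSR_GUEST_OS_ID, 1);	/* any non-zero id */
		wrmsrl(HV_X64_MSR_HYPERCALL,
		       (gfn << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) |
		       HV_X64_MSR_HYPERCALL_ENABLE);
	}
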
1051static int set_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 data)
1052{
1053 switch (msr) {
1054 case HV_X64_MSR_APIC_ASSIST_PAGE: {
1055 unsigned long addr;
1056
1057 if (!(data & HV_X64_MSR_APIC_ASSIST_PAGE_ENABLE)) {
1058 vcpu->arch.hv_vapic = data;
1059 break;
1060 }
1061 addr = gfn_to_hva(vcpu->kvm, data >>
1062 HV_X64_MSR_APIC_ASSIST_PAGE_ADDRESS_SHIFT);
1063 if (kvm_is_error_hva(addr))
1064 return 1;
1065 if (clear_user((void __user *)addr, PAGE_SIZE))
1066 return 1;
1067 vcpu->arch.hv_vapic = data;
1068 break;
1069 }
1070 case HV_X64_MSR_EOI:
1071 return kvm_hv_vapic_msr_write(vcpu, APIC_EOI, data);
1072 case HV_X64_MSR_ICR:
1073 return kvm_hv_vapic_msr_write(vcpu, APIC_ICR, data);
1074 case HV_X64_MSR_TPR:
1075 return kvm_hv_vapic_msr_write(vcpu, APIC_TASKPRI, data);
1076 default:
1077 pr_unimpl(vcpu, "HYPER-V unimplemented wrmsr: 0x%x "
1078 "data 0x%llx\n", msr, data);
1079 return 1;
1080 }
1081
1082 return 0;
1083}
1084
838int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data) 1085int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
839{ 1086{
840 switch (msr) { 1087 switch (msr) {
@@ -949,7 +1196,19 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 data)
949 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: " 1196 pr_unimpl(vcpu, "unimplemented perfctr wrmsr: "
950 "0x%x data 0x%llx\n", msr, data); 1197 "0x%x data 0x%llx\n", msr, data);
951 break; 1198 break;
1199 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1200 if (kvm_hv_msr_partition_wide(msr)) {
1201 int r;
1202 mutex_lock(&vcpu->kvm->lock);
1203 r = set_msr_hyperv_pw(vcpu, msr, data);
1204 mutex_unlock(&vcpu->kvm->lock);
1205 return r;
1206 } else
1207 return set_msr_hyperv(vcpu, msr, data);
1208 break;
952 default: 1209 default:
1210 if (msr && (msr == vcpu->kvm->arch.xen_hvm_config.msr))
1211 return xen_hvm_config(vcpu, data);
953 if (!ignore_msrs) { 1212 if (!ignore_msrs) {
954 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n", 1213 pr_unimpl(vcpu, "unhandled wrmsr: 0x%x data %llx\n",
955 msr, data); 1214 msr, data);
@@ -1046,6 +1305,54 @@ static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1046 return 0; 1305 return 0;
1047} 1306}
1048 1307
1308static int get_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1309{
1310 u64 data = 0;
1311 struct kvm *kvm = vcpu->kvm;
1312
1313 switch (msr) {
1314 case HV_X64_MSR_GUEST_OS_ID:
1315 data = kvm->arch.hv_guest_os_id;
1316 break;
1317 case HV_X64_MSR_HYPERCALL:
1318 data = kvm->arch.hv_hypercall;
1319 break;
1320 default:
1321 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1322 return 1;
1323 }
1324
1325 *pdata = data;
1326 return 0;
1327}
1328
1329static int get_msr_hyperv(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1330{
1331 u64 data = 0;
1332
1333 switch (msr) {
1334 case HV_X64_MSR_VP_INDEX: {
1335 int r;
1336 struct kvm_vcpu *v;
1337 kvm_for_each_vcpu(r, v, vcpu->kvm)
1338 if (v == vcpu)
1339 data = r;
1340 break;
1341 }
1342 case HV_X64_MSR_EOI:
1343 return kvm_hv_vapic_msr_read(vcpu, APIC_EOI, pdata);
1344 case HV_X64_MSR_ICR:
1345 return kvm_hv_vapic_msr_read(vcpu, APIC_ICR, pdata);
1346 case HV_X64_MSR_TPR:
1347 return kvm_hv_vapic_msr_read(vcpu, APIC_TASKPRI, pdata);
1348 default:
1349 pr_unimpl(vcpu, "Hyper-V unhandled rdmsr: 0x%x\n", msr);
1350 return 1;
1351 }
1352 *pdata = data;
1353 return 0;
1354}
1355
1049int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata) 1356int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1050{ 1357{
1051 u64 data; 1358 u64 data;
@@ -1097,7 +1404,7 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1097 data |= (((uint64_t)4ULL) << 40); 1404 data |= (((uint64_t)4ULL) << 40);
1098 break; 1405 break;
1099 case MSR_EFER: 1406 case MSR_EFER:
1100 data = vcpu->arch.shadow_efer; 1407 data = vcpu->arch.efer;
1101 break; 1408 break;
1102 case MSR_KVM_WALL_CLOCK: 1409 case MSR_KVM_WALL_CLOCK:
1103 data = vcpu->kvm->arch.wall_clock; 1410 data = vcpu->kvm->arch.wall_clock;
@@ -1112,6 +1419,16 @@ int kvm_get_msr_common(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata)
1112 case MSR_IA32_MCG_STATUS: 1419 case MSR_IA32_MCG_STATUS:
1113 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1: 1420 case MSR_IA32_MC0_CTL ... MSR_IA32_MC0_CTL + 4 * KVM_MAX_MCE_BANKS - 1:
1114 return get_msr_mce(vcpu, msr, pdata); 1421 return get_msr_mce(vcpu, msr, pdata);
1422 case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15:
1423 if (kvm_hv_msr_partition_wide(msr)) {
1424 int r;
1425 mutex_lock(&vcpu->kvm->lock);
1426 r = get_msr_hyperv_pw(vcpu, msr, pdata);
1427 mutex_unlock(&vcpu->kvm->lock);
1428 return r;
1429 } else
1430 return get_msr_hyperv(vcpu, msr, pdata);
1431 break;
1115 default: 1432 default:
1116 if (!ignore_msrs) { 1433 if (!ignore_msrs) {
1117 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr); 1434 pr_unimpl(vcpu, "unhandled rdmsr: 0x%x\n", msr);
@@ -1137,15 +1454,15 @@ static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs,
1137 int (*do_msr)(struct kvm_vcpu *vcpu, 1454 int (*do_msr)(struct kvm_vcpu *vcpu,
1138 unsigned index, u64 *data)) 1455 unsigned index, u64 *data))
1139{ 1456{
1140 int i; 1457 int i, idx;
1141 1458
1142 vcpu_load(vcpu); 1459 vcpu_load(vcpu);
1143 1460
1144 down_read(&vcpu->kvm->slots_lock); 1461 idx = srcu_read_lock(&vcpu->kvm->srcu);
1145 for (i = 0; i < msrs->nmsrs; ++i) 1462 for (i = 0; i < msrs->nmsrs; ++i)
1146 if (do_msr(vcpu, entries[i].index, &entries[i].data)) 1463 if (do_msr(vcpu, entries[i].index, &entries[i].data))
1147 break; 1464 break;
1148 up_read(&vcpu->kvm->slots_lock); 1465 srcu_read_unlock(&vcpu->kvm->srcu, idx);
1149 1466
1150 vcpu_put(vcpu); 1467 vcpu_put(vcpu);
1151 1468
@@ -1224,6 +1541,14 @@ int kvm_dev_ioctl_check_extension(long ext)
1224 case KVM_CAP_PIT2: 1541 case KVM_CAP_PIT2:
1225 case KVM_CAP_PIT_STATE2: 1542 case KVM_CAP_PIT_STATE2:
1226 case KVM_CAP_SET_IDENTITY_MAP_ADDR: 1543 case KVM_CAP_SET_IDENTITY_MAP_ADDR:
1544 case KVM_CAP_XEN_HVM:
1545 case KVM_CAP_ADJUST_CLOCK:
1546 case KVM_CAP_VCPU_EVENTS:
1547 case KVM_CAP_HYPERV:
1548 case KVM_CAP_HYPERV_VAPIC:
1549 case KVM_CAP_HYPERV_SPIN:
1550 case KVM_CAP_PCI_SEGMENT:
1551 case KVM_CAP_X86_ROBUST_SINGLESTEP:
1227 r = 1; 1552 r = 1;
1228 break; 1553 break;
1229 case KVM_CAP_COALESCED_MMIO: 1554 case KVM_CAP_COALESCED_MMIO:
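
Userspace is expected to probe these capabilities before using the matching ioctls; a minimal sketch (assumes <sys/ioctl.h> and <linux/kvm.h>, with kvm_fd an open /dev/kvm descriptor):

	static int have_vcpu_events(int kvm_fd)
	{
		return ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_VCPU_EVENTS) > 0;
	}
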
@@ -1238,8 +1563,8 @@ int kvm_dev_ioctl_check_extension(long ext)
1238 case KVM_CAP_NR_MEMSLOTS: 1563 case KVM_CAP_NR_MEMSLOTS:
1239 r = KVM_MEMORY_SLOTS; 1564 r = KVM_MEMORY_SLOTS;
1240 break; 1565 break;
1241 case KVM_CAP_PV_MMU: 1566 case KVM_CAP_PV_MMU: /* obsolete */
1242 r = !tdp_enabled; 1567 r = 0;
1243 break; 1568 break;
1244 case KVM_CAP_IOMMU: 1569 case KVM_CAP_IOMMU:
1245 r = iommu_found(); 1570 r = iommu_found();
@@ -1326,13 +1651,19 @@ out:
1326void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) 1651void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
1327{ 1652{
1328 kvm_x86_ops->vcpu_load(vcpu, cpu); 1653 kvm_x86_ops->vcpu_load(vcpu, cpu);
1654 if (unlikely(per_cpu(cpu_tsc_khz, cpu) == 0)) {
1655 unsigned long khz = cpufreq_quick_get(cpu);
1656 if (!khz)
1657 khz = tsc_khz;
1658 per_cpu(cpu_tsc_khz, cpu) = khz;
1659 }
1329 kvm_request_guest_time_update(vcpu); 1660 kvm_request_guest_time_update(vcpu);
1330} 1661}
1331 1662
1332void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) 1663void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
1333{ 1664{
1334 kvm_x86_ops->vcpu_put(vcpu);
1335 kvm_put_guest_fpu(vcpu); 1665 kvm_put_guest_fpu(vcpu);
1666 kvm_x86_ops->vcpu_put(vcpu);
1336} 1667}
1337 1668
1338static int is_efer_nx(void) 1669static int is_efer_nx(void)
@@ -1381,6 +1712,7 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1381 if (copy_from_user(cpuid_entries, entries, 1712 if (copy_from_user(cpuid_entries, entries,
1382 cpuid->nent * sizeof(struct kvm_cpuid_entry))) 1713 cpuid->nent * sizeof(struct kvm_cpuid_entry)))
1383 goto out_free; 1714 goto out_free;
1715 vcpu_load(vcpu);
1384 for (i = 0; i < cpuid->nent; i++) { 1716 for (i = 0; i < cpuid->nent; i++) {
1385 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function; 1717 vcpu->arch.cpuid_entries[i].function = cpuid_entries[i].function;
1386 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax; 1718 vcpu->arch.cpuid_entries[i].eax = cpuid_entries[i].eax;
@@ -1397,6 +1729,8 @@ static int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
1397 cpuid_fix_nx_cap(vcpu); 1729 cpuid_fix_nx_cap(vcpu);
1398 r = 0; 1730 r = 0;
1399 kvm_apic_set_version(vcpu); 1731 kvm_apic_set_version(vcpu);
1732 kvm_x86_ops->cpuid_update(vcpu);
1733 vcpu_put(vcpu);
1400 1734
1401out_free: 1735out_free:
1402 vfree(cpuid_entries); 1736 vfree(cpuid_entries);
@@ -1417,8 +1751,11 @@ static int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
1417 if (copy_from_user(&vcpu->arch.cpuid_entries, entries, 1751 if (copy_from_user(&vcpu->arch.cpuid_entries, entries,
1418 cpuid->nent * sizeof(struct kvm_cpuid_entry2))) 1752 cpuid->nent * sizeof(struct kvm_cpuid_entry2)))
1419 goto out; 1753 goto out;
1754 vcpu_load(vcpu);
1420 vcpu->arch.cpuid_nent = cpuid->nent; 1755 vcpu->arch.cpuid_nent = cpuid->nent;
1421 kvm_apic_set_version(vcpu); 1756 kvm_apic_set_version(vcpu);
1757 kvm_x86_ops->cpuid_update(vcpu);
1758 vcpu_put(vcpu);
1422 return 0; 1759 return 0;
1423 1760
1424out: 1761out:
@@ -1461,12 +1798,15 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1461 u32 index, int *nent, int maxnent) 1798 u32 index, int *nent, int maxnent)
1462{ 1799{
1463 unsigned f_nx = is_efer_nx() ? F(NX) : 0; 1800 unsigned f_nx = is_efer_nx() ? F(NX) : 0;
1464 unsigned f_gbpages = kvm_x86_ops->gb_page_enable() ? F(GBPAGES) : 0;
1465#ifdef CONFIG_X86_64 1801#ifdef CONFIG_X86_64
1802 unsigned f_gbpages = (kvm_x86_ops->get_lpage_level() == PT_PDPE_LEVEL)
1803 ? F(GBPAGES) : 0;
1466 unsigned f_lm = F(LM); 1804 unsigned f_lm = F(LM);
1467#else 1805#else
1806 unsigned f_gbpages = 0;
1468 unsigned f_lm = 0; 1807 unsigned f_lm = 0;
1469#endif 1808#endif
1809 unsigned f_rdtscp = kvm_x86_ops->rdtscp_supported() ? F(RDTSCP) : 0;
1470 1810
1471 /* cpuid 1.edx */ 1811 /* cpuid 1.edx */
1472 const u32 kvm_supported_word0_x86_features = 1812 const u32 kvm_supported_word0_x86_features =
@@ -1486,7 +1826,7 @@ static void do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function,
1486 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) | 1826 F(MTRR) | F(PGE) | F(MCA) | F(CMOV) |
1487 F(PAT) | F(PSE36) | 0 /* Reserved */ | 1827 F(PAT) | F(PSE36) | 0 /* Reserved */ |
1488 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) | 1828 f_nx | 0 /* Reserved */ | F(MMXEXT) | F(MMX) |
1489 F(FXSR) | F(FXSR_OPT) | f_gbpages | 0 /* RDTSCP */ | 1829 F(FXSR) | F(FXSR_OPT) | f_gbpages | f_rdtscp |
1490 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW); 1830 0 /* Reserved */ | f_lm | F(3DNOWEXT) | F(3DNOW);
1491 /* cpuid 1.ecx */ 1831 /* cpuid 1.ecx */
1492 const u32 kvm_supported_word4_x86_features = 1832 const u32 kvm_supported_word4_x86_features =
@@ -1733,7 +2073,7 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1733 return 0; 2073 return 0;
1734 if (mce->status & MCI_STATUS_UC) { 2074 if (mce->status & MCI_STATUS_UC) {
1735 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || 2075 if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) ||
1736 !(vcpu->arch.cr4 & X86_CR4_MCE)) { 2076 !kvm_read_cr4_bits(vcpu, X86_CR4_MCE)) {
1737 printk(KERN_DEBUG "kvm: set_mce: " 2077 printk(KERN_DEBUG "kvm: set_mce: "
1738 "injects mce exception while " 2078 "injects mce exception while "
1739 "previous one is in progress!\n"); 2079 "previous one is in progress!\n");
@@ -1759,6 +2099,65 @@ static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu,
1759 return 0; 2099 return 0;
1760} 2100}
1761 2101
2102static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu,
2103 struct kvm_vcpu_events *events)
2104{
2105 vcpu_load(vcpu);
2106
2107 events->exception.injected = vcpu->arch.exception.pending;
2108 events->exception.nr = vcpu->arch.exception.nr;
2109 events->exception.has_error_code = vcpu->arch.exception.has_error_code;
2110 events->exception.error_code = vcpu->arch.exception.error_code;
2111
2112 events->interrupt.injected = vcpu->arch.interrupt.pending;
2113 events->interrupt.nr = vcpu->arch.interrupt.nr;
2114 events->interrupt.soft = vcpu->arch.interrupt.soft;
2115
2116 events->nmi.injected = vcpu->arch.nmi_injected;
2117 events->nmi.pending = vcpu->arch.nmi_pending;
2118 events->nmi.masked = kvm_x86_ops->get_nmi_mask(vcpu);
2119
2120 events->sipi_vector = vcpu->arch.sipi_vector;
2121
2122 events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING
2123 | KVM_VCPUEVENT_VALID_SIPI_VECTOR);
2124
2125 vcpu_put(vcpu);
2126}
2127
2128static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu,
2129 struct kvm_vcpu_events *events)
2130{
2131 if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING
2132 | KVM_VCPUEVENT_VALID_SIPI_VECTOR))
2133 return -EINVAL;
2134
2135 vcpu_load(vcpu);
2136
2137 vcpu->arch.exception.pending = events->exception.injected;
2138 vcpu->arch.exception.nr = events->exception.nr;
2139 vcpu->arch.exception.has_error_code = events->exception.has_error_code;
2140 vcpu->arch.exception.error_code = events->exception.error_code;
2141
2142 vcpu->arch.interrupt.pending = events->interrupt.injected;
2143 vcpu->arch.interrupt.nr = events->interrupt.nr;
2144 vcpu->arch.interrupt.soft = events->interrupt.soft;
2145 if (vcpu->arch.interrupt.pending && irqchip_in_kernel(vcpu->kvm))
2146 kvm_pic_clear_isr_ack(vcpu->kvm);
2147
2148 vcpu->arch.nmi_injected = events->nmi.injected;
2149 if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING)
2150 vcpu->arch.nmi_pending = events->nmi.pending;
2151 kvm_x86_ops->set_nmi_mask(vcpu, events->nmi.masked);
2152
2153 if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR)
2154 vcpu->arch.sipi_vector = events->sipi_vector;
2155
2156 vcpu_put(vcpu);
2157
2158 return 0;
2159}
2160
1762long kvm_arch_vcpu_ioctl(struct file *filp, 2161long kvm_arch_vcpu_ioctl(struct file *filp,
1763 unsigned int ioctl, unsigned long arg) 2162 unsigned int ioctl, unsigned long arg)
1764{ 2163{
@@ -1769,6 +2168,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1769 2168
1770 switch (ioctl) { 2169 switch (ioctl) {
1771 case KVM_GET_LAPIC: { 2170 case KVM_GET_LAPIC: {
2171 r = -EINVAL;
2172 if (!vcpu->arch.apic)
2173 goto out;
1772 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2174 lapic = kzalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1773 2175
1774 r = -ENOMEM; 2176 r = -ENOMEM;
@@ -1784,6 +2186,9 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1784 break; 2186 break;
1785 } 2187 }
1786 case KVM_SET_LAPIC: { 2188 case KVM_SET_LAPIC: {
2189 r = -EINVAL;
2190 if (!vcpu->arch.apic)
2191 goto out;
1787 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL); 2192 lapic = kmalloc(sizeof(struct kvm_lapic_state), GFP_KERNEL);
1788 r = -ENOMEM; 2193 r = -ENOMEM;
1789 if (!lapic) 2194 if (!lapic)
@@ -1910,6 +2315,27 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
1910 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce); 2315 r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
1911 break; 2316 break;
1912 } 2317 }
2318 case KVM_GET_VCPU_EVENTS: {
2319 struct kvm_vcpu_events events;
2320
2321 kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);
2322
2323 r = -EFAULT;
2324 if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
2325 break;
2326 r = 0;
2327 break;
2328 }
2329 case KVM_SET_VCPU_EVENTS: {
2330 struct kvm_vcpu_events events;
2331
2332 r = -EFAULT;
2333 if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
2334 break;
2335
2336 r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
2337 break;
2338 }
1913 default: 2339 default:
1914 r = -EINVAL; 2340 r = -EINVAL;
1915 } 2341 }
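
A hedged userspace sketch of driving the two new ioctls, e.g. when migrating pending event state between vcpus (assumes <sys/ioctl.h> and <linux/kvm.h>; the vcpu descriptors come from KVM_CREATE_VCPU):

	static int copy_vcpu_events(int src_vcpu_fd, int dst_vcpu_fd)
	{
		struct kvm_vcpu_events events;

		if (ioctl(src_vcpu_fd, KVM_GET_VCPU_EVENTS, &events) < 0)
			return -1;
		/* flag the optional fields so nmi.pending and sipi_vector are applied */
		events.flags = KVM_VCPUEVENT_VALID_NMI_PENDING |
			       KVM_VCPUEVENT_VALID_SIPI_VECTOR;
		return ioctl(dst_vcpu_fd, KVM_SET_VCPU_EVENTS, &events);
	}
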
@@ -1941,14 +2367,14 @@ static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm,
1941 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) 2367 if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES)
1942 return -EINVAL; 2368 return -EINVAL;
1943 2369
1944 down_write(&kvm->slots_lock); 2370 mutex_lock(&kvm->slots_lock);
1945 spin_lock(&kvm->mmu_lock); 2371 spin_lock(&kvm->mmu_lock);
1946 2372
1947 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); 2373 kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages);
1948 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; 2374 kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages;
1949 2375
1950 spin_unlock(&kvm->mmu_lock); 2376 spin_unlock(&kvm->mmu_lock);
1951 up_write(&kvm->slots_lock); 2377 mutex_unlock(&kvm->slots_lock);
1952 return 0; 2378 return 0;
1953} 2379}
1954 2380
@@ -1957,13 +2383,35 @@ static int kvm_vm_ioctl_get_nr_mmu_pages(struct kvm *kvm)
1957 return kvm->arch.n_alloc_mmu_pages; 2383 return kvm->arch.n_alloc_mmu_pages;
1958} 2384}
1959 2385
2386gfn_t unalias_gfn_instantiation(struct kvm *kvm, gfn_t gfn)
2387{
2388 int i;
2389 struct kvm_mem_alias *alias;
2390 struct kvm_mem_aliases *aliases;
2391
2392 aliases = rcu_dereference(kvm->arch.aliases);
2393
2394 for (i = 0; i < aliases->naliases; ++i) {
2395 alias = &aliases->aliases[i];
2396 if (alias->flags & KVM_ALIAS_INVALID)
2397 continue;
2398 if (gfn >= alias->base_gfn
2399 && gfn < alias->base_gfn + alias->npages)
2400 return alias->target_gfn + gfn - alias->base_gfn;
2401 }
2402 return gfn;
2403}
2404
1960gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn) 2405gfn_t unalias_gfn(struct kvm *kvm, gfn_t gfn)
1961{ 2406{
1962 int i; 2407 int i;
1963 struct kvm_mem_alias *alias; 2408 struct kvm_mem_alias *alias;
2409 struct kvm_mem_aliases *aliases;
1964 2410
1965 for (i = 0; i < kvm->arch.naliases; ++i) { 2411 aliases = rcu_dereference(kvm->arch.aliases);
1966 alias = &kvm->arch.aliases[i]; 2412
2413 for (i = 0; i < aliases->naliases; ++i) {
2414 alias = &aliases->aliases[i];
1967 if (gfn >= alias->base_gfn 2415 if (gfn >= alias->base_gfn
1968 && gfn < alias->base_gfn + alias->npages) 2416 && gfn < alias->base_gfn + alias->npages)
1969 return alias->target_gfn + gfn - alias->base_gfn; 2417 return alias->target_gfn + gfn - alias->base_gfn;
@@ -1981,6 +2429,7 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1981{ 2429{
1982 int r, n; 2430 int r, n;
1983 struct kvm_mem_alias *p; 2431 struct kvm_mem_alias *p;
2432 struct kvm_mem_aliases *aliases, *old_aliases;
1984 2433
1985 r = -EINVAL; 2434 r = -EINVAL;
1986 /* General sanity checks */ 2435 /* General sanity checks */
@@ -1997,26 +2446,48 @@ static int kvm_vm_ioctl_set_memory_alias(struct kvm *kvm,
1997 < alias->target_phys_addr) 2446 < alias->target_phys_addr)
1998 goto out; 2447 goto out;
1999 2448
2000 down_write(&kvm->slots_lock); 2449 r = -ENOMEM;
2001 spin_lock(&kvm->mmu_lock); 2450 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2451 if (!aliases)
2452 goto out;
2002 2453
2003 p = &kvm->arch.aliases[alias->slot]; 2454 mutex_lock(&kvm->slots_lock);
2455
2456 /* invalidate any gfn reference in case of deletion/shrinking */
2457 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2458 aliases->aliases[alias->slot].flags |= KVM_ALIAS_INVALID;
2459 old_aliases = kvm->arch.aliases;
2460 rcu_assign_pointer(kvm->arch.aliases, aliases);
2461 synchronize_srcu_expedited(&kvm->srcu);
2462 kvm_mmu_zap_all(kvm);
2463 kfree(old_aliases);
2464
2465 r = -ENOMEM;
2466 aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
2467 if (!aliases)
2468 goto out_unlock;
2469
2470 memcpy(aliases, kvm->arch.aliases, sizeof(struct kvm_mem_aliases));
2471
2472 p = &aliases->aliases[alias->slot];
2004 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT; 2473 p->base_gfn = alias->guest_phys_addr >> PAGE_SHIFT;
2005 p->npages = alias->memory_size >> PAGE_SHIFT; 2474 p->npages = alias->memory_size >> PAGE_SHIFT;
2006 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT; 2475 p->target_gfn = alias->target_phys_addr >> PAGE_SHIFT;
2476 p->flags &= ~(KVM_ALIAS_INVALID);
2007 2477
2008 for (n = KVM_ALIAS_SLOTS; n > 0; --n) 2478 for (n = KVM_ALIAS_SLOTS; n > 0; --n)
2009 if (kvm->arch.aliases[n - 1].npages) 2479 if (aliases->aliases[n - 1].npages)
2010 break; 2480 break;
2011 kvm->arch.naliases = n; 2481 aliases->naliases = n;
2012
2013 spin_unlock(&kvm->mmu_lock);
2014 kvm_mmu_zap_all(kvm);
2015
2016 up_write(&kvm->slots_lock);
2017 2482
2018 return 0; 2483 old_aliases = kvm->arch.aliases;
2484 rcu_assign_pointer(kvm->arch.aliases, aliases);
2485 synchronize_srcu_expedited(&kvm->srcu);
2486 kfree(old_aliases);
2487 r = 0;
2019 2488
2489out_unlock:
2490 mutex_unlock(&kvm->slots_lock);
2020out: 2491out:
2021 return r; 2492 return r;
2022} 2493}
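
The alias update above is the usual copy/publish/wait RCU sequence; reduced to its skeleton it looks like this (a sketch: update_copy() and the surrounding error handling are placeholders, not code from this patch):

	new = kmemdup(old, sizeof(*old), GFP_KERNEL);	/* private copy */
	if (!new)
		return -ENOMEM;
	update_copy(new);				/* mutate the private copy */
	rcu_assign_pointer(kvm->arch.aliases, new);	/* publish */
	synchronize_srcu_expedited(&kvm->srcu);		/* wait out current readers */
	kfree(old);					/* now safe to reclaim */
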
@@ -2038,9 +2509,7 @@ static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2038 sizeof(struct kvm_pic_state)); 2509 sizeof(struct kvm_pic_state));
2039 break; 2510 break;
2040 case KVM_IRQCHIP_IOAPIC: 2511 case KVM_IRQCHIP_IOAPIC:
2041 memcpy(&chip->chip.ioapic, 2512 r = kvm_get_ioapic(kvm, &chip->chip.ioapic);
2042 ioapic_irqchip(kvm),
2043 sizeof(struct kvm_ioapic_state));
2044 break; 2513 break;
2045 default: 2514 default:
2046 r = -EINVAL; 2515 r = -EINVAL;
@@ -2056,25 +2525,21 @@ static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip)
2056 r = 0; 2525 r = 0;
2057 switch (chip->chip_id) { 2526 switch (chip->chip_id) {
2058 case KVM_IRQCHIP_PIC_MASTER: 2527 case KVM_IRQCHIP_PIC_MASTER:
2059 spin_lock(&pic_irqchip(kvm)->lock); 2528 raw_spin_lock(&pic_irqchip(kvm)->lock);
2060 memcpy(&pic_irqchip(kvm)->pics[0], 2529 memcpy(&pic_irqchip(kvm)->pics[0],
2061 &chip->chip.pic, 2530 &chip->chip.pic,
2062 sizeof(struct kvm_pic_state)); 2531 sizeof(struct kvm_pic_state));
2063 spin_unlock(&pic_irqchip(kvm)->lock); 2532 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2064 break; 2533 break;
2065 case KVM_IRQCHIP_PIC_SLAVE: 2534 case KVM_IRQCHIP_PIC_SLAVE:
2066 spin_lock(&pic_irqchip(kvm)->lock); 2535 raw_spin_lock(&pic_irqchip(kvm)->lock);
2067 memcpy(&pic_irqchip(kvm)->pics[1], 2536 memcpy(&pic_irqchip(kvm)->pics[1],
2068 &chip->chip.pic, 2537 &chip->chip.pic,
2069 sizeof(struct kvm_pic_state)); 2538 sizeof(struct kvm_pic_state));
2070 spin_unlock(&pic_irqchip(kvm)->lock); 2539 raw_spin_unlock(&pic_irqchip(kvm)->lock);
2071 break; 2540 break;
2072 case KVM_IRQCHIP_IOAPIC: 2541 case KVM_IRQCHIP_IOAPIC:
2073 mutex_lock(&kvm->irq_lock); 2542 r = kvm_set_ioapic(kvm, &chip->chip.ioapic);
2074 memcpy(ioapic_irqchip(kvm),
2075 &chip->chip.ioapic,
2076 sizeof(struct kvm_ioapic_state));
2077 mutex_unlock(&kvm->irq_lock);
2078 break; 2543 break;
2079 default: 2544 default:
2080 r = -EINVAL; 2545 r = -EINVAL;
@@ -2151,29 +2616,63 @@ static int kvm_vm_ioctl_reinject(struct kvm *kvm,
2151int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm, 2616int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2152 struct kvm_dirty_log *log) 2617 struct kvm_dirty_log *log)
2153{ 2618{
2154 int r; 2619 int r, i;
2155 int n;
2156 struct kvm_memory_slot *memslot; 2620 struct kvm_memory_slot *memslot;
2157 int is_dirty = 0; 2621 unsigned long n;
2622 unsigned long is_dirty = 0;
2623 unsigned long *dirty_bitmap = NULL;
2158 2624
2159 down_write(&kvm->slots_lock); 2625 mutex_lock(&kvm->slots_lock);
2160 2626
2161 r = kvm_get_dirty_log(kvm, log, &is_dirty); 2627 r = -EINVAL;
2162 if (r) 2628 if (log->slot >= KVM_MEMORY_SLOTS)
2629 goto out;
2630
2631 memslot = &kvm->memslots->memslots[log->slot];
2632 r = -ENOENT;
2633 if (!memslot->dirty_bitmap)
2634 goto out;
2635
2636 n = kvm_dirty_bitmap_bytes(memslot);
2637
2638 r = -ENOMEM;
2639 dirty_bitmap = vmalloc(n);
2640 if (!dirty_bitmap)
2163 goto out; 2641 goto out;
2642 memset(dirty_bitmap, 0, n);
2643
2644 for (i = 0; !is_dirty && i < n/sizeof(long); i++)
2645 is_dirty = memslot->dirty_bitmap[i];
2164 2646
2165 /* If nothing is dirty, don't bother messing with page tables. */ 2647 /* If nothing is dirty, don't bother messing with page tables. */
2166 if (is_dirty) { 2648 if (is_dirty) {
2649 struct kvm_memslots *slots, *old_slots;
2650
2167 spin_lock(&kvm->mmu_lock); 2651 spin_lock(&kvm->mmu_lock);
2168 kvm_mmu_slot_remove_write_access(kvm, log->slot); 2652 kvm_mmu_slot_remove_write_access(kvm, log->slot);
2169 spin_unlock(&kvm->mmu_lock); 2653 spin_unlock(&kvm->mmu_lock);
2170 memslot = &kvm->memslots[log->slot]; 2654
2171 n = ALIGN(memslot->npages, BITS_PER_LONG) / 8; 2655 slots = kzalloc(sizeof(struct kvm_memslots), GFP_KERNEL);
2172 memset(memslot->dirty_bitmap, 0, n); 2656 if (!slots)
2657 goto out_free;
2658
2659 memcpy(slots, kvm->memslots, sizeof(struct kvm_memslots));
2660 slots->memslots[log->slot].dirty_bitmap = dirty_bitmap;
2661
2662 old_slots = kvm->memslots;
2663 rcu_assign_pointer(kvm->memslots, slots);
2664 synchronize_srcu_expedited(&kvm->srcu);
2665 dirty_bitmap = old_slots->memslots[log->slot].dirty_bitmap;
2666 kfree(old_slots);
2173 } 2667 }
2668
2174 r = 0; 2669 r = 0;
2670 if (copy_to_user(log->dirty_bitmap, dirty_bitmap, n))
2671 r = -EFAULT;
2672out_free:
2673 vfree(dirty_bitmap);
2175out: 2674out:
2176 up_write(&kvm->slots_lock); 2675 mutex_unlock(&kvm->slots_lock);
2177 return r; 2676 return r;
2178} 2677}
2179 2678
@@ -2182,7 +2681,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2182{ 2681{
2183 struct kvm *kvm = filp->private_data; 2682 struct kvm *kvm = filp->private_data;
2184 void __user *argp = (void __user *)arg; 2683 void __user *argp = (void __user *)arg;
2185 int r = -EINVAL; 2684 int r = -ENOTTY;
2186 /* 2685 /*
2187 * This union makes it completely explicit to gcc-3.x 2686 * This union makes it completely explicit to gcc-3.x
2188 * that these two variables' stack usage should be 2687 * that these two variables' stack usage should be
@@ -2244,25 +2743,39 @@ long kvm_arch_vm_ioctl(struct file *filp,
2244 if (r) 2743 if (r)
2245 goto out; 2744 goto out;
2246 break; 2745 break;
2247 case KVM_CREATE_IRQCHIP: 2746 case KVM_CREATE_IRQCHIP: {
2747 struct kvm_pic *vpic;
2748
2749 mutex_lock(&kvm->lock);
2750 r = -EEXIST;
2751 if (kvm->arch.vpic)
2752 goto create_irqchip_unlock;
2248 r = -ENOMEM; 2753 r = -ENOMEM;
2249 kvm->arch.vpic = kvm_create_pic(kvm); 2754 vpic = kvm_create_pic(kvm);
2250 if (kvm->arch.vpic) { 2755 if (vpic) {
2251 r = kvm_ioapic_init(kvm); 2756 r = kvm_ioapic_init(kvm);
2252 if (r) { 2757 if (r) {
2253 kfree(kvm->arch.vpic); 2758 kvm_io_bus_unregister_dev(kvm, KVM_PIO_BUS,
2254 kvm->arch.vpic = NULL; 2759 &vpic->dev);
2255 goto out; 2760 kfree(vpic);
2761 goto create_irqchip_unlock;
2256 } 2762 }
2257 } else 2763 } else
2258 goto out; 2764 goto create_irqchip_unlock;
2765 smp_wmb();
2766 kvm->arch.vpic = vpic;
2767 smp_wmb();
2259 r = kvm_setup_default_irq_routing(kvm); 2768 r = kvm_setup_default_irq_routing(kvm);
2260 if (r) { 2769 if (r) {
2261 kfree(kvm->arch.vpic); 2770 mutex_lock(&kvm->irq_lock);
2262 kfree(kvm->arch.vioapic); 2771 kvm_ioapic_destroy(kvm);
2263 goto out; 2772 kvm_destroy_pic(kvm);
2773 mutex_unlock(&kvm->irq_lock);
2264 } 2774 }
2775 create_irqchip_unlock:
2776 mutex_unlock(&kvm->lock);
2265 break; 2777 break;
2778 }
2266 case KVM_CREATE_PIT: 2779 case KVM_CREATE_PIT:
2267 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; 2780 u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY;
2268 goto create_pit; 2781 goto create_pit;
@@ -2272,7 +2785,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2272 sizeof(struct kvm_pit_config))) 2785 sizeof(struct kvm_pit_config)))
2273 goto out; 2786 goto out;
2274 create_pit: 2787 create_pit:
2275 down_write(&kvm->slots_lock); 2788 mutex_lock(&kvm->slots_lock);
2276 r = -EEXIST; 2789 r = -EEXIST;
2277 if (kvm->arch.vpit) 2790 if (kvm->arch.vpit)
2278 goto create_pit_unlock; 2791 goto create_pit_unlock;
@@ -2281,7 +2794,7 @@ long kvm_arch_vm_ioctl(struct file *filp,
2281 if (kvm->arch.vpit) 2794 if (kvm->arch.vpit)
2282 r = 0; 2795 r = 0;
2283 create_pit_unlock: 2796 create_pit_unlock:
2284 up_write(&kvm->slots_lock); 2797 mutex_unlock(&kvm->slots_lock);
2285 break; 2798 break;
2286 case KVM_IRQ_LINE_STATUS: 2799 case KVM_IRQ_LINE_STATUS:
2287 case KVM_IRQ_LINE: { 2800 case KVM_IRQ_LINE: {
@@ -2292,10 +2805,8 @@ long kvm_arch_vm_ioctl(struct file *filp,
2292 goto out; 2805 goto out;
2293 if (irqchip_in_kernel(kvm)) { 2806 if (irqchip_in_kernel(kvm)) {
2294 __s32 status; 2807 __s32 status;
2295 mutex_lock(&kvm->irq_lock);
2296 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, 2808 status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID,
2297 irq_event.irq, irq_event.level); 2809 irq_event.irq, irq_event.level);
2298 mutex_unlock(&kvm->irq_lock);
2299 if (ioctl == KVM_IRQ_LINE_STATUS) { 2810 if (ioctl == KVM_IRQ_LINE_STATUS) {
2300 irq_event.status = status; 2811 irq_event.status = status;
2301 if (copy_to_user(argp, &irq_event, 2812 if (copy_to_user(argp, &irq_event,
@@ -2421,6 +2932,55 @@ long kvm_arch_vm_ioctl(struct file *filp,
2421 r = 0; 2932 r = 0;
2422 break; 2933 break;
2423 } 2934 }
2935 case KVM_XEN_HVM_CONFIG: {
2936 r = -EFAULT;
2937 if (copy_from_user(&kvm->arch.xen_hvm_config, argp,
2938 sizeof(struct kvm_xen_hvm_config)))
2939 goto out;
2940 r = -EINVAL;
2941 if (kvm->arch.xen_hvm_config.flags)
2942 goto out;
2943 r = 0;
2944 break;
2945 }
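
Userspace hands KVM the location of the Xen hypercall blob through this new ioctl; a minimal sketch (assumes <sys/ioctl.h> and <linux/kvm.h>; reusing one blob for 32- and 64-bit guests is only to keep the example short):

	static int enable_xen_hvm(int vm_fd, __u32 xen_msr, void *blob, __u8 blob_pages)
	{
		struct kvm_xen_hvm_config cfg = {
			.msr          = xen_msr,	/* MSR the guest will write */
			.blob_addr_32 = (__u64)(unsigned long)blob,
			.blob_size_32 = blob_pages,	/* counted in pages, see xen_hvm_config() */
			.blob_addr_64 = (__u64)(unsigned long)blob,
			.blob_size_64 = blob_pages,
		};

		return ioctl(vm_fd, KVM_XEN_HVM_CONFIG, &cfg);
	}
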
2946 case KVM_SET_CLOCK: {
2947 struct timespec now;
2948 struct kvm_clock_data user_ns;
2949 u64 now_ns;
2950 s64 delta;
2951
2952 r = -EFAULT;
2953 if (copy_from_user(&user_ns, argp, sizeof(user_ns)))
2954 goto out;
2955
2956 r = -EINVAL;
2957 if (user_ns.flags)
2958 goto out;
2959
2960 r = 0;
2961 ktime_get_ts(&now);
2962 now_ns = timespec_to_ns(&now);
2963 delta = user_ns.clock - now_ns;
2964 kvm->arch.kvmclock_offset = delta;
2965 break;
2966 }
2967 case KVM_GET_CLOCK: {
2968 struct timespec now;
2969 struct kvm_clock_data user_ns;
2970 u64 now_ns;
2971
2972 ktime_get_ts(&now);
2973 now_ns = timespec_to_ns(&now);
2974 user_ns.clock = kvm->arch.kvmclock_offset + now_ns;
2975 user_ns.flags = 0;
2976
2977 r = -EFAULT;
2978 if (copy_to_user(argp, &user_ns, sizeof(user_ns)))
2979 goto out;
2980 r = 0;
2981 break;
2982 }
2983
2424 default: 2984 default:
2425 ; 2985 ;
2426 } 2986 }
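
The clock pair is symmetric from userspace; a sketch of carrying the guest's kvmclock across a save/restore cycle (assumes <sys/ioctl.h> and <linux/kvm.h>; the vm descriptors come from KVM_CREATE_VM):

	static int copy_kvmclock(int src_vm_fd, int dst_vm_fd)
	{
		struct kvm_clock_data clock;

		if (ioctl(src_vm_fd, KVM_GET_CLOCK, &clock) < 0)
			return -1;
		clock.flags = 0;	/* the handler rejects any non-zero flags */
		return ioctl(dst_vm_fd, KVM_SET_CLOCK, &clock);
	}
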
@@ -2433,7 +2993,8 @@ static void kvm_init_msr_list(void)
2433 u32 dummy[2]; 2993 u32 dummy[2];
2434 unsigned i, j; 2994 unsigned i, j;
2435 2995
2436 for (i = j = 0; i < ARRAY_SIZE(msrs_to_save); i++) { 2996 /* skip the first msrs in the list. KVM-specific */
2997 for (i = j = KVM_SAVE_MSRS_BEGIN; i < ARRAY_SIZE(msrs_to_save); i++) {
2437 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0) 2998 if (rdmsr_safe(msrs_to_save[i], &dummy[0], &dummy[1]) < 0)
2438 continue; 2999 continue;
2439 if (j < i) 3000 if (j < i)
@@ -2450,7 +3011,7 @@ static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len,
2450 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v)) 3011 !kvm_iodevice_write(&vcpu->arch.apic->dev, addr, len, v))
2451 return 0; 3012 return 0;
2452 3013
2453 return kvm_io_bus_write(&vcpu->kvm->mmio_bus, addr, len, v); 3014 return kvm_io_bus_write(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2454} 3015}
2455 3016
2456static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) 3017static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
@@ -2459,17 +3020,44 @@ static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v)
2459 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v)) 3020 !kvm_iodevice_read(&vcpu->arch.apic->dev, addr, len, v))
2460 return 0; 3021 return 0;
2461 3022
2462 return kvm_io_bus_read(&vcpu->kvm->mmio_bus, addr, len, v); 3023 return kvm_io_bus_read(vcpu->kvm, KVM_MMIO_BUS, addr, len, v);
2463} 3024}
2464 3025
2465static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes, 3026gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
2466 struct kvm_vcpu *vcpu) 3027{
3028 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3029 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3030}
3031
3032gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3033{
3034 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3035 access |= PFERR_FETCH_MASK;
3036 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3037}
3038
3039gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3040{
3041 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3042 access |= PFERR_WRITE_MASK;
3043 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, access, error);
3044}
3045
3046/* used to access any guest's mapped memory without checking CPL */
3047gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, u32 *error)
3048{
3049 return vcpu->arch.mmu.gva_to_gpa(vcpu, gva, 0, error);
3050}
3051
3052static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes,
3053 struct kvm_vcpu *vcpu, u32 access,
3054 u32 *error)
2467{ 3055{
2468 void *data = val; 3056 void *data = val;
2469 int r = X86EMUL_CONTINUE; 3057 int r = X86EMUL_CONTINUE;
2470 3058
2471 while (bytes) { 3059 while (bytes) {
2472 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3060 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr, access, error);
2473 unsigned offset = addr & (PAGE_SIZE-1); 3061 unsigned offset = addr & (PAGE_SIZE-1);
2474 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); 3062 unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset);
2475 int ret; 3063 int ret;
@@ -2492,14 +3080,37 @@ out:
2492 return r; 3080 return r;
2493} 3081}
2494 3082
3083/* used for instruction fetching */
3084static int kvm_fetch_guest_virt(gva_t addr, void *val, unsigned int bytes,
3085 struct kvm_vcpu *vcpu, u32 *error)
3086{
3087 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3088 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu,
3089 access | PFERR_FETCH_MASK, error);
3090}
3091
3092static int kvm_read_guest_virt(gva_t addr, void *val, unsigned int bytes,
3093 struct kvm_vcpu *vcpu, u32 *error)
3094{
3095 u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
3096 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access,
3097 error);
3098}
3099
3100static int kvm_read_guest_virt_system(gva_t addr, void *val, unsigned int bytes,
3101 struct kvm_vcpu *vcpu, u32 *error)
3102{
3103 return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, 0, error);
3104}
3105
2495static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes, 3106static int kvm_write_guest_virt(gva_t addr, void *val, unsigned int bytes,
2496 struct kvm_vcpu *vcpu) 3107 struct kvm_vcpu *vcpu, u32 *error)
2497{ 3108{
2498 void *data = val; 3109 void *data = val;
2499 int r = X86EMUL_CONTINUE; 3110 int r = X86EMUL_CONTINUE;
2500 3111
2501 while (bytes) { 3112 while (bytes) {
2502 gpa_t gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3113 gpa_t gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, error);
2503 unsigned offset = addr & (PAGE_SIZE-1); 3114 unsigned offset = addr & (PAGE_SIZE-1);
2504 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); 3115 unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset);
2505 int ret; 3116 int ret;
@@ -2529,6 +3140,7 @@ static int emulator_read_emulated(unsigned long addr,
2529 struct kvm_vcpu *vcpu) 3140 struct kvm_vcpu *vcpu)
2530{ 3141{
2531 gpa_t gpa; 3142 gpa_t gpa;
3143 u32 error_code;
2532 3144
2533 if (vcpu->mmio_read_completed) { 3145 if (vcpu->mmio_read_completed) {
2534 memcpy(val, vcpu->mmio_data, bytes); 3146 memcpy(val, vcpu->mmio_data, bytes);
@@ -2538,17 +3150,20 @@ static int emulator_read_emulated(unsigned long addr,
2538 return X86EMUL_CONTINUE; 3150 return X86EMUL_CONTINUE;
2539 } 3151 }
2540 3152
2541 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3153 gpa = kvm_mmu_gva_to_gpa_read(vcpu, addr, &error_code);
3154
3155 if (gpa == UNMAPPED_GVA) {
3156 kvm_inject_page_fault(vcpu, addr, error_code);
3157 return X86EMUL_PROPAGATE_FAULT;
3158 }
2542 3159
2543 /* For APIC access vmexit */ 3160 /* For APIC access vmexit */
2544 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3161 if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
2545 goto mmio; 3162 goto mmio;
2546 3163
2547 if (kvm_read_guest_virt(addr, val, bytes, vcpu) 3164 if (kvm_read_guest_virt(addr, val, bytes, vcpu, NULL)
2548 == X86EMUL_CONTINUE) 3165 == X86EMUL_CONTINUE)
2549 return X86EMUL_CONTINUE; 3166 return X86EMUL_CONTINUE;
2550 if (gpa == UNMAPPED_GVA)
2551 return X86EMUL_PROPAGATE_FAULT;
2552 3167
2553mmio: 3168mmio:
2554 /* 3169 /*
@@ -2587,11 +3202,12 @@ static int emulator_write_emulated_onepage(unsigned long addr,
2587 struct kvm_vcpu *vcpu) 3202 struct kvm_vcpu *vcpu)
2588{ 3203{
2589 gpa_t gpa; 3204 gpa_t gpa;
3205 u32 error_code;
2590 3206
2591 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3207 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, &error_code);
2592 3208
2593 if (gpa == UNMAPPED_GVA) { 3209 if (gpa == UNMAPPED_GVA) {
2594 kvm_inject_page_fault(vcpu, addr, 2); 3210 kvm_inject_page_fault(vcpu, addr, error_code);
2595 return X86EMUL_PROPAGATE_FAULT; 3211 return X86EMUL_PROPAGATE_FAULT;
2596 } 3212 }
2597 3213
@@ -2655,7 +3271,7 @@ static int emulator_cmpxchg_emulated(unsigned long addr,
2655 char *kaddr; 3271 char *kaddr;
2656 u64 val; 3272 u64 val;
2657 3273
2658 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, addr); 3274 gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL);
2659 3275
2660 if (gpa == UNMAPPED_GVA || 3276 if (gpa == UNMAPPED_GVA ||
2661 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) 3277 (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE)
@@ -2692,35 +3308,21 @@ int emulate_invlpg(struct kvm_vcpu *vcpu, gva_t address)
2692 3308
2693int emulate_clts(struct kvm_vcpu *vcpu) 3309int emulate_clts(struct kvm_vcpu *vcpu)
2694{ 3310{
2695 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 & ~X86_CR0_TS); 3311 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS));
3312 kvm_x86_ops->fpu_activate(vcpu);
2696 return X86EMUL_CONTINUE; 3313 return X86EMUL_CONTINUE;
2697} 3314}
2698 3315
2699int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest) 3316int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
2700{ 3317{
2701 struct kvm_vcpu *vcpu = ctxt->vcpu; 3318 return kvm_x86_ops->get_dr(ctxt->vcpu, dr, dest);
2702
2703 switch (dr) {
2704 case 0 ... 3:
2705 *dest = kvm_x86_ops->get_dr(vcpu, dr);
2706 return X86EMUL_CONTINUE;
2707 default:
2708 pr_unimpl(vcpu, "%s: unexpected dr %u\n", __func__, dr);
2709 return X86EMUL_UNHANDLEABLE;
2710 }
2711} 3319}
2712 3320
2713int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value) 3321int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long value)
2714{ 3322{
2715 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U; 3323 unsigned long mask = (ctxt->mode == X86EMUL_MODE_PROT64) ? ~0ULL : ~0U;
2716 int exception;
2717 3324
2718 kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask, &exception); 3325 return kvm_x86_ops->set_dr(ctxt->vcpu, dr, value & mask);
2719 if (exception) {
2720 /* FIXME: better handling */
2721 return X86EMUL_UNHANDLEABLE;
2722 }
2723 return X86EMUL_CONTINUE;
2724} 3326}
2725 3327
2726void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context) 3328void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
@@ -2734,7 +3336,7 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2734 3336
2735 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS); 3337 rip_linear = rip + get_segment_base(vcpu, VCPU_SREG_CS);
2736 3338
2737 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu); 3339 kvm_read_guest_virt(rip_linear, (void *)opcodes, 4, vcpu, NULL);
2738 3340
2739 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n", 3341 printk(KERN_ERR "emulation failed (%s) rip %lx %02x %02x %02x %02x\n",
2740 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]); 3342 context, rip, opcodes[0], opcodes[1], opcodes[2], opcodes[3]);
@@ -2742,7 +3344,8 @@ void kvm_report_emulation_failure(struct kvm_vcpu *vcpu, const char *context)
2742EXPORT_SYMBOL_GPL(kvm_report_emulation_failure); 3344EXPORT_SYMBOL_GPL(kvm_report_emulation_failure);
2743 3345
2744static struct x86_emulate_ops emulate_ops = { 3346static struct x86_emulate_ops emulate_ops = {
2745 .read_std = kvm_read_guest_virt, 3347 .read_std = kvm_read_guest_virt_system,
3348 .fetch = kvm_fetch_guest_virt,
2746 .read_emulated = emulator_read_emulated, 3349 .read_emulated = emulator_read_emulated,
2747 .write_emulated = emulator_write_emulated, 3350 .write_emulated = emulator_write_emulated,
2748 .cmpxchg_emulated = emulator_cmpxchg_emulated, 3351 .cmpxchg_emulated = emulator_cmpxchg_emulated,
@@ -2757,13 +3360,13 @@ static void cache_all_regs(struct kvm_vcpu *vcpu)
2757} 3360}
2758 3361
2759int emulate_instruction(struct kvm_vcpu *vcpu, 3362int emulate_instruction(struct kvm_vcpu *vcpu,
2760 struct kvm_run *run,
2761 unsigned long cr2, 3363 unsigned long cr2,
2762 u16 error_code, 3364 u16 error_code,
2763 int emulation_type) 3365 int emulation_type)
2764{ 3366{
2765 int r, shadow_mask; 3367 int r, shadow_mask;
2766 struct decode_cache *c; 3368 struct decode_cache *c;
3369 struct kvm_run *run = vcpu->run;
2767 3370
2768 kvm_clear_exception_queue(vcpu); 3371 kvm_clear_exception_queue(vcpu);
2769 vcpu->arch.mmio_fault_cr2 = cr2; 3372 vcpu->arch.mmio_fault_cr2 = cr2;
@@ -2783,10 +3386,11 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2783 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l); 3386 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
2784 3387
2785 vcpu->arch.emulate_ctxt.vcpu = vcpu; 3388 vcpu->arch.emulate_ctxt.vcpu = vcpu;
2786 vcpu->arch.emulate_ctxt.eflags = kvm_x86_ops->get_rflags(vcpu); 3389 vcpu->arch.emulate_ctxt.eflags = kvm_get_rflags(vcpu);
2787 vcpu->arch.emulate_ctxt.mode = 3390 vcpu->arch.emulate_ctxt.mode =
3391 (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL :
2788 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM) 3392 (vcpu->arch.emulate_ctxt.eflags & X86_EFLAGS_VM)
2789 ? X86EMUL_MODE_REAL : cs_l 3393 ? X86EMUL_MODE_VM86 : cs_l
2790 ? X86EMUL_MODE_PROT64 : cs_db 3394 ? X86EMUL_MODE_PROT64 : cs_db
2791 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16; 3395 ? X86EMUL_MODE_PROT32 : X86EMUL_MODE_PROT16;
2792 3396
@@ -2861,7 +3465,7 @@ int emulate_instruction(struct kvm_vcpu *vcpu,
2861 return EMULATE_DO_MMIO; 3465 return EMULATE_DO_MMIO;
2862 } 3466 }
2863 3467
2864 kvm_x86_ops->set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags); 3468 kvm_set_rflags(vcpu, vcpu->arch.emulate_ctxt.eflags);
2865 3469
2866 if (vcpu->mmio_is_write) { 3470 if (vcpu->mmio_is_write) {
2867 vcpu->mmio_needed = 0; 3471 vcpu->mmio_needed = 0;
@@ -2878,12 +3482,17 @@ static int pio_copy_data(struct kvm_vcpu *vcpu)
2878 gva_t q = vcpu->arch.pio.guest_gva; 3482 gva_t q = vcpu->arch.pio.guest_gva;
2879 unsigned bytes; 3483 unsigned bytes;
2880 int ret; 3484 int ret;
3485 u32 error_code;
2881 3486
2882 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count; 3487 bytes = vcpu->arch.pio.size * vcpu->arch.pio.cur_count;
2883 if (vcpu->arch.pio.in) 3488 if (vcpu->arch.pio.in)
2884 ret = kvm_write_guest_virt(q, p, bytes, vcpu); 3489 ret = kvm_write_guest_virt(q, p, bytes, vcpu, &error_code);
2885 else 3490 else
2886 ret = kvm_read_guest_virt(q, p, bytes, vcpu); 3491 ret = kvm_read_guest_virt(q, p, bytes, vcpu, &error_code);
3492
3493 if (ret == X86EMUL_PROPAGATE_FAULT)
3494 kvm_inject_page_fault(vcpu, q, error_code);
3495
2887 return ret; 3496 return ret;
2888} 3497}
2889 3498
@@ -2904,7 +3513,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
2904 if (io->in) { 3513 if (io->in) {
2905 r = pio_copy_data(vcpu); 3514 r = pio_copy_data(vcpu);
2906 if (r) 3515 if (r)
2907 return r; 3516 goto out;
2908 } 3517 }
2909 3518
2910 delta = 1; 3519 delta = 1;
@@ -2931,7 +3540,7 @@ int complete_pio(struct kvm_vcpu *vcpu)
2931 kvm_register_write(vcpu, VCPU_REGS_RSI, val); 3540 kvm_register_write(vcpu, VCPU_REGS_RSI, val);
2932 } 3541 }
2933 } 3542 }
2934 3543out:
2935 io->count -= io->cur_count; 3544 io->count -= io->cur_count;
2936 io->cur_count = 0; 3545 io->cur_count = 0;
2937 3546
@@ -2944,11 +3553,12 @@ static int kernel_pio(struct kvm_vcpu *vcpu, void *pd)
2944 int r; 3553 int r;
2945 3554
2946 if (vcpu->arch.pio.in) 3555 if (vcpu->arch.pio.in)
2947 r = kvm_io_bus_read(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3556 r = kvm_io_bus_read(vcpu->kvm, KVM_PIO_BUS, vcpu->arch.pio.port,
2948 vcpu->arch.pio.size, pd); 3557 vcpu->arch.pio.size, pd);
2949 else 3558 else
2950 r = kvm_io_bus_write(&vcpu->kvm->pio_bus, vcpu->arch.pio.port, 3559 r = kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
2951 vcpu->arch.pio.size, pd); 3560 vcpu->arch.pio.port, vcpu->arch.pio.size,
3561 pd);
2952 return r; 3562 return r;
2953} 3563}
2954 3564
@@ -2959,7 +3569,7 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2959 int i, r = 0; 3569 int i, r = 0;
2960 3570
2961 for (i = 0; i < io->cur_count; i++) { 3571 for (i = 0; i < io->cur_count; i++) {
2962 if (kvm_io_bus_write(&vcpu->kvm->pio_bus, 3572 if (kvm_io_bus_write(vcpu->kvm, KVM_PIO_BUS,
2963 io->port, io->size, pd)) { 3573 io->port, io->size, pd)) {
2964 r = -EOPNOTSUPP; 3574 r = -EOPNOTSUPP;
2965 break; 3575 break;
@@ -2969,11 +3579,12 @@ static int pio_string_write(struct kvm_vcpu *vcpu)
2969 return r; 3579 return r;
2970} 3580}
2971 3581
2972int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3582int kvm_emulate_pio(struct kvm_vcpu *vcpu, int in, int size, unsigned port)
2973 int size, unsigned port)
2974{ 3583{
2975 unsigned long val; 3584 unsigned long val;
2976 3585
3586 trace_kvm_pio(!in, port, size, 1);
3587
2977 vcpu->run->exit_reason = KVM_EXIT_IO; 3588 vcpu->run->exit_reason = KVM_EXIT_IO;
2978 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3589 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
2979 vcpu->run->io.size = vcpu->arch.pio.size = size; 3590 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -2985,11 +3596,10 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2985 vcpu->arch.pio.down = 0; 3596 vcpu->arch.pio.down = 0;
2986 vcpu->arch.pio.rep = 0; 3597 vcpu->arch.pio.rep = 0;
2987 3598
2988 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port, 3599 if (!vcpu->arch.pio.in) {
2989 size, 1); 3600 val = kvm_register_read(vcpu, VCPU_REGS_RAX);
2990 3601 memcpy(vcpu->arch.pio_data, &val, 4);
2991 val = kvm_register_read(vcpu, VCPU_REGS_RAX); 3602 }
2992 memcpy(vcpu->arch.pio_data, &val, 4);
2993 3603
2994 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) { 3604 if (!kernel_pio(vcpu, vcpu->arch.pio_data)) {
2995 complete_pio(vcpu); 3605 complete_pio(vcpu);
@@ -2999,13 +3609,15 @@ int kvm_emulate_pio(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
2999} 3609}
3000EXPORT_SYMBOL_GPL(kvm_emulate_pio); 3610EXPORT_SYMBOL_GPL(kvm_emulate_pio);
3001 3611
3002int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in, 3612int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, int in,
3003 int size, unsigned long count, int down, 3613 int size, unsigned long count, int down,
3004 gva_t address, int rep, unsigned port) 3614 gva_t address, int rep, unsigned port)
3005{ 3615{
3006 unsigned now, in_page; 3616 unsigned now, in_page;
3007 int ret = 0; 3617 int ret = 0;
3008 3618
3619 trace_kvm_pio(!in, port, size, count);
3620
3009 vcpu->run->exit_reason = KVM_EXIT_IO; 3621 vcpu->run->exit_reason = KVM_EXIT_IO;
3010 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; 3622 vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT;
3011 vcpu->run->io.size = vcpu->arch.pio.size = size; 3623 vcpu->run->io.size = vcpu->arch.pio.size = size;
@@ -3017,9 +3629,6 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3017 vcpu->arch.pio.down = down; 3629 vcpu->arch.pio.down = down;
3018 vcpu->arch.pio.rep = rep; 3630 vcpu->arch.pio.rep = rep;
3019 3631
3020 trace_kvm_pio(vcpu->run->io.direction == KVM_EXIT_IO_OUT, port,
3021 size, count);
3022
3023 if (!count) { 3632 if (!count) {
3024 kvm_x86_ops->skip_emulated_instruction(vcpu); 3633 kvm_x86_ops->skip_emulated_instruction(vcpu);
3025 return 1; 3634 return 1;
@@ -3051,10 +3660,8 @@ int kvm_emulate_pio_string(struct kvm_vcpu *vcpu, struct kvm_run *run, int in,
3051 if (!vcpu->arch.pio.in) { 3660 if (!vcpu->arch.pio.in) {
3052 /* string PIO write */ 3661 /* string PIO write */
3053 ret = pio_copy_data(vcpu); 3662 ret = pio_copy_data(vcpu);
3054 if (ret == X86EMUL_PROPAGATE_FAULT) { 3663 if (ret == X86EMUL_PROPAGATE_FAULT)
3055 kvm_inject_gp(vcpu, 0);
3056 return 1; 3664 return 1;
3057 }
3058 if (ret == 0 && !pio_string_write(vcpu)) { 3665 if (ret == 0 && !pio_string_write(vcpu)) {
3059 complete_pio(vcpu); 3666 complete_pio(vcpu);
3060 if (vcpu->arch.pio.count == 0) 3667 if (vcpu->arch.pio.count == 0)
@@ -3072,9 +3679,6 @@ static void bounce_off(void *info)
3072 /* nothing */ 3679 /* nothing */
3073} 3680}
3074 3681
3075static unsigned int ref_freq;
3076static unsigned long tsc_khz_ref;
3077
3078static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, 3682static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val,
3079 void *data) 3683 void *data)
3080{ 3684{
@@ -3083,14 +3687,11 @@ static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long va
3083 struct kvm_vcpu *vcpu; 3687 struct kvm_vcpu *vcpu;
3084 int i, send_ipi = 0; 3688 int i, send_ipi = 0;
3085 3689
3086 if (!ref_freq)
3087 ref_freq = freq->old;
3088
3089 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) 3690 if (val == CPUFREQ_PRECHANGE && freq->old > freq->new)
3090 return 0; 3691 return 0;
3091 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) 3692 if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new)
3092 return 0; 3693 return 0;
3093 per_cpu(cpu_tsc_khz, freq->cpu) = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); 3694 per_cpu(cpu_tsc_khz, freq->cpu) = freq->new;
3094 3695
3095 spin_lock(&kvm_lock); 3696 spin_lock(&kvm_lock);
3096 list_for_each_entry(kvm, &vm_list, vm_list) { 3697 list_for_each_entry(kvm, &vm_list, vm_list) {
@@ -3127,9 +3728,28 @@ static struct notifier_block kvmclock_cpufreq_notifier_block = {
3127 .notifier_call = kvmclock_cpufreq_notifier 3728 .notifier_call = kvmclock_cpufreq_notifier
3128}; 3729};
3129 3730
3731static void kvm_timer_init(void)
3732{
3733 int cpu;
3734
3735 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3736 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3737 CPUFREQ_TRANSITION_NOTIFIER);
3738 for_each_online_cpu(cpu) {
3739 unsigned long khz = cpufreq_get(cpu);
3740 if (!khz)
3741 khz = tsc_khz;
3742 per_cpu(cpu_tsc_khz, cpu) = khz;
3743 }
3744 } else {
3745 for_each_possible_cpu(cpu)
3746 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3747 }
3748}
3749
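The new kvm_timer_init() above seeds the per-CPU TSC frequency differently depending on whether the CPU advertises a constant-rate TSC. A minimal user-space sketch of that selection logic follows; query_cpufreq_khz() is a stand-in for cpufreq_get() and the frequencies are made up.

/*
 * Sketch only: mirrors the khz selection in kvm_timer_init(), not the
 * kernel's per-cpu machinery.
 */
#include <stdbool.h>
#include <stdio.h>

#define NR_CPUS 4

static unsigned long tsc_khz = 2400000;	/* boot-time calibration, example */

static unsigned long query_cpufreq_khz(int cpu)
{
	return cpu == 0 ? 1600000 : 0;	/* 0 means "no cpufreq data" */
}

static void init_cpu_tsc_khz(unsigned long per_cpu_khz[NR_CPUS],
			     bool constant_tsc)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		unsigned long khz = tsc_khz;

		if (!constant_tsc) {
			unsigned long cur = query_cpufreq_khz(cpu);
			if (cur)
				khz = cur;	/* follow current cpufreq reading */
		}
		per_cpu_khz[cpu] = khz;
	}
}

int main(void)
{
	unsigned long khz[NR_CPUS];

	init_cpu_tsc_khz(khz, false);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu%d: %lu kHz\n", cpu, khz[cpu]);
	return 0;
}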
3130int kvm_arch_init(void *opaque) 3750int kvm_arch_init(void *opaque)
3131{ 3751{
3132 int r, cpu; 3752 int r;
3133 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque; 3753 struct kvm_x86_ops *ops = (struct kvm_x86_ops *)opaque;
3134 3754
3135 if (kvm_x86_ops) { 3755 if (kvm_x86_ops) {
@@ -3161,13 +3781,7 @@ int kvm_arch_init(void *opaque)
3161 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK, 3781 kvm_mmu_set_mask_ptes(PT_USER_MASK, PT_ACCESSED_MASK,
3162 PT_DIRTY_MASK, PT64_NX_MASK, 0); 3782 PT_DIRTY_MASK, PT64_NX_MASK, 0);
3163 3783
3164 for_each_possible_cpu(cpu) 3784 kvm_timer_init();
3165 per_cpu(cpu_tsc_khz, cpu) = tsc_khz;
3166 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
3167 tsc_khz_ref = tsc_khz;
3168 cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
3169 CPUFREQ_TRANSITION_NOTIFIER);
3170 }
3171 3785
3172 return 0; 3786 return 0;
3173 3787
@@ -3206,11 +3820,76 @@ static inline gpa_t hc_gpa(struct kvm_vcpu *vcpu, unsigned long a0,
3206 return a0 | ((gpa_t)a1 << 32); 3820 return a0 | ((gpa_t)a1 << 32);
3207} 3821}
3208 3822
3823int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
3824{
3825 u64 param, ingpa, outgpa, ret;
3826 uint16_t code, rep_idx, rep_cnt, res = HV_STATUS_SUCCESS, rep_done = 0;
3827 bool fast, longmode;
3828 int cs_db, cs_l;
3829
3830 /*
3831	 * Per the Hyper-V spec, a hypercall issued from non-zero CPL or
3832	 * from real mode generates #UD
3833 */
3834 if (kvm_x86_ops->get_cpl(vcpu) != 0 || !is_protmode(vcpu)) {
3835 kvm_queue_exception(vcpu, UD_VECTOR);
3836 return 0;
3837 }
3838
3839 kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
3840 longmode = is_long_mode(vcpu) && cs_l == 1;
3841
3842 if (!longmode) {
3843 param = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDX) << 32) |
3844 (kvm_register_read(vcpu, VCPU_REGS_RAX) & 0xffffffff);
3845 ingpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RBX) << 32) |
3846 (kvm_register_read(vcpu, VCPU_REGS_RCX) & 0xffffffff);
3847 outgpa = ((u64)kvm_register_read(vcpu, VCPU_REGS_RDI) << 32) |
3848 (kvm_register_read(vcpu, VCPU_REGS_RSI) & 0xffffffff);
3849 }
3850#ifdef CONFIG_X86_64
3851 else {
3852 param = kvm_register_read(vcpu, VCPU_REGS_RCX);
3853 ingpa = kvm_register_read(vcpu, VCPU_REGS_RDX);
3854 outgpa = kvm_register_read(vcpu, VCPU_REGS_R8);
3855 }
3856#endif
3857
3858 code = param & 0xffff;
3859 fast = (param >> 16) & 0x1;
3860 rep_cnt = (param >> 32) & 0xfff;
3861 rep_idx = (param >> 48) & 0xfff;
3862
3863 trace_kvm_hv_hypercall(code, fast, rep_cnt, rep_idx, ingpa, outgpa);
3864
3865 switch (code) {
3866 case HV_X64_HV_NOTIFY_LONG_SPIN_WAIT:
3867 kvm_vcpu_on_spin(vcpu);
3868 break;
3869 default:
3870 res = HV_STATUS_INVALID_HYPERCALL_CODE;
3871 break;
3872 }
3873
3874 ret = res | (((u64)rep_done & 0xfff) << 32);
3875 if (longmode) {
3876 kvm_register_write(vcpu, VCPU_REGS_RAX, ret);
3877 } else {
3878 kvm_register_write(vcpu, VCPU_REGS_RDX, ret >> 32);
3879 kvm_register_write(vcpu, VCPU_REGS_RAX, ret & 0xffffffff);
3880 }
3881
3882 return 1;
3883}
3884
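kvm_hv_hypercall() above unpacks the hypercall input value into a call code, a fast-call bit, a repetition count, and a repetition start index. A stand-alone sketch of that bit layout, reusing the masks and shifts from the function; the sample parameter value is invented for illustration.

#include <stdint.h>
#include <stdio.h>

struct hv_hypercall_input {
	uint16_t code;
	uint8_t  fast;
	uint16_t rep_cnt;
	uint16_t rep_idx;
};

static struct hv_hypercall_input decode_hv_param(uint64_t param)
{
	struct hv_hypercall_input in;

	in.code    = param & 0xffff;		/* bits  0-15 */
	in.fast    = (param >> 16) & 0x1;	/* bit     16 */
	in.rep_cnt = (param >> 32) & 0xfff;	/* bits 32-43 */
	in.rep_idx = (param >> 48) & 0xfff;	/* bits 48-59 */
	return in;
}

int main(void)
{
	uint64_t param = 0x0001000300010008ULL;	/* arbitrary example */
	struct hv_hypercall_input in = decode_hv_param(param);

	printf("code=%#x fast=%u rep_cnt=%u rep_idx=%u\n",
	       (unsigned)in.code, (unsigned)in.fast,
	       (unsigned)in.rep_cnt, (unsigned)in.rep_idx);
	return 0;
}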
3209int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) 3885int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
3210{ 3886{
3211 unsigned long nr, a0, a1, a2, a3, ret; 3887 unsigned long nr, a0, a1, a2, a3, ret;
3212 int r = 1; 3888 int r = 1;
3213 3889
3890 if (kvm_hv_hypercall_enabled(vcpu->kvm))
3891 return kvm_hv_hypercall(vcpu);
3892
3214 nr = kvm_register_read(vcpu, VCPU_REGS_RAX); 3893 nr = kvm_register_read(vcpu, VCPU_REGS_RAX);
3215 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX); 3894 a0 = kvm_register_read(vcpu, VCPU_REGS_RBX);
3216 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX); 3895 a1 = kvm_register_read(vcpu, VCPU_REGS_RCX);
@@ -3253,10 +3932,8 @@ EXPORT_SYMBOL_GPL(kvm_emulate_hypercall);
3253int kvm_fix_hypercall(struct kvm_vcpu *vcpu) 3932int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3254{ 3933{
3255 char instruction[3]; 3934 char instruction[3];
3256 int ret = 0;
3257 unsigned long rip = kvm_rip_read(vcpu); 3935 unsigned long rip = kvm_rip_read(vcpu);
3258 3936
3259
3260 /* 3937 /*
3261 * Blow out the MMU to ensure that no other VCPU has an active mapping 3938 * Blow out the MMU to ensure that no other VCPU has an active mapping
3262 * to ensure that the updated hypercall appears atomically across all 3939 * to ensure that the updated hypercall appears atomically across all
@@ -3265,11 +3942,8 @@ int kvm_fix_hypercall(struct kvm_vcpu *vcpu)
3265 kvm_mmu_zap_all(vcpu->kvm); 3942 kvm_mmu_zap_all(vcpu->kvm);
3266 3943
3267 kvm_x86_ops->patch_hypercall(vcpu, instruction); 3944 kvm_x86_ops->patch_hypercall(vcpu, instruction);
3268 if (emulator_write_emulated(rip, instruction, 3, vcpu)
3269 != X86EMUL_CONTINUE)
3270 ret = -EFAULT;
3271 3945
3272 return ret; 3946 return emulator_write_emulated(rip, instruction, 3, vcpu);
3273} 3947}
3274 3948
3275static u64 mk_cr_64(u64 curr_cr, u32 new_val) 3949static u64 mk_cr_64(u64 curr_cr, u32 new_val)
@@ -3295,17 +3969,16 @@ void realmode_lmsw(struct kvm_vcpu *vcpu, unsigned long msw,
3295 unsigned long *rflags) 3969 unsigned long *rflags)
3296{ 3970{
3297 kvm_lmsw(vcpu, msw); 3971 kvm_lmsw(vcpu, msw);
3298 *rflags = kvm_x86_ops->get_rflags(vcpu); 3972 *rflags = kvm_get_rflags(vcpu);
3299} 3973}
3300 3974
3301unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr) 3975unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3302{ 3976{
3303 unsigned long value; 3977 unsigned long value;
3304 3978
3305 kvm_x86_ops->decache_cr4_guest_bits(vcpu);
3306 switch (cr) { 3979 switch (cr) {
3307 case 0: 3980 case 0:
3308 value = vcpu->arch.cr0; 3981 value = kvm_read_cr0(vcpu);
3309 break; 3982 break;
3310 case 2: 3983 case 2:
3311 value = vcpu->arch.cr2; 3984 value = vcpu->arch.cr2;
@@ -3314,7 +3987,7 @@ unsigned long realmode_get_cr(struct kvm_vcpu *vcpu, int cr)
3314 value = vcpu->arch.cr3; 3987 value = vcpu->arch.cr3;
3315 break; 3988 break;
3316 case 4: 3989 case 4:
3317 value = vcpu->arch.cr4; 3990 value = kvm_read_cr4(vcpu);
3318 break; 3991 break;
3319 case 8: 3992 case 8:
3320 value = kvm_get_cr8(vcpu); 3993 value = kvm_get_cr8(vcpu);
@@ -3332,8 +4005,8 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3332{ 4005{
3333 switch (cr) { 4006 switch (cr) {
3334 case 0: 4007 case 0:
3335 kvm_set_cr0(vcpu, mk_cr_64(vcpu->arch.cr0, val)); 4008 kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
3336 *rflags = kvm_x86_ops->get_rflags(vcpu); 4009 *rflags = kvm_get_rflags(vcpu);
3337 break; 4010 break;
3338 case 2: 4011 case 2:
3339 vcpu->arch.cr2 = val; 4012 vcpu->arch.cr2 = val;
@@ -3342,7 +4015,7 @@ void realmode_set_cr(struct kvm_vcpu *vcpu, int cr, unsigned long val,
3342 kvm_set_cr3(vcpu, val); 4015 kvm_set_cr3(vcpu, val);
3343 break; 4016 break;
3344 case 4: 4017 case 4:
3345 kvm_set_cr4(vcpu, mk_cr_64(vcpu->arch.cr4, val)); 4018 kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
3346 break; 4019 break;
3347 case 8: 4020 case 8:
3348 kvm_set_cr8(vcpu, val & 0xfUL); 4021 kvm_set_cr8(vcpu, val & 0xfUL);
@@ -3409,6 +4082,7 @@ struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
3409 } 4082 }
3410 return best; 4083 return best;
3411} 4084}
4085EXPORT_SYMBOL_GPL(kvm_find_cpuid_entry);
3412 4086
3413int cpuid_maxphyaddr(struct kvm_vcpu *vcpu) 4087int cpuid_maxphyaddr(struct kvm_vcpu *vcpu)
3414{ 4088{
@@ -3453,18 +4127,18 @@ EXPORT_SYMBOL_GPL(kvm_emulate_cpuid);
3453 * 4127 *
3454 * No need to exit to userspace if we already have an interrupt queued. 4128 * No need to exit to userspace if we already have an interrupt queued.
3455 */ 4129 */
3456static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu, 4130static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu)
3457 struct kvm_run *kvm_run)
3458{ 4131{
3459 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) && 4132 return (!irqchip_in_kernel(vcpu->kvm) && !kvm_cpu_has_interrupt(vcpu) &&
3460 kvm_run->request_interrupt_window && 4133 vcpu->run->request_interrupt_window &&
3461 kvm_arch_interrupt_allowed(vcpu)); 4134 kvm_arch_interrupt_allowed(vcpu));
3462} 4135}
3463 4136
3464static void post_kvm_run_save(struct kvm_vcpu *vcpu, 4137static void post_kvm_run_save(struct kvm_vcpu *vcpu)
3465 struct kvm_run *kvm_run)
3466{ 4138{
3467 kvm_run->if_flag = (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_IF) != 0; 4139 struct kvm_run *kvm_run = vcpu->run;
4140
4141 kvm_run->if_flag = (kvm_get_rflags(vcpu) & X86_EFLAGS_IF) != 0;
3468 kvm_run->cr8 = kvm_get_cr8(vcpu); 4142 kvm_run->cr8 = kvm_get_cr8(vcpu);
3469 kvm_run->apic_base = kvm_get_apic_base(vcpu); 4143 kvm_run->apic_base = kvm_get_apic_base(vcpu);
3470 if (irqchip_in_kernel(vcpu->kvm)) 4144 if (irqchip_in_kernel(vcpu->kvm))
@@ -3492,14 +4166,15 @@ static void vapic_enter(struct kvm_vcpu *vcpu)
3492static void vapic_exit(struct kvm_vcpu *vcpu) 4166static void vapic_exit(struct kvm_vcpu *vcpu)
3493{ 4167{
3494 struct kvm_lapic *apic = vcpu->arch.apic; 4168 struct kvm_lapic *apic = vcpu->arch.apic;
4169 int idx;
3495 4170
3496 if (!apic || !apic->vapic_addr) 4171 if (!apic || !apic->vapic_addr)
3497 return; 4172 return;
3498 4173
3499 down_read(&vcpu->kvm->slots_lock); 4174 idx = srcu_read_lock(&vcpu->kvm->srcu);
3500 kvm_release_page_dirty(apic->vapic_page); 4175 kvm_release_page_dirty(apic->vapic_page);
3501 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT); 4176 mark_page_dirty(vcpu->kvm, apic->vapic_addr >> PAGE_SHIFT);
3502 up_read(&vcpu->kvm->slots_lock); 4177 srcu_read_unlock(&vcpu->kvm->srcu, idx);
3503} 4178}
3504 4179
3505static void update_cr8_intercept(struct kvm_vcpu *vcpu) 4180static void update_cr8_intercept(struct kvm_vcpu *vcpu)
@@ -3525,7 +4200,7 @@ static void update_cr8_intercept(struct kvm_vcpu *vcpu)
3525 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr); 4200 kvm_x86_ops->update_cr8_intercept(vcpu, tpr, max_irr);
3526} 4201}
3527 4202
3528static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4203static void inject_pending_event(struct kvm_vcpu *vcpu)
3529{ 4204{
3530 /* try to reinject previous events if any */ 4205 /* try to reinject previous events if any */
3531 if (vcpu->arch.exception.pending) { 4206 if (vcpu->arch.exception.pending) {
@@ -3561,11 +4236,11 @@ static void inject_pending_event(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3561 } 4236 }
3562} 4237}
3563 4238
3564static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4239static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
3565{ 4240{
3566 int r; 4241 int r;
3567 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) && 4242 bool req_int_win = !irqchip_in_kernel(vcpu->kvm) &&
3568 kvm_run->request_interrupt_window; 4243 vcpu->run->request_interrupt_window;
3569 4244
3570 if (vcpu->requests) 4245 if (vcpu->requests)
3571 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests)) 4246 if (test_and_clear_bit(KVM_REQ_MMU_RELOAD, &vcpu->requests))
@@ -3586,21 +4261,26 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3586 kvm_x86_ops->tlb_flush(vcpu); 4261 kvm_x86_ops->tlb_flush(vcpu);
3587 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS, 4262 if (test_and_clear_bit(KVM_REQ_REPORT_TPR_ACCESS,
3588 &vcpu->requests)) { 4263 &vcpu->requests)) {
3589 kvm_run->exit_reason = KVM_EXIT_TPR_ACCESS; 4264 vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
3590 r = 0; 4265 r = 0;
3591 goto out; 4266 goto out;
3592 } 4267 }
3593 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) { 4268 if (test_and_clear_bit(KVM_REQ_TRIPLE_FAULT, &vcpu->requests)) {
3594 kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; 4269 vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
3595 r = 0; 4270 r = 0;
3596 goto out; 4271 goto out;
3597 } 4272 }
4273 if (test_and_clear_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests)) {
4274 vcpu->fpu_active = 0;
4275 kvm_x86_ops->fpu_deactivate(vcpu);
4276 }
3598 } 4277 }
3599 4278
3600 preempt_disable(); 4279 preempt_disable();
3601 4280
3602 kvm_x86_ops->prepare_guest_switch(vcpu); 4281 kvm_x86_ops->prepare_guest_switch(vcpu);
3603 kvm_load_guest_fpu(vcpu); 4282 if (vcpu->fpu_active)
4283 kvm_load_guest_fpu(vcpu);
3604 4284
3605 local_irq_disable(); 4285 local_irq_disable();
3606 4286
@@ -3615,7 +4295,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3615 goto out; 4295 goto out;
3616 } 4296 }
3617 4297
3618 inject_pending_event(vcpu, kvm_run); 4298 inject_pending_event(vcpu);
3619 4299
3620 /* enable NMI/IRQ window open exits if needed */ 4300 /* enable NMI/IRQ window open exits if needed */
3621 if (vcpu->arch.nmi_pending) 4301 if (vcpu->arch.nmi_pending)
@@ -3628,7 +4308,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3628 kvm_lapic_sync_to_vapic(vcpu); 4308 kvm_lapic_sync_to_vapic(vcpu);
3629 } 4309 }
3630 4310
3631 up_read(&vcpu->kvm->slots_lock); 4311 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3632 4312
3633 kvm_guest_enter(); 4313 kvm_guest_enter();
3634 4314
@@ -3641,16 +4321,17 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3641 } 4321 }
3642 4322
3643 trace_kvm_entry(vcpu->vcpu_id); 4323 trace_kvm_entry(vcpu->vcpu_id);
3644 kvm_x86_ops->run(vcpu, kvm_run); 4324 kvm_x86_ops->run(vcpu);
3645 4325
3646 if (unlikely(vcpu->arch.switch_db_regs || test_thread_flag(TIF_DEBUG))) { 4326 /*
3647 set_debugreg(current->thread.debugreg0, 0); 4327 * If the guest has used debug registers, at least dr7
3648 set_debugreg(current->thread.debugreg1, 1); 4328 * will be disabled while returning to the host.
3649 set_debugreg(current->thread.debugreg2, 2); 4329 * If we don't have active breakpoints in the host, we don't
3650 set_debugreg(current->thread.debugreg3, 3); 4330 * care about the messed up debug address registers. But if
3651 set_debugreg(current->thread.debugreg6, 6); 4331 * we have some of them active, restore the old state.
3652 set_debugreg(current->thread.debugreg7, 7); 4332 */
3653 } 4333 if (hw_breakpoint_active())
4334 hw_breakpoint_restore();
3654 4335
3655 set_bit(KVM_REQ_KICK, &vcpu->requests); 4336 set_bit(KVM_REQ_KICK, &vcpu->requests);
3656 local_irq_enable(); 4337 local_irq_enable();
@@ -3669,7 +4350,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3669 4350
3670 preempt_enable(); 4351 preempt_enable();
3671 4352
3672 down_read(&vcpu->kvm->slots_lock); 4353 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3673 4354
3674 /* 4355 /*
3675 * Profile KVM exit RIPs: 4356 * Profile KVM exit RIPs:
@@ -3682,15 +4363,16 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3682 4363
3683 kvm_lapic_sync_from_vapic(vcpu); 4364 kvm_lapic_sync_from_vapic(vcpu);
3684 4365
3685 r = kvm_x86_ops->handle_exit(kvm_run, vcpu); 4366 r = kvm_x86_ops->handle_exit(vcpu);
3686out: 4367out:
3687 return r; 4368 return r;
3688} 4369}
3689 4370
3690 4371
3691static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run) 4372static int __vcpu_run(struct kvm_vcpu *vcpu)
3692{ 4373{
3693 int r; 4374 int r;
4375 struct kvm *kvm = vcpu->kvm;
3694 4376
3695 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) { 4377 if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_SIPI_RECEIVED)) {
3696 pr_debug("vcpu %d received sipi with vector # %x\n", 4378 pr_debug("vcpu %d received sipi with vector # %x\n",
@@ -3702,17 +4384,17 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3702 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 4384 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
3703 } 4385 }
3704 4386
3705 down_read(&vcpu->kvm->slots_lock); 4387 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3706 vapic_enter(vcpu); 4388 vapic_enter(vcpu);
3707 4389
3708 r = 1; 4390 r = 1;
3709 while (r > 0) { 4391 while (r > 0) {
3710 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE) 4392 if (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE)
3711 r = vcpu_enter_guest(vcpu, kvm_run); 4393 r = vcpu_enter_guest(vcpu);
3712 else { 4394 else {
3713 up_read(&vcpu->kvm->slots_lock); 4395 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3714 kvm_vcpu_block(vcpu); 4396 kvm_vcpu_block(vcpu);
3715 down_read(&vcpu->kvm->slots_lock); 4397 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3716 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests)) 4398 if (test_and_clear_bit(KVM_REQ_UNHALT, &vcpu->requests))
3717 { 4399 {
3718 switch(vcpu->arch.mp_state) { 4400 switch(vcpu->arch.mp_state) {
@@ -3736,25 +4418,25 @@ static int __vcpu_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3736 if (kvm_cpu_has_pending_timer(vcpu)) 4418 if (kvm_cpu_has_pending_timer(vcpu))
3737 kvm_inject_pending_timer_irqs(vcpu); 4419 kvm_inject_pending_timer_irqs(vcpu);
3738 4420
3739 if (dm_request_for_irq_injection(vcpu, kvm_run)) { 4421 if (dm_request_for_irq_injection(vcpu)) {
3740 r = -EINTR; 4422 r = -EINTR;
3741 kvm_run->exit_reason = KVM_EXIT_INTR; 4423 vcpu->run->exit_reason = KVM_EXIT_INTR;
3742 ++vcpu->stat.request_irq_exits; 4424 ++vcpu->stat.request_irq_exits;
3743 } 4425 }
3744 if (signal_pending(current)) { 4426 if (signal_pending(current)) {
3745 r = -EINTR; 4427 r = -EINTR;
3746 kvm_run->exit_reason = KVM_EXIT_INTR; 4428 vcpu->run->exit_reason = KVM_EXIT_INTR;
3747 ++vcpu->stat.signal_exits; 4429 ++vcpu->stat.signal_exits;
3748 } 4430 }
3749 if (need_resched()) { 4431 if (need_resched()) {
3750 up_read(&vcpu->kvm->slots_lock); 4432 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3751 kvm_resched(vcpu); 4433 kvm_resched(vcpu);
3752 down_read(&vcpu->kvm->slots_lock); 4434 vcpu->srcu_idx = srcu_read_lock(&kvm->srcu);
3753 } 4435 }
3754 } 4436 }
3755 4437
3756 up_read(&vcpu->kvm->slots_lock); 4438 srcu_read_unlock(&kvm->srcu, vcpu->srcu_idx);
3757 post_kvm_run_save(vcpu, kvm_run); 4439 post_kvm_run_save(vcpu);
3758 4440
3759 vapic_exit(vcpu); 4441 vapic_exit(vcpu);
3760 4442
@@ -3783,21 +4465,21 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3783 kvm_set_cr8(vcpu, kvm_run->cr8); 4465 kvm_set_cr8(vcpu, kvm_run->cr8);
3784 4466
3785 if (vcpu->arch.pio.cur_count) { 4467 if (vcpu->arch.pio.cur_count) {
4468 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3786 r = complete_pio(vcpu); 4469 r = complete_pio(vcpu);
4470 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3787 if (r) 4471 if (r)
3788 goto out; 4472 goto out;
3789 } 4473 }
3790#if CONFIG_HAS_IOMEM
3791 if (vcpu->mmio_needed) { 4474 if (vcpu->mmio_needed) {
3792 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8); 4475 memcpy(vcpu->mmio_data, kvm_run->mmio.data, 8);
3793 vcpu->mmio_read_completed = 1; 4476 vcpu->mmio_read_completed = 1;
3794 vcpu->mmio_needed = 0; 4477 vcpu->mmio_needed = 0;
3795 4478
3796 down_read(&vcpu->kvm->slots_lock); 4479 vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu);
3797 r = emulate_instruction(vcpu, kvm_run, 4480 r = emulate_instruction(vcpu, vcpu->arch.mmio_fault_cr2, 0,
3798 vcpu->arch.mmio_fault_cr2, 0,
3799 EMULTYPE_NO_DECODE); 4481 EMULTYPE_NO_DECODE);
3800 up_read(&vcpu->kvm->slots_lock); 4482 srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx);
3801 if (r == EMULATE_DO_MMIO) { 4483 if (r == EMULATE_DO_MMIO) {
3802 /* 4484 /*
3803 * Read-modify-write. Back to userspace. 4485 * Read-modify-write. Back to userspace.
@@ -3806,12 +4488,11 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *kvm_run)
3806 goto out; 4488 goto out;
3807 } 4489 }
3808 } 4490 }
3809#endif
3810 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL) 4491 if (kvm_run->exit_reason == KVM_EXIT_HYPERCALL)
3811 kvm_register_write(vcpu, VCPU_REGS_RAX, 4492 kvm_register_write(vcpu, VCPU_REGS_RAX,
3812 kvm_run->hypercall.ret); 4493 kvm_run->hypercall.ret);
3813 4494
3814 r = __vcpu_run(vcpu, kvm_run); 4495 r = __vcpu_run(vcpu);
3815 4496
3816out: 4497out:
3817 if (vcpu->sigset_active) 4498 if (vcpu->sigset_active)
@@ -3845,13 +4526,7 @@ int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3845#endif 4526#endif
3846 4527
3847 regs->rip = kvm_rip_read(vcpu); 4528 regs->rip = kvm_rip_read(vcpu);
3848 regs->rflags = kvm_x86_ops->get_rflags(vcpu); 4529 regs->rflags = kvm_get_rflags(vcpu);
3849
3850 /*
3851 * Don't leak debug flags in case they were set for guest debugging
3852 */
3853 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
3854 regs->rflags &= ~(X86_EFLAGS_TF | X86_EFLAGS_RF);
3855 4530
3856 vcpu_put(vcpu); 4531 vcpu_put(vcpu);
3857 4532
@@ -3879,12 +4554,10 @@ int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs)
3879 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13); 4554 kvm_register_write(vcpu, VCPU_REGS_R13, regs->r13);
3880 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14); 4555 kvm_register_write(vcpu, VCPU_REGS_R14, regs->r14);
3881 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15); 4556 kvm_register_write(vcpu, VCPU_REGS_R15, regs->r15);
3882
3883#endif 4557#endif
3884 4558
3885 kvm_rip_write(vcpu, regs->rip); 4559 kvm_rip_write(vcpu, regs->rip);
3886 kvm_x86_ops->set_rflags(vcpu, regs->rflags); 4560 kvm_set_rflags(vcpu, regs->rflags);
3887
3888 4561
3889 vcpu->arch.exception.pending = false; 4562 vcpu->arch.exception.pending = false;
3890 4563
@@ -3933,13 +4606,12 @@ int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu,
3933 sregs->gdt.limit = dt.limit; 4606 sregs->gdt.limit = dt.limit;
3934 sregs->gdt.base = dt.base; 4607 sregs->gdt.base = dt.base;
3935 4608
3936 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 4609 sregs->cr0 = kvm_read_cr0(vcpu);
3937 sregs->cr0 = vcpu->arch.cr0;
3938 sregs->cr2 = vcpu->arch.cr2; 4610 sregs->cr2 = vcpu->arch.cr2;
3939 sregs->cr3 = vcpu->arch.cr3; 4611 sregs->cr3 = vcpu->arch.cr3;
3940 sregs->cr4 = vcpu->arch.cr4; 4612 sregs->cr4 = kvm_read_cr4(vcpu);
3941 sregs->cr8 = kvm_get_cr8(vcpu); 4613 sregs->cr8 = kvm_get_cr8(vcpu);
3942 sregs->efer = vcpu->arch.shadow_efer; 4614 sregs->efer = vcpu->arch.efer;
3943 sregs->apic_base = kvm_get_apic_base(vcpu); 4615 sregs->apic_base = kvm_get_apic_base(vcpu);
3944 4616
3945 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap); 4617 memset(sregs->interrupt_bitmap, 0, sizeof sregs->interrupt_bitmap);
@@ -4027,14 +4699,23 @@ static int load_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4027{ 4699{
4028 struct descriptor_table dtable; 4700 struct descriptor_table dtable;
4029 u16 index = selector >> 3; 4701 u16 index = selector >> 3;
4702 int ret;
4703 u32 err;
4704 gva_t addr;
4030 4705
4031 get_segment_descriptor_dtable(vcpu, selector, &dtable); 4706 get_segment_descriptor_dtable(vcpu, selector, &dtable);
4032 4707
4033 if (dtable.limit < index * 8 + 7) { 4708 if (dtable.limit < index * 8 + 7) {
4034 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc); 4709 kvm_queue_exception_e(vcpu, GP_VECTOR, selector & 0xfffc);
4035 return 1; 4710 return X86EMUL_PROPAGATE_FAULT;
4036 } 4711 }
4037 return kvm_read_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4712 addr = dtable.base + index * 8;
4713 ret = kvm_read_guest_virt_system(addr, seg_desc, sizeof(*seg_desc),
4714 vcpu, &err);
4715 if (ret == X86EMUL_PROPAGATE_FAULT)
4716 kvm_inject_page_fault(vcpu, addr, err);
4717
4718 return ret;
4038} 4719}
4039 4720
4040/* allowed just for 8 bytes segments */ 4721/* allowed just for 8 bytes segments */
@@ -4048,15 +4729,23 @@ static int save_guest_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector,
4048 4729
4049 if (dtable.limit < index * 8 + 7) 4730 if (dtable.limit < index * 8 + 7)
4050 return 1; 4731 return 1;
4051 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu); 4732 return kvm_write_guest_virt(dtable.base + index*8, seg_desc, sizeof(*seg_desc), vcpu, NULL);
4733}
4734
4735static gpa_t get_tss_base_addr_write(struct kvm_vcpu *vcpu,
4736 struct desc_struct *seg_desc)
4737{
4738 u32 base_addr = get_desc_base(seg_desc);
4739
4740 return kvm_mmu_gva_to_gpa_write(vcpu, base_addr, NULL);
4052} 4741}
4053 4742
4054static gpa_t get_tss_base_addr(struct kvm_vcpu *vcpu, 4743static gpa_t get_tss_base_addr_read(struct kvm_vcpu *vcpu,
4055 struct desc_struct *seg_desc) 4744 struct desc_struct *seg_desc)
4056{ 4745{
4057 u32 base_addr = get_desc_base(seg_desc); 4746 u32 base_addr = get_desc_base(seg_desc);
4058 4747
4059 return vcpu->arch.mmu.gva_to_gpa(vcpu, base_addr); 4748 return kvm_mmu_gva_to_gpa_read(vcpu, base_addr, NULL);
4060} 4749}
4061 4750
4062static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg) 4751static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
@@ -4067,18 +4756,6 @@ static u16 get_segment_selector(struct kvm_vcpu *vcpu, int seg)
4067 return kvm_seg.selector; 4756 return kvm_seg.selector;
4068} 4757}
4069 4758
4070static int load_segment_descriptor_to_kvm_desct(struct kvm_vcpu *vcpu,
4071 u16 selector,
4072 struct kvm_segment *kvm_seg)
4073{
4074 struct desc_struct seg_desc;
4075
4076 if (load_guest_segment_descriptor(vcpu, selector, &seg_desc))
4077 return 1;
4078 seg_desct_to_kvm_desct(&seg_desc, selector, kvm_seg);
4079 return 0;
4080}
4081
4082static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg) 4759static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int seg)
4083{ 4760{
4084 struct kvm_segment segvar = { 4761 struct kvm_segment segvar = {
@@ -4096,34 +4773,122 @@ static int kvm_load_realmode_segment(struct kvm_vcpu *vcpu, u16 selector, int se
4096 .unusable = 0, 4773 .unusable = 0,
4097 }; 4774 };
4098 kvm_x86_ops->set_segment(vcpu, &segvar, seg); 4775 kvm_x86_ops->set_segment(vcpu, &segvar, seg);
4099 return 0; 4776 return X86EMUL_CONTINUE;
4100} 4777}
4101 4778
4102static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg) 4779static int is_vm86_segment(struct kvm_vcpu *vcpu, int seg)
4103{ 4780{
4104 return (seg != VCPU_SREG_LDTR) && 4781 return (seg != VCPU_SREG_LDTR) &&
4105 (seg != VCPU_SREG_TR) && 4782 (seg != VCPU_SREG_TR) &&
4106 (kvm_x86_ops->get_rflags(vcpu) & X86_EFLAGS_VM); 4783 (kvm_get_rflags(vcpu) & X86_EFLAGS_VM);
4107} 4784}
4108 4785
4109int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, 4786int kvm_load_segment_descriptor(struct kvm_vcpu *vcpu, u16 selector, int seg)
4110 int type_bits, int seg)
4111{ 4787{
4112 struct kvm_segment kvm_seg; 4788 struct kvm_segment kvm_seg;
4789 struct desc_struct seg_desc;
4790 u8 dpl, rpl, cpl;
4791 unsigned err_vec = GP_VECTOR;
4792 u32 err_code = 0;
4793 bool null_selector = !(selector & ~0x3); /* 0000-0003 are null */
4794 int ret;
4113 4795
4114 if (is_vm86_segment(vcpu, seg) || !(vcpu->arch.cr0 & X86_CR0_PE)) 4796 if (is_vm86_segment(vcpu, seg) || !is_protmode(vcpu))
4115 return kvm_load_realmode_segment(vcpu, selector, seg); 4797 return kvm_load_realmode_segment(vcpu, selector, seg);
4116 if (load_segment_descriptor_to_kvm_desct(vcpu, selector, &kvm_seg))
4117 return 1;
4118 kvm_seg.type |= type_bits;
4119 4798
4120 if (seg != VCPU_SREG_SS && seg != VCPU_SREG_CS && 4799 /* NULL selector is not valid for TR, CS and SS */
4121 seg != VCPU_SREG_LDTR) 4800 if ((seg == VCPU_SREG_CS || seg == VCPU_SREG_SS || seg == VCPU_SREG_TR)
4122 if (!kvm_seg.s) 4801 && null_selector)
4123 kvm_seg.unusable = 1; 4802 goto exception;
4803
4804 /* TR should be in GDT only */
4805 if (seg == VCPU_SREG_TR && (selector & (1 << 2)))
4806 goto exception;
4807
4808 ret = load_guest_segment_descriptor(vcpu, selector, &seg_desc);
4809 if (ret)
4810 return ret;
4811
4812 seg_desct_to_kvm_desct(&seg_desc, selector, &kvm_seg);
4813
4814 if (null_selector) { /* for NULL selector skip all following checks */
4815 kvm_seg.unusable = 1;
4816 goto load;
4817 }
4818
4819 err_code = selector & 0xfffc;
4820 err_vec = GP_VECTOR;
4124 4821
4822	/* can't load a system descriptor into a segment selector */
4823 if (seg <= VCPU_SREG_GS && !kvm_seg.s)
4824 goto exception;
4825
4826 if (!kvm_seg.present) {
4827 err_vec = (seg == VCPU_SREG_SS) ? SS_VECTOR : NP_VECTOR;
4828 goto exception;
4829 }
4830
4831 rpl = selector & 3;
4832 dpl = kvm_seg.dpl;
4833 cpl = kvm_x86_ops->get_cpl(vcpu);
4834
4835 switch (seg) {
4836 case VCPU_SREG_SS:
4837 /*
4838		 * segment is not a writable data segment, or the segment
4839		 * selector's RPL != CPL, or the segment descriptor's DPL != CPL
4840 */
4841 if (rpl != cpl || (kvm_seg.type & 0xa) != 0x2 || dpl != cpl)
4842 goto exception;
4843 break;
4844 case VCPU_SREG_CS:
4845 if (!(kvm_seg.type & 8))
4846 goto exception;
4847
4848 if (kvm_seg.type & 4) {
4849 /* conforming */
4850 if (dpl > cpl)
4851 goto exception;
4852 } else {
4853 /* nonconforming */
4854 if (rpl > cpl || dpl != cpl)
4855 goto exception;
4856 }
4857 /* CS(RPL) <- CPL */
4858 selector = (selector & 0xfffc) | cpl;
4859 break;
4860 case VCPU_SREG_TR:
4861 if (kvm_seg.s || (kvm_seg.type != 1 && kvm_seg.type != 9))
4862 goto exception;
4863 break;
4864 case VCPU_SREG_LDTR:
4865 if (kvm_seg.s || kvm_seg.type != 2)
4866 goto exception;
4867 break;
4868 default: /* DS, ES, FS, or GS */
4869 /*
4870 * segment is not a data or readable code segment or
4871 * ((segment is a data or nonconforming code segment)
4872 * and (both RPL and CPL > DPL))
4873 */
4874 if ((kvm_seg.type & 0xa) == 0x8 ||
4875 (((kvm_seg.type & 0xc) != 0xc) && (rpl > dpl && cpl > dpl)))
4876 goto exception;
4877 break;
4878 }
4879
4880 if (!kvm_seg.unusable && kvm_seg.s) {
4881 /* mark segment as accessed */
4882 kvm_seg.type |= 1;
4883 seg_desc.type |= 1;
4884 save_guest_segment_descriptor(vcpu, selector, &seg_desc);
4885 }
4886load:
4125 kvm_set_segment(vcpu, &kvm_seg, seg); 4887 kvm_set_segment(vcpu, &kvm_seg, seg);
4126 return 0; 4888 return X86EMUL_CONTINUE;
4889exception:
4890 kvm_queue_exception_e(vcpu, err_vec, err_code);
4891 return X86EMUL_PROPAGATE_FAULT;
4127} 4892}
4128 4893
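The VCPU_SREG_SS case above accepts only a present, writable data segment whose DPL equals both the selector RPL and the current CPL. A self-contained model of that check; the field names and constants are simplified for illustration and are not the kernel's.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

struct seg_check {
	uint8_t type;	/* descriptor type field */
	uint8_t dpl;
	bool    present;
};

static bool ss_load_ok(const struct seg_check *seg, uint8_t rpl, uint8_t cpl)
{
	if (!seg->present)
		return false;
	if (rpl != cpl || seg->dpl != cpl)
		return false;
	/* writable data segment: code/expand-down bits clear, write bit set */
	return (seg->type & 0xa) == 0x2;
}

int main(void)
{
	struct seg_check data_rw = { .type = 0x3, .dpl = 0, .present = true };
	struct seg_check code    = { .type = 0xb, .dpl = 0, .present = true };

	printf("load RW data into SS at CPL0: %d\n", ss_load_ok(&data_rw, 0, 0));
	printf("load code segment into SS:    %d\n", ss_load_ok(&code, 0, 0));
	return 0;
}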
4129static void save_state_to_tss32(struct kvm_vcpu *vcpu, 4894static void save_state_to_tss32(struct kvm_vcpu *vcpu,
@@ -4131,7 +4896,7 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4131{ 4896{
4132 tss->cr3 = vcpu->arch.cr3; 4897 tss->cr3 = vcpu->arch.cr3;
4133 tss->eip = kvm_rip_read(vcpu); 4898 tss->eip = kvm_rip_read(vcpu);
4134 tss->eflags = kvm_x86_ops->get_rflags(vcpu); 4899 tss->eflags = kvm_get_rflags(vcpu);
4135 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4900 tss->eax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4136 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4901 tss->ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4137 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4902 tss->edx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4149,13 +4914,21 @@ static void save_state_to_tss32(struct kvm_vcpu *vcpu,
4149 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4914 tss->ldt_selector = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4150} 4915}
4151 4916
4917static void kvm_load_segment_selector(struct kvm_vcpu *vcpu, u16 sel, int seg)
4918{
4919 struct kvm_segment kvm_seg;
4920 kvm_get_segment(vcpu, &kvm_seg, seg);
4921 kvm_seg.selector = sel;
4922 kvm_set_segment(vcpu, &kvm_seg, seg);
4923}
4924
4152static int load_state_from_tss32(struct kvm_vcpu *vcpu, 4925static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4153 struct tss_segment_32 *tss) 4926 struct tss_segment_32 *tss)
4154{ 4927{
4155 kvm_set_cr3(vcpu, tss->cr3); 4928 kvm_set_cr3(vcpu, tss->cr3);
4156 4929
4157 kvm_rip_write(vcpu, tss->eip); 4930 kvm_rip_write(vcpu, tss->eip);
4158 kvm_x86_ops->set_rflags(vcpu, tss->eflags | 2); 4931 kvm_set_rflags(vcpu, tss->eflags | 2);
4159 4932
4160 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax); 4933 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->eax);
4161 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx); 4934 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->ecx);
@@ -4166,25 +4939,41 @@ static int load_state_from_tss32(struct kvm_vcpu *vcpu,
4166 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi); 4939 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->esi);
4167 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi); 4940 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->edi);
4168 4941
4169 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, 0, VCPU_SREG_LDTR)) 4942 /*
4943 * SDM says that segment selectors are loaded before segment
4944 * descriptors
4945 */
4946 kvm_load_segment_selector(vcpu, tss->ldt_selector, VCPU_SREG_LDTR);
4947 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
4948 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
4949 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
4950 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
4951 kvm_load_segment_selector(vcpu, tss->fs, VCPU_SREG_FS);
4952 kvm_load_segment_selector(vcpu, tss->gs, VCPU_SREG_GS);
4953
4954 /*
4955	 * Now load the segment descriptors. If a fault happens at this stage,
4956	 * it is handled in the context of the new task
4957 */
4958 if (kvm_load_segment_descriptor(vcpu, tss->ldt_selector, VCPU_SREG_LDTR))
4170 return 1; 4959 return 1;
4171 4960
4172 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 4961 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4173 return 1; 4962 return 1;
4174 4963
4175 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 4964 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4176 return 1; 4965 return 1;
4177 4966
4178 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 4967 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4179 return 1; 4968 return 1;
4180 4969
4181 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 4970 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4182 return 1; 4971 return 1;
4183 4972
4184 if (kvm_load_segment_descriptor(vcpu, tss->fs, 1, VCPU_SREG_FS)) 4973 if (kvm_load_segment_descriptor(vcpu, tss->fs, VCPU_SREG_FS))
4185 return 1; 4974 return 1;
4186 4975
4187 if (kvm_load_segment_descriptor(vcpu, tss->gs, 1, VCPU_SREG_GS)) 4976 if (kvm_load_segment_descriptor(vcpu, tss->gs, VCPU_SREG_GS))
4188 return 1; 4977 return 1;
4189 return 0; 4978 return 0;
4190} 4979}
@@ -4193,7 +4982,7 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4193 struct tss_segment_16 *tss) 4982 struct tss_segment_16 *tss)
4194{ 4983{
4195 tss->ip = kvm_rip_read(vcpu); 4984 tss->ip = kvm_rip_read(vcpu);
4196 tss->flag = kvm_x86_ops->get_rflags(vcpu); 4985 tss->flag = kvm_get_rflags(vcpu);
4197 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX); 4986 tss->ax = kvm_register_read(vcpu, VCPU_REGS_RAX);
4198 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX); 4987 tss->cx = kvm_register_read(vcpu, VCPU_REGS_RCX);
4199 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX); 4988 tss->dx = kvm_register_read(vcpu, VCPU_REGS_RDX);
@@ -4208,14 +4997,13 @@ static void save_state_to_tss16(struct kvm_vcpu *vcpu,
4208 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS); 4997 tss->ss = get_segment_selector(vcpu, VCPU_SREG_SS);
4209 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS); 4998 tss->ds = get_segment_selector(vcpu, VCPU_SREG_DS);
4210 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR); 4999 tss->ldt = get_segment_selector(vcpu, VCPU_SREG_LDTR);
4211 tss->prev_task_link = get_segment_selector(vcpu, VCPU_SREG_TR);
4212} 5000}
4213 5001
4214static int load_state_from_tss16(struct kvm_vcpu *vcpu, 5002static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4215 struct tss_segment_16 *tss) 5003 struct tss_segment_16 *tss)
4216{ 5004{
4217 kvm_rip_write(vcpu, tss->ip); 5005 kvm_rip_write(vcpu, tss->ip);
4218 kvm_x86_ops->set_rflags(vcpu, tss->flag | 2); 5006 kvm_set_rflags(vcpu, tss->flag | 2);
4219 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax); 5007 kvm_register_write(vcpu, VCPU_REGS_RAX, tss->ax);
4220 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx); 5008 kvm_register_write(vcpu, VCPU_REGS_RCX, tss->cx);
4221 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx); 5009 kvm_register_write(vcpu, VCPU_REGS_RDX, tss->dx);
@@ -4225,19 +5013,33 @@ static int load_state_from_tss16(struct kvm_vcpu *vcpu,
4225 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si); 5013 kvm_register_write(vcpu, VCPU_REGS_RSI, tss->si);
4226 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di); 5014 kvm_register_write(vcpu, VCPU_REGS_RDI, tss->di);
4227 5015
4228 if (kvm_load_segment_descriptor(vcpu, tss->ldt, 0, VCPU_SREG_LDTR)) 5016 /*
5017 * SDM says that segment selectors are loaded before segment
5018 * descriptors
5019 */
5020 kvm_load_segment_selector(vcpu, tss->ldt, VCPU_SREG_LDTR);
5021 kvm_load_segment_selector(vcpu, tss->es, VCPU_SREG_ES);
5022 kvm_load_segment_selector(vcpu, tss->cs, VCPU_SREG_CS);
5023 kvm_load_segment_selector(vcpu, tss->ss, VCPU_SREG_SS);
5024 kvm_load_segment_selector(vcpu, tss->ds, VCPU_SREG_DS);
5025
5026 /*
5027	 * Now load the segment descriptors. If a fault happens at this stage,
5028	 * it is handled in the context of the new task
5029 */
5030 if (kvm_load_segment_descriptor(vcpu, tss->ldt, VCPU_SREG_LDTR))
4229 return 1; 5031 return 1;
4230 5032
4231 if (kvm_load_segment_descriptor(vcpu, tss->es, 1, VCPU_SREG_ES)) 5033 if (kvm_load_segment_descriptor(vcpu, tss->es, VCPU_SREG_ES))
4232 return 1; 5034 return 1;
4233 5035
4234 if (kvm_load_segment_descriptor(vcpu, tss->cs, 9, VCPU_SREG_CS)) 5036 if (kvm_load_segment_descriptor(vcpu, tss->cs, VCPU_SREG_CS))
4235 return 1; 5037 return 1;
4236 5038
4237 if (kvm_load_segment_descriptor(vcpu, tss->ss, 1, VCPU_SREG_SS)) 5039 if (kvm_load_segment_descriptor(vcpu, tss->ss, VCPU_SREG_SS))
4238 return 1; 5040 return 1;
4239 5041
4240 if (kvm_load_segment_descriptor(vcpu, tss->ds, 1, VCPU_SREG_DS)) 5042 if (kvm_load_segment_descriptor(vcpu, tss->ds, VCPU_SREG_DS))
4241 return 1; 5043 return 1;
4242 return 0; 5044 return 0;
4243} 5045}
@@ -4259,7 +5061,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4259 sizeof tss_segment_16)) 5061 sizeof tss_segment_16))
4260 goto out; 5062 goto out;
4261 5063
4262 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5064 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4263 &tss_segment_16, sizeof tss_segment_16)) 5065 &tss_segment_16, sizeof tss_segment_16))
4264 goto out; 5066 goto out;
4265 5067
@@ -4267,7 +5069,7 @@ static int kvm_task_switch_16(struct kvm_vcpu *vcpu, u16 tss_selector,
4267 tss_segment_16.prev_task_link = old_tss_sel; 5069 tss_segment_16.prev_task_link = old_tss_sel;
4268 5070
4269 if (kvm_write_guest(vcpu->kvm, 5071 if (kvm_write_guest(vcpu->kvm,
4270 get_tss_base_addr(vcpu, nseg_desc), 5072 get_tss_base_addr_write(vcpu, nseg_desc),
4271 &tss_segment_16.prev_task_link, 5073 &tss_segment_16.prev_task_link,
4272 sizeof tss_segment_16.prev_task_link)) 5074 sizeof tss_segment_16.prev_task_link))
4273 goto out; 5075 goto out;
@@ -4298,7 +5100,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4298 sizeof tss_segment_32)) 5100 sizeof tss_segment_32))
4299 goto out; 5101 goto out;
4300 5102
4301 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr(vcpu, nseg_desc), 5103 if (kvm_read_guest(vcpu->kvm, get_tss_base_addr_read(vcpu, nseg_desc),
4302 &tss_segment_32, sizeof tss_segment_32)) 5104 &tss_segment_32, sizeof tss_segment_32))
4303 goto out; 5105 goto out;
4304 5106
@@ -4306,7 +5108,7 @@ static int kvm_task_switch_32(struct kvm_vcpu *vcpu, u16 tss_selector,
4306 tss_segment_32.prev_task_link = old_tss_sel; 5108 tss_segment_32.prev_task_link = old_tss_sel;
4307 5109
4308 if (kvm_write_guest(vcpu->kvm, 5110 if (kvm_write_guest(vcpu->kvm,
4309 get_tss_base_addr(vcpu, nseg_desc), 5111 get_tss_base_addr_write(vcpu, nseg_desc),
4310 &tss_segment_32.prev_task_link, 5112 &tss_segment_32.prev_task_link,
4311 sizeof tss_segment_32.prev_task_link)) 5113 sizeof tss_segment_32.prev_task_link))
4312 goto out; 5114 goto out;
@@ -4328,8 +5130,9 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4328 int ret = 0; 5130 int ret = 0;
4329 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR); 5131 u32 old_tss_base = get_segment_base(vcpu, VCPU_SREG_TR);
4330 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR); 5132 u16 old_tss_sel = get_segment_selector(vcpu, VCPU_SREG_TR);
5133 u32 desc_limit;
4331 5134
4332 old_tss_base = vcpu->arch.mmu.gva_to_gpa(vcpu, old_tss_base); 5135 old_tss_base = kvm_mmu_gva_to_gpa_write(vcpu, old_tss_base, NULL);
4333 5136
4334 /* FIXME: Handle errors. Failure to read either TSS or their 5137 /* FIXME: Handle errors. Failure to read either TSS or their
4335 * descriptors should generate a pagefault. 5138 * descriptors should generate a pagefault.
@@ -4350,7 +5153,10 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4350 } 5153 }
4351 } 5154 }
4352 5155
4353 if (!nseg_desc.p || get_desc_limit(&nseg_desc) < 0x67) { 5156 desc_limit = get_desc_limit(&nseg_desc);
5157 if (!nseg_desc.p ||
5158 ((desc_limit < 0x67 && (nseg_desc.type & 8)) ||
5159 desc_limit < 0x2b)) {
4354 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc); 5160 kvm_queue_exception_e(vcpu, TS_VECTOR, tss_selector & 0xfffc);
4355 return 1; 5161 return 1;
4356 } 5162 }
@@ -4361,8 +5167,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4361 } 5167 }
4362 5168
4363 if (reason == TASK_SWITCH_IRET) { 5169 if (reason == TASK_SWITCH_IRET) {
4364 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 5170 u32 eflags = kvm_get_rflags(vcpu);
4365 kvm_x86_ops->set_rflags(vcpu, eflags & ~X86_EFLAGS_NT); 5171 kvm_set_rflags(vcpu, eflags & ~X86_EFLAGS_NT);
4366 } 5172 }
4367 5173
4368 /* set back link to prev task only if NT bit is set in eflags 5174 /* set back link to prev task only if NT bit is set in eflags
@@ -4370,11 +5176,6 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4370 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE) 5176 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4371 old_tss_sel = 0xffff; 5177 old_tss_sel = 0xffff;
4372 5178
4373 /* set back link to prev task only if NT bit is set in eflags
4374 note that old_tss_sel is not used afetr this point */
4375 if (reason != TASK_SWITCH_CALL && reason != TASK_SWITCH_GATE)
4376 old_tss_sel = 0xffff;
4377
4378 if (nseg_desc.type & 8) 5179 if (nseg_desc.type & 8)
4379 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel, 5180 ret = kvm_task_switch_32(vcpu, tss_selector, old_tss_sel,
4380 old_tss_base, &nseg_desc); 5181 old_tss_base, &nseg_desc);
@@ -4383,8 +5184,8 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4383 old_tss_base, &nseg_desc); 5184 old_tss_base, &nseg_desc);
4384 5185
4385 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) { 5186 if (reason == TASK_SWITCH_CALL || reason == TASK_SWITCH_GATE) {
4386 u32 eflags = kvm_x86_ops->get_rflags(vcpu); 5187 u32 eflags = kvm_get_rflags(vcpu);
4387 kvm_x86_ops->set_rflags(vcpu, eflags | X86_EFLAGS_NT); 5188 kvm_set_rflags(vcpu, eflags | X86_EFLAGS_NT);
4388 } 5189 }
4389 5190
4390 if (reason != TASK_SWITCH_IRET) { 5191 if (reason != TASK_SWITCH_IRET) {
@@ -4393,7 +5194,7 @@ int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int reason)
4393 &nseg_desc); 5194 &nseg_desc);
4394 } 5195 }
4395 5196
4396 kvm_x86_ops->set_cr0(vcpu, vcpu->arch.cr0 | X86_CR0_TS); 5197 kvm_x86_ops->set_cr0(vcpu, kvm_read_cr0(vcpu) | X86_CR0_TS);
4397 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg); 5198 seg_desct_to_kvm_desct(&nseg_desc, tss_selector, &tr_seg);
4398 tr_seg.type = 11; 5199 tr_seg.type = 11;
4399 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR); 5200 kvm_set_segment(vcpu, &tr_seg, VCPU_SREG_TR);
@@ -4424,20 +5225,20 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4424 5225
4425 kvm_set_cr8(vcpu, sregs->cr8); 5226 kvm_set_cr8(vcpu, sregs->cr8);
4426 5227
4427 mmu_reset_needed |= vcpu->arch.shadow_efer != sregs->efer; 5228 mmu_reset_needed |= vcpu->arch.efer != sregs->efer;
4428 kvm_x86_ops->set_efer(vcpu, sregs->efer); 5229 kvm_x86_ops->set_efer(vcpu, sregs->efer);
4429 kvm_set_apic_base(vcpu, sregs->apic_base); 5230 kvm_set_apic_base(vcpu, sregs->apic_base);
4430 5231
4431 kvm_x86_ops->decache_cr4_guest_bits(vcpu); 5232 mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0;
4432
4433 mmu_reset_needed |= vcpu->arch.cr0 != sregs->cr0;
4434 kvm_x86_ops->set_cr0(vcpu, sregs->cr0); 5233 kvm_x86_ops->set_cr0(vcpu, sregs->cr0);
4435 vcpu->arch.cr0 = sregs->cr0; 5234 vcpu->arch.cr0 = sregs->cr0;
4436 5235
4437 mmu_reset_needed |= vcpu->arch.cr4 != sregs->cr4; 5236 mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4;
4438 kvm_x86_ops->set_cr4(vcpu, sregs->cr4); 5237 kvm_x86_ops->set_cr4(vcpu, sregs->cr4);
4439 if (!is_long_mode(vcpu) && is_pae(vcpu)) 5238 if (!is_long_mode(vcpu) && is_pae(vcpu)) {
4440 load_pdptrs(vcpu, vcpu->arch.cr3); 5239 load_pdptrs(vcpu, vcpu->arch.cr3);
5240 mmu_reset_needed = 1;
5241 }
4441 5242
4442 if (mmu_reset_needed) 5243 if (mmu_reset_needed)
4443 kvm_mmu_reset_context(vcpu); 5244 kvm_mmu_reset_context(vcpu);
@@ -4467,7 +5268,7 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4467 /* Older userspace won't unhalt the vcpu on reset. */ 5268 /* Older userspace won't unhalt the vcpu on reset. */
4468 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && 5269 if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 &&
4469 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && 5270 sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 &&
4470 !(vcpu->arch.cr0 & X86_CR0_PE)) 5271 !is_protmode(vcpu))
4471 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; 5272 vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
4472 5273
4473 vcpu_put(vcpu); 5274 vcpu_put(vcpu);
@@ -4478,12 +5279,32 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu,
4478int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, 5279int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4479 struct kvm_guest_debug *dbg) 5280 struct kvm_guest_debug *dbg)
4480{ 5281{
5282 unsigned long rflags;
4481 int i, r; 5283 int i, r;
4482 5284
4483 vcpu_load(vcpu); 5285 vcpu_load(vcpu);
4484 5286
4485 if ((dbg->control & (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) == 5287 if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) {
4486 (KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_USE_HW_BP)) { 5288 r = -EBUSY;
5289 if (vcpu->arch.exception.pending)
5290 goto unlock_out;
5291 if (dbg->control & KVM_GUESTDBG_INJECT_DB)
5292 kvm_queue_exception(vcpu, DB_VECTOR);
5293 else
5294 kvm_queue_exception(vcpu, BP_VECTOR);
5295 }
5296
5297 /*
5298 * Read rflags as long as potentially injected trace flags are still
5299 * filtered out.
5300 */
5301 rflags = kvm_get_rflags(vcpu);
5302
5303 vcpu->guest_debug = dbg->control;
5304 if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE))
5305 vcpu->guest_debug = 0;
5306
5307 if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
4487 for (i = 0; i < KVM_NR_DB_REGS; ++i) 5308 for (i = 0; i < KVM_NR_DB_REGS; ++i)
4488 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; 5309 vcpu->arch.eff_db[i] = dbg->arch.debugreg[i];
4489 vcpu->arch.switch_db_regs = 5310 vcpu->arch.switch_db_regs =
@@ -4494,13 +5315,23 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu,
4494 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK); 5315 vcpu->arch.switch_db_regs = (vcpu->arch.dr7 & DR7_BP_EN_MASK);
4495 } 5316 }
4496 5317
4497 r = kvm_x86_ops->set_guest_debug(vcpu, dbg); 5318 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) {
5319 vcpu->arch.singlestep_cs =
5320 get_segment_selector(vcpu, VCPU_SREG_CS);
5321 vcpu->arch.singlestep_rip = kvm_rip_read(vcpu);
5322 }
5323
5324 /*
5325 * Trigger an rflags update that will inject or remove the trace
5326 * flags.
5327 */
5328 kvm_set_rflags(vcpu, rflags);
4498 5329
4499 if (dbg->control & KVM_GUESTDBG_INJECT_DB) 5330 kvm_x86_ops->set_guest_debug(vcpu, dbg);
4500 kvm_queue_exception(vcpu, DB_VECTOR);
4501 else if (dbg->control & KVM_GUESTDBG_INJECT_BP)
4502 kvm_queue_exception(vcpu, BP_VECTOR);
4503 5331
5332 r = 0;
5333
5334unlock_out:
4504 vcpu_put(vcpu); 5335 vcpu_put(vcpu);
4505 5336
4506 return r; 5337 return r;
@@ -4535,11 +5366,12 @@ int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu,
4535{ 5366{
4536 unsigned long vaddr = tr->linear_address; 5367 unsigned long vaddr = tr->linear_address;
4537 gpa_t gpa; 5368 gpa_t gpa;
5369 int idx;
4538 5370
4539 vcpu_load(vcpu); 5371 vcpu_load(vcpu);
4540 down_read(&vcpu->kvm->slots_lock); 5372 idx = srcu_read_lock(&vcpu->kvm->srcu);
4541 gpa = vcpu->arch.mmu.gva_to_gpa(vcpu, vaddr); 5373 gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
4542 up_read(&vcpu->kvm->slots_lock); 5374 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4543 tr->physical_address = gpa; 5375 tr->physical_address = gpa;
4544 tr->valid = gpa != UNMAPPED_GVA; 5376 tr->valid = gpa != UNMAPPED_GVA;
4545 tr->writeable = 1; 5377 tr->writeable = 1;
@@ -4620,14 +5452,14 @@ EXPORT_SYMBOL_GPL(fx_init);
4620 5452
4621void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) 5453void kvm_load_guest_fpu(struct kvm_vcpu *vcpu)
4622{ 5454{
4623 if (!vcpu->fpu_active || vcpu->guest_fpu_loaded) 5455 if (vcpu->guest_fpu_loaded)
4624 return; 5456 return;
4625 5457
4626 vcpu->guest_fpu_loaded = 1; 5458 vcpu->guest_fpu_loaded = 1;
4627 kvm_fx_save(&vcpu->arch.host_fx_image); 5459 kvm_fx_save(&vcpu->arch.host_fx_image);
4628 kvm_fx_restore(&vcpu->arch.guest_fx_image); 5460 kvm_fx_restore(&vcpu->arch.guest_fx_image);
5461 trace_kvm_fpu(1);
4629} 5462}
4630EXPORT_SYMBOL_GPL(kvm_load_guest_fpu);
4631 5463
4632void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) 5464void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4633{ 5465{
@@ -4638,8 +5470,9 @@ void kvm_put_guest_fpu(struct kvm_vcpu *vcpu)
4638 kvm_fx_save(&vcpu->arch.guest_fx_image); 5470 kvm_fx_save(&vcpu->arch.guest_fx_image);
4639 kvm_fx_restore(&vcpu->arch.host_fx_image); 5471 kvm_fx_restore(&vcpu->arch.host_fx_image);
4640 ++vcpu->stat.fpu_reload; 5472 ++vcpu->stat.fpu_reload;
5473 set_bit(KVM_REQ_DEACTIVATE_FPU, &vcpu->requests);
5474 trace_kvm_fpu(0);
4641} 5475}
4642EXPORT_SYMBOL_GPL(kvm_put_guest_fpu);
4643 5476
4644void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu) 5477void kvm_arch_vcpu_free(struct kvm_vcpu *vcpu)
4645{ 5478{
@@ -4701,14 +5534,26 @@ int kvm_arch_vcpu_reset(struct kvm_vcpu *vcpu)
4701 return kvm_x86_ops->vcpu_reset(vcpu); 5534 return kvm_x86_ops->vcpu_reset(vcpu);
4702} 5535}
4703 5536
4704void kvm_arch_hardware_enable(void *garbage) 5537int kvm_arch_hardware_enable(void *garbage)
4705{ 5538{
4706 kvm_x86_ops->hardware_enable(garbage); 5539 /*
5540	 * Since this may be called from a hotplug notification,
5541 * we can't get the CPU frequency directly.
5542 */
5543 if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
5544 int cpu = raw_smp_processor_id();
5545 per_cpu(cpu_tsc_khz, cpu) = 0;
5546 }
5547
5548 kvm_shared_msr_cpu_online();
5549
5550 return kvm_x86_ops->hardware_enable(garbage);
4707} 5551}
4708 5552
4709void kvm_arch_hardware_disable(void *garbage) 5553void kvm_arch_hardware_disable(void *garbage)
4710{ 5554{
4711 kvm_x86_ops->hardware_disable(garbage); 5555 kvm_x86_ops->hardware_disable(garbage);
5556 drop_user_return_notifiers(garbage);
4712} 5557}
4713 5558
4714int kvm_arch_hardware_setup(void) 5559int kvm_arch_hardware_setup(void)
@@ -4762,12 +5607,13 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
4762 GFP_KERNEL); 5607 GFP_KERNEL);
4763 if (!vcpu->arch.mce_banks) { 5608 if (!vcpu->arch.mce_banks) {
4764 r = -ENOMEM; 5609 r = -ENOMEM;
4765 goto fail_mmu_destroy; 5610 goto fail_free_lapic;
4766 } 5611 }
4767 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; 5612 vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS;
4768 5613
4769 return 0; 5614 return 0;
4770 5615fail_free_lapic:
5616 kvm_free_lapic(vcpu);
4771fail_mmu_destroy: 5617fail_mmu_destroy:
4772 kvm_mmu_destroy(vcpu); 5618 kvm_mmu_destroy(vcpu);
4773fail_free_pio_data: 5619fail_free_pio_data:
@@ -4778,10 +5624,13 @@ fail:
4778 5624
4779void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu) 5625void kvm_arch_vcpu_uninit(struct kvm_vcpu *vcpu)
4780{ 5626{
5627 int idx;
5628
5629 kfree(vcpu->arch.mce_banks);
4781 kvm_free_lapic(vcpu); 5630 kvm_free_lapic(vcpu);
4782 down_read(&vcpu->kvm->slots_lock); 5631 idx = srcu_read_lock(&vcpu->kvm->srcu);
4783 kvm_mmu_destroy(vcpu); 5632 kvm_mmu_destroy(vcpu);
4784 up_read(&vcpu->kvm->slots_lock); 5633 srcu_read_unlock(&vcpu->kvm->srcu, idx);
4785 free_page((unsigned long)vcpu->arch.pio_data); 5634 free_page((unsigned long)vcpu->arch.pio_data);
4786} 5635}
4787 5636
@@ -4792,6 +5641,12 @@ struct kvm *kvm_arch_create_vm(void)
4792 if (!kvm) 5641 if (!kvm)
4793 return ERR_PTR(-ENOMEM); 5642 return ERR_PTR(-ENOMEM);
4794 5643
5644 kvm->arch.aliases = kzalloc(sizeof(struct kvm_mem_aliases), GFP_KERNEL);
5645 if (!kvm->arch.aliases) {
5646 kfree(kvm);
5647 return ERR_PTR(-ENOMEM);
5648 }
5649
4795 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages); 5650 INIT_LIST_HEAD(&kvm->arch.active_mmu_pages);
4796 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head); 5651 INIT_LIST_HEAD(&kvm->arch.assigned_dev_head);
4797 5652
@@ -4848,16 +5703,18 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
4848 put_page(kvm->arch.apic_access_page); 5703 put_page(kvm->arch.apic_access_page);
4849 if (kvm->arch.ept_identity_pagetable) 5704 if (kvm->arch.ept_identity_pagetable)
4850 put_page(kvm->arch.ept_identity_pagetable); 5705 put_page(kvm->arch.ept_identity_pagetable);
5706 cleanup_srcu_struct(&kvm->srcu);
5707 kfree(kvm->arch.aliases);
4851 kfree(kvm); 5708 kfree(kvm);
4852} 5709}
4853 5710
4854int kvm_arch_set_memory_region(struct kvm *kvm, 5711int kvm_arch_prepare_memory_region(struct kvm *kvm,
4855 struct kvm_userspace_memory_region *mem, 5712 struct kvm_memory_slot *memslot,
4856 struct kvm_memory_slot old, 5713 struct kvm_memory_slot old,
5714 struct kvm_userspace_memory_region *mem,
4857 int user_alloc) 5715 int user_alloc)
4858{ 5716{
4859 int npages = mem->memory_size >> PAGE_SHIFT; 5717 int npages = memslot->npages;
4860 struct kvm_memory_slot *memslot = &kvm->memslots[mem->slot];
4861 5718
4862 /*To keep backward compatibility with older userspace, 5719 /*To keep backward compatibility with older userspace,
4863	 *x86 needs to handle the !user_alloc case. 5720	 *x86 needs to handle the !user_alloc case.
@@ -4877,26 +5734,35 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4877 if (IS_ERR((void *)userspace_addr)) 5734 if (IS_ERR((void *)userspace_addr))
4878 return PTR_ERR((void *)userspace_addr); 5735 return PTR_ERR((void *)userspace_addr);
4879 5736
4880 /* set userspace_addr atomically for kvm_hva_to_rmapp */
4881 spin_lock(&kvm->mmu_lock);
4882 memslot->userspace_addr = userspace_addr; 5737 memslot->userspace_addr = userspace_addr;
4883 spin_unlock(&kvm->mmu_lock);
4884 } else {
4885 if (!old.user_alloc && old.rmap) {
4886 int ret;
4887
4888 down_write(&current->mm->mmap_sem);
4889 ret = do_munmap(current->mm, old.userspace_addr,
4890 old.npages * PAGE_SIZE);
4891 up_write(&current->mm->mmap_sem);
4892 if (ret < 0)
4893 printk(KERN_WARNING
4894 "kvm_vm_ioctl_set_memory_region: "
4895 "failed to munmap memory\n");
4896 }
4897 } 5738 }
4898 } 5739 }
4899 5740
5741
5742 return 0;
5743}
5744
5745void kvm_arch_commit_memory_region(struct kvm *kvm,
5746 struct kvm_userspace_memory_region *mem,
5747 struct kvm_memory_slot old,
5748 int user_alloc)
5749{
5750
5751 int npages = mem->memory_size >> PAGE_SHIFT;
5752
5753 if (!user_alloc && !old.user_alloc && old.rmap && !npages) {
5754 int ret;
5755
5756 down_write(&current->mm->mmap_sem);
5757 ret = do_munmap(current->mm, old.userspace_addr,
5758 old.npages * PAGE_SIZE);
5759 up_write(&current->mm->mmap_sem);
5760 if (ret < 0)
5761 printk(KERN_WARNING
5762 "kvm_vm_ioctl_set_memory_region: "
5763 "failed to munmap memory\n");
5764 }
5765
4900 spin_lock(&kvm->mmu_lock); 5766 spin_lock(&kvm->mmu_lock);
4901 if (!kvm->arch.n_requested_mmu_pages) { 5767 if (!kvm->arch.n_requested_mmu_pages) {
4902 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm); 5768 unsigned int nr_mmu_pages = kvm_mmu_calculate_mmu_pages(kvm);
@@ -4905,8 +5771,6 @@ int kvm_arch_set_memory_region(struct kvm *kvm,
4905 5771
4906 kvm_mmu_slot_remove_write_access(kvm, mem->slot); 5772 kvm_mmu_slot_remove_write_access(kvm, mem->slot);
4907 spin_unlock(&kvm->mmu_lock); 5773 spin_unlock(&kvm->mmu_lock);
4908
4909 return 0;
4910} 5774}
4911 5775
4912void kvm_arch_flush_shadow(struct kvm *kvm) 5776void kvm_arch_flush_shadow(struct kvm *kvm)
@@ -4946,8 +5810,36 @@ int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
4946 return kvm_x86_ops->interrupt_allowed(vcpu); 5810 return kvm_x86_ops->interrupt_allowed(vcpu);
4947} 5811}
4948 5812
5813unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
5814{
5815 unsigned long rflags;
5816
5817 rflags = kvm_x86_ops->get_rflags(vcpu);
5818 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
5819 rflags &= ~(unsigned long)(X86_EFLAGS_TF | X86_EFLAGS_RF);
5820 return rflags;
5821}
5822EXPORT_SYMBOL_GPL(kvm_get_rflags);
5823
5824void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
5825{
5826 if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
5827 vcpu->arch.singlestep_cs ==
5828 get_segment_selector(vcpu, VCPU_SREG_CS) &&
5829 vcpu->arch.singlestep_rip == kvm_rip_read(vcpu))
5830 rflags |= X86_EFLAGS_TF | X86_EFLAGS_RF;
5831 kvm_x86_ops->set_rflags(vcpu, rflags);
5832}
5833EXPORT_SYMBOL_GPL(kvm_set_rflags);
5834
4949EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit); 5835EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
4950EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq); 5836EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
4951EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault); 5837EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
4952EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr); 5838EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
4953EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr); 5839EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
5840EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmrun);
5841EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
5842EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
5843EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
5844EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
5845EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
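
A note on the new kvm_get_rflags()/kvm_set_rflags() wrappers above: they hide and re-apply the TF/RF bits that KVM injects while userspace single-steps the guest, so callers should go through them rather than calling kvm_x86_ops->get_rflags()/set_rflags() directly. A hypothetical caller (not from the patch, written as if it were in x86.c) would read:

/* Hypothetical caller: modify a flag without disturbing the TF/RF bits
 * that guest single-stepping may have set behind the guest's back. */
static void example_set_if(struct kvm_vcpu *vcpu)
{
	unsigned long rflags = kvm_get_rflags(vcpu);	/* TF/RF already masked out */

	rflags |= X86_EFLAGS_IF;
	kvm_set_rflags(vcpu, rflags);	/* re-adds TF/RF if still single-stepping */
}
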
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index 5eadea585d2a..2d101639bd8d 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -2,6 +2,7 @@
2#define ARCH_X86_KVM_X86_H 2#define ARCH_X86_KVM_X86_H
3 3
4#include <linux/kvm_host.h> 4#include <linux/kvm_host.h>
5#include "kvm_cache_regs.h"
5 6
6static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu) 7static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
7{ 8{
@@ -35,4 +36,33 @@ static inline bool kvm_exception_is_soft(unsigned int nr)
35struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu, 36struct kvm_cpuid_entry2 *kvm_find_cpuid_entry(struct kvm_vcpu *vcpu,
36 u32 function, u32 index); 37 u32 function, u32 index);
37 38
39static inline bool is_protmode(struct kvm_vcpu *vcpu)
40{
41 return kvm_read_cr0_bits(vcpu, X86_CR0_PE);
42}
43
44static inline int is_long_mode(struct kvm_vcpu *vcpu)
45{
46#ifdef CONFIG_X86_64
47 return vcpu->arch.efer & EFER_LMA;
48#else
49 return 0;
50#endif
51}
52
53static inline int is_pae(struct kvm_vcpu *vcpu)
54{
55 return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);
56}
57
58static inline int is_pse(struct kvm_vcpu *vcpu)
59{
60 return kvm_read_cr4_bits(vcpu, X86_CR4_PSE);
61}
62
63static inline int is_paging(struct kvm_vcpu *vcpu)
64{
65 return kvm_read_cr0_bits(vcpu, X86_CR0_PG);
66}
67
38#endif 68#endif
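
The new x86.h helpers centralize the CR0/CR4/EFER mode checks on top of the register cache pulled in via kvm_cache_regs.h. A hypothetical user (not part of the patch, written as if it were KVM code) reads naturally as a set of predicates:

/* Hypothetical helper showing the intended reading of the new predicates. */
static int example_paging_levels(struct kvm_vcpu *vcpu)
{
	if (!is_paging(vcpu))
		return 0;			/* paging disabled */
	if (is_long_mode(vcpu))
		return 4;			/* 64-bit: 4-level page tables */
	return is_pae(vcpu) ? 3 : 2;		/* PAE vs. classic 32-bit paging */
}
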
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index 7e59dc1d3fc2..2bdf628066bd 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -115,7 +115,7 @@ static void async_hcall(unsigned long call, unsigned long arg1,
115 local_irq_save(flags); 115 local_irq_save(flags);
116 if (lguest_data.hcall_status[next_call] != 0xFF) { 116 if (lguest_data.hcall_status[next_call] != 0xFF) {
117 /* Table full, so do normal hcall which will flush table. */ 117 /* Table full, so do normal hcall which will flush table. */
118 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 118 hcall(call, arg1, arg2, arg3, arg4);
119 } else { 119 } else {
120 lguest_data.hcalls[next_call].arg0 = call; 120 lguest_data.hcalls[next_call].arg0 = call;
121 lguest_data.hcalls[next_call].arg1 = arg1; 121 lguest_data.hcalls[next_call].arg1 = arg1;
@@ -145,46 +145,45 @@ static void async_hcall(unsigned long call, unsigned long arg1,
145 * So, when we're in lazy mode, we call async_hcall() to store the call for 145 * So, when we're in lazy mode, we call async_hcall() to store the call for
146 * future processing: 146 * future processing:
147 */ 147 */
148static void lazy_hcall1(unsigned long call, 148static void lazy_hcall1(unsigned long call, unsigned long arg1)
149 unsigned long arg1)
150{ 149{
151 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 150 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
152 kvm_hypercall1(call, arg1); 151 hcall(call, arg1, 0, 0, 0);
153 else 152 else
154 async_hcall(call, arg1, 0, 0, 0); 153 async_hcall(call, arg1, 0, 0, 0);
155} 154}
156 155
157/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ 156/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/
158static void lazy_hcall2(unsigned long call, 157static void lazy_hcall2(unsigned long call,
159 unsigned long arg1, 158 unsigned long arg1,
160 unsigned long arg2) 159 unsigned long arg2)
161{ 160{
162 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 161 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
163 kvm_hypercall2(call, arg1, arg2); 162 hcall(call, arg1, arg2, 0, 0);
164 else 163 else
165 async_hcall(call, arg1, arg2, 0, 0); 164 async_hcall(call, arg1, arg2, 0, 0);
166} 165}
167 166
168static void lazy_hcall3(unsigned long call, 167static void lazy_hcall3(unsigned long call,
169 unsigned long arg1, 168 unsigned long arg1,
170 unsigned long arg2, 169 unsigned long arg2,
171 unsigned long arg3) 170 unsigned long arg3)
172{ 171{
173 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 172 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
174 kvm_hypercall3(call, arg1, arg2, arg3); 173 hcall(call, arg1, arg2, arg3, 0);
175 else 174 else
176 async_hcall(call, arg1, arg2, arg3, 0); 175 async_hcall(call, arg1, arg2, arg3, 0);
177} 176}
178 177
179#ifdef CONFIG_X86_PAE 178#ifdef CONFIG_X86_PAE
180static void lazy_hcall4(unsigned long call, 179static void lazy_hcall4(unsigned long call,
181 unsigned long arg1, 180 unsigned long arg1,
182 unsigned long arg2, 181 unsigned long arg2,
183 unsigned long arg3, 182 unsigned long arg3,
184 unsigned long arg4) 183 unsigned long arg4)
185{ 184{
186 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE) 185 if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_NONE)
187 kvm_hypercall4(call, arg1, arg2, arg3, arg4); 186 hcall(call, arg1, arg2, arg3, arg4);
188 else 187 else
189 async_hcall(call, arg1, arg2, arg3, arg4); 188 async_hcall(call, arg1, arg2, arg3, arg4);
190} 189}
@@ -196,13 +195,13 @@ static void lazy_hcall4(unsigned long call,
196:*/ 195:*/
197static void lguest_leave_lazy_mmu_mode(void) 196static void lguest_leave_lazy_mmu_mode(void)
198{ 197{
199 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 198 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
200 paravirt_leave_lazy_mmu(); 199 paravirt_leave_lazy_mmu();
201} 200}
202 201
203static void lguest_end_context_switch(struct task_struct *next) 202static void lguest_end_context_switch(struct task_struct *next)
204{ 203{
205 kvm_hypercall0(LHCALL_FLUSH_ASYNC); 204 hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0, 0);
206 paravirt_end_context_switch(next); 205 paravirt_end_context_switch(next);
207} 206}
208 207
@@ -286,7 +285,7 @@ static void lguest_write_idt_entry(gate_desc *dt,
286 /* Keep the local copy up to date. */ 285 /* Keep the local copy up to date. */
287 native_write_idt_entry(dt, entrynum, g); 286 native_write_idt_entry(dt, entrynum, g);
288 /* Tell Host about this new entry. */ 287 /* Tell Host about this new entry. */
289 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); 288 hcall(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1], 0);
290} 289}
291 290
292/* 291/*
@@ -300,7 +299,7 @@ static void lguest_load_idt(const struct desc_ptr *desc)
300 struct desc_struct *idt = (void *)desc->address; 299 struct desc_struct *idt = (void *)desc->address;
301 300
302 for (i = 0; i < (desc->size+1)/8; i++) 301 for (i = 0; i < (desc->size+1)/8; i++)
303 kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b); 302 hcall(LHCALL_LOAD_IDT_ENTRY, i, idt[i].a, idt[i].b, 0);
304} 303}
305 304
306/* 305/*
@@ -321,7 +320,7 @@ static void lguest_load_gdt(const struct desc_ptr *desc)
321 struct desc_struct *gdt = (void *)desc->address; 320 struct desc_struct *gdt = (void *)desc->address;
322 321
323 for (i = 0; i < (desc->size+1)/8; i++) 322 for (i = 0; i < (desc->size+1)/8; i++)
324 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); 323 hcall(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b, 0);
325} 324}
326 325
327/* 326/*
@@ -334,8 +333,8 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum,
334{ 333{
335 native_write_gdt_entry(dt, entrynum, desc, type); 334 native_write_gdt_entry(dt, entrynum, desc, type);
336 /* Tell Host about this new entry. */ 335 /* Tell Host about this new entry. */
337 kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, entrynum, 336 hcall(LHCALL_LOAD_GDT_ENTRY, entrynum,
338 dt[entrynum].a, dt[entrynum].b); 337 dt[entrynum].a, dt[entrynum].b, 0);
339} 338}
340 339
341/* 340/*
@@ -931,7 +930,7 @@ static int lguest_clockevent_set_next_event(unsigned long delta,
931 } 930 }
932 931
933 /* Please wake us this far in the future. */ 932 /* Please wake us this far in the future. */
934 kvm_hypercall1(LHCALL_SET_CLOCKEVENT, delta); 933 hcall(LHCALL_SET_CLOCKEVENT, delta, 0, 0, 0);
935 return 0; 934 return 0;
936} 935}
937 936
@@ -942,7 +941,7 @@ static void lguest_clockevent_set_mode(enum clock_event_mode mode,
942 case CLOCK_EVT_MODE_UNUSED: 941 case CLOCK_EVT_MODE_UNUSED:
943 case CLOCK_EVT_MODE_SHUTDOWN: 942 case CLOCK_EVT_MODE_SHUTDOWN:
944 /* A 0 argument shuts the clock down. */ 943 /* A 0 argument shuts the clock down. */
945 kvm_hypercall0(LHCALL_SET_CLOCKEVENT); 944 hcall(LHCALL_SET_CLOCKEVENT, 0, 0, 0, 0);
946 break; 945 break;
947 case CLOCK_EVT_MODE_ONESHOT: 946 case CLOCK_EVT_MODE_ONESHOT:
948 /* This is what we expect. */ 947 /* This is what we expect. */
@@ -1100,7 +1099,7 @@ static void set_lguest_basic_apic_ops(void)
1100/* STOP! Until an interrupt comes in. */ 1099/* STOP! Until an interrupt comes in. */
1101static void lguest_safe_halt(void) 1100static void lguest_safe_halt(void)
1102{ 1101{
1103 kvm_hypercall0(LHCALL_HALT); 1102 hcall(LHCALL_HALT, 0, 0, 0, 0);
1104} 1103}
1105 1104
1106/* 1105/*
@@ -1112,8 +1111,8 @@ static void lguest_safe_halt(void)
1112 */ 1111 */
1113static void lguest_power_off(void) 1112static void lguest_power_off(void)
1114{ 1113{
1115 kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), 1114 hcall(LHCALL_SHUTDOWN, __pa("Power down"),
1116 LGUEST_SHUTDOWN_POWEROFF); 1115 LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1117} 1116}
1118 1117
1119/* 1118/*
@@ -1123,7 +1122,7 @@ static void lguest_power_off(void)
1123 */ 1122 */
1124static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p) 1123static int lguest_panic(struct notifier_block *nb, unsigned long l, void *p)
1125{ 1124{
1126 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF); 1125 hcall(LHCALL_SHUTDOWN, __pa(p), LGUEST_SHUTDOWN_POWEROFF, 0, 0);
1127 /* The hcall won't return, but to keep gcc happy, we're "done". */ 1126 /* The hcall won't return, but to keep gcc happy, we're "done". */
1128 return NOTIFY_DONE; 1127 return NOTIFY_DONE;
1129} 1128}
@@ -1162,7 +1161,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1162 len = sizeof(scratch) - 1; 1161 len = sizeof(scratch) - 1;
1163 scratch[len] = '\0'; 1162 scratch[len] = '\0';
1164 memcpy(scratch, buf, len); 1163 memcpy(scratch, buf, len);
1165 kvm_hypercall1(LHCALL_NOTIFY, __pa(scratch)); 1164 hcall(LHCALL_NOTIFY, __pa(scratch), 0, 0, 0);
1166 1165
1167 /* This routine returns the number of bytes actually written. */ 1166 /* This routine returns the number of bytes actually written. */
1168 return len; 1167 return len;
@@ -1174,7 +1173,7 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count)
1174 */ 1173 */
1175static void lguest_restart(char *reason) 1174static void lguest_restart(char *reason)
1176{ 1175{
1177 kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); 1176 hcall(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART, 0, 0);
1178} 1177}
1179 1178
1180/*G:050 1179/*G:050
diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S
index 27eac0faee48..4f420c2f2d55 100644
--- a/arch/x86/lguest/i386_head.S
+++ b/arch/x86/lguest/i386_head.S
@@ -32,7 +32,7 @@ ENTRY(lguest_entry)
32 */ 32 */
33 movl $LHCALL_LGUEST_INIT, %eax 33 movl $LHCALL_LGUEST_INIT, %eax
34 movl $lguest_data - __PAGE_OFFSET, %ebx 34 movl $lguest_data - __PAGE_OFFSET, %ebx
35 .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ 35 int $LGUEST_TRAP_ENTRY
36 36
37 /* Set up the initial stack so we can run C code. */ 37 /* Set up the initial stack so we can run C code. */
38 movl $(init_thread_union+THREAD_SIZE),%esp 38 movl $(init_thread_union+THREAD_SIZE),%esp
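
Both the lguest C code and the boot stub above now enter the Host through the guest's own trap gate instead of the KVM hypercall instruction. The hcall() helper they call is presumably a thin inline wrapper around that software interrupt, along the lines of the sketch below (a guess at its shape; the header names and the exact argument-register assignments beyond %eax/%ebx are assumptions, not the actual lguest header contents):

#include <linux/stringify.h>
#include <asm/lguest_hcall.h>	/* assumed location of LGUEST_TRAP_ENTRY */

/* Sketch of an hcall() wrapper over "int $LGUEST_TRAP_ENTRY"; %eax
 * carries the call number and the return value, as in lguest_entry. */
static inline unsigned long example_hcall(unsigned long call,
					  unsigned long arg1,
					  unsigned long arg2,
					  unsigned long arg3,
					  unsigned long arg4)
{
	asm volatile("int $" __stringify(LGUEST_TRAP_ENTRY)
		     : "=a"(call)
		     : "a"(call), "b"(arg1), "c"(arg2), "d"(arg3), "S"(arg4)
		     : "memory");
	return call;
}
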
diff --git a/arch/x86/lib/.gitignore b/arch/x86/lib/.gitignore
new file mode 100644
index 000000000000..8df89f0a3fe6
--- /dev/null
+++ b/arch/x86/lib/.gitignore
@@ -0,0 +1 @@
inat-tables.c
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 85f5db95c60f..419386c24b82 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -2,14 +2,27 @@
2# Makefile for x86 specific library files. 2# Makefile for x86 specific library files.
3# 3#
4 4
5obj-$(CONFIG_SMP) := msr.o 5inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk
6inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt
7quiet_cmd_inat_tables = GEN $@
8 cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@
9
10$(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps)
11 $(call cmd,inat_tables)
12
13$(obj)/inat.o: $(obj)/inat-tables.c
14
15clean-files := inat-tables.c
16
17obj-$(CONFIG_SMP) += msr-smp.o cache-smp.o
6 18
7lib-y := delay.o 19lib-y := delay.o
8lib-y += thunk_$(BITS).o 20lib-y += thunk_$(BITS).o
9lib-y += usercopy_$(BITS).o getuser.o putuser.o 21lib-y += usercopy_$(BITS).o getuser.o putuser.o
10lib-y += memcpy_$(BITS).o 22lib-y += memcpy_$(BITS).o
23lib-$(CONFIG_KPROBES) += insn.o inat.o
11 24
12obj-y += msr-reg.o msr-reg-export.o 25obj-y += msr.o msr-reg.o msr-reg-export.o
13 26
14ifeq ($(CONFIG_X86_32),y) 27ifeq ($(CONFIG_X86_32),y)
15 obj-y += atomic64_32.o 28 obj-y += atomic64_32.o
@@ -21,9 +34,10 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
21endif 34endif
22 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o 35 lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
23else 36else
24 obj-y += io_64.o iomap_copy_64.o 37 obj-y += iomap_copy_64.o
25 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o 38 lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
26 lib-y += thunk_64.o clear_page_64.o copy_page_64.o 39 lib-y += thunk_64.o clear_page_64.o copy_page_64.o
27 lib-y += memmove_64.o memset_64.o 40 lib-y += memmove_64.o memset_64.o
28 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o 41 lib-y += copy_user_64.o rwlock_64.o copy_user_nocache_64.o
42 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem_64.o
29endif 43endif
diff --git a/arch/x86/lib/cache-smp.c b/arch/x86/lib/cache-smp.c
new file mode 100644
index 000000000000..a3c668875038
--- /dev/null
+++ b/arch/x86/lib/cache-smp.c
@@ -0,0 +1,19 @@
1#include <linux/smp.h>
2#include <linux/module.h>
3
4static void __wbinvd(void *dummy)
5{
6 wbinvd();
7}
8
9void wbinvd_on_cpu(int cpu)
10{
11 smp_call_function_single(cpu, __wbinvd, NULL, 1);
12}
13EXPORT_SYMBOL(wbinvd_on_cpu);
14
15int wbinvd_on_all_cpus(void)
16{
17 return on_each_cpu(__wbinvd, NULL, 1);
18}
19EXPORT_SYMBOL(wbinvd_on_all_cpus);
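
The new cache-smp.c helpers simply wrap WBINVD in a cross-CPU function call. An illustrative caller (hypothetical, not from the patch):

/* Hypothetical caller: flush caches on one CPU, then on all CPUs,
 * e.g. before changing memory attributes. */
static void example_flush_caches(int cpu)
{
	wbinvd_on_cpu(cpu);		/* IPI that CPU and run WBINVD there */

	if (wbinvd_on_all_cpus())	/* propagates on_each_cpu()'s return value */
		pr_warning("example: wbinvd_on_all_cpus failed\n");
}
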
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index 6ba0f7bb85ea..71100c98e337 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -65,7 +65,7 @@
65 .endm 65 .endm
66 66
67/* Standard copy_to_user with segment limit checking */ 67/* Standard copy_to_user with segment limit checking */
68ENTRY(copy_to_user) 68ENTRY(_copy_to_user)
69 CFI_STARTPROC 69 CFI_STARTPROC
70 GET_THREAD_INFO(%rax) 70 GET_THREAD_INFO(%rax)
71 movq %rdi,%rcx 71 movq %rdi,%rcx
@@ -75,10 +75,10 @@ ENTRY(copy_to_user)
75 jae bad_to_user 75 jae bad_to_user
76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 76 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
77 CFI_ENDPROC 77 CFI_ENDPROC
78ENDPROC(copy_to_user) 78ENDPROC(_copy_to_user)
79 79
80/* Standard copy_from_user with segment limit checking */ 80/* Standard copy_from_user with segment limit checking */
81ENTRY(copy_from_user) 81ENTRY(_copy_from_user)
82 CFI_STARTPROC 82 CFI_STARTPROC
83 GET_THREAD_INFO(%rax) 83 GET_THREAD_INFO(%rax)
84 movq %rsi,%rcx 84 movq %rsi,%rcx
@@ -88,19 +88,7 @@ ENTRY(copy_from_user)
88 jae bad_from_user 88 jae bad_from_user
89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string 89 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
90 CFI_ENDPROC 90 CFI_ENDPROC
91ENDPROC(copy_from_user) 91ENDPROC(_copy_from_user)
92
93ENTRY(copy_user_generic)
94 CFI_STARTPROC
95 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
96 CFI_ENDPROC
97ENDPROC(copy_user_generic)
98
99ENTRY(__copy_from_user_inatomic)
100 CFI_STARTPROC
101 ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,copy_user_generic_unrolled,copy_user_generic_string
102 CFI_ENDPROC
103ENDPROC(__copy_from_user_inatomic)
104 92
105 .section .fixup,"ax" 93 .section .fixup,"ax"
106 /* must zero dest */ 94 /* must zero dest */
diff --git a/arch/x86/lib/inat.c b/arch/x86/lib/inat.c
new file mode 100644
index 000000000000..46fc4ee09fc4
--- /dev/null
+++ b/arch/x86/lib/inat.c
@@ -0,0 +1,90 @@
1/*
2 * x86 instruction attribute tables
3 *
4 * Written by Masami Hiramatsu <mhiramat@redhat.com>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
19 *
20 */
21#include <asm/insn.h>
22
23/* Attribute tables are generated from opcode map */
24#include "inat-tables.c"
25
26/* Attribute search APIs */
27insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode)
28{
29 return inat_primary_table[opcode];
30}
31
32insn_attr_t inat_get_escape_attribute(insn_byte_t opcode, insn_byte_t last_pfx,
33 insn_attr_t esc_attr)
34{
35 const insn_attr_t *table;
36 insn_attr_t lpfx_attr;
37 int n, m = 0;
38
39 n = inat_escape_id(esc_attr);
40 if (last_pfx) {
41 lpfx_attr = inat_get_opcode_attribute(last_pfx);
42 m = inat_last_prefix_id(lpfx_attr);
43 }
44 table = inat_escape_tables[n][0];
45 if (!table)
46 return 0;
47 if (inat_has_variant(table[opcode]) && m) {
48 table = inat_escape_tables[n][m];
49 if (!table)
50 return 0;
51 }
52 return table[opcode];
53}
54
55insn_attr_t inat_get_group_attribute(insn_byte_t modrm, insn_byte_t last_pfx,
56 insn_attr_t grp_attr)
57{
58 const insn_attr_t *table;
59 insn_attr_t lpfx_attr;
60 int n, m = 0;
61
62 n = inat_group_id(grp_attr);
63 if (last_pfx) {
64 lpfx_attr = inat_get_opcode_attribute(last_pfx);
65 m = inat_last_prefix_id(lpfx_attr);
66 }
67 table = inat_group_tables[n][0];
68 if (!table)
69 return inat_group_common_attribute(grp_attr);
70 if (inat_has_variant(table[X86_MODRM_REG(modrm)]) && m) {
71 table = inat_group_tables[n][m];
72 if (!table)
73 return inat_group_common_attribute(grp_attr);
74 }
75 return table[X86_MODRM_REG(modrm)] |
76 inat_group_common_attribute(grp_attr);
77}
78
79insn_attr_t inat_get_avx_attribute(insn_byte_t opcode, insn_byte_t vex_m,
80 insn_byte_t vex_p)
81{
82 const insn_attr_t *table;
83 if (vex_m > X86_VEX_M_MAX || vex_p > INAT_LSTPFX_MAX)
84 return 0;
85 table = inat_avx_tables[vex_m][vex_p];
86 if (!table)
87 return 0;
88 return table[opcode];
89}
90
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
new file mode 100644
index 000000000000..9f33b984d0ef
--- /dev/null
+++ b/arch/x86/lib/insn.c
@@ -0,0 +1,516 @@
1/*
2 * x86 instruction analysis
3 *
4 * This program is free software; you can redistribute it and/or modify
5 * it under the terms of the GNU General Public License as published by
6 * the Free Software Foundation; either version 2 of the License, or
7 * (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write to the Free Software
16 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
17 *
18 * Copyright (C) IBM Corporation, 2002, 2004, 2009
19 */
20
21#include <linux/string.h>
22#include <asm/inat.h>
23#include <asm/insn.h>
24
25#define get_next(t, insn) \
26 ({t r; r = *(t*)insn->next_byte; insn->next_byte += sizeof(t); r; })
27
28#define peek_next(t, insn) \
29 ({t r; r = *(t*)insn->next_byte; r; })
30
31#define peek_nbyte_next(t, insn, n) \
32 ({t r; r = *(t*)((insn)->next_byte + n); r; })
33
34/**
35 * insn_init() - initialize struct insn
36 * @insn: &struct insn to be initialized
37 * @kaddr: address (in kernel memory) of instruction (or copy thereof)
38 * @x86_64: !0 for 64-bit kernel or 64-bit app
39 */
40void insn_init(struct insn *insn, const void *kaddr, int x86_64)
41{
42 memset(insn, 0, sizeof(*insn));
43 insn->kaddr = kaddr;
44 insn->next_byte = kaddr;
45 insn->x86_64 = x86_64 ? 1 : 0;
46 insn->opnd_bytes = 4;
47 if (x86_64)
48 insn->addr_bytes = 8;
49 else
50 insn->addr_bytes = 4;
51}
52
53/**
54 * insn_get_prefixes - scan x86 instruction prefix bytes
55 * @insn: &struct insn containing instruction
56 *
57 * Populates the @insn->prefixes bitmap, and updates @insn->next_byte
58 * to point to the (first) opcode. No effect if @insn->prefixes.got
59 * is already set.
60 */
61void insn_get_prefixes(struct insn *insn)
62{
63 struct insn_field *prefixes = &insn->prefixes;
64 insn_attr_t attr;
65 insn_byte_t b, lb;
66 int i, nb;
67
68 if (prefixes->got)
69 return;
70
71 nb = 0;
72 lb = 0;
73 b = peek_next(insn_byte_t, insn);
74 attr = inat_get_opcode_attribute(b);
75 while (inat_is_legacy_prefix(attr)) {
76 /* Skip if same prefix */
77 for (i = 0; i < nb; i++)
78 if (prefixes->bytes[i] == b)
79 goto found;
80 if (nb == 4)
81 /* Invalid instruction */
82 break;
83 prefixes->bytes[nb++] = b;
84 if (inat_is_address_size_prefix(attr)) {
85 /* address size switches 2/4 or 4/8 */
86 if (insn->x86_64)
87 insn->addr_bytes ^= 12;
88 else
89 insn->addr_bytes ^= 6;
90 } else if (inat_is_operand_size_prefix(attr)) {
91 /* operand size switches 2/4 */
92 insn->opnd_bytes ^= 6;
93 }
94found:
95 prefixes->nbytes++;
96 insn->next_byte++;
97 lb = b;
98 b = peek_next(insn_byte_t, insn);
99 attr = inat_get_opcode_attribute(b);
100 }
101 /* Set the last prefix */
102 if (lb && lb != insn->prefixes.bytes[3]) {
103 if (unlikely(insn->prefixes.bytes[3])) {
104 /* Swap the last prefix */
105 b = insn->prefixes.bytes[3];
106 for (i = 0; i < nb; i++)
107 if (prefixes->bytes[i] == lb)
108 prefixes->bytes[i] = b;
109 }
110 insn->prefixes.bytes[3] = lb;
111 }
112
113 /* Decode REX prefix */
114 if (insn->x86_64) {
115 b = peek_next(insn_byte_t, insn);
116 attr = inat_get_opcode_attribute(b);
117 if (inat_is_rex_prefix(attr)) {
118 insn->rex_prefix.value = b;
119 insn->rex_prefix.nbytes = 1;
120 insn->next_byte++;
121 if (X86_REX_W(b))
122 /* REX.W overrides opnd_size */
123 insn->opnd_bytes = 8;
124 }
125 }
126 insn->rex_prefix.got = 1;
127
128 /* Decode VEX prefix */
129 b = peek_next(insn_byte_t, insn);
130 attr = inat_get_opcode_attribute(b);
131 if (inat_is_vex_prefix(attr)) {
132 insn_byte_t b2 = peek_nbyte_next(insn_byte_t, insn, 1);
133 if (!insn->x86_64) {
134 /*
135 * In 32-bit mode, if the [7:6] bits (mod bits of
136 * ModRM) on the second byte are not 11b, it is
137 * LDS or LES.
138 */
139 if (X86_MODRM_MOD(b2) != 3)
140 goto vex_end;
141 }
142 insn->vex_prefix.bytes[0] = b;
143 insn->vex_prefix.bytes[1] = b2;
144 if (inat_is_vex3_prefix(attr)) {
145 b2 = peek_nbyte_next(insn_byte_t, insn, 2);
146 insn->vex_prefix.bytes[2] = b2;
147 insn->vex_prefix.nbytes = 3;
148 insn->next_byte += 3;
149 if (insn->x86_64 && X86_VEX_W(b2))
150 /* VEX.W overrides opnd_size */
151 insn->opnd_bytes = 8;
152 } else {
153 insn->vex_prefix.nbytes = 2;
154 insn->next_byte += 2;
155 }
156 }
157vex_end:
158 insn->vex_prefix.got = 1;
159
160 prefixes->got = 1;
161 return;
162}
163
164/**
165 * insn_get_opcode - collect opcode(s)
166 * @insn: &struct insn containing instruction
167 *
168 * Populates @insn->opcode, updates @insn->next_byte to point past the
169 * opcode byte(s), and sets @insn->attr (except for groups).
170 * If necessary, first collects any preceding (prefix) bytes.
171 * Sets @insn->opcode.value = opcode1. No effect if @insn->opcode.got
172 * is already 1.
173 */
174void insn_get_opcode(struct insn *insn)
175{
176 struct insn_field *opcode = &insn->opcode;
177 insn_byte_t op, pfx;
178 if (opcode->got)
179 return;
180 if (!insn->prefixes.got)
181 insn_get_prefixes(insn);
182
183 /* Get first opcode */
184 op = get_next(insn_byte_t, insn);
185 opcode->bytes[0] = op;
186 opcode->nbytes = 1;
187
188 /* Check if there is VEX prefix or not */
189 if (insn_is_avx(insn)) {
190 insn_byte_t m, p;
191 m = insn_vex_m_bits(insn);
192 p = insn_vex_p_bits(insn);
193 insn->attr = inat_get_avx_attribute(op, m, p);
194 if (!inat_accept_vex(insn->attr))
195 insn->attr = 0; /* This instruction is bad */
196 goto end; /* VEX has only 1 byte for opcode */
197 }
198
199 insn->attr = inat_get_opcode_attribute(op);
200 while (inat_is_escape(insn->attr)) {
201 /* Get escaped opcode */
202 op = get_next(insn_byte_t, insn);
203 opcode->bytes[opcode->nbytes++] = op;
204 pfx = insn_last_prefix(insn);
205 insn->attr = inat_get_escape_attribute(op, pfx, insn->attr);
206 }
207 if (inat_must_vex(insn->attr))
208 insn->attr = 0; /* This instruction is bad */
209end:
210 opcode->got = 1;
211}
212
213/**
214 * insn_get_modrm - collect ModRM byte, if any
215 * @insn: &struct insn containing instruction
216 *
217 * Populates @insn->modrm and updates @insn->next_byte to point past the
218 * ModRM byte, if any. If necessary, first collects the preceding bytes
219 * (prefixes and opcode(s)). No effect if @insn->modrm.got is already 1.
220 */
221void insn_get_modrm(struct insn *insn)
222{
223 struct insn_field *modrm = &insn->modrm;
224 insn_byte_t pfx, mod;
225 if (modrm->got)
226 return;
227 if (!insn->opcode.got)
228 insn_get_opcode(insn);
229
230 if (inat_has_modrm(insn->attr)) {
231 mod = get_next(insn_byte_t, insn);
232 modrm->value = mod;
233 modrm->nbytes = 1;
234 if (inat_is_group(insn->attr)) {
235 pfx = insn_last_prefix(insn);
236 insn->attr = inat_get_group_attribute(mod, pfx,
237 insn->attr);
238 }
239 }
240
241 if (insn->x86_64 && inat_is_force64(insn->attr))
242 insn->opnd_bytes = 8;
243 modrm->got = 1;
244}
245
246
247/**
248 * insn_rip_relative() - Does instruction use RIP-relative addressing mode?
249 * @insn: &struct insn containing instruction
250 *
251 * If necessary, first collects the instruction up to and including the
252 * ModRM byte. No effect if @insn->x86_64 is 0.
253 */
254int insn_rip_relative(struct insn *insn)
255{
256 struct insn_field *modrm = &insn->modrm;
257
258 if (!insn->x86_64)
259 return 0;
260 if (!modrm->got)
261 insn_get_modrm(insn);
262 /*
263 * For rip-relative instructions, the mod field (top 2 bits)
264 * is zero and the r/m field (bottom 3 bits) is 0x5.
265 */
266 return (modrm->nbytes && (modrm->value & 0xc7) == 0x5);
267}
268
269/**
270 * insn_get_sib() - Get the SIB byte of instruction
271 * @insn: &struct insn containing instruction
272 *
273 * If necessary, first collects the instruction up to and including the
274 * ModRM byte.
275 */
276void insn_get_sib(struct insn *insn)
277{
278 insn_byte_t modrm;
279
280 if (insn->sib.got)
281 return;
282 if (!insn->modrm.got)
283 insn_get_modrm(insn);
284 if (insn->modrm.nbytes) {
285 modrm = (insn_byte_t)insn->modrm.value;
286 if (insn->addr_bytes != 2 &&
287 X86_MODRM_MOD(modrm) != 3 && X86_MODRM_RM(modrm) == 4) {
288 insn->sib.value = get_next(insn_byte_t, insn);
289 insn->sib.nbytes = 1;
290 }
291 }
292 insn->sib.got = 1;
293}
294
295
296/**
297 * insn_get_displacement() - Get the displacement of instruction
298 * @insn: &struct insn containing instruction
299 *
300 * If necessary, first collects the instruction up to and including the
301 * SIB byte.
302 * Displacement value is sign-expanded.
303 */
304void insn_get_displacement(struct insn *insn)
305{
306 insn_byte_t mod, rm, base;
307
308 if (insn->displacement.got)
309 return;
310 if (!insn->sib.got)
311 insn_get_sib(insn);
312 if (insn->modrm.nbytes) {
313 /*
314 * Interpreting the modrm byte:
315 * mod = 00 - no displacement fields (exceptions below)
316 * mod = 01 - 1-byte displacement field
317 * mod = 10 - displacement field is 4 bytes, or 2 bytes if
318 * address size = 2 (0x67 prefix in 32-bit mode)
319 * mod = 11 - no memory operand
320 *
321 * If address size = 2...
322 * mod = 00, r/m = 110 - displacement field is 2 bytes
323 *
324 * If address size != 2...
325 * mod != 11, r/m = 100 - SIB byte exists
326 * mod = 00, SIB base = 101 - displacement field is 4 bytes
327 * mod = 00, r/m = 101 - rip-relative addressing, displacement
328 * field is 4 bytes
329 */
330 mod = X86_MODRM_MOD(insn->modrm.value);
331 rm = X86_MODRM_RM(insn->modrm.value);
332 base = X86_SIB_BASE(insn->sib.value);
333 if (mod == 3)
334 goto out;
335 if (mod == 1) {
336 insn->displacement.value = get_next(char, insn);
337 insn->displacement.nbytes = 1;
338 } else if (insn->addr_bytes == 2) {
339 if ((mod == 0 && rm == 6) || mod == 2) {
340 insn->displacement.value =
341 get_next(short, insn);
342 insn->displacement.nbytes = 2;
343 }
344 } else {
345 if ((mod == 0 && rm == 5) || mod == 2 ||
346 (mod == 0 && base == 5)) {
347 insn->displacement.value = get_next(int, insn);
348 insn->displacement.nbytes = 4;
349 }
350 }
351 }
352out:
353 insn->displacement.got = 1;
354}
355
356/* Decode moffset16/32/64 */
357static void __get_moffset(struct insn *insn)
358{
359 switch (insn->addr_bytes) {
360 case 2:
361 insn->moffset1.value = get_next(short, insn);
362 insn->moffset1.nbytes = 2;
363 break;
364 case 4:
365 insn->moffset1.value = get_next(int, insn);
366 insn->moffset1.nbytes = 4;
367 break;
368 case 8:
369 insn->moffset1.value = get_next(int, insn);
370 insn->moffset1.nbytes = 4;
371 insn->moffset2.value = get_next(int, insn);
372 insn->moffset2.nbytes = 4;
373 break;
374 }
375 insn->moffset1.got = insn->moffset2.got = 1;
376}
377
378/* Decode imm v32(Iz) */
379static void __get_immv32(struct insn *insn)
380{
381 switch (insn->opnd_bytes) {
382 case 2:
383 insn->immediate.value = get_next(short, insn);
384 insn->immediate.nbytes = 2;
385 break;
386 case 4:
387 case 8:
388 insn->immediate.value = get_next(int, insn);
389 insn->immediate.nbytes = 4;
390 break;
391 }
392}
393
394/* Decode imm v64(Iv/Ov) */
395static void __get_immv(struct insn *insn)
396{
397 switch (insn->opnd_bytes) {
398 case 2:
399 insn->immediate1.value = get_next(short, insn);
400 insn->immediate1.nbytes = 2;
401 break;
402 case 4:
403 insn->immediate1.value = get_next(int, insn);
404 insn->immediate1.nbytes = 4;
405 break;
406 case 8:
407 insn->immediate1.value = get_next(int, insn);
408 insn->immediate1.nbytes = 4;
409 insn->immediate2.value = get_next(int, insn);
410 insn->immediate2.nbytes = 4;
411 break;
412 }
413 insn->immediate1.got = insn->immediate2.got = 1;
414}
415
416/* Decode ptr16:16/32(Ap) */
417static void __get_immptr(struct insn *insn)
418{
419 switch (insn->opnd_bytes) {
420 case 2:
421 insn->immediate1.value = get_next(short, insn);
422 insn->immediate1.nbytes = 2;
423 break;
424 case 4:
425 insn->immediate1.value = get_next(int, insn);
426 insn->immediate1.nbytes = 4;
427 break;
428 case 8:
429 /* ptr16:64 does not exist (no segment) */
430 return;
431 }
432 insn->immediate2.value = get_next(unsigned short, insn);
433 insn->immediate2.nbytes = 2;
434 insn->immediate1.got = insn->immediate2.got = 1;
435}
436
437/**
438 * insn_get_immediate() - Get the immediates of instruction
439 * @insn: &struct insn containing instruction
440 *
441 * If necessary, first collects the instruction up to and including the
442 * displacement bytes.
443 * Most immediates are sign-expanded. The unsigned value can be
444 * obtained by masking with ((1 << (nbytes * 8)) - 1).
445 */
446void insn_get_immediate(struct insn *insn)
447{
448 if (insn->immediate.got)
449 return;
450 if (!insn->displacement.got)
451 insn_get_displacement(insn);
452
453 if (inat_has_moffset(insn->attr)) {
454 __get_moffset(insn);
455 goto done;
456 }
457
458 if (!inat_has_immediate(insn->attr))
459 /* no immediates */
460 goto done;
461
462 switch (inat_immediate_size(insn->attr)) {
463 case INAT_IMM_BYTE:
464 insn->immediate.value = get_next(char, insn);
465 insn->immediate.nbytes = 1;
466 break;
467 case INAT_IMM_WORD:
468 insn->immediate.value = get_next(short, insn);
469 insn->immediate.nbytes = 2;
470 break;
471 case INAT_IMM_DWORD:
472 insn->immediate.value = get_next(int, insn);
473 insn->immediate.nbytes = 4;
474 break;
475 case INAT_IMM_QWORD:
476 insn->immediate1.value = get_next(int, insn);
477 insn->immediate1.nbytes = 4;
478 insn->immediate2.value = get_next(int, insn);
479 insn->immediate2.nbytes = 4;
480 break;
481 case INAT_IMM_PTR:
482 __get_immptr(insn);
483 break;
484 case INAT_IMM_VWORD32:
485 __get_immv32(insn);
486 break;
487 case INAT_IMM_VWORD:
488 __get_immv(insn);
489 break;
490 default:
491 break;
492 }
493 if (inat_has_second_immediate(insn->attr)) {
494 insn->immediate2.value = get_next(char, insn);
495 insn->immediate2.nbytes = 1;
496 }
497done:
498 insn->immediate.got = 1;
499}
500
501/**
502 * insn_get_length() - Get the length of instruction
503 * @insn: &struct insn containing instruction
504 *
505 * If necessary, first collects the instruction up to and including the
506 * immediates bytes.
507 */
508void insn_get_length(struct insn *insn)
509{
510 if (insn->length)
511 return;
512 if (!insn->immediate.got)
513 insn_get_immediate(insn);
514 insn->length = (unsigned char)((unsigned long)insn->next_byte
515 - (unsigned long)insn->kaddr);
516}
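
Typical use of the new decoder is to initialize a struct insn over a byte buffer and then request the deepest field needed; each insn_get_*() routine pulls in its prerequisites. A minimal sketch (the caller and its error handling are hypothetical):

#include <linux/errno.h>
#include <asm/insn.h>

/* Hypothetical caller: decode one instruction copied into kernel memory
 * and report its length; insn_get_length() walks the whole chain of
 * prefixes, opcode, ModRM, SIB, displacement and immediates. */
static int example_decode(const void *kaddr)
{
	struct insn insn;

	insn_init(&insn, kaddr, 1);	/* 1 = decode as 64-bit code */
	insn_get_length(&insn);

	if (!insn.length)
		return -EINVAL;
	return insn.length;		/* bytes consumed by the instruction */
}
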
diff --git a/arch/x86/lib/io_64.c b/arch/x86/lib/io_64.c
deleted file mode 100644
index 3f1eb59b5f08..000000000000
--- a/arch/x86/lib/io_64.c
+++ /dev/null
@@ -1,25 +0,0 @@
1#include <linux/string.h>
2#include <linux/module.h>
3#include <asm/io.h>
4
5void __memcpy_toio(unsigned long dst, const void *src, unsigned len)
6{
7 __inline_memcpy((void *)dst, src, len);
8}
9EXPORT_SYMBOL(__memcpy_toio);
10
11void __memcpy_fromio(void *dst, unsigned long src, unsigned len)
12{
13 __inline_memcpy(dst, (const void *)src, len);
14}
15EXPORT_SYMBOL(__memcpy_fromio);
16
17void memset_io(volatile void __iomem *a, int b, size_t c)
18{
19 /*
20 * TODO: memset can mangle the IO patterns quite a bit.
21 * perhaps it would be better to use a dumb one:
22 */
23 memset((void *)a, b, c);
24}
25EXPORT_SYMBOL(memset_io);
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index ad5441ed1b57..f82e884928af 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -20,12 +20,11 @@
20/* 20/*
21 * memcpy_c() - fast string ops (REP MOVSQ) based variant. 21 * memcpy_c() - fast string ops (REP MOVSQ) based variant.
22 * 22 *
23 * Calls to this get patched into the kernel image via the 23 * This gets patched over the unrolled variant (below) via the
24 * alternative instructions framework: 24 * alternative instructions framework:
25 */ 25 */
26 ALIGN 26 .section .altinstr_replacement, "ax", @progbits
27memcpy_c: 27.Lmemcpy_c:
28 CFI_STARTPROC
29 movq %rdi, %rax 28 movq %rdi, %rax
30 29
31 movl %edx, %ecx 30 movl %edx, %ecx
@@ -35,8 +34,8 @@ memcpy_c:
35 movl %edx, %ecx 34 movl %edx, %ecx
36 rep movsb 35 rep movsb
37 ret 36 ret
38 CFI_ENDPROC 37.Lmemcpy_e:
39ENDPROC(memcpy_c) 38 .previous
40 39
41ENTRY(__memcpy) 40ENTRY(__memcpy)
42ENTRY(memcpy) 41ENTRY(memcpy)
@@ -128,16 +127,10 @@ ENDPROC(__memcpy)
128 * It is also a lot simpler. Use this when possible: 127 * It is also a lot simpler. Use this when possible:
129 */ 128 */
130 129
131 .section .altinstr_replacement, "ax"
1321: .byte 0xeb /* jmp <disp8> */
133 .byte (memcpy_c - memcpy) - (2f - 1b) /* offset */
1342:
135 .previous
136
137 .section .altinstructions, "a" 130 .section .altinstructions, "a"
138 .align 8 131 .align 8
139 .quad memcpy 132 .quad memcpy
140 .quad 1b 133 .quad .Lmemcpy_c
141 .byte X86_FEATURE_REP_GOOD 134 .byte X86_FEATURE_REP_GOOD
142 135
143 /* 136 /*
@@ -145,6 +138,6 @@ ENDPROC(__memcpy)
145 * so it is silly to overwrite itself with nops - reboot is the 138 * so it is silly to overwrite itself with nops - reboot is the
146 * only outcome... 139 * only outcome...
147 */ 140 */
148 .byte 2b - 1b 141 .byte .Lmemcpy_e - .Lmemcpy_c
149 .byte 2b - 1b 142 .byte .Lmemcpy_e - .Lmemcpy_c
150 .previous 143 .previous
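
The rewritten alternatives glue above patches the REP MOVSQ variant over memcpy at boot when the CPU advertises X86_FEATURE_REP_GOOD. Conceptually the selection is equivalent to the hedged C below; the two callee names are placeholders, and the real kernel rewrites the code once at boot rather than branching on every call:

#include <linux/types.h>
#include <asm/cpufeature.h>

void *memcpy_fast_string(void *dst, const void *src, size_t len);	/* placeholder */
void *memcpy_unrolled(void *dst, const void *src, size_t len);		/* placeholder */

/* Conceptual equivalent of the alternatives patching; illustrative only. */
static void *example_memcpy(void *dst, const void *src, size_t len)
{
	if (boot_cpu_has(X86_FEATURE_REP_GOOD))
		return memcpy_fast_string(dst, src, len);	/* REP MOVSQ path */
	return memcpy_unrolled(dst, src, len);			/* unrolled path */
}
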
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 2c5948116bd2..e88d3b81644a 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -12,9 +12,8 @@
12 * 12 *
13 * rax original destination 13 * rax original destination
14 */ 14 */
15 ALIGN 15 .section .altinstr_replacement, "ax", @progbits
16memset_c: 16.Lmemset_c:
17 CFI_STARTPROC
18 movq %rdi,%r9 17 movq %rdi,%r9
19 movl %edx,%r8d 18 movl %edx,%r8d
20 andl $7,%r8d 19 andl $7,%r8d
@@ -29,8 +28,8 @@ memset_c:
29 rep stosb 28 rep stosb
30 movq %r9,%rax 29 movq %r9,%rax
31 ret 30 ret
32 CFI_ENDPROC 31.Lmemset_e:
33ENDPROC(memset_c) 32 .previous
34 33
35ENTRY(memset) 34ENTRY(memset)
36ENTRY(__memset) 35ENTRY(__memset)
@@ -118,16 +117,11 @@ ENDPROC(__memset)
118 117
119#include <asm/cpufeature.h> 118#include <asm/cpufeature.h>
120 119
121 .section .altinstr_replacement,"ax"
1221: .byte 0xeb /* jmp <disp8> */
123 .byte (memset_c - memset) - (2f - 1b) /* offset */
1242:
125 .previous
126 .section .altinstructions,"a" 120 .section .altinstructions,"a"
127 .align 8 121 .align 8
128 .quad memset 122 .quad memset
129 .quad 1b 123 .quad .Lmemset_c
130 .byte X86_FEATURE_REP_GOOD 124 .byte X86_FEATURE_REP_GOOD
131 .byte .Lfinal - memset 125 .byte .Lfinal - memset
132 .byte 2b - 1b 126 .byte .Lmemset_e - .Lmemset_c
133 .previous 127 .previous
diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c
new file mode 100644
index 000000000000..a6b1b86d2253
--- /dev/null
+++ b/arch/x86/lib/msr-smp.c
@@ -0,0 +1,204 @@
1#include <linux/module.h>
2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h>
5
6static void __rdmsr_on_cpu(void *info)
7{
8 struct msr_info *rv = info;
9 struct msr *reg;
10 int this_cpu = raw_smp_processor_id();
11
12 if (rv->msrs)
13 reg = per_cpu_ptr(rv->msrs, this_cpu);
14 else
15 reg = &rv->reg;
16
17 rdmsr(rv->msr_no, reg->l, reg->h);
18}
19
20static void __wrmsr_on_cpu(void *info)
21{
22 struct msr_info *rv = info;
23 struct msr *reg;
24 int this_cpu = raw_smp_processor_id();
25
26 if (rv->msrs)
27 reg = per_cpu_ptr(rv->msrs, this_cpu);
28 else
29 reg = &rv->reg;
30
31 wrmsr(rv->msr_no, reg->l, reg->h);
32}
33
34int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
35{
36 int err;
37 struct msr_info rv;
38
39 memset(&rv, 0, sizeof(rv));
40
41 rv.msr_no = msr_no;
42 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
43 *l = rv.reg.l;
44 *h = rv.reg.h;
45
46 return err;
47}
48EXPORT_SYMBOL(rdmsr_on_cpu);
49
50int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
51{
52 int err;
53 struct msr_info rv;
54
55 memset(&rv, 0, sizeof(rv));
56
57 rv.msr_no = msr_no;
58 rv.reg.l = l;
59 rv.reg.h = h;
60 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
61
62 return err;
63}
64EXPORT_SYMBOL(wrmsr_on_cpu);
65
66static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no,
67 struct msr *msrs,
68 void (*msr_func) (void *info))
69{
70 struct msr_info rv;
71 int this_cpu;
72
73 memset(&rv, 0, sizeof(rv));
74
75 rv.msrs = msrs;
76 rv.msr_no = msr_no;
77
78 this_cpu = get_cpu();
79
80 if (cpumask_test_cpu(this_cpu, mask))
81 msr_func(&rv);
82
83 smp_call_function_many(mask, msr_func, &rv, 1);
84 put_cpu();
85}
86
87/* rdmsr on a bunch of CPUs
88 *
89 * @mask: which CPUs
90 * @msr_no: which MSR
91 * @msrs: array of MSR values
92 *
93 */
94void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
95{
96 __rwmsr_on_cpus(mask, msr_no, msrs, __rdmsr_on_cpu);
97}
98EXPORT_SYMBOL(rdmsr_on_cpus);
99
100/*
101 * wrmsr on a bunch of CPUs
102 *
103 * @mask: which CPUs
104 * @msr_no: which MSR
105 * @msrs: array of MSR values
106 *
107 */
108void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs)
109{
110 __rwmsr_on_cpus(mask, msr_no, msrs, __wrmsr_on_cpu);
111}
112EXPORT_SYMBOL(wrmsr_on_cpus);
113
114/* These "safe" variants are slower and should be used when the target MSR
115 may not actually exist. */
116static void __rdmsr_safe_on_cpu(void *info)
117{
118 struct msr_info *rv = info;
119
120 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
121}
122
123static void __wrmsr_safe_on_cpu(void *info)
124{
125 struct msr_info *rv = info;
126
127 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
128}
129
130int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
131{
132 int err;
133 struct msr_info rv;
134
135 memset(&rv, 0, sizeof(rv));
136
137 rv.msr_no = msr_no;
138 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
139 *l = rv.reg.l;
140 *h = rv.reg.h;
141
142 return err ? err : rv.err;
143}
144EXPORT_SYMBOL(rdmsr_safe_on_cpu);
145
146int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
147{
148 int err;
149 struct msr_info rv;
150
151 memset(&rv, 0, sizeof(rv));
152
153 rv.msr_no = msr_no;
154 rv.reg.l = l;
155 rv.reg.h = h;
156 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
157
158 return err ? err : rv.err;
159}
160EXPORT_SYMBOL(wrmsr_safe_on_cpu);
161
162/*
163 * These variants are significantly slower, but allow control over
164 * the entire 32-bit GPR set.
165 */
166static void __rdmsr_safe_regs_on_cpu(void *info)
167{
168 struct msr_regs_info *rv = info;
169
170 rv->err = rdmsr_safe_regs(rv->regs);
171}
172
173static void __wrmsr_safe_regs_on_cpu(void *info)
174{
175 struct msr_regs_info *rv = info;
176
177 rv->err = wrmsr_safe_regs(rv->regs);
178}
179
180int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
181{
182 int err;
183 struct msr_regs_info rv;
184
185 rv.regs = regs;
186 rv.err = -EIO;
187 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
188
189 return err ? err : rv.err;
190}
191EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
192
193int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
194{
195 int err;
196 struct msr_regs_info rv;
197
198 rv.regs = regs;
199 rv.err = -EIO;
200 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
201
202 return err ? err : rv.err;
203}
204EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu);
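
With the cross-CPU MSR helpers now living in msr-smp.c, a typical consumer looks like the sketch below (the caller and the choice of MSR_IA32_APERF are illustrative):

#include <asm/msr.h>

/* Illustrative read of an MSR on another CPU via the relocated helpers. */
static int example_read_aperf(unsigned int cpu, u64 *val)
{
	u32 lo, hi;
	int err;

	err = rdmsr_safe_on_cpu(cpu, MSR_IA32_APERF, &lo, &hi);
	if (err)
		return err;		/* IPI failure or faulting MSR access */

	*val = ((u64)hi << 32) | lo;
	return 0;
}
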
diff --git a/arch/x86/lib/msr.c b/arch/x86/lib/msr.c
index 33a1e3ca22d8..8f8eebdca7d4 100644
--- a/arch/x86/lib/msr.c
+++ b/arch/x86/lib/msr.c
@@ -1,226 +1,23 @@
1#include <linux/module.h> 1#include <linux/module.h>
2#include <linux/preempt.h> 2#include <linux/preempt.h>
3#include <linux/smp.h>
4#include <asm/msr.h> 3#include <asm/msr.h>
5 4
6struct msr_info { 5struct msr *msrs_alloc(void)
7 u32 msr_no;
8 struct msr reg;
9 struct msr *msrs;
10 int off;
11 int err;
12};
13
14static void __rdmsr_on_cpu(void *info)
15{
16 struct msr_info *rv = info;
17 struct msr *reg;
18 int this_cpu = raw_smp_processor_id();
19
20 if (rv->msrs)
21 reg = &rv->msrs[this_cpu - rv->off];
22 else
23 reg = &rv->reg;
24
25 rdmsr(rv->msr_no, reg->l, reg->h);
26}
27
28static void __wrmsr_on_cpu(void *info)
29{
30 struct msr_info *rv = info;
31 struct msr *reg;
32 int this_cpu = raw_smp_processor_id();
33
34 if (rv->msrs)
35 reg = &rv->msrs[this_cpu - rv->off];
36 else
37 reg = &rv->reg;
38
39 wrmsr(rv->msr_no, reg->l, reg->h);
40}
41
42int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
43{
44 int err;
45 struct msr_info rv;
46
47 memset(&rv, 0, sizeof(rv));
48
49 rv.msr_no = msr_no;
50 err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1);
51 *l = rv.reg.l;
52 *h = rv.reg.h;
53
54 return err;
55}
56EXPORT_SYMBOL(rdmsr_on_cpu);
57
58int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h)
59{
60 int err;
61 struct msr_info rv;
62
63 memset(&rv, 0, sizeof(rv));
64
65 rv.msr_no = msr_no;
66 rv.reg.l = l;
67 rv.reg.h = h;
68 err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1);
69
70 return err;
71}
72EXPORT_SYMBOL(wrmsr_on_cpu);
73
74/* rdmsr on a bunch of CPUs
75 *
76 * @mask: which CPUs
77 * @msr_no: which MSR
78 * @msrs: array of MSR values
79 *
80 */
81void rdmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
82{
83 struct msr_info rv;
84 int this_cpu;
85
86 memset(&rv, 0, sizeof(rv));
87
88 rv.off = cpumask_first(mask);
89 rv.msrs = msrs;
90 rv.msr_no = msr_no;
91
92 this_cpu = get_cpu();
93
94 if (cpumask_test_cpu(this_cpu, mask))
95 __rdmsr_on_cpu(&rv);
96
97 smp_call_function_many(mask, __rdmsr_on_cpu, &rv, 1);
98 put_cpu();
99}
100EXPORT_SYMBOL(rdmsr_on_cpus);
101
102/*
103 * wrmsr on a bunch of CPUs
104 *
105 * @mask: which CPUs
106 * @msr_no: which MSR
107 * @msrs: array of MSR values
108 *
109 */
110void wrmsr_on_cpus(const cpumask_t *mask, u32 msr_no, struct msr *msrs)
111{
112 struct msr_info rv;
113 int this_cpu;
114
115 memset(&rv, 0, sizeof(rv));
116
117 rv.off = cpumask_first(mask);
118 rv.msrs = msrs;
119 rv.msr_no = msr_no;
120
121 this_cpu = get_cpu();
122
123 if (cpumask_test_cpu(this_cpu, mask))
124 __wrmsr_on_cpu(&rv);
125
126 smp_call_function_many(mask, __wrmsr_on_cpu, &rv, 1);
127 put_cpu();
128}
129EXPORT_SYMBOL(wrmsr_on_cpus);
130
131/* These "safe" variants are slower and should be used when the target MSR
132 may not actually exist. */
133static void __rdmsr_safe_on_cpu(void *info)
134{
135 struct msr_info *rv = info;
136
137 rv->err = rdmsr_safe(rv->msr_no, &rv->reg.l, &rv->reg.h);
138}
139
140static void __wrmsr_safe_on_cpu(void *info)
141{
142 struct msr_info *rv = info;
143
144 rv->err = wrmsr_safe(rv->msr_no, rv->reg.l, rv->reg.h);
145}
146
147int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h)
148{ 6{
149 int err; 7 struct msr *msrs = NULL;
150 struct msr_info rv;
151 8
152 memset(&rv, 0, sizeof(rv)); 9 msrs = alloc_percpu(struct msr);
10 if (!msrs) {
11 pr_warning("%s: error allocating msrs\n", __func__);
12 return NULL;
13 }
153 14
154 rv.msr_no = msr_no; 15 return msrs;
155 err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1);
156 *l = rv.reg.l;
157 *h = rv.reg.h;
158
159 return err ? err : rv.err;
160} 16}
161EXPORT_SYMBOL(rdmsr_safe_on_cpu); 17EXPORT_SYMBOL(msrs_alloc);
162 18
163int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) 19void msrs_free(struct msr *msrs)
164{ 20{
165 int err; 21 free_percpu(msrs);
166 struct msr_info rv;
167
168 memset(&rv, 0, sizeof(rv));
169
170 rv.msr_no = msr_no;
171 rv.reg.l = l;
172 rv.reg.h = h;
173 err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1);
174
175 return err ? err : rv.err;
176}
177EXPORT_SYMBOL(wrmsr_safe_on_cpu);
178
179/*
180 * These variants are significantly slower, but allows control over
181 * the entire 32-bit GPR set.
182 */
183struct msr_regs_info {
184 u32 *regs;
185 int err;
186};
187
188static void __rdmsr_safe_regs_on_cpu(void *info)
189{
190 struct msr_regs_info *rv = info;
191
192 rv->err = rdmsr_safe_regs(rv->regs);
193}
194
195static void __wrmsr_safe_regs_on_cpu(void *info)
196{
197 struct msr_regs_info *rv = info;
198
199 rv->err = wrmsr_safe_regs(rv->regs);
200}
201
202int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
203{
204 int err;
205 struct msr_regs_info rv;
206
207 rv.regs = regs;
208 rv.err = -EIO;
209 err = smp_call_function_single(cpu, __rdmsr_safe_regs_on_cpu, &rv, 1);
210
211 return err ? err : rv.err;
212}
213EXPORT_SYMBOL(rdmsr_safe_regs_on_cpu);
214
215int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 *regs)
216{
217 int err;
218 struct msr_regs_info rv;
219
220 rv.regs = regs;
221 rv.err = -EIO;
222 err = smp_call_function_single(cpu, __wrmsr_safe_regs_on_cpu, &rv, 1);
223
224 return err ? err : rv.err;
225} 22}
226EXPORT_SYMBOL(wrmsr_safe_regs_on_cpu); 23EXPORT_SYMBOL(msrs_free);
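
After this change msr.c is reduced to the per-CPU allocator, which pairs with the rdmsr_on_cpus()/wrmsr_on_cpus() helpers moved to msr-smp.c. The intended pattern is roughly the following sketch (the caller and the printout are illustrative):

#include <linux/cpumask.h>
#include <asm/msr.h>

/* Sketch of the allocator/bulk-read pairing; msr_no is caller-supplied. */
static void example_dump_msr(const struct cpumask *mask, u32 msr_no)
{
	struct msr *msrs = msrs_alloc();	/* per-CPU struct msr array */
	int cpu;

	if (!msrs)
		return;

	rdmsr_on_cpus(mask, msr_no, msrs);	/* fills each CPU's slot */
	for_each_cpu(cpu, mask)
		pr_info("cpu %d: %#x:%#x\n", cpu,
			per_cpu_ptr(msrs, cpu)->h, per_cpu_ptr(msrs, cpu)->l);

	msrs_free(msrs);
}
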
diff --git a/arch/x86/lib/rwsem_64.S b/arch/x86/lib/rwsem_64.S
new file mode 100644
index 000000000000..41fcf00e49df
--- /dev/null
+++ b/arch/x86/lib/rwsem_64.S
@@ -0,0 +1,81 @@
1/*
2 * x86-64 rwsem wrappers
3 *
4 * This interfaces the inline asm code to the slow-path
5 * C routines. We need to save the call-clobbered regs
6 * that the asm does not mark as clobbered, and move the
7 * argument from %rax to %rdi.
8 *
9 * NOTE! We don't need to save %rax, because the functions
10 * will always return the semaphore pointer in %rax (which
11 * is also the input argument to these helpers)
12 *
13 * The following can clobber %rdx because the asm clobbers it:
14 * call_rwsem_down_write_failed
15 * call_rwsem_wake
16 * but %rdi, %rsi, %rcx, %r8-r11 always need saving.
17 */
18
19#include <linux/linkage.h>
20#include <asm/rwlock.h>
21#include <asm/alternative-asm.h>
22#include <asm/frame.h>
23#include <asm/dwarf2.h>
24
25#define save_common_regs \
26 pushq %rdi; \
27 pushq %rsi; \
28 pushq %rcx; \
29 pushq %r8; \
30 pushq %r9; \
31 pushq %r10; \
32 pushq %r11
33
34#define restore_common_regs \
35 popq %r11; \
36 popq %r10; \
37 popq %r9; \
38 popq %r8; \
39 popq %rcx; \
40 popq %rsi; \
41 popq %rdi
42
43/* Fix up special calling conventions */
44ENTRY(call_rwsem_down_read_failed)
45 save_common_regs
46 pushq %rdx
47 movq %rax,%rdi
48 call rwsem_down_read_failed
49 popq %rdx
50 restore_common_regs
51 ret
52 ENDPROC(call_rwsem_down_read_failed)
53
54ENTRY(call_rwsem_down_write_failed)
55 save_common_regs
56 movq %rax,%rdi
57 call rwsem_down_write_failed
58 restore_common_regs
59 ret
60 ENDPROC(call_rwsem_down_write_failed)
61
62ENTRY(call_rwsem_wake)
63 decl %edx /* do nothing if still outstanding active readers */
64 jnz 1f
65 save_common_regs
66 movq %rax,%rdi
67 call rwsem_wake
68 restore_common_regs
691: ret
70 ENDPROC(call_rwsem_wake)
71
72/* Fix up special calling conventions */
73ENTRY(call_rwsem_downgrade_wake)
74 save_common_regs
75 pushq %rdx
76 movq %rax,%rdi
77 call rwsem_downgrade_wake
78 popq %rdx
79 restore_common_regs
80 ret
81 ENDPROC(call_rwsem_downgrade_wake)
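
These trampolines exist because the rwsem fast path is inline asm that leaves the semaphore pointer in %rax and does not save the usual call-clobbered registers. In C terms, the down_read() split they support is roughly the following conceptual sketch (not the actual inline asm; example_inc_return() merely stands in for the LOCK-prefixed increment of the real fast path):

#include <linux/rwsem.h>

/* Conceptual rendering of the fast/slow split behind
 * call_rwsem_down_read_failed; not the actual implementation. */
static inline long example_inc_return(long *count)
{
	return __sync_add_and_fetch(count, 1);	/* stands in for LOCK INC */
}

static inline void example_down_read(struct rw_semaphore *sem)
{
	if (unlikely(example_inc_return((long *)&sem->count) < 0))
		rwsem_down_read_failed(sem);	/* slow path, via the trampoline */
}
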
diff --git a/arch/x86/lib/usercopy_32.c b/arch/x86/lib/usercopy_32.c
index 1f118d462acc..e218d5df85ff 100644
--- a/arch/x86/lib/usercopy_32.c
+++ b/arch/x86/lib/usercopy_32.c
@@ -874,7 +874,7 @@ EXPORT_SYMBOL(copy_to_user);
874 * data to the requested size using zero bytes. 874 * data to the requested size using zero bytes.
875 */ 875 */
876unsigned long 876unsigned long
877copy_from_user(void *to, const void __user *from, unsigned long n) 877_copy_from_user(void *to, const void __user *from, unsigned long n)
878{ 878{
879 if (access_ok(VERIFY_READ, from, n)) 879 if (access_ok(VERIFY_READ, from, n))
880 n = __copy_from_user(to, from, n); 880 n = __copy_from_user(to, from, n);
@@ -882,4 +882,10 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
882 memset(to, 0, n); 882 memset(to, 0, n);
883 return n; 883 return n;
884} 884}
885EXPORT_SYMBOL(copy_from_user); 885EXPORT_SYMBOL(_copy_from_user);
886
887void copy_from_user_overflow(void)
888{
889 WARN(1, "Buffer overflow detected!\n");
890}
891EXPORT_SYMBOL(copy_from_user_overflow);
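
copy_from_user_overflow() above is the out-of-line warning target for the object-size check that the copy_from_user() wrapper performs before handing off to _copy_from_user(). The shape of that check is roughly as follows (a sketch of the pattern, written as if it sat in the uaccess header; the real implementation differs in detail):

/* Sketch of the wrapper that pairs with copy_from_user_overflow(). */
static inline unsigned long
example_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	unsigned long sz = __builtin_object_size(to, 0);	/* -1UL if unknown */

	if (likely(sz == (unsigned long)-1 || sz >= n))
		return _copy_from_user(to, from, n);

	copy_from_user_overflow();	/* WARN("Buffer overflow detected!") */
	return n;
}
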
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
new file mode 100644
index 000000000000..a793da5e560e
--- /dev/null
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -0,0 +1,893 @@
1# x86 Opcode Maps
2#
3#<Opcode maps>
4# Table: table-name
5# Referrer: escaped-name
6# AVXcode: avx-code
7# opcode: mnemonic|GrpXXX [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
8# (or)
9# opcode: escape # escaped-name
10# EndTable
11#
12#<group maps>
13# GrpTable: GrpXXX
14# reg: mnemonic [operand1[,operand2...]] [(extra1)[,(extra2)...] [| 2nd-mnemonic ...]
15# EndTable
16#
17# AVX Superscripts
18# (VEX): this opcode can accept VEX prefix.
19# (oVEX): this opcode requires VEX prefix.
20# (o128): this opcode only supports 128bit VEX.
21# (o256): this opcode only supports 256bit VEX.
22#
23
24Table: one byte opcode
25Referrer:
26AVXcode:
27# 0x00 - 0x0f
2800: ADD Eb,Gb
2901: ADD Ev,Gv
3002: ADD Gb,Eb
3103: ADD Gv,Ev
3204: ADD AL,Ib
3305: ADD rAX,Iz
3406: PUSH ES (i64)
3507: POP ES (i64)
3608: OR Eb,Gb
3709: OR Ev,Gv
380a: OR Gb,Eb
390b: OR Gv,Ev
400c: OR AL,Ib
410d: OR rAX,Iz
420e: PUSH CS (i64)
430f: escape # 2-byte escape
44# 0x10 - 0x1f
4510: ADC Eb,Gb
4611: ADC Ev,Gv
4712: ADC Gb,Eb
4813: ADC Gv,Ev
4914: ADC AL,Ib
5015: ADC rAX,Iz
5116: PUSH SS (i64)
5217: POP SS (i64)
5318: SBB Eb,Gb
5419: SBB Ev,Gv
551a: SBB Gb,Eb
561b: SBB Gv,Ev
571c: SBB AL,Ib
581d: SBB rAX,Iz
591e: PUSH DS (i64)
601f: POP DS (i64)
61# 0x20 - 0x2f
6220: AND Eb,Gb
6321: AND Ev,Gv
6422: AND Gb,Eb
6523: AND Gv,Ev
6624: AND AL,Ib
6725: AND rAx,Iz
6826: SEG=ES (Prefix)
6927: DAA (i64)
7028: SUB Eb,Gb
7129: SUB Ev,Gv
722a: SUB Gb,Eb
732b: SUB Gv,Ev
742c: SUB AL,Ib
752d: SUB rAX,Iz
762e: SEG=CS (Prefix)
772f: DAS (i64)
78# 0x30 - 0x3f
7930: XOR Eb,Gb
8031: XOR Ev,Gv
8132: XOR Gb,Eb
8233: XOR Gv,Ev
8334: XOR AL,Ib
8435: XOR rAX,Iz
8536: SEG=SS (Prefix)
8637: AAA (i64)
8738: CMP Eb,Gb
8839: CMP Ev,Gv
893a: CMP Gb,Eb
903b: CMP Gv,Ev
913c: CMP AL,Ib
923d: CMP rAX,Iz
933e: SEG=DS (Prefix)
943f: AAS (i64)
95# 0x40 - 0x4f
9640: INC eAX (i64) | REX (o64)
9741: INC eCX (i64) | REX.B (o64)
9842: INC eDX (i64) | REX.X (o64)
9943: INC eBX (i64) | REX.XB (o64)
10044: INC eSP (i64) | REX.R (o64)
10145: INC eBP (i64) | REX.RB (o64)
10246: INC eSI (i64) | REX.RX (o64)
10347: INC eDI (i64) | REX.RXB (o64)
10448: DEC eAX (i64) | REX.W (o64)
10549: DEC eCX (i64) | REX.WB (o64)
1064a: DEC eDX (i64) | REX.WX (o64)
1074b: DEC eBX (i64) | REX.WXB (o64)
1084c: DEC eSP (i64) | REX.WR (o64)
1094d: DEC eBP (i64) | REX.WRB (o64)
1104e: DEC eSI (i64) | REX.WRX (o64)
1114f: DEC eDI (i64) | REX.WRXB (o64)
112# 0x50 - 0x5f
11350: PUSH rAX/r8 (d64)
11451: PUSH rCX/r9 (d64)
11552: PUSH rDX/r10 (d64)
11653: PUSH rBX/r11 (d64)
11754: PUSH rSP/r12 (d64)
11855: PUSH rBP/r13 (d64)
11956: PUSH rSI/r14 (d64)
12057: PUSH rDI/r15 (d64)
12158: POP rAX/r8 (d64)
12259: POP rCX/r9 (d64)
1235a: POP rDX/r10 (d64)
1245b: POP rBX/r11 (d64)
1255c: POP rSP/r12 (d64)
1265d: POP rBP/r13 (d64)
1275e: POP rSI/r14 (d64)
1285f: POP rDI/r15 (d64)
129# 0x60 - 0x6f
13060: PUSHA/PUSHAD (i64)
13161: POPA/POPAD (i64)
13262: BOUND Gv,Ma (i64)
13363: ARPL Ew,Gw (i64) | MOVSXD Gv,Ev (o64)
13464: SEG=FS (Prefix)
13565: SEG=GS (Prefix)
13666: Operand-Size (Prefix)
13767: Address-Size (Prefix)
13868: PUSH Iz (d64)
13969: IMUL Gv,Ev,Iz
1406a: PUSH Ib (d64)
1416b: IMUL Gv,Ev,Ib
1426c: INS/INSB Yb,DX
1436d: INS/INSW/INSD Yz,DX
1446e: OUTS/OUTSB DX,Xb
1456f: OUTS/OUTSW/OUTSD DX,Xz
146# 0x70 - 0x7f
14770: JO Jb
14871: JNO Jb
14972: JB/JNAE/JC Jb
15073: JNB/JAE/JNC Jb
15174: JZ/JE Jb
15275: JNZ/JNE Jb
15376: JBE/JNA Jb
15477: JNBE/JA Jb
15578: JS Jb
15679: JNS Jb
1577a: JP/JPE Jb
1587b: JNP/JPO Jb
1597c: JL/JNGE Jb
1607d: JNL/JGE Jb
1617e: JLE/JNG Jb
1627f: JNLE/JG Jb
163# 0x80 - 0x8f
16480: Grp1 Eb,Ib (1A)
16581: Grp1 Ev,Iz (1A)
16682: Grp1 Eb,Ib (1A),(i64)
16783: Grp1 Ev,Ib (1A)
16884: TEST Eb,Gb
16985: TEST Ev,Gv
17086: XCHG Eb,Gb
17187: XCHG Ev,Gv
17288: MOV Eb,Gb
17389: MOV Ev,Gv
1748a: MOV Gb,Eb
1758b: MOV Gv,Ev
1768c: MOV Ev,Sw
1778d: LEA Gv,M
1788e: MOV Sw,Ew
1798f: Grp1A (1A) | POP Ev (d64)
180# 0x90 - 0x9f
18190: NOP | PAUSE (F3) | XCHG r8,rAX
18291: XCHG rCX/r9,rAX
18392: XCHG rDX/r10,rAX
18493: XCHG rBX/r11,rAX
18594: XCHG rSP/r12,rAX
18695: XCHG rBP/r13,rAX
18796: XCHG rSI/r14,rAX
18897: XCHG rDI/r15,rAX
18998: CBW/CWDE/CDQE
19099: CWD/CDQ/CQO
1919a: CALLF Ap (i64)
1929b: FWAIT/WAIT
1939c: PUSHF/D/Q Fv (d64)
1949d: POPF/D/Q Fv (d64)
1959e: SAHF
1969f: LAHF
197# 0xa0 - 0xaf
198a0: MOV AL,Ob
199a1: MOV rAX,Ov
200a2: MOV Ob,AL
201a3: MOV Ov,rAX
202a4: MOVS/B Xb,Yb
203a5: MOVS/W/D/Q Xv,Yv
204a6: CMPS/B Xb,Yb
205a7: CMPS/W/D Xv,Yv
206a8: TEST AL,Ib
207a9: TEST rAX,Iz
208aa: STOS/B Yb,AL
209ab: STOS/W/D/Q Yv,rAX
210ac: LODS/B AL,Xb
211ad: LODS/W/D/Q rAX,Xv
212ae: SCAS/B AL,Yb
213af: SCAS/W/D/Q rAX,Yv
214# 0xb0 - 0xbf
215b0: MOV AL/R8L,Ib
216b1: MOV CL/R9L,Ib
217b2: MOV DL/R10L,Ib
218b3: MOV BL/R11L,Ib
219b4: MOV AH/R12L,Ib
220b5: MOV CH/R13L,Ib
221b6: MOV DH/R14L,Ib
222b7: MOV BH/R15L,Ib
223b8: MOV rAX/r8,Iv
224b9: MOV rCX/r9,Iv
225ba: MOV rDX/r10,Iv
226bb: MOV rBX/r11,Iv
227bc: MOV rSP/r12,Iv
228bd: MOV rBP/r13,Iv
229be: MOV rSI/r14,Iv
230bf: MOV rDI/r15,Iv
231# 0xc0 - 0xcf
232c0: Grp2 Eb,Ib (1A)
233c1: Grp2 Ev,Ib (1A)
234c2: RETN Iw (f64)
235c3: RETN
236c4: LES Gz,Mp (i64) | 3bytes-VEX (Prefix)
237c5: LDS Gz,Mp (i64) | 2bytes-VEX (Prefix)
238c6: Grp11 Eb,Ib (1A)
239c7: Grp11 Ev,Iz (1A)
240c8: ENTER Iw,Ib
241c9: LEAVE (d64)
242ca: RETF Iw
243cb: RETF
244cc: INT3
245cd: INT Ib
246ce: INTO (i64)
247cf: IRET/D/Q
248# 0xd0 - 0xdf
249d0: Grp2 Eb,1 (1A)
250d1: Grp2 Ev,1 (1A)
251d2: Grp2 Eb,CL (1A)
252d3: Grp2 Ev,CL (1A)
253d4: AAM Ib (i64)
254d5: AAD Ib (i64)
255d6:
256d7: XLAT/XLATB
257d8: ESC
258d9: ESC
259da: ESC
260db: ESC
261dc: ESC
262dd: ESC
263de: ESC
264df: ESC
265# 0xe0 - 0xef
266e0: LOOPNE/LOOPNZ Jb (f64)
267e1: LOOPE/LOOPZ Jb (f64)
268e2: LOOP Jb (f64)
269e3: JrCXZ Jb (f64)
270e4: IN AL,Ib
271e5: IN eAX,Ib
272e6: OUT Ib,AL
273e7: OUT Ib,eAX
274e8: CALL Jz (f64)
275e9: JMP-near Jz (f64)
276ea: JMP-far Ap (i64)
277eb: JMP-short Jb (f64)
278ec: IN AL,DX
279ed: IN eAX,DX
280ee: OUT DX,AL
281ef: OUT DX,eAX
282# 0xf0 - 0xff
283f0: LOCK (Prefix)
284f1:
285f2: REPNE (Prefix)
286f3: REP/REPE (Prefix)
287f4: HLT
288f5: CMC
289f6: Grp3_1 Eb (1A)
290f7: Grp3_2 Ev (1A)
291f8: CLC
292f9: STC
293fa: CLI
294fb: STI
295fc: CLD
296fd: STD
297fe: Grp4 (1A)
298ff: Grp5 (1A)
299EndTable
300
301Table: 2-byte opcode (0x0f)
302Referrer: 2-byte escape
303AVXcode: 1
304# 0x0f 0x00-0x0f
30500: Grp6 (1A)
30601: Grp7 (1A)
30702: LAR Gv,Ew
30803: LSL Gv,Ew
30904:
31005: SYSCALL (o64)
31106: CLTS
31207: SYSRET (o64)
31308: INVD
31409: WBINVD
3150a:
3160b: UD2 (1B)
3170c:
3180d: NOP Ev | GrpP
3190e: FEMMS
320# 3DNow! uses the last imm byte as opcode extension.
3210f: 3DNow! Pq,Qq,Ib
322# 0x0f 0x10-0x1f
32310: movups Vps,Wps (VEX) | movss Vss,Wss (F3),(VEX),(o128) | movupd Vpd,Wpd (66),(VEX) | movsd Vsd,Wsd (F2),(VEX),(o128)
32411: movups Wps,Vps (VEX) | movss Wss,Vss (F3),(VEX),(o128) | movupd Wpd,Vpd (66),(VEX) | movsd Wsd,Vsd (F2),(VEX),(o128)
32512: movlps Vq,Mq (VEX),(o128) | movlpd Vq,Mq (66),(VEX),(o128) | movhlps Vq,Uq (VEX),(o128) | movddup Vq,Wq (F2),(VEX) | movsldup Vq,Wq (F3),(VEX)
32613: movlps Mq,Vq (VEX),(o128) | movlpd Mq,Vq (66),(VEX),(o128)
32714: unpcklps Vps,Wq (VEX) | unpcklpd Vpd,Wq (66),(VEX)
32815: unpckhps Vps,Wq (VEX) | unpckhpd Vpd,Wq (66),(VEX)
32916: movhps Vq,Mq (VEX),(o128) | movhpd Vq,Mq (66),(VEX),(o128) | movlhps Vq,Uq (VEX),(o128) | movshdup Vq,Wq (F3),(VEX)
33017: movhps Mq,Vq (VEX),(o128) | movhpd Mq,Vq (66),(VEX),(o128)
33118: Grp16 (1A)
33219:
3331a:
3341b:
3351c:
3361d:
3371e:
3381f: NOP Ev
339# 0x0f 0x20-0x2f
34020: MOV Rd,Cd
34121: MOV Rd,Dd
34222: MOV Cd,Rd
34323: MOV Dd,Rd
34424:
34525:
34626:
34727:
34828: movaps Vps,Wps (VEX) | movapd Vpd,Wpd (66),(VEX)
34929: movaps Wps,Vps (VEX) | movapd Wpd,Vpd (66),(VEX)
3502a: cvtpi2ps Vps,Qpi | cvtsi2ss Vss,Ed/q (F3),(VEX),(o128) | cvtpi2pd Vpd,Qpi (66) | cvtsi2sd Vsd,Ed/q (F2),(VEX),(o128)
3512b: movntps Mps,Vps (VEX) | movntpd Mpd,Vpd (66),(VEX)
3522c: cvttps2pi Ppi,Wps | cvttss2si Gd/q,Wss (F3),(VEX),(o128) | cvttpd2pi Ppi,Wpd (66) | cvttsd2si Gd/q,Wsd (F2),(VEX),(o128)
3532d: cvtps2pi Ppi,Wps | cvtss2si Gd/q,Wss (F3),(VEX),(o128) | cvtpd2pi Qpi,Wpd (66) | cvtsd2si Gd/q,Wsd (F2),(VEX),(o128)
3542e: ucomiss Vss,Wss (VEX),(o128) | ucomisd Vsd,Wsd (66),(VEX),(o128)
3552f: comiss Vss,Wss (VEX),(o128) | comisd Vsd,Wsd (66),(VEX),(o128)
356# 0x0f 0x30-0x3f
35730: WRMSR
35831: RDTSC
35932: RDMSR
36033: RDPMC
36134: SYSENTER
36235: SYSEXIT
36336:
36437: GETSEC
36538: escape # 3-byte escape 1
36639:
3673a: escape # 3-byte escape 2
3683b:
3693c:
3703d:
3713e:
3723f:
373# 0x0f 0x40-0x4f
37440: CMOVO Gv,Ev
37541: CMOVNO Gv,Ev
37642: CMOVB/C/NAE Gv,Ev
37743: CMOVAE/NB/NC Gv,Ev
37844: CMOVE/Z Gv,Ev
37945: CMOVNE/NZ Gv,Ev
38046: CMOVBE/NA Gv,Ev
38147: CMOVA/NBE Gv,Ev
38248: CMOVS Gv,Ev
38349: CMOVNS Gv,Ev
3844a: CMOVP/PE Gv,Ev
3854b: CMOVNP/PO Gv,Ev
3864c: CMOVL/NGE Gv,Ev
3874d: CMOVNL/GE Gv,Ev
3884e: CMOVLE/NG Gv,Ev
3894f: CMOVNLE/G Gv,Ev
390# 0x0f 0x50-0x5f
39150: movmskps Gd/q,Ups (VEX) | movmskpd Gd/q,Upd (66),(VEX)
39251: sqrtps Vps,Wps (VEX) | sqrtss Vss,Wss (F3),(VEX),(o128) | sqrtpd Vpd,Wpd (66),(VEX) | sqrtsd Vsd,Wsd (F2),(VEX),(o128)
39352: rsqrtps Vps,Wps (VEX) | rsqrtss Vss,Wss (F3),(VEX),(o128)
39453: rcpps Vps,Wps (VEX) | rcpss Vss,Wss (F3),(VEX),(o128)
39554: andps Vps,Wps (VEX) | andpd Vpd,Wpd (66),(VEX)
39655: andnps Vps,Wps (VEX) | andnpd Vpd,Wpd (66),(VEX)
39756: orps Vps,Wps (VEX) | orpd Vpd,Wpd (66),(VEX)
39857: xorps Vps,Wps (VEX) | xorpd Vpd,Wpd (66),(VEX)
39958: addps Vps,Wps (VEX) | addss Vss,Wss (F3),(VEX),(o128) | addpd Vpd,Wpd (66),(VEX) | addsd Vsd,Wsd (F2),(VEX),(o128)
40059: mulps Vps,Wps (VEX) | mulss Vss,Wss (F3),(VEX),(o128) | mulpd Vpd,Wpd (66),(VEX) | mulsd Vsd,Wsd (F2),(VEX),(o128)
4015a: cvtps2pd Vpd,Wps (VEX) | cvtss2sd Vsd,Wss (F3),(VEX),(o128) | cvtpd2ps Vps,Wpd (66),(VEX) | cvtsd2ss Vsd,Wsd (F2),(VEX),(o128)
4025b: cvtdq2ps Vps,Wdq (VEX) | cvtps2dq Vdq,Wps (66),(VEX) | cvttps2dq Vdq,Wps (F3),(VEX)
4035c: subps Vps,Wps (VEX) | subss Vss,Wss (F3),(VEX),(o128) | subpd Vpd,Wpd (66),(VEX) | subsd Vsd,Wsd (F2),(VEX),(o128)
4045d: minps Vps,Wps (VEX) | minss Vss,Wss (F3),(VEX),(o128) | minpd Vpd,Wpd (66),(VEX) | minsd Vsd,Wsd (F2),(VEX),(o128)
4055e: divps Vps,Wps (VEX) | divss Vss,Wss (F3),(VEX),(o128) | divpd Vpd,Wpd (66),(VEX) | divsd Vsd,Wsd (F2),(VEX),(o128)
4065f: maxps Vps,Wps (VEX) | maxss Vss,Wss (F3),(VEX),(o128) | maxpd Vpd,Wpd (66),(VEX) | maxsd Vsd,Wsd (F2),(VEX),(o128)
407# 0x0f 0x60-0x6f
40860: punpcklbw Pq,Qd | punpcklbw Vdq,Wdq (66),(VEX),(o128)
40961: punpcklwd Pq,Qd | punpcklwd Vdq,Wdq (66),(VEX),(o128)
41062: punpckldq Pq,Qd | punpckldq Vdq,Wdq (66),(VEX),(o128)
41163: packsswb Pq,Qq | packsswb Vdq,Wdq (66),(VEX),(o128)
41264: pcmpgtb Pq,Qq | pcmpgtb Vdq,Wdq (66),(VEX),(o128)
41365: pcmpgtw Pq,Qq | pcmpgtw Vdq,Wdq (66),(VEX),(o128)
41466: pcmpgtd Pq,Qq | pcmpgtd Vdq,Wdq (66),(VEX),(o128)
41567: packuswb Pq,Qq | packuswb Vdq,Wdq (66),(VEX),(o128)
41668: punpckhbw Pq,Qd | punpckhbw Vdq,Wdq (66),(VEX),(o128)
41769: punpckhwd Pq,Qd | punpckhwd Vdq,Wdq (66),(VEX),(o128)
4186a: punpckhdq Pq,Qd | punpckhdq Vdq,Wdq (66),(VEX),(o128)
4196b: packssdw Pq,Qd | packssdw Vdq,Wdq (66),(VEX),(o128)
4206c: punpcklqdq Vdq,Wdq (66),(VEX),(o128)
4216d: punpckhqdq Vdq,Wdq (66),(VEX),(o128)
4226e: movd/q Pd,Ed/q | movd/q Vdq,Ed/q (66),(VEX),(o128)
4236f: movq Pq,Qq | movdqa Vdq,Wdq (66),(VEX) | movdqu Vdq,Wdq (F3),(VEX)
424# 0x0f 0x70-0x7f
42570: pshufw Pq,Qq,Ib | pshufd Vdq,Wdq,Ib (66),(VEX),(o128) | pshufhw Vdq,Wdq,Ib (F3),(VEX),(o128) | pshuflw Vdq,Wdq,Ib (F2),(VEX),(o128)
42671: Grp12 (1A)
42772: Grp13 (1A)
42873: Grp14 (1A)
42974: pcmpeqb Pq,Qq | pcmpeqb Vdq,Wdq (66),(VEX),(o128)
43075: pcmpeqw Pq,Qq | pcmpeqw Vdq,Wdq (66),(VEX),(o128)
43176: pcmpeqd Pq,Qq | pcmpeqd Vdq,Wdq (66),(VEX),(o128)
43277: emms/vzeroupper/vzeroall (VEX)
43378: VMREAD Ed/q,Gd/q
43479: VMWRITE Gd/q,Ed/q
4357a:
4367b:
4377c: haddps Vps,Wps (F2),(VEX) | haddpd Vpd,Wpd (66),(VEX)
4387d: hsubps Vps,Wps (F2),(VEX) | hsubpd Vpd,Wpd (66),(VEX)
4397e: movd/q Ed/q,Pd | movd/q Ed/q,Vdq (66),(VEX),(o128) | movq Vq,Wq (F3),(VEX),(o128)
4407f: movq Qq,Pq | movdqa Wdq,Vdq (66),(VEX) | movdqu Wdq,Vdq (F3),(VEX)
441# 0x0f 0x80-0x8f
44280: JO Jz (f64)
44381: JNO Jz (f64)
44482: JB/JNAE/JC Jz (f64)
44583: JNB/JAE/JNC Jz (f64)
44684: JZ/JE Jz (f64)
44785: JNZ/JNE Jz (f64)
44886: JBE/JNA Jz (f64)
44987: JNBE/JA Jz (f64)
45088: JS Jz (f64)
45189: JNS Jz (f64)
4528a: JP/JPE Jz (f64)
4538b: JNP/JPO Jz (f64)
4548c: JL/JNGE Jz (f64)
4558d: JNL/JGE Jz (f64)
4568e: JLE/JNG Jz (f64)
4578f: JNLE/JG Jz (f64)
458# 0x0f 0x90-0x9f
45990: SETO Eb
46091: SETNO Eb
46192: SETB/C/NAE Eb
46293: SETAE/NB/NC Eb
46394: SETE/Z Eb
46495: SETNE/NZ Eb
46596: SETBE/NA Eb
46697: SETA/NBE Eb
46798: SETS Eb
46899: SETNS Eb
4699a: SETP/PE Eb
4709b: SETNP/PO Eb
4719c: SETL/NGE Eb
4729d: SETNL/GE Eb
4739e: SETLE/NG Eb
4749f: SETNLE/G Eb
475# 0x0f 0xa0-0xaf
476a0: PUSH FS (d64)
477a1: POP FS (d64)
478a2: CPUID
479a3: BT Ev,Gv
480a4: SHLD Ev,Gv,Ib
481a5: SHLD Ev,Gv,CL
482a6: GrpPDLK
483a7: GrpRNG
484a8: PUSH GS (d64)
485a9: POP GS (d64)
486aa: RSM
487ab: BTS Ev,Gv
488ac: SHRD Ev,Gv,Ib
489ad: SHRD Ev,Gv,CL
490ae: Grp15 (1A),(1C)
491af: IMUL Gv,Ev
492# 0x0f 0xb0-0xbf
493b0: CMPXCHG Eb,Gb
494b1: CMPXCHG Ev,Gv
495b2: LSS Gv,Mp
496b3: BTR Ev,Gv
497b4: LFS Gv,Mp
498b5: LGS Gv,Mp
499b6: MOVZX Gv,Eb
500b7: MOVZX Gv,Ew
501b8: JMPE | POPCNT Gv,Ev (F3)
502b9: Grp10 (1A)
503ba: Grp8 Ev,Ib (1A)
504bb: BTC Ev,Gv
505bc: BSF Gv,Ev
506bd: BSR Gv,Ev
507be: MOVSX Gv,Eb
508bf: MOVSX Gv,Ew
509# 0x0f 0xc0-0xcf
510c0: XADD Eb,Gb
511c1: XADD Ev,Gv
512c2: cmpps Vps,Wps,Ib (VEX) | cmpss Vss,Wss,Ib (F3),(VEX),(o128) | cmppd Vpd,Wpd,Ib (66),(VEX) | cmpsd Vsd,Wsd,Ib (F2),(VEX)
513c3: movnti Md/q,Gd/q
514c4: pinsrw Pq,Rd/q/Mw,Ib | pinsrw Vdq,Rd/q/Mw,Ib (66),(VEX),(o128)
515c5: pextrw Gd,Nq,Ib | pextrw Gd,Udq,Ib (66),(VEX),(o128)
516c6: shufps Vps,Wps,Ib (VEX) | shufpd Vpd,Wpd,Ib (66),(VEX)
517c7: Grp9 (1A)
518c8: BSWAP RAX/EAX/R8/R8D
519c9: BSWAP RCX/ECX/R9/R9D
520ca: BSWAP RDX/EDX/R10/R10D
521cb: BSWAP RBX/EBX/R11/R11D
522cc: BSWAP RSP/ESP/R12/R12D
523cd: BSWAP RBP/EBP/R13/R13D
524ce: BSWAP RSI/ESI/R14/R14D
525cf: BSWAP RDI/EDI/R15/R15D
526# 0x0f 0xd0-0xdf
527d0: addsubps Vps,Wps (F2),(VEX) | addsubpd Vpd,Wpd (66),(VEX)
528d1: psrlw Pq,Qq | psrlw Vdq,Wdq (66),(VEX),(o128)
529d2: psrld Pq,Qq | psrld Vdq,Wdq (66),(VEX),(o128)
530d3: psrlq Pq,Qq | psrlq Vdq,Wdq (66),(VEX),(o128)
531d4: paddq Pq,Qq | paddq Vdq,Wdq (66),(VEX),(o128)
532d5: pmullw Pq,Qq | pmullw Vdq,Wdq (66),(VEX),(o128)
533d6: movq Wq,Vq (66),(VEX),(o128) | movq2dq Vdq,Nq (F3) | movdq2q Pq,Uq (F2)
534d7: pmovmskb Gd,Nq | pmovmskb Gd,Udq (66),(VEX),(o128)
535d8: psubusb Pq,Qq | psubusb Vdq,Wdq (66),(VEX),(o128)
536d9: psubusw Pq,Qq | psubusw Vdq,Wdq (66),(VEX),(o128)
537da: pminub Pq,Qq | pminub Vdq,Wdq (66),(VEX),(o128)
538db: pand Pq,Qq | pand Vdq,Wdq (66),(VEX),(o128)
539dc: paddusb Pq,Qq | paddusb Vdq,Wdq (66),(VEX),(o128)
540dd: paddusw Pq,Qq | paddusw Vdq,Wdq (66),(VEX),(o128)
541de: pmaxub Pq,Qq | pmaxub Vdq,Wdq (66),(VEX),(o128)
542df: pandn Pq,Qq | pandn Vdq,Wdq (66),(VEX),(o128)
543# 0x0f 0xe0-0xef
544e0: pavgb Pq,Qq | pavgb Vdq,Wdq (66),(VEX),(o128)
545e1: psraw Pq,Qq | psraw Vdq,Wdq (66),(VEX),(o128)
546e2: psrad Pq,Qq | psrad Vdq,Wdq (66),(VEX),(o128)
547e3: pavgw Pq,Qq | pavgw Vdq,Wdq (66),(VEX),(o128)
548e4: pmulhuw Pq,Qq | pmulhuw Vdq,Wdq (66),(VEX),(o128)
549e5: pmulhw Pq,Qq | pmulhw Vdq,Wdq (66),(VEX),(o128)
550e6: cvtpd2dq Vdq,Wpd (F2),(VEX) | cvttpd2dq Vdq,Wpd (66),(VEX) | cvtdq2pd Vpd,Wdq (F3),(VEX)
551e7: movntq Mq,Pq | movntdq Mdq,Vdq (66),(VEX)
552e8: psubsb Pq,Qq | psubsb Vdq,Wdq (66),(VEX),(o128)
553e9: psubsw Pq,Qq | psubsw Vdq,Wdq (66),(VEX),(o128)
554ea: pminsw Pq,Qq | pminsw Vdq,Wdq (66),(VEX),(o128)
555eb: por Pq,Qq | por Vdq,Wdq (66),(VEX),(o128)
556ec: paddsb Pq,Qq | paddsb Vdq,Wdq (66),(VEX),(o128)
557ed: paddsw Pq,Qq | paddsw Vdq,Wdq (66),(VEX),(o128)
558ee: pmaxsw Pq,Qq | pmaxsw Vdq,Wdq (66),(VEX),(o128)
559ef: pxor Pq,Qq | pxor Vdq,Wdq (66),(VEX),(o128)
560# 0x0f 0xf0-0xff
561f0: lddqu Vdq,Mdq (F2),(VEX)
562f1: psllw Pq,Qq | psllw Vdq,Wdq (66),(VEX),(o128)
563f2: pslld Pq,Qq | pslld Vdq,Wdq (66),(VEX),(o128)
564f3: psllq Pq,Qq | psllq Vdq,Wdq (66),(VEX),(o128)
565f4: pmuludq Pq,Qq | pmuludq Vdq,Wdq (66),(VEX),(o128)
566f5: pmaddwd Pq,Qq | pmaddwd Vdq,Wdq (66),(VEX),(o128)
567f6: psadbw Pq,Qq | psadbw Vdq,Wdq (66),(VEX),(o128)
568f7: maskmovq Pq,Nq | maskmovdqu Vdq,Udq (66),(VEX),(o128)
569f8: psubb Pq,Qq | psubb Vdq,Wdq (66),(VEX),(o128)
570f9: psubw Pq,Qq | psubw Vdq,Wdq (66),(VEX),(o128)
571fa: psubd Pq,Qq | psubd Vdq,Wdq (66),(VEX),(o128)
572fb: psubq Pq,Qq | psubq Vdq,Wdq (66),(VEX),(o128)
573fc: paddb Pq,Qq | paddb Vdq,Wdq (66),(VEX),(o128)
574fd: paddw Pq,Qq | paddw Vdq,Wdq (66),(VEX),(o128)
575fe: paddd Pq,Qq | paddd Vdq,Wdq (66),(VEX),(o128)
576ff:
577EndTable
578
579Table: 3-byte opcode 1 (0x0f 0x38)
580Referrer: 3-byte escape 1
581AVXcode: 2
582# 0x0f 0x38 0x00-0x0f
58300: pshufb Pq,Qq | pshufb Vdq,Wdq (66),(VEX),(o128)
58401: phaddw Pq,Qq | phaddw Vdq,Wdq (66),(VEX),(o128)
58502: phaddd Pq,Qq | phaddd Vdq,Wdq (66),(VEX),(o128)
58603: phaddsw Pq,Qq | phaddsw Vdq,Wdq (66),(VEX),(o128)
58704: pmaddubsw Pq,Qq | pmaddubsw Vdq,Wdq (66),(VEX),(o128)
58805: phsubw Pq,Qq | phsubw Vdq,Wdq (66),(VEX),(o128)
58906: phsubd Pq,Qq | phsubd Vdq,Wdq (66),(VEX),(o128)
59007: phsubsw Pq,Qq | phsubsw Vdq,Wdq (66),(VEX),(o128)
59108: psignb Pq,Qq | psignb Vdq,Wdq (66),(VEX),(o128)
59209: psignw Pq,Qq | psignw Vdq,Wdq (66),(VEX),(o128)
5930a: psignd Pq,Qq | psignd Vdq,Wdq (66),(VEX),(o128)
5940b: pmulhrsw Pq,Qq | pmulhrsw Vdq,Wdq (66),(VEX),(o128)
5950c: vpermilps /r (66),(oVEX)
5960d: vpermilpd /r (66),(oVEX)
5970e: vtestps /r (66),(oVEX)
5980f: vtestpd /r (66),(oVEX)
599# 0x0f 0x38 0x10-0x1f
60010: pblendvb Vdq,Wdq (66)
60111:
60212:
60313:
60414: blendvps Vdq,Wdq (66)
60515: blendvpd Vdq,Wdq (66)
60616:
60717: ptest Vdq,Wdq (66),(VEX)
60818: vbroadcastss /r (66),(oVEX)
60919: vbroadcastsd /r (66),(oVEX),(o256)
6101a: vbroadcastf128 /r (66),(oVEX),(o256)
6111b:
6121c: pabsb Pq,Qq | pabsb Vdq,Wdq (66),(VEX),(o128)
6131d: pabsw Pq,Qq | pabsw Vdq,Wdq (66),(VEX),(o128)
6141e: pabsd Pq,Qq | pabsd Vdq,Wdq (66),(VEX),(o128)
6151f:
616# 0x0f 0x38 0x20-0x2f
61720: pmovsxbw Vdq,Udq/Mq (66),(VEX),(o128)
61821: pmovsxbd Vdq,Udq/Md (66),(VEX),(o128)
61922: pmovsxbq Vdq,Udq/Mw (66),(VEX),(o128)
62023: pmovsxwd Vdq,Udq/Mq (66),(VEX),(o128)
62124: pmovsxwq Vdq,Udq/Md (66),(VEX),(o128)
62225: pmovsxdq Vdq,Udq/Mq (66),(VEX),(o128)
62326:
62427:
62528: pmuldq Vdq,Wdq (66),(VEX),(o128)
62629: pcmpeqq Vdq,Wdq (66),(VEX),(o128)
6272a: movntdqa Vdq,Mdq (66),(VEX),(o128)
6282b: packusdw Vdq,Wdq (66),(VEX),(o128)
6292c: vmaskmovps(ld) /r (66),(oVEX)
6302d: vmaskmovpd(ld) /r (66),(oVEX)
6312e: vmaskmovps(st) /r (66),(oVEX)
6322f: vmaskmovpd(st) /r (66),(oVEX)
633# 0x0f 0x38 0x30-0x3f
63430: pmovzxbw Vdq,Udq/Mq (66),(VEX),(o128)
63531: pmovzxbd Vdq,Udq/Md (66),(VEX),(o128)
63632: pmovzxbq Vdq,Udq/Mw (66),(VEX),(o128)
63733: pmovzxwd Vdq,Udq/Mq (66),(VEX),(o128)
63834: pmovzxwq Vdq,Udq/Md (66),(VEX),(o128)
63935: pmovzxdq Vdq,Udq/Mq (66),(VEX),(o128)
64036:
64137: pcmpgtq Vdq,Wdq (66),(VEX),(o128)
64238: pminsb Vdq,Wdq (66),(VEX),(o128)
64339: pminsd Vdq,Wdq (66),(VEX),(o128)
6443a: pminuw Vdq,Wdq (66),(VEX),(o128)
6453b: pminud Vdq,Wdq (66),(VEX),(o128)
6463c: pmaxsb Vdq,Wdq (66),(VEX),(o128)
6473d: pmaxsd Vdq,Wdq (66),(VEX),(o128)
6483e: pmaxuw Vdq,Wdq (66),(VEX),(o128)
6493f: pmaxud Vdq,Wdq (66),(VEX),(o128)
650# 0x0f 0x38 0x40-0x8f
65140: pmulld Vdq,Wdq (66),(VEX),(o128)
65241: phminposuw Vdq,Wdq (66),(VEX),(o128)
65380: INVEPT Gd/q,Mdq (66)
65481: INVVPID Gd/q,Mdq (66)
655# 0x0f 0x38 0x90-0xbf (FMA)
65696: vfmaddsub132pd/ps /r (66),(VEX)
65797: vfmsubadd132pd/ps /r (66),(VEX)
65898: vfmadd132pd/ps /r (66),(VEX)
65999: vfmadd132sd/ss /r (66),(VEX),(o128)
6609a: vfmsub132pd/ps /r (66),(VEX)
6619b: vfmsub132sd/ss /r (66),(VEX),(o128)
6629c: vfnmadd132pd/ps /r (66),(VEX)
6639d: vfnmadd132sd/ss /r (66),(VEX),(o128)
6649e: vfnmsub132pd/ps /r (66),(VEX)
6659f: vfnmsub132sd/ss /r (66),(VEX),(o128)
666a6: vfmaddsub213pd/ps /r (66),(VEX)
667a7: vfmsubadd213pd/ps /r (66),(VEX)
668a8: vfmadd213pd/ps /r (66),(VEX)
669a9: vfmadd213sd/ss /r (66),(VEX),(o128)
670aa: vfmsub213pd/ps /r (66),(VEX)
671ab: vfmsub213sd/ss /r (66),(VEX),(o128)
672ac: vfnmadd213pd/ps /r (66),(VEX)
673ad: vfnmadd213sd/ss /r (66),(VEX),(o128)
674ae: vfnmsub213pd/ps /r (66),(VEX)
675af: vfnmsub213sd/ss /r (66),(VEX),(o128)
676b6: vfmaddsub231pd/ps /r (66),(VEX)
677b7: vfmsubadd231pd/ps /r (66),(VEX)
678b8: vfmadd231pd/ps /r (66),(VEX)
679b9: vfmadd231sd/ss /r (66),(VEX),(o128)
680ba: vfmsub231pd/ps /r (66),(VEX)
681bb: vfmsub231sd/ss /r (66),(VEX),(o128)
682bc: vfnmadd231pd/ps /r (66),(VEX)
683bd: vfnmadd231sd/ss /r (66),(VEX),(o128)
684be: vfnmsub231pd/ps /r (66),(VEX)
685bf: vfnmsub231sd/ss /r (66),(VEX),(o128)
686# 0x0f 0x38 0xc0-0xff
687db: aesimc Vdq,Wdq (66),(VEX),(o128)
688dc: aesenc Vdq,Wdq (66),(VEX),(o128)
689dd: aesenclast Vdq,Wdq (66),(VEX),(o128)
690de: aesdec Vdq,Wdq (66),(VEX),(o128)
691df: aesdeclast Vdq,Wdq (66),(VEX),(o128)
692f0: MOVBE Gv,Mv | CRC32 Gd,Eb (F2)
693f1: MOVBE Mv,Gv | CRC32 Gd,Ev (F2)
694EndTable
695
696Table: 3-byte opcode 2 (0x0f 0x3a)
697Referrer: 3-byte escape 2
698AVXcode: 3
699# 0x0f 0x3a 0x00-0xff
70004: vpermilps /r,Ib (66),(oVEX)
70105: vpermilpd /r,Ib (66),(oVEX)
70206: vperm2f128 /r,Ib (66),(oVEX),(o256)
70308: roundps Vdq,Wdq,Ib (66),(VEX)
70409: roundpd Vdq,Wdq,Ib (66),(VEX)
7050a: roundss Vss,Wss,Ib (66),(VEX),(o128)
7060b: roundsd Vsd,Wsd,Ib (66),(VEX),(o128)
7070c: blendps Vdq,Wdq,Ib (66),(VEX)
7080d: blendpd Vdq,Wdq,Ib (66),(VEX)
7090e: pblendw Vdq,Wdq,Ib (66),(VEX),(o128)
7100f: palignr Pq,Qq,Ib | palignr Vdq,Wdq,Ib (66),(VEX),(o128)
71114: pextrb Rd/Mb,Vdq,Ib (66),(VEX),(o128)
71215: pextrw Rd/Mw,Vdq,Ib (66),(VEX),(o128)
71316: pextrd/pextrq Ed/q,Vdq,Ib (66),(VEX),(o128)
71417: extractps Ed,Vdq,Ib (66),(VEX),(o128)
71518: vinsertf128 /r,Ib (66),(oVEX),(o256)
71619: vextractf128 /r,Ib (66),(oVEX),(o256)
71720: pinsrb Vdq,Rd/q/Mb,Ib (66),(VEX),(o128)
71821: insertps Vdq,Udq/Md,Ib (66),(VEX),(o128)
71922: pinsrd/pinsrq Vdq,Ed/q,Ib (66),(VEX),(o128)
72040: dpps Vdq,Wdq,Ib (66),(VEX)
72141: dppd Vdq,Wdq,Ib (66),(VEX),(o128)
72242: mpsadbw Vdq,Wdq,Ib (66),(VEX),(o128)
72344: pclmulqdq Vdq,Wdq,Ib (66),(VEX),(o128)
7244a: vblendvps /r,Ib (66),(oVEX)
7254b: vblendvpd /r,Ib (66),(oVEX)
7264c: vpblendvb /r,Ib (66),(oVEX),(o128)
72760: pcmpestrm Vdq,Wdq,Ib (66),(VEX),(o128)
72861: pcmpestri Vdq,Wdq,Ib (66),(VEX),(o128)
72962: pcmpistrm Vdq,Wdq,Ib (66),(VEX),(o128)
73063: pcmpistri Vdq,Wdq,Ib (66),(VEX),(o128)
731df: aeskeygenassist Vdq,Wdq,Ib (66),(VEX),(o128)
732EndTable
733
734GrpTable: Grp1
7350: ADD
7361: OR
7372: ADC
7383: SBB
7394: AND
7405: SUB
7416: XOR
7427: CMP
743EndTable
744
745GrpTable: Grp1A
7460: POP
747EndTable
748
749GrpTable: Grp2
7500: ROL
7511: ROR
7522: RCL
7533: RCR
7544: SHL/SAL
7555: SHR
7566:
7577: SAR
758EndTable
759
760GrpTable: Grp3_1
7610: TEST Eb,Ib
7621:
7632: NOT Eb
7643: NEG Eb
7654: MUL AL,Eb
7665: IMUL AL,Eb
7676: DIV AL,Eb
7687: IDIV AL,Eb
769EndTable
770
771GrpTable: Grp3_2
7720: TEST Ev,Iz
7731:
7742: NOT Ev
7753: NEG Ev
7764: MUL rAX,Ev
7775: IMUL rAX,Ev
7786: DIV rAX,Ev
7797: IDIV rAX,Ev
780EndTable
781
782GrpTable: Grp4
7830: INC Eb
7841: DEC Eb
785EndTable
786
787GrpTable: Grp5
7880: INC Ev
7891: DEC Ev
7902: CALLN Ev (f64)
7913: CALLF Ep
7924: JMPN Ev (f64)
7935: JMPF Ep
7946: PUSH Ev (d64)
7957:
796EndTable
797
798GrpTable: Grp6
7990: SLDT Rv/Mw
8001: STR Rv/Mw
8012: LLDT Ew
8023: LTR Ew
8034: VERR Ew
8045: VERW Ew
805EndTable
806
807GrpTable: Grp7
8080: SGDT Ms | VMCALL (001),(11B) | VMLAUNCH (010),(11B) | VMRESUME (011),(11B) | VMXOFF (100),(11B)
8091: SIDT Ms | MONITOR (000),(11B) | MWAIT (001)
8102: LGDT Ms | XGETBV (000),(11B) | XSETBV (001),(11B)
8113: LIDT Ms
8124: SMSW Mw/Rv
8135:
8146: LMSW Ew
8157: INVLPG Mb | SWAPGS (o64),(000),(11B) | RDTSCP (001),(11B)
816EndTable
817
818GrpTable: Grp8
8194: BT
8205: BTS
8216: BTR
8227: BTC
823EndTable
824
825GrpTable: Grp9
8261: CMPXCHG8B/16B Mq/Mdq
8276: VMPTRLD Mq | VMCLEAR Mq (66) | VMXON Mq (F3)
8287: VMPTRST Mq
829EndTable
830
831GrpTable: Grp10
832EndTable
833
834GrpTable: Grp11
8350: MOV
836EndTable
837
838GrpTable: Grp12
8392: psrlw Nq,Ib (11B) | psrlw Udq,Ib (66),(11B),(VEX),(o128)
8404: psraw Nq,Ib (11B) | psraw Udq,Ib (66),(11B),(VEX),(o128)
8416: psllw Nq,Ib (11B) | psllw Udq,Ib (66),(11B),(VEX),(o128)
842EndTable
843
844GrpTable: Grp13
8452: psrld Nq,Ib (11B) | psrld Udq,Ib (66),(11B),(VEX),(o128)
8464: psrad Nq,Ib (11B) | psrad Udq,Ib (66),(11B),(VEX),(o128)
8476: pslld Nq,Ib (11B) | pslld Udq,Ib (66),(11B),(VEX),(o128)
848EndTable
849
850GrpTable: Grp14
8512: psrlq Nq,Ib (11B) | psrlq Udq,Ib (66),(11B),(VEX),(o128)
8523: psrldq Udq,Ib (66),(11B),(VEX),(o128)
8536: psllq Nq,Ib (11B) | psllq Udq,Ib (66),(11B),(VEX),(o128)
8547: pslldq Udq,Ib (66),(11B),(VEX),(o128)
855EndTable
856
857GrpTable: Grp15
8580: fxsave
8591: fxrstor
8602: ldmxcsr (VEX)
8613: stmxcsr (VEX)
8624: XSAVE
8635: XRSTOR | lfence (11B)
8646: mfence (11B)
8657: clflush | sfence (11B)
866EndTable
867
868GrpTable: Grp16
8690: prefetch NTA
8701: prefetch T0
8712: prefetch T1
8723: prefetch T2
873EndTable
874
875# AMD's Prefetch Group
876GrpTable: GrpP
8770: PREFETCH
8781: PREFETCHW
879EndTable
880
881GrpTable: GrpPDLK
8820: MONTMUL
8831: XSHA1
8842: XSHA2
885EndTable
886
887GrpTable: GrpRNG
8880: xstore-rng
8891: xcrypt-ecb
8902: xcrypt-cbc
8914: xcrypt-cfb
8925: xcrypt-ofb
893EndTable
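The header of this new x86-opcode-map.txt documents a simple line format: "opcode: mnemonic [operands] [(notes)]", with "|" separating alternative decodings and GrpTable/EndTable blocks keyed on the ModRM reg field. The kernel turns this table into decoder attribute tables at build time with an awk generator that is not part of this hunk; purely as a hedged illustration of the line format itself, the standalone C sketch below splits a few sample entries (copied from the table above) into their opcode and instruction fields. The helper and file name are made up for this example.

/* toy_opcode_map.c: split "opcode: mnemonic operands" entries apart. */
#include <stdio.h>
#include <string.h>

/*
 * Split one map entry into the opcode byte string and the instruction
 * part.  Comment lines ('#') and reserved/empty slots such as "d6:"
 * make the function return 0.
 */
static int parse_map_entry(const char *line, char *op, size_t oplen,
			   char *insn, size_t insnlen)
{
	const char *colon;

	if (line[0] == '#' || line[0] == '\0')
		return 0;
	colon = strchr(line, ':');
	if (!colon)
		return 0;
	snprintf(op, oplen, "%.*s", (int)(colon - line), line);
	colon++;
	while (*colon == ' ')
		colon++;
	if (*colon == '\0')
		return 0;	/* opcode slot is reserved/unused */
	snprintf(insn, insnlen, "%s", colon);
	return 1;
}

int main(void)
{
	/* sample lines taken from the one-byte table above */
	const char *samples[] = {
		"00: ADD Eb,Gb",
		"d6:",
		"90: NOP | PAUSE (F3) | XCHG r8,rAX",
	};
	char op[8], insn[128];
	unsigned int i;

	for (i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		if (parse_map_entry(samples[i], op, sizeof(op), insn, sizeof(insn)))
			printf("opcode %-4s -> %s\n", op, insn);
	return 0;
}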
diff --git a/arch/x86/mm/extable.c b/arch/x86/mm/extable.c
index 61b41ca3b5a2..d0474ad2a6e5 100644
--- a/arch/x86/mm/extable.c
+++ b/arch/x86/mm/extable.c
@@ -35,34 +35,3 @@ int fixup_exception(struct pt_regs *regs)
35 35
36 return 0; 36 return 0;
37} 37}
38
39#ifdef CONFIG_X86_64
40/*
41 * Need to defined our own search_extable on X86_64 to work around
42 * a B stepping K8 bug.
43 */
44const struct exception_table_entry *
45search_extable(const struct exception_table_entry *first,
46 const struct exception_table_entry *last,
47 unsigned long value)
48{
49 /* B stepping K8 bug */
50 if ((value >> 32) == 0)
51 value |= 0xffffffffUL << 32;
52
53 while (first <= last) {
54 const struct exception_table_entry *mid;
55 long diff;
56
57 mid = (last - first) / 2 + first;
58 diff = mid->insn - value;
59 if (diff == 0)
60 return mid;
61 else if (diff < 0)
62 first = mid+1;
63 else
64 last = mid-1;
65 }
66 return NULL;
67}
68#endif
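The block removed above worked around an AMD K8 B-stepping erratum by forcing the upper 32 bits of the reported exception address on whenever they were clear, before running the binary search over the exception table. A minimal standalone sketch of that bit manipulation only (the address is made up, and unsigned long long stands in for the kernel's 64-bit unsigned long on x86_64):

/* toy_k8_extable_fixup.c: what the removed sign-extension did. */
#include <stdio.h>

int main(void)
{
	/* made-up exception address whose upper 32 bits happen to be clear */
	unsigned long long value = 0x00000000c0100000ULL;

	/* the removed workaround set the upper 32 bits before the search */
	if ((value >> 32) == 0)
		value |= 0xffffffffULL << 32;

	printf("value used for the extable lookup: %#llx\n", value);
	return 0;
}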
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index f4cee9028cf0..f62777940dfb 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -38,7 +38,8 @@ enum x86_pf_error_code {
38 * Returns 0 if mmiotrace is disabled, or if the fault is not 38 * Returns 0 if mmiotrace is disabled, or if the fault is not
39 * handled by mmiotrace: 39 * handled by mmiotrace:
40 */ 40 */
41static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr) 41static inline int __kprobes
42kmmio_fault(struct pt_regs *regs, unsigned long addr)
42{ 43{
43 if (unlikely(is_kmmio_active())) 44 if (unlikely(is_kmmio_active()))
44 if (kmmio_handler(regs, addr) == 1) 45 if (kmmio_handler(regs, addr) == 1)
@@ -46,7 +47,7 @@ static inline int kmmio_fault(struct pt_regs *regs, unsigned long addr)
46 return 0; 47 return 0;
47} 48}
48 49
49static inline int notify_page_fault(struct pt_regs *regs) 50static inline int __kprobes notify_page_fault(struct pt_regs *regs)
50{ 51{
51 int ret = 0; 52 int ret = 0;
52 53
@@ -240,7 +241,7 @@ void vmalloc_sync_all(void)
240 * 241 *
241 * Handle a fault on the vmalloc or module mapping area 242 * Handle a fault on the vmalloc or module mapping area
242 */ 243 */
243static noinline int vmalloc_fault(unsigned long address) 244static noinline __kprobes int vmalloc_fault(unsigned long address)
244{ 245{
245 unsigned long pgd_paddr; 246 unsigned long pgd_paddr;
246 pmd_t *pmd_k; 247 pmd_t *pmd_k;
@@ -357,7 +358,7 @@ void vmalloc_sync_all(void)
357 * 358 *
358 * This assumes no large pages in there. 359 * This assumes no large pages in there.
359 */ 360 */
360static noinline int vmalloc_fault(unsigned long address) 361static noinline __kprobes int vmalloc_fault(unsigned long address)
361{ 362{
362 pgd_t *pgd, *pgd_ref; 363 pgd_t *pgd, *pgd_ref;
363 pud_t *pud, *pud_ref; 364 pud_t *pud, *pud_ref;
@@ -658,7 +659,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
658 show_fault_oops(regs, error_code, address); 659 show_fault_oops(regs, error_code, address);
659 660
660 stackend = end_of_stack(tsk); 661 stackend = end_of_stack(tsk);
661 if (*stackend != STACK_END_MAGIC) 662 if (tsk != &init_task && *stackend != STACK_END_MAGIC)
662 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n"); 663 printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
663 664
664 tsk->thread.cr2 = address; 665 tsk->thread.cr2 = address;
@@ -860,7 +861,7 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
860 * There are no security implications to leaving a stale TLB when 861 * There are no security implications to leaving a stale TLB when
861 * increasing the permissions on a page. 862 * increasing the permissions on a page.
862 */ 863 */
863static noinline int 864static noinline __kprobes int
864spurious_fault(unsigned long error_code, unsigned long address) 865spurious_fault(unsigned long error_code, unsigned long address)
865{ 866{
866 pgd_t *pgd; 867 pgd_t *pgd;
diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c
index 71da1bca13cb..738e6593799d 100644
--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -18,7 +18,7 @@ static inline pte_t gup_get_pte(pte_t *ptep)
18#else 18#else
19 /* 19 /*
20 * With get_user_pages_fast, we walk down the pagetables without taking 20 * With get_user_pages_fast, we walk down the pagetables without taking
21 * any locks. For this we would like to load the pointers atoimcally, 21 * any locks. For this we would like to load the pointers atomically,
22 * but that is not possible (without expensive cmpxchg8b) on PAE. What 22 * but that is not possible (without expensive cmpxchg8b) on PAE. What
23 * we do have is the guarantee that a pte will only either go from not 23 * we do have is the guarantee that a pte will only either go from not
24 * present to present, or present to not present or both -- it will not 24 * present to present, or present to not present or both -- it will not
diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c
index f46c340727b8..069ce7c37c01 100644
--- a/arch/x86/mm/hugetlbpage.c
+++ b/arch/x86/mm/hugetlbpage.c
@@ -9,7 +9,6 @@
9#include <linux/mm.h> 9#include <linux/mm.h>
10#include <linux/hugetlb.h> 10#include <linux/hugetlb.h>
11#include <linux/pagemap.h> 11#include <linux/pagemap.h>
12#include <linux/slab.h>
13#include <linux/err.h> 12#include <linux/err.h>
14#include <linux/sysctl.h> 13#include <linux/sysctl.h>
15#include <asm/mman.h> 14#include <asm/mman.h>
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index 73ffd5536f62..b278535b14aa 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -1,3 +1,4 @@
1#include <linux/gfp.h>
1#include <linux/initrd.h> 2#include <linux/initrd.h>
2#include <linux/ioport.h> 3#include <linux/ioport.h>
3#include <linux/swap.h> 4#include <linux/swap.h>
@@ -146,10 +147,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
146 use_gbpages = direct_gbpages; 147 use_gbpages = direct_gbpages;
147#endif 148#endif
148 149
149 set_nx();
150 if (nx_enabled)
151 printk(KERN_INFO "NX (Execute Disable) protection: active\n");
152
153 /* Enable PSE if available */ 150 /* Enable PSE if available */
154 if (cpu_has_pse) 151 if (cpu_has_pse)
155 set_in_cr4(X86_CR4_PSE); 152 set_in_cr4(X86_CR4_PSE);
@@ -270,16 +267,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start,
270 if (!after_bootmem) 267 if (!after_bootmem)
271 find_early_table_space(end, use_pse, use_gbpages); 268 find_early_table_space(end, use_pse, use_gbpages);
272 269
273#ifdef CONFIG_X86_32
274 for (i = 0; i < nr_range; i++)
275 kernel_physical_mapping_init(mr[i].start, mr[i].end,
276 mr[i].page_size_mask);
277 ret = end;
278#else /* CONFIG_X86_64 */
279 for (i = 0; i < nr_range; i++) 270 for (i = 0; i < nr_range; i++)
280 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end, 271 ret = kernel_physical_mapping_init(mr[i].start, mr[i].end,
281 mr[i].page_size_mask); 272 mr[i].page_size_mask);
282#endif
283 273
284#ifdef CONFIG_X86_32 274#ifdef CONFIG_X86_32
285 early_ioremap_page_table_range_init(); 275 early_ioremap_page_table_range_init();
@@ -342,11 +332,23 @@ int devmem_is_allowed(unsigned long pagenr)
342 332
343void free_init_pages(char *what, unsigned long begin, unsigned long end) 333void free_init_pages(char *what, unsigned long begin, unsigned long end)
344{ 334{
345 unsigned long addr = begin; 335 unsigned long addr;
336 unsigned long begin_aligned, end_aligned;
337
338 /* Make sure boundaries are page aligned */
339 begin_aligned = PAGE_ALIGN(begin);
340 end_aligned = end & PAGE_MASK;
346 341
347 if (addr >= end) 342 if (WARN_ON(begin_aligned != begin || end_aligned != end)) {
343 begin = begin_aligned;
344 end = end_aligned;
345 }
346
347 if (begin >= end)
348 return; 348 return;
349 349
350 addr = begin;
351
350 /* 352 /*
351 * If debugging page accesses then do not free this memory but 353 * If debugging page accesses then do not free this memory but
352 * mark them not present - any buggy init-section access will 354 * mark them not present - any buggy init-section access will
@@ -354,7 +356,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
354 */ 356 */
355#ifdef CONFIG_DEBUG_PAGEALLOC 357#ifdef CONFIG_DEBUG_PAGEALLOC
356 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n", 358 printk(KERN_INFO "debug: unmapping init memory %08lx..%08lx\n",
357 begin, PAGE_ALIGN(end)); 359 begin, end);
358 set_memory_np(begin, (end - begin) >> PAGE_SHIFT); 360 set_memory_np(begin, (end - begin) >> PAGE_SHIFT);
359#else 361#else
360 /* 362 /*
@@ -369,8 +371,7 @@ void free_init_pages(char *what, unsigned long begin, unsigned long end)
369 for (; addr < end; addr += PAGE_SIZE) { 371 for (; addr < end; addr += PAGE_SIZE) {
370 ClearPageReserved(virt_to_page(addr)); 372 ClearPageReserved(virt_to_page(addr));
371 init_page_count(virt_to_page(addr)); 373 init_page_count(virt_to_page(addr));
372 memset((void *)(addr & ~(PAGE_SIZE-1)), 374 memset((void *)addr, POISON_FREE_INITMEM, PAGE_SIZE);
373 POISON_FREE_INITMEM, PAGE_SIZE);
374 free_page(addr); 375 free_page(addr);
375 totalram_pages++; 376 totalram_pages++;
376 } 377 }
@@ -387,6 +388,15 @@ void free_initmem(void)
387#ifdef CONFIG_BLK_DEV_INITRD 388#ifdef CONFIG_BLK_DEV_INITRD
388void free_initrd_mem(unsigned long start, unsigned long end) 389void free_initrd_mem(unsigned long start, unsigned long end)
389{ 390{
390 free_init_pages("initrd memory", start, end); 391 /*
 392	 * end could be unaligned, and we cannot align it here:
 393	 * the decompressor would be confused by an aligned initrd_end.
 394	 * The trailing partial page has already been reserved in
 395	 * - i386_start_kernel()
 396	 * - x86_64_start_kernel()
 397	 * - relocate_initrd()
 398	 * so it is safe to PAGE_ALIGN() end here and free that partial page too.
399 */
400 free_init_pages("initrd memory", start, PAGE_ALIGN(end));
391} 401}
392#endif 402#endif
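The new comment in free_initrd_mem() above argues that rounding end up with PAGE_ALIGN() is safe because the trailing partial page was already reserved during early boot. A small standalone sketch of the arithmetic, using a made-up end address and a 4 KiB page size; the macros are reimplemented here only for illustration and approximate the kernel's definitions:

/* toy_page_align.c: why PAGE_ALIGN(end) frees the initrd's trailing partial page. */
#include <stdio.h>

#define PAGE_SIZE	4096UL
#define PAGE_MASK	(~(PAGE_SIZE - 1))
#define PAGE_ALIGN(addr)	(((addr) + PAGE_SIZE - 1) & PAGE_MASK)

int main(void)
{
	unsigned long end = 0x12345678UL;	/* made-up, unaligned initrd end */

	/* rounding up includes the already-reserved partial page in the free range */
	printf("end=%#lx PAGE_ALIGN(end)=%#lx\n", end, PAGE_ALIGN(end));
	return 0;
}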
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 30938c1d8d5d..bca79091b9d6 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -25,11 +25,11 @@
25#include <linux/pfn.h> 25#include <linux/pfn.h>
26#include <linux/poison.h> 26#include <linux/poison.h>
27#include <linux/bootmem.h> 27#include <linux/bootmem.h>
28#include <linux/slab.h>
29#include <linux/proc_fs.h> 28#include <linux/proc_fs.h>
30#include <linux/memory_hotplug.h> 29#include <linux/memory_hotplug.h>
31#include <linux/initrd.h> 30#include <linux/initrd.h>
32#include <linux/cpumask.h> 31#include <linux/cpumask.h>
32#include <linux/gfp.h>
33 33
34#include <asm/asm.h> 34#include <asm/asm.h>
35#include <asm/bios_ebda.h> 35#include <asm/bios_ebda.h>
@@ -241,6 +241,7 @@ kernel_physical_mapping_init(unsigned long start,
241 unsigned long page_size_mask) 241 unsigned long page_size_mask)
242{ 242{
243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M); 243 int use_pse = page_size_mask == (1<<PG_LEVEL_2M);
244 unsigned long last_map_addr = end;
244 unsigned long start_pfn, end_pfn; 245 unsigned long start_pfn, end_pfn;
245 pgd_t *pgd_base = swapper_pg_dir; 246 pgd_t *pgd_base = swapper_pg_dir;
246 int pgd_idx, pmd_idx, pte_ofs; 247 int pgd_idx, pmd_idx, pte_ofs;
@@ -341,9 +342,10 @@ repeat:
341 prot = PAGE_KERNEL_EXEC; 342 prot = PAGE_KERNEL_EXEC;
342 343
343 pages_4k++; 344 pages_4k++;
344 if (mapping_iter == 1) 345 if (mapping_iter == 1) {
345 set_pte(pte, pfn_pte(pfn, init_prot)); 346 set_pte(pte, pfn_pte(pfn, init_prot));
346 else 347 last_map_addr = (pfn << PAGE_SHIFT) + PAGE_SIZE;
348 } else
347 set_pte(pte, pfn_pte(pfn, prot)); 349 set_pte(pte, pfn_pte(pfn, prot));
348 } 350 }
349 } 351 }
@@ -368,7 +370,7 @@ repeat:
368 mapping_iter = 2; 370 mapping_iter = 2;
369 goto repeat; 371 goto repeat;
370 } 372 }
371 return 0; 373 return last_map_addr;
372} 374}
373 375
374pte_t *kmap_pte; 376pte_t *kmap_pte;
@@ -412,7 +414,7 @@ static void __init permanent_kmaps_init(pgd_t *pgd_base)
412 pkmap_page_table = pte; 414 pkmap_page_table = pte;
413} 415}
414 416
415static void __init add_one_highpage_init(struct page *page, int pfn) 417static void __init add_one_highpage_init(struct page *page)
416{ 418{
417 ClearPageReserved(page); 419 ClearPageReserved(page);
418 init_page_count(page); 420 init_page_count(page);
@@ -445,7 +447,7 @@ static int __init add_highpages_work_fn(unsigned long start_pfn,
445 if (!pfn_valid(node_pfn)) 447 if (!pfn_valid(node_pfn))
446 continue; 448 continue;
447 page = pfn_to_page(node_pfn); 449 page = pfn_to_page(node_pfn);
448 add_one_highpage_init(page, node_pfn); 450 add_one_highpage_init(page);
449 } 451 }
450 452
451 return 0; 453 return 0;
@@ -703,8 +705,8 @@ void __init find_low_pfn_range(void)
703} 705}
704 706
705#ifndef CONFIG_NEED_MULTIPLE_NODES 707#ifndef CONFIG_NEED_MULTIPLE_NODES
706void __init initmem_init(unsigned long start_pfn, 708void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
707 unsigned long end_pfn) 709 int acpi, int k8)
708{ 710{
709#ifdef CONFIG_HIGHMEM 711#ifdef CONFIG_HIGHMEM
710 highstart_pfn = highend_pfn = max_pfn; 712 highstart_pfn = highend_pfn = max_pfn;
@@ -748,6 +750,7 @@ static void __init zone_sizes_init(void)
748 free_area_init_nodes(max_zone_pfns); 750 free_area_init_nodes(max_zone_pfns);
749} 751}
750 752
753#ifndef CONFIG_NO_BOOTMEM
751static unsigned long __init setup_node_bootmem(int nodeid, 754static unsigned long __init setup_node_bootmem(int nodeid,
752 unsigned long start_pfn, 755 unsigned long start_pfn,
753 unsigned long end_pfn, 756 unsigned long end_pfn,
@@ -764,13 +767,14 @@ static unsigned long __init setup_node_bootmem(int nodeid,
764 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n", 767 printk(KERN_INFO " node %d bootmap %08lx - %08lx\n",
765 nodeid, bootmap, bootmap + bootmap_size); 768 nodeid, bootmap, bootmap + bootmap_size);
766 free_bootmem_with_active_regions(nodeid, end_pfn); 769 free_bootmem_with_active_regions(nodeid, end_pfn);
767 early_res_to_bootmem(start_pfn<<PAGE_SHIFT, end_pfn<<PAGE_SHIFT);
768 770
769 return bootmap + bootmap_size; 771 return bootmap + bootmap_size;
770} 772}
773#endif
771 774
772void __init setup_bootmem_allocator(void) 775void __init setup_bootmem_allocator(void)
773{ 776{
777#ifndef CONFIG_NO_BOOTMEM
774 int nodeid; 778 int nodeid;
775 unsigned long bootmap_size, bootmap; 779 unsigned long bootmap_size, bootmap;
776 /* 780 /*
@@ -782,11 +786,13 @@ void __init setup_bootmem_allocator(void)
782 if (bootmap == -1L) 786 if (bootmap == -1L)
783 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 787 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
784 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP"); 788 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
789#endif
785 790
786 printk(KERN_INFO " mapped low ram: 0 - %08lx\n", 791 printk(KERN_INFO " mapped low ram: 0 - %08lx\n",
787 max_pfn_mapped<<PAGE_SHIFT); 792 max_pfn_mapped<<PAGE_SHIFT);
788 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT); 793 printk(KERN_INFO " low ram: 0 - %08lx\n", max_low_pfn<<PAGE_SHIFT);
789 794
795#ifndef CONFIG_NO_BOOTMEM
790 for_each_online_node(nodeid) { 796 for_each_online_node(nodeid) {
791 unsigned long start_pfn, end_pfn; 797 unsigned long start_pfn, end_pfn;
792 798
@@ -804,6 +810,7 @@ void __init setup_bootmem_allocator(void)
804 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn, 810 bootmap = setup_node_bootmem(nodeid, start_pfn, end_pfn,
805 bootmap); 811 bootmap);
806 } 812 }
813#endif
807 814
808 after_bootmem = 1; 815 after_bootmem = 1;
809} 816}
@@ -892,8 +899,7 @@ void __init mem_init(void)
892 reservedpages << (PAGE_SHIFT-10), 899 reservedpages << (PAGE_SHIFT-10),
893 datasize >> 10, 900 datasize >> 10,
894 initsize >> 10, 901 initsize >> 10,
895 (unsigned long) (totalhigh_pages << (PAGE_SHIFT-10)) 902 totalhigh_pages << (PAGE_SHIFT-10));
896 );
897 903
898 printk(KERN_INFO "virtual kernel memory layout:\n" 904 printk(KERN_INFO "virtual kernel memory layout:\n"
899 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" 905 " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n"
@@ -997,7 +1003,7 @@ static noinline int do_test_wp_bit(void)
997const int rodata_test_data = 0xC3; 1003const int rodata_test_data = 0xC3;
998EXPORT_SYMBOL_GPL(rodata_test_data); 1004EXPORT_SYMBOL_GPL(rodata_test_data);
999 1005
1000static int kernel_set_to_readonly; 1006int kernel_set_to_readonly __read_mostly;
1001 1007
1002void set_kernel_text_rw(void) 1008void set_kernel_text_rw(void)
1003{ 1009{
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5a4398a6006b..ee41bba315d1 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -29,6 +29,7 @@
29#include <linux/module.h> 29#include <linux/module.h>
30#include <linux/memory_hotplug.h> 30#include <linux/memory_hotplug.h>
31#include <linux/nmi.h> 31#include <linux/nmi.h>
32#include <linux/gfp.h>
32 33
33#include <asm/processor.h> 34#include <asm/processor.h>
34#include <asm/bios_ebda.h> 35#include <asm/bios_ebda.h>
@@ -49,6 +50,7 @@
49#include <asm/numa.h> 50#include <asm/numa.h>
50#include <asm/cacheflush.h> 51#include <asm/cacheflush.h>
51#include <asm/init.h> 52#include <asm/init.h>
53#include <linux/bootmem.h>
52 54
53static unsigned long dma_reserve __initdata; 55static unsigned long dma_reserve __initdata;
54 56
@@ -568,8 +570,10 @@ kernel_physical_mapping_init(unsigned long start,
568} 570}
569 571
570#ifndef CONFIG_NUMA 572#ifndef CONFIG_NUMA
571void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) 573void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
574 int acpi, int k8)
572{ 575{
576#ifndef CONFIG_NO_BOOTMEM
573 unsigned long bootmap_size, bootmap; 577 unsigned long bootmap_size, bootmap;
574 578
575 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT; 579 bootmap_size = bootmem_bootmap_pages(end_pfn)<<PAGE_SHIFT;
@@ -577,13 +581,15 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn)
577 PAGE_SIZE); 581 PAGE_SIZE);
578 if (bootmap == -1L) 582 if (bootmap == -1L)
579 panic("Cannot find bootmem map of size %ld\n", bootmap_size); 583 panic("Cannot find bootmem map of size %ld\n", bootmap_size);
584 reserve_early(bootmap, bootmap + bootmap_size, "BOOTMAP");
580 /* don't touch min_low_pfn */ 585 /* don't touch min_low_pfn */
581 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT, 586 bootmap_size = init_bootmem_node(NODE_DATA(0), bootmap >> PAGE_SHIFT,
582 0, end_pfn); 587 0, end_pfn);
583 e820_register_active_regions(0, start_pfn, end_pfn); 588 e820_register_active_regions(0, start_pfn, end_pfn);
584 free_bootmem_with_active_regions(0, end_pfn); 589 free_bootmem_with_active_regions(0, end_pfn);
585 early_res_to_bootmem(0, end_pfn<<PAGE_SHIFT); 590#else
586 reserve_bootmem(bootmap, bootmap_size, BOOTMEM_DEFAULT); 591 e820_register_active_regions(0, start_pfn, end_pfn);
592#endif
587} 593}
588#endif 594#endif
589 595
@@ -615,6 +621,21 @@ void __init paging_init(void)
615 */ 621 */
616#ifdef CONFIG_MEMORY_HOTPLUG 622#ifdef CONFIG_MEMORY_HOTPLUG
617/* 623/*
624 * After memory hotplug the variables max_pfn, max_low_pfn and high_memory need
625 * updating.
626 */
627static void update_end_of_memory_vars(u64 start, u64 size)
628{
629 unsigned long end_pfn = PFN_UP(start + size);
630
631 if (end_pfn > max_pfn) {
632 max_pfn = end_pfn;
633 max_low_pfn = end_pfn;
634 high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1;
635 }
636}
637
638/*
618 * Memory is added always to NORMAL zone. This means you will never get 639 * Memory is added always to NORMAL zone. This means you will never get
619 * additional DMA/DMA32 memory. 640 * additional DMA/DMA32 memory.
620 */ 641 */
@@ -633,6 +654,9 @@ int arch_add_memory(int nid, u64 start, u64 size)
633 ret = __add_pages(nid, zone, start_pfn, nr_pages); 654 ret = __add_pages(nid, zone, start_pfn, nr_pages);
634 WARN_ON_ONCE(ret); 655 WARN_ON_ONCE(ret);
635 656
657 /* update max_pfn, max_low_pfn and high_memory */
658 update_end_of_memory_vars(start, size);
659
636 return ret; 660 return ret;
637} 661}
638EXPORT_SYMBOL_GPL(arch_add_memory); 662EXPORT_SYMBOL_GPL(arch_add_memory);
@@ -694,12 +718,12 @@ void __init mem_init(void)
694const int rodata_test_data = 0xC3; 718const int rodata_test_data = 0xC3;
695EXPORT_SYMBOL_GPL(rodata_test_data); 719EXPORT_SYMBOL_GPL(rodata_test_data);
696 720
697static int kernel_set_to_readonly; 721int kernel_set_to_readonly;
698 722
699void set_kernel_text_rw(void) 723void set_kernel_text_rw(void)
700{ 724{
701 unsigned long start = PFN_ALIGN(_stext); 725 unsigned long start = PFN_ALIGN(_text);
702 unsigned long end = PFN_ALIGN(__start_rodata); 726 unsigned long end = PFN_ALIGN(__stop___ex_table);
703 727
704 if (!kernel_set_to_readonly) 728 if (!kernel_set_to_readonly)
705 return; 729 return;
@@ -707,13 +731,18 @@ void set_kernel_text_rw(void)
707 pr_debug("Set kernel text: %lx - %lx for read write\n", 731 pr_debug("Set kernel text: %lx - %lx for read write\n",
708 start, end); 732 start, end);
709 733
734 /*
735 * Make the kernel identity mapping for text RW. Kernel text
736 * mapping will always be RO. Refer to the comment in
737 * static_protections() in pageattr.c
738 */
710 set_memory_rw(start, (end - start) >> PAGE_SHIFT); 739 set_memory_rw(start, (end - start) >> PAGE_SHIFT);
711} 740}
712 741
713void set_kernel_text_ro(void) 742void set_kernel_text_ro(void)
714{ 743{
715 unsigned long start = PFN_ALIGN(_stext); 744 unsigned long start = PFN_ALIGN(_text);
716 unsigned long end = PFN_ALIGN(__start_rodata); 745 unsigned long end = PFN_ALIGN(__stop___ex_table);
717 746
718 if (!kernel_set_to_readonly) 747 if (!kernel_set_to_readonly)
719 return; 748 return;
@@ -721,14 +750,21 @@ void set_kernel_text_ro(void)
721 pr_debug("Set kernel text: %lx - %lx for read only\n", 750 pr_debug("Set kernel text: %lx - %lx for read only\n",
722 start, end); 751 start, end);
723 752
753 /*
754 * Set the kernel identity mapping for text RO.
755 */
724 set_memory_ro(start, (end - start) >> PAGE_SHIFT); 756 set_memory_ro(start, (end - start) >> PAGE_SHIFT);
725} 757}
726 758
727void mark_rodata_ro(void) 759void mark_rodata_ro(void)
728{ 760{
729 unsigned long start = PFN_ALIGN(_stext), end = PFN_ALIGN(__end_rodata); 761 unsigned long start = PFN_ALIGN(_text);
730 unsigned long rodata_start = 762 unsigned long rodata_start =
731 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK; 763 ((unsigned long)__start_rodata + PAGE_SIZE - 1) & PAGE_MASK;
764 unsigned long end = (unsigned long) &__end_rodata_hpage_align;
765 unsigned long text_end = PAGE_ALIGN((unsigned long) &__stop___ex_table);
766 unsigned long rodata_end = PAGE_ALIGN((unsigned long) &__end_rodata);
767 unsigned long data_start = (unsigned long) &_sdata;
732 768
733 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", 769 printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n",
734 (end - start) >> 10); 770 (end - start) >> 10);
@@ -751,6 +787,14 @@ void mark_rodata_ro(void)
751 printk(KERN_INFO "Testing CPA: again\n"); 787 printk(KERN_INFO "Testing CPA: again\n");
752 set_memory_ro(start, (end-start) >> PAGE_SHIFT); 788 set_memory_ro(start, (end-start) >> PAGE_SHIFT);
753#endif 789#endif
790
791 free_init_pages("unused kernel memory",
792 (unsigned long) page_address(virt_to_page(text_end)),
793 (unsigned long)
794 page_address(virt_to_page(rodata_start)));
795 free_init_pages("unused kernel memory",
796 (unsigned long) page_address(virt_to_page(rodata_end)),
797 (unsigned long) page_address(virt_to_page(data_start)));
754} 798}
755 799
756#endif 800#endif
@@ -934,7 +978,7 @@ vmemmap_populate(struct page *start_page, unsigned long size, int node)
934 if (pmd_none(*pmd)) { 978 if (pmd_none(*pmd)) {
935 pte_t entry; 979 pte_t entry;
936 980
937 p = vmemmap_alloc_block(PMD_SIZE, node); 981 p = vmemmap_alloc_block_buf(PMD_SIZE, node);
938 if (!p) 982 if (!p)
939 return -ENOMEM; 983 return -ENOMEM;
940 984
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 2feb9bdedaaf..12e4d2d3c110 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -24,43 +24,6 @@
24 24
25#include "physaddr.h" 25#include "physaddr.h"
26 26
27int page_is_ram(unsigned long pagenr)
28{
29 resource_size_t addr, end;
30 int i;
31
32 /*
33 * A special case is the first 4Kb of memory;
34 * This is a BIOS owned area, not kernel ram, but generally
35 * not listed as such in the E820 table.
36 */
37 if (pagenr == 0)
38 return 0;
39
40 /*
41 * Second special case: Some BIOSen report the PC BIOS
42 * area (640->1Mb) as ram even though it is not.
43 */
44 if (pagenr >= (BIOS_BEGIN >> PAGE_SHIFT) &&
45 pagenr < (BIOS_END >> PAGE_SHIFT))
46 return 0;
47
48 for (i = 0; i < e820.nr_map; i++) {
49 /*
50 * Not usable memory:
51 */
52 if (e820.map[i].type != E820_RAM)
53 continue;
54 addr = (e820.map[i].addr + PAGE_SIZE-1) >> PAGE_SHIFT;
55 end = (e820.map[i].addr + e820.map[i].size) >> PAGE_SHIFT;
56
57
58 if ((pagenr >= addr) && (pagenr < end))
59 return 1;
60 }
61 return 0;
62}
63
64/* 27/*
65 * Fix up the linear direct mapping of the kernel to avoid cache attribute 28 * Fix up the linear direct mapping of the kernel to avoid cache attribute
66 * conflicts. 29 * conflicts.
@@ -281,30 +244,6 @@ void __iomem *ioremap_cache(resource_size_t phys_addr, unsigned long size)
281} 244}
282EXPORT_SYMBOL(ioremap_cache); 245EXPORT_SYMBOL(ioremap_cache);
283 246
284static void __iomem *ioremap_default(resource_size_t phys_addr,
285 unsigned long size)
286{
287 unsigned long flags;
288 void __iomem *ret;
289 int err;
290
291 /*
292 * - WB for WB-able memory and no other conflicting mappings
293 * - UC_MINUS for non-WB-able memory with no other conflicting mappings
294 * - Inherit from confliting mappings otherwise
295 */
296 err = reserve_memtype(phys_addr, phys_addr + size,
297 _PAGE_CACHE_WB, &flags);
298 if (err < 0)
299 return NULL;
300
301 ret = __ioremap_caller(phys_addr, size, flags,
302 __builtin_return_address(0));
303
304 free_memtype(phys_addr, phys_addr + size);
305 return ret;
306}
307
308void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size, 247void __iomem *ioremap_prot(resource_size_t phys_addr, unsigned long size,
309 unsigned long prot_val) 248 unsigned long prot_val)
310{ 249{
@@ -380,7 +319,7 @@ void *xlate_dev_mem_ptr(unsigned long phys)
380 if (page_is_ram(start >> PAGE_SHIFT)) 319 if (page_is_ram(start >> PAGE_SHIFT))
381 return __va(phys); 320 return __va(phys);
382 321
383 addr = (void __force *)ioremap_default(start, PAGE_SIZE); 322 addr = (void __force *)ioremap_cache(start, PAGE_SIZE);
384 if (addr) 323 if (addr)
385 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK)); 324 addr = (void *)((unsigned long)addr | (phys & ~PAGE_MASK));
386 325
@@ -446,6 +385,10 @@ void __init early_ioremap_init(void)
446 * The boot-ioremap range spans multiple pmds, for which 385 * The boot-ioremap range spans multiple pmds, for which
447 * we are not prepared: 386 * we are not prepared:
448 */ 387 */
388#define __FIXADDR_TOP (-PAGE_SIZE)
389 BUILD_BUG_ON((__fix_to_virt(FIX_BTMAP_BEGIN) >> PMD_SHIFT)
390 != (__fix_to_virt(FIX_BTMAP_END) >> PMD_SHIFT));
391#undef __FIXADDR_TOP
449 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) { 392 if (pmd != early_ioremap_pmd(fix_to_virt(FIX_BTMAP_END))) {
450 WARN_ON(1); 393 WARN_ON(1);
451 printk(KERN_WARNING "pmd %p != %p\n", 394 printk(KERN_WARNING "pmd %p != %p\n",
@@ -505,6 +448,20 @@ static inline void __init early_clear_fixmap(enum fixed_addresses idx)
505static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata; 448static void __iomem *prev_map[FIX_BTMAPS_SLOTS] __initdata;
506static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata; 449static unsigned long prev_size[FIX_BTMAPS_SLOTS] __initdata;
507 450
451void __init fixup_early_ioremap(void)
452{
453 int i;
454
455 for (i = 0; i < FIX_BTMAPS_SLOTS; i++) {
456 if (prev_map[i]) {
457 WARN_ON(1);
458 break;
459 }
460 }
461
462 early_ioremap_init();
463}
464
508static int __init check_early_ioremap_leak(void) 465static int __init check_early_ioremap_leak(void)
509{ 466{
510 int count = 0; 467 int count = 0;
diff --git a/arch/x86/mm/k8topology_64.c b/arch/x86/mm/k8topology_64.c
index 268f8255280f..970ed579d4e4 100644
--- a/arch/x86/mm/k8topology_64.c
+++ b/arch/x86/mm/k8topology_64.c
@@ -24,6 +24,9 @@
24#include <asm/apic.h> 24#include <asm/apic.h>
25#include <asm/k8.h> 25#include <asm/k8.h>
26 26
27static struct bootnode __initdata nodes[8];
28static nodemask_t __initdata nodes_parsed = NODE_MASK_NONE;
29
27static __init int find_northbridge(void) 30static __init int find_northbridge(void)
28{ 31{
29 int num; 32 int num;
@@ -54,18 +57,6 @@ static __init void early_get_boot_cpu_id(void)
54 * need to get boot_cpu_id so can use that to create apicid_to_node 57 * need to get boot_cpu_id so can use that to create apicid_to_node
55 * in k8_scan_nodes() 58 * in k8_scan_nodes()
56 */ 59 */
57 /*
58 * Find possible boot-time SMP configuration:
59 */
60#ifdef CONFIG_X86_MPPARSE
61 early_find_smp_config();
62#endif
63#ifdef CONFIG_ACPI
64 /*
65 * Read APIC information from ACPI tables.
66 */
67 early_acpi_boot_init();
68#endif
69#ifdef CONFIG_X86_MPPARSE 60#ifdef CONFIG_X86_MPPARSE
70 /* 61 /*
71 * get boot-time SMP configuration: 62 * get boot-time SMP configuration:
@@ -76,12 +67,26 @@ static __init void early_get_boot_cpu_id(void)
76 early_init_lapic_mapping(); 67 early_init_lapic_mapping();
77} 68}
78 69
79int __init k8_scan_nodes(unsigned long start, unsigned long end) 70int __init k8_get_nodes(struct bootnode *physnodes)
80{ 71{
81 unsigned numnodes, cores, bits, apicid_base; 72 int i;
73 int ret = 0;
74
75 for_each_node_mask(i, nodes_parsed) {
76 physnodes[ret].start = nodes[i].start;
77 physnodes[ret].end = nodes[i].end;
78 ret++;
79 }
80 return ret;
81}
82
83int __init k8_numa_init(unsigned long start_pfn, unsigned long end_pfn)
84{
85 unsigned long start = PFN_PHYS(start_pfn);
86 unsigned long end = PFN_PHYS(end_pfn);
87 unsigned numnodes;
82 unsigned long prevbase; 88 unsigned long prevbase;
83 struct bootnode nodes[8]; 89 int i, nb, found = 0;
84 int i, j, nb, found = 0;
85 u32 nodeid, reg; 90 u32 nodeid, reg;
86 91
87 if (!early_pci_allowed()) 92 if (!early_pci_allowed())
@@ -91,16 +96,15 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
91 if (nb < 0) 96 if (nb < 0)
92 return nb; 97 return nb;
93 98
94 printk(KERN_INFO "Scanning NUMA topology in Northbridge %d\n", nb); 99 pr_info("Scanning NUMA topology in Northbridge %d\n", nb);
95 100
96 reg = read_pci_config(0, nb, 0, 0x60); 101 reg = read_pci_config(0, nb, 0, 0x60);
97 numnodes = ((reg >> 4) & 0xF) + 1; 102 numnodes = ((reg >> 4) & 0xF) + 1;
98 if (numnodes <= 1) 103 if (numnodes <= 1)
99 return -1; 104 return -1;
100 105
101 printk(KERN_INFO "Number of nodes %d\n", numnodes); 106 pr_info("Number of physical nodes %d\n", numnodes);
102 107
103 memset(&nodes, 0, sizeof(nodes));
104 prevbase = 0; 108 prevbase = 0;
105 for (i = 0; i < 8; i++) { 109 for (i = 0; i < 8; i++) {
106 unsigned long base, limit; 110 unsigned long base, limit;
@@ -111,28 +115,28 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
111 nodeid = limit & 7; 115 nodeid = limit & 7;
112 if ((base & 3) == 0) { 116 if ((base & 3) == 0) {
113 if (i < numnodes) 117 if (i < numnodes)
114 printk("Skipping disabled node %d\n", i); 118 pr_info("Skipping disabled node %d\n", i);
115 continue; 119 continue;
116 } 120 }
117 if (nodeid >= numnodes) { 121 if (nodeid >= numnodes) {
118 printk("Ignoring excess node %d (%lx:%lx)\n", nodeid, 122 pr_info("Ignoring excess node %d (%lx:%lx)\n", nodeid,
119 base, limit); 123 base, limit);
120 continue; 124 continue;
121 } 125 }
122 126
123 if (!limit) { 127 if (!limit) {
124 printk(KERN_INFO "Skipping node entry %d (base %lx)\n", 128 pr_info("Skipping node entry %d (base %lx)\n",
125 i, base); 129 i, base);
126 continue; 130 continue;
127 } 131 }
128 if ((base >> 8) & 3 || (limit >> 8) & 3) { 132 if ((base >> 8) & 3 || (limit >> 8) & 3) {
129 printk(KERN_ERR "Node %d using interleaving mode %lx/%lx\n", 133 pr_err("Node %d using interleaving mode %lx/%lx\n",
130 nodeid, (base>>8)&3, (limit>>8) & 3); 134 nodeid, (base >> 8) & 3, (limit >> 8) & 3);
131 return -1; 135 return -1;
132 } 136 }
133 if (node_isset(nodeid, node_possible_map)) { 137 if (node_isset(nodeid, nodes_parsed)) {
134 printk(KERN_INFO "Node %d already present. Skipping\n", 138 pr_info("Node %d already present, skipping\n",
135 nodeid); 139 nodeid);
136 continue; 140 continue;
137 } 141 }
138 142
@@ -141,8 +145,8 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
141 limit |= (1<<24)-1; 145 limit |= (1<<24)-1;
142 limit++; 146 limit++;
143 147
144 if (limit > max_pfn << PAGE_SHIFT) 148 if (limit > end)
145 limit = max_pfn << PAGE_SHIFT; 149 limit = end;
146 if (limit <= base) 150 if (limit <= base)
147 continue; 151 continue;
148 152
@@ -154,24 +158,24 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
154 if (limit > end) 158 if (limit > end)
155 limit = end; 159 limit = end;
156 if (limit == base) { 160 if (limit == base) {
157 printk(KERN_ERR "Empty node %d\n", nodeid); 161 pr_err("Empty node %d\n", nodeid);
158 continue; 162 continue;
159 } 163 }
160 if (limit < base) { 164 if (limit < base) {
161 printk(KERN_ERR "Node %d bogus settings %lx-%lx.\n", 165 pr_err("Node %d bogus settings %lx-%lx.\n",
162 nodeid, base, limit); 166 nodeid, base, limit);
163 continue; 167 continue;
164 } 168 }
165 169
166 /* Could sort here, but pun for now. Should not happen anyroads. */ 170 /* Could sort here, but pun for now. Should not happen anyroads. */
167 if (prevbase > base) { 171 if (prevbase > base) {
168 printk(KERN_ERR "Node map not sorted %lx,%lx\n", 172 pr_err("Node map not sorted %lx,%lx\n",
169 prevbase, base); 173 prevbase, base);
170 return -1; 174 return -1;
171 } 175 }
172 176
173 printk(KERN_INFO "Node %d MemBase %016lx Limit %016lx\n", 177 pr_info("Node %d MemBase %016lx Limit %016lx\n",
174 nodeid, base, limit); 178 nodeid, base, limit);
175 179
176 found++; 180 found++;
177 181
@@ -180,18 +184,29 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
180 184
181 prevbase = base; 185 prevbase = base;
182 186
183 node_set(nodeid, node_possible_map); 187 node_set(nodeid, nodes_parsed);
184 } 188 }
185 189
186 if (!found) 190 if (!found)
187 return -1; 191 return -1;
192 return 0;
193}
194
195int __init k8_scan_nodes(void)
196{
197 unsigned int bits;
198 unsigned int cores;
199 unsigned int apicid_base;
200 int i;
188 201
202 BUG_ON(nodes_empty(nodes_parsed));
203 node_possible_map = nodes_parsed;
189 memnode_shift = compute_hash_shift(nodes, 8, NULL); 204 memnode_shift = compute_hash_shift(nodes, 8, NULL);
190 if (memnode_shift < 0) { 205 if (memnode_shift < 0) {
191 printk(KERN_ERR "No NUMA node hash function found. Contact maintainer\n"); 206 pr_err("No NUMA node hash function found. Contact maintainer\n");
192 return -1; 207 return -1;
193 } 208 }
194 printk(KERN_INFO "Using node hash shift of %d\n", memnode_shift); 209 pr_info("Using node hash shift of %d\n", memnode_shift);
195 210
196 /* use the coreid bits from early_identify_cpu */ 211 /* use the coreid bits from early_identify_cpu */
197 bits = boot_cpu_data.x86_coreid_bits; 212 bits = boot_cpu_data.x86_coreid_bits;
@@ -200,14 +215,12 @@ int __init k8_scan_nodes(unsigned long start, unsigned long end)
200 /* need to get boot_cpu_id early for system with apicid lifting */ 215 /* need to get boot_cpu_id early for system with apicid lifting */
201 early_get_boot_cpu_id(); 216 early_get_boot_cpu_id();
202 if (boot_cpu_physical_apicid > 0) { 217 if (boot_cpu_physical_apicid > 0) {
203 printk(KERN_INFO "BSP APIC ID: %02x\n", 218 pr_info("BSP APIC ID: %02x\n", boot_cpu_physical_apicid);
204 boot_cpu_physical_apicid);
205 apicid_base = boot_cpu_physical_apicid; 219 apicid_base = boot_cpu_physical_apicid;
206 } 220 }
207 221
208 for (i = 0; i < 8; i++) { 222 for_each_node_mask(i, node_possible_map) {
209 if (nodes[i].start == nodes[i].end) 223 int j;
210 continue;
211 224
212 e820_register_active_regions(i, 225 e820_register_active_regions(i,
213 nodes[i].start >> PAGE_SHIFT, 226 nodes[i].start >> PAGE_SHIFT,
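
For readers following the address-map decoding above: each K8 northbridge DRAM base/limit register pair describes one node's memory range at 16 MB granularity, with enable bits in the low bits of the base register and the node ID in the low bits of the limit register. The stand-alone sketch below mirrors that decoding for one made-up register pair; the exact bit layout is inferred from the code in this hunk and should be treated as illustrative, not as a hardware reference.

#include <stdio.h>

int main(void)
{
	/* Invented register values: node 2, range 16 MB - 4 GB, enabled. */
	unsigned long base_reg  = 0x00010003;
	unsigned long limit_reg = 0x00ff0002;
	unsigned long base, limit;
	int nodeid;

	if ((base_reg & 3) == 0) {		/* read/write enable bits */
		printf("node disabled\n");
		return 0;
	}
	nodeid = limit_reg & 7;			/* node ID in limit[2:0] */

	/* Registers carry address bits 39:24 in bits 31:16. */
	base  = (base_reg  >> 16) << 24;
	limit = (limit_reg >> 16) << 24;
	limit |= (1UL << 24) - 1;		/* limit is inclusive... */
	limit++;				/* ...convert to exclusive */

	printf("node %d: %#lx - %#lx\n", nodeid, base, limit);
	return 0;
}
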
diff --git a/arch/x86/mm/kmemcheck/error.c b/arch/x86/mm/kmemcheck/error.c
index 4901d0dafda6..af3b6c8a436f 100644
--- a/arch/x86/mm/kmemcheck/error.c
+++ b/arch/x86/mm/kmemcheck/error.c
@@ -106,26 +106,25 @@ void kmemcheck_error_recall(void)
106 106
107 switch (e->type) { 107 switch (e->type) {
108 case KMEMCHECK_ERROR_INVALID_ACCESS: 108 case KMEMCHECK_ERROR_INVALID_ACCESS:
109 printk(KERN_ERR "WARNING: kmemcheck: Caught %d-bit read " 109 printk(KERN_WARNING "WARNING: kmemcheck: Caught %d-bit read from %s memory (%p)\n",
110 "from %s memory (%p)\n",
111 8 * e->size, e->state < ARRAY_SIZE(desc) ? 110 8 * e->size, e->state < ARRAY_SIZE(desc) ?
112 desc[e->state] : "(invalid shadow state)", 111 desc[e->state] : "(invalid shadow state)",
113 (void *) e->address); 112 (void *) e->address);
114 113
115 printk(KERN_INFO); 114 printk(KERN_WARNING);
116 for (i = 0; i < SHADOW_COPY_SIZE; ++i) 115 for (i = 0; i < SHADOW_COPY_SIZE; ++i)
117 printk("%02x", e->memory_copy[i]); 116 printk(KERN_CONT "%02x", e->memory_copy[i]);
118 printk("\n"); 117 printk(KERN_CONT "\n");
119 118
120 printk(KERN_INFO); 119 printk(KERN_WARNING);
121 for (i = 0; i < SHADOW_COPY_SIZE; ++i) { 120 for (i = 0; i < SHADOW_COPY_SIZE; ++i) {
122 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc)) 121 if (e->shadow_copy[i] < ARRAY_SIZE(short_desc))
123 printk(" %c", short_desc[e->shadow_copy[i]]); 122 printk(KERN_CONT " %c", short_desc[e->shadow_copy[i]]);
124 else 123 else
125 printk(" ?"); 124 printk(KERN_CONT " ?");
126 } 125 }
127 printk("\n"); 126 printk(KERN_CONT "\n");
128 printk(KERN_INFO "%*c\n", 2 + 2 127 printk(KERN_WARNING "%*c\n", 2 + 2
129 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^'); 128 * (int) (e->address & (SHADOW_COPY_SIZE - 1)), '^');
130 break; 129 break;
131 case KMEMCHECK_ERROR_BUG: 130 case KMEMCHECK_ERROR_BUG:
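
Besides promoting the dump to KERN_WARNING, the hunk above tags the byte-by-byte continuations with KERN_CONT, so printk treats them as part of the line opened by the preceding call instead of starting new default-level lines. A minimal sketch of that pattern, assuming ordinary kernel/module context:

#include <linux/kernel.h>
#include <linux/types.h>

/* Emit one logical warning line built from several printk() calls:
 * the first call sets the level, continuations use KERN_CONT. */
static void dump_shadow_bytes(const u8 *buf, unsigned int len)
{
	unsigned int i;

	printk(KERN_WARNING "shadow bytes:");
	for (i = 0; i < len; i++)
		printk(KERN_CONT " %02x", buf[i]);
	printk(KERN_CONT "\n");
}
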
diff --git a/arch/x86/mm/kmemcheck/kmemcheck.c b/arch/x86/mm/kmemcheck/kmemcheck.c
index 8cc183344140..b3b531a4f8e5 100644
--- a/arch/x86/mm/kmemcheck/kmemcheck.c
+++ b/arch/x86/mm/kmemcheck/kmemcheck.c
@@ -337,7 +337,7 @@ bool kmemcheck_is_obj_initialized(unsigned long addr, size_t size)
337 if (!shadow) 337 if (!shadow)
338 return true; 338 return true;
339 339
340 status = kmemcheck_shadow_test(shadow, size); 340 status = kmemcheck_shadow_test_all(shadow, size);
341 341
342 return status == KMEMCHECK_SHADOW_INITIALIZED; 342 return status == KMEMCHECK_SHADOW_INITIALIZED;
343} 343}
diff --git a/arch/x86/mm/kmemcheck/shadow.c b/arch/x86/mm/kmemcheck/shadow.c
index 3f66b82076a3..aec124214d97 100644
--- a/arch/x86/mm/kmemcheck/shadow.c
+++ b/arch/x86/mm/kmemcheck/shadow.c
@@ -125,12 +125,12 @@ void kmemcheck_mark_initialized_pages(struct page *p, unsigned int n)
125 125
126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size) 126enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
127{ 127{
128#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
128 uint8_t *x; 129 uint8_t *x;
129 unsigned int i; 130 unsigned int i;
130 131
131 x = shadow; 132 x = shadow;
132 133
133#ifdef CONFIG_KMEMCHECK_PARTIAL_OK
134 /* 134 /*
135 * Make sure _some_ bytes are initialized. Gcc frequently generates 135 * Make sure _some_ bytes are initialized. Gcc frequently generates
136 * code to access neighboring bytes. 136 * code to access neighboring bytes.
@@ -139,13 +139,25 @@ enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size)
139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED) 139 if (x[i] == KMEMCHECK_SHADOW_INITIALIZED)
140 return x[i]; 140 return x[i];
141 } 141 }
142
143 return x[0];
142#else 144#else
145 return kmemcheck_shadow_test_all(shadow, size);
146#endif
147}
148
149enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow, unsigned int size)
150{
151 uint8_t *x;
152 unsigned int i;
153
154 x = shadow;
155
143 /* All bytes must be initialized. */ 156 /* All bytes must be initialized. */
144 for (i = 0; i < size; ++i) { 157 for (i = 0; i < size; ++i) {
145 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED) 158 if (x[i] != KMEMCHECK_SHADOW_INITIALIZED)
146 return x[i]; 159 return x[i];
147 } 160 }
148#endif
149 161
150 return x[0]; 162 return x[0];
151} 163}
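
The split above leaves kmemcheck_shadow_test() as the tolerant check (with CONFIG_KMEMCHECK_PARTIAL_OK it is satisfied as soon as any byte is initialized, since gcc often reads neighbouring bytes), while the new kmemcheck_shadow_test_all() insists on every byte and backs kmemcheck_is_obj_initialized(). A stand-alone analogue of the two predicates over a plain byte array, with the shadow state reduced to a single illustrative constant:

#include <stdbool.h>
#include <stdio.h>

#define SHADOW_INITIALIZED 0x01	/* stand-in for KMEMCHECK_SHADOW_INITIALIZED */

/* Tolerant check: at least one initialized byte is good enough. */
static bool shadow_test_any(const unsigned char *shadow, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++)
		if (shadow[i] == SHADOW_INITIALIZED)
			return true;
	return false;
}

/* Strict check: every byte must be initialized. */
static bool shadow_test_all(const unsigned char *shadow, size_t size)
{
	size_t i;

	for (i = 0; i < size; i++)
		if (shadow[i] != SHADOW_INITIALIZED)
			return false;
	return true;
}

int main(void)
{
	unsigned char shadow[4] = { SHADOW_INITIALIZED, 0, 0, 0 };

	printf("any: %d, all: %d\n", shadow_test_any(shadow, 4),
	       shadow_test_all(shadow, 4));
	return 0;
}
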
diff --git a/arch/x86/mm/kmemcheck/shadow.h b/arch/x86/mm/kmemcheck/shadow.h
index af46d9ab9d86..ff0b2f70fbcb 100644
--- a/arch/x86/mm/kmemcheck/shadow.h
+++ b/arch/x86/mm/kmemcheck/shadow.h
@@ -11,6 +11,8 @@ enum kmemcheck_shadow {
11void *kmemcheck_shadow_lookup(unsigned long address); 11void *kmemcheck_shadow_lookup(unsigned long address);
12 12
13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size); 13enum kmemcheck_shadow kmemcheck_shadow_test(void *shadow, unsigned int size);
14enum kmemcheck_shadow kmemcheck_shadow_test_all(void *shadow,
15 unsigned int size);
14void kmemcheck_shadow_set(void *shadow, unsigned int size); 16void kmemcheck_shadow_set(void *shadow, unsigned int size);
15 17
16#endif 18#endif
diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c
index 16ccbd77917f..5d0e67fff1a6 100644
--- a/arch/x86/mm/kmmio.c
+++ b/arch/x86/mm/kmmio.c
@@ -5,6 +5,8 @@
5 * 2008 Pekka Paalanen <pq@iki.fi> 5 * 2008 Pekka Paalanen <pq@iki.fi>
6 */ 6 */
7 7
8#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
8#include <linux/list.h> 10#include <linux/list.h>
9#include <linux/rculist.h> 11#include <linux/rculist.h>
10#include <linux/spinlock.h> 12#include <linux/spinlock.h>
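
The new pr_fmt() definition at the top of kmmio.c is what lets the later hunks drop the hand-written "kmmio: " prefix: every pr_err()/pr_info()/pr_debug() call expands its format string through pr_fmt(), so the prefix is applied once, in one place. A minimal sketch of the mechanism (the demo function and message are hypothetical):

/* Must be defined before linux/kernel.h is pulled in. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/module.h>

static int __init pr_fmt_demo_init(void)
{
	/* Logs as "<modname>: probe armed at 0x00001000" */
	pr_info("probe armed at 0x%08lx\n", 0x1000UL);
	return 0;
}
module_init(pr_fmt_demo_init);

MODULE_LICENSE("GPL");
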
@@ -19,6 +21,7 @@
19#include <linux/kdebug.h> 21#include <linux/kdebug.h>
20#include <linux/mutex.h> 22#include <linux/mutex.h>
21#include <linux/io.h> 23#include <linux/io.h>
24#include <linux/slab.h>
22#include <asm/cacheflush.h> 25#include <asm/cacheflush.h>
23#include <asm/tlbflush.h> 26#include <asm/tlbflush.h>
24#include <linux/errno.h> 27#include <linux/errno.h>
@@ -136,7 +139,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
136 pte_t *pte = lookup_address(f->page, &level); 139 pte_t *pte = lookup_address(f->page, &level);
137 140
138 if (!pte) { 141 if (!pte) {
139 pr_err("kmmio: no pte for page 0x%08lx\n", f->page); 142 pr_err("no pte for page 0x%08lx\n", f->page);
140 return -1; 143 return -1;
141 } 144 }
142 145
@@ -148,7 +151,7 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
148 clear_pte_presence(pte, clear, &f->old_presence); 151 clear_pte_presence(pte, clear, &f->old_presence);
149 break; 152 break;
150 default: 153 default:
151 pr_err("kmmio: unexpected page level 0x%x.\n", level); 154 pr_err("unexpected page level 0x%x.\n", level);
152 return -1; 155 return -1;
153 } 156 }
154 157
@@ -170,13 +173,14 @@ static int clear_page_presence(struct kmmio_fault_page *f, bool clear)
170static int arm_kmmio_fault_page(struct kmmio_fault_page *f) 173static int arm_kmmio_fault_page(struct kmmio_fault_page *f)
171{ 174{
172 int ret; 175 int ret;
173 WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); 176 WARN_ONCE(f->armed, KERN_ERR pr_fmt("kmmio page already armed.\n"));
174 if (f->armed) { 177 if (f->armed) {
175 pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", 178 pr_warning("double-arm: page 0x%08lx, ref %d, old %d\n",
176 f->page, f->count, !!f->old_presence); 179 f->page, f->count, !!f->old_presence);
177 } 180 }
178 ret = clear_page_presence(f, true); 181 ret = clear_page_presence(f, true);
179 WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); 182 WARN_ONCE(ret < 0, KERN_ERR pr_fmt("arming 0x%08lx failed.\n"),
183 f->page);
180 f->armed = true; 184 f->armed = true;
181 return ret; 185 return ret;
182} 186}
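
One wrinkle visible in the hunk above: pr_fmt() is applied automatically only by the pr_*() helpers, so the WARN_ONCE() messages have to be wrapped in pr_fmt() explicitly to keep the same prefix. A short illustration, under the same assumptions as the previous sketch:

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kernel.h>
#include <linux/bug.h>
#include <linux/types.h>

static void check_armed(bool armed)
{
	/* Automatically prefixed: "<modname>: page already armed" */
	if (armed)
		pr_warning("page already armed\n");

	/* WARN_ONCE() takes its format verbatim; apply pr_fmt() by hand
	 * so the warning carries the same prefix. */
	WARN_ONCE(armed, KERN_ERR pr_fmt("page already armed\n"));
}
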
@@ -203,7 +207,7 @@ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f)
203 */ 207 */
204/* 208/*
205 * Interrupts are disabled on entry as trap3 is an interrupt gate 209 * Interrupts are disabled on entry as trap3 is an interrupt gate
206 * and they remain disabled thorough out this function. 210 * and they remain disabled throughout this function.
207 */ 211 */
208int kmmio_handler(struct pt_regs *regs, unsigned long addr) 212int kmmio_handler(struct pt_regs *regs, unsigned long addr)
209{ 213{
@@ -240,24 +244,21 @@ int kmmio_handler(struct pt_regs *regs, unsigned long addr)
240 * condition needs handling by do_page_fault(), the 244 * condition needs handling by do_page_fault(), the
241 * page really not being present is the most common. 245 * page really not being present is the most common.
242 */ 246 */
243 pr_debug("kmmio: secondary hit for 0x%08lx CPU %d.\n", 247 pr_debug("secondary hit for 0x%08lx CPU %d.\n",
244 addr, smp_processor_id()); 248 addr, smp_processor_id());
245 249
246 if (!faultpage->old_presence) 250 if (!faultpage->old_presence)
247 pr_info("kmmio: unexpected secondary hit for " 251 pr_info("unexpected secondary hit for address 0x%08lx on CPU %d.\n",
248 "address 0x%08lx on CPU %d.\n", addr, 252 addr, smp_processor_id());
249 smp_processor_id());
250 } else { 253 } else {
251 /* 254 /*
252 * Prevent overwriting already in-flight context. 255 * Prevent overwriting already in-flight context.
253 * This should not happen, let's hope disarming at 256 * This should not happen, let's hope disarming at
254 * least prevents a panic. 257 * least prevents a panic.
255 */ 258 */
256 pr_emerg("kmmio: recursive probe hit on CPU %d, " 259 pr_emerg("recursive probe hit on CPU %d, for address 0x%08lx. Ignoring.\n",
257 "for address 0x%08lx. Ignoring.\n", 260 smp_processor_id(), addr);
258 smp_processor_id(), addr); 261 pr_emerg("previous hit was at 0x%08lx.\n", ctx->addr);
259 pr_emerg("kmmio: previous hit was at 0x%08lx.\n",
260 ctx->addr);
261 disarm_kmmio_fault_page(faultpage); 262 disarm_kmmio_fault_page(faultpage);
262 } 263 }
263 goto no_kmmio_ctx; 264 goto no_kmmio_ctx;
@@ -302,7 +303,7 @@ no_kmmio:
302 303
303/* 304/*
304 * Interrupts are disabled on entry as trap1 is an interrupt gate 305 * Interrupts are disabled on entry as trap1 is an interrupt gate
305 * and they remain disabled thorough out this function. 306 * and they remain disabled throughout this function.
306 * This must always get called as the pair to kmmio_handler(). 307 * This must always get called as the pair to kmmio_handler().
307 */ 308 */
308static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) 309static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
@@ -316,8 +317,8 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs)
316 * something external causing them (f.e. using a debugger while 317 * something external causing them (f.e. using a debugger while
317 * mmio tracing enabled), or erroneous behaviour 318 * mmio tracing enabled), or erroneous behaviour
318 */ 319 */
319 pr_warning("kmmio: unexpected debug trap on CPU %d.\n", 320 pr_warning("unexpected debug trap on CPU %d.\n",
320 smp_processor_id()); 321 smp_processor_id());
321 goto out; 322 goto out;
322 } 323 }
323 324
@@ -425,7 +426,7 @@ int register_kmmio_probe(struct kmmio_probe *p)
425 list_add_rcu(&p->list, &kmmio_probes); 426 list_add_rcu(&p->list, &kmmio_probes);
426 while (size < size_lim) { 427 while (size < size_lim) {
427 if (add_kmmio_fault_page(p->addr + size)) 428 if (add_kmmio_fault_page(p->addr + size))
428 pr_err("kmmio: Unable to set page fault.\n"); 429 pr_err("Unable to set page fault.\n");
429 size += PAGE_SIZE; 430 size += PAGE_SIZE;
430 } 431 }
431out: 432out:
@@ -490,7 +491,7 @@ static void remove_kmmio_fault_pages(struct rcu_head *head)
490 * 2. remove_kmmio_fault_pages() 491 * 2. remove_kmmio_fault_pages()
491 * Remove the pages from kmmio_page_table. 492 * Remove the pages from kmmio_page_table.
492 * 3. rcu_free_kmmio_fault_pages() 493 * 3. rcu_free_kmmio_fault_pages()
493 * Actally free the kmmio_fault_page structs as with RCU. 494 * Actually free the kmmio_fault_page structs as with RCU.
494 */ 495 */
495void unregister_kmmio_probe(struct kmmio_probe *p) 496void unregister_kmmio_probe(struct kmmio_probe *p)
496{ 497{
@@ -511,7 +512,7 @@ void unregister_kmmio_probe(struct kmmio_probe *p)
511 512
512 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC); 513 drelease = kmalloc(sizeof(*drelease), GFP_ATOMIC);
513 if (!drelease) { 514 if (!drelease) {
514 pr_crit("kmmio: leaking kmmio_fault_page objects.\n"); 515 pr_crit("leaking kmmio_fault_page objects.\n");
515 return; 516 return;
516 } 517 }
517 drelease->release_list = release_list; 518 drelease->release_list = release_list;
@@ -538,10 +539,17 @@ static int
538kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) 539kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args)
539{ 540{
540 struct die_args *arg = args; 541 struct die_args *arg = args;
542 unsigned long* dr6_p = (unsigned long *)ERR_PTR(arg->err);
541 543
542 if (val == DIE_DEBUG && (arg->err & DR_STEP)) 544 if (val == DIE_DEBUG && (*dr6_p & DR_STEP))
543 if (post_kmmio_handler(arg->err, arg->regs) == 1) 545 if (post_kmmio_handler(*dr6_p, arg->regs) == 1) {
546 /*
 547 * Reset the BS bit in dr6 (pointed to by arg->err) to
 548 * denote completion of processing.

549 */
550 *dr6_p &= ~DR_STEP;
544 return NOTIFY_STOP; 551 return NOTIFY_STOP;
552 }
545 553
546 return NOTIFY_DONE; 554 return NOTIFY_DONE;
547} 555}
diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c
index c8191defc38a..1dab5194fd9d 100644
--- a/arch/x86/mm/mmap.c
+++ b/arch/x86/mm/mmap.c
@@ -71,7 +71,7 @@ static int mmap_is_legacy(void)
71 if (current->personality & ADDR_COMPAT_LAYOUT) 71 if (current->personality & ADDR_COMPAT_LAYOUT)
72 return 1; 72 return 1;
73 73
74 if (current->signal->rlim[RLIMIT_STACK].rlim_cur == RLIM_INFINITY) 74 if (rlimit(RLIMIT_STACK) == RLIM_INFINITY)
75 return 1; 75 return 1;
76 76
77 return sysctl_legacy_va_layout; 77 return sysctl_legacy_va_layout;
@@ -96,7 +96,7 @@ static unsigned long mmap_rnd(void)
96 96
97static unsigned long mmap_base(void) 97static unsigned long mmap_base(void)
98{ 98{
99 unsigned long gap = current->signal->rlim[RLIMIT_STACK].rlim_cur; 99 unsigned long gap = rlimit(RLIMIT_STACK);
100 100
101 if (gap < MIN_GAP) 101 if (gap < MIN_GAP)
102 gap = MIN_GAP; 102 gap = MIN_GAP;
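
The mmap.c change replaces the open-coded current->signal->rlim[RLIMIT_STACK].rlim_cur reads with the rlimit() accessor, which returns the current task's soft limit. The test it feeds into, shown in isolation as a hedged sketch:

#include <linux/sched.h>
#include <linux/resource.h>

/* True when the stack soft limit is unlimited, which (together with
 * ADDR_COMPAT_LAYOUT and sysctl_legacy_va_layout) forces the legacy
 * bottom-up mmap layout. */
static bool stack_rlimit_is_infinite(void)
{
	return rlimit(RLIMIT_STACK) == RLIM_INFINITY;
}
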
diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c
index 132772a8ec57..3adff7dcc148 100644
--- a/arch/x86/mm/mmio-mod.c
+++ b/arch/x86/mm/mmio-mod.c
@@ -19,10 +19,14 @@
19 * 19 *
20 * Derived from the read-mod example from relay-examples by Tom Zanussi. 20 * Derived from the read-mod example from relay-examples by Tom Zanussi.
21 */ 21 */
22
23#define pr_fmt(fmt) "mmiotrace: " fmt
24
22#define DEBUG 1 25#define DEBUG 1
23 26
24#include <linux/module.h> 27#include <linux/module.h>
25#include <linux/debugfs.h> 28#include <linux/debugfs.h>
29#include <linux/slab.h>
26#include <linux/uaccess.h> 30#include <linux/uaccess.h>
27#include <linux/io.h> 31#include <linux/io.h>
28#include <linux/version.h> 32#include <linux/version.h>
@@ -36,8 +40,6 @@
36 40
37#include "pf_in.h" 41#include "pf_in.h"
38 42
39#define NAME "mmiotrace: "
40
41struct trap_reason { 43struct trap_reason {
42 unsigned long addr; 44 unsigned long addr;
43 unsigned long ip; 45 unsigned long ip;
@@ -96,17 +98,18 @@ static void print_pte(unsigned long address)
96 pte_t *pte = lookup_address(address, &level); 98 pte_t *pte = lookup_address(address, &level);
97 99
98 if (!pte) { 100 if (!pte) {
99 pr_err(NAME "Error in %s: no pte for page 0x%08lx\n", 101 pr_err("Error in %s: no pte for page 0x%08lx\n",
100 __func__, address); 102 __func__, address);
101 return; 103 return;
102 } 104 }
103 105
104 if (level == PG_LEVEL_2M) { 106 if (level == PG_LEVEL_2M) {
105 pr_emerg(NAME "4MB pages are not currently supported: " 107 pr_emerg("4MB pages are not currently supported: 0x%08lx\n",
106 "0x%08lx\n", address); 108 address);
107 BUG(); 109 BUG();
108 } 110 }
109 pr_info(NAME "pte for 0x%lx: 0x%llx 0x%llx\n", address, 111 pr_info("pte for 0x%lx: 0x%llx 0x%llx\n",
112 address,
110 (unsigned long long)pte_val(*pte), 113 (unsigned long long)pte_val(*pte),
111 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT); 114 (unsigned long long)pte_val(*pte) & _PAGE_PRESENT);
112} 115}
@@ -118,22 +121,21 @@ static void print_pte(unsigned long address)
118static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr) 121static void die_kmmio_nesting_error(struct pt_regs *regs, unsigned long addr)
119{ 122{
120 const struct trap_reason *my_reason = &get_cpu_var(pf_reason); 123 const struct trap_reason *my_reason = &get_cpu_var(pf_reason);
121 pr_emerg(NAME "unexpected fault for address: 0x%08lx, " 124 pr_emerg("unexpected fault for address: 0x%08lx, last fault for address: 0x%08lx\n",
122 "last fault for address: 0x%08lx\n", 125 addr, my_reason->addr);
123 addr, my_reason->addr);
124 print_pte(addr); 126 print_pte(addr);
125 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip); 127 print_symbol(KERN_EMERG "faulting IP is at %s\n", regs->ip);
126 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip); 128 print_symbol(KERN_EMERG "last faulting IP was at %s\n", my_reason->ip);
127#ifdef __i386__ 129#ifdef __i386__
128 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n", 130 pr_emerg("eax: %08lx ebx: %08lx ecx: %08lx edx: %08lx\n",
129 regs->ax, regs->bx, regs->cx, regs->dx); 131 regs->ax, regs->bx, regs->cx, regs->dx);
130 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n", 132 pr_emerg("esi: %08lx edi: %08lx ebp: %08lx esp: %08lx\n",
131 regs->si, regs->di, regs->bp, regs->sp); 133 regs->si, regs->di, regs->bp, regs->sp);
132#else 134#else
133 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n", 135 pr_emerg("rax: %016lx rcx: %016lx rdx: %016lx\n",
134 regs->ax, regs->cx, regs->dx); 136 regs->ax, regs->cx, regs->dx);
135 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n", 137 pr_emerg("rsi: %016lx rdi: %016lx rbp: %016lx rsp: %016lx\n",
136 regs->si, regs->di, regs->bp, regs->sp); 138 regs->si, regs->di, regs->bp, regs->sp);
137#endif 139#endif
138 put_cpu_var(pf_reason); 140 put_cpu_var(pf_reason);
139 BUG(); 141 BUG();
@@ -213,7 +215,7 @@ static void post(struct kmmio_probe *p, unsigned long condition,
213 /* this should always return the active_trace count to 0 */ 215 /* this should always return the active_trace count to 0 */
214 my_reason->active_traces--; 216 my_reason->active_traces--;
215 if (my_reason->active_traces) { 217 if (my_reason->active_traces) {
216 pr_emerg(NAME "unexpected post handler"); 218 pr_emerg("unexpected post handler");
217 BUG(); 219 BUG();
218 } 220 }
219 221
@@ -244,7 +246,7 @@ static void ioremap_trace_core(resource_size_t offset, unsigned long size,
244 }; 246 };
245 247
246 if (!trace) { 248 if (!trace) {
247 pr_err(NAME "kmalloc failed in ioremap\n"); 249 pr_err("kmalloc failed in ioremap\n");
248 return; 250 return;
249 } 251 }
250 252
@@ -282,8 +284,8 @@ void mmiotrace_ioremap(resource_size_t offset, unsigned long size,
282 if (!is_enabled()) /* recheck and proper locking in *_core() */ 284 if (!is_enabled()) /* recheck and proper locking in *_core() */
283 return; 285 return;
284 286
285 pr_debug(NAME "ioremap_*(0x%llx, 0x%lx) = %p\n", 287 pr_debug("ioremap_*(0x%llx, 0x%lx) = %p\n",
286 (unsigned long long)offset, size, addr); 288 (unsigned long long)offset, size, addr);
287 if ((filter_offset) && (offset != filter_offset)) 289 if ((filter_offset) && (offset != filter_offset))
288 return; 290 return;
289 ioremap_trace_core(offset, size, addr); 291 ioremap_trace_core(offset, size, addr);
@@ -301,7 +303,7 @@ static void iounmap_trace_core(volatile void __iomem *addr)
301 struct remap_trace *tmp; 303 struct remap_trace *tmp;
302 struct remap_trace *found_trace = NULL; 304 struct remap_trace *found_trace = NULL;
303 305
304 pr_debug(NAME "Unmapping %p.\n", addr); 306 pr_debug("Unmapping %p.\n", addr);
305 307
306 spin_lock_irq(&trace_lock); 308 spin_lock_irq(&trace_lock);
307 if (!is_enabled()) 309 if (!is_enabled())
@@ -363,9 +365,8 @@ static void clear_trace_list(void)
363 * Caller also ensures is_enabled() cannot change. 365 * Caller also ensures is_enabled() cannot change.
364 */ 366 */
365 list_for_each_entry(trace, &trace_list, list) { 367 list_for_each_entry(trace, &trace_list, list) {
366 pr_notice(NAME "purging non-iounmapped " 368 pr_notice("purging non-iounmapped trace @0x%08lx, size 0x%lx.\n",
367 "trace @0x%08lx, size 0x%lx.\n", 369 trace->probe.addr, trace->probe.len);
368 trace->probe.addr, trace->probe.len);
369 if (!nommiotrace) 370 if (!nommiotrace)
370 unregister_kmmio_probe(&trace->probe); 371 unregister_kmmio_probe(&trace->probe);
371 } 372 }
@@ -387,7 +388,7 @@ static void enter_uniprocessor(void)
387 388
388 if (downed_cpus == NULL && 389 if (downed_cpus == NULL &&
389 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) { 390 !alloc_cpumask_var(&downed_cpus, GFP_KERNEL)) {
390 pr_notice(NAME "Failed to allocate mask\n"); 391 pr_notice("Failed to allocate mask\n");
391 goto out; 392 goto out;
392 } 393 }
393 394
@@ -395,20 +396,19 @@ static void enter_uniprocessor(void)
395 cpumask_copy(downed_cpus, cpu_online_mask); 396 cpumask_copy(downed_cpus, cpu_online_mask);
396 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus); 397 cpumask_clear_cpu(cpumask_first(cpu_online_mask), downed_cpus);
397 if (num_online_cpus() > 1) 398 if (num_online_cpus() > 1)
398 pr_notice(NAME "Disabling non-boot CPUs...\n"); 399 pr_notice("Disabling non-boot CPUs...\n");
399 put_online_cpus(); 400 put_online_cpus();
400 401
401 for_each_cpu(cpu, downed_cpus) { 402 for_each_cpu(cpu, downed_cpus) {
402 err = cpu_down(cpu); 403 err = cpu_down(cpu);
403 if (!err) 404 if (!err)
404 pr_info(NAME "CPU%d is down.\n", cpu); 405 pr_info("CPU%d is down.\n", cpu);
405 else 406 else
406 pr_err(NAME "Error taking CPU%d down: %d\n", cpu, err); 407 pr_err("Error taking CPU%d down: %d\n", cpu, err);
407 } 408 }
408out: 409out:
409 if (num_online_cpus() > 1) 410 if (num_online_cpus() > 1)
410 pr_warning(NAME "multiple CPUs still online, " 411 pr_warning("multiple CPUs still online, may miss events.\n");
411 "may miss events.\n");
412} 412}
413 413
414/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit, 414/* __ref because leave_uniprocessor calls cpu_up which is __cpuinit,
@@ -420,13 +420,13 @@ static void __ref leave_uniprocessor(void)
420 420
421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0) 421 if (downed_cpus == NULL || cpumask_weight(downed_cpus) == 0)
422 return; 422 return;
423 pr_notice(NAME "Re-enabling CPUs...\n"); 423 pr_notice("Re-enabling CPUs...\n");
424 for_each_cpu(cpu, downed_cpus) { 424 for_each_cpu(cpu, downed_cpus) {
425 err = cpu_up(cpu); 425 err = cpu_up(cpu);
426 if (!err) 426 if (!err)
427 pr_info(NAME "enabled CPU%d.\n", cpu); 427 pr_info("enabled CPU%d.\n", cpu);
428 else 428 else
429 pr_err(NAME "cannot re-enable CPU%d: %d\n", cpu, err); 429 pr_err("cannot re-enable CPU%d: %d\n", cpu, err);
430 } 430 }
431} 431}
432 432
@@ -434,8 +434,8 @@ static void __ref leave_uniprocessor(void)
434static void enter_uniprocessor(void) 434static void enter_uniprocessor(void)
435{ 435{
436 if (num_online_cpus() > 1) 436 if (num_online_cpus() > 1)
437 pr_warning(NAME "multiple CPUs are online, may miss events. " 437 pr_warning("multiple CPUs are online, may miss events. "
438 "Suggest booting with maxcpus=1 kernel argument.\n"); 438 "Suggest booting with maxcpus=1 kernel argument.\n");
439} 439}
440 440
441static void leave_uniprocessor(void) 441static void leave_uniprocessor(void)
@@ -450,13 +450,13 @@ void enable_mmiotrace(void)
450 goto out; 450 goto out;
451 451
452 if (nommiotrace) 452 if (nommiotrace)
453 pr_info(NAME "MMIO tracing disabled.\n"); 453 pr_info("MMIO tracing disabled.\n");
454 kmmio_init(); 454 kmmio_init();
455 enter_uniprocessor(); 455 enter_uniprocessor();
456 spin_lock_irq(&trace_lock); 456 spin_lock_irq(&trace_lock);
457 atomic_inc(&mmiotrace_enabled); 457 atomic_inc(&mmiotrace_enabled);
458 spin_unlock_irq(&trace_lock); 458 spin_unlock_irq(&trace_lock);
459 pr_info(NAME "enabled.\n"); 459 pr_info("enabled.\n");
460out: 460out:
461 mutex_unlock(&mmiotrace_mutex); 461 mutex_unlock(&mmiotrace_mutex);
462} 462}
@@ -475,7 +475,7 @@ void disable_mmiotrace(void)
475 clear_trace_list(); /* guarantees: no more kmmio callbacks */ 475 clear_trace_list(); /* guarantees: no more kmmio callbacks */
476 leave_uniprocessor(); 476 leave_uniprocessor();
477 kmmio_cleanup(); 477 kmmio_cleanup();
478 pr_info(NAME "disabled.\n"); 478 pr_info("disabled.\n");
479out: 479out:
480 mutex_unlock(&mmiotrace_mutex); 480 mutex_unlock(&mmiotrace_mutex);
481} 481}
diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c
index d2530062fe00..809baaaf48b1 100644
--- a/arch/x86/mm/numa_32.c
+++ b/arch/x86/mm/numa_32.c
@@ -347,8 +347,8 @@ static void init_remap_allocator(int nid)
347 (ulong) node_remap_end_vaddr[nid]); 347 (ulong) node_remap_end_vaddr[nid]);
348} 348}
349 349
350void __init initmem_init(unsigned long start_pfn, 350void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn,
351 unsigned long end_pfn) 351 int acpi, int k8)
352{ 352{
353 int nid; 353 int nid;
354 long kva_target_pfn; 354 long kva_target_pfn;
@@ -418,7 +418,10 @@ void __init initmem_init(unsigned long start_pfn,
418 418
419 for_each_online_node(nid) { 419 for_each_online_node(nid) {
420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data)); 420 memset(NODE_DATA(nid), 0, sizeof(struct pglist_data));
421 NODE_DATA(nid)->node_id = nid;
422#ifndef CONFIG_NO_BOOTMEM
421 NODE_DATA(nid)->bdata = &bootmem_node_data[nid]; 423 NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
424#endif
422 } 425 }
423 426
424 setup_bootmem_allocator(); 427 setup_bootmem_allocator();
diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c
index 459913beac71..8948f47fde05 100644
--- a/arch/x86/mm/numa_64.c
+++ b/arch/x86/mm/numa_64.c
@@ -163,30 +163,48 @@ static void * __init early_node_mem(int nodeid, unsigned long start,
163 unsigned long end, unsigned long size, 163 unsigned long end, unsigned long size,
164 unsigned long align) 164 unsigned long align)
165{ 165{
166 unsigned long mem = find_e820_area(start, end, size, align); 166 unsigned long mem;
167 void *ptr;
168 167
168 /*
 169	 * Put the allocation as high as possible; NODE_DATA and
 170	 * other per-node data will be placed here.
171 */
172 if (start < (MAX_DMA_PFN<<PAGE_SHIFT))
173 start = MAX_DMA_PFN<<PAGE_SHIFT;
174 if (start < (MAX_DMA32_PFN<<PAGE_SHIFT) &&
175 end > (MAX_DMA32_PFN<<PAGE_SHIFT))
176 start = MAX_DMA32_PFN<<PAGE_SHIFT;
177 mem = find_e820_area(start, end, size, align);
178 if (mem != -1L)
179 return __va(mem);
180
181 /* extend the search scope */
182 end = max_pfn_mapped << PAGE_SHIFT;
183 if (end > (MAX_DMA32_PFN<<PAGE_SHIFT))
184 start = MAX_DMA32_PFN<<PAGE_SHIFT;
185 else
186 start = MAX_DMA_PFN<<PAGE_SHIFT;
187 mem = find_e820_area(start, end, size, align);
169 if (mem != -1L) 188 if (mem != -1L)
170 return __va(mem); 189 return __va(mem);
171 190
172 ptr = __alloc_bootmem_nopanic(size, align, __pa(MAX_DMA_ADDRESS)); 191 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
173 if (ptr == NULL) {
174 printk(KERN_ERR "Cannot find %lu bytes in node %d\n",
175 size, nodeid); 192 size, nodeid);
176 return NULL; 193
177 } 194 return NULL;
178 return ptr;
179} 195}
180 196
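
The reworked early_node_mem() above prefers to place per-node structures as high as possible: it first raises the search start above the DMA and DMA32 boundaries when the node spans them, and only widens the window toward max_pfn_mapped if the first find_e820_area() lookup fails. A stand-alone sketch of just the start-raising step, with the zone boundaries hard-coded for illustration:

#include <stdio.h>

#define PAGE_SHIFT	12
#define MAX_DMA_PFN	((16UL << 20) >> PAGE_SHIFT)	/* 16 MB */
#define MAX_DMA32_PFN	((4UL  << 30) >> PAGE_SHIFT)	/* 4 GB  */

/* Raise 'start' above the DMA/DMA32 zones when the node spans them. */
static unsigned long clamp_start(unsigned long start, unsigned long end)
{
	if (start < (MAX_DMA_PFN << PAGE_SHIFT))
		start = MAX_DMA_PFN << PAGE_SHIFT;
	if (start < (MAX_DMA32_PFN << PAGE_SHIFT) &&
	    end > (MAX_DMA32_PFN << PAGE_SHIFT))
		start = MAX_DMA32_PFN << PAGE_SHIFT;
	return start;
}

int main(void)
{
	/* Node covering 0 .. 8 GB: allocations start above 4 GB. */
	printf("%#lx\n", clamp_start(0, 8UL << 30));
	return 0;
}
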
181/* Initialize bootmem allocator for a node */ 197/* Initialize bootmem allocator for a node */
182void __init 198void __init
183setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) 199setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
184{ 200{
185 unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; 201 unsigned long start_pfn, last_pfn, nodedata_phys;
186 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); 202 const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE);
187 unsigned long bootmap_start, nodedata_phys;
188 void *bootmap;
189 int nid; 203 int nid;
204#ifndef CONFIG_NO_BOOTMEM
205 unsigned long bootmap_start, bootmap_pages, bootmap_size;
206 void *bootmap;
207#endif
190 208
191 if (!end) 209 if (!end)
192 return; 210 return;
@@ -200,7 +218,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
200 218
201 start = roundup(start, ZONE_ALIGN); 219 start = roundup(start, ZONE_ALIGN);
202 220
203 printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, 221 printk(KERN_INFO "Initmem setup node %d %016lx-%016lx\n", nodeid,
204 start, end); 222 start, end);
205 223
206 start_pfn = start >> PAGE_SHIFT; 224 start_pfn = start >> PAGE_SHIFT;
@@ -211,14 +229,21 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
211 if (node_data[nodeid] == NULL) 229 if (node_data[nodeid] == NULL)
212 return; 230 return;
213 nodedata_phys = __pa(node_data[nodeid]); 231 nodedata_phys = __pa(node_data[nodeid]);
232 reserve_early(nodedata_phys, nodedata_phys + pgdat_size, "NODE_DATA");
214 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys, 233 printk(KERN_INFO " NODE_DATA [%016lx - %016lx]\n", nodedata_phys,
215 nodedata_phys + pgdat_size - 1); 234 nodedata_phys + pgdat_size - 1);
235 nid = phys_to_nid(nodedata_phys);
236 if (nid != nodeid)
237 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
216 238
217 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t)); 239 memset(NODE_DATA(nodeid), 0, sizeof(pg_data_t));
218 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid]; 240 NODE_DATA(nodeid)->node_id = nodeid;
219 NODE_DATA(nodeid)->node_start_pfn = start_pfn; 241 NODE_DATA(nodeid)->node_start_pfn = start_pfn;
220 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn; 242 NODE_DATA(nodeid)->node_spanned_pages = last_pfn - start_pfn;
221 243
244#ifndef CONFIG_NO_BOOTMEM
245 NODE_DATA(nodeid)->bdata = &bootmem_node_data[nodeid];
246
222 /* 247 /*
223 * Find a place for the bootmem map 248 * Find a place for the bootmem map
224 * nodedata_phys could be on other nodes by alloc_bootmem, 249 * nodedata_phys could be on other nodes by alloc_bootmem,
@@ -227,11 +252,7 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
227 * of alloc_bootmem, that could clash with reserved range 252 * of alloc_bootmem, that could clash with reserved range
228 */ 253 */
229 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn); 254 bootmap_pages = bootmem_bootmap_pages(last_pfn - start_pfn);
230 nid = phys_to_nid(nodedata_phys); 255 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
231 if (nid == nodeid)
232 bootmap_start = roundup(nodedata_phys + pgdat_size, PAGE_SIZE);
233 else
234 bootmap_start = roundup(start, PAGE_SIZE);
235 /* 256 /*
236 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like 257 * SMP_CACHE_BYTES could be enough, but init_bootmem_node like
237 * to use that to align to PAGE_SIZE 258 * to use that to align to PAGE_SIZE
@@ -239,12 +260,13 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
239 bootmap = early_node_mem(nodeid, bootmap_start, end, 260 bootmap = early_node_mem(nodeid, bootmap_start, end,
240 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE); 261 bootmap_pages<<PAGE_SHIFT, PAGE_SIZE);
241 if (bootmap == NULL) { 262 if (bootmap == NULL) {
242 if (nodedata_phys < start || nodedata_phys >= end) 263 free_early(nodedata_phys, nodedata_phys + pgdat_size);
243 free_bootmem(nodedata_phys, pgdat_size);
244 node_data[nodeid] = NULL; 264 node_data[nodeid] = NULL;
245 return; 265 return;
246 } 266 }
247 bootmap_start = __pa(bootmap); 267 bootmap_start = __pa(bootmap);
268 reserve_early(bootmap_start, bootmap_start+(bootmap_pages<<PAGE_SHIFT),
269 "BOOTMAP");
248 270
249 bootmap_size = init_bootmem_node(NODE_DATA(nodeid), 271 bootmap_size = init_bootmem_node(NODE_DATA(nodeid),
250 bootmap_start >> PAGE_SHIFT, 272 bootmap_start >> PAGE_SHIFT,
@@ -253,31 +275,12 @@ setup_node_bootmem(int nodeid, unsigned long start, unsigned long end)
253 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n", 275 printk(KERN_INFO " bootmap [%016lx - %016lx] pages %lx\n",
254 bootmap_start, bootmap_start + bootmap_size - 1, 276 bootmap_start, bootmap_start + bootmap_size - 1,
255 bootmap_pages); 277 bootmap_pages);
256
257 free_bootmem_with_active_regions(nodeid, end);
258
259 /*
260 * convert early reserve to bootmem reserve earlier
261 * otherwise early_node_mem could use early reserved mem
262 * on previous node
263 */
264 early_res_to_bootmem(start, end);
265
266 /*
267 * in some case early_node_mem could use alloc_bootmem
268 * to get range on other node, don't reserve that again
269 */
270 if (nid != nodeid)
271 printk(KERN_INFO " NODE_DATA(%d) on node %d\n", nodeid, nid);
272 else
273 reserve_bootmem_node(NODE_DATA(nodeid), nodedata_phys,
274 pgdat_size, BOOTMEM_DEFAULT);
275 nid = phys_to_nid(bootmap_start); 278 nid = phys_to_nid(bootmap_start);
276 if (nid != nodeid) 279 if (nid != nodeid)
277 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid); 280 printk(KERN_INFO " bootmap(%d) on node %d\n", nodeid, nid);
278 else 281
279 reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, 282 free_bootmem_with_active_regions(nodeid, end);
280 bootmap_pages<<PAGE_SHIFT, BOOTMEM_DEFAULT); 283#endif
281 284
282 node_set_online(nodeid); 285 node_set_online(nodeid);
283} 286}
@@ -306,8 +309,71 @@ void __init numa_init_array(void)
306 309
307#ifdef CONFIG_NUMA_EMU 310#ifdef CONFIG_NUMA_EMU
308/* Numa emulation */ 311/* Numa emulation */
312static struct bootnode nodes[MAX_NUMNODES] __initdata;
313static struct bootnode physnodes[MAX_NUMNODES] __initdata;
309static char *cmdline __initdata; 314static char *cmdline __initdata;
310 315
316static int __init setup_physnodes(unsigned long start, unsigned long end,
317 int acpi, int k8)
318{
319 int nr_nodes = 0;
320 int ret = 0;
321 int i;
322
323#ifdef CONFIG_ACPI_NUMA
324 if (acpi)
325 nr_nodes = acpi_get_nodes(physnodes);
326#endif
327#ifdef CONFIG_K8_NUMA
328 if (k8)
329 nr_nodes = k8_get_nodes(physnodes);
330#endif
331 /*
332 * Basic sanity checking on the physical node map: there may be errors
333 * if the SRAT or K8 incorrectly reported the topology or the mem=
334 * kernel parameter is used.
335 */
336 for (i = 0; i < nr_nodes; i++) {
337 if (physnodes[i].start == physnodes[i].end)
338 continue;
339 if (physnodes[i].start > end) {
340 physnodes[i].end = physnodes[i].start;
341 continue;
342 }
343 if (physnodes[i].end < start) {
344 physnodes[i].start = physnodes[i].end;
345 continue;
346 }
347 if (physnodes[i].start < start)
348 physnodes[i].start = start;
349 if (physnodes[i].end > end)
350 physnodes[i].end = end;
351 }
352
353 /*
354 * Remove all nodes that have no memory or were truncated because of the
355 * limited address range.
356 */
357 for (i = 0; i < nr_nodes; i++) {
358 if (physnodes[i].start == physnodes[i].end)
359 continue;
360 physnodes[ret].start = physnodes[i].start;
361 physnodes[ret].end = physnodes[i].end;
362 ret++;
363 }
364
365 /*
366 * If no physical topology was detected, a single node is faked to cover
367 * the entire address space.
368 */
369 if (!ret) {
370 physnodes[ret].start = start;
371 physnodes[ret].end = end;
372 ret = 1;
373 }
374 return ret;
375}
376
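
setup_physnodes() clips every reported physical node to the usable [start, end) window and then compacts away entries that become empty, so later loops only see valid ranges. The same clip-and-compact idea in a self-contained form (the struct here is a simplified stand-in for bootnode):

#include <stdio.h>

struct range { unsigned long start, end; };

/* Clip each range to [lo, hi) and drop the ones that become empty;
 * returns the number of surviving ranges, packed at the front. */
static int clip_and_compact(struct range *r, int n,
			    unsigned long lo, unsigned long hi)
{
	int i, out = 0;

	for (i = 0; i < n; i++) {
		if (r[i].start < lo)
			r[i].start = lo;
		if (r[i].end > hi)
			r[i].end = hi;
		if (r[i].start >= r[i].end)
			continue;		/* empty or fully outside */
		r[out++] = r[i];
	}
	return out;
}

int main(void)
{
	struct range nodes[] = { { 0, 0x1000 }, { 0x4000, 0x5000 } };
	int n = clip_and_compact(nodes, 2, 0x800, 0x4800);

	printf("%d nodes, first %#lx-%#lx\n", n, nodes[0].start, nodes[0].end);
	return 0;
}
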
311/* 377/*
312 * Setups up nid to range from addr to addr + size. If the end 378 * Setups up nid to range from addr to addr + size. If the end
313 * boundary is greater than max_addr, then max_addr is used instead. 379 * boundary is greater than max_addr, then max_addr is used instead.
@@ -315,11 +381,9 @@ static char *cmdline __initdata;
315 * allocation past addr and -1 otherwise. addr is adjusted to be at 381 * allocation past addr and -1 otherwise. addr is adjusted to be at
316 * the end of the node. 382 * the end of the node.
317 */ 383 */
318static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr, 384static int __init setup_node_range(int nid, u64 *addr, u64 size, u64 max_addr)
319 u64 size, u64 max_addr)
320{ 385{
321 int ret = 0; 386 int ret = 0;
322
323 nodes[nid].start = *addr; 387 nodes[nid].start = *addr;
324 *addr += size; 388 *addr += size;
325 if (*addr >= max_addr) { 389 if (*addr >= max_addr) {
@@ -335,167 +399,234 @@ static int __init setup_node_range(int nid, struct bootnode *nodes, u64 *addr,
335} 399}
336 400
337/* 401/*
338 * Splits num_nodes nodes up equally starting at node_start. The return value 402 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from addr
339 * is the number of nodes split up and addr is adjusted to be at the end of the 403 * to max_addr. The return value is the number of nodes allocated.
340 * last node allocated.
341 */ 404 */
342static int __init split_nodes_equally(struct bootnode *nodes, u64 *addr, 405static int __init split_nodes_interleave(u64 addr, u64 max_addr,
343 u64 max_addr, int node_start, 406 int nr_phys_nodes, int nr_nodes)
344 int num_nodes)
345{ 407{
346 unsigned int big; 408 nodemask_t physnode_mask = NODE_MASK_NONE;
347 u64 size; 409 u64 size;
410 int big;
411 int ret = 0;
348 int i; 412 int i;
349 413
350 if (num_nodes <= 0) 414 if (nr_nodes <= 0)
351 return -1; 415 return -1;
352 if (num_nodes > MAX_NUMNODES) 416 if (nr_nodes > MAX_NUMNODES) {
353 num_nodes = MAX_NUMNODES; 417 pr_info("numa=fake=%d too large, reducing to %d\n",
354 size = (max_addr - *addr - e820_hole_size(*addr, max_addr)) / 418 nr_nodes, MAX_NUMNODES);
355 num_nodes; 419 nr_nodes = MAX_NUMNODES;
420 }
421
422 size = (max_addr - addr - e820_hole_size(addr, max_addr)) / nr_nodes;
356 /* 423 /*
357 * Calculate the number of big nodes that can be allocated as a result 424 * Calculate the number of big nodes that can be allocated as a result
358 * of consolidating the leftovers. 425 * of consolidating the remainder.
359 */ 426 */
360 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * num_nodes) / 427 big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
361 FAKE_NODE_MIN_SIZE; 428 FAKE_NODE_MIN_SIZE;
362 429
363 /* Round down to nearest FAKE_NODE_MIN_SIZE. */
364 size &= FAKE_NODE_MIN_HASH_MASK; 430 size &= FAKE_NODE_MIN_HASH_MASK;
365 if (!size) { 431 if (!size) {
366 printk(KERN_ERR "Not enough memory for each node. " 432 pr_err("Not enough memory for each node. "
367 "NUMA emulation disabled.\n"); 433 "NUMA emulation disabled.\n");
368 return -1; 434 return -1;
369 } 435 }
370 436
371 for (i = node_start; i < num_nodes + node_start; i++) { 437 for (i = 0; i < nr_phys_nodes; i++)
372 u64 end = *addr + size; 438 if (physnodes[i].start != physnodes[i].end)
439 node_set(i, physnode_mask);
373 440
374 if (i < big) 441 /*
375 end += FAKE_NODE_MIN_SIZE; 442 * Continue to fill physical nodes with fake nodes until there is no
376 /* 443 * memory left on any of them.
377 * The final node can have the remaining system RAM. Other 444 */
378 * nodes receive roughly the same amount of available pages. 445 while (nodes_weight(physnode_mask)) {
379 */ 446 for_each_node_mask(i, physnode_mask) {
380 if (i == num_nodes + node_start - 1) 447 u64 end = physnodes[i].start + size;
381 end = max_addr; 448 u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
382 else 449
383 while (end - *addr - e820_hole_size(*addr, end) < 450 if (ret < big)
384 size) { 451 end += FAKE_NODE_MIN_SIZE;
452
453 /*
454 * Continue to add memory to this fake node if its
455 * non-reserved memory is less than the per-node size.
456 */
457 while (end - physnodes[i].start -
458 e820_hole_size(physnodes[i].start, end) < size) {
385 end += FAKE_NODE_MIN_SIZE; 459 end += FAKE_NODE_MIN_SIZE;
386 if (end > max_addr) { 460 if (end > physnodes[i].end) {
387 end = max_addr; 461 end = physnodes[i].end;
388 break; 462 break;
389 } 463 }
390 } 464 }
391 if (setup_node_range(i, nodes, addr, end - *addr, max_addr) < 0) 465
392 break; 466 /*
467 * If there won't be at least FAKE_NODE_MIN_SIZE of
468 * non-reserved memory in ZONE_DMA32 for the next node,
469 * this one must extend to the boundary.
470 */
471 if (end < dma32_end && dma32_end - end -
472 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
473 end = dma32_end;
474
475 /*
476 * If there won't be enough non-reserved memory for the
477 * next node, this one must extend to the end of the
478 * physical node.
479 */
480 if (physnodes[i].end - end -
481 e820_hole_size(end, physnodes[i].end) < size)
482 end = physnodes[i].end;
483
484 /*
485 * Avoid allocating more nodes than requested, which can
486 * happen as a result of rounding down each node's size
487 * to FAKE_NODE_MIN_SIZE.
488 */
489 if (nodes_weight(physnode_mask) + ret >= nr_nodes)
490 end = physnodes[i].end;
491
492 if (setup_node_range(ret++, &physnodes[i].start,
493 end - physnodes[i].start,
494 physnodes[i].end) < 0)
495 node_clear(i, physnode_mask);
496 }
393 } 497 }
394 return i - node_start + 1; 498 return ret;
395} 499}
396 500
397/* 501/*
398 * Splits the remaining system RAM into chunks of size. The remaining memory is 502 * Returns the end address of a node so that there is at least `size' amount of
399 * always assigned to a final node and can be asymmetric. Returns the number of 503 * non-reserved memory or `max_addr' is reached.
400 * nodes split.
401 */ 504 */
402static int __init split_nodes_by_size(struct bootnode *nodes, u64 *addr, 505static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
403 u64 max_addr, int node_start, u64 size)
404{ 506{
405 int i = node_start; 507 u64 end = start + size;
406 size = (size << 20) & FAKE_NODE_MIN_HASH_MASK; 508
407 while (!setup_node_range(i++, nodes, addr, size, max_addr)) 509 while (end - start - e820_hole_size(start, end) < size) {
408 ; 510 end += FAKE_NODE_MIN_SIZE;
409 return i - node_start; 511 if (end > max_addr) {
512 end = max_addr;
513 break;
514 }
515 }
516 return end;
410} 517}
411 518
412/* 519/*
413 * Sets up the system RAM area from start_pfn to last_pfn according to the 520 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
414 * numa=fake command-line option. 521 * `addr' to `max_addr'. The return value is the number of nodes allocated.
415 */ 522 */
416static struct bootnode nodes[MAX_NUMNODES] __initdata; 523static int __init split_nodes_size_interleave(u64 addr, u64 max_addr, u64 size)
417
418static int __init numa_emulation(unsigned long start_pfn, unsigned long last_pfn)
419{ 524{
420 u64 size, addr = start_pfn << PAGE_SHIFT; 525 nodemask_t physnode_mask = NODE_MASK_NONE;
421 u64 max_addr = last_pfn << PAGE_SHIFT; 526 u64 min_size;
422 int num_nodes = 0, num = 0, coeff_flag, coeff = -1, i; 527 int ret = 0;
528 int i;
423 529
424 memset(&nodes, 0, sizeof(nodes)); 530 if (!size)
531 return -1;
425 /* 532 /*
426 * If the numa=fake command-line is just a single number N, split the 533 * The limit on emulated nodes is MAX_NUMNODES, so the size per node is
427 * system RAM into N fake nodes. 534 * increased accordingly if the requested size is too small. This
535 * creates a uniform distribution of node sizes across the entire
536 * machine (but not necessarily over physical nodes).
428 */ 537 */
429 if (!strchr(cmdline, '*') && !strchr(cmdline, ',')) { 538 min_size = (max_addr - addr - e820_hole_size(addr, max_addr)) /
430 long n = simple_strtol(cmdline, NULL, 0); 539 MAX_NUMNODES;
431 540 min_size = max(min_size, FAKE_NODE_MIN_SIZE);
432 num_nodes = split_nodes_equally(nodes, &addr, max_addr, 0, n); 541 if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
433 if (num_nodes < 0) 542 min_size = (min_size + FAKE_NODE_MIN_SIZE) &
434 return num_nodes; 543 FAKE_NODE_MIN_HASH_MASK;
435 goto out; 544 if (size < min_size) {
545 pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
546 size >> 20, min_size >> 20);
547 size = min_size;
436 } 548 }
549 size &= FAKE_NODE_MIN_HASH_MASK;
437 550
438 /* Parse the command line. */ 551 for (i = 0; i < MAX_NUMNODES; i++)
439 for (coeff_flag = 0; ; cmdline++) { 552 if (physnodes[i].start != physnodes[i].end)
440 if (*cmdline && isdigit(*cmdline)) { 553 node_set(i, physnode_mask);
441 num = num * 10 + *cmdline - '0'; 554 /*
442 continue; 555 * Fill physical nodes with fake nodes of size until there is no memory
443 } 556 * left on any of them.
444 if (*cmdline == '*') { 557 */
445 if (num > 0) 558 while (nodes_weight(physnode_mask)) {
446 coeff = num; 559 for_each_node_mask(i, physnode_mask) {
447 coeff_flag = 1; 560 u64 dma32_end = MAX_DMA32_PFN << PAGE_SHIFT;
448 } 561 u64 end;
449 if (!*cmdline || *cmdline == ',') { 562
450 if (!coeff_flag) 563 end = find_end_of_node(physnodes[i].start,
451 coeff = 1; 564 physnodes[i].end, size);
452 /* 565 /*
453 * Round down to the nearest FAKE_NODE_MIN_SIZE. 566 * If there won't be at least FAKE_NODE_MIN_SIZE of
454 * Command-line coefficients are in megabytes. 567 * non-reserved memory in ZONE_DMA32 for the next node,
568 * this one must extend to the boundary.
455 */ 569 */
456 size = ((u64)num << 20) & FAKE_NODE_MIN_HASH_MASK; 570 if (end < dma32_end && dma32_end - end -
457 if (size) 571 e820_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
458 for (i = 0; i < coeff; i++, num_nodes++) 572 end = dma32_end;
459 if (setup_node_range(num_nodes, nodes, 573
460 &addr, size, max_addr) < 0) 574 /*
461 goto done; 575 * If there won't be enough non-reserved memory for the
462 if (!*cmdline) 576 * next node, this one must extend to the end of the
463 break; 577 * physical node.
464 coeff_flag = 0; 578 */
465 coeff = -1; 579 if (physnodes[i].end - end -
580 e820_hole_size(end, physnodes[i].end) < size)
581 end = physnodes[i].end;
582
583 /*
584 * Setup the fake node that will be allocated as bootmem
585 * later. If setup_node_range() returns non-zero, there
586 * is no more memory available on this physical node.
587 */
588 if (setup_node_range(ret++, &physnodes[i].start,
589 end - physnodes[i].start,
590 physnodes[i].end) < 0)
591 node_clear(i, physnode_mask);
466 } 592 }
467 num = 0;
468 } 593 }
469done: 594 return ret;
470 if (!num_nodes) 595}
471 return -1; 596
472 /* Fill remainder of system RAM, if appropriate. */ 597/*
473 if (addr < max_addr) { 598 * Sets up the system RAM area from start_pfn to last_pfn according to the
474 if (coeff_flag && coeff < 0) { 599 * numa=fake command-line option.
475 /* Split remaining nodes into num-sized chunks */ 600 */
476 num_nodes += split_nodes_by_size(nodes, &addr, max_addr, 601static int __init numa_emulation(unsigned long start_pfn,
477 num_nodes, num); 602 unsigned long last_pfn, int acpi, int k8)
478 goto out; 603{
479 } 604 u64 addr = start_pfn << PAGE_SHIFT;
480 switch (*(cmdline - 1)) { 605 u64 max_addr = last_pfn << PAGE_SHIFT;
481 case '*': 606 int num_phys_nodes;
482 /* Split remaining nodes into coeff chunks */ 607 int num_nodes;
483 if (coeff <= 0) 608 int i;
484 break; 609
485 num_nodes += split_nodes_equally(nodes, &addr, max_addr, 610 num_phys_nodes = setup_physnodes(addr, max_addr, acpi, k8);
486 num_nodes, coeff); 611 /*
487 break; 612 * If the numa=fake command-line contains a 'M' or 'G', it represents
488 case ',': 613 * the fixed node size. Otherwise, if it is just a single number N,
489 /* Do not allocate remaining system RAM */ 614 * split the system RAM into N fake nodes.
490 break; 615 */
491 default: 616 if (strchr(cmdline, 'M') || strchr(cmdline, 'G')) {
492 /* Give one final node */ 617 u64 size;
493 setup_node_range(num_nodes, nodes, &addr, 618
494 max_addr - addr, max_addr); 619 size = memparse(cmdline, &cmdline);
495 num_nodes++; 620 num_nodes = split_nodes_size_interleave(addr, max_addr, size);
496 } 621 } else {
622 unsigned long n;
623
624 n = simple_strtoul(cmdline, NULL, 0);
625 num_nodes = split_nodes_interleave(addr, max_addr, num_phys_nodes, n);
497 } 626 }
498out: 627
628 if (num_nodes < 0)
629 return num_nodes;
499 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL); 630 memnode_shift = compute_hash_shift(nodes, num_nodes, NULL);
500 if (memnode_shift < 0) { 631 if (memnode_shift < 0) {
501 memnode_shift = 0; 632 memnode_shift = 0;
@@ -505,14 +636,10 @@ out:
505 } 636 }
506 637
507 /* 638 /*
508 * We need to vacate all active ranges that may have been registered by 639 * We need to vacate all active ranges that may have been registered for
509 * SRAT and set acpi_numa to -1 so that srat_disabled() always returns 640 * the e820 memory map.
510 * true. NUMA emulation has succeeded so we will not scan ACPI nodes.
511 */ 641 */
512 remove_all_active_ranges(); 642 remove_all_active_ranges();
513#ifdef CONFIG_ACPI_NUMA
514 acpi_numa = -1;
515#endif
516 for_each_node_mask(i, node_possible_map) { 643 for_each_node_mask(i, node_possible_map) {
517 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT, 644 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
518 nodes[i].end >> PAGE_SHIFT); 645 nodes[i].end >> PAGE_SHIFT);
@@ -524,7 +651,8 @@ out:
524} 651}
525#endif /* CONFIG_NUMA_EMU */ 652#endif /* CONFIG_NUMA_EMU */
526 653
527void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn) 654void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn,
655 int acpi, int k8)
528{ 656{
529 int i; 657 int i;
530 658
@@ -532,23 +660,22 @@ void __init initmem_init(unsigned long start_pfn, unsigned long last_pfn)
532 nodes_clear(node_online_map); 660 nodes_clear(node_online_map);
533 661
534#ifdef CONFIG_NUMA_EMU 662#ifdef CONFIG_NUMA_EMU
535 if (cmdline && !numa_emulation(start_pfn, last_pfn)) 663 if (cmdline && !numa_emulation(start_pfn, last_pfn, acpi, k8))
536 return; 664 return;
537 nodes_clear(node_possible_map); 665 nodes_clear(node_possible_map);
538 nodes_clear(node_online_map); 666 nodes_clear(node_online_map);
539#endif 667#endif
540 668
541#ifdef CONFIG_ACPI_NUMA 669#ifdef CONFIG_ACPI_NUMA
542 if (!numa_off && !acpi_scan_nodes(start_pfn << PAGE_SHIFT, 670 if (!numa_off && acpi && !acpi_scan_nodes(start_pfn << PAGE_SHIFT,
543 last_pfn << PAGE_SHIFT)) 671 last_pfn << PAGE_SHIFT))
544 return; 672 return;
545 nodes_clear(node_possible_map); 673 nodes_clear(node_possible_map);
546 nodes_clear(node_online_map); 674 nodes_clear(node_online_map);
547#endif 675#endif
548 676
549#ifdef CONFIG_K8_NUMA 677#ifdef CONFIG_K8_NUMA
550 if (!numa_off && !k8_scan_nodes(start_pfn<<PAGE_SHIFT, 678 if (!numa_off && k8 && !k8_scan_nodes())
551 last_pfn<<PAGE_SHIFT))
552 return; 679 return;
553 nodes_clear(node_possible_map); 680 nodes_clear(node_possible_map);
554 nodes_clear(node_online_map); 681 nodes_clear(node_online_map);
@@ -579,6 +706,10 @@ unsigned long __init numa_free_all_bootmem(void)
579 for_each_online_node(i) 706 for_each_online_node(i)
580 pages += free_all_bootmem_node(NODE_DATA(i)); 707 pages += free_all_bootmem_node(NODE_DATA(i));
581 708
709#ifdef CONFIG_NO_BOOTMEM
710 pages += free_all_memory_core_early(MAX_NUMNODES);
711#endif
712
582 return pages; 713 return pages;
583} 714}
584 715
@@ -601,6 +732,25 @@ static __init int numa_setup(char *opt)
601early_param("numa", numa_setup); 732early_param("numa", numa_setup);
602 733
603#ifdef CONFIG_NUMA 734#ifdef CONFIG_NUMA
735
736static __init int find_near_online_node(int node)
737{
738 int n, val;
739 int min_val = INT_MAX;
740 int best_node = -1;
741
742 for_each_online_node(n) {
743 val = node_distance(node, n);
744
745 if (val < min_val) {
746 min_val = val;
747 best_node = n;
748 }
749 }
750
751 return best_node;
752}
753
604/* 754/*
605 * Setup early cpu_to_node. 755 * Setup early cpu_to_node.
606 * 756 *
@@ -632,7 +782,7 @@ void __init init_cpu_to_node(void)
632 if (node == NUMA_NO_NODE) 782 if (node == NUMA_NO_NODE)
633 continue; 783 continue;
634 if (!node_online(node)) 784 if (!node_online(node))
635 continue; 785 node = find_near_online_node(node);
636 numa_set_node(cpu, node); 786 numa_set_node(cpu, node);
637 } 787 }
638} 788}
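
The new find_near_online_node() lets init_cpu_to_node() map a CPU whose firmware-reported node is not online onto the closest node that is, using node_distance(). A toy stand-alone version of the same selection over an invented SLIT-style distance table:

#include <stdio.h>
#include <limits.h>

#define NR_NODES 4

/* Invented distance table: 10 = local, larger = farther away. */
static const int distance[NR_NODES][NR_NODES] = {
	{ 10, 20, 20, 30 },
	{ 20, 10, 30, 20 },
	{ 20, 30, 10, 20 },
	{ 30, 20, 20, 10 },
};
static const int online[NR_NODES] = { 1, 0, 1, 1 };	/* node 1 offline */

static int find_near_online_node(int node)
{
	int n, best_node = -1, min_val = INT_MAX;

	for (n = 0; n < NR_NODES; n++) {
		if (!online[n])
			continue;
		if (distance[node][n] < min_val) {
			min_val = distance[node][n];
			best_node = n;
		}
	}
	return best_node;
}

int main(void)
{
	printf("CPU on offline node 1 falls back to node %d\n",
	       find_near_online_node(1));
	return 0;
}
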
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dd38bfbefd1f..28195c350b97 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -6,13 +6,13 @@
6#include <linux/bootmem.h> 6#include <linux/bootmem.h>
7#include <linux/module.h> 7#include <linux/module.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
10#include <linux/mm.h> 9#include <linux/mm.h>
11#include <linux/interrupt.h> 10#include <linux/interrupt.h>
12#include <linux/seq_file.h> 11#include <linux/seq_file.h>
13#include <linux/debugfs.h> 12#include <linux/debugfs.h>
14#include <linux/pfn.h> 13#include <linux/pfn.h>
15#include <linux/percpu.h> 14#include <linux/percpu.h>
15#include <linux/gfp.h>
16 16
17#include <asm/e820.h> 17#include <asm/e820.h>
18#include <asm/processor.h> 18#include <asm/processor.h>
@@ -279,6 +279,43 @@ static inline pgprot_t static_protections(pgprot_t prot, unsigned long address,
279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT)) 279 __pa((unsigned long)__end_rodata) >> PAGE_SHIFT))
280 pgprot_val(forbidden) |= _PAGE_RW; 280 pgprot_val(forbidden) |= _PAGE_RW;
281 281
282#if defined(CONFIG_X86_64) && defined(CONFIG_DEBUG_RODATA)
283 /*
284 * Once the kernel maps the text as RO (kernel_set_to_readonly is set),
 285 * the kernel text mappings for the large-page-aligned text and rodata
 286 * sections will always be read-only. The kernel identity mappings
 287 * covering the holes caused by this alignment can be anything the caller asks for.
288 *
289 * This will preserve the large page mappings for kernel text/data
290 * at no extra cost.
291 */
292 if (kernel_set_to_readonly &&
293 within(address, (unsigned long)_text,
294 (unsigned long)__end_rodata_hpage_align)) {
295 unsigned int level;
296
297 /*
298 * Don't enforce the !RW mapping for the kernel text mapping,
299 * if the current mapping is already using small page mapping.
300 * No need to work hard to preserve large page mappings in this
301 * case.
302 *
303 * This also fixes the Linux Xen paravirt guest boot failure
304 * (because of unexpected read-only mappings for kernel identity
305 * mappings). In this paravirt guest case, the kernel text
306 * mapping and the kernel identity mapping share the same
307 * page-table pages. Thus we can't really use different
308 * protections for the kernel text and identity mappings. Also,
309 * these shared mappings are made of small page mappings.
 310 * Thus, not enforcing the !RW mapping for small-page kernel
 311 * text mappings also helps the Linux Xen paravirt guest boot
 312 * correctly.
313 */
314 if (lookup_address(address, &level) && (level != PG_LEVEL_4K))
315 pgprot_val(forbidden) |= _PAGE_RW;
316 }
317#endif
318
282 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden)); 319 prot = __pgprot(pgprot_val(prot) & ~pgprot_val(forbidden));
283 320
284 return prot; 321 return prot;
@@ -1069,12 +1106,18 @@ EXPORT_SYMBOL(set_memory_array_wb);
1069 1106
1070int set_memory_x(unsigned long addr, int numpages) 1107int set_memory_x(unsigned long addr, int numpages)
1071{ 1108{
1109 if (!(__supported_pte_mask & _PAGE_NX))
1110 return 0;
1111
1072 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0); 1112 return change_page_attr_clear(&addr, numpages, __pgprot(_PAGE_NX), 0);
1073} 1113}
1074EXPORT_SYMBOL(set_memory_x); 1114EXPORT_SYMBOL(set_memory_x);
1075 1115
1076int set_memory_nx(unsigned long addr, int numpages) 1116int set_memory_nx(unsigned long addr, int numpages)
1077{ 1117{
1118 if (!(__supported_pte_mask & _PAGE_NX))
1119 return 0;
1120
1078 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0); 1121 return change_page_attr_set(&addr, numpages, __pgprot(_PAGE_NX), 0);
1079} 1122}
1080EXPORT_SYMBOL(set_memory_nx); 1123EXPORT_SYMBOL(set_memory_nx);
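
The two early returns above make set_memory_x()/set_memory_nx() no-ops when _PAGE_NX is not in __supported_pte_mask. A small user-space sketch of that guard pattern, with placeholder names standing in for the kernel symbols:

#include <stdint.h>
#include <stdio.h>

#define PAGE_NX (1ULL << 63)          /* placeholder for _PAGE_NX */
static uint64_t supported_pte_mask;   /* placeholder for __supported_pte_mask */

/* Bail out before touching page attributes when NX is not supported at all. */
static int set_memory_nx_sketch(unsigned long addr, int numpages)
{
    if (!(supported_pte_mask & PAGE_NX))
        return 0;                     /* NX unsupported: nothing to do */
    printf("would set NX on %d page(s) at 0x%lx\n", numpages, addr);
    return 0;
}

int main(void)
{
    set_memory_nx_sketch(0x100000UL, 4);   /* silently ignored, no NX */
    supported_pte_mask |= PAGE_NX;
    set_memory_nx_sketch(0x100000UL, 4);   /* now takes effect */
    return 0;
}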
diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c
index e78cd0ec2bcf..edc8b95afc1a 100644
--- a/arch/x86/mm/pat.c
+++ b/arch/x86/mm/pat.c
@@ -12,7 +12,7 @@
12#include <linux/debugfs.h> 12#include <linux/debugfs.h>
13#include <linux/kernel.h> 13#include <linux/kernel.h>
14#include <linux/module.h> 14#include <linux/module.h>
15#include <linux/gfp.h> 15#include <linux/slab.h>
16#include <linux/mm.h> 16#include <linux/mm.h>
17#include <linux/fs.h> 17#include <linux/fs.h>
18#include <linux/rbtree.h> 18#include <linux/rbtree.h>
@@ -20,6 +20,7 @@
20#include <asm/cacheflush.h> 20#include <asm/cacheflush.h>
21#include <asm/processor.h> 21#include <asm/processor.h>
22#include <asm/tlbflush.h> 22#include <asm/tlbflush.h>
23#include <asm/x86_init.h>
23#include <asm/pgtable.h> 24#include <asm/pgtable.h>
24#include <asm/fcntl.h> 25#include <asm/fcntl.h>
25#include <asm/e820.h> 26#include <asm/e820.h>
@@ -355,9 +356,6 @@ static int free_ram_pages_type(u64 start, u64 end)
355 * - _PAGE_CACHE_UC_MINUS 356 * - _PAGE_CACHE_UC_MINUS
356 * - _PAGE_CACHE_UC 357 * - _PAGE_CACHE_UC
357 * 358 *
358 * req_type will have a special case value '-1', when requester want to inherit
359 * the memory type from mtrr (if WB), existing PAT, defaulting to UC_MINUS.
360 *
361 * If new_type is NULL, function will return an error if it cannot reserve the 359 * If new_type is NULL, function will return an error if it cannot reserve the
362 * region with req_type. If new_type is non-NULL, function will return 360 * region with req_type. If new_type is non-NULL, function will return
363 * available type in new_type in case of no error. In case of any error 361 * available type in new_type in case of no error. In case of any error
@@ -377,9 +375,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
377 if (!pat_enabled) { 375 if (!pat_enabled) {
378 /* This is identical to page table setting without PAT */ 376 /* This is identical to page table setting without PAT */
379 if (new_type) { 377 if (new_type) {
380 if (req_type == -1) 378 if (req_type == _PAGE_CACHE_WC)
381 *new_type = _PAGE_CACHE_WB;
382 else if (req_type == _PAGE_CACHE_WC)
383 *new_type = _PAGE_CACHE_UC_MINUS; 379 *new_type = _PAGE_CACHE_UC_MINUS;
384 else 380 else
385 *new_type = req_type & _PAGE_CACHE_MASK; 381 *new_type = req_type & _PAGE_CACHE_MASK;
@@ -388,7 +384,7 @@ int reserve_memtype(u64 start, u64 end, unsigned long req_type,
388 } 384 }
389 385
390 /* Low ISA region is always mapped WB in page table. No need to track */ 386 /* Low ISA region is always mapped WB in page table. No need to track */
391 if (is_ISA_range(start, end - 1)) { 387 if (x86_platform.is_untracked_pat_range(start, end)) {
392 if (new_type) 388 if (new_type)
393 *new_type = _PAGE_CACHE_WB; 389 *new_type = _PAGE_CACHE_WB;
394 return 0; 390 return 0;
@@ -499,7 +495,7 @@ int free_memtype(u64 start, u64 end)
499 return 0; 495 return 0;
500 496
501 /* Low ISA region is always mapped WB. No need to track */ 497 /* Low ISA region is always mapped WB. No need to track */
502 if (is_ISA_range(start, end - 1)) 498 if (x86_platform.is_untracked_pat_range(start, end))
503 return 0; 499 return 0;
504 500
505 is_range_ram = pat_pagerange_is_ram(start, end); 501 is_range_ram = pat_pagerange_is_ram(start, end);
@@ -582,7 +578,7 @@ static unsigned long lookup_memtype(u64 paddr)
582 int rettype = _PAGE_CACHE_WB; 578 int rettype = _PAGE_CACHE_WB;
583 struct memtype *entry; 579 struct memtype *entry;
584 580
585 if (is_ISA_range(paddr, paddr + PAGE_SIZE - 1)) 581 if (x86_platform.is_untracked_pat_range(paddr, paddr + PAGE_SIZE))
586 return rettype; 582 return rettype;
587 583
588 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) { 584 if (pat_pagerange_is_ram(paddr, paddr + PAGE_SIZE)) {
@@ -708,9 +704,8 @@ int phys_mem_access_prot_allowed(struct file *file, unsigned long pfn,
708 if (!range_is_allowed(pfn, size)) 704 if (!range_is_allowed(pfn, size))
709 return 0; 705 return 0;
710 706
711 if (file->f_flags & O_SYNC) { 707 if (file->f_flags & O_DSYNC)
712 flags = _PAGE_CACHE_UC_MINUS; 708 flags = _PAGE_CACHE_UC_MINUS;
713 }
714 709
715#ifdef CONFIG_X86_32 710#ifdef CONFIG_X86_32
716 /* 711 /*
@@ -1018,8 +1013,10 @@ static const struct file_operations memtype_fops = {
1018 1013
1019static int __init pat_memtype_list_init(void) 1014static int __init pat_memtype_list_init(void)
1020{ 1015{
1021 debugfs_create_file("pat_memtype_list", S_IRUSR, arch_debugfs_dir, 1016 if (pat_enabled) {
1022 NULL, &memtype_fops); 1017 debugfs_create_file("pat_memtype_list", S_IRUSR,
1018 arch_debugfs_dir, NULL, &memtype_fops);
1019 }
1023 return 0; 1020 return 0;
1024} 1021}
1025 1022
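
The pat.c hunks above replace the hard-coded is_ISA_range() test with the x86_platform.is_untracked_pat_range() callback, so a platform can supply a different untracked window. A user-space sketch of that callback-in-ops-struct pattern; the struct, names, and the 0xa0000-0x100000 window are illustrative stand-ins, not the kernel definitions:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

/* The generic PAT code calls through a function pointer instead of
 * hard-coding one range check. */
struct x86_platform_ops_sketch {
    bool (*is_untracked_pat_range)(uint64_t start, uint64_t end);
};

/* Default behaviour comparable to the old low-ISA-hole test. */
static bool default_is_untracked(uint64_t start, uint64_t end)
{
    return start >= 0xa0000 && end <= 0x100000;
}

static struct x86_platform_ops_sketch x86_platform_sketch = {
    .is_untracked_pat_range = default_is_untracked,
};

int main(void)
{
    uint64_t start = 0xb8000, end = 0xc0000;   /* VGA text buffer range */

    if (x86_platform_sketch.is_untracked_pat_range(start, end))
        printf("range is untracked, treated as WB\n");
    return 0;
}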
diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index ed34f5e35999..5c4ee422590e 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -1,4 +1,5 @@
1#include <linux/mm.h> 1#include <linux/mm.h>
2#include <linux/gfp.h>
2#include <asm/pgalloc.h> 3#include <asm/pgalloc.h>
3#include <asm/pgtable.h> 4#include <asm/pgtable.h>
4#include <asm/tlb.h> 5#include <asm/tlb.h>
@@ -6,6 +7,14 @@
6 7
7#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO 8#define PGALLOC_GFP GFP_KERNEL | __GFP_NOTRACK | __GFP_REPEAT | __GFP_ZERO
8 9
10#ifdef CONFIG_HIGHPTE
11#define PGALLOC_USER_GFP __GFP_HIGHMEM
12#else
13#define PGALLOC_USER_GFP 0
14#endif
15
16gfp_t __userpte_alloc_gfp = PGALLOC_GFP | PGALLOC_USER_GFP;
17
9pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address) 18pte_t *pte_alloc_one_kernel(struct mm_struct *mm, unsigned long address)
10{ 19{
11 return (pte_t *)__get_free_page(PGALLOC_GFP); 20 return (pte_t *)__get_free_page(PGALLOC_GFP);
@@ -15,16 +24,29 @@ pgtable_t pte_alloc_one(struct mm_struct *mm, unsigned long address)
15{ 24{
16 struct page *pte; 25 struct page *pte;
17 26
18#ifdef CONFIG_HIGHPTE 27 pte = alloc_pages(__userpte_alloc_gfp, 0);
19 pte = alloc_pages(PGALLOC_GFP | __GFP_HIGHMEM, 0);
20#else
21 pte = alloc_pages(PGALLOC_GFP, 0);
22#endif
23 if (pte) 28 if (pte)
24 pgtable_page_ctor(pte); 29 pgtable_page_ctor(pte);
25 return pte; 30 return pte;
26} 31}
27 32
33static int __init setup_userpte(char *arg)
34{
35 if (!arg)
36 return -EINVAL;
37
38 /*
39 * "userpte=nohigh" disables allocation of user pagetables in
40 * high memory.
41 */
42 if (strcmp(arg, "nohigh") == 0)
43 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
44 else
45 return -EINVAL;
46 return 0;
47}
48early_param("userpte", setup_userpte);
49
28void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) 50void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte)
29{ 51{
30 pgtable_page_dtor(pte); 52 pgtable_page_dtor(pte);
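
The pgtable.c hunk above replaces the compile-time CONFIG_HIGHPTE #ifdef with a runtime gfp mask, __userpte_alloc_gfp, which the new "userpte=nohigh" early parameter can trim. A user-space sketch of that parse-and-mask logic, using placeholder flag values:

#include <stdio.h>
#include <string.h>

#define GFP_HIGHMEM_SKETCH 0x2u   /* stand-in for __GFP_HIGHMEM */

static unsigned int userpte_gfp = GFP_HIGHMEM_SKETCH;  /* default: allow highmem */

/* Sketch of the setup_userpte() early_param handler added above. */
static int setup_userpte_sketch(const char *arg)
{
    if (!arg)
        return -1;
    if (strcmp(arg, "nohigh") == 0) {
        userpte_gfp &= ~GFP_HIGHMEM_SKETCH;
        return 0;
    }
    return -1;                    /* unknown value */
}

int main(void)
{
    setup_userpte_sketch("nohigh");
    printf("user PTE gfp mask: %#x\n", userpte_gfp);
    return 0;
}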
diff --git a/arch/x86/mm/pgtable_32.c b/arch/x86/mm/pgtable_32.c
index 46c8834aedc0..792854003ed3 100644
--- a/arch/x86/mm/pgtable_32.c
+++ b/arch/x86/mm/pgtable_32.c
@@ -6,7 +6,6 @@
6#include <linux/swap.h> 6#include <linux/swap.h>
7#include <linux/smp.h> 7#include <linux/smp.h>
8#include <linux/highmem.h> 8#include <linux/highmem.h>
9#include <linux/slab.h>
10#include <linux/pagemap.h> 9#include <linux/pagemap.h>
11#include <linux/spinlock.h> 10#include <linux/spinlock.h>
12#include <linux/module.h> 11#include <linux/module.h>
@@ -19,6 +18,7 @@
19#include <asm/e820.h> 18#include <asm/e820.h>
20#include <asm/tlb.h> 19#include <asm/tlb.h>
21#include <asm/tlbflush.h> 20#include <asm/tlbflush.h>
21#include <asm/io.h>
22 22
23unsigned int __VMALLOC_RESERVE = 128 << 20; 23unsigned int __VMALLOC_RESERVE = 128 << 20;
24 24
@@ -129,6 +129,7 @@ static int __init parse_reservetop(char *arg)
129 129
130 address = memparse(arg, &arg); 130 address = memparse(arg, &arg);
131 reserve_top_address(address); 131 reserve_top_address(address);
132 fixup_early_ioremap();
132 return 0; 133 return 0;
133} 134}
134early_param("reservetop", parse_reservetop); 135early_param("reservetop", parse_reservetop);
diff --git a/arch/x86/mm/setup_nx.c b/arch/x86/mm/setup_nx.c
index 513d8ed5d2ec..a3250aa34086 100644
--- a/arch/x86/mm/setup_nx.c
+++ b/arch/x86/mm/setup_nx.c
@@ -3,10 +3,8 @@
3#include <linux/init.h> 3#include <linux/init.h>
4 4
5#include <asm/pgtable.h> 5#include <asm/pgtable.h>
6#include <asm/proto.h>
6 7
7int nx_enabled;
8
9#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
10static int disable_nx __cpuinitdata; 8static int disable_nx __cpuinitdata;
11 9
12/* 10/*
@@ -22,48 +20,41 @@ static int __init noexec_setup(char *str)
22 if (!str) 20 if (!str)
23 return -EINVAL; 21 return -EINVAL;
24 if (!strncmp(str, "on", 2)) { 22 if (!strncmp(str, "on", 2)) {
25 __supported_pte_mask |= _PAGE_NX;
26 disable_nx = 0; 23 disable_nx = 0;
27 } else if (!strncmp(str, "off", 3)) { 24 } else if (!strncmp(str, "off", 3)) {
28 disable_nx = 1; 25 disable_nx = 1;
29 __supported_pte_mask &= ~_PAGE_NX;
30 } 26 }
27 x86_configure_nx();
31 return 0; 28 return 0;
32} 29}
33early_param("noexec", noexec_setup); 30early_param("noexec", noexec_setup);
34#endif
35 31
36#ifdef CONFIG_X86_PAE 32void __cpuinit x86_configure_nx(void)
37void __init set_nx(void)
38{ 33{
39 unsigned int v[4], l, h; 34 if (cpu_has_nx && !disable_nx)
40 35 __supported_pte_mask |= _PAGE_NX;
41 if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { 36 else
42 cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); 37 __supported_pte_mask &= ~_PAGE_NX;
38}
43 39
44 if ((v[3] & (1 << 20)) && !disable_nx) { 40void __init x86_report_nx(void)
45 rdmsr(MSR_EFER, l, h); 41{
46 l |= EFER_NX; 42 if (!cpu_has_nx) {
47 wrmsr(MSR_EFER, l, h); 43 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
48 nx_enabled = 1; 44 "missing in CPU or disabled in BIOS!\n");
49 __supported_pte_mask |= _PAGE_NX; 45 } else {
46#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE)
47 if (disable_nx) {
48 printk(KERN_INFO "NX (Execute Disable) protection: "
49 "disabled by kernel command line option\n");
50 } else {
51 printk(KERN_INFO "NX (Execute Disable) protection: "
52 "active\n");
50 } 53 }
51 }
52}
53#else 54#else
54void set_nx(void) 55 /* 32bit non-PAE kernel, NX cannot be used */
55{ 56 printk(KERN_NOTICE "Notice: NX (Execute Disable) protection "
56} 57 "cannot be enabled: non-PAE kernel!\n");
57#endif 58#endif
58 59 }
59#ifdef CONFIG_X86_64
60void __cpuinit check_efer(void)
61{
62 unsigned long efer;
63
64 rdmsrl(MSR_EFER, efer);
65 if (!(efer & EFER_NX) || disable_nx)
66 __supported_pte_mask &= ~_PAGE_NX;
67} 60}
68#endif
69
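
setup_nx.c above is reworked so that x86_configure_nx() decides whether _PAGE_NX goes into __supported_pte_mask (CPU capability plus the noexec= option) and x86_report_nx() only prints the outcome. A user-space sketch of that configure/report split, with booleans standing in for cpu_has_nx and the command-line flag:

#include <stdbool.h>
#include <stdio.h>

static bool cpu_has_nx_sketch = true;   /* pretend the CPU advertises NX */
static bool disable_nx_sketch;          /* set by a "noexec=off"-style option */
static bool nx_active;

/* One helper decides... */
static void configure_nx(void)
{
    nx_active = cpu_has_nx_sketch && !disable_nx_sketch;
}

/* ...another only reports. */
static void report_nx(void)
{
    if (!cpu_has_nx_sketch)
        printf("NX missing in CPU or disabled in BIOS\n");
    else if (disable_nx_sketch)
        printf("NX disabled by command line\n");
    else
        printf("NX active\n");
}

int main(void)
{
    configure_nx();
    report_nx();
    return 0;
}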
diff --git a/arch/x86/mm/srat_32.c b/arch/x86/mm/srat_32.c
index 6f8aa33031c7..9324f13492d5 100644
--- a/arch/x86/mm/srat_32.c
+++ b/arch/x86/mm/srat_32.c
@@ -267,6 +267,8 @@ int __init get_memcfg_from_srat(void)
267 e820_register_active_regions(chunk->nid, chunk->start_pfn, 267 e820_register_active_regions(chunk->nid, chunk->start_pfn,
268 min(chunk->end_pfn, max_pfn)); 268 min(chunk->end_pfn, max_pfn));
269 } 269 }
270 /* for out of order entries in SRAT */
271 sort_node_map();
270 272
271 for_each_online_node(nid) { 273 for_each_online_node(nid) {
272 unsigned long start = node_start_pfn[nid]; 274 unsigned long start = node_start_pfn[nid];
diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c
index dbb5381f7b3b..38512d0c4742 100644
--- a/arch/x86/mm/srat_64.c
+++ b/arch/x86/mm/srat_64.c
@@ -136,7 +136,7 @@ acpi_numa_x2apic_affinity_init(struct acpi_srat_x2apic_cpu_affinity *pa)
136 apicid_to_node[apic_id] = node; 136 apicid_to_node[apic_id] = node;
137 node_set(node, cpu_nodes_parsed); 137 node_set(node, cpu_nodes_parsed);
138 acpi_numa = 1; 138 acpi_numa = 1;
139 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 139 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%04x -> Node %u\n",
140 pxm, apic_id, node); 140 pxm, apic_id, node);
141} 141}
142 142
@@ -170,7 +170,7 @@ acpi_numa_processor_affinity_init(struct acpi_srat_cpu_affinity *pa)
170 apicid_to_node[apic_id] = node; 170 apicid_to_node[apic_id] = node;
171 node_set(node, cpu_nodes_parsed); 171 node_set(node, cpu_nodes_parsed);
172 acpi_numa = 1; 172 acpi_numa = 1;
173 printk(KERN_INFO "SRAT: PXM %u -> APIC %u -> Node %u\n", 173 printk(KERN_INFO "SRAT: PXM %u -> APIC 0x%02x -> Node %u\n",
174 pxm, apic_id, node); 174 pxm, apic_id, node);
175} 175}
176 176
@@ -229,9 +229,11 @@ update_nodes_add(int node, unsigned long start, unsigned long end)
229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); 229 printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n");
230 } 230 }
231 231
232 if (changed) 232 if (changed) {
233 node_set(node, cpu_nodes_parsed);
233 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", 234 printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n",
234 nd->start, nd->end); 235 nd->start, nd->end);
236 }
235} 237}
236 238
237/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ 239/* Callback for parsing of the Proximity Domain <-> Memory Area mappings */
@@ -290,8 +292,6 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma)
290 292
291 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm, 293 printk(KERN_INFO "SRAT: Node %u PXM %u %lx-%lx\n", node, pxm,
292 start, end); 294 start, end);
293 e820_register_active_regions(node, start >> PAGE_SHIFT,
294 end >> PAGE_SHIFT);
295 295
296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { 296 if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
297 update_nodes_add(node, start, end); 297 update_nodes_add(node, start, end);
@@ -319,7 +319,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
319 unsigned long s = nodes[i].start >> PAGE_SHIFT; 319 unsigned long s = nodes[i].start >> PAGE_SHIFT;
320 unsigned long e = nodes[i].end >> PAGE_SHIFT; 320 unsigned long e = nodes[i].end >> PAGE_SHIFT;
321 pxmram += e - s; 321 pxmram += e - s;
322 pxmram -= absent_pages_in_range(s, e); 322 pxmram -= __absent_pages_in_range(i, s, e);
323 if ((long)pxmram < 0) 323 if ((long)pxmram < 0)
324 pxmram = 0; 324 pxmram = 0;
325 } 325 }
@@ -338,6 +338,19 @@ static int __init nodes_cover_memory(const struct bootnode *nodes)
338 338
339void __init acpi_numa_arch_fixup(void) {} 339void __init acpi_numa_arch_fixup(void) {}
340 340
341int __init acpi_get_nodes(struct bootnode *physnodes)
342{
343 int i;
344 int ret = 0;
345
346 for_each_node_mask(i, nodes_parsed) {
347 physnodes[ret].start = nodes[i].start;
348 physnodes[ret].end = nodes[i].end;
349 ret++;
350 }
351 return ret;
352}
353
341/* Use the information discovered above to actually set up the nodes. */ 354/* Use the information discovered above to actually set up the nodes. */
342int __init acpi_scan_nodes(unsigned long start, unsigned long end) 355int __init acpi_scan_nodes(unsigned long start, unsigned long end)
343{ 356{
@@ -350,11 +363,6 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
350 for (i = 0; i < MAX_NUMNODES; i++) 363 for (i = 0; i < MAX_NUMNODES; i++)
351 cutoff_node(i, start, end); 364 cutoff_node(i, start, end);
352 365
353 if (!nodes_cover_memory(nodes)) {
354 bad_srat();
355 return -1;
356 }
357
358 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks, 366 memnode_shift = compute_hash_shift(node_memblk_range, num_node_memblks,
359 memblk_nodeid); 367 memblk_nodeid);
360 if (memnode_shift < 0) { 368 if (memnode_shift < 0) {
@@ -364,6 +372,16 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end)
364 return -1; 372 return -1;
365 } 373 }
366 374
375 for_each_node_mask(i, nodes_parsed)
376 e820_register_active_regions(i, nodes[i].start >> PAGE_SHIFT,
377 nodes[i].end >> PAGE_SHIFT);
378 /* for out of order entries in SRAT */
379 sort_node_map();
380 if (!nodes_cover_memory(nodes)) {
381 bad_srat();
382 return -1;
383 }
384
367 /* Account for nodes with cpus and no memory */ 385 /* Account for nodes with cpus and no memory */
368 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed); 386 nodes_or(node_possible_map, nodes_parsed, cpu_nodes_parsed);
369 387
@@ -443,7 +461,8 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
443 * node, it must now point to the fake node ID. 461 * node, it must now point to the fake node ID.
444 */ 462 */
445 for (j = 0; j < MAX_LOCAL_APIC; j++) 463 for (j = 0; j < MAX_LOCAL_APIC; j++)
446 if (apicid_to_node[j] == nid) 464 if (apicid_to_node[j] == nid &&
465 fake_apicid_to_node[j] == NUMA_NO_NODE)
447 fake_apicid_to_node[j] = i; 466 fake_apicid_to_node[j] = i;
448 } 467 }
449 for (i = 0; i < num_nodes; i++) 468 for (i = 0; i < num_nodes; i++)
@@ -454,7 +473,6 @@ void __init acpi_fake_nodes(const struct bootnode *fake_nodes, int num_nodes)
454 for (i = 0; i < num_nodes; i++) 473 for (i = 0; i < num_nodes; i++)
455 if (fake_nodes[i].start != fake_nodes[i].end) 474 if (fake_nodes[i].start != fake_nodes[i].end)
456 node_set(i, nodes_parsed); 475 node_set(i, nodes_parsed);
457 WARN_ON(!nodes_cover_memory(fake_nodes));
458} 476}
459 477
460static int null_slit_node_compare(int a, int b) 478static int null_slit_node_compare(int a, int b)
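
The new acpi_get_nodes() above simply walks nodes_parsed and copies each parsed SRAT range into the caller's bootnode array, returning how many it copied. A user-space sketch of the same walk over a plain bitmask (MAX_NODES, the mask, and the ranges are invented for the example):

#include <stdint.h>
#include <stdio.h>

struct bootnode_sketch { uint64_t start, end; };

#define MAX_NODES 8

/* Copy every range whose bit is set in parsed_mask into out[], in order. */
static int get_nodes_sketch(unsigned long parsed_mask,
                            const struct bootnode_sketch *nodes,
                            struct bootnode_sketch *out)
{
    int i, ret = 0;

    for (i = 0; i < MAX_NODES; i++) {
        if (!(parsed_mask & (1UL << i)))
            continue;
        out[ret++] = nodes[i];
    }
    return ret;
}

int main(void)
{
    struct bootnode_sketch nodes[MAX_NODES] = {
        [0] = { 0x0,        0x80000000 },
        [2] = { 0x80000000, 0x100000000ULL },
    };
    struct bootnode_sketch phys[MAX_NODES];
    int n = get_nodes_sketch((1UL << 0) | (1UL << 2), nodes, phys);

    printf("%d physical node(s) copied\n", n);
    return 0;
}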
diff --git a/arch/x86/mm/testmmiotrace.c b/arch/x86/mm/testmmiotrace.c
index 427fd1b56df5..8565d944f7cf 100644
--- a/arch/x86/mm/testmmiotrace.c
+++ b/arch/x86/mm/testmmiotrace.c
@@ -1,12 +1,13 @@
1/* 1/*
2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi> 2 * Written by Pekka Paalanen, 2008-2009 <pq@iki.fi>
3 */ 3 */
4
5#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
6
4#include <linux/module.h> 7#include <linux/module.h>
5#include <linux/io.h> 8#include <linux/io.h>
6#include <linux/mmiotrace.h> 9#include <linux/mmiotrace.h>
7 10
8#define MODULE_NAME "testmmiotrace"
9
10static unsigned long mmio_address; 11static unsigned long mmio_address;
11module_param(mmio_address, ulong, 0); 12module_param(mmio_address, ulong, 0);
12MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB " 13MODULE_PARM_DESC(mmio_address, " Start address of the mapping of 16 kB "
@@ -30,7 +31,7 @@ static unsigned v32(unsigned i)
30static void do_write_test(void __iomem *p) 31static void do_write_test(void __iomem *p)
31{ 32{
32 unsigned int i; 33 unsigned int i;
33 pr_info(MODULE_NAME ": write test.\n"); 34 pr_info("write test.\n");
34 mmiotrace_printk("Write test.\n"); 35 mmiotrace_printk("Write test.\n");
35 36
36 for (i = 0; i < 256; i++) 37 for (i = 0; i < 256; i++)
@@ -47,7 +48,7 @@ static void do_read_test(void __iomem *p)
47{ 48{
48 unsigned int i; 49 unsigned int i;
49 unsigned errs[3] = { 0 }; 50 unsigned errs[3] = { 0 };
50 pr_info(MODULE_NAME ": read test.\n"); 51 pr_info("read test.\n");
51 mmiotrace_printk("Read test.\n"); 52 mmiotrace_printk("Read test.\n");
52 53
53 for (i = 0; i < 256; i++) 54 for (i = 0; i < 256; i++)
@@ -68,7 +69,7 @@ static void do_read_test(void __iomem *p)
68 69
69static void do_read_far_test(void __iomem *p) 70static void do_read_far_test(void __iomem *p)
70{ 71{
71 pr_info(MODULE_NAME ": read far test.\n"); 72 pr_info("read far test.\n");
72 mmiotrace_printk("Read far test.\n"); 73 mmiotrace_printk("Read far test.\n");
73 74
74 ioread32(p + read_far); 75 ioread32(p + read_far);
@@ -78,7 +79,7 @@ static void do_test(unsigned long size)
78{ 79{
79 void __iomem *p = ioremap_nocache(mmio_address, size); 80 void __iomem *p = ioremap_nocache(mmio_address, size);
80 if (!p) { 81 if (!p) {
81 pr_err(MODULE_NAME ": could not ioremap, aborting.\n"); 82 pr_err("could not ioremap, aborting.\n");
82 return; 83 return;
83 } 84 }
84 mmiotrace_printk("ioremap returned %p.\n", p); 85 mmiotrace_printk("ioremap returned %p.\n", p);
@@ -94,24 +95,22 @@ static int __init init(void)
94 unsigned long size = (read_far) ? (8 << 20) : (16 << 10); 95 unsigned long size = (read_far) ? (8 << 20) : (16 << 10);
95 96
96 if (mmio_address == 0) { 97 if (mmio_address == 0) {
97 pr_err(MODULE_NAME ": you have to use the module argument " 98 pr_err("you have to use the module argument mmio_address.\n");
98 "mmio_address.\n"); 99 pr_err("DO NOT LOAD THIS MODULE UNLESS YOU REALLY KNOW WHAT YOU ARE DOING!\n");
99 pr_err(MODULE_NAME ": DO NOT LOAD THIS MODULE UNLESS"
100 " YOU REALLY KNOW WHAT YOU ARE DOING!\n");
101 return -ENXIO; 100 return -ENXIO;
102 } 101 }
103 102
104 pr_warning(MODULE_NAME ": WARNING: mapping %lu kB @ 0x%08lx in PCI " 103 pr_warning("WARNING: mapping %lu kB @ 0x%08lx in PCI address space, "
105 "address space, and writing 16 kB of rubbish in there.\n", 104 "and writing 16 kB of rubbish in there.\n",
106 size >> 10, mmio_address); 105 size >> 10, mmio_address);
107 do_test(size); 106 do_test(size);
108 pr_info(MODULE_NAME ": All done.\n"); 107 pr_info("All done.\n");
109 return 0; 108 return 0;
110} 109}
111 110
112static void __exit cleanup(void) 111static void __exit cleanup(void)
113{ 112{
114 pr_debug(MODULE_NAME ": unloaded.\n"); 113 pr_debug("unloaded.\n");
115} 114}
116 115
117module_init(init); 116module_init(init);
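
The testmmiotrace.c changes drop the hand-written MODULE_NAME prefix in favour of pr_fmt(), which the printk helpers expand into every message. A user-space sketch of the same trick with printf, defining the prefix once before the logging macro (pr_info_sketch is an invented stand-in, not a kernel API):

#include <stdio.h>

/* Define the prefix once; the logging macro prepends it to every format. */
#define pr_fmt(fmt) "testmmiotrace: " fmt
#define pr_info_sketch(fmt, ...) printf(pr_fmt(fmt), ##__VA_ARGS__)

int main(void)
{
    pr_info_sketch("write test.\n");
    pr_info_sketch("mapping %lu kB\n", 16UL);
    return 0;
}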
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 36fe08eeb5c3..426f3a1a64d3 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -8,6 +8,7 @@
8 8
9#include <asm/tlbflush.h> 9#include <asm/tlbflush.h>
10#include <asm/mmu_context.h> 10#include <asm/mmu_context.h>
11#include <asm/cache.h>
11#include <asm/apic.h> 12#include <asm/apic.h>
12#include <asm/uv/uv.h> 13#include <asm/uv/uv.h>
13 14
@@ -40,10 +41,10 @@ union smp_flush_state {
40 struct { 41 struct {
41 struct mm_struct *flush_mm; 42 struct mm_struct *flush_mm;
42 unsigned long flush_va; 43 unsigned long flush_va;
43 spinlock_t tlbstate_lock; 44 raw_spinlock_t tlbstate_lock;
44 DECLARE_BITMAP(flush_cpumask, NR_CPUS); 45 DECLARE_BITMAP(flush_cpumask, NR_CPUS);
45 }; 46 };
46 char pad[CONFIG_X86_INTERNODE_CACHE_BYTES]; 47 char pad[INTERNODE_CACHE_BYTES];
47} ____cacheline_internodealigned_in_smp; 48} ____cacheline_internodealigned_in_smp;
48 49
49/* State is put into the per CPU data section, but padded 50/* State is put into the per CPU data section, but padded
@@ -180,7 +181,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
180 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is 181 * num_online_cpus() <= NUM_INVALIDATE_TLB_VECTORS, but it is
181 * probably not worth checking this for a cache-hot lock. 182 * probably not worth checking this for a cache-hot lock.
182 */ 183 */
183 spin_lock(&f->tlbstate_lock); 184 raw_spin_lock(&f->tlbstate_lock);
184 185
185 f->flush_mm = mm; 186 f->flush_mm = mm;
186 f->flush_va = va; 187 f->flush_va = va;
@@ -198,7 +199,7 @@ static void flush_tlb_others_ipi(const struct cpumask *cpumask,
198 199
199 f->flush_mm = NULL; 200 f->flush_mm = NULL;
200 f->flush_va = 0; 201 f->flush_va = 0;
201 spin_unlock(&f->tlbstate_lock); 202 raw_spin_unlock(&f->tlbstate_lock);
202} 203}
203 204
204void native_flush_tlb_others(const struct cpumask *cpumask, 205void native_flush_tlb_others(const struct cpumask *cpumask,
@@ -222,7 +223,7 @@ static int __cpuinit init_smp_flush(void)
222 int i; 223 int i;
223 224
224 for (i = 0; i < ARRAY_SIZE(flush_state); i++) 225 for (i = 0; i < ARRAY_SIZE(flush_state); i++)
225 spin_lock_init(&flush_state[i].tlbstate_lock); 226 raw_spin_lock_init(&flush_state[i].tlbstate_lock);
226 227
227 return 0; 228 return 0;
228} 229}
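
The tlb.c hunk keeps each flush_state entry padded to an inter-node cache line (now via INTERNODE_CACHE_BYTES) so the per-entry raw spinlocks in the array never share a line. A user-space sketch of that union-with-pad layout; the 64-byte constant and the int "lock" are placeholders:

#include <stdio.h>

#define INTERNODE_CACHE_BYTES_SKETCH 64   /* illustrative; the kernel value can be larger */

/* The second union member pads the whole object to one cache line, so
 * adjacent array entries never share a line. */
union flush_state_sketch {
    struct {
        void *flush_mm;
        unsigned long flush_va;
        int lock;                  /* stand-in for the raw spinlock */
    };
    char pad[INTERNODE_CACHE_BYTES_SKETCH];
};

int main(void)
{
    union flush_state_sketch state[4];

    printf("each entry occupies %zu bytes\n", sizeof(state[0]));
    return 0;
}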
diff --git a/arch/x86/oprofile/backtrace.c b/arch/x86/oprofile/backtrace.c
index 044897be021f..3855096c59b8 100644
--- a/arch/x86/oprofile/backtrace.c
+++ b/arch/x86/oprofile/backtrace.c
@@ -41,10 +41,11 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
41} 41}
42 42
43static struct stacktrace_ops backtrace_ops = { 43static struct stacktrace_ops backtrace_ops = {
44 .warning = backtrace_warning, 44 .warning = backtrace_warning,
45 .warning_symbol = backtrace_warning_symbol, 45 .warning_symbol = backtrace_warning_symbol,
46 .stack = backtrace_stack, 46 .stack = backtrace_stack,
47 .address = backtrace_address, 47 .address = backtrace_address,
48 .walk_stack = print_context_stack,
48}; 49};
49 50
50struct frame_head { 51struct frame_head {
diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c
index cb88b1a0bd5f..2c505ee71014 100644
--- a/arch/x86/oprofile/nmi_int.c
+++ b/arch/x86/oprofile/nmi_int.c
@@ -159,7 +159,7 @@ static int nmi_setup_mux(void)
159 159
160 for_each_possible_cpu(i) { 160 for_each_possible_cpu(i) {
161 per_cpu(cpu_msrs, i).multiplex = 161 per_cpu(cpu_msrs, i).multiplex =
162 kmalloc(multiplex_size, GFP_KERNEL); 162 kzalloc(multiplex_size, GFP_KERNEL);
163 if (!per_cpu(cpu_msrs, i).multiplex) 163 if (!per_cpu(cpu_msrs, i).multiplex)
164 return 0; 164 return 0;
165 } 165 }
@@ -179,7 +179,6 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
179 if (counter_config[i].enabled) { 179 if (counter_config[i].enabled) {
180 multiplex[i].saved = -(u64)counter_config[i].count; 180 multiplex[i].saved = -(u64)counter_config[i].count;
181 } else { 181 } else {
182 multiplex[i].addr = 0;
183 multiplex[i].saved = 0; 182 multiplex[i].saved = 0;
184 } 183 }
185 } 184 }
@@ -189,25 +188,27 @@ static void nmi_cpu_setup_mux(int cpu, struct op_msrs const * const msrs)
189 188
190static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs) 189static void nmi_cpu_save_mpx_registers(struct op_msrs *msrs)
191{ 190{
191 struct op_msr *counters = msrs->counters;
192 struct op_msr *multiplex = msrs->multiplex; 192 struct op_msr *multiplex = msrs->multiplex;
193 int i; 193 int i;
194 194
195 for (i = 0; i < model->num_counters; ++i) { 195 for (i = 0; i < model->num_counters; ++i) {
196 int virt = op_x86_phys_to_virt(i); 196 int virt = op_x86_phys_to_virt(i);
197 if (multiplex[virt].addr) 197 if (counters[i].addr)
198 rdmsrl(multiplex[virt].addr, multiplex[virt].saved); 198 rdmsrl(counters[i].addr, multiplex[virt].saved);
199 } 199 }
200} 200}
201 201
202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs) 202static void nmi_cpu_restore_mpx_registers(struct op_msrs *msrs)
203{ 203{
204 struct op_msr *counters = msrs->counters;
204 struct op_msr *multiplex = msrs->multiplex; 205 struct op_msr *multiplex = msrs->multiplex;
205 int i; 206 int i;
206 207
207 for (i = 0; i < model->num_counters; ++i) { 208 for (i = 0; i < model->num_counters; ++i) {
208 int virt = op_x86_phys_to_virt(i); 209 int virt = op_x86_phys_to_virt(i);
209 if (multiplex[virt].addr) 210 if (counters[i].addr)
210 wrmsrl(multiplex[virt].addr, multiplex[virt].saved); 211 wrmsrl(counters[i].addr, multiplex[virt].saved);
211 } 212 }
212} 213}
213 214
@@ -222,7 +223,7 @@ static void nmi_cpu_switch(void *dummy)
222 223
223 /* move to next set */ 224 /* move to next set */
224 si += model->num_counters; 225 si += model->num_counters;
225 if ((si > model->num_virt_counters) || (counter_config[si].count == 0)) 226 if ((si >= model->num_virt_counters) || (counter_config[si].count == 0))
226 per_cpu(switch_index, cpu) = 0; 227 per_cpu(switch_index, cpu) = 0;
227 else 228 else
228 per_cpu(switch_index, cpu) = si; 229 per_cpu(switch_index, cpu) = si;
@@ -303,11 +304,11 @@ static int allocate_msrs(void)
303 304
304 int i; 305 int i;
305 for_each_possible_cpu(i) { 306 for_each_possible_cpu(i) {
306 per_cpu(cpu_msrs, i).counters = kmalloc(counters_size, 307 per_cpu(cpu_msrs, i).counters = kzalloc(counters_size,
307 GFP_KERNEL); 308 GFP_KERNEL);
308 if (!per_cpu(cpu_msrs, i).counters) 309 if (!per_cpu(cpu_msrs, i).counters)
309 return 0; 310 return 0;
310 per_cpu(cpu_msrs, i).controls = kmalloc(controls_size, 311 per_cpu(cpu_msrs, i).controls = kzalloc(controls_size,
311 GFP_KERNEL); 312 GFP_KERNEL);
312 if (!per_cpu(cpu_msrs, i).controls) 313 if (!per_cpu(cpu_msrs, i).controls)
313 return 0; 314 return 0;
@@ -598,6 +599,7 @@ static int __init ppro_init(char **cpu_type)
598 case 15: case 23: 599 case 15: case 23:
599 *cpu_type = "i386/core_2"; 600 *cpu_type = "i386/core_2";
600 break; 601 break;
602 case 0x2e:
601 case 26: 603 case 26:
602 spec = &op_arch_perfmon_spec; 604 spec = &op_arch_perfmon_spec;
603 *cpu_type = "i386/core_i7"; 605 *cpu_type = "i386/core_i7";
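
Switching the per-cpu MSR arrays above from kmalloc() to kzalloc() is what allows the later removal of the explicit multiplex[i].addr = 0 assignment: zeroed memory already reads back as "slot unused". The user-space analogue is malloc() versus calloc():

#include <stdio.h>
#include <stdlib.h>

struct op_msr_sketch { unsigned long long addr; unsigned long long saved; };

int main(void)
{
    size_t n = 4;

    /* malloc()'d memory is uninitialized: every field would need an
     * explicit  msrs[i].addr = 0;  before "is this slot used?" checks. */
    struct op_msr_sketch *uninit = malloc(n * sizeof(*uninit));

    /* calloc() (the user-space analogue of kzalloc()) returns zeroed
     * memory, so unused slots already read back as addr == 0. */
    struct op_msr_sketch *zeroed = calloc(n, sizeof(*zeroed));

    if (!uninit || !zeroed)
        return 1;
    printf("zeroed[0].addr = %llu\n", zeroed[0].addr);
    free(uninit);
    free(zeroed);
    return 0;
}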
diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c
index 39686c29f03a..090cbbec7dbd 100644
--- a/arch/x86/oprofile/op_model_amd.c
+++ b/arch/x86/oprofile/op_model_amd.c
@@ -22,6 +22,9 @@
22#include <asm/ptrace.h> 22#include <asm/ptrace.h>
23#include <asm/msr.h> 23#include <asm/msr.h>
24#include <asm/nmi.h> 24#include <asm/nmi.h>
25#include <asm/apic.h>
26#include <asm/processor.h>
27#include <asm/cpufeature.h>
25 28
26#include "op_x86_model.h" 29#include "op_x86_model.h"
27#include "op_counter.h" 30#include "op_counter.h"
@@ -43,23 +46,10 @@
43 46
44static unsigned long reset_value[NUM_VIRT_COUNTERS]; 47static unsigned long reset_value[NUM_VIRT_COUNTERS];
45 48
46#ifdef CONFIG_OPROFILE_IBS
47
48/* IbsFetchCtl bits/masks */
49#define IBS_FETCH_RAND_EN (1ULL<<57)
50#define IBS_FETCH_VAL (1ULL<<49)
51#define IBS_FETCH_ENABLE (1ULL<<48)
52#define IBS_FETCH_CNT_MASK 0xFFFF0000ULL
53
54/*IbsOpCtl bits */
55#define IBS_OP_CNT_CTL (1ULL<<19)
56#define IBS_OP_VAL (1ULL<<18)
57#define IBS_OP_ENABLE (1ULL<<17)
58
59#define IBS_FETCH_SIZE 6 49#define IBS_FETCH_SIZE 6
60#define IBS_OP_SIZE 12 50#define IBS_OP_SIZE 12
61 51
62static int has_ibs; /* AMD Family10h and later */ 52static u32 ibs_caps;
63 53
64struct op_ibs_config { 54struct op_ibs_config {
65 unsigned long op_enabled; 55 unsigned long op_enabled;
@@ -71,24 +61,52 @@ struct op_ibs_config {
71}; 61};
72 62
73static struct op_ibs_config ibs_config; 63static struct op_ibs_config ibs_config;
64static u64 ibs_op_ctl;
74 65
75#endif 66/*
67 * IBS cpuid feature detection
68 */
76 69
77#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX 70#define IBS_CPUID_FEATURES 0x8000001b
71
72/*
73 * Same bit mask as for IBS cpuid feature flags (Fn8000_001B_EAX), but
74 * bit 0 is used to indicate the existence of IBS.
75 */
76#define IBS_CAPS_AVAIL (1LL<<0)
77#define IBS_CAPS_RDWROPCNT (1LL<<3)
78#define IBS_CAPS_OPCNT (1LL<<4)
78 79
79static void op_mux_fill_in_addresses(struct op_msrs * const msrs) 80/*
81 * IBS randomization macros
82 */
83#define IBS_RANDOM_BITS 12
84#define IBS_RANDOM_MASK ((1ULL << IBS_RANDOM_BITS) - 1)
85#define IBS_RANDOM_MAXCNT_OFFSET (1ULL << (IBS_RANDOM_BITS - 5))
86
87static u32 get_ibs_caps(void)
80{ 88{
81 int i; 89 u32 ibs_caps;
90 unsigned int max_level;
82 91
83 for (i = 0; i < NUM_VIRT_COUNTERS; i++) { 92 if (!boot_cpu_has(X86_FEATURE_IBS))
84 int hw_counter = op_x86_virt_to_phys(i); 93 return 0;
85 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 94
86 msrs->multiplex[i].addr = MSR_K7_PERFCTR0 + hw_counter; 95 /* check IBS cpuid feature flags */
87 else 96 max_level = cpuid_eax(0x80000000);
88 msrs->multiplex[i].addr = 0; 97 if (max_level < IBS_CPUID_FEATURES)
89 } 98 return IBS_CAPS_AVAIL;
99
100 ibs_caps = cpuid_eax(IBS_CPUID_FEATURES);
101 if (!(ibs_caps & IBS_CAPS_AVAIL))
102 /* cpuid flags not valid */
103 return IBS_CAPS_AVAIL;
104
105 return ibs_caps;
90} 106}
91 107
108#ifdef CONFIG_OPROFILE_EVENT_MULTIPLEX
109
92static void op_mux_switch_ctrl(struct op_x86_model_spec const *model, 110static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
93 struct op_msrs const * const msrs) 111 struct op_msrs const * const msrs)
94{ 112{
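
get_ibs_caps() above probes CPUID leaf 0x8000001B for the IBS capability word. A simplified user-space sketch using the compiler's <cpuid.h> helpers; it skips the X86_FEATURE_IBS fallback the kernel keeps for CPUs without the leaf, and it only reports something meaningful on AMD hardware:

#include <cpuid.h>
#include <stdio.h>

#define IBS_CPUID_FEATURES 0x8000001b
#define IBS_CAPS_AVAIL     (1u << 0)

/* Read the IBS feature leaf if the CPU exposes it; 0 means "no caps". */
static unsigned int get_ibs_caps_sketch(void)
{
    unsigned int eax, ebx, ecx, edx;

    if (!__get_cpuid(IBS_CPUID_FEATURES, &eax, &ebx, &ecx, &edx))
        return 0;                  /* extended leaf not available */
    if (!(eax & IBS_CAPS_AVAIL))
        return 0;                  /* capability flags not valid */
    return eax;
}

int main(void)
{
    printf("IBS capability word: 0x%08x\n", get_ibs_caps_sketch());
    return 0;
}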
@@ -98,7 +116,7 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
98 /* enable active counters */ 116 /* enable active counters */
99 for (i = 0; i < NUM_COUNTERS; ++i) { 117 for (i = 0; i < NUM_COUNTERS; ++i) {
100 int virt = op_x86_phys_to_virt(i); 118 int virt = op_x86_phys_to_virt(i);
101 if (!counter_config[virt].enabled) 119 if (!reset_value[virt])
102 continue; 120 continue;
103 rdmsrl(msrs->controls[i].addr, val); 121 rdmsrl(msrs->controls[i].addr, val);
104 val &= model->reserved; 122 val &= model->reserved;
@@ -107,10 +125,6 @@ static void op_mux_switch_ctrl(struct op_x86_model_spec const *model,
107 } 125 }
108} 126}
109 127
110#else
111
112static inline void op_mux_fill_in_addresses(struct op_msrs * const msrs) { }
113
114#endif 128#endif
115 129
116/* functions for op_amd_spec */ 130/* functions for op_amd_spec */
@@ -122,18 +136,12 @@ static void op_amd_fill_in_addresses(struct op_msrs * const msrs)
122 for (i = 0; i < NUM_COUNTERS; i++) { 136 for (i = 0; i < NUM_COUNTERS; i++) {
123 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i)) 137 if (reserve_perfctr_nmi(MSR_K7_PERFCTR0 + i))
124 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i; 138 msrs->counters[i].addr = MSR_K7_PERFCTR0 + i;
125 else
126 msrs->counters[i].addr = 0;
127 } 139 }
128 140
129 for (i = 0; i < NUM_CONTROLS; i++) { 141 for (i = 0; i < NUM_CONTROLS; i++) {
130 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i)) 142 if (reserve_evntsel_nmi(MSR_K7_EVNTSEL0 + i))
131 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i; 143 msrs->controls[i].addr = MSR_K7_EVNTSEL0 + i;
132 else
133 msrs->controls[i].addr = 0;
134 } 144 }
135
136 op_mux_fill_in_addresses(msrs);
137} 145}
138 146
139static void op_amd_setup_ctrs(struct op_x86_model_spec const *model, 147static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
@@ -144,7 +152,8 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
144 152
145 /* setup reset_value */ 153 /* setup reset_value */
146 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) { 154 for (i = 0; i < NUM_VIRT_COUNTERS; ++i) {
147 if (counter_config[i].enabled) 155 if (counter_config[i].enabled
156 && msrs->counters[op_x86_virt_to_phys(i)].addr)
148 reset_value[i] = counter_config[i].count; 157 reset_value[i] = counter_config[i].count;
149 else 158 else
150 reset_value[i] = 0; 159 reset_value[i] = 0;
@@ -152,9 +161,18 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
152 161
153 /* clear all counters */ 162 /* clear all counters */
154 for (i = 0; i < NUM_CONTROLS; ++i) { 163 for (i = 0; i < NUM_CONTROLS; ++i) {
155 if (unlikely(!msrs->controls[i].addr)) 164 if (unlikely(!msrs->controls[i].addr)) {
165 if (counter_config[i].enabled && !smp_processor_id())
166 /*
 167 * the counter is already reserved; this runs
 168 * on all cpus, so report only for cpu #0
169 */
170 op_x86_warn_reserved(i);
156 continue; 171 continue;
172 }
157 rdmsrl(msrs->controls[i].addr, val); 173 rdmsrl(msrs->controls[i].addr, val);
174 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
175 op_x86_warn_in_use(i);
158 val &= model->reserved; 176 val &= model->reserved;
159 wrmsrl(msrs->controls[i].addr, val); 177 wrmsrl(msrs->controls[i].addr, val);
160 } 178 }
@@ -169,9 +187,7 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
169 /* enable active counters */ 187 /* enable active counters */
170 for (i = 0; i < NUM_COUNTERS; ++i) { 188 for (i = 0; i < NUM_COUNTERS; ++i) {
171 int virt = op_x86_phys_to_virt(i); 189 int virt = op_x86_phys_to_virt(i);
172 if (!counter_config[virt].enabled) 190 if (!reset_value[virt])
173 continue;
174 if (!msrs->counters[i].addr)
175 continue; 191 continue;
176 192
177 /* setup counter registers */ 193 /* setup counter registers */
@@ -185,7 +201,60 @@ static void op_amd_setup_ctrs(struct op_x86_model_spec const *model,
185 } 201 }
186} 202}
187 203
188#ifdef CONFIG_OPROFILE_IBS 204/*
205 * 16-bit Linear Feedback Shift Register (LFSR)
206 *
207 * 16 14 13 11
208 * Feedback polynomial = X + X + X + X + 1
209 */
210static unsigned int lfsr_random(void)
211{
212 static unsigned int lfsr_value = 0xF00D;
213 unsigned int bit;
214
215 /* Compute next bit to shift in */
216 bit = ((lfsr_value >> 0) ^
217 (lfsr_value >> 2) ^
218 (lfsr_value >> 3) ^
219 (lfsr_value >> 5)) & 0x0001;
220
221 /* Advance to next register value */
222 lfsr_value = (lfsr_value >> 1) | (bit << 15);
223
224 return lfsr_value;
225}
226
227/*
228 * IBS software randomization
229 *
230 * The IBS periodic op counter is randomized in software. The lower 12
231 * bits of the 20 bit counter are randomized. IbsOpCurCnt is
232 * initialized with a 12 bit random value.
233 */
234static inline u64 op_amd_randomize_ibs_op(u64 val)
235{
236 unsigned int random = lfsr_random();
237
238 if (!(ibs_caps & IBS_CAPS_RDWROPCNT))
239 /*
 240 * Work around hardware that cannot write to IbsOpCurCnt
241 *
242 * Randomize the lower 8 bits of the 16 bit
243 * IbsOpMaxCnt [15:0] value in the range of -128 to
244 * +127 by adding/subtracting an offset to the
245 * maximum count (IbsOpMaxCnt).
246 *
247 * To avoid over or underflows and protect upper bits
248 * starting at bit 16, the initial value for
249 * IbsOpMaxCnt must fit in the range from 0x0081 to
250 * 0xff80.
251 */
252 val += (s8)(random >> 4);
253 else
254 val |= (u64)(random & IBS_RANDOM_MASK) << 32;
255
256 return val;
257}
189 258
190static inline void 259static inline void
191op_amd_handle_ibs(struct pt_regs * const regs, 260op_amd_handle_ibs(struct pt_regs * const regs,
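
lfsr_random() above is a 16-bit Fibonacci LFSR with feedback polynomial x^16 + x^14 + x^13 + x^11 + 1, a maximal-length configuration. A standalone version that walks the whole cycle and confirms the 65535-state period from the 0xF00D seed:

#include <stdio.h>

/* Same step as lfsr_random(): taps at bits 0, 2, 3 and 5, shift right,
 * feed the new bit in at bit 15. */
static unsigned int lfsr_step(unsigned int v)
{
    unsigned int bit = ((v >> 0) ^ (v >> 2) ^ (v >> 3) ^ (v >> 5)) & 1;

    return (v >> 1) | (bit << 15);
}

int main(void)
{
    unsigned int seed = 0xF00D, v = seed;
    unsigned long period = 0;

    do {
        v = lfsr_step(v);
        period++;
    } while (v != seed);

    /* a maximal-length 16-bit LFSR cycles through 65535 non-zero states */
    printf("period starting from 0x%04X: %lu\n", seed, period);
    return 0;
}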
@@ -194,7 +263,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
194 u64 val, ctl; 263 u64 val, ctl;
195 struct op_entry entry; 264 struct op_entry entry;
196 265
197 if (!has_ibs) 266 if (!ibs_caps)
198 return; 267 return;
199 268
200 if (ibs_config.fetch_enabled) { 269 if (ibs_config.fetch_enabled) {
@@ -210,7 +279,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
210 oprofile_write_commit(&entry); 279 oprofile_write_commit(&entry);
211 280
212 /* reenable the IRQ */ 281 /* reenable the IRQ */
213 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT_MASK); 282 ctl &= ~(IBS_FETCH_VAL | IBS_FETCH_CNT);
214 ctl |= IBS_FETCH_ENABLE; 283 ctl |= IBS_FETCH_ENABLE;
215 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl); 284 wrmsrl(MSR_AMD64_IBSFETCHCTL, ctl);
216 } 285 }
@@ -236,8 +305,7 @@ op_amd_handle_ibs(struct pt_regs * const regs,
236 oprofile_write_commit(&entry); 305 oprofile_write_commit(&entry);
237 306
238 /* reenable the IRQ */ 307 /* reenable the IRQ */
239 ctl &= ~IBS_OP_VAL & 0xFFFFFFFF; 308 ctl = op_amd_randomize_ibs_op(ibs_op_ctl);
240 ctl |= IBS_OP_ENABLE;
241 wrmsrl(MSR_AMD64_IBSOPCTL, ctl); 309 wrmsrl(MSR_AMD64_IBSOPCTL, ctl);
242 } 310 }
243 } 311 }
@@ -246,41 +314,57 @@ op_amd_handle_ibs(struct pt_regs * const regs,
246static inline void op_amd_start_ibs(void) 314static inline void op_amd_start_ibs(void)
247{ 315{
248 u64 val; 316 u64 val;
249 if (has_ibs && ibs_config.fetch_enabled) { 317
250 val = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; 318 if (!ibs_caps)
319 return;
320
321 if (ibs_config.fetch_enabled) {
322 val = (ibs_config.max_cnt_fetch >> 4) & IBS_FETCH_MAX_CNT;
251 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0; 323 val |= ibs_config.rand_en ? IBS_FETCH_RAND_EN : 0;
252 val |= IBS_FETCH_ENABLE; 324 val |= IBS_FETCH_ENABLE;
253 wrmsrl(MSR_AMD64_IBSFETCHCTL, val); 325 wrmsrl(MSR_AMD64_IBSFETCHCTL, val);
254 } 326 }
255 327
256 if (has_ibs && ibs_config.op_enabled) { 328 if (ibs_config.op_enabled) {
257 val = (ibs_config.max_cnt_op >> 4) & 0xFFFF; 329 ibs_op_ctl = ibs_config.max_cnt_op >> 4;
258 val |= ibs_config.dispatched_ops ? IBS_OP_CNT_CTL : 0; 330 if (!(ibs_caps & IBS_CAPS_RDWROPCNT)) {
259 val |= IBS_OP_ENABLE; 331 /*
332 * IbsOpCurCnt not supported. See
333 * op_amd_randomize_ibs_op() for details.
334 */
335 ibs_op_ctl = clamp(ibs_op_ctl, 0x0081ULL, 0xFF80ULL);
336 } else {
337 /*
338 * The start value is randomized with a
339 * positive offset, we need to compensate it
340 * with the half of the randomized range. Also
341 * avoid underflows.
342 */
343 ibs_op_ctl = min(ibs_op_ctl + IBS_RANDOM_MAXCNT_OFFSET,
344 IBS_OP_MAX_CNT);
345 }
346 if (ibs_caps & IBS_CAPS_OPCNT && ibs_config.dispatched_ops)
347 ibs_op_ctl |= IBS_OP_CNT_CTL;
348 ibs_op_ctl |= IBS_OP_ENABLE;
349 val = op_amd_randomize_ibs_op(ibs_op_ctl);
260 wrmsrl(MSR_AMD64_IBSOPCTL, val); 350 wrmsrl(MSR_AMD64_IBSOPCTL, val);
261 } 351 }
262} 352}
263 353
264static void op_amd_stop_ibs(void) 354static void op_amd_stop_ibs(void)
265{ 355{
266 if (has_ibs && ibs_config.fetch_enabled) 356 if (!ibs_caps)
357 return;
358
359 if (ibs_config.fetch_enabled)
267 /* clear max count and enable */ 360 /* clear max count and enable */
268 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0); 361 wrmsrl(MSR_AMD64_IBSFETCHCTL, 0);
269 362
270 if (has_ibs && ibs_config.op_enabled) 363 if (ibs_config.op_enabled)
271 /* clear max count and enable */ 364 /* clear max count and enable */
272 wrmsrl(MSR_AMD64_IBSOPCTL, 0); 365 wrmsrl(MSR_AMD64_IBSOPCTL, 0);
273} 366}
274 367
275#else
276
277static inline void op_amd_handle_ibs(struct pt_regs * const regs,
278 struct op_msrs const * const msrs) { }
279static inline void op_amd_start_ibs(void) { }
280static inline void op_amd_stop_ibs(void) { }
281
282#endif
283
284static int op_amd_check_ctrs(struct pt_regs * const regs, 368static int op_amd_check_ctrs(struct pt_regs * const regs,
285 struct op_msrs const * const msrs) 369 struct op_msrs const * const msrs)
286{ 370{
@@ -314,7 +398,7 @@ static void op_amd_start(struct op_msrs const * const msrs)
314 if (!reset_value[op_x86_phys_to_virt(i)]) 398 if (!reset_value[op_x86_phys_to_virt(i)])
315 continue; 399 continue;
316 rdmsrl(msrs->controls[i].addr, val); 400 rdmsrl(msrs->controls[i].addr, val);
317 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 401 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
318 wrmsrl(msrs->controls[i].addr, val); 402 wrmsrl(msrs->controls[i].addr, val);
319 } 403 }
320 404
@@ -334,7 +418,7 @@ static void op_amd_stop(struct op_msrs const * const msrs)
334 if (!reset_value[op_x86_phys_to_virt(i)]) 418 if (!reset_value[op_x86_phys_to_virt(i)])
335 continue; 419 continue;
336 rdmsrl(msrs->controls[i].addr, val); 420 rdmsrl(msrs->controls[i].addr, val);
337 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 421 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
338 wrmsrl(msrs->controls[i].addr, val); 422 wrmsrl(msrs->controls[i].addr, val);
339 } 423 }
340 424
@@ -355,8 +439,6 @@ static void op_amd_shutdown(struct op_msrs const * const msrs)
355 } 439 }
356} 440}
357 441
358#ifdef CONFIG_OPROFILE_IBS
359
360static u8 ibs_eilvt_off; 442static u8 ibs_eilvt_off;
361 443
362static inline void apic_init_ibs_nmi_per_cpu(void *arg) 444static inline void apic_init_ibs_nmi_per_cpu(void *arg)
@@ -405,45 +487,36 @@ static int init_ibs_nmi(void)
405 return 1; 487 return 1;
406 } 488 }
407 489
408#ifdef CONFIG_NUMA
409 /* Sanity check */
410 /* Works only for 64bit with proper numa implementation. */
411 if (nodes != num_possible_nodes()) {
412 printk(KERN_DEBUG "Failed to setup CPU node(s) for IBS, "
413 "found: %d, expected %d",
414 nodes, num_possible_nodes());
415 return 1;
416 }
417#endif
418 return 0; 490 return 0;
419} 491}
420 492
421/* uninitialize the APIC for the IBS interrupts if needed */ 493/* uninitialize the APIC for the IBS interrupts if needed */
422static void clear_ibs_nmi(void) 494static void clear_ibs_nmi(void)
423{ 495{
424 if (has_ibs) 496 if (ibs_caps)
425 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); 497 on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1);
426} 498}
427 499
428/* initialize the APIC for the IBS interrupts if available */ 500/* initialize the APIC for the IBS interrupts if available */
429static void ibs_init(void) 501static void ibs_init(void)
430{ 502{
431 has_ibs = boot_cpu_has(X86_FEATURE_IBS); 503 ibs_caps = get_ibs_caps();
432 504
433 if (!has_ibs) 505 if (!ibs_caps)
434 return; 506 return;
435 507
436 if (init_ibs_nmi()) { 508 if (init_ibs_nmi()) {
437 has_ibs = 0; 509 ibs_caps = 0;
438 return; 510 return;
439 } 511 }
440 512
441 printk(KERN_INFO "oprofile: AMD IBS detected\n"); 513 printk(KERN_INFO "oprofile: AMD IBS detected (0x%08x)\n",
514 (unsigned)ibs_caps);
442} 515}
443 516
444static void ibs_exit(void) 517static void ibs_exit(void)
445{ 518{
446 if (!has_ibs) 519 if (!ibs_caps)
447 return; 520 return;
448 521
449 clear_ibs_nmi(); 522 clear_ibs_nmi();
@@ -463,7 +536,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
463 if (ret) 536 if (ret)
464 return ret; 537 return ret;
465 538
466 if (!has_ibs) 539 if (!ibs_caps)
467 return ret; 540 return ret;
468 541
469 /* model specific files */ 542 /* model specific files */
@@ -473,7 +546,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
473 ibs_config.fetch_enabled = 0; 546 ibs_config.fetch_enabled = 0;
474 ibs_config.max_cnt_op = 250000; 547 ibs_config.max_cnt_op = 250000;
475 ibs_config.op_enabled = 0; 548 ibs_config.op_enabled = 0;
476 ibs_config.dispatched_ops = 1; 549 ibs_config.dispatched_ops = 0;
477 550
478 dir = oprofilefs_mkdir(sb, root, "ibs_fetch"); 551 dir = oprofilefs_mkdir(sb, root, "ibs_fetch");
479 oprofilefs_create_ulong(sb, dir, "enable", 552 oprofilefs_create_ulong(sb, dir, "enable",
@@ -488,8 +561,9 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root)
488 &ibs_config.op_enabled); 561 &ibs_config.op_enabled);
489 oprofilefs_create_ulong(sb, dir, "max_count", 562 oprofilefs_create_ulong(sb, dir, "max_count",
490 &ibs_config.max_cnt_op); 563 &ibs_config.max_cnt_op);
491 oprofilefs_create_ulong(sb, dir, "dispatched_ops", 564 if (ibs_caps & IBS_CAPS_OPCNT)
492 &ibs_config.dispatched_ops); 565 oprofilefs_create_ulong(sb, dir, "dispatched_ops",
566 &ibs_config.dispatched_ops);
493 567
494 return 0; 568 return 0;
495} 569}
@@ -507,19 +581,6 @@ static void op_amd_exit(void)
507 ibs_exit(); 581 ibs_exit();
508} 582}
509 583
510#else
511
512/* no IBS support */
513
514static int op_amd_init(struct oprofile_operations *ops)
515{
516 return 0;
517}
518
519static void op_amd_exit(void) {}
520
521#endif /* CONFIG_OPROFILE_IBS */
522
523struct op_x86_model_spec op_amd_spec = { 584struct op_x86_model_spec op_amd_spec = {
524 .num_counters = NUM_COUNTERS, 585 .num_counters = NUM_COUNTERS,
525 .num_controls = NUM_CONTROLS, 586 .num_controls = NUM_CONTROLS,
diff --git a/arch/x86/oprofile/op_model_p4.c b/arch/x86/oprofile/op_model_p4.c
index ac6b354becdf..e6a160a4684a 100644
--- a/arch/x86/oprofile/op_model_p4.c
+++ b/arch/x86/oprofile/op_model_p4.c
@@ -394,12 +394,6 @@ static void p4_fill_in_addresses(struct op_msrs * const msrs)
394 setup_num_counters(); 394 setup_num_counters();
395 stag = get_stagger(); 395 stag = get_stagger();
396 396
397 /* initialize some registers */
398 for (i = 0; i < num_counters; ++i)
399 msrs->counters[i].addr = 0;
400 for (i = 0; i < num_controls; ++i)
401 msrs->controls[i].addr = 0;
402
403 /* the counter & cccr registers we pay attention to */ 397 /* the counter & cccr registers we pay attention to */
404 for (i = 0; i < num_counters; ++i) { 398 for (i = 0; i < num_counters; ++i) {
405 addr = p4_counters[VIRT_CTR(stag, i)].counter_address; 399 addr = p4_counters[VIRT_CTR(stag, i)].counter_address;
diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c
index 8eb05878554c..2bf90fafa7b5 100644
--- a/arch/x86/oprofile/op_model_ppro.c
+++ b/arch/x86/oprofile/op_model_ppro.c
@@ -37,15 +37,11 @@ static void ppro_fill_in_addresses(struct op_msrs * const msrs)
37 for (i = 0; i < num_counters; i++) { 37 for (i = 0; i < num_counters; i++) {
38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i)) 38 if (reserve_perfctr_nmi(MSR_P6_PERFCTR0 + i))
39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i; 39 msrs->counters[i].addr = MSR_P6_PERFCTR0 + i;
40 else
41 msrs->counters[i].addr = 0;
42 } 40 }
43 41
44 for (i = 0; i < num_counters; i++) { 42 for (i = 0; i < num_counters; i++) {
45 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i)) 43 if (reserve_evntsel_nmi(MSR_P6_EVNTSEL0 + i))
46 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i; 44 msrs->controls[i].addr = MSR_P6_EVNTSEL0 + i;
47 else
48 msrs->controls[i].addr = 0;
49 } 45 }
50} 46}
51 47
@@ -57,7 +53,7 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
57 int i; 53 int i;
58 54
59 if (!reset_value) { 55 if (!reset_value) {
60 reset_value = kmalloc(sizeof(reset_value[0]) * num_counters, 56 reset_value = kzalloc(sizeof(reset_value[0]) * num_counters,
61 GFP_ATOMIC); 57 GFP_ATOMIC);
62 if (!reset_value) 58 if (!reset_value)
63 return; 59 return;
@@ -82,9 +78,18 @@ static void ppro_setup_ctrs(struct op_x86_model_spec const *model,
82 78
83 /* clear all counters */ 79 /* clear all counters */
84 for (i = 0; i < num_counters; ++i) { 80 for (i = 0; i < num_counters; ++i) {
85 if (unlikely(!msrs->controls[i].addr)) 81 if (unlikely(!msrs->controls[i].addr)) {
82 if (counter_config[i].enabled && !smp_processor_id())
83 /*
 84 * the counter is already reserved; this runs
 85 * on all cpus, so report only for cpu #0
86 */
87 op_x86_warn_reserved(i);
86 continue; 88 continue;
89 }
87 rdmsrl(msrs->controls[i].addr, val); 90 rdmsrl(msrs->controls[i].addr, val);
91 if (val & ARCH_PERFMON_EVENTSEL_ENABLE)
92 op_x86_warn_in_use(i);
88 val &= model->reserved; 93 val &= model->reserved;
89 wrmsrl(msrs->controls[i].addr, val); 94 wrmsrl(msrs->controls[i].addr, val);
90 } 95 }
@@ -161,7 +166,7 @@ static void ppro_start(struct op_msrs const * const msrs)
161 for (i = 0; i < num_counters; ++i) { 166 for (i = 0; i < num_counters; ++i) {
162 if (reset_value[i]) { 167 if (reset_value[i]) {
163 rdmsrl(msrs->controls[i].addr, val); 168 rdmsrl(msrs->controls[i].addr, val);
164 val |= ARCH_PERFMON_EVENTSEL0_ENABLE; 169 val |= ARCH_PERFMON_EVENTSEL_ENABLE;
165 wrmsrl(msrs->controls[i].addr, val); 170 wrmsrl(msrs->controls[i].addr, val);
166 } 171 }
167 } 172 }
@@ -179,7 +184,7 @@ static void ppro_stop(struct op_msrs const * const msrs)
179 if (!reset_value[i]) 184 if (!reset_value[i])
180 continue; 185 continue;
181 rdmsrl(msrs->controls[i].addr, val); 186 rdmsrl(msrs->controls[i].addr, val);
182 val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; 187 val &= ~ARCH_PERFMON_EVENTSEL_ENABLE;
183 wrmsrl(msrs->controls[i].addr, val); 188 wrmsrl(msrs->controls[i].addr, val);
184 } 189 }
185} 190}
diff --git a/arch/x86/oprofile/op_x86_model.h b/arch/x86/oprofile/op_x86_model.h
index 7b8e75d16081..ff82a755edd4 100644
--- a/arch/x86/oprofile/op_x86_model.h
+++ b/arch/x86/oprofile/op_x86_model.h
@@ -57,6 +57,26 @@ struct op_x86_model_spec {
57 57
58struct op_counter_config; 58struct op_counter_config;
59 59
60static inline void op_x86_warn_in_use(int counter)
61{
62 /*
63 * The warning indicates an already running counter. If
64 * oprofile doesn't collect data, then try using a different
65 * performance counter on your platform to monitor the desired
66 * event. Delete counter #%d from the desired event by editing
67 * the /usr/share/oprofile/%s/<cpu>/events file. If the event
68 * cannot be monitored by any other counter, contact your
69 * hardware or BIOS vendor.
70 */
71 pr_warning("oprofile: counter #%d on cpu #%d may already be used\n",
72 counter, smp_processor_id());
73}
74
75static inline void op_x86_warn_reserved(int counter)
76{
77 pr_warning("oprofile: counter #%d is already reserved\n", counter);
78}
79
60extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model, 80extern u64 op_x86_get_ctrl(struct op_x86_model_spec const *model,
61 struct op_counter_config *counter_config); 81 struct op_counter_config *counter_config);
62extern int op_x86_phys_to_virt(int phys); 82extern int op_x86_phys_to_virt(int phys);
diff --git a/arch/x86/pci/Makefile b/arch/x86/pci/Makefile
index d49202e740ea..b110d97fb925 100644
--- a/arch/x86/pci/Makefile
+++ b/arch/x86/pci/Makefile
@@ -13,5 +13,11 @@ obj-$(CONFIG_X86_VISWS) += visws.o
13 13
14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o 14obj-$(CONFIG_X86_NUMAQ) += numaq_32.o
15 15
16obj-$(CONFIG_X86_MRST) += mrst.o
17
16obj-y += common.o early.o 18obj-y += common.o early.o
17obj-y += amd_bus.o 19obj-y += amd_bus.o bus_numa.o
20
21ifeq ($(CONFIG_PCI_DEBUG),y)
22EXTRA_CFLAGS += -DDEBUG
23endif
diff --git a/arch/x86/pci/acpi.c b/arch/x86/pci/acpi.c
index 1014eb4bfc37..31930fd30ea9 100644
--- a/arch/x86/pci/acpi.c
+++ b/arch/x86/pci/acpi.c
@@ -3,10 +3,12 @@
3#include <linux/init.h> 3#include <linux/init.h>
4#include <linux/irq.h> 4#include <linux/irq.h>
5#include <linux/dmi.h> 5#include <linux/dmi.h>
6#include <linux/slab.h>
6#include <asm/numa.h> 7#include <asm/numa.h>
7#include <asm/pci_x86.h> 8#include <asm/pci_x86.h>
8 9
9struct pci_root_info { 10struct pci_root_info {
11 struct acpi_device *bridge;
10 char *name; 12 char *name;
11 unsigned int res_num; 13 unsigned int res_num;
12 struct resource *res; 14 struct resource *res;
@@ -14,19 +16,94 @@ struct pci_root_info {
14 int busnum; 16 int busnum;
15}; 17};
16 18
19static bool pci_use_crs = true;
20
21static int __init set_use_crs(const struct dmi_system_id *id)
22{
23 pci_use_crs = true;
24 return 0;
25}
26
27static const struct dmi_system_id pci_use_crs_table[] __initconst = {
28 /* http://bugzilla.kernel.org/show_bug.cgi?id=14183 */
29 {
30 .callback = set_use_crs,
31 .ident = "IBM System x3800",
32 .matches = {
33 DMI_MATCH(DMI_SYS_VENDOR, "IBM"),
34 DMI_MATCH(DMI_PRODUCT_NAME, "x3800"),
35 },
36 },
37 {}
38};
39
40void __init pci_acpi_crs_quirks(void)
41{
42 int year;
43
44 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) && year < 2008)
45 pci_use_crs = false;
46
47 dmi_check_system(pci_use_crs_table);
48
49 /*
50 * If the user specifies "pci=use_crs" or "pci=nocrs" explicitly, that
51 * takes precedence over anything we figured out above.
52 */
53 if (pci_probe & PCI_ROOT_NO_CRS)
54 pci_use_crs = false;
55 else if (pci_probe & PCI_USE__CRS)
56 pci_use_crs = true;
57
58 printk(KERN_INFO "PCI: %s host bridge windows from ACPI; "
59 "if necessary, use \"pci=%s\" and report a bug\n",
60 pci_use_crs ? "Using" : "Ignoring",
61 pci_use_crs ? "nocrs" : "use_crs");
62}
63
17static acpi_status 64static acpi_status
18resource_to_addr(struct acpi_resource *resource, 65resource_to_addr(struct acpi_resource *resource,
19 struct acpi_resource_address64 *addr) 66 struct acpi_resource_address64 *addr)
20{ 67{
21 acpi_status status; 68 acpi_status status;
22 69 struct acpi_resource_memory24 *memory24;
23 status = acpi_resource_to_address64(resource, addr); 70 struct acpi_resource_memory32 *memory32;
24 if (ACPI_SUCCESS(status) && 71 struct acpi_resource_fixed_memory32 *fixed_memory32;
25 (addr->resource_type == ACPI_MEMORY_RANGE || 72
26 addr->resource_type == ACPI_IO_RANGE) && 73 memset(addr, 0, sizeof(*addr));
27 addr->address_length > 0 && 74 switch (resource->type) {
28 addr->producer_consumer == ACPI_PRODUCER) { 75 case ACPI_RESOURCE_TYPE_MEMORY24:
76 memory24 = &resource->data.memory24;
77 addr->resource_type = ACPI_MEMORY_RANGE;
78 addr->minimum = memory24->minimum;
79 addr->address_length = memory24->address_length;
80 addr->maximum = addr->minimum + addr->address_length - 1;
81 return AE_OK;
82 case ACPI_RESOURCE_TYPE_MEMORY32:
83 memory32 = &resource->data.memory32;
84 addr->resource_type = ACPI_MEMORY_RANGE;
85 addr->minimum = memory32->minimum;
86 addr->address_length = memory32->address_length;
87 addr->maximum = addr->minimum + addr->address_length - 1;
29 return AE_OK; 88 return AE_OK;
89 case ACPI_RESOURCE_TYPE_FIXED_MEMORY32:
90 fixed_memory32 = &resource->data.fixed_memory32;
91 addr->resource_type = ACPI_MEMORY_RANGE;
92 addr->minimum = fixed_memory32->address;
93 addr->address_length = fixed_memory32->address_length;
94 addr->maximum = addr->minimum + addr->address_length - 1;
95 return AE_OK;
96 case ACPI_RESOURCE_TYPE_ADDRESS16:
97 case ACPI_RESOURCE_TYPE_ADDRESS32:
98 case ACPI_RESOURCE_TYPE_ADDRESS64:
99 status = acpi_resource_to_address64(resource, addr);
100 if (ACPI_SUCCESS(status) &&
101 (addr->resource_type == ACPI_MEMORY_RANGE ||
102 addr->resource_type == ACPI_IO_RANGE) &&
103 addr->address_length > 0) {
104 return AE_OK;
105 }
106 break;
30 } 107 }
31 return AE_ERROR; 108 return AE_ERROR;
32} 109}
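
resource_to_addr() above now also normalizes the fixed/24-bit/32-bit memory descriptors into a common minimum/maximum/length form, deriving the inclusive end as base + length - 1. A small user-space sketch of that normalization (struct and values are illustrative only):

#include <stdint.h>
#include <stdio.h>

struct addr_sketch { uint64_t minimum, maximum, length; };

/* Derive an inclusive [minimum, maximum] window from a base + length pair. */
static int fixed_mem_to_addr(uint64_t base, uint64_t length,
                             struct addr_sketch *addr)
{
    if (!length)
        return -1;                 /* zero-length windows are ignored */
    addr->minimum = base;
    addr->length  = length;
    addr->maximum = base + length - 1;
    return 0;
}

int main(void)
{
    struct addr_sketch a;

    if (!fixed_mem_to_addr(0xfed00000, 0x400, &a))
        printf("window [%#llx-%#llx]\n",
               (unsigned long long)a.minimum,
               (unsigned long long)a.maximum);
    return 0;
}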
@@ -44,20 +121,6 @@ count_resource(struct acpi_resource *acpi_res, void *data)
44 return AE_OK; 121 return AE_OK;
45} 122}
46 123
47static int
48bus_has_transparent_bridge(struct pci_bus *bus)
49{
50 struct pci_dev *dev;
51
52 list_for_each_entry(dev, &bus->devices, bus_list) {
53 u16 class = dev->class >> 8;
54
55 if (class == PCI_CLASS_BRIDGE_PCI && dev->transparent)
56 return true;
57 }
58 return false;
59}
60
61static acpi_status 124static acpi_status
62setup_resource(struct acpi_resource *acpi_res, void *data) 125setup_resource(struct acpi_resource *acpi_res, void *data)
63{ 126{
@@ -66,13 +129,9 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
66 struct acpi_resource_address64 addr; 129 struct acpi_resource_address64 addr;
67 acpi_status status; 130 acpi_status status;
68 unsigned long flags; 131 unsigned long flags;
69 struct resource *root; 132 struct resource *root, *conflict;
70 int max_root_bus_resources = PCI_BUS_NUM_RESOURCES;
71 u64 start, end; 133 u64 start, end;
72 134
73 if (bus_has_transparent_bridge(info->bus))
74 max_root_bus_resources -= 3;
75
76 status = resource_to_addr(acpi_res, &addr); 135 status = resource_to_addr(acpi_res, &addr);
77 if (!ACPI_SUCCESS(status)) 136 if (!ACPI_SUCCESS(status))
78 return AE_OK; 137 return AE_OK;
@@ -89,15 +148,7 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
89 return AE_OK; 148 return AE_OK;
90 149
91 start = addr.minimum + addr.translation_offset; 150 start = addr.minimum + addr.translation_offset;
92 end = start + addr.address_length - 1; 151 end = addr.maximum + addr.translation_offset;
93 if (info->res_num >= max_root_bus_resources) {
94 printk(KERN_WARNING "PCI: Failed to allocate 0x%lx-0x%lx "
95 "from %s for %s due to _CRS returning more than "
96 "%d resource descriptors\n", (unsigned long) start,
97 (unsigned long) end, root->name, info->name,
98 max_root_bus_resources);
99 return AE_OK;
100 }
101 152
102 res = &info->res[info->res_num]; 153 res = &info->res[info->res_num];
103 res->name = info->name; 154 res->name = info->name;
@@ -106,13 +157,29 @@ setup_resource(struct acpi_resource *acpi_res, void *data)
106 res->end = end; 157 res->end = end;
107 res->child = NULL; 158 res->child = NULL;
108 159
109 if (insert_resource(root, res)) { 160 if (!pci_use_crs) {
110 printk(KERN_ERR "PCI: Failed to allocate 0x%lx-0x%lx " 161 dev_printk(KERN_DEBUG, &info->bridge->dev,
111 "from %s for %s\n", (unsigned long) res->start, 162 "host bridge window %pR (ignored)\n", res);
112 (unsigned long) res->end, root->name, info->name); 163 return AE_OK;
164 }
165
166 conflict = insert_resource_conflict(root, res);
167 if (conflict) {
168 dev_err(&info->bridge->dev,
169 "address space collision: host bridge window %pR "
170 "conflicts with %s %pR\n",
171 res, conflict->name, conflict);
113 } else { 172 } else {
114 info->bus->resource[info->res_num] = res; 173 pci_bus_add_resource(info->bus, res, 0);
115 info->res_num++; 174 info->res_num++;
175 if (addr.translation_offset)
176 dev_info(&info->bridge->dev, "host bridge window %pR "
177 "(PCI address [%#llx-%#llx])\n",
178 res, res->start - addr.translation_offset,
179 res->end - addr.translation_offset);
180 else
181 dev_info(&info->bridge->dev,
182 "host bridge window %pR\n", res);
116 } 183 }
117 return AE_OK; 184 return AE_OK;
118} 185}
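setup_resource() now takes the window end from addr.maximum plus the translation offset instead of recomputing it from the length, and logs both the host and PCI views when a translation offset is present. The arithmetic is simply host address = PCI address + translation_offset; a small sketch with made-up values:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Values a _CRS descriptor might carry; made up for illustration. */
	uint64_t minimum = 0x0, maximum = 0xffff;	/* PCI-side window  */
	uint64_t translation_offset = 0x100000000ULL;	/* host-side offset */

	/* Host (CPU) view of the window, as stored in struct resource. */
	uint64_t start = minimum + translation_offset;
	uint64_t end   = maximum + translation_offset;

	/* PCI view, recovered the same way the dev_info() above prints it. */
	printf("host bridge window [%#llx-%#llx] (PCI address [%#llx-%#llx])\n",
	       (unsigned long long)start, (unsigned long long)end,
	       (unsigned long long)(start - translation_offset),
	       (unsigned long long)(end - translation_offset));
	return 0;
}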
@@ -124,6 +191,10 @@ get_current_resources(struct acpi_device *device, int busnum,
124 struct pci_root_info info; 191 struct pci_root_info info;
125 size_t size; 192 size_t size;
126 193
194 if (pci_use_crs)
195 pci_bus_remove_resources(bus);
196
197 info.bridge = device;
127 info.bus = bus; 198 info.bus = bus;
128 info.res_num = 0; 199 info.res_num = 0;
129 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource, 200 acpi_walk_resources(device->handle, METHOD_NAME__CRS, count_resource,
@@ -163,8 +234,9 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
163#endif 234#endif
164 235
165 if (domain && !pci_domains_supported) { 236 if (domain && !pci_domains_supported) {
166 printk(KERN_WARNING "PCI: Multiple domains not supported " 237 printk(KERN_WARNING "pci_bus %04x:%02x: "
167 "(dom %d, bus %d)\n", domain, busnum); 238 "ignored (multiple domains not supported)\n",
239 domain, busnum);
168 return NULL; 240 return NULL;
169 } 241 }
170 242
@@ -188,7 +260,8 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
188 */ 260 */
189 sd = kzalloc(sizeof(*sd), GFP_KERNEL); 261 sd = kzalloc(sizeof(*sd), GFP_KERNEL);
190 if (!sd) { 262 if (!sd) {
191 printk(KERN_ERR "PCI: OOM, not probing PCI bus %02x\n", busnum); 263 printk(KERN_WARNING "pci_bus %04x:%02x: "
264 "ignored (out of memory)\n", domain, busnum);
192 return NULL; 265 return NULL;
193 } 266 }
194 267
@@ -209,9 +282,7 @@ struct pci_bus * __devinit pci_acpi_scan_root(struct acpi_device *device, int do
209 } else { 282 } else {
210 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd); 283 bus = pci_create_bus(NULL, busnum, &pci_root_ops, sd);
211 if (bus) { 284 if (bus) {
212 if (pci_probe & PCI_USE__CRS) 285 get_current_resources(device, busnum, domain, bus);
213 get_current_resources(device, busnum, domain,
214 bus);
215 bus->subordinate = pci_scan_child_bus(bus); 286 bus->subordinate = pci_scan_child_bus(bus);
216 } 287 }
217 } 288 }
@@ -236,17 +307,14 @@ int __init pci_acpi_init(void)
236{ 307{
237 struct pci_dev *dev = NULL; 308 struct pci_dev *dev = NULL;
238 309
239 if (pcibios_scanned)
240 return 0;
241
242 if (acpi_noirq) 310 if (acpi_noirq)
243 return 0; 311 return -ENODEV;
244 312
245 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n"); 313 printk(KERN_INFO "PCI: Using ACPI for IRQ routing\n");
246 acpi_irq_penalty_init(); 314 acpi_irq_penalty_init();
247 pcibios_scanned++;
248 pcibios_enable_irq = acpi_pci_irq_enable; 315 pcibios_enable_irq = acpi_pci_irq_enable;
249 pcibios_disable_irq = acpi_pci_irq_disable; 316 pcibios_disable_irq = acpi_pci_irq_disable;
317 x86_init.pci.init_irq = x86_init_noop;
250 318
251 if (pci_routeirq) { 319 if (pci_routeirq) {
252 /* 320 /*
diff --git a/arch/x86/pci/amd_bus.c b/arch/x86/pci/amd_bus.c
index 572ee9782f2a..fc1e8fe07e5c 100644
--- a/arch/x86/pci/amd_bus.c
+++ b/arch/x86/pci/amd_bus.c
@@ -2,180 +2,19 @@
2#include <linux/pci.h> 2#include <linux/pci.h>
3#include <linux/topology.h> 3#include <linux/topology.h>
4#include <linux/cpu.h> 4#include <linux/cpu.h>
5#include <linux/range.h>
6
5#include <asm/pci_x86.h> 7#include <asm/pci_x86.h>
6 8
7#ifdef CONFIG_X86_64
8#include <asm/pci-direct.h> 9#include <asm/pci-direct.h>
9#include <asm/mpspec.h> 10
10#include <linux/cpumask.h> 11#include "bus_numa.h"
11#endif
12 12
13/* 13/*
14 * This discovers the pcibus <-> node mapping on AMD K8. 14 * This discovers the pcibus <-> node mapping on AMD K8.
15 * also get peer root bus resource for io,mmio 15 * also get peer root bus resource for io,mmio
16 */ 16 */
17 17
18#ifdef CONFIG_X86_64
19
20/*
21 * sub bus (transparent) will use entres from 3 to store extra from root,
22 * so need to make sure have enought slot there, increase PCI_BUS_NUM_RESOURCES?
23 */
24#define RES_NUM 16
25struct pci_root_info {
26 char name[12];
27 unsigned int res_num;
28 struct resource res[RES_NUM];
29 int bus_min;
30 int bus_max;
31 int node;
32 int link;
33};
34
35/* 4 at this time, it may become to 32 */
36#define PCI_ROOT_NR 4
37static int pci_root_num;
38static struct pci_root_info pci_root_info[PCI_ROOT_NR];
39
40void x86_pci_root_bus_res_quirks(struct pci_bus *b)
41{
42 int i;
43 int j;
44 struct pci_root_info *info;
45
46 /* don't go for it if _CRS is used already */
47 if (b->resource[0] != &ioport_resource ||
48 b->resource[1] != &iomem_resource)
49 return;
50
51 /* if only one root bus, don't need to anything */
52 if (pci_root_num < 2)
53 return;
54
55 for (i = 0; i < pci_root_num; i++) {
56 if (pci_root_info[i].bus_min == b->number)
57 break;
58 }
59
60 if (i == pci_root_num)
61 return;
62
63 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
64 b->number);
65
66 info = &pci_root_info[i];
67 for (j = 0; j < info->res_num; j++) {
68 struct resource *res;
69 struct resource *root;
70
71 res = &info->res[j];
72 b->resource[j] = res;
73 if (res->flags & IORESOURCE_IO)
74 root = &ioport_resource;
75 else
76 root = &iomem_resource;
77 insert_resource(root, res);
78 }
79}
80
81#define RANGE_NUM 16
82
83struct res_range {
84 size_t start;
85 size_t end;
86};
87
88static void __init update_range(struct res_range *range, size_t start,
89 size_t end)
90{
91 int i;
92 int j;
93
94 for (j = 0; j < RANGE_NUM; j++) {
95 if (!range[j].end)
96 continue;
97
98 if (start <= range[j].start && end >= range[j].end) {
99 range[j].start = 0;
100 range[j].end = 0;
101 continue;
102 }
103
104 if (start <= range[j].start && end < range[j].end && range[j].start < end + 1) {
105 range[j].start = end + 1;
106 continue;
107 }
108
109
110 if (start > range[j].start && end >= range[j].end && range[j].end > start - 1) {
111 range[j].end = start - 1;
112 continue;
113 }
114
115 if (start > range[j].start && end < range[j].end) {
116 /* find the new spare */
117 for (i = 0; i < RANGE_NUM; i++) {
118 if (range[i].end == 0)
119 break;
120 }
121 if (i < RANGE_NUM) {
122 range[i].end = range[j].end;
123 range[i].start = end + 1;
124 } else {
125 printk(KERN_ERR "run of slot in ranges\n");
126 }
127 range[j].end = start - 1;
128 continue;
129 }
130 }
131}
132
133static void __init update_res(struct pci_root_info *info, size_t start,
134 size_t end, unsigned long flags, int merge)
135{
136 int i;
137 struct resource *res;
138
139 if (!merge)
140 goto addit;
141
142 /* try to merge it with old one */
143 for (i = 0; i < info->res_num; i++) {
144 size_t final_start, final_end;
145 size_t common_start, common_end;
146
147 res = &info->res[i];
148 if (res->flags != flags)
149 continue;
150
151 common_start = max((size_t)res->start, start);
152 common_end = min((size_t)res->end, end);
153 if (common_start > common_end + 1)
154 continue;
155
156 final_start = min((size_t)res->start, start);
157 final_end = max((size_t)res->end, end);
158
159 res->start = final_start;
160 res->end = final_end;
161 return;
162 }
163
164addit:
165
166 /* need to add that */
167 if (info->res_num >= RES_NUM)
168 return;
169
170 res = &info->res[info->res_num];
171 res->name = info->name;
172 res->flags = flags;
173 res->start = start;
174 res->end = end;
175 res->child = NULL;
176 info->res_num++;
177}
178
179struct pci_hostbridge_probe { 18struct pci_hostbridge_probe {
180 u32 bus; 19 u32 bus;
181 u32 slot; 20 u32 slot;
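The open-coded update_range() removed in the hunk above is superseded by the generic add_range()/subtract_range() helpers (used in the later hunks), which operate on half-open intervals, hence the "end + 1" at the converted call sites. A standalone model of the subtraction over a fixed-size table, covering the same four overlap cases as the old code but with exclusive ends:

#include <stdint.h>
#include <stdio.h>

#define RANGE_NUM 16

struct io_range { uint64_t start, end; };	/* [start, end): end exclusive */

/* Remove [start, end) from every slot, splitting a slot in two when the
 * removed span lies strictly inside it. */
static void subtract(struct io_range *r, uint64_t start, uint64_t end)
{
	int i, j;

	for (j = 0; j < RANGE_NUM; j++) {
		if (r[j].end <= r[j].start)
			continue;			/* empty slot */
		if (start <= r[j].start && end >= r[j].end) {
			r[j].start = r[j].end = 0;	/* fully covered */
		} else if (start <= r[j].start && end > r[j].start) {
			r[j].start = end;		/* clip the front */
		} else if (start < r[j].end && end >= r[j].end) {
			r[j].end = start;		/* clip the back  */
		} else if (start > r[j].start && end < r[j].end) {
			for (i = 0; i < RANGE_NUM; i++)	/* split: need a free slot */
				if (r[i].end <= r[i].start)
					break;
			if (i < RANGE_NUM) {
				r[i].start = end;
				r[i].end = r[j].end;
			}
			r[j].end = start;
		}
	}
}

int main(void)
{
	struct io_range r[RANGE_NUM] = { { 0, 0x10000 } };	/* whole io space */
	int j;

	subtract(r, 0x1000, 0x2000);				/* claim one window */
	for (j = 0; j < RANGE_NUM; j++)
		if (r[j].end > r[j].start)
			printf("left over: [%#llx-%#llx)\n",
			       (unsigned long long)r[j].start,
			       (unsigned long long)r[j].end);
	return 0;
}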
@@ -218,6 +57,8 @@ static void __init get_pci_mmcfg_amd_fam10h_range(void)
218 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1; 57 fam10h_mmconf_end = base + (1ULL<<(segn_busn_bits + 20)) - 1;
219} 58}
220 59
60#define RANGE_NUM 16
61
221/** 62/**
222 * early_fill_mp_bus_to_node() 63 * early_fill_mp_bus_to_node()
223 * called before pcibios_scan_root and pci_scan_bus 64 * called before pcibios_scan_root and pci_scan_bus
@@ -230,7 +71,6 @@ static int __init early_fill_mp_bus_info(void)
230 int j; 71 int j;
231 unsigned bus; 72 unsigned bus;
232 unsigned slot; 73 unsigned slot;
233 int found;
234 int node; 74 int node;
235 int link; 75 int link;
236 int def_node; 76 int def_node;
@@ -238,16 +78,17 @@ static int __init early_fill_mp_bus_info(void)
238 struct pci_root_info *info; 78 struct pci_root_info *info;
239 u32 reg; 79 u32 reg;
240 struct resource *res; 80 struct resource *res;
241 size_t start; 81 u64 start;
242 size_t end; 82 u64 end;
243 struct res_range range[RANGE_NUM]; 83 struct range range[RANGE_NUM];
244 u64 val; 84 u64 val;
245 u32 address; 85 u32 address;
86 bool found;
246 87
247 if (!early_pci_allowed()) 88 if (!early_pci_allowed())
248 return -1; 89 return -1;
249 90
250 found = 0; 91 found = false;
251 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) { 92 for (i = 0; i < ARRAY_SIZE(pci_probes); i++) {
252 u32 id; 93 u32 id;
253 u16 device; 94 u16 device;
@@ -261,7 +102,7 @@ static int __init early_fill_mp_bus_info(void)
261 device = (id>>16) & 0xffff; 102 device = (id>>16) & 0xffff;
262 if (pci_probes[i].vendor == vendor && 103 if (pci_probes[i].vendor == vendor &&
263 pci_probes[i].device == device) { 104 pci_probes[i].device == device) {
264 found = 1; 105 found = true;
265 break; 106 break;
266 } 107 }
267 } 108 }
@@ -304,7 +145,7 @@ static int __init early_fill_mp_bus_info(void)
304 def_link = (reg >> 8) & 0x03; 145 def_link = (reg >> 8) & 0x03;
305 146
306 memset(range, 0, sizeof(range)); 147 memset(range, 0, sizeof(range));
307 range[0].end = 0xffff; 148 add_range(range, RANGE_NUM, 0, 0, 0xffff + 1);
308 /* io port resource */ 149 /* io port resource */
309 for (i = 0; i < 4; i++) { 150 for (i = 0; i < 4; i++) {
310 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3)); 151 reg = read_pci_config(bus, slot, 1, 0xc0 + (i << 3));
@@ -328,13 +169,13 @@ static int __init early_fill_mp_bus_info(void)
328 169
329 info = &pci_root_info[j]; 170 info = &pci_root_info[j];
330 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n", 171 printk(KERN_DEBUG "node %d link %d: io port [%llx, %llx]\n",
331 node, link, (u64)start, (u64)end); 172 node, link, start, end);
332 173
333 /* kernel only handle 16 bit only */ 174 /* kernel only handle 16 bit only */
334 if (end > 0xffff) 175 if (end > 0xffff)
335 end = 0xffff; 176 end = 0xffff;
336 update_res(info, start, end, IORESOURCE_IO, 1); 177 update_res(info, start, end, IORESOURCE_IO, 1);
337 update_range(range, start, end); 178 subtract_range(range, RANGE_NUM, start, end + 1);
338 } 179 }
339 /* add left over io port range to def node/link, [0, 0xffff] */ 180 /* add left over io port range to def node/link, [0, 0xffff] */
340 /* find the position */ 181 /* find the position */
@@ -349,29 +190,32 @@ static int __init early_fill_mp_bus_info(void)
349 if (!range[i].end) 190 if (!range[i].end)
350 continue; 191 continue;
351 192
352 update_res(info, range[i].start, range[i].end, 193 update_res(info, range[i].start, range[i].end - 1,
353 IORESOURCE_IO, 1); 194 IORESOURCE_IO, 1);
354 } 195 }
355 } 196 }
356 197
357 memset(range, 0, sizeof(range)); 198 memset(range, 0, sizeof(range));
358 /* 0xfd00000000-0xffffffffff for HT */ 199 /* 0xfd00000000-0xffffffffff for HT */
359 range[0].end = (0xfdULL<<32) - 1; 200 end = cap_resource((0xfdULL<<32) - 1);
201 end++;
202 add_range(range, RANGE_NUM, 0, 0, end);
360 203
361 /* need to take out [0, TOM) for RAM*/ 204 /* need to take out [0, TOM) for RAM*/
362 address = MSR_K8_TOP_MEM1; 205 address = MSR_K8_TOP_MEM1;
363 rdmsrl(address, val); 206 rdmsrl(address, val);
364 end = (val & 0xffffff800000ULL); 207 end = (val & 0xffffff800000ULL);
365 printk(KERN_INFO "TOM: %016lx aka %ldM\n", end, end>>20); 208 printk(KERN_INFO "TOM: %016llx aka %lldM\n", end, end>>20);
366 if (end < (1ULL<<32)) 209 if (end < (1ULL<<32))
367 update_range(range, 0, end - 1); 210 subtract_range(range, RANGE_NUM, 0, end);
368 211
369 /* get mmconfig */ 212 /* get mmconfig */
370 get_pci_mmcfg_amd_fam10h_range(); 213 get_pci_mmcfg_amd_fam10h_range();
371 /* need to take out mmconf range */ 214 /* need to take out mmconf range */
372 if (fam10h_mmconf_end) { 215 if (fam10h_mmconf_end) {
373 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end); 216 printk(KERN_DEBUG "Fam 10h mmconf [%llx, %llx]\n", fam10h_mmconf_start, fam10h_mmconf_end);
374 update_range(range, fam10h_mmconf_start, fam10h_mmconf_end); 217 subtract_range(range, RANGE_NUM, fam10h_mmconf_start,
218 fam10h_mmconf_end + 1);
375 } 219 }
376 220
377 /* mmio resource */ 221 /* mmio resource */
@@ -401,7 +245,7 @@ static int __init early_fill_mp_bus_info(void)
401 info = &pci_root_info[j]; 245 info = &pci_root_info[j];
402 246
403 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]", 247 printk(KERN_DEBUG "node %d link %d: mmio [%llx, %llx]",
404 node, link, (u64)start, (u64)end); 248 node, link, start, end);
405 /* 249 /*
406 * some sick allocation would have range overlap with fam10h 250 * some sick allocation would have range overlap with fam10h
407 * mmconf range, so need to update start and end. 251 * mmconf range, so need to update start and end.
@@ -426,14 +270,15 @@ static int __init early_fill_mp_bus_info(void)
426 /* we got a hole */ 270 /* we got a hole */
427 endx = fam10h_mmconf_start - 1; 271 endx = fam10h_mmconf_start - 1;
428 update_res(info, start, endx, IORESOURCE_MEM, 0); 272 update_res(info, start, endx, IORESOURCE_MEM, 0);
429 update_range(range, start, endx); 273 subtract_range(range, RANGE_NUM, start,
430 printk(KERN_CONT " ==> [%llx, %llx]", (u64)start, endx); 274 endx + 1);
275 printk(KERN_CONT " ==> [%llx, %llx]", start, endx);
431 start = fam10h_mmconf_end + 1; 276 start = fam10h_mmconf_end + 1;
432 changed = 1; 277 changed = 1;
433 } 278 }
434 if (changed) { 279 if (changed) {
435 if (start <= end) { 280 if (start <= end) {
436 printk(KERN_CONT " %s [%llx, %llx]", endx?"and":"==>", (u64)start, (u64)end); 281 printk(KERN_CONT " %s [%llx, %llx]", endx ? "and" : "==>", start, end);
437 } else { 282 } else {
438 printk(KERN_CONT "%s\n", endx?"":" ==> none"); 283 printk(KERN_CONT "%s\n", endx?"":" ==> none");
439 continue; 284 continue;
@@ -441,8 +286,9 @@ static int __init early_fill_mp_bus_info(void)
441 } 286 }
442 } 287 }
443 288
444 update_res(info, start, end, IORESOURCE_MEM, 1); 289 update_res(info, cap_resource(start), cap_resource(end),
445 update_range(range, start, end); 290 IORESOURCE_MEM, 1);
291 subtract_range(range, RANGE_NUM, start, end + 1);
446 printk(KERN_CONT "\n"); 292 printk(KERN_CONT "\n");
447 } 293 }
448 294
@@ -456,8 +302,8 @@ static int __init early_fill_mp_bus_info(void)
456 address = MSR_K8_TOP_MEM2; 302 address = MSR_K8_TOP_MEM2;
457 rdmsrl(address, val); 303 rdmsrl(address, val);
458 end = (val & 0xffffff800000ULL); 304 end = (val & 0xffffff800000ULL);
459 printk(KERN_INFO "TOM2: %016lx aka %ldM\n", end, end>>20); 305 printk(KERN_INFO "TOM2: %016llx aka %lldM\n", end, end>>20);
460 update_range(range, 1ULL<<32, end - 1); 306 subtract_range(range, RANGE_NUM, 1ULL<<32, end);
461 } 307 }
462 308
463 /* 309 /*
@@ -476,7 +322,8 @@ static int __init early_fill_mp_bus_info(void)
476 if (!range[i].end) 322 if (!range[i].end)
477 continue; 323 continue;
478 324
479 update_res(info, range[i].start, range[i].end, 325 update_res(info, cap_resource(range[i].start),
326 cap_resource(range[i].end - 1),
480 IORESOURCE_MEM, 1); 327 IORESOURCE_MEM, 1);
481 } 328 }
482 } 329 }
@@ -488,28 +335,18 @@ static int __init early_fill_mp_bus_info(void)
488 info = &pci_root_info[i]; 335 info = &pci_root_info[i];
489 res_num = info->res_num; 336 res_num = info->res_num;
490 busnum = info->bus_min; 337 busnum = info->bus_min;
491 printk(KERN_DEBUG "bus: [%02x,%02x] on node %x link %x\n", 338 printk(KERN_DEBUG "bus: [%02x, %02x] on node %x link %x\n",
492 info->bus_min, info->bus_max, info->node, info->link); 339 info->bus_min, info->bus_max, info->node, info->link);
493 for (j = 0; j < res_num; j++) { 340 for (j = 0; j < res_num; j++) {
494 res = &info->res[j]; 341 res = &info->res[j];
495 printk(KERN_DEBUG "bus: %02x index %x %s: [%llx, %llx]\n", 342 printk(KERN_DEBUG "bus: %02x index %x %pR\n",
496 busnum, j, 343 busnum, j, res);
497 (res->flags & IORESOURCE_IO)?"io port":"mmio",
498 res->start, res->end);
499 } 344 }
500 } 345 }
501 346
502 return 0; 347 return 0;
503} 348}
504 349
505#else /* !CONFIG_X86_64 */
506
507static int __init early_fill_mp_bus_info(void) { return 0; }
508
509#endif /* !CONFIG_X86_64 */
510
511/* common 32/64 bit code */
512
513#define ENABLE_CF8_EXT_CFG (1ULL << 46) 350#define ENABLE_CF8_EXT_CFG (1ULL << 46)
514 351
515static void enable_pci_io_ecs(void *unused) 352static void enable_pci_io_ecs(void *unused)
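Several converted call sites above pass addresses through cap_resource() before update_res(). The point is that resource_size_t can be narrower than 64 bits (32-bit kernels without 64-bit physical addressing), so an out-of-range address is clamped to the maximum representable value rather than silently truncated; update_res() in bus_numa.c below then discards entries whose start collapses to that sentinel. A sketch of the clamp, with an artificially 32-bit resource type chosen purely for illustration:

#include <stdint.h>
#include <stdio.h>

/* Pretend resources are only 32 bits wide on this configuration. */
typedef uint32_t fake_resource_size_t;
#define FAKE_MAX_RESOURCE ((fake_resource_size_t)~0)

/* Clamp a 64-bit address so it fits, instead of letting it wrap or truncate. */
static fake_resource_size_t cap(uint64_t val)
{
	if (val > FAKE_MAX_RESOURCE)
		return FAKE_MAX_RESOURCE;
	return (fake_resource_size_t)val;
}

int main(void)
{
	uint64_t end = 0x123456789ULL;	/* an MMIO end above 4GB */

	/* clamped to 0xffffffff; plain truncation would have given 0x23456789 */
	printf("capped end = %#x\n", (unsigned)cap(end));
	return 0;
}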
diff --git a/arch/x86/pci/bus_numa.c b/arch/x86/pci/bus_numa.c
new file mode 100644
index 000000000000..64a122883896
--- /dev/null
+++ b/arch/x86/pci/bus_numa.c
@@ -0,0 +1,101 @@
1#include <linux/init.h>
2#include <linux/pci.h>
3#include <linux/range.h>
4
5#include "bus_numa.h"
6
7int pci_root_num;
8struct pci_root_info pci_root_info[PCI_ROOT_NR];
9
10void x86_pci_root_bus_res_quirks(struct pci_bus *b)
11{
12 int i;
13 int j;
14 struct pci_root_info *info;
15
16 /* don't go for it if _CRS is used already */
17 if (b->resource[0] != &ioport_resource ||
18 b->resource[1] != &iomem_resource)
19 return;
20
21 if (!pci_root_num)
22 return;
23
24 for (i = 0; i < pci_root_num; i++) {
25 if (pci_root_info[i].bus_min == b->number)
26 break;
27 }
28
29 if (i == pci_root_num)
30 return;
31
32 printk(KERN_DEBUG "PCI: peer root bus %02x res updated from pci conf\n",
33 b->number);
34
35 pci_bus_remove_resources(b);
36 info = &pci_root_info[i];
37 for (j = 0; j < info->res_num; j++) {
38 struct resource *res;
39 struct resource *root;
40
41 res = &info->res[j];
42 pci_bus_add_resource(b, res, 0);
43 if (res->flags & IORESOURCE_IO)
44 root = &ioport_resource;
45 else
46 root = &iomem_resource;
47 insert_resource(root, res);
48 }
49}
50
51void __devinit update_res(struct pci_root_info *info, resource_size_t start,
52 resource_size_t end, unsigned long flags, int merge)
53{
54 int i;
55 struct resource *res;
56
57 if (start > end)
58 return;
59
60 if (start == MAX_RESOURCE)
61 return;
62
63 if (!merge)
64 goto addit;
65
66 /* try to merge it with old one */
67 for (i = 0; i < info->res_num; i++) {
68 resource_size_t final_start, final_end;
69 resource_size_t common_start, common_end;
70
71 res = &info->res[i];
72 if (res->flags != flags)
73 continue;
74
75 common_start = max(res->start, start);
76 common_end = min(res->end, end);
77 if (common_start > common_end + 1)
78 continue;
79
80 final_start = min(res->start, start);
81 final_end = max(res->end, end);
82
83 res->start = final_start;
84 res->end = final_end;
85 return;
86 }
87
88addit:
89
90 /* need to add that */
91 if (info->res_num >= RES_NUM)
92 return;
93
94 res = &info->res[info->res_num];
95 res->name = info->name;
96 res->flags = flags;
97 res->start = start;
98 res->end = end;
99 res->child = NULL;
100 info->res_num++;
101}
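update_res() above first tries to merge the new window into an existing entry with the same flags; the "common_start > common_end + 1" test means overlapping and immediately adjacent ranges both coalesce, and only otherwise is a new slot consumed. A compact standalone model of that merge step:

#include <stdint.h>
#include <stdio.h>

#define RES_NUM 16

struct fake_res { uint64_t start, end; unsigned long flags; };
struct root_info { unsigned int res_num; struct fake_res res[RES_NUM]; };

/* Merge [start, end] into an existing entry with matching flags when the two
 * ranges overlap or touch; otherwise append a new entry if there is room. */
static void update_res(struct root_info *info, uint64_t start, uint64_t end,
		       unsigned long flags)
{
	unsigned int i;

	if (start > end)
		return;

	for (i = 0; i < info->res_num; i++) {
		struct fake_res *r = &info->res[i];
		uint64_t common_start, common_end;

		if (r->flags != flags)
			continue;
		common_start = r->start > start ? r->start : start;
		common_end   = r->end   < end   ? r->end   : end;
		/* the "+ 1" lets adjacent ranges like [0,9] and [10,20] merge */
		if (common_start > common_end + 1)
			continue;
		r->start = r->start < start ? r->start : start;
		r->end   = r->end   > end   ? r->end   : end;
		return;
	}

	if (info->res_num < RES_NUM) {
		info->res[info->res_num].start = start;
		info->res[info->res_num].end = end;
		info->res[info->res_num].flags = flags;
		info->res_num++;
	}
}

int main(void)
{
	struct root_info info = { 0 };

	update_res(&info, 0x0000, 0x0fff, 1);
	update_res(&info, 0x1000, 0x1fff, 1);	/* adjacent: coalesces */
	printf("%u window(s): [%#llx-%#llx]\n", info.res_num,
	       (unsigned long long)info.res[0].start,
	       (unsigned long long)info.res[0].end);
	return 0;
}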
diff --git a/arch/x86/pci/bus_numa.h b/arch/x86/pci/bus_numa.h
new file mode 100644
index 000000000000..804a4b40c31a
--- /dev/null
+++ b/arch/x86/pci/bus_numa.h
@@ -0,0 +1,25 @@
1#ifndef __BUS_NUMA_H
2#define __BUS_NUMA_H
3/*
4 * sub bus (transparent) will use entries from 3 to store extra from
5 * root, so need to make sure we have enough slots there.
6 */
7#define RES_NUM 16
8struct pci_root_info {
9 char name[12];
10 unsigned int res_num;
11 struct resource res[RES_NUM];
12 int bus_min;
13 int bus_max;
14 int node;
15 int link;
16};
17
18/* 4 at this time, it may grow to 32 */
19#define PCI_ROOT_NR 4
20extern int pci_root_num;
21extern struct pci_root_info pci_root_info[PCI_ROOT_NR];
22
23extern void update_res(struct pci_root_info *info, resource_size_t start,
24 resource_size_t end, unsigned long flags, int merge);
25#endif
diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c
index 1331fcf26143..cf2e93869c48 100644
--- a/arch/x86/pci/common.c
+++ b/arch/x86/pci/common.c
@@ -9,6 +9,7 @@
9#include <linux/ioport.h> 9#include <linux/ioport.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/dmi.h> 11#include <linux/dmi.h>
12#include <linux/slab.h>
12 13
13#include <asm/acpi.h> 14#include <asm/acpi.h>
14#include <asm/segment.h> 15#include <asm/segment.h>
@@ -72,12 +73,6 @@ struct pci_ops pci_root_ops = {
72}; 73};
73 74
74/* 75/*
75 * legacy, numa, and acpi all want to call pcibios_scan_root
76 * from their initcalls. This flag prevents that.
77 */
78int pcibios_scanned;
79
80/*
81 * This interrupt-safe spinlock protects all accesses to PCI 76 * This interrupt-safe spinlock protects all accesses to PCI
82 * configuration space. 77 * configuration space.
83 */ 78 */
@@ -410,8 +405,6 @@ struct pci_bus * __devinit pcibios_scan_root(int busnum)
410 return bus; 405 return bus;
411} 406}
412 407
413extern u8 pci_cache_line_size;
414
415int __init pcibios_init(void) 408int __init pcibios_init(void)
416{ 409{
417 struct cpuinfo_x86 *c = &boot_cpu_data; 410 struct cpuinfo_x86 *c = &boot_cpu_data;
@@ -422,15 +415,19 @@ int __init pcibios_init(void)
422 } 415 }
423 416
424 /* 417 /*
425 * Assume PCI cacheline size of 32 bytes for all x86s except K7/K8 418 * Set PCI cacheline size to that of the CPU if the CPU has reported it.
426 * and P4. It's also good for 386/486s (which actually have 16) 419 * (For older CPUs that don't support cpuid, we se it to 32 bytes
420 * It's also good for 386/486s (which actually have 16)
427 * as quite a few PCI devices do not support smaller values. 421 * as quite a few PCI devices do not support smaller values.
428 */ 422 */
429 pci_cache_line_size = 32 >> 2; 423 if (c->x86_clflush_size > 0) {
430 if (c->x86 >= 6 && c->x86_vendor == X86_VENDOR_AMD) 424 pci_dfl_cache_line_size = c->x86_clflush_size >> 2;
431 pci_cache_line_size = 64 >> 2; /* K7 & K8 */ 425 printk(KERN_DEBUG "PCI: pci_cache_line_size set to %d bytes\n",
432 else if (c->x86 > 6 && c->x86_vendor == X86_VENDOR_INTEL) 426 pci_dfl_cache_line_size << 2);
433 pci_cache_line_size = 128 >> 2; /* P4 */ 427 } else {
428 pci_dfl_cache_line_size = 32 >> 2;
429 printk(KERN_DEBUG "PCI: Unknown cacheline size. Setting to 32 bytes\n");
430 }
434 431
435 pcibios_resource_survey(); 432 pcibios_resource_survey();
436 433
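The pcibios_init() change derives the default PCI cache line size from the CLFLUSH line size reported by CPUID rather than from a CPU family table. The ">> 2" matters because the PCI Cache Line Size register counts 32-bit dwords, not bytes, so a 64-byte line becomes the value 16. A tiny sketch of the conversion (x86_clflush_size here is just a stand-in variable, not a real cpuinfo read):

#include <stdio.h>

int main(void)
{
	int x86_clflush_size = 64;	/* stand-in for boot_cpu_data.x86_clflush_size */
	int dfl_cache_line_size;	/* PCI config value, in 32-bit dwords */

	if (x86_clflush_size > 0)
		dfl_cache_line_size = x86_clflush_size >> 2;
	else
		dfl_cache_line_size = 32 >> 2;	/* conservative 32-byte default */

	printf("PCI cache line size register = %d (i.e. %d bytes)\n",
	       dfl_cache_line_size, dfl_cache_line_size << 2);
	return 0;
}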
@@ -518,6 +515,9 @@ char * __devinit pcibios_setup(char *str)
518 } else if (!strcmp(str, "use_crs")) { 515 } else if (!strcmp(str, "use_crs")) {
519 pci_probe |= PCI_USE__CRS; 516 pci_probe |= PCI_USE__CRS;
520 return NULL; 517 return NULL;
518 } else if (!strcmp(str, "nocrs")) {
519 pci_probe |= PCI_ROOT_NO_CRS;
520 return NULL;
521 } else if (!strcmp(str, "earlydump")) { 521 } else if (!strcmp(str, "earlydump")) {
522 pci_early_dump_regs = 1; 522 pci_early_dump_regs = 1;
523 return NULL; 523 return NULL;
diff --git a/arch/x86/pci/early.c b/arch/x86/pci/early.c
index aaf26ae58cd5..d1067d539bee 100644
--- a/arch/x86/pci/early.c
+++ b/arch/x86/pci/early.c
@@ -12,8 +12,6 @@ u32 read_pci_config(u8 bus, u8 slot, u8 func, u8 offset)
12 u32 v; 12 u32 v;
13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 13 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
14 v = inl(0xcfc); 14 v = inl(0xcfc);
15 if (v != 0xffffffff)
16 pr_debug("%x reading 4 from %x: %x\n", slot, offset, v);
17 return v; 15 return v;
18} 16}
19 17
@@ -22,7 +20,6 @@ u8 read_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset)
22 u8 v; 20 u8 v;
23 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 21 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
24 v = inb(0xcfc + (offset&3)); 22 v = inb(0xcfc + (offset&3));
25 pr_debug("%x reading 1 from %x: %x\n", slot, offset, v);
26 return v; 23 return v;
27} 24}
28 25
@@ -31,28 +28,24 @@ u16 read_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset)
31 u16 v; 28 u16 v;
32 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 29 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
33 v = inw(0xcfc + (offset&2)); 30 v = inw(0xcfc + (offset&2));
34 pr_debug("%x reading 2 from %x: %x\n", slot, offset, v);
35 return v; 31 return v;
36} 32}
37 33
38void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset, 34void write_pci_config(u8 bus, u8 slot, u8 func, u8 offset,
39 u32 val) 35 u32 val)
40{ 36{
41 pr_debug("%x writing to %x: %x\n", slot, offset, val);
42 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 37 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
43 outl(val, 0xcfc); 38 outl(val, 0xcfc);
44} 39}
45 40
46void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val) 41void write_pci_config_byte(u8 bus, u8 slot, u8 func, u8 offset, u8 val)
47{ 42{
48 pr_debug("%x writing to %x: %x\n", slot, offset, val);
49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 43 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
50 outb(val, 0xcfc + (offset&3)); 44 outb(val, 0xcfc + (offset&3));
51} 45}
52 46
53void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val) 47void write_pci_config_16(u8 bus, u8 slot, u8 func, u8 offset, u16 val)
54{ 48{
55 pr_debug("%x writing to %x: %x\n", slot, offset, val);
56 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8); 49 outl(0x80000000 | (bus<<16) | (slot<<11) | (func<<8) | offset, 0xcf8);
57 outw(val, 0xcfc + (offset&2)); 50 outw(val, 0xcfc + (offset&2));
58} 51}
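The early config accessors above all start the same way: build a type-1 configuration address, write it to port 0xCF8, then move data through 0xCFC. Only the pr_debug() noise is removed here; the encoding is untouched. A sketch of just the address computation (pure arithmetic, no port I/O, so it runs anywhere):

#include <stdint.h>
#include <stdio.h>

/* Type-1 PCI configuration address as written to port 0xCF8:
 * bit 31 = enable, bits 23:16 = bus, 15:11 = slot, 10:8 = function,
 * 7:0 = register offset (dword-aligned for 32-bit data accesses). */
static uint32_t cf8_addr(uint8_t bus, uint8_t slot, uint8_t func, uint8_t offset)
{
	return 0x80000000u | ((uint32_t)bus << 16) | (slot << 11) |
	       (func << 8) | offset;
}

int main(void)
{
	/* Vendor/device ID dword of bus 0, slot 0x18, function 0. */
	printf("CF8 value: %#010x\n", (unsigned)cf8_addr(0, 0x18, 0, 0));
	return 0;
}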
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index b22d13b0c71d..97da2ba9344b 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -60,22 +60,23 @@ skip_isa_ioresource_align(struct pci_dev *dev) {
60 * but we want to try to avoid allocating at 0x2900-0x2bff 60 * but we want to try to avoid allocating at 0x2900-0x2bff
61 * which might have be mirrored at 0x0100-0x03ff.. 61 * which might have be mirrored at 0x0100-0x03ff..
62 */ 62 */
63void 63resource_size_t
64pcibios_align_resource(void *data, struct resource *res, 64pcibios_align_resource(void *data, const struct resource *res,
65 resource_size_t size, resource_size_t align) 65 resource_size_t size, resource_size_t align)
66{ 66{
67 struct pci_dev *dev = data; 67 struct pci_dev *dev = data;
68 resource_size_t start = res->start;
68 69
69 if (res->flags & IORESOURCE_IO) { 70 if (res->flags & IORESOURCE_IO) {
70 resource_size_t start = res->start;
71
72 if (skip_isa_ioresource_align(dev)) 71 if (skip_isa_ioresource_align(dev))
73 return; 72 return start;
74 if (start & 0x300) { 73 if (start & 0x300)
75 start = (start + 0x3ff) & ~0x3ff; 74 start = (start + 0x3ff) & ~0x3ff;
76 res->start = start; 75 } else if (res->flags & IORESOURCE_MEM) {
77 } 76 if (start < BIOS_END)
77 start = BIOS_END;
78 } 78 }
79 return start;
79} 80}
80EXPORT_SYMBOL(pcibios_align_resource); 81EXPORT_SYMBOL(pcibios_align_resource);
81 82
@@ -129,7 +130,6 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
129 continue; 130 continue;
130 if (!r->start || 131 if (!r->start ||
131 pci_claim_resource(dev, idx) < 0) { 132 pci_claim_resource(dev, idx) < 0) {
132 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
133 /* 133 /*
134 * Something is wrong with the region. 134 * Something is wrong with the region.
135 * Invalidate the resource to prevent 135 * Invalidate the resource to prevent
@@ -144,16 +144,29 @@ static void __init pcibios_allocate_bus_resources(struct list_head *bus_list)
144 } 144 }
145} 145}
146 146
147struct pci_check_idx_range {
148 int start;
149 int end;
150};
151
147static void __init pcibios_allocate_resources(int pass) 152static void __init pcibios_allocate_resources(int pass)
148{ 153{
149 struct pci_dev *dev = NULL; 154 struct pci_dev *dev = NULL;
150 int idx, disabled; 155 int idx, disabled, i;
151 u16 command; 156 u16 command;
152 struct resource *r; 157 struct resource *r;
153 158
159 struct pci_check_idx_range idx_range[] = {
160 { PCI_STD_RESOURCES, PCI_STD_RESOURCE_END },
161#ifdef CONFIG_PCI_IOV
162 { PCI_IOV_RESOURCES, PCI_IOV_RESOURCE_END },
163#endif
164 };
165
154 for_each_pci_dev(dev) { 166 for_each_pci_dev(dev) {
155 pci_read_config_word(dev, PCI_COMMAND, &command); 167 pci_read_config_word(dev, PCI_COMMAND, &command);
156 for (idx = 0; idx < PCI_ROM_RESOURCE; idx++) { 168 for (i = 0; i < ARRAY_SIZE(idx_range); i++)
169 for (idx = idx_range[i].start; idx <= idx_range[i].end; idx++) {
157 r = &dev->resource[idx]; 170 r = &dev->resource[idx];
158 if (r->parent) /* Already allocated */ 171 if (r->parent) /* Already allocated */
159 continue; 172 continue;
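Rather than walking every BAR index up to PCI_ROM_RESOURCE, the allocation pass above iterates a small table of index ranges so the SR-IOV BARs are visited only when CONFIG_PCI_IOV is configured. The shape of that double loop, with made-up index constants standing in for the PCI_STD_*/PCI_IOV_* ones:

#include <stdio.h>

/* Stand-ins for PCI_STD_RESOURCES..PCI_STD_RESOURCE_END and the IOV range. */
#define STD_FIRST 0
#define STD_LAST  5
#define IOV_FIRST 7
#define IOV_LAST  12

struct idx_range { int start, end; };

int main(void)
{
	static const struct idx_range ranges[] = {
		{ STD_FIRST, STD_LAST },
#ifdef WITH_IOV				/* compiled in only when wanted */
		{ IOV_FIRST, IOV_LAST },
#endif
	};
	unsigned int i;
	int idx;

	for (i = 0; i < sizeof(ranges) / sizeof(ranges[0]); i++)
		for (idx = ranges[i].start; idx <= ranges[i].end; idx++)
			printf("would check BAR index %d\n", idx);
	return 0;
}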
@@ -164,12 +177,10 @@ static void __init pcibios_allocate_resources(int pass)
164 else 177 else
165 disabled = !(command & PCI_COMMAND_MEMORY); 178 disabled = !(command & PCI_COMMAND_MEMORY);
166 if (pass == disabled) { 179 if (pass == disabled) {
167 dev_dbg(&dev->dev, "resource %#08llx-%#08llx (f=%lx, d=%d, p=%d)\n", 180 dev_dbg(&dev->dev,
168 (unsigned long long) r->start, 181 "BAR %d: reserving %pr (d=%d, p=%d)\n",
169 (unsigned long long) r->end, 182 idx, r, disabled, pass);
170 r->flags, disabled, pass);
171 if (pci_claim_resource(dev, idx) < 0) { 183 if (pci_claim_resource(dev, idx) < 0) {
172 dev_info(&dev->dev, "BAR %d: can't allocate resource\n", idx);
173 /* We'll assign a new address later */ 184 /* We'll assign a new address later */
174 r->end -= r->start; 185 r->end -= r->start;
175 r->start = 0; 186 r->start = 0;
@@ -182,7 +193,7 @@ static void __init pcibios_allocate_resources(int pass)
182 /* Turn the ROM off, leave the resource region, 193 /* Turn the ROM off, leave the resource region,
183 * but keep it unregistered. */ 194 * but keep it unregistered. */
184 u32 reg; 195 u32 reg;
185 dev_dbg(&dev->dev, "disabling ROM\n"); 196 dev_dbg(&dev->dev, "disabling ROM %pR\n", r);
186 r->flags &= ~IORESOURCE_ROM_ENABLE; 197 r->flags &= ~IORESOURCE_ROM_ENABLE;
187 pci_read_config_dword(dev, 198 pci_read_config_dword(dev,
188 dev->rom_base_reg, &reg); 199 dev->rom_base_reg, &reg);
@@ -242,10 +253,6 @@ void __init pcibios_resource_survey(void)
242 */ 253 */
243fs_initcall(pcibios_assign_resources); 254fs_initcall(pcibios_assign_resources);
244 255
245void __weak x86_pci_root_bus_res_quirks(struct pci_bus *b)
246{
247}
248
249/* 256/*
250 * If we set up a device for bus mastering, we need to check the latency 257 * If we set up a device for bus mastering, we need to check the latency
251 * timer as certain crappy BIOSes forget to set it properly. 258 * timer as certain crappy BIOSes forget to set it properly.
@@ -282,6 +289,15 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
282 return -EINVAL; 289 return -EINVAL;
283 290
284 prot = pgprot_val(vma->vm_page_prot); 291 prot = pgprot_val(vma->vm_page_prot);
292
293 /*
294 * Return error if pat is not enabled and write_combine is requested.
295 * Caller can followup with UC MINUS request and add a WC mtrr if there
296 * is a free mtrr slot.
297 */
298 if (!pat_enabled && write_combine)
299 return -EINVAL;
300
285 if (pat_enabled && write_combine) 301 if (pat_enabled && write_combine)
286 prot |= _PAGE_CACHE_WC; 302 prot |= _PAGE_CACHE_WC;
287 else if (pat_enabled || boot_cpu_data.x86 > 3) 303 else if (pat_enabled || boot_cpu_data.x86 > 3)
diff --git a/arch/x86/pci/init.c b/arch/x86/pci/init.c
index 25a1f8efed4a..adb62aaa7ecd 100644
--- a/arch/x86/pci/init.c
+++ b/arch/x86/pci/init.c
@@ -1,6 +1,7 @@
1#include <linux/pci.h> 1#include <linux/pci.h>
2#include <linux/init.h> 2#include <linux/init.h>
3#include <asm/pci_x86.h> 3#include <asm/pci_x86.h>
4#include <asm/x86_init.h>
4 5
5/* arch_initcall has too random ordering, so call the initializers 6/* arch_initcall has too random ordering, so call the initializers
6 in the right sequence from here. */ 7 in the right sequence from here. */
@@ -15,10 +16,9 @@ static __init int pci_arch_init(void)
15 if (!(pci_probe & PCI_PROBE_NOEARLY)) 16 if (!(pci_probe & PCI_PROBE_NOEARLY))
16 pci_mmcfg_early_init(); 17 pci_mmcfg_early_init();
17 18
18#ifdef CONFIG_PCI_OLPC 19 if (x86_init.pci.arch_init && !x86_init.pci.arch_init())
19 if (!pci_olpc_init()) 20 return 0;
20 return 0; /* skip additional checks if it's an XO */ 21
21#endif
22#ifdef CONFIG_PCI_BIOS 22#ifdef CONFIG_PCI_BIOS
23 pci_pcbios_init(); 23 pci_pcbios_init();
24#endif 24#endif
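pci_arch_init() now defers to an x86_init.pci.arch_init hook; a return value of 0 means the platform handled initialization and the generic type-1/type-2/BIOS probing is skipped. The legacy.c hunk further down uses the same convention for x86_init.pci.init, falling back to pci_legacy_init() only on a non-zero return. A minimal sketch of that hook-with-fallback pattern:

#include <stdio.h>

/* Convention used by both hooks: 0 means "handled",
 * non-zero means "let the generic/legacy path run". */
static int handled_init(void)
{
	printf("platform hook handled PCI init\n");
	return 0;
}

static int declined_init(void)
{
	return 1;			/* e.g. the expected hardware is absent */
}

static void legacy_init(void)
{
	printf("falling back to legacy PCI init\n");
}

struct pci_hooks {
	int (*arch_init)(void);		/* may be NULL */
	int (*init)(void);		/* always set  */
};

int main(void)
{
	struct pci_hooks hooks = { handled_init, declined_init };

	/* init.c style: stop early if the arch hook handled everything. */
	if (hooks.arch_init && !hooks.arch_init())
		printf("skipping generic config-space probing\n");

	/* legacy.c style: fall back only when the hook asks for it. */
	if (hooks.init())
		legacy_init();
	return 0;
}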
diff --git a/arch/x86/pci/irq.c b/arch/x86/pci/irq.c
index 0696d506c4ad..5d362b5ba06f 100644
--- a/arch/x86/pci/irq.c
+++ b/arch/x86/pci/irq.c
@@ -8,7 +8,6 @@
8#include <linux/kernel.h> 8#include <linux/kernel.h>
9#include <linux/pci.h> 9#include <linux/pci.h>
10#include <linux/init.h> 10#include <linux/init.h>
11#include <linux/slab.h>
12#include <linux/interrupt.h> 11#include <linux/interrupt.h>
13#include <linux/dmi.h> 12#include <linux/dmi.h>
14#include <linux/io.h> 13#include <linux/io.h>
@@ -53,7 +52,7 @@ struct irq_router_handler {
53 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device); 52 int (*probe)(struct irq_router *r, struct pci_dev *router, u16 device);
54}; 53};
55 54
56int (*pcibios_enable_irq)(struct pci_dev *dev) = NULL; 55int (*pcibios_enable_irq)(struct pci_dev *dev) = pirq_enable_irq;
57void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL; 56void (*pcibios_disable_irq)(struct pci_dev *dev) = NULL;
58 57
59/* 58/*
@@ -590,6 +589,8 @@ static __init int intel_router_probe(struct irq_router *r, struct pci_dev *route
590 case PCI_DEVICE_ID_INTEL_ICH10_1: 589 case PCI_DEVICE_ID_INTEL_ICH10_1:
591 case PCI_DEVICE_ID_INTEL_ICH10_2: 590 case PCI_DEVICE_ID_INTEL_ICH10_2:
592 case PCI_DEVICE_ID_INTEL_ICH10_3: 591 case PCI_DEVICE_ID_INTEL_ICH10_3:
592 case PCI_DEVICE_ID_INTEL_CPT_LPC1:
593 case PCI_DEVICE_ID_INTEL_CPT_LPC2:
593 r->name = "PIIX/ICH"; 594 r->name = "PIIX/ICH";
594 r->get = pirq_piix_get; 595 r->get = pirq_piix_get;
595 r->set = pirq_piix_set; 596 r->set = pirq_piix_set;
@@ -1016,7 +1017,7 @@ static int pcibios_lookup_irq(struct pci_dev *dev, int assign)
1016 return 1; 1017 return 1;
1017} 1018}
1018 1019
1019static void __init pcibios_fixup_irqs(void) 1020void __init pcibios_fixup_irqs(void)
1020{ 1021{
1021 struct pci_dev *dev = NULL; 1022 struct pci_dev *dev = NULL;
1022 u8 pin; 1023 u8 pin;
@@ -1110,12 +1111,12 @@ static struct dmi_system_id __initdata pciirq_dmi_table[] = {
1110 { } 1111 { }
1111}; 1112};
1112 1113
1113int __init pcibios_irq_init(void) 1114void __init pcibios_irq_init(void)
1114{ 1115{
1115 DBG(KERN_DEBUG "PCI: IRQ init\n"); 1116 DBG(KERN_DEBUG "PCI: IRQ init\n");
1116 1117
1117 if (pcibios_enable_irq || raw_pci_ops == NULL) 1118 if (raw_pci_ops == NULL)
1118 return 0; 1119 return;
1119 1120
1120 dmi_check_system(pciirq_dmi_table); 1121 dmi_check_system(pciirq_dmi_table);
1121 1122
@@ -1142,9 +1143,7 @@ int __init pcibios_irq_init(void)
1142 pirq_table = NULL; 1143 pirq_table = NULL;
1143 } 1144 }
1144 1145
1145 pcibios_enable_irq = pirq_enable_irq; 1146 x86_init.pci.fixup_irqs();
1146
1147 pcibios_fixup_irqs();
1148 1147
1149 if (io_apic_assign_pci_irqs && pci_routeirq) { 1148 if (io_apic_assign_pci_irqs && pci_routeirq) {
1150 struct pci_dev *dev = NULL; 1149 struct pci_dev *dev = NULL;
@@ -1157,8 +1156,6 @@ int __init pcibios_irq_init(void)
1157 for_each_pci_dev(dev) 1156 for_each_pci_dev(dev)
1158 pirq_enable_irq(dev); 1157 pirq_enable_irq(dev);
1159 } 1158 }
1160
1161 return 0;
1162} 1159}
1163 1160
1164static void pirq_penalize_isa_irq(int irq, int active) 1161static void pirq_penalize_isa_irq(int irq, int active)
diff --git a/arch/x86/pci/legacy.c b/arch/x86/pci/legacy.c
index 4061bb0f267d..0db5eaf54560 100644
--- a/arch/x86/pci/legacy.c
+++ b/arch/x86/pci/legacy.c
@@ -35,16 +35,13 @@ static void __devinit pcibios_fixup_peer_bridges(void)
35 } 35 }
36} 36}
37 37
38static int __init pci_legacy_init(void) 38int __init pci_legacy_init(void)
39{ 39{
40 if (!raw_pci_ops) { 40 if (!raw_pci_ops) {
41 printk("PCI: System does not support PCI\n"); 41 printk("PCI: System does not support PCI\n");
42 return 0; 42 return 0;
43 } 43 }
44 44
45 if (pcibios_scanned++)
46 return 0;
47
48 printk("PCI: Probing PCI hardware\n"); 45 printk("PCI: Probing PCI hardware\n");
49 pci_root_bus = pcibios_scan_root(0); 46 pci_root_bus = pcibios_scan_root(0);
50 if (pci_root_bus) 47 if (pci_root_bus)
@@ -55,18 +52,15 @@ static int __init pci_legacy_init(void)
55 52
56int __init pci_subsys_init(void) 53int __init pci_subsys_init(void)
57{ 54{
58#ifdef CONFIG_X86_NUMAQ 55 /*
59 pci_numaq_init(); 56 * The init function returns a non-zero value when
60#endif 57 * pci_legacy_init should be invoked.
61#ifdef CONFIG_ACPI 58 */
62 pci_acpi_init(); 59 if (x86_init.pci.init())
63#endif 60 pci_legacy_init();
64#ifdef CONFIG_X86_VISWS 61
65 pci_visws_init();
66#endif
67 pci_legacy_init();
68 pcibios_fixup_peer_bridges(); 62 pcibios_fixup_peer_bridges();
69 pcibios_irq_init(); 63 x86_init.pci.init_irq();
70 pcibios_init(); 64 pcibios_init();
71 65
72 return 0; 66 return 0;
diff --git a/arch/x86/pci/mmconfig-shared.c b/arch/x86/pci/mmconfig-shared.c
index 602c172d3bd5..39b9ebe8f886 100644
--- a/arch/x86/pci/mmconfig-shared.c
+++ b/arch/x86/pci/mmconfig-shared.c
@@ -15,48 +15,99 @@
15#include <linux/acpi.h> 15#include <linux/acpi.h>
16#include <linux/sfi_acpi.h> 16#include <linux/sfi_acpi.h>
17#include <linux/bitmap.h> 17#include <linux/bitmap.h>
18#include <linux/sort.h> 18#include <linux/dmi.h>
19#include <linux/slab.h>
19#include <asm/e820.h> 20#include <asm/e820.h>
20#include <asm/pci_x86.h> 21#include <asm/pci_x86.h>
21#include <asm/acpi.h> 22#include <asm/acpi.h>
22 23
23#define PREFIX "PCI: " 24#define PREFIX "PCI: "
24 25
25/* aperture is up to 256MB but BIOS may reserve less */
26#define MMCONFIG_APER_MIN (2 * 1024*1024)
27#define MMCONFIG_APER_MAX (256 * 1024*1024)
28
29/* Indicate if the mmcfg resources have been placed into the resource table. */ 26/* Indicate if the mmcfg resources have been placed into the resource table. */
30static int __initdata pci_mmcfg_resources_inserted; 27static int __initdata pci_mmcfg_resources_inserted;
31 28
32static __init int extend_mmcfg(int num) 29LIST_HEAD(pci_mmcfg_list);
30
31static __init void pci_mmconfig_remove(struct pci_mmcfg_region *cfg)
33{ 32{
34 struct acpi_mcfg_allocation *new; 33 if (cfg->res.parent)
35 int new_num = pci_mmcfg_config_num + num; 34 release_resource(&cfg->res);
35 list_del(&cfg->list);
36 kfree(cfg);
37}
36 38
37 new = kzalloc(sizeof(pci_mmcfg_config[0]) * new_num, GFP_KERNEL); 39static __init void free_all_mmcfg(void)
38 if (!new) 40{
39 return -1; 41 struct pci_mmcfg_region *cfg, *tmp;
42
43 pci_mmcfg_arch_free();
44 list_for_each_entry_safe(cfg, tmp, &pci_mmcfg_list, list)
45 pci_mmconfig_remove(cfg);
46}
40 47
41 if (pci_mmcfg_config) { 48static __init void list_add_sorted(struct pci_mmcfg_region *new)
42 memcpy(new, pci_mmcfg_config, 49{
43 sizeof(pci_mmcfg_config[0]) * new_num); 50 struct pci_mmcfg_region *cfg;
44 kfree(pci_mmcfg_config); 51
52 /* keep list sorted by segment and starting bus number */
53 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
54 if (cfg->segment > new->segment ||
55 (cfg->segment == new->segment &&
56 cfg->start_bus >= new->start_bus)) {
57 list_add_tail(&new->list, &cfg->list);
58 return;
59 }
45 } 60 }
46 pci_mmcfg_config = new; 61 list_add_tail(&new->list, &pci_mmcfg_list);
62}
47 63
48 return 0; 64static __init struct pci_mmcfg_region *pci_mmconfig_add(int segment, int start,
65 int end, u64 addr)
66{
67 struct pci_mmcfg_region *new;
68 int num_buses;
69 struct resource *res;
70
71 if (addr == 0)
72 return NULL;
73
74 new = kzalloc(sizeof(*new), GFP_KERNEL);
75 if (!new)
76 return NULL;
77
78 new->address = addr;
79 new->segment = segment;
80 new->start_bus = start;
81 new->end_bus = end;
82
83 list_add_sorted(new);
84
85 num_buses = end - start + 1;
86 res = &new->res;
87 res->start = addr + PCI_MMCFG_BUS_OFFSET(start);
88 res->end = addr + PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
89 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
90 snprintf(new->name, PCI_MMCFG_RESOURCE_NAME_LEN,
91 "PCI MMCONFIG %04x [bus %02x-%02x]", segment, start, end);
92 res->name = new->name;
93
94 printk(KERN_INFO PREFIX "MMCONFIG for domain %04x [bus %02x-%02x] at "
95 "%pR (base %#lx)\n", segment, start, end, &new->res,
96 (unsigned long) addr);
97
98 return new;
49} 99}
50 100
51static __init void fill_one_mmcfg(u64 addr, int segment, int start, int end) 101struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
52{ 102{
53 int i = pci_mmcfg_config_num; 103 struct pci_mmcfg_region *cfg;
54 104
55 pci_mmcfg_config_num++; 105 list_for_each_entry(cfg, &pci_mmcfg_list, list)
56 pci_mmcfg_config[i].address = addr; 106 if (cfg->segment == segment &&
57 pci_mmcfg_config[i].pci_segment = segment; 107 cfg->start_bus <= bus && bus <= cfg->end_bus)
58 pci_mmcfg_config[i].start_bus_number = start; 108 return cfg;
59 pci_mmcfg_config[i].end_bus_number = end; 109
110 return NULL;
60} 111}
61 112
62static const char __init *pci_mmcfg_e7520(void) 113static const char __init *pci_mmcfg_e7520(void)
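The fixed pci_mmcfg_config[] array becomes a list of pci_mmcfg_region entries kept sorted by segment and starting bus, each carrying its own resource at one megabyte of configuration space per bus. A standalone model of the sorted insert and the span arithmetic; the list handling here is ordinary pointer code rather than the kernel's list_head API:

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct region {
	int segment, start_bus, end_bus;
	uint64_t address;			/* MMCONFIG base             */
	uint64_t res_start, res_end;		/* covered physical range    */
	struct region *next;
};

#define BUS_OFFSET(bus) ((uint64_t)(bus) << 20)	/* 1MB of config space per bus */

/* Insert keeping the list sorted by segment, then start_bus. */
static void add_sorted(struct region **head, struct region *new)
{
	struct region **pp = head;

	while (*pp && ((*pp)->segment < new->segment ||
		       ((*pp)->segment == new->segment &&
			(*pp)->start_bus < new->start_bus)))
		pp = &(*pp)->next;
	new->next = *pp;
	*pp = new;
}

static struct region *mk(int seg, int start, int end, uint64_t addr)
{
	struct region *r = calloc(1, sizeof(*r));

	r->segment = seg;
	r->start_bus = start;
	r->end_bus = end;
	r->address = addr;
	/* physical range covering buses start..end, one megabyte per bus */
	r->res_start = addr + BUS_OFFSET(start);
	r->res_end = addr + BUS_OFFSET(end + 1) - 1;
	return r;
}

int main(void)
{
	struct region *head = NULL, *r;

	add_sorted(&head, mk(0, 0x40, 0x7f, 0xe0000000));
	add_sorted(&head, mk(0, 0x00, 0x3f, 0xe0000000));
	for (r = head; r; r = r->next)
		printf("seg %04x bus %02x-%02x at [%#llx-%#llx]\n",
		       r->segment, r->start_bus, r->end_bus,
		       (unsigned long long)r->res_start,
		       (unsigned long long)r->res_end);
	return 0;
}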
@@ -68,11 +119,9 @@ static const char __init *pci_mmcfg_e7520(void)
68 if (win == 0x0000 || win == 0xf000) 119 if (win == 0x0000 || win == 0xf000)
69 return NULL; 120 return NULL;
70 121
71 if (extend_mmcfg(1) == -1) 122 if (pci_mmconfig_add(0, 0, 255, win << 16) == NULL)
72 return NULL; 123 return NULL;
73 124
74 fill_one_mmcfg(win << 16, 0, 0, 255);
75
76 return "Intel Corporation E7520 Memory Controller Hub"; 125 return "Intel Corporation E7520 Memory Controller Hub";
77} 126}
78 127
@@ -114,11 +163,9 @@ static const char __init *pci_mmcfg_intel_945(void)
114 if ((pciexbar & mask) >= 0xf0000000U) 163 if ((pciexbar & mask) >= 0xf0000000U)
115 return NULL; 164 return NULL;
116 165
117 if (extend_mmcfg(1) == -1) 166 if (pci_mmconfig_add(0, 0, (len >> 20) - 1, pciexbar & mask) == NULL)
118 return NULL; 167 return NULL;
119 168
120 fill_one_mmcfg(pciexbar & mask, 0, 0, (len >> 20) - 1);
121
122 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub"; 169 return "Intel Corporation 945G/GZ/P/PL Express Memory Controller Hub";
123} 170}
124 171
@@ -127,7 +174,7 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
127 u32 low, high, address; 174 u32 low, high, address;
128 u64 base, msr; 175 u64 base, msr;
129 int i; 176 int i;
130 unsigned segnbits = 0, busnbits; 177 unsigned segnbits = 0, busnbits, end_bus;
131 178
132 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF)) 179 if (!(pci_probe & PCI_CHECK_ENABLE_AMD_MMCONF))
133 return NULL; 180 return NULL;
@@ -161,11 +208,13 @@ static const char __init *pci_mmcfg_amd_fam10h(void)
161 busnbits = 8; 208 busnbits = 8;
162 } 209 }
163 210
164 if (extend_mmcfg(1 << segnbits) == -1) 211 end_bus = (1 << busnbits) - 1;
165 return NULL;
166
167 for (i = 0; i < (1 << segnbits); i++) 212 for (i = 0; i < (1 << segnbits); i++)
168 fill_one_mmcfg(base + (1<<28) * i, i, 0, (1 << busnbits) - 1); 213 if (pci_mmconfig_add(i, 0, end_bus,
214 base + (1<<28) * i) == NULL) {
215 free_all_mmcfg();
216 return NULL;
217 }
169 218
170 return "AMD Family 10h NB"; 219 return "AMD Family 10h NB";
171} 220}
@@ -190,7 +239,7 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
190 /* 239 /*
191 * do check if amd fam10h already took over 240 * do check if amd fam10h already took over
192 */ 241 */
193 if (!acpi_disabled || pci_mmcfg_config_num || mcp55_checked) 242 if (!acpi_disabled || !list_empty(&pci_mmcfg_list) || mcp55_checked)
194 return NULL; 243 return NULL;
195 244
196 mcp55_checked = true; 245 mcp55_checked = true;
@@ -213,16 +262,14 @@ static const char __init *pci_mmcfg_nvidia_mcp55(void)
213 if (!(extcfg & extcfg_enable_mask)) 262 if (!(extcfg & extcfg_enable_mask))
214 continue; 263 continue;
215 264
216 if (extend_mmcfg(1) == -1)
217 continue;
218
219 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift; 265 size_index = (extcfg & extcfg_size_mask) >> extcfg_size_shift;
220 base = extcfg & extcfg_base_mask[size_index]; 266 base = extcfg & extcfg_base_mask[size_index];
221 /* base could > 4G */ 267 /* base could > 4G */
222 base <<= extcfg_base_lshift; 268 base <<= extcfg_base_lshift;
223 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift; 269 start = (extcfg & extcfg_start_mask) >> extcfg_start_shift;
224 end = start + extcfg_sizebus[size_index] - 1; 270 end = start + extcfg_sizebus[size_index] - 1;
225 fill_one_mmcfg(base, 0, start, end); 271 if (pci_mmconfig_add(0, start, end, base) == NULL)
272 continue;
226 mcp55_mmconf_found++; 273 mcp55_mmconf_found++;
227 } 274 }
228 275
@@ -253,45 +300,22 @@ static struct pci_mmcfg_hostbridge_probe pci_mmcfg_probes[] __initdata = {
253 0x0369, pci_mmcfg_nvidia_mcp55 }, 300 0x0369, pci_mmcfg_nvidia_mcp55 },
254}; 301};
255 302
256static int __init cmp_mmcfg(const void *x1, const void *x2)
257{
258 const typeof(pci_mmcfg_config[0]) *m1 = x1;
259 const typeof(pci_mmcfg_config[0]) *m2 = x2;
260 int start1, start2;
261
262 start1 = m1->start_bus_number;
263 start2 = m2->start_bus_number;
264
265 return start1 - start2;
266}
267
268static void __init pci_mmcfg_check_end_bus_number(void) 303static void __init pci_mmcfg_check_end_bus_number(void)
269{ 304{
270 int i; 305 struct pci_mmcfg_region *cfg, *cfgx;
271 typeof(pci_mmcfg_config[0]) *cfg, *cfgx;
272
273 /* sort them at first */
274 sort(pci_mmcfg_config, pci_mmcfg_config_num,
275 sizeof(pci_mmcfg_config[0]), cmp_mmcfg, NULL);
276
277 /* last one*/
278 if (pci_mmcfg_config_num > 0) {
279 i = pci_mmcfg_config_num - 1;
280 cfg = &pci_mmcfg_config[i];
281 if (cfg->end_bus_number < cfg->start_bus_number)
282 cfg->end_bus_number = 255;
283 }
284 306
285 /* don't overlap please */ 307 /* Fixup overlaps */
286 for (i = 0; i < pci_mmcfg_config_num - 1; i++) { 308 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
287 cfg = &pci_mmcfg_config[i]; 309 if (cfg->end_bus < cfg->start_bus)
288 cfgx = &pci_mmcfg_config[i+1]; 310 cfg->end_bus = 255;
289 311
290 if (cfg->end_bus_number < cfg->start_bus_number) 312 /* Don't access the list head ! */
291 cfg->end_bus_number = 255; 313 if (cfg->list.next == &pci_mmcfg_list)
314 break;
292 315
293 if (cfg->end_bus_number >= cfgx->start_bus_number) 316 cfgx = list_entry(cfg->list.next, typeof(*cfg), list);
294 cfg->end_bus_number = cfgx->start_bus_number - 1; 317 if (cfg->end_bus >= cfgx->start_bus)
318 cfg->end_bus = cfgx->start_bus - 1;
295 } 319 }
296} 320}
297 321
@@ -306,8 +330,7 @@ static int __init pci_mmcfg_check_hostbridge(void)
306 if (!raw_pci_ops) 330 if (!raw_pci_ops)
307 return 0; 331 return 0;
308 332
309 pci_mmcfg_config_num = 0; 333 free_all_mmcfg();
310 pci_mmcfg_config = NULL;
311 334
312 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) { 335 for (i = 0; i < ARRAY_SIZE(pci_mmcfg_probes); i++) {
313 bus = pci_mmcfg_probes[i].bus; 336 bus = pci_mmcfg_probes[i].bus;
@@ -322,45 +345,22 @@ static int __init pci_mmcfg_check_hostbridge(void)
322 name = pci_mmcfg_probes[i].probe(); 345 name = pci_mmcfg_probes[i].probe();
323 346
324 if (name) 347 if (name)
325 printk(KERN_INFO "PCI: Found %s with MMCONFIG support.\n", 348 printk(KERN_INFO PREFIX "%s with MMCONFIG support\n",
326 name); 349 name);
327 } 350 }
328 351
329 /* some end_bus_number is crazy, fix it */ 352 /* some end_bus_number is crazy, fix it */
330 pci_mmcfg_check_end_bus_number(); 353 pci_mmcfg_check_end_bus_number();
331 354
332 return pci_mmcfg_config_num != 0; 355 return !list_empty(&pci_mmcfg_list);
333} 356}
334 357
335static void __init pci_mmcfg_insert_resources(void) 358static void __init pci_mmcfg_insert_resources(void)
336{ 359{
337#define PCI_MMCFG_RESOURCE_NAME_LEN 24 360 struct pci_mmcfg_region *cfg;
338 int i;
339 struct resource *res;
340 char *names;
341 unsigned num_buses;
342
343 res = kcalloc(PCI_MMCFG_RESOURCE_NAME_LEN + sizeof(*res),
344 pci_mmcfg_config_num, GFP_KERNEL);
345 if (!res) {
346 printk(KERN_ERR "PCI: Unable to allocate MMCONFIG resources\n");
347 return;
348 }
349 361
350 names = (void *)&res[pci_mmcfg_config_num]; 362 list_for_each_entry(cfg, &pci_mmcfg_list, list)
351 for (i = 0; i < pci_mmcfg_config_num; i++, res++) { 363 insert_resource(&iomem_resource, &cfg->res);
352 struct acpi_mcfg_allocation *cfg = &pci_mmcfg_config[i];
353 num_buses = cfg->end_bus_number - cfg->start_bus_number + 1;
354 res->name = names;
355 snprintf(names, PCI_MMCFG_RESOURCE_NAME_LEN,
356 "PCI MMCONFIG %u [%02x-%02x]", cfg->pci_segment,
357 cfg->start_bus_number, cfg->end_bus_number);
358 res->start = cfg->address + (cfg->start_bus_number << 20);
359 res->end = res->start + (num_buses << 20) - 1;
360 res->flags = IORESOURCE_MEM | IORESOURCE_BUSY;
361 insert_resource(&iomem_resource, res);
362 names += PCI_MMCFG_RESOURCE_NAME_LEN;
363 }
364 364
365 /* Mark that the resources have been inserted. */ 365 /* Mark that the resources have been inserted. */
366 pci_mmcfg_resources_inserted = 1; 366 pci_mmcfg_resources_inserted = 1;
@@ -437,11 +437,12 @@ static int __init is_acpi_reserved(u64 start, u64 end, unsigned not_used)
437typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type); 437typedef int (*check_reserved_t)(u64 start, u64 end, unsigned type);
438 438
439static int __init is_mmconf_reserved(check_reserved_t is_reserved, 439static int __init is_mmconf_reserved(check_reserved_t is_reserved,
440 u64 addr, u64 size, int i, 440 struct pci_mmcfg_region *cfg, int with_e820)
441 typeof(pci_mmcfg_config[0]) *cfg, int with_e820)
442{ 441{
442 u64 addr = cfg->res.start;
443 u64 size = resource_size(&cfg->res);
443 u64 old_size = size; 444 u64 old_size = size;
444 int valid = 0; 445 int valid = 0, num_buses;
445 446
446 while (!is_reserved(addr, addr + size, E820_RESERVED)) { 447 while (!is_reserved(addr, addr + size, E820_RESERVED)) {
447 size >>= 1; 448 size >>= 1;
@@ -450,19 +451,25 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
450 } 451 }
451 452
452 if (size >= (16UL<<20) || size == old_size) { 453 if (size >= (16UL<<20) || size == old_size) {
453 printk(KERN_NOTICE 454 printk(KERN_INFO PREFIX "MMCONFIG at %pR reserved in %s\n",
454 "PCI: MCFG area at %Lx reserved in %s\n", 455 &cfg->res,
455 addr, with_e820?"E820":"ACPI motherboard resources"); 456 with_e820 ? "E820" : "ACPI motherboard resources");
456 valid = 1; 457 valid = 1;
457 458
458 if (old_size != size) { 459 if (old_size != size) {
459 /* update end_bus_number */ 460 /* update end_bus */
460 cfg->end_bus_number = cfg->start_bus_number + ((size>>20) - 1); 461 cfg->end_bus = cfg->start_bus + ((size>>20) - 1);
461 printk(KERN_NOTICE "PCI: updated MCFG configuration %d: base %lx " 462 num_buses = cfg->end_bus - cfg->start_bus + 1;
462 "segment %hu buses %u - %u\n", 463 cfg->res.end = cfg->res.start +
463 i, (unsigned long)cfg->address, cfg->pci_segment, 464 PCI_MMCFG_BUS_OFFSET(num_buses) - 1;
464 (unsigned int)cfg->start_bus_number, 465 snprintf(cfg->name, PCI_MMCFG_RESOURCE_NAME_LEN,
465 (unsigned int)cfg->end_bus_number); 466 "PCI MMCONFIG %04x [bus %02x-%02x]",
467 cfg->segment, cfg->start_bus, cfg->end_bus);
468 printk(KERN_INFO PREFIX
469 "MMCONFIG for %04x [bus%02x-%02x] "
470 "at %pR (base %#lx) (size reduced!)\n",
471 cfg->segment, cfg->start_bus, cfg->end_bus,
472 &cfg->res, (unsigned long) cfg->address);
466 } 473 }
467 } 474 }
468 475
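is_mmconf_reserved() keeps halving the probed size until the remaining window is reported as reserved, and only trusts the result if at least 16MB (16 buses) or the full original size survives; when it shrinks, end_bus and the resource are trimmed to match. A sketch of the probe loop with a stubbed reservation check:

#include <stdint.h>
#include <stdio.h>

/* Stub: pretend only the first 64MB of the aperture is marked reserved. */
static int is_reserved(uint64_t start, uint64_t end)
{
	return start >= 0xe0000000ull && end <= 0xe0000000ull + (64ull << 20);
}

int main(void)
{
	uint64_t addr = 0xe0000000ull;
	uint64_t size = 256ull << 20;		/* 256 buses advertised */
	uint64_t old_size = size;

	/* Shrink until the remaining window is covered by a reservation. */
	while (!is_reserved(addr, addr + size)) {
		size >>= 1;
		if (size < (16ull << 20))
			break;
	}

	if (size >= (16ull << 20) || size == old_size)
		printf("usable MMCONFIG: %llu buses\n",
		       (unsigned long long)(size >> 20));
	else
		printf("rejected\n");
	return 0;
}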
@@ -471,45 +478,26 @@ static int __init is_mmconf_reserved(check_reserved_t is_reserved,
471 478
472static void __init pci_mmcfg_reject_broken(int early) 479static void __init pci_mmcfg_reject_broken(int early)
473{ 480{
474 typeof(pci_mmcfg_config[0]) *cfg; 481 struct pci_mmcfg_region *cfg;
475 int i;
476 482
477 if ((pci_mmcfg_config_num == 0) || 483 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
478 (pci_mmcfg_config == NULL) ||
479 (pci_mmcfg_config[0].address == 0))
480 return;
481
482 for (i = 0; i < pci_mmcfg_config_num; i++) {
483 int valid = 0; 484 int valid = 0;
484 u64 addr, size;
485
486 cfg = &pci_mmcfg_config[i];
487 addr = cfg->start_bus_number;
488 addr <<= 20;
489 addr += cfg->address;
490 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
491 size <<= 20;
492 printk(KERN_NOTICE "PCI: MCFG configuration %d: base %lx "
493 "segment %hu buses %u - %u\n",
494 i, (unsigned long)cfg->address, cfg->pci_segment,
495 (unsigned int)cfg->start_bus_number,
496 (unsigned int)cfg->end_bus_number);
497 485
498 if (!early && !acpi_disabled) 486 if (!early && !acpi_disabled)
499 valid = is_mmconf_reserved(is_acpi_reserved, addr, size, i, cfg, 0); 487 valid = is_mmconf_reserved(is_acpi_reserved, cfg, 0);
500 488
501 if (valid) 489 if (valid)
502 continue; 490 continue;
503 491
504 if (!early) 492 if (!early)
505 printk(KERN_ERR "PCI: BIOS Bug: MCFG area at %Lx is not" 493 printk(KERN_ERR FW_BUG PREFIX
506 " reserved in ACPI motherboard resources\n", 494 "MMCONFIG at %pR not reserved in "
507 cfg->address); 495 "ACPI motherboard resources\n", &cfg->res);
508 496
509 /* Don't try to do this check unless configuration 497 /* Don't try to do this check unless configuration
510 type 1 is available. how about type 2 ?*/ 498 type 1 is available. how about type 2 ?*/
511 if (raw_pci_ops) 499 if (raw_pci_ops)
512 valid = is_mmconf_reserved(e820_all_mapped, addr, size, i, cfg, 1); 500 valid = is_mmconf_reserved(e820_all_mapped, cfg, 1);
513 501
514 if (!valid) 502 if (!valid)
515 goto reject; 503 goto reject;
@@ -518,34 +506,41 @@ static void __init pci_mmcfg_reject_broken(int early)
518 return; 506 return;
519 507
520reject: 508reject:
521 printk(KERN_INFO "PCI: Not using MMCONFIG.\n"); 509 printk(KERN_INFO PREFIX "not using MMCONFIG\n");
522 pci_mmcfg_arch_free(); 510 free_all_mmcfg();
523 kfree(pci_mmcfg_config);
524 pci_mmcfg_config = NULL;
525 pci_mmcfg_config_num = 0;
526} 511}
527 512
528static int __initdata known_bridge; 513static int __initdata known_bridge;
529 514
530static int acpi_mcfg_64bit_base_addr __initdata = FALSE; 515static int __init acpi_mcfg_check_entry(struct acpi_table_mcfg *mcfg,
516 struct acpi_mcfg_allocation *cfg)
517{
518 int year;
531 519
532/* The physical address of the MMCONFIG aperture. Set from ACPI tables. */ 520 if (cfg->address < 0xFFFFFFFF)
533struct acpi_mcfg_allocation *pci_mmcfg_config; 521 return 0;
534int pci_mmcfg_config_num;
535 522
536static int __init acpi_mcfg_oem_check(struct acpi_table_mcfg *mcfg)
537{
538 if (!strcmp(mcfg->header.oem_id, "SGI")) 523 if (!strcmp(mcfg->header.oem_id, "SGI"))
539 acpi_mcfg_64bit_base_addr = TRUE; 524 return 0;
540 525
541 return 0; 526 if (mcfg->header.revision >= 1) {
527 if (dmi_get_date(DMI_BIOS_DATE, &year, NULL, NULL) &&
528 year >= 2010)
529 return 0;
530 }
531
532 printk(KERN_ERR PREFIX "MCFG region for %04x [bus %02x-%02x] at %#llx "
533 "is above 4GB, ignored\n", cfg->pci_segment,
534 cfg->start_bus_number, cfg->end_bus_number, cfg->address);
535 return -EINVAL;
542} 536}
543 537
544static int __init pci_parse_mcfg(struct acpi_table_header *header) 538static int __init pci_parse_mcfg(struct acpi_table_header *header)
545{ 539{
546 struct acpi_table_mcfg *mcfg; 540 struct acpi_table_mcfg *mcfg;
541 struct acpi_mcfg_allocation *cfg_table, *cfg;
547 unsigned long i; 542 unsigned long i;
548 int config_size; 543 int entries;
549 544
550 if (!header) 545 if (!header)
551 return -EINVAL; 546 return -EINVAL;
@@ -553,38 +548,33 @@ static int __init pci_parse_mcfg(struct acpi_table_header *header)
553 mcfg = (struct acpi_table_mcfg *)header; 548 mcfg = (struct acpi_table_mcfg *)header;
554 549
555 /* how many config structures do we have */ 550 /* how many config structures do we have */
556 pci_mmcfg_config_num = 0; 551 free_all_mmcfg();
552 entries = 0;
557 i = header->length - sizeof(struct acpi_table_mcfg); 553 i = header->length - sizeof(struct acpi_table_mcfg);
558 while (i >= sizeof(struct acpi_mcfg_allocation)) { 554 while (i >= sizeof(struct acpi_mcfg_allocation)) {
559 ++pci_mmcfg_config_num; 555 entries++;
560 i -= sizeof(struct acpi_mcfg_allocation); 556 i -= sizeof(struct acpi_mcfg_allocation);
561 }; 557 };
562 if (pci_mmcfg_config_num == 0) { 558 if (entries == 0) {
563 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n"); 559 printk(KERN_ERR PREFIX "MMCONFIG has no entries\n");
564 return -ENODEV; 560 return -ENODEV;
565 } 561 }
566 562
567 config_size = pci_mmcfg_config_num * sizeof(*pci_mmcfg_config); 563 cfg_table = (struct acpi_mcfg_allocation *) &mcfg[1];
568 pci_mmcfg_config = kmalloc(config_size, GFP_KERNEL); 564 for (i = 0; i < entries; i++) {
569 if (!pci_mmcfg_config) { 565 cfg = &cfg_table[i];
570 printk(KERN_WARNING PREFIX 566 if (acpi_mcfg_check_entry(mcfg, cfg)) {
571 "No memory for MCFG config tables\n"); 567 free_all_mmcfg();
572 return -ENOMEM;
573 }
574
575 memcpy(pci_mmcfg_config, &mcfg[1], config_size);
576
577 acpi_mcfg_oem_check(mcfg);
578
579 for (i = 0; i < pci_mmcfg_config_num; ++i) {
580 if ((pci_mmcfg_config[i].address > 0xFFFFFFFF) &&
581 !acpi_mcfg_64bit_base_addr) {
582 printk(KERN_ERR PREFIX
583 "MMCONFIG not in low 4GB of memory\n");
584 kfree(pci_mmcfg_config);
585 pci_mmcfg_config_num = 0;
586 return -ENODEV; 568 return -ENODEV;
587 } 569 }
570
571 if (pci_mmconfig_add(cfg->pci_segment, cfg->start_bus_number,
572 cfg->end_bus_number, cfg->address) == NULL) {
573 printk(KERN_WARNING PREFIX
574 "no memory for MCFG entries\n");
575 free_all_mmcfg();
576 return -ENOMEM;
577 }
588 } 578 }
589 579
590 return 0; 580 return 0;
@@ -614,9 +604,7 @@ static void __init __pci_mmcfg_init(int early)
614 604
615 pci_mmcfg_reject_broken(early); 605 pci_mmcfg_reject_broken(early);
616 606
617 if ((pci_mmcfg_config_num == 0) || 607 if (list_empty(&pci_mmcfg_list))
618 (pci_mmcfg_config == NULL) ||
619 (pci_mmcfg_config[0].address == 0))
620 return; 608 return;
621 609
622 if (pci_mmcfg_arch_init()) 610 if (pci_mmcfg_arch_init())
@@ -648,9 +636,7 @@ static int __init pci_mmcfg_late_insert_resources(void)
648 */ 636 */
649 if ((pci_mmcfg_resources_inserted == 1) || 637 if ((pci_mmcfg_resources_inserted == 1) ||
650 (pci_probe & PCI_PROBE_MMCONF) == 0 || 638 (pci_probe & PCI_PROBE_MMCONF) == 0 ||
651 (pci_mmcfg_config_num == 0) || 639 list_empty(&pci_mmcfg_list))
652 (pci_mmcfg_config == NULL) ||
653 (pci_mmcfg_config[0].address == 0))
654 return 1; 640 return 1;
655 641
656 /* 642 /*
diff --git a/arch/x86/pci/mmconfig_32.c b/arch/x86/pci/mmconfig_32.c
index f10a7e94a84c..90d5fd476ed4 100644
--- a/arch/x86/pci/mmconfig_32.c
+++ b/arch/x86/pci/mmconfig_32.c
@@ -27,18 +27,10 @@ static int mmcfg_last_accessed_cpu;
27 */ 27 */
28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn) 28static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
29{ 29{
30 struct acpi_mcfg_allocation *cfg; 30 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
31 int cfg_num;
32
33 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
34 cfg = &pci_mmcfg_config[cfg_num];
35 if (cfg->pci_segment == seg &&
36 (cfg->start_bus_number <= bus) &&
37 (cfg->end_bus_number >= bus))
38 return cfg->address;
39 }
40 31
41 /* Fall back to type 0 */ 32 if (cfg)
33 return cfg->address;
42 return 0; 34 return 0;
43} 35}
44 36
@@ -47,7 +39,7 @@ static u32 get_base_addr(unsigned int seg, int bus, unsigned devfn)
47 */ 39 */
48static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn) 40static void pci_exp_set_dev_base(unsigned int base, int bus, int devfn)
49{ 41{
50 u32 dev_base = base | (bus << 20) | (devfn << 12); 42 u32 dev_base = base | PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12);
51 int cpu = smp_processor_id(); 43 int cpu = smp_processor_id();
52 if (dev_base != mmcfg_last_accessed_device || 44 if (dev_base != mmcfg_last_accessed_device ||
53 cpu != mmcfg_last_accessed_cpu) { 45 cpu != mmcfg_last_accessed_cpu) {
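
[Editorial note] get_base_addr() now defers to pci_mmconfig_lookup(), which is introduced elsewhere in this series (in mmconfig-shared.c) and walks the new pci_mmcfg_list instead of the old flat pci_mmcfg_config array. A minimal sketch of what such a lookup presumably looks like, assuming the pci_mmcfg_region fields used throughout this patch (segment, start_bus, end_bus); this is illustrative, not the patch's actual code:

/* Illustrative sketch only -- not necessarily the series' exact implementation. */
struct pci_mmcfg_region *pci_mmconfig_lookup(int segment, int bus)
{
	struct pci_mmcfg_region *cfg;

	list_for_each_entry(cfg, &pci_mmcfg_list, list)
		if (cfg->segment == segment &&
		    bus >= cfg->start_bus && bus <= cfg->end_bus)
			return cfg;

	return NULL;
}
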
diff --git a/arch/x86/pci/mmconfig_64.c b/arch/x86/pci/mmconfig_64.c
index 94349f8b2f96..e783841bd1d7 100644
--- a/arch/x86/pci/mmconfig_64.c
+++ b/arch/x86/pci/mmconfig_64.c
@@ -12,38 +12,15 @@
12#include <asm/e820.h> 12#include <asm/e820.h>
13#include <asm/pci_x86.h> 13#include <asm/pci_x86.h>
14 14
15/* Static virtual mapping of the MMCONFIG aperture */ 15#define PREFIX "PCI: "
16struct mmcfg_virt {
17 struct acpi_mcfg_allocation *cfg;
18 char __iomem *virt;
19};
20static struct mmcfg_virt *pci_mmcfg_virt;
21
22static char __iomem *get_virt(unsigned int seg, unsigned bus)
23{
24 struct acpi_mcfg_allocation *cfg;
25 int cfg_num;
26
27 for (cfg_num = 0; cfg_num < pci_mmcfg_config_num; cfg_num++) {
28 cfg = pci_mmcfg_virt[cfg_num].cfg;
29 if (cfg->pci_segment == seg &&
30 (cfg->start_bus_number <= bus) &&
31 (cfg->end_bus_number >= bus))
32 return pci_mmcfg_virt[cfg_num].virt;
33 }
34
35 /* Fall back to type 0 */
36 return NULL;
37}
38 16
39static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn) 17static char __iomem *pci_dev_base(unsigned int seg, unsigned int bus, unsigned int devfn)
40{ 18{
41 char __iomem *addr; 19 struct pci_mmcfg_region *cfg = pci_mmconfig_lookup(seg, bus);
42 20
43 addr = get_virt(seg, bus); 21 if (cfg && cfg->virt)
44 if (!addr) 22 return cfg->virt + (PCI_MMCFG_BUS_OFFSET(bus) | (devfn << 12));
45 return NULL; 23 return NULL;
46 return addr + ((bus << 20) | (devfn << 12));
47} 24}
48 25
49static int pci_mmcfg_read(unsigned int seg, unsigned int bus, 26static int pci_mmcfg_read(unsigned int seg, unsigned int bus,
@@ -109,42 +86,30 @@ static struct pci_raw_ops pci_mmcfg = {
109 .write = pci_mmcfg_write, 86 .write = pci_mmcfg_write,
110}; 87};
111 88
112static void __iomem * __init mcfg_ioremap(struct acpi_mcfg_allocation *cfg) 89static void __iomem * __init mcfg_ioremap(struct pci_mmcfg_region *cfg)
113{ 90{
114 void __iomem *addr; 91 void __iomem *addr;
115 u64 start, size; 92 u64 start, size;
93 int num_buses;
116 94
117 start = cfg->start_bus_number; 95 start = cfg->address + PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
118 start <<= 20; 96 num_buses = cfg->end_bus - cfg->start_bus + 1;
119 start += cfg->address; 97 size = PCI_MMCFG_BUS_OFFSET(num_buses);
120 size = cfg->end_bus_number + 1 - cfg->start_bus_number;
121 size <<= 20;
122 addr = ioremap_nocache(start, size); 98 addr = ioremap_nocache(start, size);
123 if (addr) { 99 if (addr)
124 printk(KERN_INFO "PCI: Using MMCONFIG at %Lx - %Lx\n", 100 addr -= PCI_MMCFG_BUS_OFFSET(cfg->start_bus);
125 start, start + size - 1);
126 addr -= cfg->start_bus_number << 20;
127 }
128 return addr; 101 return addr;
129} 102}
130 103
131int __init pci_mmcfg_arch_init(void) 104int __init pci_mmcfg_arch_init(void)
132{ 105{
133 int i; 106 struct pci_mmcfg_region *cfg;
134 pci_mmcfg_virt = kzalloc(sizeof(*pci_mmcfg_virt) *
135 pci_mmcfg_config_num, GFP_KERNEL);
136 if (pci_mmcfg_virt == NULL) {
137 printk(KERN_ERR "PCI: Can not allocate memory for mmconfig structures\n");
138 return 0;
139 }
140 107
141 for (i = 0; i < pci_mmcfg_config_num; ++i) { 108 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
142 pci_mmcfg_virt[i].cfg = &pci_mmcfg_config[i]; 109 cfg->virt = mcfg_ioremap(cfg);
143 pci_mmcfg_virt[i].virt = mcfg_ioremap(&pci_mmcfg_config[i]); 110 if (!cfg->virt) {
144 if (!pci_mmcfg_virt[i].virt) { 111 printk(KERN_ERR PREFIX "can't map MMCONFIG at %pR\n",
145 printk(KERN_ERR "PCI: Cannot map mmconfig aperture for " 112 &cfg->res);
146 "segment %d\n",
147 pci_mmcfg_config[i].pci_segment);
148 pci_mmcfg_arch_free(); 113 pci_mmcfg_arch_free();
149 return 0; 114 return 0;
150 } 115 }
@@ -155,19 +120,12 @@ int __init pci_mmcfg_arch_init(void)
155 120
156void __init pci_mmcfg_arch_free(void) 121void __init pci_mmcfg_arch_free(void)
157{ 122{
158 int i; 123 struct pci_mmcfg_region *cfg;
159
160 if (pci_mmcfg_virt == NULL)
161 return;
162 124
163 for (i = 0; i < pci_mmcfg_config_num; ++i) { 125 list_for_each_entry(cfg, &pci_mmcfg_list, list) {
164 if (pci_mmcfg_virt[i].virt) { 126 if (cfg->virt) {
165 iounmap(pci_mmcfg_virt[i].virt + (pci_mmcfg_virt[i].cfg->start_bus_number << 20)); 127 iounmap(cfg->virt + PCI_MMCFG_BUS_OFFSET(cfg->start_bus));
166 pci_mmcfg_virt[i].virt = NULL; 128 cfg->virt = NULL;
167 pci_mmcfg_virt[i].cfg = NULL;
168 } 129 }
169 } 130 }
170
171 kfree(pci_mmcfg_virt);
172 pci_mmcfg_virt = NULL;
173} 131}
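
[Editorial note] The PCI_MMCFG_BUS_OFFSET() macro used above comes from the reworked <asm/pci_x86.h> in this series and, assuming the conventional ECAM layout, expands to (bus) << 20: each bus gets 1 MiB of configuration space (256 devfns of 4 KiB each). A small standalone worked example of the address arithmetic pci_dev_base() performs (helper names are hypothetical):

/* Illustrative only: ECAM offset arithmetic, mirroring pci_dev_base(). */
#include <stdio.h>

#define MMCFG_BUS_OFFSET(bus)	((unsigned long)(bus) << 20)	/* 1 MiB per bus */

static unsigned long ecam_offset(unsigned int bus, unsigned int devfn,
				 unsigned int reg)
{
	return MMCFG_BUS_OFFSET(bus) | (devfn << 12) | reg;	/* 4 KiB per devfn */
}

int main(void)
{
	/* bus 3, device 2, function 1, register 0x10 (BAR0) */
	unsigned int devfn = (2 << 3) | 1;			/* PCI_DEVFN(2, 1) */

	printf("offset = %#lx\n", ecam_offset(3, devfn, 0x10));	/* prints 0x311010 */
	return 0;
}
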
diff --git a/arch/x86/pci/mrst.c b/arch/x86/pci/mrst.c
new file mode 100644
index 000000000000..1cdc02cf8fa4
--- /dev/null
+++ b/arch/x86/pci/mrst.c
@@ -0,0 +1,266 @@
1/*
2 * Moorestown PCI support
3 * Copyright (c) 2008 Intel Corporation
4 * Jesse Barnes <jesse.barnes@intel.com>
5 *
6 * Moorestown has an interesting PCI implementation:
7 * - configuration space is memory mapped (as defined by MCFG)
8 * - Lincroft devices also have a real, type 1 configuration space
9 * - Early Lincroft silicon has a type 1 access bug that will cause
10 * a hang if non-existent devices are accessed
11 * - some devices have the "fixed BAR" capability, which means
12 * they can't be relocated or modified; check for that during
13 * BAR sizing
14 *
15 * So, we use the MCFG space for all reads and writes, but also send
16 * Lincroft writes to type 1 space. But only read/write if the device
17 * actually exists, otherwise return all 1s for reads and bit bucket
18 * the writes.
19 */
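
[Editorial note] The "return all 1s for reads" convention mentioned above is what generic probing code keys off: a configuration read that nothing decodes comes back as 0xffffffff, so a vendor ID of 0xffff means "no device here". A hypothetical helper (not part of this patch) illustrating that check:

/* Illustrative only: how the "all 1s" read convention is typically used. */
static bool mrst_devfn_present(struct pci_bus *bus, unsigned int devfn)
{
	u32 id;

	if (pci_bus_read_config_dword(bus, devfn, PCI_VENDOR_ID, &id))
		return false;			/* access itself failed */

	return (id & 0xffff) != 0xffff;		/* 0xffff vendor ID => no device */
}
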
20
21#include <linux/sched.h>
22#include <linux/pci.h>
23#include <linux/ioport.h>
24#include <linux/init.h>
25#include <linux/dmi.h>
26
27#include <asm/acpi.h>
28#include <asm/segment.h>
29#include <asm/io.h>
30#include <asm/smp.h>
31#include <asm/pci_x86.h>
32#include <asm/hw_irq.h>
33#include <asm/io_apic.h>
34
35#define PCIE_CAP_OFFSET 0x100
36
37/* Fixed BAR fields */
38#define PCIE_VNDR_CAP_ID_FIXED_BAR 0x00 /* Fixed BAR (TBD) */
39#define PCI_FIXED_BAR_0_SIZE 0x04
40#define PCI_FIXED_BAR_1_SIZE 0x08
41#define PCI_FIXED_BAR_2_SIZE 0x0c
42#define PCI_FIXED_BAR_3_SIZE 0x10
43#define PCI_FIXED_BAR_4_SIZE 0x14
44#define PCI_FIXED_BAR_5_SIZE 0x1c
45
46/**
47 * fixed_bar_cap - return the offset of the fixed BAR cap if found
48 * @bus: PCI bus
49 * @devfn: device in question
50 *
51 * Look for the fixed BAR cap on @bus and @devfn, returning its offset
52 * if found or 0 otherwise.
53 */
54static int fixed_bar_cap(struct pci_bus *bus, unsigned int devfn)
55{
56 int pos;
57 u32 pcie_cap = 0, cap_data;
58
59 pos = PCIE_CAP_OFFSET;
60
61 if (!raw_pci_ext_ops)
62 return 0;
63
64 while (pos) {
65 if (raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
66 devfn, pos, 4, &pcie_cap))
67 return 0;
68
69 if (pcie_cap == 0xffffffff)
70 return 0;
71
72 if (PCI_EXT_CAP_ID(pcie_cap) == PCI_EXT_CAP_ID_VNDR) {
73 raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
74 devfn, pos + 4, 4, &cap_data);
75 if ((cap_data & 0xffff) == PCIE_VNDR_CAP_ID_FIXED_BAR)
76 return pos;
77 }
78
79 pos = pcie_cap >> 20;
80 }
81
82 return 0;
83}
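
[Editorial note] The walk above follows the standard PCI Express extended capability list: each 32-bit capability header carries the capability ID in bits 15:0, a version in bits 19:16, and the offset of the next capability in bits 31:20 (0 terminates the list), which is why the code advances with pos = pcie_cap >> 20. A tiny sketch of that decoding, for reference only:

/* Illustrative only: PCIe extended capability header fields. */
static inline unsigned int ext_cap_id(u32 header)      { return header & 0xffff; }
static inline unsigned int ext_cap_version(u32 header) { return (header >> 16) & 0xf; }
static inline unsigned int ext_cap_next(u32 header)    { return header >> 20; }
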
84
85static int pci_device_update_fixed(struct pci_bus *bus, unsigned int devfn,
86 int reg, int len, u32 val, int offset)
87{
88 u32 size;
89 unsigned int domain, busnum;
90 int bar = (reg - PCI_BASE_ADDRESS_0) >> 2;
91
92 domain = pci_domain_nr(bus);
93 busnum = bus->number;
94
95 if (val == ~0 && len == 4) {
96 unsigned long decode;
97
98 raw_pci_ext_ops->read(domain, busnum, devfn,
99 offset + 8 + (bar * 4), 4, &size);
100
 101		/* Turn the size into a decode pattern for the sizing code */
 102		if (size) {
 103			decode = size - 1;
 104			decode |= decode >> 1;
 105			decode |= decode >> 2;
 106			decode |= decode >> 4;
 107			decode |= decode >> 8;
 108			decode |= decode >> 16;
 109			decode++;		/* round up to a power of two */
 110			decode = ~(decode - 1);	/* mask the sizing read expects */
 111		} else {
 112			decode = ~0;		/* size 0: decode nothing */
 113		}
114
115 /*
116 * If val is all ones, the core code is trying to size the reg,
117 * so update the mmconfig space with the real size.
118 *
119 * Note: this assumes the fixed size we got is a power of two.
120 */
121 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, 4,
122 decode);
123 }
124
125 /* This is some other kind of BAR write, so just do it. */
126 return raw_pci_ext_ops->write(domain, busnum, devfn, reg, len, val);
127}
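
[Editorial note] The bit-smearing sequence above rounds the reported fixed-BAR size up to a power of two and converts it into the mask a sizing read expects. A standalone sketch (not part of the patch) reproducing the same computation for a couple of sample sizes:

/* Illustrative only: reproduce the decode-pattern computation above. */
#include <stdio.h>

static unsigned long size_to_decode(unsigned long size)
{
	unsigned long decode;

	if (!size)
		return ~0UL;

	decode = size - 1;		/* smear the highest set bit downwards */
	decode |= decode >> 1;
	decode |= decode >> 2;
	decode |= decode >> 4;
	decode |= decode >> 8;
	decode |= decode >> 16;
	decode++;			/* now a power of two >= size */
	return ~(decode - 1);		/* mask form the sizing code expects */
}

int main(void)
{
	printf("%#lx -> %#lx\n", 0x1000UL, size_to_decode(0x1000));	/* ...fffff000 */
	printf("%#lx -> %#lx\n", 0x2300UL, size_to_decode(0x2300));	/* rounds to 16 KiB */
	return 0;
}
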
128
129/**
130 * type1_access_ok - check whether to use type 1
131 * @bus: bus number
132 * @devfn: device & function in question
133 *
 134 * If the bus is on a Lincroft chip and it exists, or is not on a Lincroft at
 135 * all, then we can go ahead with any reads & writes.  If it's on a Lincroft,
136 * but doesn't exist, avoid the access altogether to keep the chip from
137 * hanging.
138 */
139static bool type1_access_ok(unsigned int bus, unsigned int devfn, int reg)
140{
 141	/* This is a workaround for an A0 LNC bug where the PCI status register
 142	 * does not have the new CAP bit set and cannot be written by SW either.
143 *
 144	 * The PCI header type in real LNC indicates a single-function device,
 145	 * which would prevent probing other devices under the same function in
 146	 * the PCI shim.  Therefore, use the header type in the shim instead.
147 */
148 if (reg >= 0x100 || reg == PCI_STATUS || reg == PCI_HEADER_TYPE)
149 return 0;
150 if (bus == 0 && (devfn == PCI_DEVFN(2, 0) || devfn == PCI_DEVFN(0, 0)))
151 return 1;
152 return 0; /* langwell on others */
153}
154
155static int pci_read(struct pci_bus *bus, unsigned int devfn, int where,
156 int size, u32 *value)
157{
158 if (type1_access_ok(bus->number, devfn, where))
159 return pci_direct_conf1.read(pci_domain_nr(bus), bus->number,
160 devfn, where, size, value);
161 return raw_pci_ext_ops->read(pci_domain_nr(bus), bus->number,
162 devfn, where, size, value);
163}
164
165static int pci_write(struct pci_bus *bus, unsigned int devfn, int where,
166 int size, u32 value)
167{
168 int offset;
169
 170	/* On MRST there is no PCI ROM BAR; this makes a subsequent read of the
 171	 * ROM BAR return 0, which is then ignored.
172 */
173 if (where == PCI_ROM_ADDRESS)
174 return 0;
175
176 /*
177 * Devices with fixed BARs need special handling:
178 * - BAR sizing code will save, write ~0, read size, restore
179 * - so writes to fixed BARs need special handling
180 * - other writes to fixed BAR devices should go through mmconfig
181 */
182 offset = fixed_bar_cap(bus, devfn);
183 if (offset &&
184 (where >= PCI_BASE_ADDRESS_0 && where <= PCI_BASE_ADDRESS_5)) {
185 return pci_device_update_fixed(bus, devfn, where, size, value,
186 offset);
187 }
188
189 /*
190 * On Moorestown update both real & mmconfig space
191 * Note: early Lincroft silicon can't handle type 1 accesses to
192 * non-existent devices, so just eat the write in that case.
193 */
194 if (type1_access_ok(bus->number, devfn, where))
195 return pci_direct_conf1.write(pci_domain_nr(bus), bus->number,
196 devfn, where, size, value);
197 return raw_pci_ext_ops->write(pci_domain_nr(bus), bus->number, devfn,
198 where, size, value);
199}
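
[Editorial note] The "save, write ~0, read size, restore" dance referred to above is the generic BAR sizing protocol; intercepting the all-ones write is what lets the fixed-BAR path report a sensible size. A sketch of that sequence as probing code typically performs it (illustrative only, not part of this patch):

/* Illustrative only: the classic BAR sizing sequence the comment refers to. */
static u32 probe_mem_bar_size(struct pci_dev *dev, int bar_reg)
{
	u32 orig, mask;

	pci_read_config_dword(dev, bar_reg, &orig);	/* save current value */
	pci_write_config_dword(dev, bar_reg, ~0);	/* ask for the decode mask */
	pci_read_config_dword(dev, bar_reg, &mask);
	pci_write_config_dword(dev, bar_reg, orig);	/* restore */

	mask &= ~0xfU;					/* drop the low flag bits */
	return mask ? ~mask + 1 : 0;			/* e.g. 0xfffff000 -> 0x1000 */
}
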
200
201static int mrst_pci_irq_enable(struct pci_dev *dev)
202{
203 u8 pin;
204 struct io_apic_irq_attr irq_attr;
205
206 pci_read_config_byte(dev, PCI_INTERRUPT_PIN, &pin);
207
 208	/* MRST only has an IOAPIC; the PCI irq lines are mapped 1:1 to
 209	 * IOAPIC RTE entries, so we just enable the RTE for the device.
210 */
211 irq_attr.ioapic = mp_find_ioapic(dev->irq);
212 irq_attr.ioapic_pin = dev->irq;
213 irq_attr.trigger = 1; /* level */
214 irq_attr.polarity = 1; /* active low */
215 io_apic_set_pci_routing(&dev->dev, dev->irq, &irq_attr);
216
217 return 0;
218}
219
220struct pci_ops pci_mrst_ops = {
221 .read = pci_read,
222 .write = pci_write,
223};
224
225/**
226 * pci_mrst_init - installs pci_mrst_ops
227 *
228 * Moorestown has an interesting PCI implementation (see above).
229 * Called when the early platform detection installs it.
230 */
231int __init pci_mrst_init(void)
232{
233 printk(KERN_INFO "Moorestown platform detected, using MRST PCI ops\n");
234 pci_mmcfg_late_init();
235 pcibios_enable_irq = mrst_pci_irq_enable;
236 pci_root_ops = pci_mrst_ops;
237 /* Continue with standard init */
238 return 1;
239}
240
241/*
 242 * Langwell devices reside at fixed offsets; don't try to move them.
243 */
244static void __devinit pci_fixed_bar_fixup(struct pci_dev *dev)
245{
246 unsigned long offset;
247 u32 size;
248 int i;
249
250 /* Must have extended configuration space */
251 if (dev->cfg_size < PCIE_CAP_OFFSET + 4)
252 return;
253
 254	/* Fix up the BAR sizes for fixed BAR devices and make them unmovable */
255 offset = fixed_bar_cap(dev->bus, dev->devfn);
256 if (!offset || PCI_DEVFN(2, 0) == dev->devfn ||
257 PCI_DEVFN(2, 2) == dev->devfn)
258 return;
259
260 for (i = 0; i < PCI_ROM_RESOURCE; i++) {
261 pci_read_config_dword(dev, offset + 8 + (i * 4), &size);
262 dev->resource[i].end = dev->resource[i].start + size - 1;
263 dev->resource[i].flags |= IORESOURCE_PCI_FIXED;
264 }
265}
266DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, PCI_ANY_ID, pci_fixed_bar_fixup);
diff --git a/arch/x86/pci/numaq_32.c b/arch/x86/pci/numaq_32.c
index 8eb295e116f6..8223738ad806 100644
--- a/arch/x86/pci/numaq_32.c
+++ b/arch/x86/pci/numaq_32.c
@@ -8,9 +8,7 @@
8#include <asm/apic.h> 8#include <asm/apic.h>
9#include <asm/mpspec.h> 9#include <asm/mpspec.h>
10#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
11 11#include <asm/numaq.h>
12#define XQUAD_PORTIO_BASE 0xfe400000
13#define XQUAD_PORTIO_QUAD 0x40000 /* 256k per quad. */
14 12
15#define BUS2QUAD(global) (mp_bus_id_to_node[global]) 13#define BUS2QUAD(global) (mp_bus_id_to_node[global])
16 14
@@ -18,8 +16,6 @@
18 16
19#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local]) 17#define QUADLOCAL2BUS(quad,local) (quad_local_to_mp_bus_id[quad][local])
20 18
21#define XQUAD_PORT_ADDR(port, quad) (xquad_portio + (XQUAD_PORTIO_QUAD*quad) + port)
22
23#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \ 19#define PCI_CONF1_MQ_ADDRESS(bus, devfn, reg) \
24 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3)) 20 (0x80000000 | (BUS2LOCAL(bus) << 16) | (devfn << 8) | (reg & ~3))
25 21
@@ -152,14 +148,8 @@ int __init pci_numaq_init(void)
152{ 148{
153 int quad; 149 int quad;
154 150
155 if (!found_numaq)
156 return 0;
157
158 raw_pci_ops = &pci_direct_conf1_mq; 151 raw_pci_ops = &pci_direct_conf1_mq;
159 152
160 if (pcibios_scanned++)
161 return 0;
162
163 pci_root_bus = pcibios_scan_root(0); 153 pci_root_bus = pcibios_scan_root(0);
164 if (pci_root_bus) 154 if (pci_root_bus)
165 pci_bus_add_devices(pci_root_bus); 155 pci_bus_add_devices(pci_root_bus);
diff --git a/arch/x86/pci/olpc.c b/arch/x86/pci/olpc.c
index b889d824f7c6..b34815408f58 100644
--- a/arch/x86/pci/olpc.c
+++ b/arch/x86/pci/olpc.c
@@ -304,9 +304,6 @@ static struct pci_raw_ops pci_olpc_conf = {
304 304
305int __init pci_olpc_init(void) 305int __init pci_olpc_init(void)
306{ 306{
307 if (!machine_is_olpc() || olpc_has_vsa())
308 return -ENODEV;
309
310 printk(KERN_INFO "PCI: Using configuration type OLPC\n"); 307 printk(KERN_INFO "PCI: Using configuration type OLPC\n");
311 raw_pci_ops = &pci_olpc_conf; 308 raw_pci_ops = &pci_olpc_conf;
312 is_lx = is_geode_lx(); 309 is_lx = is_geode_lx();
diff --git a/arch/x86/pci/pcbios.c b/arch/x86/pci/pcbios.c
index 1c975cc9839e..59a225c17b84 100644
--- a/arch/x86/pci/pcbios.c
+++ b/arch/x86/pci/pcbios.c
@@ -4,6 +4,7 @@
4 4
5#include <linux/pci.h> 5#include <linux/pci.h>
6#include <linux/init.h> 6#include <linux/init.h>
7#include <linux/slab.h>
7#include <linux/module.h> 8#include <linux/module.h>
8#include <linux/uaccess.h> 9#include <linux/uaccess.h>
9#include <asm/pci_x86.h> 10#include <asm/pci_x86.h>
diff --git a/arch/x86/pci/visws.c b/arch/x86/pci/visws.c
index bcead7a46871..03008f72eb04 100644
--- a/arch/x86/pci/visws.c
+++ b/arch/x86/pci/visws.c
@@ -69,9 +69,6 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq)
69 69
70int __init pci_visws_init(void) 70int __init pci_visws_init(void)
71{ 71{
72 if (!is_visws_box())
73 return -1;
74
75 pcibios_enable_irq = &pci_visws_enable_irq; 72 pcibios_enable_irq = &pci_visws_enable_irq;
76 pcibios_disable_irq = &pci_visws_disable_irq; 73 pcibios_disable_irq = &pci_visws_disable_irq;
77 74
@@ -90,5 +87,6 @@ int __init pci_visws_init(void)
90 pci_scan_bus_with_sysdata(pci_bus1); 87 pci_scan_bus_with_sysdata(pci_bus1);
91 pci_fixup_irqs(pci_common_swizzle, visws_map_irq); 88 pci_fixup_irqs(pci_common_swizzle, visws_map_irq);
92 pcibios_resource_survey(); 89 pcibios_resource_survey();
93 return 0; 90 /* Request bus scan */
91 return 1;
94} 92}
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 8aa85f17667e..0a979f3e5b8a 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -18,6 +18,7 @@
18#include <asm/mce.h> 18#include <asm/mce.h>
19#include <asm/xcr.h> 19#include <asm/xcr.h>
20#include <asm/suspend.h> 20#include <asm/suspend.h>
21#include <asm/debugreg.h>
21 22
22#ifdef CONFIG_X86_32 23#ifdef CONFIG_X86_32
23static struct saved_context saved_context; 24static struct saved_context saved_context;
@@ -142,31 +143,6 @@ static void fix_processor_context(void)
142#endif 143#endif
143 load_TR_desc(); /* This does ltr */ 144 load_TR_desc(); /* This does ltr */
144 load_LDT(&current->active_mm->context); /* This does lldt */ 145 load_LDT(&current->active_mm->context); /* This does lldt */
145
146 /*
147 * Now maybe reload the debug registers
148 */
149 if (current->thread.debugreg7) {
150#ifdef CONFIG_X86_32
151 set_debugreg(current->thread.debugreg0, 0);
152 set_debugreg(current->thread.debugreg1, 1);
153 set_debugreg(current->thread.debugreg2, 2);
154 set_debugreg(current->thread.debugreg3, 3);
155 /* no 4 and 5 */
156 set_debugreg(current->thread.debugreg6, 6);
157 set_debugreg(current->thread.debugreg7, 7);
158#else
159 /* CONFIG_X86_64 */
160 loaddebug(&current->thread, 0);
161 loaddebug(&current->thread, 1);
162 loaddebug(&current->thread, 2);
163 loaddebug(&current->thread, 3);
164 /* no 4 and 5 */
165 loaddebug(&current->thread, 6);
166 loaddebug(&current->thread, 7);
167#endif
168 }
169
170} 146}
171 147
172/** 148/**
diff --git a/arch/x86/power/hibernate_32.c b/arch/x86/power/hibernate_32.c
index 81197c62d5b3..3769079874d8 100644
--- a/arch/x86/power/hibernate_32.c
+++ b/arch/x86/power/hibernate_32.c
@@ -6,6 +6,7 @@
6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl> 6 * Copyright (c) 2006 Rafael J. Wysocki <rjw@sisk.pl>
7 */ 7 */
8 8
9#include <linux/gfp.h>
9#include <linux/suspend.h> 10#include <linux/suspend.h>
10#include <linux/bootmem.h> 11#include <linux/bootmem.h>
11 12
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 65fdc86e923f..d24f983ba1e5 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -8,6 +8,7 @@
8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org> 8 * Copyright (c) 2001 Patrick Mochel <mochel@osdl.org>
9 */ 9 */
10 10
11#include <linux/gfp.h>
11#include <linux/smp.h> 12#include <linux/smp.h>
12#include <linux/suspend.h> 13#include <linux/suspend.h>
13#include <asm/proto.h> 14#include <asm/proto.h>
diff --git a/arch/x86/power/hibernate_asm_32.S b/arch/x86/power/hibernate_asm_32.S
index b641388d8286..ad47daeafa4e 100644
--- a/arch/x86/power/hibernate_asm_32.S
+++ b/arch/x86/power/hibernate_asm_32.S
@@ -27,10 +27,17 @@ ENTRY(swsusp_arch_suspend)
27 ret 27 ret
28 28
29ENTRY(restore_image) 29ENTRY(restore_image)
30 movl mmu_cr4_features, %ecx
30 movl resume_pg_dir, %eax 31 movl resume_pg_dir, %eax
31 subl $__PAGE_OFFSET, %eax 32 subl $__PAGE_OFFSET, %eax
32 movl %eax, %cr3 33 movl %eax, %cr3
33 34
35 jecxz 1f # cr4 Pentium and higher, skip if zero
36 andl $~(X86_CR4_PGE), %ecx
37 movl %ecx, %cr4; # turn off PGE
38 movl %cr3, %eax; # flush TLB
39 movl %eax, %cr3
401:
34 movl restore_pblist, %edx 41 movl restore_pblist, %edx
35 .p2align 4,,7 42 .p2align 4,,7
36 43
@@ -54,16 +61,8 @@ done:
54 movl $swapper_pg_dir, %eax 61 movl $swapper_pg_dir, %eax
55 subl $__PAGE_OFFSET, %eax 62 subl $__PAGE_OFFSET, %eax
56 movl %eax, %cr3 63 movl %eax, %cr3
57 /* Flush TLB, including "global" things (vmalloc) */
58 movl mmu_cr4_features, %ecx 64 movl mmu_cr4_features, %ecx
59 jecxz 1f # cr4 Pentium and higher, skip if zero 65 jecxz 1f # cr4 Pentium and higher, skip if zero
60 movl %ecx, %edx
61 andl $~(X86_CR4_PGE), %edx
62 movl %edx, %cr4; # turn off PGE
631:
64 movl %cr3, %eax; # flush TLB
65 movl %eax, %cr3
66 jecxz 1f # cr4 Pentium and higher, skip if zero
67 movl %ecx, %cr4; # turn PGE back on 66 movl %ecx, %cr4; # turn PGE back on
681: 671:
69 68
diff --git a/arch/x86/tools/Makefile b/arch/x86/tools/Makefile
new file mode 100644
index 000000000000..f82082677337
--- /dev/null
+++ b/arch/x86/tools/Makefile
@@ -0,0 +1,31 @@
1PHONY += posttest
2
3ifeq ($(KBUILD_VERBOSE),1)
4 posttest_verbose = -v
5else
6 posttest_verbose =
7endif
8
9ifeq ($(CONFIG_64BIT),y)
10 posttest_64bit = -y
11else
12 posttest_64bit = -n
13endif
14
15distill_awk = $(srctree)/arch/x86/tools/distill.awk
16chkobjdump = $(srctree)/arch/x86/tools/chkobjdump.awk
17
18quiet_cmd_posttest = TEST $@
19 cmd_posttest = ($(OBJDUMP) -v | $(AWK) -f $(chkobjdump)) || $(OBJDUMP) -d -j .text $(objtree)/vmlinux | $(AWK) -f $(distill_awk) | $(obj)/test_get_len $(posttest_64bit) $(posttest_verbose)
20
21posttest: $(obj)/test_get_len vmlinux
22 $(call cmd,posttest)
23
24hostprogs-y := test_get_len
25
26# -I is needed for the generated C source and for C source in the kernel tree.
27HOSTCFLAGS_test_get_len.o := -Wall -I$(objtree)/arch/x86/lib/ -I$(srctree)/arch/x86/include/ -I$(srctree)/arch/x86/lib/ -I$(srctree)/include/
28
29# Dependencies are also needed.
30$(obj)/test_get_len.o: $(srctree)/arch/x86/lib/insn.c $(srctree)/arch/x86/lib/inat.c $(srctree)/arch/x86/include/asm/inat_types.h $(srctree)/arch/x86/include/asm/inat.h $(srctree)/arch/x86/include/asm/insn.h $(objtree)/arch/x86/lib/inat-tables.c
31
diff --git a/arch/x86/tools/chkobjdump.awk b/arch/x86/tools/chkobjdump.awk
new file mode 100644
index 000000000000..fd1ab80be0de
--- /dev/null
+++ b/arch/x86/tools/chkobjdump.awk
@@ -0,0 +1,33 @@
1# GNU objdump version checker
2#
3# Usage:
4# objdump -v | awk -f chkobjdump.awk
5BEGIN {
6 # objdump version 2.19 or later is OK for the test.
7 od_ver = 2;
8 od_sver = 19;
9}
10
11/^GNU objdump/ {
12 verstr = ""
13 for (i = 3; i <= NF; i++)
14 if (match($(i), "^[0-9]")) {
15 verstr = $(i);
16 break;
17 }
18 if (verstr == "") {
19 printf("Warning: Failed to find objdump version number.\n");
20 exit 0;
21 }
22 split(verstr, ver, ".");
23 if (ver[1] > od_ver ||
24 (ver[1] == od_ver && ver[2] >= od_sver)) {
25 exit 1;
26 } else {
27 printf("Warning: objdump version %s is older than %d.%d\n",
28 verstr, od_ver, od_sver);
29 print("Warning: Skipping posttest.");
 30		# Logic is inverted, because we just skip the test without an error.
31 exit 0;
32 }
33}
diff --git a/arch/x86/tools/distill.awk b/arch/x86/tools/distill.awk
new file mode 100644
index 000000000000..c13c0ee48ab4
--- /dev/null
+++ b/arch/x86/tools/distill.awk
@@ -0,0 +1,47 @@
1#!/bin/awk -f
2# Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
3# Distills the disassembly as follows:
4# - Removes all lines except the disassembled instructions.
5# - For instructions that exceed 1 line (7 bytes), crams all the hex bytes
6# into a single line.
 7# - Removes bad (or prefix-only) instructions
8
9BEGIN {
10 prev_addr = ""
11 prev_hex = ""
12 prev_mnemonic = ""
13 bad_expr = "(\\(bad\\)|^rex|^.byte|^rep(z|nz)$|^lock$|^es$|^cs$|^ss$|^ds$|^fs$|^gs$|^data(16|32)$|^addr(16|32|64))"
14 fwait_expr = "^9b "
15 fwait_str="9b\tfwait"
16}
17
18/^ *[0-9a-f]+ <[^>]*>:/ {
19 # Symbol entry
20 printf("%s%s\n", $2, $1)
21}
22
23/^ *[0-9a-f]+:/ {
24 if (split($0, field, "\t") < 3) {
25 # This is a continuation of the same insn.
26 prev_hex = prev_hex field[2]
27 } else {
28 # Skip bad instructions
29 if (match(prev_mnemonic, bad_expr))
30 prev_addr = ""
31 # Split fwait from other f* instructions
32 if (match(prev_hex, fwait_expr) && prev_mnemonic != "fwait") {
33 printf "%s\t%s\n", prev_addr, fwait_str
34 sub(fwait_expr, "", prev_hex)
35 }
36 if (prev_addr != "")
37 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
38 prev_addr = field[1]
39 prev_hex = field[2]
40 prev_mnemonic = field[3]
41 }
42}
43
44END {
45 if (prev_addr != "")
46 printf "%s\t%s\t%s\n", prev_addr, prev_hex, prev_mnemonic
47}
diff --git a/arch/x86/tools/gen-insn-attr-x86.awk b/arch/x86/tools/gen-insn-attr-x86.awk
new file mode 100644
index 000000000000..eaf11f52fc0b
--- /dev/null
+++ b/arch/x86/tools/gen-insn-attr-x86.awk
@@ -0,0 +1,378 @@
1#!/bin/awk -f
2# gen-insn-attr-x86.awk: Instruction attribute table generator
3# Written by Masami Hiramatsu <mhiramat@redhat.com>
4#
5# Usage: awk -f gen-insn-attr-x86.awk x86-opcode-map.txt > inat-tables.c
6
7# Awk implementation sanity check
8function check_awk_implement() {
9 if (sprintf("%x", 0) != "0")
10 return "Your awk has a printf-format problem."
11 return ""
12}
13
14# Clear working vars
15function clear_vars() {
16 delete table
17 delete lptable2
18 delete lptable1
19 delete lptable3
20 eid = -1 # escape id
21 gid = -1 # group id
22 aid = -1 # AVX id
23 tname = ""
24}
25
26BEGIN {
27 # Implementation error checking
28 awkchecked = check_awk_implement()
29 if (awkchecked != "") {
30 print "Error: " awkchecked > "/dev/stderr"
31 print "Please try to use gawk." > "/dev/stderr"
32 exit 1
33 }
34
35 # Setup generating tables
36 print "/* x86 opcode map generated from x86-opcode-map.txt */"
37 print "/* Do not change this code. */\n"
38 ggid = 1
39 geid = 1
40 gaid = 0
41 delete etable
42 delete gtable
43 delete atable
44
45 opnd_expr = "^[A-Za-z/]"
46 ext_expr = "^\\("
47 sep_expr = "^\\|$"
48 group_expr = "^Grp[0-9A-Za-z]+"
49
50 imm_expr = "^[IJAO][a-z]"
51 imm_flag["Ib"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
52 imm_flag["Jb"] = "INAT_MAKE_IMM(INAT_IMM_BYTE)"
53 imm_flag["Iw"] = "INAT_MAKE_IMM(INAT_IMM_WORD)"
54 imm_flag["Id"] = "INAT_MAKE_IMM(INAT_IMM_DWORD)"
55 imm_flag["Iq"] = "INAT_MAKE_IMM(INAT_IMM_QWORD)"
56 imm_flag["Ap"] = "INAT_MAKE_IMM(INAT_IMM_PTR)"
57 imm_flag["Iz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
58 imm_flag["Jz"] = "INAT_MAKE_IMM(INAT_IMM_VWORD32)"
59 imm_flag["Iv"] = "INAT_MAKE_IMM(INAT_IMM_VWORD)"
60 imm_flag["Ob"] = "INAT_MOFFSET"
61 imm_flag["Ov"] = "INAT_MOFFSET"
62
63 modrm_expr = "^([CDEGMNPQRSUVW/][a-z]+|NTA|T[012])"
64 force64_expr = "\\([df]64\\)"
65 rex_expr = "^REX(\\.[XRWB]+)*"
66 fpu_expr = "^ESC" # TODO
67
68 lprefix1_expr = "\\(66\\)"
69 lprefix2_expr = "\\(F3\\)"
70 lprefix3_expr = "\\(F2\\)"
71 max_lprefix = 4
72
73 vexok_expr = "\\(VEX\\)"
74 vexonly_expr = "\\(oVEX\\)"
75
76 prefix_expr = "\\(Prefix\\)"
77 prefix_num["Operand-Size"] = "INAT_PFX_OPNDSZ"
78 prefix_num["REPNE"] = "INAT_PFX_REPNE"
79 prefix_num["REP/REPE"] = "INAT_PFX_REPE"
80 prefix_num["LOCK"] = "INAT_PFX_LOCK"
81 prefix_num["SEG=CS"] = "INAT_PFX_CS"
82 prefix_num["SEG=DS"] = "INAT_PFX_DS"
83 prefix_num["SEG=ES"] = "INAT_PFX_ES"
84 prefix_num["SEG=FS"] = "INAT_PFX_FS"
85 prefix_num["SEG=GS"] = "INAT_PFX_GS"
86 prefix_num["SEG=SS"] = "INAT_PFX_SS"
87 prefix_num["Address-Size"] = "INAT_PFX_ADDRSZ"
88 prefix_num["2bytes-VEX"] = "INAT_PFX_VEX2"
89 prefix_num["3bytes-VEX"] = "INAT_PFX_VEX3"
90
91 clear_vars()
92}
93
94function semantic_error(msg) {
95 print "Semantic error at " NR ": " msg > "/dev/stderr"
96 exit 1
97}
98
99function debug(msg) {
100 print "DEBUG: " msg
101}
102
103function array_size(arr, i,c) {
104 c = 0
105 for (i in arr)
106 c++
107 return c
108}
109
110/^Table:/ {
111 print "/* " $0 " */"
112 if (tname != "")
113 semantic_error("Hit Table: before EndTable:.");
114}
115
116/^Referrer:/ {
117 if (NF != 1) {
118 # escape opcode table
119 ref = ""
120 for (i = 2; i <= NF; i++)
121 ref = ref $i
122 eid = escape[ref]
123 tname = sprintf("inat_escape_table_%d", eid)
124 }
125}
126
127/^AVXcode:/ {
128 if (NF != 1) {
129 # AVX/escape opcode table
130 aid = $2
131 if (gaid <= aid)
132 gaid = aid + 1
133 if (tname == "") # AVX only opcode table
134 tname = sprintf("inat_avx_table_%d", $2)
135 }
136 if (aid == -1 && eid == -1) # primary opcode table
137 tname = "inat_primary_table"
138}
139
140/^GrpTable:/ {
141 print "/* " $0 " */"
142 if (!($2 in group))
143 semantic_error("No group: " $2 )
144 gid = group[$2]
145 tname = "inat_group_table_" gid
146}
147
148function print_table(tbl,name,fmt,n)
149{
150 print "const insn_attr_t " name " = {"
151 for (i = 0; i < n; i++) {
152 id = sprintf(fmt, i)
153 if (tbl[id])
154 print " [" id "] = " tbl[id] ","
155 }
156 print "};"
157}
158
159/^EndTable/ {
160 if (gid != -1) {
161 # print group tables
162 if (array_size(table) != 0) {
163 print_table(table, tname "[INAT_GROUP_TABLE_SIZE]",
164 "0x%x", 8)
165 gtable[gid,0] = tname
166 }
167 if (array_size(lptable1) != 0) {
168 print_table(lptable1, tname "_1[INAT_GROUP_TABLE_SIZE]",
169 "0x%x", 8)
170 gtable[gid,1] = tname "_1"
171 }
172 if (array_size(lptable2) != 0) {
173 print_table(lptable2, tname "_2[INAT_GROUP_TABLE_SIZE]",
174 "0x%x", 8)
175 gtable[gid,2] = tname "_2"
176 }
177 if (array_size(lptable3) != 0) {
178 print_table(lptable3, tname "_3[INAT_GROUP_TABLE_SIZE]",
179 "0x%x", 8)
180 gtable[gid,3] = tname "_3"
181 }
182 } else {
183 # print primary/escaped tables
184 if (array_size(table) != 0) {
185 print_table(table, tname "[INAT_OPCODE_TABLE_SIZE]",
186 "0x%02x", 256)
187 etable[eid,0] = tname
188 if (aid >= 0)
189 atable[aid,0] = tname
190 }
191 if (array_size(lptable1) != 0) {
192 print_table(lptable1,tname "_1[INAT_OPCODE_TABLE_SIZE]",
193 "0x%02x", 256)
194 etable[eid,1] = tname "_1"
195 if (aid >= 0)
196 atable[aid,1] = tname "_1"
197 }
198 if (array_size(lptable2) != 0) {
199 print_table(lptable2,tname "_2[INAT_OPCODE_TABLE_SIZE]",
200 "0x%02x", 256)
201 etable[eid,2] = tname "_2"
202 if (aid >= 0)
203 atable[aid,2] = tname "_2"
204 }
205 if (array_size(lptable3) != 0) {
206 print_table(lptable3,tname "_3[INAT_OPCODE_TABLE_SIZE]",
207 "0x%02x", 256)
208 etable[eid,3] = tname "_3"
209 if (aid >= 0)
210 atable[aid,3] = tname "_3"
211 }
212 }
213 print ""
214 clear_vars()
215}
216
217function add_flags(old,new) {
218 if (old && new)
219 return old " | " new
220 else if (old)
221 return old
222 else
223 return new
224}
225
226# convert operands to flags.
227function convert_operands(count,opnd, i,j,imm,mod)
228{
229 imm = null
230 mod = null
231 for (j = 1; j <= count; j++) {
232 i = opnd[j]
233 if (match(i, imm_expr) == 1) {
234 if (!imm_flag[i])
235 semantic_error("Unknown imm opnd: " i)
236 if (imm) {
237 if (i != "Ib")
238 semantic_error("Second IMM error")
239 imm = add_flags(imm, "INAT_SCNDIMM")
240 } else
241 imm = imm_flag[i]
242 } else if (match(i, modrm_expr))
243 mod = "INAT_MODRM"
244 }
245 return add_flags(imm, mod)
246}
247
248/^[0-9a-f]+\:/ {
249 if (NR == 1)
250 next
251 # get index
252 idx = "0x" substr($1, 1, index($1,":") - 1)
253 if (idx in table)
254 semantic_error("Redefine " idx " in " tname)
255
256 # check if escaped opcode
257 if ("escape" == $2) {
258 if ($3 != "#")
259 semantic_error("No escaped name")
260 ref = ""
261 for (i = 4; i <= NF; i++)
262 ref = ref $i
263 if (ref in escape)
264 semantic_error("Redefine escape (" ref ")")
265 escape[ref] = geid
266 geid++
267 table[idx] = "INAT_MAKE_ESCAPE(" escape[ref] ")"
268 next
269 }
270
271 variant = null
272 # converts
273 i = 2
274 while (i <= NF) {
275 opcode = $(i++)
276 delete opnds
277 ext = null
278 flags = null
279 opnd = null
280 # parse one opcode
281 if (match($i, opnd_expr)) {
282 opnd = $i
283 count = split($(i++), opnds, ",")
284 flags = convert_operands(count, opnds)
285 }
286 if (match($i, ext_expr))
287 ext = $(i++)
288 if (match($i, sep_expr))
289 i++
290 else if (i < NF)
291 semantic_error($i " is not a separator")
292
293 # check if group opcode
294 if (match(opcode, group_expr)) {
295 if (!(opcode in group)) {
296 group[opcode] = ggid
297 ggid++
298 }
299 flags = add_flags(flags, "INAT_MAKE_GROUP(" group[opcode] ")")
300 }
301 # check force(or default) 64bit
302 if (match(ext, force64_expr))
303 flags = add_flags(flags, "INAT_FORCE64")
304
305 # check REX prefix
306 if (match(opcode, rex_expr))
307 flags = add_flags(flags, "INAT_MAKE_PREFIX(INAT_PFX_REX)")
308
309 # check coprocessor escape : TODO
310 if (match(opcode, fpu_expr))
311 flags = add_flags(flags, "INAT_MODRM")
312
313 # check VEX only code
314 if (match(ext, vexonly_expr))
315 flags = add_flags(flags, "INAT_VEXOK | INAT_VEXONLY")
316
 317		# check VEX-ok code
318 if (match(ext, vexok_expr))
319 flags = add_flags(flags, "INAT_VEXOK")
320
321 # check prefixes
322 if (match(ext, prefix_expr)) {
323 if (!prefix_num[opcode])
324 semantic_error("Unknown prefix: " opcode)
325 flags = add_flags(flags, "INAT_MAKE_PREFIX(" prefix_num[opcode] ")")
326 }
327 if (length(flags) == 0)
328 continue
329 # check if last prefix
330 if (match(ext, lprefix1_expr)) {
331 lptable1[idx] = add_flags(lptable1[idx],flags)
332 variant = "INAT_VARIANT"
333 } else if (match(ext, lprefix2_expr)) {
334 lptable2[idx] = add_flags(lptable2[idx],flags)
335 variant = "INAT_VARIANT"
336 } else if (match(ext, lprefix3_expr)) {
337 lptable3[idx] = add_flags(lptable3[idx],flags)
338 variant = "INAT_VARIANT"
339 } else {
340 table[idx] = add_flags(table[idx],flags)
341 }
342 }
343 if (variant)
344 table[idx] = add_flags(table[idx],variant)
345}
346
347END {
348 if (awkchecked != "")
349 exit 1
350 # print escape opcode map's array
351 print "/* Escape opcode map array */"
352 print "const insn_attr_t const *inat_escape_tables[INAT_ESC_MAX + 1]" \
353 "[INAT_LSTPFX_MAX + 1] = {"
354 for (i = 0; i < geid; i++)
355 for (j = 0; j < max_lprefix; j++)
356 if (etable[i,j])
357 print " ["i"]["j"] = "etable[i,j]","
358 print "};\n"
359 # print group opcode map's array
360 print "/* Group opcode map array */"
361 print "const insn_attr_t const *inat_group_tables[INAT_GRP_MAX + 1]"\
362 "[INAT_LSTPFX_MAX + 1] = {"
363 for (i = 0; i < ggid; i++)
364 for (j = 0; j < max_lprefix; j++)
365 if (gtable[i,j])
366 print " ["i"]["j"] = "gtable[i,j]","
367 print "};\n"
368 # print AVX opcode map's array
369 print "/* AVX opcode map array */"
370 print "const insn_attr_t const *inat_avx_tables[X86_VEX_M_MAX + 1]"\
371 "[INAT_LSTPFX_MAX + 1] = {"
372 for (i = 0; i < gaid; i++)
373 for (j = 0; j < max_lprefix; j++)
374 if (atable[i,j])
375 print " ["i"]["j"] = "atable[i,j]","
376 print "};"
377}
378
diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c
new file mode 100644
index 000000000000..13403fc95a96
--- /dev/null
+++ b/arch/x86/tools/test_get_len.c
@@ -0,0 +1,173 @@
1/*
2 * This program is free software; you can redistribute it and/or modify
3 * it under the terms of the GNU General Public License as published by
4 * the Free Software Foundation; either version 2 of the License, or
5 * (at your option) any later version.
6 *
7 * This program is distributed in the hope that it will be useful,
8 * but WITHOUT ANY WARRANTY; without even the implied warranty of
9 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
10 * GNU General Public License for more details.
11 *
12 * You should have received a copy of the GNU General Public License
13 * along with this program; if not, write to the Free Software
14 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
15 *
16 * Copyright (C) IBM Corporation, 2009
17 */
18
19#include <stdlib.h>
20#include <stdio.h>
21#include <string.h>
22#include <assert.h>
23#include <unistd.h>
24
25#define unlikely(cond) (cond)
26
27#include <asm/insn.h>
28#include <inat.c>
29#include <insn.c>
30
31/*
32 * Test of instruction analysis in general and insn_get_length() in
33 * particular. See if insn_get_length() and the disassembler agree
34 * on the length of each instruction in an elf disassembly.
35 *
36 * Usage: objdump -d a.out | awk -f distill.awk | ./test_get_len
37 */
38
39const char *prog;
40static int verbose;
41static int x86_64;
42
43static void usage(void)
44{
45 fprintf(stderr, "Usage: objdump -d a.out | awk -f distill.awk |"
46 " %s [-y|-n] [-v]\n", prog);
47 fprintf(stderr, "\t-y 64bit mode\n");
48 fprintf(stderr, "\t-n 32bit mode\n");
49 fprintf(stderr, "\t-v verbose mode\n");
50 exit(1);
51}
52
53static void malformed_line(const char *line, int line_nr)
54{
55 fprintf(stderr, "%s: malformed line %d:\n%s", prog, line_nr, line);
56 exit(3);
57}
58
59static void dump_field(FILE *fp, const char *name, const char *indent,
60 struct insn_field *field)
61{
62 fprintf(fp, "%s.%s = {\n", indent, name);
63 fprintf(fp, "%s\t.value = %d, bytes[] = {%x, %x, %x, %x},\n",
64 indent, field->value, field->bytes[0], field->bytes[1],
65 field->bytes[2], field->bytes[3]);
66 fprintf(fp, "%s\t.got = %d, .nbytes = %d},\n", indent,
67 field->got, field->nbytes);
68}
69
70static void dump_insn(FILE *fp, struct insn *insn)
71{
72 fprintf(fp, "Instruction = {\n");
73 dump_field(fp, "prefixes", "\t", &insn->prefixes);
74 dump_field(fp, "rex_prefix", "\t", &insn->rex_prefix);
75 dump_field(fp, "vex_prefix", "\t", &insn->vex_prefix);
76 dump_field(fp, "opcode", "\t", &insn->opcode);
77 dump_field(fp, "modrm", "\t", &insn->modrm);
78 dump_field(fp, "sib", "\t", &insn->sib);
79 dump_field(fp, "displacement", "\t", &insn->displacement);
80 dump_field(fp, "immediate1", "\t", &insn->immediate1);
81 dump_field(fp, "immediate2", "\t", &insn->immediate2);
82 fprintf(fp, "\t.attr = %x, .opnd_bytes = %d, .addr_bytes = %d,\n",
83 insn->attr, insn->opnd_bytes, insn->addr_bytes);
84 fprintf(fp, "\t.length = %d, .x86_64 = %d, .kaddr = %p}\n",
85 insn->length, insn->x86_64, insn->kaddr);
86}
87
88static void parse_args(int argc, char **argv)
89{
90 int c;
91 prog = argv[0];
92 while ((c = getopt(argc, argv, "ynv")) != -1) {
93 switch (c) {
94 case 'y':
95 x86_64 = 1;
96 break;
97 case 'n':
98 x86_64 = 0;
99 break;
100 case 'v':
101 verbose = 1;
102 break;
103 default:
104 usage();
105 }
106 }
107}
108
109#define BUFSIZE 256
110
111int main(int argc, char **argv)
112{
113 char line[BUFSIZE], sym[BUFSIZE] = "<unknown>";
114 unsigned char insn_buf[16];
115 struct insn insn;
116 int insns = 0;
117 int warnings = 0;
118
119 parse_args(argc, argv);
120
121 while (fgets(line, BUFSIZE, stdin)) {
122 char copy[BUFSIZE], *s, *tab1, *tab2;
123 int nb = 0;
124 unsigned int b;
125
126 if (line[0] == '<') {
127 /* Symbol line */
128 strcpy(sym, line);
129 continue;
130 }
131
132 insns++;
133 memset(insn_buf, 0, 16);
134 strcpy(copy, line);
135 tab1 = strchr(copy, '\t');
136 if (!tab1)
137 malformed_line(line, insns);
138 s = tab1 + 1;
139 s += strspn(s, " ");
140 tab2 = strchr(s, '\t');
141 if (!tab2)
142 malformed_line(line, insns);
143 *tab2 = '\0'; /* Characters beyond tab2 aren't examined */
144 while (s < tab2) {
145 if (sscanf(s, "%x", &b) == 1) {
146 insn_buf[nb++] = (unsigned char) b;
147 s += 3;
148 } else
149 break;
150 }
151 /* Decode an instruction */
152 insn_init(&insn, insn_buf, x86_64);
153 insn_get_length(&insn);
154 if (insn.length != nb) {
155 warnings++;
156 fprintf(stderr, "Warning: %s found difference at %s\n",
157 prog, sym);
158 fprintf(stderr, "Warning: %s", line);
159 fprintf(stderr, "Warning: objdump says %d bytes, but "
160 "insn_get_length() says %d\n", nb,
161 insn.length);
162 if (verbose)
163 dump_insn(stderr, &insn);
164 }
165 }
166 if (warnings)
167 fprintf(stderr, "Warning: decoded and checked %d"
168 " instructions with %d warnings\n", insns, warnings);
169 else
 170		fprintf(stderr, "Success: decoded and checked %d"
171 " instructions\n", insns);
172 return 0;
173}
diff --git a/arch/x86/vdso/vdso32-setup.c b/arch/x86/vdso/vdso32-setup.c
index 58bc00f68b12..02b442e92007 100644
--- a/arch/x86/vdso/vdso32-setup.c
+++ b/arch/x86/vdso/vdso32-setup.c
@@ -393,7 +393,6 @@ static ctl_table abi_table2[] = {
393 393
394static ctl_table abi_root_table2[] = { 394static ctl_table abi_root_table2[] = {
395 { 395 {
396 .ctl_name = CTL_ABI,
397 .procname = "abi", 396 .procname = "abi",
398 .mode = 0555, 397 .mode = 0555,
399 .child = abi_table2 398 .child = abi_table2
diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c
index 21e1aeb9f3ea..ac74869b8140 100644
--- a/arch/x86/vdso/vma.c
+++ b/arch/x86/vdso/vma.c
@@ -6,6 +6,7 @@
6#include <linux/mm.h> 6#include <linux/mm.h>
7#include <linux/err.h> 7#include <linux/err.h>
8#include <linux/sched.h> 8#include <linux/sched.h>
9#include <linux/slab.h>
9#include <linux/init.h> 10#include <linux/init.h>
10#include <linux/random.h> 11#include <linux/random.h>
11#include <linux/elf.h> 12#include <linux/elf.h>
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index e133ce25e290..1304bcec8ee5 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -1,5 +1,6 @@
1#include <linux/init.h> 1#include <linux/init.h>
2#include <linux/debugfs.h> 2#include <linux/debugfs.h>
3#include <linux/slab.h>
3#include <linux/module.h> 4#include <linux/module.h>
4 5
5#include "debugfs.h" 6#include "debugfs.h"
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index dfbf70e65860..65d8d79b46a8 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -27,7 +27,10 @@
27#include <linux/page-flags.h> 27#include <linux/page-flags.h>
28#include <linux/highmem.h> 28#include <linux/highmem.h>
29#include <linux/console.h> 29#include <linux/console.h>
30#include <linux/pci.h>
31#include <linux/gfp.h>
30 32
33#include <xen/xen.h>
31#include <xen/interface/xen.h> 34#include <xen/interface/xen.h>
32#include <xen/interface/version.h> 35#include <xen/interface/version.h>
33#include <xen/interface/physdev.h> 36#include <xen/interface/physdev.h>
@@ -48,6 +51,7 @@
48#include <asm/traps.h> 51#include <asm/traps.h>
49#include <asm/setup.h> 52#include <asm/setup.h>
50#include <asm/desc.h> 53#include <asm/desc.h>
54#include <asm/pgalloc.h>
51#include <asm/pgtable.h> 55#include <asm/pgtable.h>
52#include <asm/tlbflush.h> 56#include <asm/tlbflush.h>
53#include <asm/reboot.h> 57#include <asm/reboot.h>
@@ -138,24 +142,23 @@ static void xen_vcpu_setup(int cpu)
138 */ 142 */
139void xen_vcpu_restore(void) 143void xen_vcpu_restore(void)
140{ 144{
141 if (have_vcpu_info_placement) { 145 int cpu;
142 int cpu;
143 146
144 for_each_online_cpu(cpu) { 147 for_each_online_cpu(cpu) {
145 bool other_cpu = (cpu != smp_processor_id()); 148 bool other_cpu = (cpu != smp_processor_id());
146 149
147 if (other_cpu && 150 if (other_cpu &&
148 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL)) 151 HYPERVISOR_vcpu_op(VCPUOP_down, cpu, NULL))
149 BUG(); 152 BUG();
150 153
151 xen_vcpu_setup(cpu); 154 xen_setup_runstate_info(cpu);
152 155
153 if (other_cpu && 156 if (have_vcpu_info_placement)
154 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL)) 157 xen_vcpu_setup(cpu);
155 BUG();
156 }
157 158
158 BUG_ON(!have_vcpu_info_placement); 159 if (other_cpu &&
160 HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL))
161 BUG();
159 } 162 }
160} 163}
161 164
@@ -1093,10 +1096,14 @@ asmlinkage void __init xen_start_kernel(void)
1093 1096
1094 __supported_pte_mask |= _PAGE_IOMAP; 1097 __supported_pte_mask |= _PAGE_IOMAP;
1095 1098
1096#ifdef CONFIG_X86_64 1099 /*
1100 * Prevent page tables from being allocated in highmem, even
1101 * if CONFIG_HIGHPTE is enabled.
1102 */
1103 __userpte_alloc_gfp &= ~__GFP_HIGHMEM;
1104
1097 /* Work out if we support NX */ 1105 /* Work out if we support NX */
1098 check_efer(); 1106 x86_configure_nx();
1099#endif
1100 1107
1101 xen_setup_features(); 1108 xen_setup_features();
1102 1109
@@ -1152,9 +1159,13 @@ asmlinkage void __init xen_start_kernel(void)
1152 1159
1153 /* keep using Xen gdt for now; no urgent need to change it */ 1160 /* keep using Xen gdt for now; no urgent need to change it */
1154 1161
1162#ifdef CONFIG_X86_32
1155 pv_info.kernel_rpl = 1; 1163 pv_info.kernel_rpl = 1;
1156 if (xen_feature(XENFEAT_supervisor_mode_kernel)) 1164 if (xen_feature(XENFEAT_supervisor_mode_kernel))
1157 pv_info.kernel_rpl = 0; 1165 pv_info.kernel_rpl = 0;
1166#else
1167 pv_info.kernel_rpl = 0;
1168#endif
1158 1169
1159 /* set the limit of our address space */ 1170 /* set the limit of our address space */
1160 xen_reserve_top(); 1171 xen_reserve_top();
@@ -1178,10 +1189,16 @@ asmlinkage void __init xen_start_kernel(void)
1178 add_preferred_console("xenboot", 0, NULL); 1189 add_preferred_console("xenboot", 0, NULL);
1179 add_preferred_console("tty", 0, NULL); 1190 add_preferred_console("tty", 0, NULL);
1180 add_preferred_console("hvc", 0, NULL); 1191 add_preferred_console("hvc", 0, NULL);
1192 } else {
1193 /* Make sure ACS will be enabled */
1194 pci_request_acs();
1181 } 1195 }
1196
1182 1197
1183 xen_raw_console_write("about to get started...\n"); 1198 xen_raw_console_write("about to get started...\n");
1184 1199
1200 xen_setup_runstate_info(0);
1201
1185 /* Start the world */ 1202 /* Start the world */
1186#ifdef CONFIG_X86_32 1203#ifdef CONFIG_X86_32
1187 i386_start_kernel(); 1204 i386_start_kernel();
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 3bf7b1d250ce..914f04695ce5 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -43,6 +43,7 @@
43#include <linux/debugfs.h> 43#include <linux/debugfs.h>
44#include <linux/bug.h> 44#include <linux/bug.h>
45#include <linux/module.h> 45#include <linux/module.h>
46#include <linux/gfp.h>
46 47
47#include <asm/pgtable.h> 48#include <asm/pgtable.h>
48#include <asm/tlbflush.h> 49#include <asm/tlbflush.h>
@@ -185,7 +186,7 @@ static inline unsigned p2m_index(unsigned long pfn)
185} 186}
186 187
187/* Build the parallel p2m_top_mfn structures */ 188/* Build the parallel p2m_top_mfn structures */
188static void __init xen_build_mfn_list_list(void) 189void xen_build_mfn_list_list(void)
189{ 190{
190 unsigned pfn, idx; 191 unsigned pfn, idx;
191 192
@@ -1427,23 +1428,6 @@ static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd)
1427#endif 1428#endif
1428} 1429}
1429 1430
1430#ifdef CONFIG_HIGHPTE
1431static void *xen_kmap_atomic_pte(struct page *page, enum km_type type)
1432{
1433 pgprot_t prot = PAGE_KERNEL;
1434
1435 if (PagePinned(page))
1436 prot = PAGE_KERNEL_RO;
1437
1438 if (0 && PageHighMem(page))
1439 printk("mapping highpte %lx type %d prot %s\n",
1440 page_to_pfn(page), type,
1441 (unsigned long)pgprot_val(prot) & _PAGE_RW ? "WRITE" : "READ");
1442
1443 return kmap_atomic_prot(page, type, prot);
1444}
1445#endif
1446
1447#ifdef CONFIG_X86_32 1431#ifdef CONFIG_X86_32
1448static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte) 1432static __init pte_t mask_rw_pte(pte_t *ptep, pte_t pte)
1449{ 1433{
@@ -1902,10 +1886,6 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = {
1902 .alloc_pmd_clone = paravirt_nop, 1886 .alloc_pmd_clone = paravirt_nop,
1903 .release_pmd = xen_release_pmd_init, 1887 .release_pmd = xen_release_pmd_init,
1904 1888
1905#ifdef CONFIG_HIGHPTE
1906 .kmap_atomic_pte = xen_kmap_atomic_pte,
1907#endif
1908
1909#ifdef CONFIG_X86_64 1889#ifdef CONFIG_X86_64
1910 .set_pte = xen_set_pte, 1890 .set_pte = xen_set_pte,
1911#else 1891#else
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index fe03eeed7b48..a29693fd3138 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -14,6 +14,7 @@
14 */ 14 */
15#include <linux/sched.h> 15#include <linux/sched.h>
16#include <linux/err.h> 16#include <linux/err.h>
17#include <linux/slab.h>
17#include <linux/smp.h> 18#include <linux/smp.h>
18 19
19#include <asm/paravirt.h> 20#include <asm/paravirt.h>
@@ -35,10 +36,10 @@
35 36
36cpumask_var_t xen_cpu_initialized_map; 37cpumask_var_t xen_cpu_initialized_map;
37 38
38static DEFINE_PER_CPU(int, resched_irq); 39static DEFINE_PER_CPU(int, xen_resched_irq);
39static DEFINE_PER_CPU(int, callfunc_irq); 40static DEFINE_PER_CPU(int, xen_callfunc_irq);
40static DEFINE_PER_CPU(int, callfuncsingle_irq); 41static DEFINE_PER_CPU(int, xen_callfuncsingle_irq);
41static DEFINE_PER_CPU(int, debug_irq) = -1; 42static DEFINE_PER_CPU(int, xen_debug_irq) = -1;
42 43
43static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id); 44static irqreturn_t xen_call_function_interrupt(int irq, void *dev_id);
44static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id); 45static irqreturn_t xen_call_function_single_interrupt(int irq, void *dev_id);
@@ -73,7 +74,7 @@ static __cpuinit void cpu_bringup(void)
73 74
74 xen_setup_cpu_clockevents(); 75 xen_setup_cpu_clockevents();
75 76
76 cpu_set(cpu, cpu_online_map); 77 set_cpu_online(cpu, true);
77 percpu_write(cpu_state, CPU_ONLINE); 78 percpu_write(cpu_state, CPU_ONLINE);
78 wmb(); 79 wmb();
79 80
@@ -103,7 +104,7 @@ static int xen_smp_intr_init(unsigned int cpu)
103 NULL); 104 NULL);
104 if (rc < 0) 105 if (rc < 0)
105 goto fail; 106 goto fail;
106 per_cpu(resched_irq, cpu) = rc; 107 per_cpu(xen_resched_irq, cpu) = rc;
107 108
108 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu); 109 callfunc_name = kasprintf(GFP_KERNEL, "callfunc%d", cpu);
109 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR, 110 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_VECTOR,
@@ -114,7 +115,7 @@ static int xen_smp_intr_init(unsigned int cpu)
114 NULL); 115 NULL);
115 if (rc < 0) 116 if (rc < 0)
116 goto fail; 117 goto fail;
117 per_cpu(callfunc_irq, cpu) = rc; 118 per_cpu(xen_callfunc_irq, cpu) = rc;
118 119
119 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu); 120 debug_name = kasprintf(GFP_KERNEL, "debug%d", cpu);
120 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt, 121 rc = bind_virq_to_irqhandler(VIRQ_DEBUG, cpu, xen_debug_interrupt,
@@ -122,7 +123,7 @@ static int xen_smp_intr_init(unsigned int cpu)
122 debug_name, NULL); 123 debug_name, NULL);
123 if (rc < 0) 124 if (rc < 0)
124 goto fail; 125 goto fail;
125 per_cpu(debug_irq, cpu) = rc; 126 per_cpu(xen_debug_irq, cpu) = rc;
126 127
127 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu); 128 callfunc_name = kasprintf(GFP_KERNEL, "callfuncsingle%d", cpu);
128 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR, 129 rc = bind_ipi_to_irqhandler(XEN_CALL_FUNCTION_SINGLE_VECTOR,
@@ -133,19 +134,20 @@ static int xen_smp_intr_init(unsigned int cpu)
133 NULL); 134 NULL);
134 if (rc < 0) 135 if (rc < 0)
135 goto fail; 136 goto fail;
136 per_cpu(callfuncsingle_irq, cpu) = rc; 137 per_cpu(xen_callfuncsingle_irq, cpu) = rc;
137 138
138 return 0; 139 return 0;
139 140
140 fail: 141 fail:
141 if (per_cpu(resched_irq, cpu) >= 0) 142 if (per_cpu(xen_resched_irq, cpu) >= 0)
142 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 143 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
143 if (per_cpu(callfunc_irq, cpu) >= 0) 144 if (per_cpu(xen_callfunc_irq, cpu) >= 0)
144 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 145 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
145 if (per_cpu(debug_irq, cpu) >= 0) 146 if (per_cpu(xen_debug_irq, cpu) >= 0)
146 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 147 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
147 if (per_cpu(callfuncsingle_irq, cpu) >= 0) 148 if (per_cpu(xen_callfuncsingle_irq, cpu) >= 0)
148 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 149 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu),
150 NULL);
149 151
150 return rc; 152 return rc;
151} 153}
@@ -295,6 +297,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu)
295 (unsigned long)task_stack_page(idle) - 297 (unsigned long)task_stack_page(idle) -
296 KERNEL_STACK_OFFSET + THREAD_SIZE; 298 KERNEL_STACK_OFFSET + THREAD_SIZE;
297#endif 299#endif
300 xen_setup_runstate_info(cpu);
298 xen_setup_timer(cpu); 301 xen_setup_timer(cpu);
299 xen_init_lock_cpu(cpu); 302 xen_init_lock_cpu(cpu);
300 303
@@ -348,10 +351,10 @@ static void xen_cpu_die(unsigned int cpu)
348 current->state = TASK_UNINTERRUPTIBLE; 351 current->state = TASK_UNINTERRUPTIBLE;
349 schedule_timeout(HZ/10); 352 schedule_timeout(HZ/10);
350 } 353 }
351 unbind_from_irqhandler(per_cpu(resched_irq, cpu), NULL); 354 unbind_from_irqhandler(per_cpu(xen_resched_irq, cpu), NULL);
352 unbind_from_irqhandler(per_cpu(callfunc_irq, cpu), NULL); 355 unbind_from_irqhandler(per_cpu(xen_callfunc_irq, cpu), NULL);
353 unbind_from_irqhandler(per_cpu(debug_irq, cpu), NULL); 356 unbind_from_irqhandler(per_cpu(xen_debug_irq, cpu), NULL);
354 unbind_from_irqhandler(per_cpu(callfuncsingle_irq, cpu), NULL); 357 unbind_from_irqhandler(per_cpu(xen_callfuncsingle_irq, cpu), NULL);
355 xen_uninit_lock_cpu(cpu); 358 xen_uninit_lock_cpu(cpu);
356 xen_teardown_timer(cpu); 359 xen_teardown_timer(cpu);
357 360
@@ -359,7 +362,7 @@ static void xen_cpu_die(unsigned int cpu)
359 alternatives_smp_switch(0); 362 alternatives_smp_switch(0);
360} 363}
361 364
362static void __cpuinit xen_play_dead(void) /* used only with CPU_HOTPLUG */ 365static void __cpuinit xen_play_dead(void) /* used only with HOTPLUG_CPU */
363{ 366{
364 play_dead_common(); 367 play_dead_common();
365 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL); 368 HYPERVISOR_vcpu_op(VCPUOP_down, smp_processor_id(), NULL);
diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c
index 36a5141108df..e0500646585d 100644
--- a/arch/x86/xen/spinlock.c
+++ b/arch/x86/xen/spinlock.c
@@ -6,6 +6,7 @@
6#include <linux/spinlock.h> 6#include <linux/spinlock.h>
7#include <linux/debugfs.h> 7#include <linux/debugfs.h>
8#include <linux/log2.h> 8#include <linux/log2.h>
9#include <linux/gfp.h>
9 10
10#include <asm/paravirt.h> 11#include <asm/paravirt.h>
11 12
@@ -120,14 +121,14 @@ struct xen_spinlock {
120 unsigned short spinners; /* count of waiting cpus */ 121 unsigned short spinners; /* count of waiting cpus */
121}; 122};
122 123
123static int xen_spin_is_locked(struct raw_spinlock *lock) 124static int xen_spin_is_locked(struct arch_spinlock *lock)
124{ 125{
125 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 126 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
126 127
127 return xl->lock != 0; 128 return xl->lock != 0;
128} 129}
129 130
130static int xen_spin_is_contended(struct raw_spinlock *lock) 131static int xen_spin_is_contended(struct arch_spinlock *lock)
131{ 132{
132 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 133 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
133 134
@@ -136,7 +137,7 @@ static int xen_spin_is_contended(struct raw_spinlock *lock)
136 return xl->spinners != 0; 137 return xl->spinners != 0;
137} 138}
138 139
139static int xen_spin_trylock(struct raw_spinlock *lock) 140static int xen_spin_trylock(struct arch_spinlock *lock)
140{ 141{
141 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 142 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
142 u8 old = 1; 143 u8 old = 1;
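
The raw_spinlock to arch_spinlock change in these hunks is a type rename coming from the core spinlock rework; the byte-lock algorithm itself is untouched. xen_spin_trylock() takes the lock by atomically exchanging a 1 into xl->lock and succeeding only if the previous value was 0. Below is a user-space sketch of that exchange-based trylock using the GCC __atomic builtins in place of the kernel's inline assembly; byte_lock, byte_trylock() and byte_unlock() are names invented for the sketch.

#include <stdint.h>
#include <stdio.h>

struct byte_lock {
        uint8_t lock;           /* 0 = free, 1 = held */
};

/* Try to take the lock: atomically swap in 1, succeed if the old value was 0. */
static int byte_trylock(struct byte_lock *bl)
{
        uint8_t old = __atomic_exchange_n(&bl->lock, 1, __ATOMIC_ACQUIRE);

        return old == 0;
}

static void byte_unlock(struct byte_lock *bl)
{
        __atomic_store_n(&bl->lock, 0, __ATOMIC_RELEASE);
}

int main(void)
{
        struct byte_lock bl = { 0 };

        printf("first trylock:  %d\n", byte_trylock(&bl));  /* 1: acquired */
        printf("second trylock: %d\n", byte_trylock(&bl));  /* 0: already held */
        byte_unlock(&bl);
        printf("third trylock:  %d\n", byte_trylock(&bl));  /* 1: acquired again */
        return 0;
}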
@@ -181,7 +182,7 @@ static inline void unspinning_lock(struct xen_spinlock *xl, struct xen_spinlock
181 __get_cpu_var(lock_spinners) = prev; 182 __get_cpu_var(lock_spinners) = prev;
182} 183}
183 184
184static noinline int xen_spin_lock_slow(struct raw_spinlock *lock, bool irq_enable) 185static noinline int xen_spin_lock_slow(struct arch_spinlock *lock, bool irq_enable)
185{ 186{
186 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 187 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
187 struct xen_spinlock *prev; 188 struct xen_spinlock *prev;
@@ -254,7 +255,7 @@ out:
254 return ret; 255 return ret;
255} 256}
256 257
257static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable) 258static inline void __xen_spin_lock(struct arch_spinlock *lock, bool irq_enable)
258{ 259{
259 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 260 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
260 unsigned timeout; 261 unsigned timeout;
@@ -291,12 +292,12 @@ static inline void __xen_spin_lock(struct raw_spinlock *lock, bool irq_enable)
291 spin_time_accum_total(start_spin); 292 spin_time_accum_total(start_spin);
292} 293}
293 294
294static void xen_spin_lock(struct raw_spinlock *lock) 295static void xen_spin_lock(struct arch_spinlock *lock)
295{ 296{
296 __xen_spin_lock(lock, false); 297 __xen_spin_lock(lock, false);
297} 298}
298 299
299static void xen_spin_lock_flags(struct raw_spinlock *lock, unsigned long flags) 300static void xen_spin_lock_flags(struct arch_spinlock *lock, unsigned long flags)
300{ 301{
301 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags)); 302 __xen_spin_lock(lock, !raw_irqs_disabled_flags(flags));
302} 303}
@@ -317,7 +318,7 @@ static noinline void xen_spin_unlock_slow(struct xen_spinlock *xl)
317 } 318 }
318} 319}
319 320
320static void xen_spin_unlock(struct raw_spinlock *lock) 321static void xen_spin_unlock(struct arch_spinlock *lock)
321{ 322{
322 struct xen_spinlock *xl = (struct xen_spinlock *)lock; 323 struct xen_spinlock *xl = (struct xen_spinlock *)lock;
323 324
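
The struct at the top of this file keeps a spinners count of waiting CPUs alongside the lock byte, and xen_spin_unlock_slow() above exists to kick those waiters. A user-space sketch of the usual release-then-check ordering for such a lock follows; kick_one_waiter() is a hypothetical stand-in for the event-channel notification, and the exact kernel sequence is not reproduced here.

#include <stdint.h>
#include <stdio.h>

struct byte_lock {
        uint8_t lock;                   /* 0 = free, 1 = held */
        unsigned short spinners;        /* count of CPUs waiting in the slow path */
};

/* Hypothetical stand-in for the slow-path wakeup (event-channel kick). */
static void kick_one_waiter(struct byte_lock *bl)
{
        (void)bl;
        printf("kicking a blocked waiter\n");
}

static void byte_unlock(struct byte_lock *bl)
{
        /* Release the lock first... */
        __atomic_store_n(&bl->lock, 0, __ATOMIC_RELEASE);

        /* ...then wake somebody only if a waiter actually registered,
         * so the uncontended unlock never pays for a notification. */
        if (__atomic_load_n(&bl->spinners, __ATOMIC_ACQUIRE) != 0)
                kick_one_waiter(bl);
}

int main(void)
{
        struct byte_lock bl = { .lock = 1, .spinners = 1 };

        byte_unlock(&bl);
        return 0;
}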
diff --git a/arch/x86/xen/suspend.c b/arch/x86/xen/suspend.c
index 95be7b434724..987267f79bf5 100644
--- a/arch/x86/xen/suspend.c
+++ b/arch/x86/xen/suspend.c
@@ -1,4 +1,5 @@
1#include <linux/types.h> 1#include <linux/types.h>
2#include <linux/clockchips.h>
2 3
3#include <xen/interface/xen.h> 4#include <xen/interface/xen.h>
4#include <xen/grant_table.h> 5#include <xen/grant_table.h>
@@ -27,6 +28,8 @@ void xen_pre_suspend(void)
27 28
28void xen_post_suspend(int suspend_cancelled) 29void xen_post_suspend(int suspend_cancelled)
29{ 30{
31 xen_build_mfn_list_list();
32
30 xen_setup_shared_info(); 33 xen_setup_shared_info();
31 34
32 if (suspend_cancelled) { 35 if (suspend_cancelled) {
@@ -44,7 +47,19 @@ void xen_post_suspend(int suspend_cancelled)
44 47
45} 48}
46 49
50static void xen_vcpu_notify_restore(void *data)
51{
52 unsigned long reason = (unsigned long)data;
53
54 /* Boot processor notified via generic timekeeping_resume() */
 55 if (smp_processor_id() == 0)
56 return;
57
58 clockevents_notify(reason, NULL);
59}
60
47void xen_arch_resume(void) 61void xen_arch_resume(void)
48{ 62{
49 /* nothing */ 63 smp_call_function(xen_vcpu_notify_restore,
64 (void *)CLOCK_EVT_NOTIFY_RESUME, 1);
50} 65}
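
The new xen_arch_resume() uses smp_call_function() to run xen_vcpu_notify_restore() with CLOCK_EVT_NOTIFY_RESUME as its argument, while the boot processor is left to the generic timekeeping_resume() path (hence the explicit smp_processor_id() == 0 check in the callback). A rough user-space analogue follows, with one pthread per "CPU"; notify_clockevents_resume() is a stub invented for the sketch, and unlike the kernel's smp_call_function() the loop below simply spawns a thread per CPU and waits for all of them.

#include <pthread.h>
#include <stdio.h>

#define NCPUS 4

/* Hypothetical stand-in for clockevents_notify(CLOCK_EVT_NOTIFY_RESUME, NULL). */
static void notify_clockevents_resume(int cpu)
{
        printf("cpu %d: resuming local clockevent device\n", cpu);
}

/* Analogue of xen_vcpu_notify_restore(): the boot CPU is handled by the
 * generic timekeeping resume path, so only secondary CPUs act here. */
static void *vcpu_notify_restore(void *arg)
{
        int cpu = (int)(long)arg;

        if (cpu == 0)
                return NULL;

        notify_clockevents_resume(cpu);
        return NULL;
}

int main(void)
{
        pthread_t tid[NCPUS];
        int cpu;

        /* Rough analogue of smp_call_function(..., wait=1): run the callback
         * "on" every CPU and wait for all of them to finish. */
        for (cpu = 0; cpu < NCPUS; cpu++)
                pthread_create(&tid[cpu], NULL, vcpu_notify_restore,
                               (void *)(long)cpu);
        for (cpu = 0; cpu < NCPUS; cpu++)
                pthread_join(tid[cpu], NULL);

        return 0;
}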
diff --git a/arch/x86/xen/time.c b/arch/x86/xen/time.c
index 0a5aa44299a5..32764b8880b5 100644
--- a/arch/x86/xen/time.c
+++ b/arch/x86/xen/time.c
@@ -13,6 +13,7 @@
13#include <linux/clockchips.h> 13#include <linux/clockchips.h>
14#include <linux/kernel_stat.h> 14#include <linux/kernel_stat.h>
15#include <linux/math64.h> 15#include <linux/math64.h>
16#include <linux/gfp.h>
16 17
17#include <asm/pvclock.h> 18#include <asm/pvclock.h>
18#include <asm/xen/hypervisor.h> 19#include <asm/xen/hypervisor.h>
@@ -31,14 +32,14 @@
31#define NS_PER_TICK (1000000000LL / HZ) 32#define NS_PER_TICK (1000000000LL / HZ)
32 33
33/* runstate info updated by Xen */ 34/* runstate info updated by Xen */
34static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate); 35static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate);
35 36
36/* snapshots of runstate info */ 37/* snapshots of runstate info */
37static DEFINE_PER_CPU(struct vcpu_runstate_info, runstate_snapshot); 38static DEFINE_PER_CPU(struct vcpu_runstate_info, xen_runstate_snapshot);
38 39
39/* unused ns of stolen and blocked time */ 40/* unused ns of stolen and blocked time */
40static DEFINE_PER_CPU(u64, residual_stolen); 41static DEFINE_PER_CPU(u64, xen_residual_stolen);
41static DEFINE_PER_CPU(u64, residual_blocked); 42static DEFINE_PER_CPU(u64, xen_residual_blocked);
42 43
 43/* return a consistent snapshot of 64-bit time/counter value */ 44/* return a consistent snapshot of 64-bit time/counter value */
44static u64 get64(const u64 *p) 45static u64 get64(const u64 *p)
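
get64() above exists because a 64-bit counter updated behind the guest's back cannot be read atomically with a single load on a 32-bit kernel. One common way to get a consistent value, sketched below in user-space C, is to read the high word, then the low word, and retry if the high word changed in between; this is a sketch of the general technique under a little-endian assumption, not necessarily the exact code get64() uses.

#include <stdint.h>
#include <stdio.h>

/* Read a 64-bit counter that another context updates, on a platform where
 * 64-bit loads are not atomic: read high, then low, then confirm high is
 * unchanged.  The loop only repeats when the low half wrapped and carried
 * into the high half between the two reads (little-endian layout assumed). */
static uint64_t get64_split(const volatile uint64_t *p)
{
        const volatile uint32_t *p32 = (const volatile uint32_t *)p;
        uint32_t h, l;

        do {
                h = p32[1];             /* high half */
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
                l = p32[0];             /* low half */
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
        } while (p32[1] != h);

        return ((uint64_t)h << 32) | l;
}

int main(void)
{
        volatile uint64_t counter = 0x00000002fffffff0ULL;

        printf("snapshot: %llx\n", (unsigned long long)get64_split(&counter));
        return 0;
}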
@@ -79,7 +80,7 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
79 80
80 BUG_ON(preemptible()); 81 BUG_ON(preemptible());
81 82
82 state = &__get_cpu_var(runstate); 83 state = &__get_cpu_var(xen_runstate);
83 84
84 /* 85 /*
85 * The runstate info is always updated by the hypervisor on 86 * The runstate info is always updated by the hypervisor on
@@ -97,14 +98,14 @@ static void get_runstate_snapshot(struct vcpu_runstate_info *res)
97/* return true when a vcpu could run but has no real cpu to run on */ 98/* return true when a vcpu could run but has no real cpu to run on */
98bool xen_vcpu_stolen(int vcpu) 99bool xen_vcpu_stolen(int vcpu)
99{ 100{
100 return per_cpu(runstate, vcpu).state == RUNSTATE_runnable; 101 return per_cpu(xen_runstate, vcpu).state == RUNSTATE_runnable;
101} 102}
102 103
103static void setup_runstate_info(int cpu) 104void xen_setup_runstate_info(int cpu)
104{ 105{
105 struct vcpu_register_runstate_memory_area area; 106 struct vcpu_register_runstate_memory_area area;
106 107
107 area.addr.v = &per_cpu(runstate, cpu); 108 area.addr.v = &per_cpu(xen_runstate, cpu);
108 109
109 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area, 110 if (HYPERVISOR_vcpu_op(VCPUOP_register_runstate_memory_area,
110 cpu, &area)) 111 cpu, &area))
@@ -122,7 +123,7 @@ static void do_stolen_accounting(void)
122 123
123 WARN_ON(state.state != RUNSTATE_running); 124 WARN_ON(state.state != RUNSTATE_running);
124 125
125 snap = &__get_cpu_var(runstate_snapshot); 126 snap = &__get_cpu_var(xen_runstate_snapshot);
126 127
127 /* work out how much time the VCPU has not been runn*ing* */ 128 /* work out how much time the VCPU has not been runn*ing* */
128 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked]; 129 blocked = state.time[RUNSTATE_blocked] - snap->time[RUNSTATE_blocked];
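
do_stolen_accounting() takes a fresh snapshot of the per-CPU xen_runstate area and diffs it against the previous one stored in xen_runstate_snapshot. Since the hypervisor can update the live structure at any moment, the copy itself has to be made consistently; the usual way, sketched below in user space, is to copy and then re-check a timestamp field that the writer is assumed to bump on every update. The struct runstate and field names below are simplified stand-ins, not the real vcpu_runstate_info layout.

#include <stdint.h>
#include <string.h>
#include <stdio.h>

/* Simplified stand-in for the runstate area: the writer is assumed to change
 * entry_time on every update, so an unchanged entry_time across the copy
 * means the snapshot is not torn. */
struct runstate {
        uint64_t entry_time;
        uint64_t time[4];
};

static struct runstate shared;          /* updated by another context */

static void snapshot(struct runstate *res)
{
        uint64_t t;

        do {
                t = __atomic_load_n(&shared.entry_time, __ATOMIC_ACQUIRE);
                memcpy(res, &shared, sizeof(*res));     /* plain copy, like *res = *state */
                __atomic_thread_fence(__ATOMIC_ACQUIRE);
        } while (__atomic_load_n(&shared.entry_time, __ATOMIC_ACQUIRE) != t);
}

int main(void)
{
        struct runstate snap;

        shared.entry_time = 42;
        shared.time[0] = 1000;
        snapshot(&snap);
        printf("entry_time=%llu time[0]=%llu\n",
               (unsigned long long)snap.entry_time,
               (unsigned long long)snap.time[0]);
        return 0;
}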
@@ -133,24 +134,24 @@ static void do_stolen_accounting(void)
133 134
134 /* Add the appropriate number of ticks of stolen time, 135 /* Add the appropriate number of ticks of stolen time,
135 including any left-overs from last time. */ 136 including any left-overs from last time. */
136 stolen = runnable + offline + __get_cpu_var(residual_stolen); 137 stolen = runnable + offline + __get_cpu_var(xen_residual_stolen);
137 138
138 if (stolen < 0) 139 if (stolen < 0)
139 stolen = 0; 140 stolen = 0;
140 141
141 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen); 142 ticks = iter_div_u64_rem(stolen, NS_PER_TICK, &stolen);
142 __get_cpu_var(residual_stolen) = stolen; 143 __get_cpu_var(xen_residual_stolen) = stolen;
143 account_steal_ticks(ticks); 144 account_steal_ticks(ticks);
144 145
145 /* Add the appropriate number of ticks of blocked time, 146 /* Add the appropriate number of ticks of blocked time,
146 including any left-overs from last time. */ 147 including any left-overs from last time. */
147 blocked += __get_cpu_var(residual_blocked); 148 blocked += __get_cpu_var(xen_residual_blocked);
148 149
149 if (blocked < 0) 150 if (blocked < 0)
150 blocked = 0; 151 blocked = 0;
151 152
152 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked); 153 ticks = iter_div_u64_rem(blocked, NS_PER_TICK, &blocked);
153 __get_cpu_var(residual_blocked) = blocked; 154 __get_cpu_var(xen_residual_blocked) = blocked;
154 account_idle_ticks(ticks); 155 account_idle_ticks(ticks);
155} 156}
156 157
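
The stolen/blocked bookkeeping above never loses sub-tick amounts: new nanoseconds are added to the per-CPU residual (xen_residual_stolen / xen_residual_blocked), whole ticks are handed to account_steal_ticks() or account_idle_ticks(), and the remainder is carried to the next invocation via iter_div_u64_rem(). A user-space sketch of that carry-the-remainder pattern follows; HZ=250 is assumed only for the example, and account_steal_ticks() here is a stub.

#include <stdint.h>
#include <stdio.h>

#define NS_PER_TICK (1000000000LL / 250)        /* assuming HZ=250 for the sketch */

static uint64_t residual_stolen;                /* leftover ns carried between calls */

/* Hypothetical stand-in for account_steal_ticks(). */
static void account_steal_ticks(unsigned long ticks)
{
        printf("accounting %lu stolen tick(s)\n", ticks);
}

/* Fold newly observed stolen nanoseconds into whole ticks, carrying the
 * sub-tick remainder forward, in the same spirit as do_stolen_accounting(). */
static void account_stolen_ns(int64_t stolen_ns)
{
        int64_t total;
        unsigned long ticks;

        /* Add the new stolen time to any leftover from last time, then clamp. */
        total = stolen_ns + (int64_t)residual_stolen;
        if (total < 0)
                total = 0;

        ticks = (unsigned long)(total / NS_PER_TICK);   /* iter_div_u64_rem() equivalent */
        residual_stolen = (uint64_t)(total % NS_PER_TICK);

        account_steal_ticks(ticks);
}

int main(void)
{
        account_stolen_ns(6500000);     /* 6.5 ms: 1 tick, 2.5 ms carried over */
        account_stolen_ns(2000000);     /* +2 ms: 4.5 ms total -> 1 tick, 0.5 ms left */
        return 0;
}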
@@ -434,7 +435,7 @@ void xen_setup_timer(int cpu)
434 name = "<timer kasprintf failed>"; 435 name = "<timer kasprintf failed>";
435 436
436 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt, 437 irq = bind_virq_to_irqhandler(VIRQ_TIMER, cpu, xen_timer_interrupt,
437 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING, 438 IRQF_DISABLED|IRQF_PERCPU|IRQF_NOBALANCING|IRQF_TIMER,
438 name, NULL); 439 name, NULL);
439 440
440 evt = &per_cpu(xen_clock_events, cpu); 441 evt = &per_cpu(xen_clock_events, cpu);
@@ -442,8 +443,6 @@ void xen_setup_timer(int cpu)
442 443
443 evt->cpumask = cpumask_of(cpu); 444 evt->cpumask = cpumask_of(cpu);
444 evt->irq = irq; 445 evt->irq = irq;
445
446 setup_runstate_info(cpu);
447} 446}
448 447
449void xen_teardown_timer(int cpu) 448void xen_teardown_timer(int cpu)
@@ -494,6 +493,7 @@ __init void xen_time_init(void)
494 493
495 setup_force_cpu_cap(X86_FEATURE_TSC); 494 setup_force_cpu_cap(X86_FEATURE_TSC);
496 495
496 xen_setup_runstate_info(cpu);
497 xen_setup_timer(cpu); 497 xen_setup_timer(cpu);
498 xen_setup_cpu_clockevents(); 498 xen_setup_cpu_clockevents();
499} 499}
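
Taken together, the time.c hunks stop xen_setup_timer() from registering the runstate area itself: the former static setup_runstate_info() becomes the exported xen_setup_runstate_info(), and callers such as xen_time_init() here (and xen_cpu_up() in the smp.c hunk earlier) now invoke it explicitly before setting up the timer. A trivial sketch of the resulting call order, with stub functions invented purely for illustration; the point of the split is that the runstate registration can now be repeated on its own, without re-binding the timer IRQ.

#include <stdio.h>

/* Hypothetical stubs standing in for the real per-CPU setup calls. */
static void setup_runstate_info(int cpu) { printf("cpu %d: runstate area registered\n", cpu); }
static void setup_timer(int cpu)         { printf("cpu %d: timer VIRQ bound\n", cpu); }
static void setup_clockevents(int cpu)   { printf("cpu %d: clockevents registered\n", cpu); }

/* Each bring-up path now spells out the order itself, so a resume path could
 * call setup_runstate_info() again without touching the timer binding. */
static void bring_up_cpu(int cpu)
{
        setup_runstate_info(cpu);
        setup_timer(cpu);
        setup_clockevents(cpu);
}

int main(void)
{
        bring_up_cpu(0);
        return 0;
}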
diff --git a/arch/x86/xen/xen-asm_32.S b/arch/x86/xen/xen-asm_32.S
index 88e15deb8b82..22a2093b5862 100644
--- a/arch/x86/xen/xen-asm_32.S
+++ b/arch/x86/xen/xen-asm_32.S
@@ -90,9 +90,9 @@ ENTRY(xen_iret)
90 GET_THREAD_INFO(%eax) 90 GET_THREAD_INFO(%eax)
91 movl TI_cpu(%eax), %eax 91 movl TI_cpu(%eax), %eax
92 movl __per_cpu_offset(,%eax,4), %eax 92 movl __per_cpu_offset(,%eax,4), %eax
93 mov per_cpu__xen_vcpu(%eax), %eax 93 mov xen_vcpu(%eax), %eax
94#else 94#else
95 movl per_cpu__xen_vcpu, %eax 95 movl xen_vcpu, %eax
96#endif 96#endif
97 97
98 /* check IF state we're restoring */ 98 /* check IF state we're restoring */
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 02f496a8dbaa..53adefda4275 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -96,7 +96,7 @@ ENTRY(xen_sysret32)
96 pushq $__USER32_CS 96 pushq $__USER32_CS
97 pushq %rcx 97 pushq %rcx
98 98
99 pushq $VGCF_in_syscall 99 pushq $0
1001: jmp hypercall_iret 1001: jmp hypercall_iret
101ENDPATCH(xen_sysret32) 101ENDPATCH(xen_sysret32)
102RELOC(xen_sysret32, 1b+1) 102RELOC(xen_sysret32, 1b+1)
@@ -151,7 +151,7 @@ ENTRY(xen_syscall32_target)
151ENTRY(xen_sysenter_target) 151ENTRY(xen_sysenter_target)
152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */ 152 lea 16(%rsp), %rsp /* strip %rcx, %r11 */
153 mov $-ENOSYS, %rax 153 mov $-ENOSYS, %rax
154 pushq $VGCF_in_syscall 154 pushq $0
155 jmp hypercall_iret 155 jmp hypercall_iret
156ENDPROC(xen_syscall32_target) 156ENDPROC(xen_syscall32_target)
157ENDPROC(xen_sysenter_target) 157ENDPROC(xen_sysenter_target)
diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h
index 355fa6b99c9c..f9153a300bce 100644
--- a/arch/x86/xen/xen-ops.h
+++ b/arch/x86/xen/xen-ops.h
@@ -25,6 +25,7 @@ extern struct shared_info *HYPERVISOR_shared_info;
25 25
26void xen_setup_mfn_list_list(void); 26void xen_setup_mfn_list_list(void);
27void xen_setup_shared_info(void); 27void xen_setup_shared_info(void);
28void xen_build_mfn_list_list(void);
28void xen_setup_machphys_mapping(void); 29void xen_setup_machphys_mapping(void);
29pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); 30pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn);
30void xen_ident_map_ISA(void); 31void xen_ident_map_ISA(void);
@@ -41,6 +42,7 @@ void __init xen_build_dynamic_phys_to_machine(void);
41 42
42void xen_init_irq_ops(void); 43void xen_init_irq_ops(void);
43void xen_setup_timer(int cpu); 44void xen_setup_timer(int cpu);
45void xen_setup_runstate_info(int cpu);
44void xen_teardown_timer(int cpu); 46void xen_teardown_timer(int cpu);
45cycle_t xen_clocksource_read(void); 47cycle_t xen_clocksource_read(void);
46void xen_setup_cpu_clockevents(void); 48void xen_setup_cpu_clockevents(void);